Filename | /Users/ap13/perl5/lib/perl5/Bio/Tools/GuessSeqFormat.pm |
Statements | Executed 8 statements in 2.47ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 13µs | 26µs | BEGIN@247 | Bio::Tools::GuessSeqFormat::
1 | 1 | 1 | 8µs | 13µs | BEGIN@248 | Bio::Tools::GuessSeqFormat::
1 | 1 | 1 | 8µs | 70µs | BEGIN@251 | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_ace | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_blast | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_bowtie | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_clustalw | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_codata | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_embl | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_fasta | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_fastq | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_fastxy | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_game | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_gcg | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_gcgblast | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_gcgfasta | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_gde | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_genbank | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_genscan | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_gff | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_hmmer | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_mase | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_mega | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_msf | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_nexus | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_pfam | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_phrap | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_phylip | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_pir | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_prodom | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_raw | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_rsf | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_selex | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_stockholm | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_swiss | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_tab | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | _possibly_vcf | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | fh | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | file | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | guess | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | new | Bio::Tools::GuessSeqFormat::
0 | 0 | 0 | 0s | 0s | text | Bio::Tools::GuessSeqFormat::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | #------------------------------------------------------------------ | ||||
2 | # | ||||
3 | # BioPerl module Bio::Tools::GuessSeqFormat | ||||
4 | # | ||||
5 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
6 | # | ||||
7 | # Cared for by Andreas Kähäri, andreas.kahari@ebi.ac.uk | ||||
8 | # | ||||
9 | # You may distribute this module under the same terms as perl itself | ||||
10 | #------------------------------------------------------------------ | ||||
11 | |||||
12 | =encoding utf-8 | ||||
13 | |||||
14 | =head1 NAME | ||||
15 | |||||
16 | Bio::Tools::GuessSeqFormat - Module for determining the sequence | ||||
17 | format of the contents of a file, a string, or through a | ||||
18 | filehandle. | ||||
19 | |||||
20 | =head1 SYNOPSIS | ||||
21 | |||||
22 | # To guess the format of a flat file, given a filename: | ||||
23 | my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename ); | ||||
24 | my $format = $guesser->guess; | ||||
25 | |||||
26 | # To guess the format from an already open filehandle: | ||||
27 | my $guesser = Bio::Tools::GuessSeqFormat->new( -fh => $filehandle ); | ||||
28 | my $format = $guesser->guess; | ||||
29 | # The filehandle will be returned to its original position. Note that this | ||||
30 | # filehandle can be STDIN. | ||||
31 | |||||
32 | # To guess the format of one or several lines of text (with | ||||
33 | # embedded newlines): | ||||
34 | my $guesser = Bio::Tools::GuessSeqFormat->new( -text => $linesoftext ); | ||||
35 | my $format = $guesser->guess; | ||||
36 | |||||
37 | # To create a Bio::Tools::GuessSeqFormat object and set the | ||||
38 | # filename, filehandle, or line to parse afterwards: | ||||
39 | my $guesser = Bio::Tools::GuessSeqFormat->new(); | ||||
40 | $guesser->file($filename); | ||||
41 | $guesser->fh($filehandle); | ||||
42 | $guesser->text($linesoftext); | ||||
43 | |||||
44 | # To guess in one go, given e.g. a filename: | ||||
45 | my $format = Bio::Tools::GuessSeqFormat->new( -file => $filename )->guess; | ||||
46 | |||||
47 | =head1 DESCRIPTION | ||||
48 | |||||
49 | Bio::Tools::GuessSeqFormat tries to guess the format ("swiss", | ||||
50 | "pir", "fasta" etc.) of the sequence or MSA in a file, in a | ||||
51 | scalar, or through a filehandle. | ||||
52 | |||||
53 | The guess() method of a Bio::Tools::GuessSeqFormat object will | ||||
54 | examine the data, line by line, until it finds a line to which | ||||
55 | only one format can be assigned. If no conclusive guess can be | ||||
56 | made, undef is returned. | ||||
57 | |||||
58 | If the Bio::Tools::GuessSeqFormat object is given a filehandle, | ||||
59 | e.g. STDIN, it will be restored to its original position on | ||||
60 | return from the guess() method. | ||||
61 | |||||
62 | =head2 Formats | ||||
63 | |||||
64 | Tests are currently implemented for the following formats: | ||||
65 | |||||
66 | =over | ||||
67 | |||||
68 | =item * | ||||
69 | |||||
70 | ACeDB ("ace") | ||||
71 | |||||
72 | =item * | ||||
73 | |||||
74 | Blast ("blast") | ||||
75 | |||||
76 | =item * | ||||
77 | |||||
78 | ClustalW ("clustalw") | ||||
79 | |||||
80 | =item * | ||||
81 | |||||
82 | Codata ("codata") | ||||
83 | |||||
84 | =item * | ||||
85 | |||||
86 | EMBL ("embl") | ||||
87 | |||||
88 | =item * | ||||
89 | |||||
90 | FastA sequence ("fasta") | ||||
91 | |||||
92 | =item * | ||||
93 | |||||
94 | FastQ sequence ("fastq") | ||||
95 | |||||
96 | =item * | ||||
97 | |||||
98 | FastXY/FastA alignment ("fastxy") | ||||
99 | |||||
100 | =item * | ||||
101 | |||||
102 | Game XML ("game") | ||||
103 | |||||
104 | =item * | ||||
105 | |||||
106 | GCG ("gcg") | ||||
107 | |||||
108 | =item * | ||||
109 | |||||
110 | GCG Blast ("gcgblast") | ||||
111 | |||||
112 | =item * | ||||
113 | |||||
114 | GCG FastA ("gcgfasta") | ||||
115 | |||||
116 | =item * | ||||
117 | |||||
118 | GDE ("gde") | ||||
119 | |||||
120 | =item * | ||||
121 | |||||
122 | Genbank ("genbank") | ||||
123 | |||||
124 | =item * | ||||
125 | |||||
126 | Genscan ("genscan") | ||||
127 | |||||
128 | =item * | ||||
129 | |||||
130 | GFF ("gff") | ||||
131 | |||||
132 | =item * | ||||
133 | |||||
134 | HMMER ("hmmer") | ||||
135 | |||||
136 | =item * | ||||
137 | |||||
138 | PAUP/NEXUS ("nexus") | ||||
139 | |||||
140 | =item * | ||||
141 | |||||
142 | Phrap assembly file ("phrap") | ||||
143 | |||||
144 | =item * | ||||
145 | |||||
146 | NBRF/PIR ("pir") | ||||
147 | |||||
148 | =item * | ||||
149 | |||||
150 | Mase ("mase") | ||||
151 | |||||
152 | =item * | ||||
153 | |||||
154 | Mega ("mega") | ||||
155 | |||||
156 | =item * | ||||
157 | |||||
158 | GCG/MSF ("msf") | ||||
159 | |||||
160 | =item * | ||||
161 | |||||
162 | Pfam ("pfam") | ||||
163 | |||||
164 | =item * | ||||
165 | |||||
166 | Phylip ("phylip") | ||||
167 | |||||
168 | =item * | ||||
169 | |||||
170 | Prodom ("prodom") | ||||
171 | |||||
172 | =item * | ||||
173 | |||||
174 | Raw ("raw") | ||||
175 | |||||
176 | =item * | ||||
177 | |||||
178 | RSF ("rsf") | ||||
179 | |||||
180 | =item * | ||||
181 | |||||
182 | Selex ("selex") | ||||
183 | |||||
184 | =item * | ||||
185 | |||||
186 | Stockholm ("stockholm") | ||||
187 | |||||
188 | =item * | ||||
189 | |||||
190 | Swissprot ("swiss") | ||||
191 | |||||
192 | =item * | ||||
193 | |||||
194 | Tab ("tab") | ||||
195 | |||||
196 | =item * | ||||
197 | |||||
198 | Variant Call Format ("vcf") | ||||
199 | |||||
200 | =back | ||||
201 | |||||
202 | =head1 FEEDBACK | ||||
203 | |||||
204 | =head2 Mailing Lists | ||||
205 | |||||
206 | User feedback is an integral part of the evolution of this and | ||||
207 | other Bioperl modules. Send your comments and suggestions | ||||
208 | preferably to one of the Bioperl mailing lists. Your | ||||
209 | participation is much appreciated. | ||||
210 | |||||
211 | bioperl-l@bioperl.org - General discussion | ||||
212 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
213 | |||||
214 | =head2 Support | ||||
215 | |||||
216 | Please direct usage questions or support issues to the mailing list: | ||||
217 | |||||
218 | I<bioperl-l@bioperl.org> | ||||
219 | |||||
220 | rather than to the module maintainer directly. Many experienced and | ||||
221 | reponsive experts will be able look at the problem and quickly | ||||
222 | address it. Please include a thorough description of the problem | ||||
223 | with code and data examples if at all possible. | ||||
224 | |||||
225 | =head2 Reporting Bugs | ||||
226 | |||||
227 | Report bugs to the Bioperl bug tracking system to help us | ||||
228 | keep track the bugs and their resolution. Bug reports can be | ||||
229 | submitted via the web: | ||||
230 | |||||
231 | https://github.com/bioperl/bioperl-live/issues | ||||
232 | |||||
233 | =head1 AUTHOR | ||||
234 | |||||
235 | Andreas KE<228>hE<228>ri, andreas.kahari@ebi.ac.uk | ||||
236 | |||||
237 | =head1 CONTRIBUTORS | ||||
238 | |||||
239 | Heikki LehvE<228>slaiho, heikki-at-bioperl-dot-org | ||||
240 | Mark A. Jensen, maj-at-fortinbras-dot-us | ||||
241 | |||||
242 | =cut | ||||
243 | |||||
244 | |||||
245 | package Bio::Tools::GuessSeqFormat; | ||||
246 | |||||
247 | 2 | 24µs | 2 | 39µs | # spent 26µs (13+13) within Bio::Tools::GuessSeqFormat::BEGIN@247 which was called:
# once (13µs+13µs) by Bio::SeqIO::BEGIN@332 at line 247 # spent 26µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@247
# spent 13µs making 1 call to strict::import |
248 | 2 | 24µs | 2 | 18µs | # spent 13µs (8+5) within Bio::Tools::GuessSeqFormat::BEGIN@248 which was called:
# once (8µs+5µs) by Bio::SeqIO::BEGIN@332 at line 248 # spent 13µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@248
# spent 5µs making 1 call to warnings::import |
249 | |||||
250 | |||||
251 | 2 | 2.35ms | 2 | 132µs | # spent 70µs (8+62) within Bio::Tools::GuessSeqFormat::BEGIN@251 which was called:
# once (8µs+62µs) by Bio::SeqIO::BEGIN@332 at line 251 # spent 70µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@251
# spent 62µs making 1 call to base::import |
252 | |||||
253 | =head1 METHODS | ||||
254 | |||||
255 | Methods available to Bio::Tools::GuessSeqFormat objects | ||||
256 | are described below. Methods with names beginning with an | ||||
257 | underscore are considered to be internal. | ||||
258 | |||||
259 | =cut | ||||
260 | |||||
261 | =head2 new | ||||
262 | |||||
263 | Title : new | ||||
264 | Usage : $guesser = Bio::Tools::GuessSeqFormat->new( ... ); | ||||
265 | Function : Creates a new object. | ||||
266 | Example : See SYNOPSIS. | ||||
267 | Returns : A new object. | ||||
268 | Arguments : -file The filename of the file whose format is to | ||||
269 | be guessed, e.g. STDIN, or | ||||
270 | -fh An already opened filehandle from which a text | ||||
271 | stream may be read, or | ||||
272 | -text A scalar containing one or several lines of | ||||
273 | text with embedded newlines. | ||||
274 | |||||
275 | If more than one of the above arguments are given, they | ||||
276 | are tested in the order -text, -file, -fh, and the first | ||||
277 | available argument will be used. | ||||
278 | |||||
279 | =cut | ||||
280 | |||||
281 | sub new | ||||
282 | { | ||||
283 | my $class = shift; | ||||
284 | my @args = @_; | ||||
285 | |||||
286 | my $self = $class->SUPER::new(@args); | ||||
287 | |||||
288 | my $attr; | ||||
289 | my $value; | ||||
290 | |||||
291 | while (@args) { | ||||
292 | $attr = shift @args; | ||||
293 | $attr = lc $attr; | ||||
294 | $value = shift @args; | ||||
295 | $self->{$attr} = $value; | ||||
296 | } | ||||
297 | |||||
298 | return $self; | ||||
299 | } | ||||
300 | |||||
301 | =head2 file | ||||
302 | |||||
303 | Title : file | ||||
304 | Usage : $guesser->file($filename); | ||||
305 | $filename = $guesser->file; | ||||
306 | Function : Gets or sets the current filename associated with | ||||
307 | an object. | ||||
308 | Returns : The new filename. | ||||
309 | Arguments : The filename of the file whose format is to be | ||||
310 | guessed. | ||||
311 | |||||
312 | A call to this method will clear the current filehandle and | ||||
313 | the current lines of text associated with the object. | ||||
314 | |||||
315 | =cut | ||||
316 | |||||
317 | sub file | ||||
318 | { | ||||
319 | # Sets and/or returns the filename to use. | ||||
320 | my $self = shift; | ||||
321 | my $file = shift; | ||||
322 | |||||
323 | if (defined $file) { | ||||
324 | # Set the active filename, and clear the filehandle and | ||||
325 | # text line, if present. | ||||
326 | $self->{-file} = $file; | ||||
327 | $self->{-fh} = $self->{-text} = undef; | ||||
328 | } | ||||
329 | |||||
330 | return $self->{-file}; | ||||
331 | } | ||||
332 | |||||
333 | =head2 fh | ||||
334 | |||||
335 | Title : fh | ||||
336 | Usage : $guesser->fh($filehandle); | ||||
337 | $filehandle = $guesser->fh; | ||||
338 | Function : Gets or sets the current filehandle associated with | ||||
339 | an object. | ||||
340 | Returns : The new filehandle. | ||||
341 | Arguments : An already opened filehandle from which a text | ||||
342 | stream may be read. | ||||
343 | |||||
344 | A call to this method will clear the current filename and | ||||
345 | the current lines of text associated with the object. | ||||
346 | |||||
347 | =cut | ||||
348 | |||||
349 | sub fh | ||||
350 | { | ||||
351 | # Sets and/or returns the filehandle to use. | ||||
352 | my $self = shift; | ||||
353 | my $fh = shift; | ||||
354 | |||||
355 | if (defined $fh) { | ||||
356 | # Set the active filehandle, and clear the filename and | ||||
357 | # text line, if present. | ||||
358 | $self->{-fh} = $fh; | ||||
359 | $self->{-file} = $self->{-text} = undef; | ||||
360 | } | ||||
361 | |||||
362 | return $self->{-fh}; | ||||
363 | } | ||||
364 | |||||
365 | |||||
366 | =head2 text | ||||
367 | |||||
368 | Title : text | ||||
369 | Usage : $guesser->text($linesoftext); | ||||
370 | $linesofext = $guesser->text; | ||||
371 | Function : Gets or sets the current text associated with an | ||||
372 | object. | ||||
373 | Returns : The new lines of texts. | ||||
374 | Arguments : A scalar containing one or several lines of text, | ||||
375 | including embedded newlines. | ||||
376 | |||||
377 | A call to this method will clear the current filename and | ||||
378 | the current filehandle associated with the object. | ||||
379 | |||||
380 | =cut | ||||
381 | |||||
382 | sub text | ||||
383 | { | ||||
384 | # Sets and/or returns the text lines to use. | ||||
385 | my $self = shift; | ||||
386 | my $text = shift; | ||||
387 | |||||
388 | if (defined $text) { | ||||
389 | # Set the active text lines, and clear the filehandle | ||||
390 | # and filename, if present. | ||||
391 | $self->{-text} = $text; | ||||
392 | $self->{-fh} = $self->{-file} = undef; | ||||
393 | } | ||||
394 | |||||
395 | return $self->{-text}; | ||||
396 | } | ||||
397 | |||||
398 | =head2 guess | ||||
399 | |||||
400 | Title : guess | ||||
401 | Usage : $format = $guesser->guess; | ||||
402 | @format = $guesser->guess; # if given a line of text | ||||
403 | Function : Guesses the format of the data accociated with the | ||||
404 | object. | ||||
405 | Returns : A format string such as "swiss" or "pir". If a | ||||
406 | format can not be found, undef is returned. | ||||
407 | Arguments : None. | ||||
408 | |||||
409 | If the object is associated with a filehandle, the position | ||||
410 | of the filehandle will be returned to its original position | ||||
411 | before the method returns. | ||||
412 | |||||
413 | =cut | ||||
414 | |||||
415 | 1 | 26µs | our %formats = ( | ||
416 | ace => { test => \&_possibly_ace }, | ||||
417 | blast => { test => \&_possibly_blast }, | ||||
418 | bowtie => { test => \&_possibly_bowtie }, | ||||
419 | clustalw => { test => \&_possibly_clustalw }, | ||||
420 | codata => { test => \&_possibly_codata }, | ||||
421 | embl => { test => \&_possibly_embl }, | ||||
422 | fasta => { test => \&_possibly_fasta }, | ||||
423 | fastq => { test => \&_possibly_fastq }, | ||||
424 | fastxy => { test => \&_possibly_fastxy }, | ||||
425 | game => { test => \&_possibly_game }, | ||||
426 | gcg => { test => \&_possibly_gcg }, | ||||
427 | gcgblast => { test => \&_possibly_gcgblast }, | ||||
428 | gcgfasta => { test => \&_possibly_gcgfasta }, | ||||
429 | gde => { test => \&_possibly_gde }, | ||||
430 | genbank => { test => \&_possibly_genbank }, | ||||
431 | genscan => { test => \&_possibly_genscan }, | ||||
432 | gff => { test => \&_possibly_gff }, | ||||
433 | hmmer => { test => \&_possibly_hmmer }, | ||||
434 | nexus => { test => \&_possibly_nexus }, | ||||
435 | mase => { test => \&_possibly_mase }, | ||||
436 | mega => { test => \&_possibly_mega }, | ||||
437 | msf => { test => \&_possibly_msf }, | ||||
438 | pfam => { test => \&_possibly_pfam }, | ||||
439 | phrap => { test => \&_possibly_phrap }, | ||||
440 | phylip => { test => \&_possibly_phylip }, | ||||
441 | pir => { test => \&_possibly_pir }, | ||||
442 | prodom => { test => \&_possibly_prodom }, | ||||
443 | raw => { test => \&_possibly_raw }, | ||||
444 | rsf => { test => \&_possibly_rsf }, | ||||
445 | selex => { test => \&_possibly_selex }, | ||||
446 | stockholm => { test => \&_possibly_stockholm }, | ||||
447 | swiss => { test => \&_possibly_swiss }, | ||||
448 | tab => { test => \&_possibly_tab }, | ||||
449 | vcf => { test => \&_possibly_vcf }, | ||||
450 | ); | ||||
451 | |||||
452 | sub guess | ||||
453 | { | ||||
454 | my $self = shift; | ||||
455 | |||||
456 | while (my ($fmt_key) = each (%formats)) { | ||||
457 | $formats{$fmt_key}{fmt_string} = $fmt_key; | ||||
458 | } | ||||
459 | |||||
460 | my $fh; | ||||
461 | my $start_pos; | ||||
462 | if (defined $self->{-text}) { | ||||
463 | # Break the text into separate lines. | ||||
464 | my $text = $self->{-text}; | ||||
465 | open $fh, '<', \$text or $self->throw("Could not read from string: $!"); | ||||
466 | |||||
467 | } elsif (defined $self->{-file}) { | ||||
468 | # If given a filename, open the file. | ||||
469 | my $file = $self->{-file}; | ||||
470 | open $fh, '<', $file or $self->throw("Could not read file '$file': $!"); | ||||
471 | |||||
472 | } elsif (defined $self->{-fh}) { | ||||
473 | # If given a filehandle, get the current position in the stream. | ||||
474 | $fh = $self->{-fh}; | ||||
475 | if (not seek $fh, 0, 1) { # seek to current position to determine seekability | ||||
476 | # Work around non-seekable filehandles if IO::Scalar is available | ||||
477 | # (adapted from http://www.perlmonks.org/?node_id=33587) | ||||
478 | # IO::Mark may be an option for very large streams? | ||||
479 | $self->throw("Need IO::Scalar to guess from unseekable filehandles") | ||||
480 | if not eval { require IO::Scalar }; | ||||
481 | my $data; | ||||
482 | { local $/; $data = <$fh>; $.-- }; # copy raw data from fh | ||||
483 | tie *$fh, 'IO::Scalar', my $s; # replace fh by scalar-tied fh | ||||
484 | print $fh $data; # write raw data to tied fh | ||||
485 | seek $fh, 0, 0; # return to start of tied fh | ||||
486 | } | ||||
487 | $start_pos = tell $fh; | ||||
488 | } | ||||
489 | |||||
490 | my $done = 0; | ||||
491 | my $lineno = 0; | ||||
492 | my $guess; | ||||
493 | while (!$done) { | ||||
494 | my $line; # The next line of the file. | ||||
495 | my $match = 0; # Number of possible formats of this line. | ||||
496 | |||||
497 | last if (!defined($line = <$fh>)); | ||||
498 | next if ($line =~ /^\s*$/); # Skip white and empty lines. | ||||
499 | chomp $line; | ||||
500 | $line =~ s/\r$//; # Fix for DOS files on Unix. | ||||
501 | ++$lineno; | ||||
502 | |||||
503 | while (my ($fmt_key, $fmt) = each (%formats)) { | ||||
504 | if ($fmt->{test}($line, $lineno)) { | ||||
505 | ++$match; | ||||
506 | $guess = $fmt->{fmt_string}; | ||||
507 | } | ||||
508 | } | ||||
509 | |||||
510 | # We're done if there was only one match. | ||||
511 | $done = ($match == 1); | ||||
512 | } | ||||
513 | |||||
514 | if (defined $self->{-fh}) { | ||||
515 | # Go back to original position in filehandle | ||||
516 | seek $fh, $start_pos, 0 or $self->throw("Could not reset filehandle $fh: $!"); | ||||
517 | } else { | ||||
518 | # Close the filehandle we opened | ||||
519 | close $fh; | ||||
520 | } | ||||
521 | return ($done ? $guess : undef); | ||||
522 | } | ||||
523 | |||||
524 | =head1 HELPER SUBROUTINES | ||||
525 | |||||
526 | All helper subroutines will, given a line of text and the line | ||||
527 | number of the same line, return 1 if the line possibly is from a | ||||
528 | file of the type that they perform a test of. | ||||
529 | |||||
530 | A zero return value does not mean that the line is not part | ||||
531 | of a certain type of file, just that the test did not find any | ||||
532 | characteristics of that type of file in the line. | ||||
533 | |||||
534 | =head2 _possibly_ace | ||||
535 | |||||
536 | From bioperl test data, and from | ||||
537 | "http://www.isrec.isb-sib.ch/DEA/module8/B_Stevenson/Practicals/transcriptome_recon/transcriptome_recon.html". | ||||
538 | |||||
539 | =cut | ||||
540 | |||||
541 | sub _possibly_ace | ||||
542 | { | ||||
543 | my ($line, $lineno) = (shift, shift); | ||||
544 | return ($line =~ /^(?:Sequence|Peptide|DNA|Protein) [":]/); | ||||
545 | } | ||||
546 | |||||
547 | =head2 _possibly_blast | ||||
548 | |||||
549 | From various blast results. | ||||
550 | |||||
551 | =cut | ||||
552 | |||||
553 | sub _possibly_blast | ||||
554 | { | ||||
555 | my ($line, $lineno) = (shift, shift); | ||||
556 | return ($lineno == 1 && | ||||
557 | $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/); | ||||
558 | } | ||||
559 | |||||
560 | =head2 _possibly_bowtie | ||||
561 | |||||
562 | Contributed by kortsch. | ||||
563 | |||||
564 | =cut | ||||
565 | |||||
566 | sub _possibly_bowtie | ||||
567 | { | ||||
568 | my ($line, $lineno) = (shift, shift); | ||||
569 | return ($line =~ /^[[:graph:]]+\t[-+]\t[[:graph:]]+\t\d+\t([[:alpha:]]+)\t([[:graph:]]+)\t\d+\t[[:graph:]]?/) | ||||
570 | && length($1)==length($2); | ||||
571 | } | ||||
572 | |||||
573 | =head2 _possibly_clustalw | ||||
574 | |||||
575 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
576 | |||||
577 | =cut | ||||
578 | |||||
579 | sub _possibly_clustalw | ||||
580 | { | ||||
581 | my ($line, $lineno) = (shift, shift); | ||||
582 | return ($lineno == 1 && $line =~ /CLUSTAL/); | ||||
583 | } | ||||
584 | |||||
585 | =head2 _possibly_codata | ||||
586 | |||||
587 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
588 | |||||
589 | =cut | ||||
590 | |||||
591 | sub _possibly_codata | ||||
592 | { | ||||
593 | my ($line, $lineno) = (shift, shift); | ||||
594 | return (($lineno == 1 && $line =~ /^ENTRY/) || | ||||
595 | ($lineno == 2 && $line =~ /^SEQUENCE/) || | ||||
596 | $line =~ m{^(?:ENTRY|SEQUENCE|///)}); | ||||
597 | } | ||||
598 | |||||
599 | =head2 _possibly_embl | ||||
600 | |||||
601 | From | ||||
602 | "http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3.3". | ||||
603 | |||||
604 | =cut | ||||
605 | |||||
606 | sub _possibly_embl | ||||
607 | { | ||||
608 | my ($line, $lineno) = (shift, shift); | ||||
609 | return ($lineno == 1 && $line =~ /^ID / && $line =~ /BP\.$/); | ||||
610 | } | ||||
611 | |||||
612 | =head2 _possibly_fasta | ||||
613 | |||||
614 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
615 | |||||
616 | =cut | ||||
617 | |||||
618 | sub _possibly_fasta | ||||
619 | { | ||||
620 | my ($line, $lineno) = (shift, shift); | ||||
621 | return (($lineno != 1 && $line =~ /^[A-IK-NP-Z]+$/i) || | ||||
622 | $line =~ /^>\s*\w/); | ||||
623 | } | ||||
624 | |||||
625 | =head2 _possibly_fastq | ||||
626 | |||||
627 | From bioperl test data. | ||||
628 | |||||
629 | =cut | ||||
630 | |||||
631 | sub _possibly_fastq | ||||
632 | { | ||||
633 | my ($line, $lineno) = (shift, shift); | ||||
634 | return ( ($lineno == 1 && $line =~ /^@/) || | ||||
635 | ($lineno == 3 && $line =~ /^\+/) ); | ||||
636 | } | ||||
637 | |||||
638 | =head2 _possibly_fastxy | ||||
639 | |||||
640 | From bioperl test data. | ||||
641 | |||||
642 | =cut | ||||
643 | |||||
644 | sub _possibly_fastxy | ||||
645 | { | ||||
646 | my ($line, $lineno) = (shift, shift); | ||||
647 | return (($lineno == 1 && $line =~ /^ FAST(?:XY|A)/) || | ||||
648 | ($lineno == 2 && $line =~ /^ version \d/)); | ||||
649 | } | ||||
650 | |||||
651 | =head2 _possibly_game | ||||
652 | |||||
653 | From bioperl testdata. | ||||
654 | |||||
655 | =cut | ||||
656 | |||||
657 | sub _possibly_game | ||||
658 | { | ||||
659 | my ($line, $lineno) = (shift, shift); | ||||
660 | return ($line =~ /^<!DOCTYPE game/); | ||||
661 | } | ||||
662 | |||||
663 | =head2 _possibly_gcg | ||||
664 | |||||
665 | From bioperl, Bio::SeqIO::gcg. | ||||
666 | |||||
667 | =cut | ||||
668 | |||||
669 | sub _possibly_gcg | ||||
670 | { | ||||
671 | my ($line, $lineno) = (shift, shift); | ||||
672 | return ($line =~ /Length: .*Type: .*Check: .*\.\.$/); | ||||
673 | } | ||||
674 | |||||
675 | =head2 _possibly_gcgblast | ||||
676 | |||||
677 | From bioperl testdata. | ||||
678 | |||||
679 | =cut | ||||
680 | |||||
681 | sub _possibly_gcgblast | ||||
682 | { | ||||
683 | my ($line, $lineno) = (shift, shift); | ||||
684 | return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) || | ||||
685 | ($lineno == 2 && | ||||
686 | $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/)); | ||||
687 | } | ||||
688 | |||||
689 | =head2 _possibly_gcgfasta | ||||
690 | |||||
691 | From bioperl testdata. | ||||
692 | |||||
693 | =cut | ||||
694 | |||||
695 | sub _possibly_gcgfasta | ||||
696 | { | ||||
697 | my ($line, $lineno) = (shift, shift); | ||||
698 | return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) || | ||||
699 | ($lineno == 2 && $line =~ /FASTA/)); | ||||
700 | } | ||||
701 | |||||
702 | =head2 _possibly_gde | ||||
703 | |||||
704 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
705 | |||||
706 | =cut | ||||
707 | |||||
708 | sub _possibly_gde | ||||
709 | { | ||||
710 | my ($line, $lineno) = (shift, shift); | ||||
711 | return ($line =~ /^[{}]$/ || | ||||
712 | $line =~ /^(?:name|longname|sequence-ID| | ||||
713 | creation-date|direction|strandedness| | ||||
714 | type|offset|group-ID|creator|descrip| | ||||
715 | comment|sequence)/x); | ||||
716 | } | ||||
717 | |||||
718 | =head2 _possibly_genbank | ||||
719 | |||||
720 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
721 | Format of [apparantly optional] file header from | ||||
722 | "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm". (TODO: dead link) | ||||
723 | |||||
724 | =cut | ||||
725 | |||||
726 | sub _possibly_genbank | ||||
727 | { | ||||
728 | my ($line, $lineno) = (shift, shift); | ||||
729 | return (($lineno == 1 && $line =~ /GENETIC SEQUENCE DATA BANK/) || | ||||
730 | ($lineno == 1 && $line =~ /^LOCUS /) || | ||||
731 | ($lineno == 2 && $line =~ /^DEFINITION /) || | ||||
732 | ($lineno == 3 && $line =~ /^ACCESSION /)); | ||||
733 | } | ||||
734 | |||||
735 | =head2 _possibly_genscan | ||||
736 | |||||
737 | From bioperl test data. | ||||
738 | |||||
739 | =cut | ||||
740 | |||||
741 | sub _possibly_genscan | ||||
742 | { | ||||
743 | my ($line, $lineno) = (shift, shift); | ||||
744 | return (($lineno == 1 && $line =~ /^GENSCAN.*Date.*Time/) || | ||||
745 | ($line =~ /^(?:Sequence\s+\w+|Parameter matrix|Predicted genes)/)); | ||||
746 | } | ||||
747 | |||||
748 | =head2 _possibly_gff | ||||
749 | |||||
750 | From bioperl test data. | ||||
751 | |||||
752 | =cut | ||||
753 | |||||
754 | sub _possibly_gff | ||||
755 | { | ||||
756 | my ($line, $lineno) = (shift, shift); | ||||
757 | return (($lineno == 1 && $line =~ /^##gff-version/) || | ||||
758 | ($lineno == 2 && $line =~ /^##date/)); | ||||
759 | } | ||||
760 | |||||
761 | =head2 _possibly_hmmer | ||||
762 | |||||
763 | From bioperl test data. | ||||
764 | |||||
765 | =cut | ||||
766 | |||||
767 | sub _possibly_hmmer | ||||
768 | { | ||||
769 | my ($line, $lineno) = (shift, shift); | ||||
770 | return (($lineno == 2 && $line =~ /^HMMER/) || | ||||
771 | ($lineno == 3 && | ||||
772 | $line =~ /Washington University School of Medicine/)); | ||||
773 | } | ||||
774 | |||||
775 | =head2 _possibly_nexus | ||||
776 | |||||
777 | From "http://paup.csit.fsu.edu/nfiles.html". | ||||
778 | |||||
779 | =cut | ||||
780 | |||||
781 | sub _possibly_nexus | ||||
782 | { | ||||
783 | my ($line, $lineno) = (shift, shift); | ||||
784 | return ($lineno == 1 && $line =~ /^#NEXUS/); | ||||
785 | } | ||||
786 | |||||
787 | =head2 _possibly_mase | ||||
788 | |||||
789 | From bioperl test data. | ||||
790 | More detail from "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm" (TODO: dead link) | ||||
791 | |||||
792 | =cut | ||||
793 | |||||
794 | sub _possibly_mase | ||||
795 | { | ||||
796 | my ($line, $lineno) = (shift, shift); | ||||
797 | return (($lineno == 1 && $line =~ /^;;/) || | ||||
798 | ($lineno > 1 && $line =~ /^;[^;]?/)); | ||||
799 | } | ||||
800 | |||||
801 | =head2 _possibly_mega | ||||
802 | |||||
803 | From the ensembl broswer (AlignView data export). | ||||
804 | |||||
805 | =cut | ||||
806 | |||||
807 | sub _possibly_mega | ||||
808 | { | ||||
809 | my ($line, $lineno) = (shift, shift); | ||||
810 | return ($lineno == 1 && $line =~ /^#mega$/); | ||||
811 | } | ||||
812 | |||||
813 | |||||
814 | =head2 _possibly_msf | ||||
815 | |||||
816 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
817 | |||||
818 | =cut | ||||
819 | |||||
820 | sub _possibly_msf | ||||
821 | { | ||||
822 | my ($line, $lineno) = (shift, shift); | ||||
823 | return ($line =~ m{^//} || | ||||
824 | $line =~ /MSF:.*Type:.*Check:|Name:.*Len:/); | ||||
825 | } | ||||
826 | |||||
827 | =head2 _possibly_phrap | ||||
828 | |||||
829 | From "http://biodata.ccgb.umn.edu/docs/contigimage.html". (TODO: dead link) | ||||
830 | From "http://genetics.gene.cwru.edu/gene508/Lec6.htm". (TODO: dead link) | ||||
831 | From bioperl test data ("*.ace.1" files). | ||||
832 | |||||
833 | =cut | ||||
834 | |||||
835 | sub _possibly_phrap | ||||
836 | { | ||||
837 | my ($line, $lineno) = (shift, shift); | ||||
838 | return ($line =~ /^(?:AS\ |CO\ Contig|BQ|AF\ |BS\ |RD\ | | ||||
839 | QA\ |DS\ |RT\{)/x); | ||||
840 | } | ||||
841 | |||||
842 | =head2 _possibly_pir | ||||
843 | |||||
844 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
845 | The ".,()" spotted in bioperl test data. | ||||
846 | |||||
847 | =cut | ||||
848 | |||||
849 | sub _possibly_pir # "NBRF/PIR" (?) | ||||
850 | { | ||||
851 | my ($line, $lineno) = (shift, shift); | ||||
852 | return (($lineno != 1 && $line =~ /^[\sA-IK-NP-Z.,()]+\*?$/i) || | ||||
853 | $line =~ /^>(?:P1|F1|DL|DC|RL|RC|N3|N1);/); | ||||
854 | } | ||||
855 | |||||
856 | =head2 _possibly_pfam | ||||
857 | |||||
858 | From bioperl test data. | ||||
859 | |||||
860 | =cut | ||||
861 | |||||
862 | sub _possibly_pfam | ||||
863 | { | ||||
864 | my ($line, $lineno) = (shift, shift); | ||||
865 | return ($line =~ m{^\w+/\d+-\d+\s+[A-IK-NP-Z.]+}i); | ||||
866 | } | ||||
867 | |||||
868 | =head2 _possibly_phylip | ||||
869 | |||||
870 | From "http://www.ebi.ac.uk/help/formats.html". Initial space | ||||
871 | allowed on first line (spotted in ensembl AlignView exported | ||||
872 | data). | ||||
873 | |||||
874 | =cut | ||||
875 | |||||
876 | sub _possibly_phylip | ||||
877 | { | ||||
878 | my ($line, $lineno) = (shift, shift); | ||||
879 | return (($lineno == 1 && $line =~ /^\s*\d+\s\d+/) || | ||||
880 | ($lineno == 2 && $line =~ /^\w\s+[A-IK-NP-Z\s]+/) || | ||||
881 | ($lineno == 3 && $line =~ /(?:^\w\s+[A-IK-NP-Z\s]+|\s+[A-IK-NP-Z\s]+)/) | ||||
882 | ); | ||||
883 | } | ||||
884 | |||||
885 | =head2 _possibly_prodom | ||||
886 | |||||
887 | From "http://prodom.prabi.fr/prodom/current/documentation/data.php". | ||||
888 | |||||
889 | =cut | ||||
890 | |||||
891 | sub _possibly_prodom | ||||
892 | { | ||||
893 | my ($line, $lineno) = (shift, shift); | ||||
894 | return ($lineno == 1 && $line =~ /^ID / && $line =~ /\d+ seq\.$/); | ||||
895 | } | ||||
896 | |||||
897 | =head2 _possibly_raw | ||||
898 | |||||
899 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
900 | |||||
901 | =cut | ||||
902 | |||||
903 | sub _possibly_raw | ||||
904 | { | ||||
905 | my ($line, $lineno) = (shift, shift); | ||||
906 | return ($line =~ /^[A-Za-z\s]+$/); | ||||
907 | } | ||||
908 | |||||
909 | =head2 _possibly_rsf | ||||
910 | |||||
911 | From "http://www.ebi.ac.uk/help/formats.html". | ||||
912 | |||||
913 | =cut | ||||
914 | |||||
915 | sub _possibly_rsf | ||||
916 | { | ||||
917 | my ($line, $lineno) = (shift, shift); | ||||
918 | return (($lineno == 1 && $line =~ /^!!RICH_SEQUENCE/) || | ||||
919 | $line =~ /^[{}]$/ || | ||||
920 | $line =~ /^(?:name|type|longname| | ||||
921 | checksum|creation-date|strand|sequence)/x); | ||||
922 | } | ||||
923 | |||||
924 | =head2 _possibly_selex | ||||
925 | |||||
926 | From "http://www.ebc.ee/WWW/hmmer2-html/node27.html". | ||||
927 | |||||
928 | Assuming presence of Selex file header. Data exported by | ||||
929 | Bioperl on Pfam and Selex formats are identical, but Pfam file | ||||
930 | only holds one alignment. | ||||
931 | |||||
932 | =cut | ||||
933 | |||||
934 | sub _possibly_selex | ||||
935 | { | ||||
936 | my ($line, $lineno) = (shift, shift); | ||||
937 | return (($lineno == 1 && $line =~ /^#=ID /) || | ||||
938 | ($lineno == 2 && $line =~ /^#=AC /) || | ||||
939 | ($line =~ /^#=SQ /)); | ||||
940 | } | ||||
941 | |||||
942 | =head2 _possibly_stockholm | ||||
943 | |||||
944 | From bioperl test data. | ||||
945 | |||||
946 | =cut | ||||
947 | |||||
948 | sub _possibly_stockholm | ||||
949 | { | ||||
950 | my ($line, $lineno) = (shift, shift); | ||||
951 | return (($lineno == 1 && $line =~ /^# STOCKHOLM/) || | ||||
952 | $line =~ /^#=(?:GF|GS) /); | ||||
953 | } | ||||
954 | |||||
- - | |||||
957 | =head2 _possibly_swiss | ||||
958 | |||||
959 | From "http://ca.expasy.org/sprot/userman.html#entrystruc". | ||||
960 | |||||
961 | =cut | ||||
962 | |||||
963 | sub _possibly_swiss | ||||
964 | { | ||||
965 | my ($line, $lineno) = (shift, shift); | ||||
966 | return ($lineno == 1 && $line =~ /^ID / && $line =~ /AA\.$/); | ||||
967 | } | ||||
968 | |||||
969 | =head2 _possibly_tab | ||||
970 | |||||
971 | Contributed by Heikki. | ||||
972 | |||||
973 | =cut | ||||
974 | |||||
975 | sub _possibly_tab | ||||
976 | { | ||||
977 | my ($line, $lineno) = (shift, shift); | ||||
978 | return ($lineno == 1 && $line =~ /^[^\t]+\t[^\t]+/) ; | ||||
979 | } | ||||
980 | |||||
981 | =head2 _possibly_vcf | ||||
982 | |||||
983 | From "http://www.1000genomes.org/wiki/analysis/vcf4.0". | ||||
984 | |||||
985 | Assumptions made about sanity - format and date lines are line 1 and 2 | ||||
986 | respectively. This is not specified in the format document. | ||||
987 | |||||
988 | =cut | ||||
989 | |||||
990 | sub _possibly_vcf | ||||
991 | { | ||||
992 | my ($line, $lineno) = (shift, shift); | ||||
993 | return (($lineno == 1 && $line =~ /##fileformat=VCFv/) || | ||||
994 | ($lineno == 2 && $line =~ /##fileDate=/)); | ||||
995 | } | ||||
996 | |||||
- - | |||||
999 | 1 | 48µs | 1; |