← Index
NYTProf Performance Profile   « block view • line view • sub view »
For bin/pan_genome_post_analysis
  Run on Fri Mar 27 11:43:32 2015
Reported on Fri Mar 27 11:45:28 2015

Filename/Users/ap13/perl5/lib/perl5/Bio/Tools/GuessSeqFormat.pm
StatementsExecuted 8 statements in 2.47ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
11113µs26µsBio::Tools::GuessSeqFormat::::BEGIN@247Bio::Tools::GuessSeqFormat::BEGIN@247
1118µs13µsBio::Tools::GuessSeqFormat::::BEGIN@248Bio::Tools::GuessSeqFormat::BEGIN@248
1118µs70µsBio::Tools::GuessSeqFormat::::BEGIN@251Bio::Tools::GuessSeqFormat::BEGIN@251
0000s0sBio::Tools::GuessSeqFormat::::_possibly_aceBio::Tools::GuessSeqFormat::_possibly_ace
0000s0sBio::Tools::GuessSeqFormat::::_possibly_blastBio::Tools::GuessSeqFormat::_possibly_blast
0000s0sBio::Tools::GuessSeqFormat::::_possibly_bowtieBio::Tools::GuessSeqFormat::_possibly_bowtie
0000s0sBio::Tools::GuessSeqFormat::::_possibly_clustalwBio::Tools::GuessSeqFormat::_possibly_clustalw
0000s0sBio::Tools::GuessSeqFormat::::_possibly_codataBio::Tools::GuessSeqFormat::_possibly_codata
0000s0sBio::Tools::GuessSeqFormat::::_possibly_emblBio::Tools::GuessSeqFormat::_possibly_embl
0000s0sBio::Tools::GuessSeqFormat::::_possibly_fastaBio::Tools::GuessSeqFormat::_possibly_fasta
0000s0sBio::Tools::GuessSeqFormat::::_possibly_fastqBio::Tools::GuessSeqFormat::_possibly_fastq
0000s0sBio::Tools::GuessSeqFormat::::_possibly_fastxyBio::Tools::GuessSeqFormat::_possibly_fastxy
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gameBio::Tools::GuessSeqFormat::_possibly_game
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gcgBio::Tools::GuessSeqFormat::_possibly_gcg
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gcgblastBio::Tools::GuessSeqFormat::_possibly_gcgblast
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gcgfastaBio::Tools::GuessSeqFormat::_possibly_gcgfasta
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gdeBio::Tools::GuessSeqFormat::_possibly_gde
0000s0sBio::Tools::GuessSeqFormat::::_possibly_genbankBio::Tools::GuessSeqFormat::_possibly_genbank
0000s0sBio::Tools::GuessSeqFormat::::_possibly_genscanBio::Tools::GuessSeqFormat::_possibly_genscan
0000s0sBio::Tools::GuessSeqFormat::::_possibly_gffBio::Tools::GuessSeqFormat::_possibly_gff
0000s0sBio::Tools::GuessSeqFormat::::_possibly_hmmerBio::Tools::GuessSeqFormat::_possibly_hmmer
0000s0sBio::Tools::GuessSeqFormat::::_possibly_maseBio::Tools::GuessSeqFormat::_possibly_mase
0000s0sBio::Tools::GuessSeqFormat::::_possibly_megaBio::Tools::GuessSeqFormat::_possibly_mega
0000s0sBio::Tools::GuessSeqFormat::::_possibly_msfBio::Tools::GuessSeqFormat::_possibly_msf
0000s0sBio::Tools::GuessSeqFormat::::_possibly_nexusBio::Tools::GuessSeqFormat::_possibly_nexus
0000s0sBio::Tools::GuessSeqFormat::::_possibly_pfamBio::Tools::GuessSeqFormat::_possibly_pfam
0000s0sBio::Tools::GuessSeqFormat::::_possibly_phrapBio::Tools::GuessSeqFormat::_possibly_phrap
0000s0sBio::Tools::GuessSeqFormat::::_possibly_phylipBio::Tools::GuessSeqFormat::_possibly_phylip
0000s0sBio::Tools::GuessSeqFormat::::_possibly_pirBio::Tools::GuessSeqFormat::_possibly_pir
0000s0sBio::Tools::GuessSeqFormat::::_possibly_prodomBio::Tools::GuessSeqFormat::_possibly_prodom
0000s0sBio::Tools::GuessSeqFormat::::_possibly_rawBio::Tools::GuessSeqFormat::_possibly_raw
0000s0sBio::Tools::GuessSeqFormat::::_possibly_rsfBio::Tools::GuessSeqFormat::_possibly_rsf
0000s0sBio::Tools::GuessSeqFormat::::_possibly_selexBio::Tools::GuessSeqFormat::_possibly_selex
0000s0sBio::Tools::GuessSeqFormat::::_possibly_stockholmBio::Tools::GuessSeqFormat::_possibly_stockholm
0000s0sBio::Tools::GuessSeqFormat::::_possibly_swissBio::Tools::GuessSeqFormat::_possibly_swiss
0000s0sBio::Tools::GuessSeqFormat::::_possibly_tabBio::Tools::GuessSeqFormat::_possibly_tab
0000s0sBio::Tools::GuessSeqFormat::::_possibly_vcfBio::Tools::GuessSeqFormat::_possibly_vcf
0000s0sBio::Tools::GuessSeqFormat::::fhBio::Tools::GuessSeqFormat::fh
0000s0sBio::Tools::GuessSeqFormat::::fileBio::Tools::GuessSeqFormat::file
0000s0sBio::Tools::GuessSeqFormat::::guessBio::Tools::GuessSeqFormat::guess
0000s0sBio::Tools::GuessSeqFormat::::newBio::Tools::GuessSeqFormat::new
0000s0sBio::Tools::GuessSeqFormat::::textBio::Tools::GuessSeqFormat::text
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1#------------------------------------------------------------------
2#
3# BioPerl module Bio::Tools::GuessSeqFormat
4#
5# Please direct questions and support issues to <bioperl-l@bioperl.org>
6#
7# Cared for by Andreas Kähäri, andreas.kahari@ebi.ac.uk
8#
9# You may distribute this module under the same terms as perl itself
10#------------------------------------------------------------------
11
12=encoding utf-8
13
14=head1 NAME
15
16Bio::Tools::GuessSeqFormat - Module for determining the sequence
17format of the contents of a file, a string, or through a
18filehandle.
19
20=head1 SYNOPSIS
21
22 # To guess the format of a flat file, given a filename:
23 my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename );
24 my $format = $guesser->guess;
25
26 # To guess the format from an already open filehandle:
27 my $guesser = Bio::Tools::GuessSeqFormat->new( -fh => $filehandle );
28 my $format = $guesser->guess;
29 # The filehandle will be returned to its original position. Note that this
30 # filehandle can be STDIN.
31
32 # To guess the format of one or several lines of text (with
33 # embedded newlines):
34 my $guesser = Bio::Tools::GuessSeqFormat->new( -text => $linesoftext );
35 my $format = $guesser->guess;
36
37 # To create a Bio::Tools::GuessSeqFormat object and set the
38 # filename, filehandle, or line to parse afterwards:
39 my $guesser = Bio::Tools::GuessSeqFormat->new();
40 $guesser->file($filename);
41 $guesser->fh($filehandle);
42 $guesser->text($linesoftext);
43
44 # To guess in one go, given e.g. a filename:
45 my $format = Bio::Tools::GuessSeqFormat->new( -file => $filename )->guess;
46
47=head1 DESCRIPTION
48
49Bio::Tools::GuessSeqFormat tries to guess the format ("swiss",
50"pir", "fasta" etc.) of the sequence or MSA in a file, in a
51scalar, or through a filehandle.
52
53The guess() method of a Bio::Tools::GuessSeqFormat object will
54examine the data, line by line, until it finds a line to which
55only one format can be assigned. If no conclusive guess can be
56made, undef is returned.
57
58If the Bio::Tools::GuessSeqFormat object is given a filehandle,
59e.g. STDIN, it will be restored to its original position on
60return from the guess() method.
61
62=head2 Formats
63
64Tests are currently implemented for the following formats:
65
66=over
67
68=item *
69
70ACeDB ("ace")
71
72=item *
73
74Blast ("blast")
75
76=item *
77
78ClustalW ("clustalw")
79
80=item *
81
82Codata ("codata")
83
84=item *
85
86EMBL ("embl")
87
88=item *
89
90FastA sequence ("fasta")
91
92=item *
93
94FastQ sequence ("fastq")
95
96=item *
97
98FastXY/FastA alignment ("fastxy")
99
100=item *
101
102Game XML ("game")
103
104=item *
105
106GCG ("gcg")
107
108=item *
109
110GCG Blast ("gcgblast")
111
112=item *
113
114GCG FastA ("gcgfasta")
115
116=item *
117
118GDE ("gde")
119
120=item *
121
122Genbank ("genbank")
123
124=item *
125
126Genscan ("genscan")
127
128=item *
129
130GFF ("gff")
131
132=item *
133
134HMMER ("hmmer")
135
136=item *
137
138PAUP/NEXUS ("nexus")
139
140=item *
141
142Phrap assembly file ("phrap")
143
144=item *
145
146NBRF/PIR ("pir")
147
148=item *
149
150Mase ("mase")
151
152=item *
153
154Mega ("mega")
155
156=item *
157
158GCG/MSF ("msf")
159
160=item *
161
162Pfam ("pfam")
163
164=item *
165
166Phylip ("phylip")
167
168=item *
169
170Prodom ("prodom")
171
172=item *
173
174Raw ("raw")
175
176=item *
177
178RSF ("rsf")
179
180=item *
181
182Selex ("selex")
183
184=item *
185
186Stockholm ("stockholm")
187
188=item *
189
190Swissprot ("swiss")
191
192=item *
193
194Tab ("tab")
195
196=item *
197
198Variant Call Format ("vcf")
199
200=back
201
202=head1 FEEDBACK
203
204=head2 Mailing Lists
205
206User feedback is an integral part of the evolution of this and
207other Bioperl modules. Send your comments and suggestions
208preferably to one of the Bioperl mailing lists. Your
209participation is much appreciated.
210
211 bioperl-l@bioperl.org - General discussion
212 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
213
214=head2 Support
215
216Please direct usage questions or support issues to the mailing list:
217
218I<bioperl-l@bioperl.org>
219
220rather than to the module maintainer directly. Many experienced and
221reponsive experts will be able look at the problem and quickly
222address it. Please include a thorough description of the problem
223with code and data examples if at all possible.
224
225=head2 Reporting Bugs
226
227Report bugs to the Bioperl bug tracking system to help us
228keep track the bugs and their resolution. Bug reports can be
229submitted via the web:
230
231 https://github.com/bioperl/bioperl-live/issues
232
233=head1 AUTHOR
234
235Andreas KE<228>hE<228>ri, andreas.kahari@ebi.ac.uk
236
237=head1 CONTRIBUTORS
238
239Heikki LehvE<228>slaiho, heikki-at-bioperl-dot-org
240Mark A. Jensen, maj-at-fortinbras-dot-us
241
242=cut
243
244
245package Bio::Tools::GuessSeqFormat;
246
247224µs239µs
# spent 26µs (13+13) within Bio::Tools::GuessSeqFormat::BEGIN@247 which was called: # once (13µs+13µs) by Bio::SeqIO::BEGIN@332 at line 247
use strict;
# spent 26µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@247 # spent 13µs making 1 call to strict::import
248224µs218µs
# spent 13µs (8+5) within Bio::Tools::GuessSeqFormat::BEGIN@248 which was called: # once (8µs+5µs) by Bio::SeqIO::BEGIN@332 at line 248
use warnings;
# spent 13µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@248 # spent 5µs making 1 call to warnings::import
249
250
25122.35ms2132µs
# spent 70µs (8+62) within Bio::Tools::GuessSeqFormat::BEGIN@251 which was called: # once (8µs+62µs) by Bio::SeqIO::BEGIN@332 at line 251
use base qw(Bio::Root::Root);
# spent 70µs making 1 call to Bio::Tools::GuessSeqFormat::BEGIN@251 # spent 62µs making 1 call to base::import
252
253=head1 METHODS
254
255Methods available to Bio::Tools::GuessSeqFormat objects
256are described below. Methods with names beginning with an
257underscore are considered to be internal.
258
259=cut
260
261=head2 new
262
263 Title : new
264 Usage : $guesser = Bio::Tools::GuessSeqFormat->new( ... );
265 Function : Creates a new object.
266 Example : See SYNOPSIS.
267 Returns : A new object.
268 Arguments : -file The filename of the file whose format is to
269 be guessed, e.g. STDIN, or
270 -fh An already opened filehandle from which a text
271 stream may be read, or
272 -text A scalar containing one or several lines of
273 text with embedded newlines.
274
275 If more than one of the above arguments are given, they
276 are tested in the order -text, -file, -fh, and the first
277 available argument will be used.
278
279=cut
280
281sub new
282{
283 my $class = shift;
284 my @args = @_;
285
286 my $self = $class->SUPER::new(@args);
287
288 my $attr;
289 my $value;
290
291 while (@args) {
292 $attr = shift @args;
293 $attr = lc $attr;
294 $value = shift @args;
295 $self->{$attr} = $value;
296 }
297
298 return $self;
299}
300
301=head2 file
302
303 Title : file
304 Usage : $guesser->file($filename);
305 $filename = $guesser->file;
306 Function : Gets or sets the current filename associated with
307 an object.
308 Returns : The new filename.
309 Arguments : The filename of the file whose format is to be
310 guessed.
311
312 A call to this method will clear the current filehandle and
313 the current lines of text associated with the object.
314
315=cut
316
317sub file
318{
319 # Sets and/or returns the filename to use.
320 my $self = shift;
321 my $file = shift;
322
323 if (defined $file) {
324 # Set the active filename, and clear the filehandle and
325 # text line, if present.
326 $self->{-file} = $file;
327 $self->{-fh} = $self->{-text} = undef;
328 }
329
330 return $self->{-file};
331}
332
333=head2 fh
334
335 Title : fh
336 Usage : $guesser->fh($filehandle);
337 $filehandle = $guesser->fh;
338 Function : Gets or sets the current filehandle associated with
339 an object.
340 Returns : The new filehandle.
341 Arguments : An already opened filehandle from which a text
342 stream may be read.
343
344 A call to this method will clear the current filename and
345 the current lines of text associated with the object.
346
347=cut
348
349sub fh
350{
351 # Sets and/or returns the filehandle to use.
352 my $self = shift;
353 my $fh = shift;
354
355 if (defined $fh) {
356 # Set the active filehandle, and clear the filename and
357 # text line, if present.
358 $self->{-fh} = $fh;
359 $self->{-file} = $self->{-text} = undef;
360 }
361
362 return $self->{-fh};
363}
364
365
366=head2 text
367
368 Title : text
369 Usage : $guesser->text($linesoftext);
370 $linesofext = $guesser->text;
371 Function : Gets or sets the current text associated with an
372 object.
373 Returns : The new lines of texts.
374 Arguments : A scalar containing one or several lines of text,
375 including embedded newlines.
376
377 A call to this method will clear the current filename and
378 the current filehandle associated with the object.
379
380=cut
381
382sub text
383{
384 # Sets and/or returns the text lines to use.
385 my $self = shift;
386 my $text = shift;
387
388 if (defined $text) {
389 # Set the active text lines, and clear the filehandle
390 # and filename, if present.
391 $self->{-text} = $text;
392 $self->{-fh} = $self->{-file} = undef;
393 }
394
395 return $self->{-text};
396}
397
398=head2 guess
399
400 Title : guess
401 Usage : $format = $guesser->guess;
402 @format = $guesser->guess; # if given a line of text
403 Function : Guesses the format of the data accociated with the
404 object.
405 Returns : A format string such as "swiss" or "pir". If a
406 format can not be found, undef is returned.
407 Arguments : None.
408
409 If the object is associated with a filehandle, the position
410 of the filehandle will be returned to its original position
411 before the method returns.
412
413=cut
414
415126µsour %formats = (
416 ace => { test => \&_possibly_ace },
417 blast => { test => \&_possibly_blast },
418 bowtie => { test => \&_possibly_bowtie },
419 clustalw => { test => \&_possibly_clustalw },
420 codata => { test => \&_possibly_codata },
421 embl => { test => \&_possibly_embl },
422 fasta => { test => \&_possibly_fasta },
423 fastq => { test => \&_possibly_fastq },
424 fastxy => { test => \&_possibly_fastxy },
425 game => { test => \&_possibly_game },
426 gcg => { test => \&_possibly_gcg },
427 gcgblast => { test => \&_possibly_gcgblast },
428 gcgfasta => { test => \&_possibly_gcgfasta },
429 gde => { test => \&_possibly_gde },
430 genbank => { test => \&_possibly_genbank },
431 genscan => { test => \&_possibly_genscan },
432 gff => { test => \&_possibly_gff },
433 hmmer => { test => \&_possibly_hmmer },
434 nexus => { test => \&_possibly_nexus },
435 mase => { test => \&_possibly_mase },
436 mega => { test => \&_possibly_mega },
437 msf => { test => \&_possibly_msf },
438 pfam => { test => \&_possibly_pfam },
439 phrap => { test => \&_possibly_phrap },
440 phylip => { test => \&_possibly_phylip },
441 pir => { test => \&_possibly_pir },
442 prodom => { test => \&_possibly_prodom },
443 raw => { test => \&_possibly_raw },
444 rsf => { test => \&_possibly_rsf },
445 selex => { test => \&_possibly_selex },
446 stockholm => { test => \&_possibly_stockholm },
447 swiss => { test => \&_possibly_swiss },
448 tab => { test => \&_possibly_tab },
449 vcf => { test => \&_possibly_vcf },
450);
451
452sub guess
453{
454 my $self = shift;
455
456 while (my ($fmt_key) = each (%formats)) {
457 $formats{$fmt_key}{fmt_string} = $fmt_key;
458 }
459
460 my $fh;
461 my $start_pos;
462 if (defined $self->{-text}) {
463 # Break the text into separate lines.
464 my $text = $self->{-text};
465 open $fh, '<', \$text or $self->throw("Could not read from string: $!");
466
467 } elsif (defined $self->{-file}) {
468 # If given a filename, open the file.
469 my $file = $self->{-file};
470 open $fh, '<', $file or $self->throw("Could not read file '$file': $!");
471
472 } elsif (defined $self->{-fh}) {
473 # If given a filehandle, get the current position in the stream.
474 $fh = $self->{-fh};
475 if (not seek $fh, 0, 1) { # seek to current position to determine seekability
476 # Work around non-seekable filehandles if IO::Scalar is available
477 # (adapted from http://www.perlmonks.org/?node_id=33587)
478 # IO::Mark may be an option for very large streams?
479 $self->throw("Need IO::Scalar to guess from unseekable filehandles")
480 if not eval { require IO::Scalar };
481 my $data;
482 { local $/; $data = <$fh>; $.-- }; # copy raw data from fh
483 tie *$fh, 'IO::Scalar', my $s; # replace fh by scalar-tied fh
484 print $fh $data; # write raw data to tied fh
485 seek $fh, 0, 0; # return to start of tied fh
486 }
487 $start_pos = tell $fh;
488 }
489
490 my $done = 0;
491 my $lineno = 0;
492 my $guess;
493 while (!$done) {
494 my $line; # The next line of the file.
495 my $match = 0; # Number of possible formats of this line.
496
497 last if (!defined($line = <$fh>));
498 next if ($line =~ /^\s*$/); # Skip white and empty lines.
499 chomp $line;
500 $line =~ s/\r$//; # Fix for DOS files on Unix.
501 ++$lineno;
502
503 while (my ($fmt_key, $fmt) = each (%formats)) {
504 if ($fmt->{test}($line, $lineno)) {
505 ++$match;
506 $guess = $fmt->{fmt_string};
507 }
508 }
509
510 # We're done if there was only one match.
511 $done = ($match == 1);
512 }
513
514 if (defined $self->{-fh}) {
515 # Go back to original position in filehandle
516 seek $fh, $start_pos, 0 or $self->throw("Could not reset filehandle $fh: $!");
517 } else {
518 # Close the filehandle we opened
519 close $fh;
520 }
521 return ($done ? $guess : undef);
522}
523
524=head1 HELPER SUBROUTINES
525
526All helper subroutines will, given a line of text and the line
527number of the same line, return 1 if the line possibly is from a
528file of the type that they perform a test of.
529
530A zero return value does not mean that the line is not part
531of a certain type of file, just that the test did not find any
532characteristics of that type of file in the line.
533
534=head2 _possibly_ace
535
536From bioperl test data, and from
537"http://www.isrec.isb-sib.ch/DEA/module8/B_Stevenson/Practicals/transcriptome_recon/transcriptome_recon.html".
538
539=cut
540
541sub _possibly_ace
542{
543 my ($line, $lineno) = (shift, shift);
544 return ($line =~ /^(?:Sequence|Peptide|DNA|Protein) [":]/);
545}
546
547=head2 _possibly_blast
548
549 From various blast results.
550
551=cut
552
553sub _possibly_blast
554{
555 my ($line, $lineno) = (shift, shift);
556 return ($lineno == 1 &&
557 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/);
558}
559
560=head2 _possibly_bowtie
561
562Contributed by kortsch.
563
564=cut
565
566sub _possibly_bowtie
567{
568 my ($line, $lineno) = (shift, shift);
569 return ($line =~ /^[[:graph:]]+\t[-+]\t[[:graph:]]+\t\d+\t([[:alpha:]]+)\t([[:graph:]]+)\t\d+\t[[:graph:]]?/)
570 && length($1)==length($2);
571}
572
573=head2 _possibly_clustalw
574
575From "http://www.ebi.ac.uk/help/formats.html".
576
577=cut
578
579sub _possibly_clustalw
580{
581 my ($line, $lineno) = (shift, shift);
582 return ($lineno == 1 && $line =~ /CLUSTAL/);
583}
584
585=head2 _possibly_codata
586
587From "http://www.ebi.ac.uk/help/formats.html".
588
589=cut
590
591sub _possibly_codata
592{
593 my ($line, $lineno) = (shift, shift);
594 return (($lineno == 1 && $line =~ /^ENTRY/) ||
595 ($lineno == 2 && $line =~ /^SEQUENCE/) ||
596 $line =~ m{^(?:ENTRY|SEQUENCE|///)});
597}
598
599=head2 _possibly_embl
600
601From
602"http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3.3".
603
604=cut
605
606sub _possibly_embl
607{
608 my ($line, $lineno) = (shift, shift);
609 return ($lineno == 1 && $line =~ /^ID / && $line =~ /BP\.$/);
610}
611
612=head2 _possibly_fasta
613
614From "http://www.ebi.ac.uk/help/formats.html".
615
616=cut
617
618sub _possibly_fasta
619{
620 my ($line, $lineno) = (shift, shift);
621 return (($lineno != 1 && $line =~ /^[A-IK-NP-Z]+$/i) ||
622 $line =~ /^>\s*\w/);
623}
624
625=head2 _possibly_fastq
626
627From bioperl test data.
628
629=cut
630
631sub _possibly_fastq
632{
633 my ($line, $lineno) = (shift, shift);
634 return ( ($lineno == 1 && $line =~ /^@/) ||
635 ($lineno == 3 && $line =~ /^\+/) );
636}
637
638=head2 _possibly_fastxy
639
640From bioperl test data.
641
642=cut
643
644sub _possibly_fastxy
645{
646 my ($line, $lineno) = (shift, shift);
647 return (($lineno == 1 && $line =~ /^ FAST(?:XY|A)/) ||
648 ($lineno == 2 && $line =~ /^ version \d/));
649}
650
651=head2 _possibly_game
652
653From bioperl testdata.
654
655=cut
656
657sub _possibly_game
658{
659 my ($line, $lineno) = (shift, shift);
660 return ($line =~ /^<!DOCTYPE game/);
661}
662
663=head2 _possibly_gcg
664
665From bioperl, Bio::SeqIO::gcg.
666
667=cut
668
669sub _possibly_gcg
670{
671 my ($line, $lineno) = (shift, shift);
672 return ($line =~ /Length: .*Type: .*Check: .*\.\.$/);
673}
674
675=head2 _possibly_gcgblast
676
677From bioperl testdata.
678
679=cut
680
681sub _possibly_gcgblast
682{
683 my ($line, $lineno) = (shift, shift);
684 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) ||
685 ($lineno == 2 &&
686 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/));
687}
688
689=head2 _possibly_gcgfasta
690
691From bioperl testdata.
692
693=cut
694
695sub _possibly_gcgfasta
696{
697 my ($line, $lineno) = (shift, shift);
698 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) ||
699 ($lineno == 2 && $line =~ /FASTA/));
700}
701
702=head2 _possibly_gde
703
704From "http://www.ebi.ac.uk/help/formats.html".
705
706=cut
707
708sub _possibly_gde
709{
710 my ($line, $lineno) = (shift, shift);
711 return ($line =~ /^[{}]$/ ||
712 $line =~ /^(?:name|longname|sequence-ID|
713 creation-date|direction|strandedness|
714 type|offset|group-ID|creator|descrip|
715 comment|sequence)/x);
716}
717
718=head2 _possibly_genbank
719
720From "http://www.ebi.ac.uk/help/formats.html".
721Format of [apparantly optional] file header from
722"http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm". (TODO: dead link)
723
724=cut
725
726sub _possibly_genbank
727{
728 my ($line, $lineno) = (shift, shift);
729 return (($lineno == 1 && $line =~ /GENETIC SEQUENCE DATA BANK/) ||
730 ($lineno == 1 && $line =~ /^LOCUS /) ||
731 ($lineno == 2 && $line =~ /^DEFINITION /) ||
732 ($lineno == 3 && $line =~ /^ACCESSION /));
733}
734
735=head2 _possibly_genscan
736
737From bioperl test data.
738
739=cut
740
741sub _possibly_genscan
742{
743 my ($line, $lineno) = (shift, shift);
744 return (($lineno == 1 && $line =~ /^GENSCAN.*Date.*Time/) ||
745 ($line =~ /^(?:Sequence\s+\w+|Parameter matrix|Predicted genes)/));
746}
747
748=head2 _possibly_gff
749
750From bioperl test data.
751
752=cut
753
754sub _possibly_gff
755{
756 my ($line, $lineno) = (shift, shift);
757 return (($lineno == 1 && $line =~ /^##gff-version/) ||
758 ($lineno == 2 && $line =~ /^##date/));
759}
760
761=head2 _possibly_hmmer
762
763From bioperl test data.
764
765=cut
766
767sub _possibly_hmmer
768{
769 my ($line, $lineno) = (shift, shift);
770 return (($lineno == 2 && $line =~ /^HMMER/) ||
771 ($lineno == 3 &&
772 $line =~ /Washington University School of Medicine/));
773}
774
775=head2 _possibly_nexus
776
777From "http://paup.csit.fsu.edu/nfiles.html".
778
779=cut
780
781sub _possibly_nexus
782{
783 my ($line, $lineno) = (shift, shift);
784 return ($lineno == 1 && $line =~ /^#NEXUS/);
785}
786
787=head2 _possibly_mase
788
789From bioperl test data.
790More detail from "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm" (TODO: dead link)
791
792=cut
793
794sub _possibly_mase
795{
796 my ($line, $lineno) = (shift, shift);
797 return (($lineno == 1 && $line =~ /^;;/) ||
798 ($lineno > 1 && $line =~ /^;[^;]?/));
799}
800
801=head2 _possibly_mega
802
803From the ensembl broswer (AlignView data export).
804
805=cut
806
807sub _possibly_mega
808{
809 my ($line, $lineno) = (shift, shift);
810 return ($lineno == 1 && $line =~ /^#mega$/);
811}
812
813
814=head2 _possibly_msf
815
816From "http://www.ebi.ac.uk/help/formats.html".
817
818=cut
819
820sub _possibly_msf
821{
822 my ($line, $lineno) = (shift, shift);
823 return ($line =~ m{^//} ||
824 $line =~ /MSF:.*Type:.*Check:|Name:.*Len:/);
825}
826
827=head2 _possibly_phrap
828
829From "http://biodata.ccgb.umn.edu/docs/contigimage.html". (TODO: dead link)
830From "http://genetics.gene.cwru.edu/gene508/Lec6.htm". (TODO: dead link)
831From bioperl test data ("*.ace.1" files).
832
833=cut
834
835sub _possibly_phrap
836{
837 my ($line, $lineno) = (shift, shift);
838 return ($line =~ /^(?:AS\ |CO\ Contig|BQ|AF\ |BS\ |RD\ |
839 QA\ |DS\ |RT\{)/x);
840}
841
842=head2 _possibly_pir
843
844From "http://www.ebi.ac.uk/help/formats.html".
845The ".,()" spotted in bioperl test data.
846
847=cut
848
849sub _possibly_pir # "NBRF/PIR" (?)
850{
851 my ($line, $lineno) = (shift, shift);
852 return (($lineno != 1 && $line =~ /^[\sA-IK-NP-Z.,()]+\*?$/i) ||
853 $line =~ /^>(?:P1|F1|DL|DC|RL|RC|N3|N1);/);
854}
855
856=head2 _possibly_pfam
857
858From bioperl test data.
859
860=cut
861
862sub _possibly_pfam
863{
864 my ($line, $lineno) = (shift, shift);
865 return ($line =~ m{^\w+/\d+-\d+\s+[A-IK-NP-Z.]+}i);
866}
867
868=head2 _possibly_phylip
869
870From "http://www.ebi.ac.uk/help/formats.html". Initial space
871allowed on first line (spotted in ensembl AlignView exported
872data).
873
874=cut
875
876sub _possibly_phylip
877{
878 my ($line, $lineno) = (shift, shift);
879 return (($lineno == 1 && $line =~ /^\s*\d+\s\d+/) ||
880 ($lineno == 2 && $line =~ /^\w\s+[A-IK-NP-Z\s]+/) ||
881 ($lineno == 3 && $line =~ /(?:^\w\s+[A-IK-NP-Z\s]+|\s+[A-IK-NP-Z\s]+)/)
882 );
883}
884
885=head2 _possibly_prodom
886
887From "http://prodom.prabi.fr/prodom/current/documentation/data.php".
888
889=cut
890
891sub _possibly_prodom
892{
893 my ($line, $lineno) = (shift, shift);
894 return ($lineno == 1 && $line =~ /^ID / && $line =~ /\d+ seq\.$/);
895}
896
897=head2 _possibly_raw
898
899From "http://www.ebi.ac.uk/help/formats.html".
900
901=cut
902
903sub _possibly_raw
904{
905 my ($line, $lineno) = (shift, shift);
906 return ($line =~ /^[A-Za-z\s]+$/);
907}
908
909=head2 _possibly_rsf
910
911From "http://www.ebi.ac.uk/help/formats.html".
912
913=cut
914
915sub _possibly_rsf
916{
917 my ($line, $lineno) = (shift, shift);
918 return (($lineno == 1 && $line =~ /^!!RICH_SEQUENCE/) ||
919 $line =~ /^[{}]$/ ||
920 $line =~ /^(?:name|type|longname|
921 checksum|creation-date|strand|sequence)/x);
922}
923
924=head2 _possibly_selex
925
926From "http://www.ebc.ee/WWW/hmmer2-html/node27.html".
927
928Assuming presence of Selex file header. Data exported by
929Bioperl on Pfam and Selex formats are identical, but Pfam file
930only holds one alignment.
931
932=cut
933
934sub _possibly_selex
935{
936 my ($line, $lineno) = (shift, shift);
937 return (($lineno == 1 && $line =~ /^#=ID /) ||
938 ($lineno == 2 && $line =~ /^#=AC /) ||
939 ($line =~ /^#=SQ /));
940}
941
942=head2 _possibly_stockholm
943
944From bioperl test data.
945
946=cut
947
948sub _possibly_stockholm
949{
950 my ($line, $lineno) = (shift, shift);
951 return (($lineno == 1 && $line =~ /^# STOCKHOLM/) ||
952 $line =~ /^#=(?:GF|GS) /);
953}
954
- -
957=head2 _possibly_swiss
958
959From "http://ca.expasy.org/sprot/userman.html#entrystruc".
960
961=cut
962
963sub _possibly_swiss
964{
965 my ($line, $lineno) = (shift, shift);
966 return ($lineno == 1 && $line =~ /^ID / && $line =~ /AA\.$/);
967}
968
969=head2 _possibly_tab
970
971Contributed by Heikki.
972
973=cut
974
975sub _possibly_tab
976{
977 my ($line, $lineno) = (shift, shift);
978 return ($lineno == 1 && $line =~ /^[^\t]+\t[^\t]+/) ;
979}
980
981=head2 _possibly_vcf
982
983From "http://www.1000genomes.org/wiki/analysis/vcf4.0".
984
985Assumptions made about sanity - format and date lines are line 1 and 2
986respectively. This is not specified in the format document.
987
988=cut
989
990sub _possibly_vcf
991{
992 my ($line, $lineno) = (shift, shift);
993 return (($lineno == 1 && $line =~ /##fileformat=VCFv/) ||
994 ($lineno == 2 && $line =~ /##fileDate=/));
995}
996
- -
999148µs1;