Filename | /Users/ap13/perl5/lib/perl5/Bio/PrimarySeq.pm |
Statements | Executed 8 statements in 3.43ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 16µs | 32µs | BEGIN@128 | Bio::PrimarySeq::
1 | 1 | 1 | 12µs | 15.9ms | BEGIN@133 | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | _guess_alphabet | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | _guess_alphabet_from_string | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | _set_seq_by_ref | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | accession | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | accession_number | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | alphabet | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | authority | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | can_call_new | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | desc | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | description | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | display_id | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | display_name | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | id | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | is_circular | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | length | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | namespace | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | new | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | object_id | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | primary_id | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | seq | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | subseq | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | validate_seq | Bio::PrimarySeq::
0 | 0 | 0 | 0s | 0s | version | Bio::PrimarySeq::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # | ||||
2 | # bioperl module for Bio::PrimarySeq | ||||
3 | # | ||||
4 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
5 | # | ||||
6 | # Cared for by Ewan Birney <birney@ebi.ac.uk> | ||||
7 | # | ||||
8 | # Copyright Ewan Birney | ||||
9 | # | ||||
10 | # You may distribute this module under the same terms as perl itself | ||||
11 | |||||
12 | # POD documentation - main docs before the code | ||||
13 | |||||
14 | =head1 NAME | ||||
15 | |||||
16 | Bio::PrimarySeq - Bioperl lightweight sequence object | ||||
17 | |||||
18 | =head1 SYNOPSIS | ||||
19 | |||||
20 | # Bio::SeqIO for file reading, Bio::DB::GenBank for | ||||
21 | # database reading | ||||
22 | |||||
23 | use Bio::Seq; | ||||
24 | use Bio::SeqIO; | ||||
25 | use Bio::DB::GenBank; | ||||
26 | |||||
27 | # make from memory | ||||
28 | |||||
29 | $seqobj = Bio::PrimarySeq->new ( | ||||
30 | -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG', | ||||
31 | -id => 'GeneFragment-12', | ||||
32 | -accession_number => 'X78121', | ||||
33 | -alphabet => 'dna', | ||||
34 | -is_circular => 1, | ||||
35 | ); | ||||
36 | print "Sequence ", $seqobj->id(), " with accession ", | ||||
37 | $seqobj->accession_number, "\n"; | ||||
38 | |||||
39 | # read from file | ||||
40 | |||||
41 | $inputstream = Bio::SeqIO->new( | ||||
42 | -file => "myseq.fa", | ||||
43 | -format => 'Fasta', | ||||
44 | ); | ||||
45 | $seqobj = $inputstream->next_seq(); | ||||
46 | print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n"; | ||||
47 | |||||
48 | # to get out parts of the sequence. | ||||
49 | |||||
50 | print "Sequence ", $seqobj->id(), " with accession ", | ||||
51 | $seqobj->accession_number, " and desc ", $seqobj->desc, "\n"; | ||||
52 | |||||
53 | $string = $seqobj->seq(); | ||||
54 | $string2 = $seqobj->subseq(1,40); | ||||
55 | |||||
56 | =head1 DESCRIPTION | ||||
57 | |||||
58 | PrimarySeq is a lightweight sequence object, storing the sequence, its | ||||
59 | name, a computer-useful unique name, and other fundamental attributes. | ||||
60 | It does not contain sequence features or other information. To have a | ||||
61 | sequence with sequence features you should use the Seq object which uses | ||||
62 | this object. | ||||
63 | |||||
64 | Although new users will use Bio::PrimarySeq a lot, in general you will | ||||
65 | be using it from the Bio::Seq object. For more information on Bio::Seq | ||||
66 | see L<Bio::Seq>. For interest you might like to know that | ||||
67 | Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls | ||||
68 | to do with sequence to it (the has-a relationship lets us get out of a | ||||
69 | otherwise nasty cyclical reference in Perl which would leak memory). | ||||
70 | |||||
71 | Sequence objects are defined by the Bio::PrimarySeqI interface, and this | ||||
72 | object is a pure Perl implementation of the interface. If that's | ||||
73 | gibberish to you, don't worry. The take home message is that this | ||||
74 | object is the bioperl default sequence object, but other people can | ||||
75 | use their own objects as sequences if they so wish. If you are | ||||
76 | interested in wrapping your own objects as compliant Bioperl sequence | ||||
77 | objects, then you should read the Bio::PrimarySeqI documentation | ||||
78 | |||||
79 | The documentation of this object is a merge of the Bio::PrimarySeq and | ||||
80 | Bio::PrimarySeqI documentation. This allows all the methods which you can | ||||
81 | call on sequence objects here. | ||||
82 | |||||
83 | =head1 FEEDBACK | ||||
84 | |||||
85 | =head2 Mailing Lists | ||||
86 | |||||
87 | User feedback is an integral part of the evolution of this and other | ||||
88 | Bioperl modules. Send your comments and suggestions preferably to one | ||||
89 | of the Bioperl mailing lists. Your participation is much appreciated. | ||||
90 | |||||
91 | bioperl-l@bioperl.org - General discussion | ||||
92 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
93 | |||||
94 | =head2 Support | ||||
95 | |||||
96 | Please direct usage questions or support issues to the mailing list: | ||||
97 | |||||
98 | I<bioperl-l@bioperl.org> | ||||
99 | |||||
100 | rather than to the module maintainer directly. Many experienced and | ||||
101 | reponsive experts will be able look at the problem and quickly | ||||
102 | address it. Please include a thorough description of the problem | ||||
103 | with code and data examples if at all possible. | ||||
104 | |||||
105 | =head2 Reporting Bugs | ||||
106 | |||||
107 | Report bugs to the Bioperl bug tracking system to help us keep track | ||||
108 | the bugs and their resolution. Bug reports can be submitted via the | ||||
109 | web: | ||||
110 | |||||
111 | https://github.com/bioperl/bioperl-live/issues | ||||
112 | |||||
113 | =head1 AUTHOR - Ewan Birney | ||||
114 | |||||
115 | Email birney@ebi.ac.uk | ||||
116 | |||||
117 | =head1 APPENDIX | ||||
118 | |||||
119 | The rest of the documentation details each of the object | ||||
120 | methods. Internal methods are usually preceded with a _ | ||||
121 | |||||
122 | =cut | ||||
123 | |||||
- - | |||||
126 | package Bio::PrimarySeq; | ||||
127 | |||||
128 | 2 | 55µs | 2 | 48µs | # spent 32µs (16+16) within Bio::PrimarySeq::BEGIN@128 which was called:
# once (16µs+16µs) by base::import at line 128 # spent 32µs making 1 call to Bio::PrimarySeq::BEGIN@128
# spent 16µs making 1 call to strict::import |
129 | |||||
130 | 1 | 2µs | our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~'; | ||
131 | 1 | 300ns | our $GAP_SYMBOLS = '-~'; | ||
132 | |||||
133 | 2 | 8µs | # spent 15.9ms (12µs+15.9) within Bio::PrimarySeq::BEGIN@133 which was called:
# once (12µs+15.9ms) by base::import at line 134 | ||
134 | 1 | 3.35ms | 2 | 15.9ms | Bio::IdentifiableI Bio::DescribableI); # spent 15.9ms making 1 call to Bio::PrimarySeq::BEGIN@133
# spent 15.9ms making 1 call to base::import, recursion: max depth 1, sum of overlapping time 15.9ms |
135 | |||||
136 | |||||
137 | # Setup the allowed values for alphabet() | ||||
138 | 1 | 9µs | my %valid_type = map {$_, 1} qw( dna rna protein ); | ||
139 | |||||
140 | |||||
141 | =head2 new | ||||
142 | |||||
143 | Title : new | ||||
144 | Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', | ||||
145 | -id => 'human_id', | ||||
146 | -accession_number => 'AL000012', | ||||
147 | ); | ||||
148 | Function: Returns a new primary seq object from | ||||
149 | basic constructors, being a string for the sequence | ||||
150 | and strings for id and accession_number. | ||||
151 | |||||
152 | Note that you can provide an empty sequence string. However, in | ||||
153 | this case you MUST specify the type of sequence you wish to | ||||
154 | initialize by the parameter -alphabet. See alphabet() for possible | ||||
155 | values. | ||||
156 | Returns : a new Bio::PrimarySeq object | ||||
157 | Args : -seq => sequence string | ||||
158 | -ref_to_seq => ... or reference to a sequence string | ||||
159 | -display_id => display id of the sequence (locus name) | ||||
160 | -accession_number => accession number | ||||
161 | -primary_id => primary id (Genbank id) | ||||
162 | -version => version number | ||||
163 | -namespace => the namespace for the accession | ||||
164 | -authority => the authority for the namespace | ||||
165 | -description => description text | ||||
166 | -desc => alias for description | ||||
167 | -alphabet => skip alphabet guess and set it to dna, rna or protein | ||||
168 | -id => alias for display id | ||||
169 | -is_circular => boolean to indicate that sequence is circular | ||||
170 | -direct => boolean to directly set sequences. The next time -seq, | ||||
171 | seq() or -ref_to_seq is use, the sequence will not be | ||||
172 | validated. Be careful with this... | ||||
173 | -nowarnonempty => boolean to avoid warning when sequence is empty | ||||
174 | |||||
175 | =cut | ||||
176 | |||||
177 | sub new { | ||||
178 | my ($class, @args) = @_; | ||||
179 | my $self = $class->SUPER::new(@args); | ||||
180 | my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description, | ||||
181 | $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len, | ||||
182 | $nowarnonempty) = | ||||
183 | $self->_rearrange([qw(SEQ | ||||
184 | DISPLAY_ID | ||||
185 | ACCESSION_NUMBER | ||||
186 | PRIMARY_ID | ||||
187 | NAMESPACE | ||||
188 | AUTHORITY | ||||
189 | VERSION | ||||
190 | OBJECT_ID | ||||
191 | DESC | ||||
192 | DESCRIPTION | ||||
193 | ALPHABET | ||||
194 | ID | ||||
195 | IS_CIRCULAR | ||||
196 | DIRECT | ||||
197 | REF_TO_SEQ | ||||
198 | LENGTH | ||||
199 | NOWARNONEMPTY | ||||
200 | )], | ||||
201 | @args); | ||||
202 | |||||
203 | # Private var _nowarnonempty, needs to be set before calling _guess_alphabet | ||||
204 | $self->{'_nowarnonempty'} = $nowarnonempty; | ||||
205 | $self->{'_direct'} = $direct; | ||||
206 | |||||
207 | if( defined $id && defined $given_id ) { | ||||
208 | if( $id ne $given_id ) { | ||||
209 | $self->throw("Provided both id and display_id constructors: [$id] [$given_id]"); | ||||
210 | } | ||||
211 | } | ||||
212 | if( defined $given_id ) { $id = $given_id; } | ||||
213 | |||||
214 | # Bernd's idea: set ids now for more informative invalid sequence messages | ||||
215 | defined $id && $self->display_id($id); | ||||
216 | $acc && $self->accession_number($acc); | ||||
217 | defined $pid && $self->primary_id($pid); | ||||
218 | |||||
219 | # Set alphabet now to avoid guessing it later, when sequence is set | ||||
220 | $alphabet && $self->alphabet($alphabet); | ||||
221 | |||||
222 | # Set the length before the seq. If there is a seq, length will be updated later | ||||
223 | $self->{'length'} = $len || 0; | ||||
224 | |||||
225 | # Set the sequence (but also alphabet and length) | ||||
226 | if ($ref_to_seq) { | ||||
227 | $self->_set_seq_by_ref($ref_to_seq, $alphabet); | ||||
228 | } else { | ||||
229 | if (defined $seq) { | ||||
230 | # Note: the sequence string may be empty | ||||
231 | $self->seq($seq); | ||||
232 | } | ||||
233 | } | ||||
234 | |||||
235 | $desc && $self->desc($desc); | ||||
236 | $description && $self->description($description); | ||||
237 | $ns && $self->namespace($ns); | ||||
238 | $auth && $self->authority($auth); | ||||
239 | # Any variable that can have a value "0" must be tested with defined | ||||
240 | # or it will fail to be added to the new object | ||||
241 | defined($v) && $self->version($v); | ||||
242 | defined($oid) && $self->object_id($oid); | ||||
243 | defined($is_circular) && $self->is_circular($is_circular); | ||||
244 | |||||
245 | return $self; | ||||
246 | } | ||||
247 | |||||
248 | |||||
249 | =head2 seq | ||||
250 | |||||
251 | Title : seq | ||||
252 | Usage : $string = $seqobj->seq(); | ||||
253 | Function: Get or set the sequence as a string of letters. The case of | ||||
254 | the letters is left up to the implementer. Suggested cases are | ||||
255 | upper case for proteins and lower case for DNA sequence (IUPAC | ||||
256 | standard), but you should not rely on this. An error is thrown if | ||||
257 | the sequence contains invalid characters: see validate_seq(). | ||||
258 | Returns : A scalar | ||||
259 | Args : - Optional new sequence value (a string) to set | ||||
260 | - Optional alphabet (it is guessed by default) | ||||
261 | |||||
262 | =cut | ||||
263 | |||||
264 | sub seq { | ||||
265 | my ($self, @args) = @_; | ||||
266 | |||||
267 | if( scalar @args == 0 ) { | ||||
268 | return $self->{'seq'}; | ||||
269 | } | ||||
270 | |||||
271 | my ($seq_str, $alphabet) = @args; | ||||
272 | if (@args) { | ||||
273 | $self->_set_seq_by_ref(\$seq_str, $alphabet); | ||||
274 | } | ||||
275 | |||||
276 | return $self->{'seq'}; | ||||
277 | } | ||||
278 | |||||
279 | |||||
280 | sub _set_seq_by_ref { | ||||
281 | # Set a sequence by reference. A reference is used to avoid the cost of | ||||
282 | # copying the sequence (which can be very large) between functions. | ||||
283 | my ($self, $seq_str_ref, $alphabet) = @_; | ||||
284 | |||||
285 | # Validate sequence if sequence is not empty and we are not in direct mode | ||||
286 | if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) { | ||||
287 | $self->validate_seq($$seq_str_ref, 1); | ||||
288 | } | ||||
289 | delete $self->{'_direct'}; # next sequence will have to be validated | ||||
290 | |||||
291 | # Record sequence length | ||||
292 | my $len = CORE::length($$seq_str_ref || ''); | ||||
293 | my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0); | ||||
294 | # Note: if the new seq is empty or undef, this is not considered a change | ||||
295 | delete $self->{'_freeze_length'} if $is_changed_seq; | ||||
296 | $self->{'length'} = $len if not exists $self->{'_freeze_length'}; | ||||
297 | |||||
298 | # Set sequence | ||||
299 | $self->{'seq'} = $$seq_str_ref; | ||||
300 | |||||
301 | # Set or guess alphabet | ||||
302 | if ($alphabet) { | ||||
303 | # Alphabet specified, set it no matter what | ||||
304 | $self->alphabet($alphabet); | ||||
305 | } elsif ($is_changed_seq || (! defined($self->alphabet()))) { | ||||
306 | # If we changed a previous sequence to a new one or if there is no | ||||
307 | # alphabet yet at all, we need to guess the (possibly new) alphabet | ||||
308 | $self->_guess_alphabet(); | ||||
309 | } # else (seq not changed and alphabet was defined) do nothing | ||||
310 | |||||
311 | return 1; | ||||
312 | } | ||||
313 | |||||
314 | |||||
315 | =head2 validate_seq | ||||
316 | |||||
317 | Title : validate_seq | ||||
318 | Usage : if(! $seqobj->validate_seq($seq_str) ) { | ||||
319 | print "sequence $seq_str is not valid for an object of | ||||
320 | alphabet ",$seqobj->alphabet, "\n"; | ||||
321 | } | ||||
322 | Function: Test that the given sequence is valid, i.e. contains only valid | ||||
323 | characters. The allowed characters are all letters (A-Z) and '-','.', | ||||
324 | '*','?','=' and '~'. Spaces are not valid. Note that this | ||||
325 | implementation does not take alphabet() into account and that empty | ||||
326 | sequences are considered valid. | ||||
327 | Returns : 1 if the supplied sequence string is valid, 0 otherwise. | ||||
328 | Args : - Sequence string to be validated | ||||
329 | - Boolean to optionally throw an error if the sequence is invalid | ||||
330 | |||||
331 | =cut | ||||
332 | |||||
333 | sub validate_seq { | ||||
334 | my ($self, $seqstr, $throw) = @_; | ||||
335 | if ( (defined $seqstr ) && | ||||
336 | ($seqstr !~ /^[$MATCHPATTERN]*$/) ) { | ||||
337 | if ($throw) { | ||||
338 | $self->throw("Failed validation of sequence '".(defined($self->id) || | ||||
339 | '[unidentified sequence]')."'. Invalid characters were: " . | ||||
340 | join('',($seqstr =~ /[^$MATCHPATTERN]/g))); | ||||
341 | } | ||||
342 | return 0; | ||||
343 | } | ||||
344 | return 1; | ||||
345 | } | ||||
346 | |||||
347 | |||||
348 | =head2 subseq | ||||
349 | |||||
350 | Title : subseq | ||||
351 | Usage : $substring = $seqobj->subseq(10,40); | ||||
352 | $substring = $seqobj->subseq(10,40,'nogap'); | ||||
353 | $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga'); | ||||
354 | $substring = $seqobj->subseq($location_obj); | ||||
355 | $substring = $seqobj->subseq($location_obj, -nogap => 1); | ||||
356 | Function: Return the subseq from start to end, where the first sequence | ||||
357 | character has coordinate 1 number is inclusive, ie 1-2 are the | ||||
358 | first two characters of the sequence. The given start coordinate | ||||
359 | has to be larger than the end, even if the sequence is circular. | ||||
360 | Returns : a string | ||||
361 | Args : integer for start position | ||||
362 | integer for end position | ||||
363 | OR | ||||
364 | Bio::LocationI location for subseq (strand honored) | ||||
365 | Specify -NOGAP=>1 to return subseq with gap characters removed | ||||
366 | Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned | ||||
367 | with $new_subseq in the sequence object | ||||
368 | |||||
369 | =cut | ||||
370 | |||||
371 | sub subseq { | ||||
372 | my $self = shift; | ||||
373 | my @args = @_; | ||||
374 | my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START | ||||
375 | END | ||||
376 | NOGAP | ||||
377 | REPLACE_WITH)], @args); | ||||
378 | |||||
379 | # If -replace_with is specified, validate the replacement sequence | ||||
380 | if (defined $replace) { | ||||
381 | $self->validate_seq( $replace ) || | ||||
382 | $self->throw("Replacement sequence does not look valid"); | ||||
383 | } | ||||
384 | |||||
385 | if( ref($start) && $start->isa('Bio::LocationI') ) { | ||||
386 | my $loc = $start; | ||||
387 | my $seq = ''; | ||||
388 | |||||
389 | # For Split objects if Guide Strand is negative, | ||||
390 | # pass the sublocations in reverse | ||||
391 | my $order = 0; | ||||
392 | if ($loc->isa('Bio::Location::SplitLocationI')) { | ||||
393 | # guide_strand can return undef, so don't compare directly | ||||
394 | # to avoid 'uninitialized value' warning | ||||
395 | my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0; | ||||
396 | $order = ($guide_strand == -1) ? -1 : 0; | ||||
397 | } | ||||
398 | # Reversing order using ->each_Location(-1) does not work well for | ||||
399 | # cut by origin-splits (like "complement(join(1900..END,START..50))"), | ||||
400 | # so use "reverse" instead | ||||
401 | my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location; | ||||
402 | foreach my $subloc (@sublocs) { | ||||
403 | my $piece = $self->subseq(-start => $subloc->start(), | ||||
404 | -end => $subloc->end(), | ||||
405 | -replace_with => $replace, | ||||
406 | -nogap => $nogap); | ||||
407 | $piece =~ s/[$GAP_SYMBOLS]//g if $nogap; | ||||
408 | |||||
409 | # strand can return undef, so don't compare directly | ||||
410 | # to avoid 'uninitialized value' warning | ||||
411 | my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0; | ||||
412 | if ($strand < 0) { | ||||
413 | $piece = $self->_revcom_from_string($piece, $self->alphabet); | ||||
414 | } | ||||
415 | $seq .= $piece; | ||||
416 | } | ||||
417 | return $seq; | ||||
418 | } elsif( defined $start && defined $end ) { | ||||
419 | if( $start > $end ){ | ||||
420 | $self->throw("Bad start,end parameters. Start [$start] has to be ". | ||||
421 | "less than end [$end]"); | ||||
422 | } | ||||
423 | if( $start <= 0 ) { | ||||
424 | $self->throw("Bad start parameter ($start). Start must be positive."); | ||||
425 | } | ||||
426 | |||||
427 | # Remove one from start, and then length is end-start | ||||
428 | $start--; | ||||
429 | |||||
430 | my $seqstr; | ||||
431 | if (defined $replace) { | ||||
432 | $seqstr = substr $self->{seq}, $start, $end-$start, $replace; | ||||
433 | } else { | ||||
434 | $seqstr = substr $self->{seq}, $start, $end-$start; | ||||
435 | } | ||||
436 | |||||
437 | |||||
438 | if ($end > $self->length) { | ||||
439 | if ($self->is_circular) { | ||||
440 | my $start = 0; | ||||
441 | my $end = $end - $self->length; | ||||
442 | |||||
443 | my $appendstr; | ||||
444 | if (defined $replace) { | ||||
445 | $appendstr = substr $self->{seq}, $start, $end-$start, $replace; | ||||
446 | } else { | ||||
447 | $appendstr = substr $self->{seq}, $start, $end-$start; | ||||
448 | } | ||||
449 | |||||
450 | $seqstr .= $appendstr; | ||||
451 | } else { | ||||
452 | $self->throw("Bad end parameter ($end). End must be less than ". | ||||
453 | "the total length of sequence (total=".$self->length.")") | ||||
454 | } | ||||
455 | } | ||||
456 | |||||
457 | $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap); | ||||
458 | return $seqstr; | ||||
459 | |||||
460 | } else { | ||||
461 | $self->warn("Incorrect parameters to subseq - must be two integers or ". | ||||
462 | "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap); | ||||
463 | return; | ||||
464 | } | ||||
465 | } | ||||
466 | |||||
467 | |||||
468 | =head2 length | ||||
469 | |||||
470 | Title : length | ||||
471 | Usage : $len = $seqobj->length(); | ||||
472 | Function: Get the stored length of the sequence in number of symbols (bases | ||||
473 | or amino acids). In some circumstances, you can also set this attribute: | ||||
474 | |||||
475 | 1. For empty sequences, you can set the length to anything you want: | ||||
476 | my $seqobj = Bio::PrimarySeq->new( -length => 123 ); | ||||
477 | my $len = $seqobj->len; # 123 | ||||
478 | 2. To save memory when using very long sequences, you can set the | ||||
479 | length of the sequence to the length of the sequence (and nothing | ||||
480 | else): | ||||
481 | my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence | ||||
482 | # process $seqobj... then after you're done with it | ||||
483 | $seqobj->length($seqobj->length); | ||||
484 | $seqobj->seq(undef); # free memory! | ||||
485 | my $len = $seqobj->len; # 1 Mbp | ||||
486 | |||||
487 | Note that if you set seq() to a value other than undef at any time, | ||||
488 | the length attribute will be reset. | ||||
489 | Returns : integer representing the length of the sequence. | ||||
490 | Args : Optionally, the value on set | ||||
491 | |||||
492 | =cut | ||||
493 | |||||
494 | sub length { | ||||
495 | my ($self, $val) = @_; | ||||
496 | if (defined $val) { | ||||
497 | my $len = $self->{'length'}; | ||||
498 | if ($len && ($len != $val)) { | ||||
499 | $self->throw("Can not set the length to $val, current length value is $len"); | ||||
500 | } | ||||
501 | $self->{'length'} = $val; | ||||
502 | $self->{'_freeze_length'} = undef; | ||||
503 | } | ||||
504 | return $self->{'length'}; | ||||
505 | } | ||||
506 | |||||
507 | |||||
508 | =head2 display_id | ||||
509 | |||||
510 | Title : display_id or display_name | ||||
511 | Usage : $id_string = $seqobj->display_id(); | ||||
512 | Function: Get or set the display id, aka the common name of the sequence object. | ||||
513 | |||||
514 | The semantics of this is that it is the most likely string to | ||||
515 | be used as an identifier of the sequence, and likely to have | ||||
516 | "human" readability. The id is equivalent to the ID field of | ||||
517 | the GenBank/EMBL databanks and the id field of the | ||||
518 | Swissprot/sptrembl database. In fasta format, the >(\S+) is | ||||
519 | presumed to be the id, though some people overload the id to | ||||
520 | embed other information. Bioperl does not use any embedded | ||||
521 | information in the ID field, and people are encouraged to use | ||||
522 | other mechanisms (accession field for example, or extending | ||||
523 | the sequence object) to solve this. | ||||
524 | |||||
525 | With the new Bio::DescribeableI interface, display_name aliases | ||||
526 | to this method. | ||||
527 | Returns : A string for the display ID | ||||
528 | Args : Optional string for the display ID to set | ||||
529 | |||||
530 | =cut | ||||
531 | |||||
532 | sub display_id { | ||||
533 | my ($self, $value) = @_; | ||||
534 | if( defined $value) { | ||||
535 | $self->{'display_id'} = $value; | ||||
536 | } | ||||
537 | return $self->{'display_id'}; | ||||
538 | } | ||||
539 | |||||
540 | |||||
541 | =head2 accession_number | ||||
542 | |||||
543 | Title : accession_number or object_id | ||||
544 | Usage : $unique_key = $seqobj->accession_number; | ||||
545 | Function: Returns the unique biological id for a sequence, commonly | ||||
546 | called the accession_number. For sequences from established | ||||
547 | databases, the implementors should try to use the correct | ||||
548 | accession number. Notice that primary_id() provides the | ||||
549 | unique id for the implemetation, allowing multiple objects | ||||
550 | to have the same accession number in a particular implementation. | ||||
551 | |||||
552 | For sequences with no accession number, this method should | ||||
553 | return "unknown". | ||||
554 | |||||
555 | [Note this method name is likely to change in 1.3] | ||||
556 | |||||
557 | With the new Bio::IdentifiableI interface, this is aliased | ||||
558 | to object_id | ||||
559 | Returns : A string | ||||
560 | Args : A string (optional) for setting | ||||
561 | |||||
562 | =cut | ||||
563 | |||||
564 | sub accession_number { | ||||
565 | my( $self, $acc ) = @_; | ||||
566 | if (defined $acc) { | ||||
567 | $self->{'accession_number'} = $acc; | ||||
568 | } else { | ||||
569 | $acc = $self->{'accession_number'}; | ||||
570 | $acc = 'unknown' unless defined $acc; | ||||
571 | } | ||||
572 | return $acc; | ||||
573 | } | ||||
574 | |||||
575 | |||||
576 | =head2 primary_id | ||||
577 | |||||
578 | Title : primary_id | ||||
579 | Usage : $unique_key = $seqobj->primary_id; | ||||
580 | Function: Returns the unique id for this object in this | ||||
581 | implementation. This allows implementations to manage their | ||||
582 | own object ids in a way the implementaiton can control | ||||
583 | clients can expect one id to map to one object. | ||||
584 | |||||
585 | For sequences with no natural primary id, this method | ||||
586 | should return a stringified memory location. | ||||
587 | Returns : A string | ||||
588 | Args : A string (optional, for setting) | ||||
589 | |||||
590 | =cut | ||||
591 | |||||
592 | sub primary_id { | ||||
593 | my $self = shift; | ||||
594 | |||||
595 | if(@_) { | ||||
596 | $self->{'primary_id'} = shift; | ||||
597 | } | ||||
598 | if( ! defined($self->{'primary_id'}) ) { | ||||
599 | return "$self"; | ||||
600 | } | ||||
601 | return $self->{'primary_id'}; | ||||
602 | } | ||||
603 | |||||
604 | |||||
605 | =head2 alphabet | ||||
606 | |||||
607 | Title : alphabet | ||||
608 | Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something } | ||||
609 | Function: Get/set the alphabet of sequence, one of | ||||
610 | 'dna', 'rna' or 'protein'. This is case sensitive. | ||||
611 | |||||
612 | This is not called <type> because this would cause | ||||
613 | upgrade problems from the 0.5 and earlier Seq objects. | ||||
614 | Returns : a string either 'dna','rna','protein'. NB - the object must | ||||
615 | make a call of the type - if there is no alphabet specified it | ||||
616 | has to guess. | ||||
617 | Args : optional string to set : 'dna' | 'rna' | 'protein' | ||||
618 | |||||
619 | |||||
620 | =cut | ||||
621 | |||||
622 | sub alphabet { | ||||
623 | my ($self,$value) = @_; | ||||
624 | if (defined $value) { | ||||
625 | $value = lc $value; | ||||
626 | unless ( $valid_type{$value} ) { | ||||
627 | $self->throw("Alphabet '$value' is not a valid alphabet (". | ||||
628 | join(',', map "'$_'", sort keys %valid_type) .") lowercase"); | ||||
629 | } | ||||
630 | $self->{'alphabet'} = $value; | ||||
631 | } | ||||
632 | return $self->{'alphabet'}; | ||||
633 | } | ||||
634 | |||||
635 | |||||
636 | =head2 desc | ||||
637 | |||||
638 | Title : desc or description | ||||
639 | Usage : $seqobj->desc($newval); | ||||
640 | Function: Get/set description of the sequence. | ||||
641 | |||||
642 | 'description' is an alias for this for compliance with the | ||||
643 | Bio::DescribeableI interface. | ||||
644 | Returns : value of desc (a string) | ||||
645 | Args : newvalue (a string or undef, optional) | ||||
646 | |||||
647 | |||||
648 | =cut | ||||
649 | |||||
650 | sub desc{ | ||||
651 | my $self = shift; | ||||
652 | |||||
653 | return $self->{'desc'} = shift if @_; | ||||
654 | return $self->{'desc'}; | ||||
655 | } | ||||
656 | |||||
657 | |||||
658 | =head2 can_call_new | ||||
659 | |||||
660 | Title : can_call_new | ||||
661 | Usage : | ||||
662 | Function: | ||||
663 | Example : | ||||
664 | Returns : true | ||||
665 | Args : | ||||
666 | |||||
667 | =cut | ||||
668 | |||||
669 | sub can_call_new { | ||||
670 | my ($self) = @_; | ||||
671 | |||||
672 | return 1; | ||||
673 | } | ||||
674 | |||||
675 | |||||
676 | =head2 id | ||||
677 | |||||
678 | Title : id | ||||
679 | Usage : $id = $seqobj->id(); | ||||
680 | Function: This is mapped on display_id | ||||
681 | Example : | ||||
682 | Returns : | ||||
683 | Args : | ||||
684 | |||||
685 | =cut | ||||
686 | |||||
687 | sub id { | ||||
688 | return shift->display_id(@_); | ||||
689 | } | ||||
690 | |||||
691 | |||||
692 | =head2 is_circular | ||||
693 | |||||
694 | Title : is_circular | ||||
695 | Usage : if( $seqobj->is_circular) { # Do something } | ||||
696 | Function: Returns true if the molecule is circular | ||||
697 | Returns : Boolean value | ||||
698 | Args : none | ||||
699 | |||||
700 | =cut | ||||
701 | |||||
702 | sub is_circular{ | ||||
703 | my $self = shift; | ||||
704 | return $self->{'is_circular'} = shift if @_; | ||||
705 | return $self->{'is_circular'}; | ||||
706 | } | ||||
707 | |||||
708 | |||||
709 | =head1 Methods for Bio::IdentifiableI compliance | ||||
710 | |||||
711 | =head2 object_id | ||||
712 | |||||
713 | Title : object_id | ||||
714 | Usage : $string = $seqobj->object_id(); | ||||
715 | Function: Get or set a string which represents the stable primary identifier | ||||
716 | in this namespace of this object. For DNA sequences this | ||||
717 | is its accession_number, similarly for protein sequences. | ||||
718 | |||||
719 | This is aliased to accession_number(). | ||||
720 | Returns : A scalar | ||||
721 | Args : Optional object ID to set. | ||||
722 | |||||
723 | =cut | ||||
724 | |||||
725 | sub object_id { | ||||
726 | return shift->accession_number(@_); | ||||
727 | } | ||||
728 | |||||
729 | |||||
730 | =head2 version | ||||
731 | |||||
732 | Title : version | ||||
733 | Usage : $version = $seqobj->version(); | ||||
734 | Function: Get or set a number which differentiates between versions of | ||||
735 | the same object. Higher numbers are considered to be | ||||
736 | later and more relevant, but a single object described | ||||
737 | the same identifier should represent the same concept. | ||||
738 | Returns : A number | ||||
739 | Args : Optional version to set. | ||||
740 | |||||
741 | =cut | ||||
742 | |||||
743 | sub version{ | ||||
744 | my ($self,$value) = @_; | ||||
745 | if( defined $value) { | ||||
746 | $self->{'_version'} = $value; | ||||
747 | } | ||||
748 | return $self->{'_version'}; | ||||
749 | } | ||||
750 | |||||
751 | |||||
752 | =head2 authority | ||||
753 | |||||
754 | Title : authority | ||||
755 | Usage : $authority = $seqobj->authority(); | ||||
756 | Function: Get or set a string which represents the organisation which | ||||
757 | granted the namespace, written as the DNS name of the | ||||
758 | organisation (eg, wormbase.org). | ||||
759 | Returns : A scalar | ||||
760 | Args : Optional authority to set. | ||||
761 | |||||
762 | =cut | ||||
763 | |||||
764 | sub authority { | ||||
765 | my ($self, $value) = @_; | ||||
766 | if( defined $value) { | ||||
767 | $self->{'authority'} = $value; | ||||
768 | } | ||||
769 | return $self->{'authority'}; | ||||
770 | } | ||||
771 | |||||
772 | |||||
773 | =head2 namespace | ||||
774 | |||||
775 | Title : namespace | ||||
776 | Usage : $string = $seqobj->namespace(); | ||||
777 | Function: Get or set a string representing the name space this identifier | ||||
778 | is valid in, often the database name or the name describing the | ||||
779 | collection. | ||||
780 | Returns : A scalar | ||||
781 | Args : Optional namespace to set. | ||||
782 | |||||
783 | =cut | ||||
784 | |||||
785 | sub namespace{ | ||||
786 | my ($self,$value) = @_; | ||||
787 | if( defined $value) { | ||||
788 | $self->{'namespace'} = $value; | ||||
789 | } | ||||
790 | return $self->{'namespace'} || ""; | ||||
791 | } | ||||
792 | |||||
793 | |||||
794 | =head1 Methods for Bio::DescribableI compliance | ||||
795 | |||||
796 | This comprises of display_name and description. | ||||
797 | |||||
798 | =head2 display_name | ||||
799 | |||||
800 | Title : display_name | ||||
801 | Usage : $string = $seqobj->display_name(); | ||||
802 | Function: Get or set a string which is what should be displayed to the user. | ||||
803 | The string should have no spaces (ideally, though a cautious | ||||
804 | user of this interface would not assumme this) and should be | ||||
805 | less than thirty characters (though again, double checking | ||||
806 | this is a good idea). | ||||
807 | |||||
808 | This is aliased to display_id(). | ||||
809 | Returns : A string for the display name | ||||
810 | Args : Optional string for the display name to set. | ||||
811 | |||||
812 | =cut | ||||
813 | |||||
814 | sub display_name { | ||||
815 | return shift->display_id(@_); | ||||
816 | } | ||||
817 | |||||
818 | |||||
819 | =head2 description | ||||
820 | |||||
821 | Title : description | ||||
822 | Usage : $string = $seqobj->description(); | ||||
823 | Function: Get or set a text string suitable for displaying to the user a | ||||
824 | description. This string is likely to have spaces, but | ||||
825 | should not have any newlines or formatting - just plain | ||||
826 | text. The string should not be greater than 255 characters | ||||
827 | and clients can feel justified at truncating strings at 255 | ||||
828 | characters for the purposes of display. | ||||
829 | |||||
830 | This is aliased to desc(). | ||||
831 | Returns : A string for the description | ||||
832 | Args : Optional string for the description to set. | ||||
833 | |||||
834 | =cut | ||||
835 | |||||
836 | sub description { | ||||
837 | return shift->desc(@_); | ||||
838 | } | ||||
839 | |||||
840 | |||||
841 | =head1 Methods Inherited from Bio::PrimarySeqI | ||||
842 | |||||
843 | These methods are available on Bio::PrimarySeq, although they are | ||||
844 | actually implemented on Bio::PrimarySeqI | ||||
845 | |||||
846 | =head2 revcom | ||||
847 | |||||
848 | Title : revcom | ||||
849 | Usage : $rev = $seqobj->revcom(); | ||||
850 | Function: Produces a new Bio::SeqI implementing object which | ||||
851 | is the reversed complement of the sequence. For protein | ||||
852 | sequences this throws an exception of | ||||
853 | "Sequence is a protein. Cannot revcom". | ||||
854 | |||||
855 | The id is the same id as the orginal sequence, and the | ||||
856 | accession number is also indentical. If someone wants to | ||||
857 | track that this sequence has be reversed, it needs to | ||||
858 | define its own extensions. | ||||
859 | |||||
860 | To do an inplace edit of an object you can go: | ||||
861 | |||||
862 | $seqobj = $seqobj->revcom(); | ||||
863 | |||||
864 | This of course, causes Perl to handle the garbage | ||||
865 | collection of the old object, but it is roughly speaking as | ||||
866 | efficient as an inplace edit. | ||||
867 | Returns : A new (fresh) Bio::SeqI object | ||||
868 | Args : none | ||||
869 | |||||
870 | =head2 trunc | ||||
871 | |||||
872 | Title : trunc | ||||
873 | Usage : $subseq = $myseq->trunc(10,100); | ||||
874 | Function: Provides a truncation of a sequence, | ||||
875 | Returns : A fresh Bio::SeqI implementing object. | ||||
876 | Args : Numbers for the start and end positions | ||||
877 | |||||
878 | =head1 Internal methods | ||||
879 | |||||
880 | These are internal methods to PrimarySeq | ||||
881 | |||||
882 | =head2 _guess_alphabet | ||||
883 | |||||
884 | Title : _guess_alphabet | ||||
885 | Usage : | ||||
886 | Function: Automatically guess and set the type of sequence: dna, rna, protein | ||||
887 | or '' if the sequence was empty. This method first removes dots (.), | ||||
888 | dashes (-) and question marks (?) before guessing the alphabet | ||||
889 | using the IUPAC conventions for ambiguous residues. Since the DNA and | ||||
890 | RNA characters are also valid characters for proteins, there is | ||||
891 | no foolproof way of determining the right alphabet. This is our best | ||||
892 | guess only! | ||||
893 | Returns : string 'dna', 'rna', 'protein' or ''. | ||||
894 | Args : none | ||||
895 | |||||
896 | =cut | ||||
897 | |||||
898 | sub _guess_alphabet { | ||||
899 | my ($self) = @_; | ||||
900 | # Guess alphabet | ||||
901 | my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'}); | ||||
902 | # Set alphabet unless it is unknown | ||||
903 | $self->alphabet($alphabet) if $alphabet; | ||||
904 | return $alphabet; | ||||
905 | } | ||||
906 | |||||
907 | |||||
908 | sub _guess_alphabet_from_string { | ||||
909 | # Get the alphabet from a sequence string | ||||
910 | my ($self, $str, $nowarnonempty) = @_; | ||||
911 | |||||
912 | $nowarnonempty = 0 if not defined $nowarnonempty; | ||||
913 | |||||
914 | # Remove chars that clearly don't denote nucleic or amino acids | ||||
915 | $str =~ s/[-.?]//gi; | ||||
916 | |||||
917 | # Check for sequences without valid letters | ||||
918 | my $alphabet; | ||||
919 | my $total = CORE::length($str); | ||||
920 | if( $total == 0 ) { | ||||
921 | if (not $nowarnonempty) { | ||||
922 | $self->warn("Got a sequence without letters. Could not guess alphabet"); | ||||
923 | } | ||||
924 | $alphabet = ''; | ||||
925 | } | ||||
926 | |||||
927 | # Determine alphabet now | ||||
928 | if (not defined $alphabet) { | ||||
929 | if ($str =~ m/[EFIJLOPQXZ]/i) { | ||||
930 | # Start with a safe method to find proteins. | ||||
931 | # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z | ||||
932 | $alphabet = 'protein'; | ||||
933 | } else { | ||||
934 | # Alphabet is unsure, could still be DNA, RNA or protein | ||||
935 | # DNA and RNA contain mostly A, T, U, G, C and N, but the other | ||||
936 | # letters they use are also among the 15 valid letters that a | ||||
937 | # protein sequence can contain at this stage. Make our best guess | ||||
938 | # based on sequence composition. If it contains over 70% of ACGTUN, | ||||
939 | # it is likely nucleic. | ||||
940 | if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) { | ||||
941 | if ( $str =~ m/U/i ) { | ||||
942 | $alphabet = 'rna'; | ||||
943 | } else { | ||||
944 | $alphabet = 'dna'; | ||||
945 | } | ||||
946 | } else { | ||||
947 | $alphabet = 'protein'; | ||||
948 | } | ||||
949 | } | ||||
950 | } | ||||
951 | |||||
952 | return $alphabet; | ||||
953 | } | ||||
954 | |||||
955 | |||||
956 | ############################################################################ | ||||
957 | # aliases due to name changes or to compensate for our lack of consistency # | ||||
958 | ############################################################################ | ||||
959 | |||||
960 | sub accession { | ||||
961 | my $self = shift; | ||||
962 | |||||
963 | $self->warn(ref($self)."::accession is deprecated, ". | ||||
964 | "use accession_number() instead"); | ||||
965 | return $self->accession_number(@_); | ||||
966 | } | ||||
967 | |||||
968 | 1 | 10µs | 1; |