Filename | /Users/ap13/perl5/lib/perl5/Bio/PrimarySeqI.pm |
Statements | Executed 7 statements in 2.77ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 2.36ms | 11.8ms | BEGIN@124 | Bio::PrimarySeqI::
1 | 1 | 1 | 21µs | 37µs | BEGIN@123 | Bio::PrimarySeqI::
1 | 1 | 1 | 9µs | 69µs | BEGIN@126 | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | __ANON__[:899] | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | __ANON__[:900] | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _attempt_to_load_Seq | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _find_orfs_nucleotide | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _orf_sequence | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _revcom_from_string | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _setup_class | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | _truncate_seq | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | accession_number | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | alphabet | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | can_call_new | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | desc | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | display_id | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | id | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | is_circular | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | length | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | moltype | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | primary_id | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | rev_transcribe | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | revcom | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | seq | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | subseq | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | transcribe | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | translate | Bio::PrimarySeqI::
0 | 0 | 0 | 0s | 0s | trunc | Bio::PrimarySeqI::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # | ||||
2 | # BioPerl module for Bio::PrimarySeqI | ||||
3 | # | ||||
4 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
5 | # | ||||
6 | # Cared for by Ewan Birney <birney@ebi.ac.uk> | ||||
7 | # | ||||
8 | # Copyright Ewan Birney | ||||
9 | # | ||||
10 | # You may distribute this module under the same terms as perl itself | ||||
11 | |||||
12 | # POD documentation - main docs before the code | ||||
13 | |||||
14 | |||||
15 | =head1 NAME | ||||
16 | |||||
17 | Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq | ||||
18 | |||||
19 | =head1 SYNOPSIS | ||||
20 | |||||
21 | # Bio::PrimarySeqI is the interface class for sequences. | ||||
22 | # If you are a newcomer to bioperl, you might want to start with | ||||
23 | # Bio::Seq documentation. | ||||
24 | |||||
25 | # Test if this is a seq object | ||||
26 | $obj->isa("Bio::PrimarySeqI") || | ||||
27 | $obj->throw("$obj does not implement the Bio::PrimarySeqI interface"); | ||||
28 | |||||
29 | # Accessors | ||||
30 | $string = $obj->seq(); | ||||
31 | $substring = $obj->subseq(12,50); | ||||
32 | $display = $obj->display_id(); # for human display | ||||
33 | $id = $obj->primary_id(); # unique id for this object, | ||||
34 | # implementation defined | ||||
35 | $unique_key= $obj->accession_number(); # unique biological id | ||||
36 | |||||
37 | |||||
38 | # Object manipulation | ||||
39 | eval { | ||||
40 | $rev = $obj->revcom(); | ||||
41 | }; | ||||
42 | if( $@ ) { | ||||
43 | $obj->throw( "Could not reverse complement. ". | ||||
44 | "Probably not DNA. Actual exception\n$@\n" ); | ||||
45 | } | ||||
46 | |||||
47 | $trunc = $obj->trunc(12,50); | ||||
48 | # $rev and $trunc are Bio::PrimarySeqI compliant objects | ||||
49 | |||||
50 | |||||
51 | =head1 DESCRIPTION | ||||
52 | |||||
53 | This object defines an abstract interface to basic sequence | ||||
54 | information - for most users of the package the documentation (and | ||||
55 | methods) in this class are not useful - this is a developers-only | ||||
56 | class which defines what methods have to be implmented by other Perl | ||||
57 | objects to comply to the Bio::PrimarySeqI interface. Go "perldoc | ||||
58 | Bio::Seq" or "man Bio::Seq" for more information on the main class for | ||||
59 | sequences. | ||||
60 | |||||
61 | PrimarySeq is an object just for the sequence and its name(s), nothing | ||||
62 | more. Seq is the larger object complete with features. There is a pure | ||||
63 | perl implementation of this in L<Bio::PrimarySeq>. If you just want to | ||||
64 | use L<Bio::PrimarySeq> objects, then please read that module first. This | ||||
65 | module defines the interface, and is of more interest to people who | ||||
66 | want to wrap their own Perl Objects/RDBs/FileSystems etc in way that | ||||
67 | they "are" bioperl sequence objects, even though it is not using Perl | ||||
68 | to store the sequence etc. | ||||
69 | |||||
70 | This interface defines what bioperl considers necessary to "be" a | ||||
71 | sequence, without providing an implementation of this, an | ||||
72 | implementation is provided in L<Bio::PrimarySeq>. If you want to provide | ||||
73 | a Bio::PrimarySeq-compliant object which in fact wraps another | ||||
74 | object/database/out-of-perl experience, then this is the correct thing | ||||
75 | to wrap, generally by providing a wrapper class which would inherit | ||||
76 | from your object and this Bio::PrimarySeqI interface. The wrapper class | ||||
77 | then would have methods lists in the "Implementation Specific | ||||
78 | Functions" which would provide these methods for your object. | ||||
79 | |||||
80 | =head1 FEEDBACK | ||||
81 | |||||
82 | =head2 Mailing Lists | ||||
83 | |||||
84 | User feedback is an integral part of the evolution of this and other | ||||
85 | Bioperl modules. Send your comments and suggestions preferably to one | ||||
86 | of the Bioperl mailing lists. Your participation is much appreciated. | ||||
87 | |||||
88 | bioperl-l@bioperl.org - General discussion | ||||
89 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
90 | |||||
91 | =head2 Support | ||||
92 | |||||
93 | Please direct usage questions or support issues to the mailing list: | ||||
94 | |||||
95 | I<bioperl-l@bioperl.org> | ||||
96 | |||||
97 | rather than to the module maintainer directly. Many experienced and | ||||
98 | reponsive experts will be able look at the problem and quickly | ||||
99 | address it. Please include a thorough description of the problem | ||||
100 | with code and data examples if at all possible. | ||||
101 | |||||
102 | =head2 Reporting Bugs | ||||
103 | |||||
104 | Report bugs to the Bioperl bug tracking system to help us keep track | ||||
105 | the bugs and their resolution. Bug reports can be submitted via the | ||||
106 | web: | ||||
107 | |||||
108 | https://github.com/bioperl/bioperl-live/issues | ||||
109 | |||||
110 | =head1 AUTHOR - Ewan Birney | ||||
111 | |||||
112 | Email birney@ebi.ac.uk | ||||
113 | |||||
114 | =head1 APPENDIX | ||||
115 | |||||
116 | The rest of the documentation details each of the object | ||||
117 | methods. Internal methods are usually preceded with a _ | ||||
118 | |||||
119 | =cut | ||||
120 | |||||
121 | |||||
122 | package Bio::PrimarySeqI; | ||||
123 | 2 | 30µs | 2 | 52µs | # spent 37µs (21+15) within Bio::PrimarySeqI::BEGIN@123 which was called:
# once (21µs+15µs) by base::import at line 123 # spent 37µs making 1 call to Bio::PrimarySeqI::BEGIN@123
# spent 15µs making 1 call to strict::import |
124 | 2 | 208µs | 1 | 11.8ms | # spent 11.8ms (2.36+9.40) within Bio::PrimarySeqI::BEGIN@124 which was called:
# once (2.36ms+9.40ms) by base::import at line 124 # spent 11.8ms making 1 call to Bio::PrimarySeqI::BEGIN@124 |
125 | |||||
126 | 2 | 2.53ms | 2 | 69µs | # spent 69µs (9+60) within Bio::PrimarySeqI::BEGIN@126 which was called:
# once (9µs+60µs) by base::import at line 126 # spent 69µs making 1 call to Bio::PrimarySeqI::BEGIN@126
# spent 60µs making 1 call to base::import, recursion: max depth 2, sum of overlapping time 60µs |
127 | |||||
128 | |||||
129 | =head1 Implementation-specific Functions | ||||
130 | |||||
131 | These functions are the ones that a specific implementation must | ||||
132 | define. | ||||
133 | |||||
134 | =head2 seq | ||||
135 | |||||
136 | Title : seq | ||||
137 | Usage : $string = $obj->seq() | ||||
138 | Function: Returns the sequence as a string of letters. The | ||||
139 | case of the letters is left up to the implementer. | ||||
140 | Suggested cases are upper case for proteins and lower case for | ||||
141 | DNA sequence (IUPAC standard), but implementations are suggested to | ||||
142 | keep an open mind about case (some users... want mixed case!) | ||||
143 | Returns : A scalar | ||||
144 | Status : Virtual | ||||
145 | |||||
146 | =cut | ||||
147 | |||||
148 | sub seq { | ||||
149 | my ($self) = @_; | ||||
150 | $self->throw_not_implemented(); | ||||
151 | } | ||||
152 | |||||
153 | |||||
154 | =head2 subseq | ||||
155 | |||||
156 | Title : subseq | ||||
157 | Usage : $substring = $obj->subseq(10,40); | ||||
158 | Function: Returns the subseq from start to end, where the first base | ||||
159 | is 1 and the number is inclusive, i.e. 1-2 are the first two | ||||
160 | bases of the sequence. | ||||
161 | |||||
162 | Start cannot be larger than end but can be equal. | ||||
163 | |||||
164 | Returns : A string | ||||
165 | Args : | ||||
166 | Status : Virtual | ||||
167 | |||||
168 | =cut | ||||
169 | |||||
170 | sub subseq { | ||||
171 | my ($self) = @_; | ||||
172 | $self->throw_not_implemented(); | ||||
173 | } | ||||
174 | |||||
175 | |||||
176 | =head2 display_id | ||||
177 | |||||
178 | Title : display_id | ||||
179 | Usage : $id_string = $obj->display_id(); | ||||
180 | Function: Returns the display id, also known as the common name of the Sequence | ||||
181 | object. | ||||
182 | |||||
183 | The semantics of this is that it is the most likely string | ||||
184 | to be used as an identifier of the sequence, and likely to | ||||
185 | have "human" readability. The id is equivalent to the ID | ||||
186 | field of the GenBank/EMBL databanks and the id field of the | ||||
187 | Swissprot/sptrembl database. In fasta format, the >(\S+) is | ||||
188 | presumed to be the id, though some people overload the id | ||||
189 | to embed other information. Bioperl does not use any | ||||
190 | embedded information in the ID field, and people are | ||||
191 | encouraged to use other mechanisms (accession field for | ||||
192 | example, or extending the sequence object) to solve this. | ||||
193 | |||||
194 | Notice that $seq->id() maps to this function, mainly for | ||||
195 | legacy/convenience reasons. | ||||
196 | Returns : A string | ||||
197 | Args : None | ||||
198 | Status : Virtual | ||||
199 | |||||
200 | =cut | ||||
201 | |||||
202 | sub display_id { | ||||
203 | my ($self) = @_; | ||||
204 | $self->throw_not_implemented(); | ||||
205 | } | ||||
206 | |||||
207 | |||||
208 | =head2 accession_number | ||||
209 | |||||
210 | Title : accession_number | ||||
211 | Usage : $unique_biological_key = $obj->accession_number; | ||||
212 | Function: Returns the unique biological id for a sequence, commonly | ||||
213 | called the accession_number. For sequences from established | ||||
214 | databases, the implementors should try to use the correct | ||||
215 | accession number. Notice that primary_id() provides the | ||||
216 | unique id for the implemetation, allowing multiple objects | ||||
217 | to have the same accession number in a particular implementation. | ||||
218 | |||||
219 | For sequences with no accession number, this method should return | ||||
220 | "unknown". | ||||
221 | Returns : A string | ||||
222 | Args : None | ||||
223 | Status : Virtual | ||||
224 | |||||
225 | =cut | ||||
226 | |||||
227 | sub accession_number { | ||||
228 | my ($self,@args) = @_; | ||||
229 | $self->throw_not_implemented(); | ||||
230 | } | ||||
231 | |||||
232 | |||||
233 | =head2 primary_id | ||||
234 | |||||
235 | Title : primary_id | ||||
236 | Usage : $unique_implementation_key = $obj->primary_id; | ||||
237 | Function: Returns the unique id for this object in this | ||||
238 | implementation. This allows implementations to manage their | ||||
239 | own object ids in a way the implementaiton can control | ||||
240 | clients can expect one id to map to one object. | ||||
241 | |||||
242 | For sequences with no accession number, this method should | ||||
243 | return a stringified memory location. | ||||
244 | |||||
245 | Returns : A string | ||||
246 | Args : None | ||||
247 | Status : Virtual | ||||
248 | |||||
249 | =cut | ||||
250 | |||||
251 | sub primary_id { | ||||
252 | my ($self,@args) = @_; | ||||
253 | $self->throw_not_implemented(); | ||||
254 | } | ||||
255 | |||||
256 | |||||
257 | =head2 can_call_new | ||||
258 | |||||
259 | Title : can_call_new | ||||
260 | Usage : if( $obj->can_call_new ) { | ||||
261 | $newobj = $obj->new( %param ); | ||||
262 | } | ||||
263 | Function: Can_call_new returns 1 or 0 depending | ||||
264 | on whether an implementation allows new | ||||
265 | constructor to be called. If a new constructor | ||||
266 | is allowed, then it should take the followed hashed | ||||
267 | constructor list. | ||||
268 | |||||
269 | $myobject->new( -seq => $sequence_as_string, | ||||
270 | -display_id => $id | ||||
271 | -accession_number => $accession | ||||
272 | -alphabet => 'dna', | ||||
273 | ); | ||||
274 | Returns : 1 or 0 | ||||
275 | Args : | ||||
276 | |||||
277 | |||||
278 | =cut | ||||
279 | |||||
280 | sub can_call_new { | ||||
281 | my ($self,@args) = @_; | ||||
282 | # we default to 0 here | ||||
283 | return 0; | ||||
284 | } | ||||
285 | |||||
286 | |||||
287 | =head2 alphabet | ||||
288 | |||||
289 | Title : alphabet | ||||
290 | Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } | ||||
291 | Function: Returns the type of sequence being one of | ||||
292 | 'dna', 'rna' or 'protein'. This is case sensitive. | ||||
293 | |||||
294 | This is not called "type" because this would cause | ||||
295 | upgrade problems from the 0.5 and earlier Seq objects. | ||||
296 | |||||
297 | Returns : A string either 'dna','rna','protein'. NB - the object must | ||||
298 | make a call of the alphabet, if there is no alphabet specified it | ||||
299 | has to guess. | ||||
300 | Args : None | ||||
301 | Status : Virtual | ||||
302 | |||||
303 | =cut | ||||
304 | |||||
305 | sub alphabet { | ||||
306 | my ( $self ) = @_; | ||||
307 | $self->throw_not_implemented(); | ||||
308 | } | ||||
309 | |||||
310 | |||||
311 | =head2 moltype | ||||
312 | |||||
313 | Title : moltype | ||||
314 | Usage : Deprecated. Use alphabet() instead. | ||||
315 | |||||
316 | =cut | ||||
317 | |||||
318 | sub moltype { | ||||
319 | my ($self,@args) = @_; | ||||
320 | $self->warn("moltype: pre v1.0 method. Calling alphabet() instead..."); | ||||
321 | return $self->alphabet(@args); | ||||
322 | } | ||||
323 | |||||
324 | |||||
325 | =head1 Implementation-optional Functions | ||||
326 | |||||
327 | The following functions rely on the above functions. An | ||||
328 | implementing class does not need to provide these functions, as they | ||||
329 | will be provided by this class, but is free to override these | ||||
330 | functions. | ||||
331 | |||||
332 | The revcom(), trunc(), and translate() methods create new sequence | ||||
333 | objects. They will call new() on the class of the sequence object | ||||
334 | instance passed as argument, unless can_call_new() returns FALSE. In | ||||
335 | the latter case a Bio::PrimarySeq object will be created. Implementors | ||||
336 | which really want to control how objects are created (eg, for object | ||||
337 | persistence over a database, or objects in a CORBA framework), they | ||||
338 | are encouraged to override these methods | ||||
339 | |||||
340 | =head2 revcom | ||||
341 | |||||
342 | Title : revcom | ||||
343 | Usage : $rev = $seq->revcom() | ||||
344 | Function: Produces a new Bio::PrimarySeqI implementing object which | ||||
345 | is the reversed complement of the sequence. For protein | ||||
346 | sequences this throws an exception of "Sequence is a | ||||
347 | protein. Cannot revcom". | ||||
348 | |||||
349 | The id is the same id as the original sequence, and the | ||||
350 | accession number is also indentical. If someone wants to | ||||
351 | track that this sequence has be reversed, it needs to | ||||
352 | define its own extensions. | ||||
353 | |||||
354 | To do an inplace edit of an object you can go: | ||||
355 | |||||
356 | $seq = $seq->revcom(); | ||||
357 | |||||
358 | This of course, causes Perl to handle the garbage | ||||
359 | collection of the old object, but it is roughly speaking as | ||||
360 | efficient as an inplace edit. | ||||
361 | |||||
362 | Returns : A new (fresh) Bio::PrimarySeqI object | ||||
363 | Args : None | ||||
364 | |||||
365 | |||||
366 | =cut | ||||
367 | |||||
368 | sub revcom { | ||||
369 | my ($self) = @_; | ||||
370 | |||||
371 | # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' | ||||
372 | # or 'Bio::Seq::LargeSeq', if not take advantage of | ||||
373 | # Bio::Root::clone to get an object copy | ||||
374 | my $out; | ||||
375 | if ( $self->isa('Bio::Seq::LargePrimarySeq') | ||||
376 | or $self->isa('Bio::Seq::LargeSeq') | ||||
377 | ) { | ||||
378 | my ($seqclass, $opts) = $self->_setup_class; | ||||
379 | $out = $seqclass->new( | ||||
380 | -seq => $self->_revcom_from_string($self->seq, $self->alphabet), | ||||
381 | -is_circular => $self->is_circular, | ||||
382 | -display_id => $self->display_id, | ||||
383 | -accession_number => $self->accession_number, | ||||
384 | -alphabet => $self->alphabet, | ||||
385 | -desc => $self->desc, | ||||
386 | -verbose => $self->verbose, | ||||
387 | %$opts, | ||||
388 | ); | ||||
389 | } else { | ||||
390 | $out = $self->clone; | ||||
391 | $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) ); | ||||
392 | } | ||||
393 | return $out; | ||||
394 | } | ||||
395 | |||||
396 | |||||
397 | sub _revcom_from_string { | ||||
398 | my ($self, $string, $alphabet) = @_; | ||||
399 | |||||
400 | # Check that reverse-complementing makes sense | ||||
401 | if( $alphabet eq 'protein' ) { | ||||
402 | $self->throw("Sequence is a protein. Cannot revcom."); | ||||
403 | } | ||||
404 | if( $alphabet ne 'dna' && $alphabet ne 'rna' ) { | ||||
405 | my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ". | ||||
406 | "but unsure if this is right."; | ||||
407 | if( $self->can('warn') ) { | ||||
408 | $self->warn($msg); | ||||
409 | } else { | ||||
410 | warn("[$self] $msg"); | ||||
411 | } | ||||
412 | } | ||||
413 | |||||
414 | # If sequence is RNA, map to DNA (then map back later) | ||||
415 | if( $alphabet eq 'rna' ) { | ||||
416 | $string =~ tr/uU/tT/; | ||||
417 | } | ||||
418 | |||||
419 | # Reverse-complement now | ||||
420 | $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; | ||||
421 | $string = CORE::reverse $string; | ||||
422 | |||||
423 | # Map back RNA to DNA | ||||
424 | if( $alphabet eq 'rna' ) { | ||||
425 | $string =~ tr/tT/uU/; | ||||
426 | } | ||||
427 | |||||
428 | return $string; | ||||
429 | } | ||||
430 | |||||
431 | |||||
432 | =head2 trunc | ||||
433 | |||||
434 | Title : trunc | ||||
435 | Usage : $subseq = $myseq->trunc(10,100); | ||||
436 | Function: Provides a truncation of a sequence. | ||||
437 | Returns : A fresh Bio::PrimarySeqI implementing object. | ||||
438 | Args : Two integers denoting first and last base of the sub-sequence. | ||||
439 | |||||
440 | |||||
441 | =cut | ||||
442 | |||||
443 | sub trunc { | ||||
444 | my ($self,$start,$end) = @_; | ||||
445 | |||||
446 | my $str; | ||||
447 | if( defined $start && ref($start) && | ||||
448 | $start->isa('Bio::LocationI') ) { | ||||
449 | $str = $self->subseq($start); # start is a location actually | ||||
450 | } elsif( !$end ) { | ||||
451 | $self->throw("trunc start,end -- there was no end for $start"); | ||||
452 | } elsif( $end < $start ) { | ||||
453 | my $msg = "start [$start] is greater than end [$end]. \n". | ||||
454 | "If you want to truncated and reverse complement, \n". | ||||
455 | "you must call trunc followed by revcom. Sorry."; | ||||
456 | $self->throw($msg); | ||||
457 | } else { | ||||
458 | $str = $self->subseq($start,$end); | ||||
459 | } | ||||
460 | |||||
461 | # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' | ||||
462 | # or 'Bio::Seq::LargeSeq', if not take advantage of | ||||
463 | # Bio::Root::clone to get an object copy | ||||
464 | my $out; | ||||
465 | if ( $self->isa('Bio::Seq::LargePrimarySeq') | ||||
466 | or $self->isa('Bio::Seq::LargeSeq') | ||||
467 | ) { | ||||
468 | my ($seqclass, $opts) = $self->_setup_class; | ||||
469 | $out = $seqclass->new( | ||||
470 | -seq => $str, | ||||
471 | -is_circular => $self->is_circular, | ||||
472 | -display_id => $self->display_id, | ||||
473 | -accession_number => $self->accession_number, | ||||
474 | -alphabet => $self->alphabet, | ||||
475 | -desc => $self->desc, | ||||
476 | -verbose => $self->verbose, | ||||
477 | %$opts, | ||||
478 | ); | ||||
479 | } else { | ||||
480 | $out = $self->clone; | ||||
481 | $out->seq($str); | ||||
482 | } | ||||
483 | return $out; | ||||
484 | } | ||||
485 | |||||
486 | |||||
487 | =head2 translate | ||||
488 | |||||
489 | Title : translate | ||||
490 | Usage : $protein_seq_obj = $dna_seq_obj->translate | ||||
491 | |||||
492 | Or if you expect a complete coding sequence (CDS) translation, | ||||
493 | with initiator at the beginning and terminator at the end: | ||||
494 | |||||
495 | $protein_seq_obj = $cds_seq_obj->translate(-complete => 1); | ||||
496 | |||||
497 | Or if you want translate() to find the first initiation | ||||
498 | codon and return the corresponding protein: | ||||
499 | |||||
500 | $protein_seq_obj = $cds_seq_obj->translate(-orf => 1); | ||||
501 | |||||
502 | Function: Provides the translation of the DNA sequence using full | ||||
503 | IUPAC ambiguities in DNA/RNA and amino acid codes. | ||||
504 | |||||
505 | The complete CDS translation is identical to EMBL/TREMBL | ||||
506 | database translation. Note that the trailing terminator | ||||
507 | character is removed before returning the translated protein | ||||
508 | object. | ||||
509 | |||||
510 | Note: if you set $dna_seq_obj->verbose(1) you will get a | ||||
511 | warning if the first codon is not a valid initiator. | ||||
512 | |||||
513 | Returns : A Bio::PrimarySeqI implementing object | ||||
514 | Args : -terminator | ||||
515 | character for terminator, default '*' | ||||
516 | -unknown | ||||
517 | character for unknown, default 'X' | ||||
518 | -frame | ||||
519 | positive integer frame shift (in bases), default 0 | ||||
520 | -codontable_id | ||||
521 | integer codon table id, default 1 | ||||
522 | -complete | ||||
523 | boolean, if true, complete CDS is expected. default false | ||||
524 | -complete_codons | ||||
525 | boolean, if true, codons which are incomplete are translated if a | ||||
526 | suitable amino acid is found. For instance, if the incomplete | ||||
527 | codon is 'GG', the completed codon is 'GGN', which is glycine | ||||
528 | (G). Defaults to 'false'; setting '-complete' also makes this | ||||
529 | true. | ||||
530 | -throw | ||||
531 | boolean, throw exception if ORF not complete, default false | ||||
532 | -orf | ||||
533 | if 'longest', find longest ORF. other true value, find | ||||
534 | first ORF. default 0 | ||||
535 | -codontable | ||||
536 | optional L<Bio::Tools::CodonTable> object to use for | ||||
537 | translation | ||||
538 | -start | ||||
539 | optional three-character string to force as initiation | ||||
540 | codon (e.g. 'atg'). If unset, start codons are | ||||
541 | determined by the CodonTable. Case insensitive. | ||||
542 | -offset | ||||
543 | optional positive integer offset for fuzzy locations. | ||||
544 | if set, must be either 1, 2, or 3 | ||||
545 | |||||
546 | =head3 Notes | ||||
547 | |||||
548 | The -start argument only applies when -orf is set to 1. By default all | ||||
549 | initiation codons found in the given codon table are used but when | ||||
550 | "start" is set to some codon this codon will be used exclusively as | ||||
551 | the initiation codon. Note that the default codon table (NCBI | ||||
552 | "Standard") has 3 initiation codons! | ||||
553 | |||||
554 | By default translate() translates termination codons to the some | ||||
555 | character (default is *), both internal and trailing codons. Setting | ||||
556 | "-complete" to 1 tells translate() to remove the trailing character. | ||||
557 | |||||
558 | -offset is used for seqfeatures which contain the the \codon_start tag | ||||
559 | and can be set to 1, 2, or 3. This is the offset by which the | ||||
560 | sequence translation starts relative to the first base of the feature | ||||
561 | |||||
562 | For details on codon tables used by translate() see L<Bio::Tools::CodonTable>. | ||||
563 | |||||
564 | Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an | ||||
565 | element in an array: | ||||
566 | |||||
567 | 1: character for terminator (optional), defaults to '*'. | ||||
568 | 2: character for unknown amino acid (optional), defaults to 'X'. | ||||
569 | 3: frame (optional), valid values are 0, 1, 2, defaults to 0. | ||||
570 | 4: codon table id (optional), defaults to 1. | ||||
571 | 5: complete coding sequence expected, defaults to 0 (false). | ||||
572 | 6: boolean, throw exception if not complete coding sequence | ||||
573 | (true), defaults to warning (false) | ||||
574 | 7: codontable, a custom Bio::Tools::CodonTable object (optional). | ||||
575 | |||||
576 | =cut | ||||
577 | |||||
578 | sub translate { | ||||
579 | my ($self,@args) = @_; | ||||
580 | my ($terminator, $unknown, $frame, $codonTableId, $complete, | ||||
581 | $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset); | ||||
582 | |||||
583 | ## new API with named parameters, post 1.5.1 | ||||
584 | if ($args[0] && $args[0] =~ /^-[A-Z]+/i) { | ||||
585 | ($terminator, $unknown, $frame, $codonTableId, $complete, | ||||
586 | $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) = | ||||
587 | $self->_rearrange([qw(TERMINATOR | ||||
588 | UNKNOWN | ||||
589 | FRAME | ||||
590 | CODONTABLE_ID | ||||
591 | COMPLETE | ||||
592 | COMPLETE_CODONS | ||||
593 | THROW | ||||
594 | CODONTABLE | ||||
595 | ORF | ||||
596 | START | ||||
597 | OFFSET)], @args); | ||||
598 | ## old API, 1.5.1 and preceding versions | ||||
599 | } else { | ||||
600 | ($terminator, $unknown, $frame, $codonTableId, | ||||
601 | $complete, $throw, $codonTable, $offset) = @args; | ||||
602 | } | ||||
603 | |||||
604 | ## Initialize termination codon, unknown codon, codon table id, frame | ||||
605 | $terminator = '*' unless (defined($terminator) and $terminator ne ''); | ||||
606 | $unknown = "X" unless (defined($unknown) and $unknown ne ''); | ||||
607 | $frame = 0 unless (defined($frame) and $frame ne ''); | ||||
608 | $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne ''); | ||||
609 | $complete_codons ||= $complete || 0; | ||||
610 | |||||
611 | ## Get a CodonTable, error if custom CodonTable is invalid | ||||
612 | if ($codonTable) { | ||||
613 | $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable) | ||||
614 | unless $codonTable->isa('Bio::Tools::CodonTable'); | ||||
615 | } else { | ||||
616 | |||||
617 | # shouldn't this be cached? Seems wasteful to have a new instance | ||||
618 | # every time... | ||||
619 | $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId); | ||||
620 | } | ||||
621 | |||||
622 | ## Error if alphabet is "protein" | ||||
623 | $self->throw("Can't translate an amino acid sequence.") if | ||||
624 | ($self->alphabet =~ /protein/i); | ||||
625 | |||||
626 | ## Error if -start parameter isn't a valid codon | ||||
627 | if ($start_codon) { | ||||
628 | $self->throw("Invalid start codon: $start_codon.") if | ||||
629 | ( $start_codon !~ /^[A-Z]{3}$/i ); | ||||
630 | } | ||||
631 | |||||
632 | my $seq; | ||||
633 | if ($offset) { | ||||
634 | $self->throw("Offset must be 1, 2, or 3.") if | ||||
635 | ( $offset !~ /^[123]$/ ); | ||||
636 | my ($start, $end) = ($offset, $self->length); | ||||
637 | ($seq) = $self->subseq($start, $end); | ||||
638 | } else { | ||||
639 | ($seq) = $self->seq(); | ||||
640 | } | ||||
641 | |||||
642 | ## ignore frame if an ORF is supposed to be found | ||||
643 | if ( $orf ) { | ||||
644 | my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' ); | ||||
645 | $seq = $self->_orf_sequence( $seq, $orf_region ); | ||||
646 | } else { | ||||
647 | ## use frame, error if frame is not 0, 1 or 2 | ||||
648 | $self->throw("Valid values for frame are 0, 1, or 2, not $frame.") | ||||
649 | unless ($frame == 0 or $frame == 1 or $frame == 2); | ||||
650 | $seq = substr($seq,$frame); | ||||
651 | } | ||||
652 | |||||
653 | ## Translate it | ||||
654 | my $output = $codonTable->translate($seq, $complete_codons); | ||||
655 | # Use user-input terminator/unknown | ||||
656 | $output =~ s/\*/$terminator/g; | ||||
657 | $output =~ s/X/$unknown/g; | ||||
658 | |||||
659 | ## Only if we are expecting to translate a complete coding region | ||||
660 | if ($complete) { | ||||
661 | my $id = $self->display_id; | ||||
662 | # remove the terminator character | ||||
663 | if( substr($output,-1,1) eq $terminator ) { | ||||
664 | chop $output; | ||||
665 | } else { | ||||
666 | $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!"); | ||||
667 | $self->warn("Seq [$id]: Not using a valid terminator codon!"); | ||||
668 | } | ||||
669 | # test if there are terminator characters inside the protein sequence! | ||||
670 | if ($output =~ /\Q$terminator\E/) { | ||||
671 | $id ||= ''; | ||||
672 | $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!"); | ||||
673 | $self->warn("Seq [$id]: Terminator codon inside CDS!"); | ||||
674 | } | ||||
675 | # if the initiator codon is not ATG, the amino acid needs to be changed to M | ||||
676 | if ( substr($output,0,1) ne 'M' ) { | ||||
677 | if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) { | ||||
678 | $output = 'M'. substr($output,1); | ||||
679 | } elsif ($throw) { | ||||
680 | $self->throw("Seq [$id]: Not using a valid initiator codon!"); | ||||
681 | } else { | ||||
682 | $self->warn("Seq [$id]: Not using a valid initiator codon!"); | ||||
683 | } | ||||
684 | } | ||||
685 | } | ||||
686 | |||||
687 | # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' | ||||
688 | # or 'Bio::Seq::LargeSeq', if not take advantage of | ||||
689 | # Bio::Root::clone to get an object copy | ||||
690 | my $out; | ||||
691 | if ( $self->isa('Bio::Seq::LargePrimarySeq') | ||||
692 | or $self->isa('Bio::Seq::LargeSeq') | ||||
693 | ) { | ||||
694 | my ($seqclass, $opts) = $self->_setup_class; | ||||
695 | $out = $seqclass->new( | ||||
696 | -seq => $output, | ||||
697 | -is_circular => $self->is_circular, | ||||
698 | -display_id => $self->display_id, | ||||
699 | -accession_number => $self->accession_number, | ||||
700 | -alphabet => 'protein', | ||||
701 | -desc => $self->desc, | ||||
702 | -verbose => $self->verbose, | ||||
703 | %$opts, | ||||
704 | ); | ||||
705 | } else { | ||||
706 | $out = $self->clone; | ||||
707 | $out->seq($output); | ||||
708 | $out->alphabet('protein'); | ||||
709 | } | ||||
710 | return $out; | ||||
711 | } | ||||
712 | |||||
713 | |||||
714 | =head2 transcribe() | ||||
715 | |||||
716 | Title : transcribe | ||||
717 | Usage : $xseq = $seq->transcribe | ||||
718 | Function: Convert base T to base U | ||||
719 | Returns : PrimarySeqI object of alphabet 'rna' or | ||||
720 | undef if $seq->alphabet ne 'dna' | ||||
721 | Args : | ||||
722 | |||||
723 | =cut | ||||
724 | |||||
725 | sub transcribe { | ||||
726 | my $self = shift; | ||||
727 | return unless $self->alphabet eq 'dna'; | ||||
728 | my $s = $self->seq; | ||||
729 | $s =~ tr/tT/uU/; | ||||
730 | my $desc = $self->desc || ''; | ||||
731 | |||||
732 | # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' | ||||
733 | # or 'Bio::Seq::LargeSeq', if not take advantage of | ||||
734 | # Bio::Root::clone to get an object copy | ||||
735 | my $out; | ||||
736 | if ( $self->isa('Bio::Seq::LargePrimarySeq') | ||||
737 | or $self->isa('Bio::Seq::LargeSeq') | ||||
738 | ) { | ||||
739 | my ($seqclass, $opts) = $self->_setup_class; | ||||
740 | $out = $seqclass->new( | ||||
741 | -seq => $s, | ||||
742 | -is_circular => $self->is_circular, | ||||
743 | -display_id => $self->display_id, | ||||
744 | -accession_number => $self->accession_number, | ||||
745 | -alphabet => 'rna', | ||||
746 | -desc => "${desc}[TRANSCRIBED]", | ||||
747 | -verbose => $self->verbose, | ||||
748 | %$opts, | ||||
749 | ); | ||||
750 | } else { | ||||
751 | $out = $self->clone; | ||||
752 | $out->seq($s); | ||||
753 | $out->alphabet('rna'); | ||||
754 | $out->desc($desc . "[TRANSCRIBED]"); | ||||
755 | } | ||||
756 | return $out; | ||||
757 | } | ||||
758 | |||||
759 | |||||
760 | =head2 rev_transcribe() | ||||
761 | |||||
762 | Title : rev_transcribe | ||||
763 | Usage : $rtseq = $seq->rev_transcribe | ||||
764 | Function: Convert base U to base T | ||||
765 | Returns : PrimarySeqI object of alphabet 'dna' or | ||||
766 | undef if $seq->alphabet ne 'rna' | ||||
767 | Args : | ||||
768 | |||||
769 | =cut | ||||
770 | |||||
771 | sub rev_transcribe { | ||||
772 | my $self = shift; | ||||
773 | return unless $self->alphabet eq 'rna'; | ||||
774 | my $s = $self->seq; | ||||
775 | $s =~ tr/uU/tT/; | ||||
776 | my $desc = $self->desc || ''; | ||||
777 | |||||
778 | # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' | ||||
779 | # or 'Bio::Seq::LargeSeq', if not take advantage of | ||||
780 | # Bio::Root::clone to get an object copy | ||||
781 | my $out; | ||||
782 | if ( $self->isa('Bio::Seq::LargePrimarySeq') | ||||
783 | or $self->isa('Bio::Seq::LargeSeq') | ||||
784 | ) { | ||||
785 | my ($seqclass, $opts) = $self->_setup_class; | ||||
786 | $out = $seqclass->new( | ||||
787 | -seq => $s, | ||||
788 | -is_circular => $self->is_circular, | ||||
789 | -display_id => $self->display_id, | ||||
790 | -accession_number => $self->accession_number, | ||||
791 | -alphabet => 'dna', | ||||
792 | -desc => $self->desc . "[REVERSE TRANSCRIBED]", | ||||
793 | -verbose => $self->verbose, | ||||
794 | %$opts, | ||||
795 | ); | ||||
796 | } else { | ||||
797 | $out = $self->clone; | ||||
798 | $out->seq($s); | ||||
799 | $out->alphabet('dna'); | ||||
800 | $out->desc($desc . "[REVERSE TRANSCRIBED]"); | ||||
801 | } | ||||
802 | return $out; | ||||
803 | } | ||||
804 | |||||
805 | |||||
806 | =head2 id | ||||
807 | |||||
808 | Title : id | ||||
809 | Usage : $id = $seq->id() | ||||
810 | Function: ID of the sequence. This should normally be (and actually is in | ||||
811 | the implementation provided here) just a synonym for display_id(). | ||||
812 | Returns : A string. | ||||
813 | Args : | ||||
814 | |||||
815 | =cut | ||||
816 | |||||
817 | sub id { | ||||
818 | my ($self)= @_; | ||||
819 | return $self->display_id(); | ||||
820 | } | ||||
821 | |||||
822 | |||||
823 | =head2 length | ||||
824 | |||||
825 | Title : length | ||||
826 | Usage : $len = $seq->length() | ||||
827 | Function: | ||||
828 | Returns : Integer representing the length of the sequence. | ||||
829 | Args : | ||||
830 | |||||
831 | =cut | ||||
832 | |||||
833 | sub length { | ||||
834 | my ($self)= @_; | ||||
835 | $self->throw_not_implemented(); | ||||
836 | } | ||||
837 | |||||
838 | |||||
839 | =head2 desc | ||||
840 | |||||
841 | Title : desc | ||||
842 | Usage : $seq->desc($newval); | ||||
843 | $description = $seq->desc(); | ||||
844 | Function: Get/set description text for a seq object | ||||
845 | Returns : Value of desc | ||||
846 | Args : newvalue (optional) | ||||
847 | |||||
848 | =cut | ||||
849 | |||||
850 | sub desc { | ||||
851 | shift->throw_not_implemented(); | ||||
852 | } | ||||
853 | |||||
854 | |||||
855 | =head2 is_circular | ||||
856 | |||||
857 | Title : is_circular | ||||
858 | Usage : if( $obj->is_circular) { # Do something } | ||||
859 | Function: Returns true if the molecule is circular | ||||
860 | Returns : Boolean value | ||||
861 | Args : none | ||||
862 | |||||
863 | =cut | ||||
864 | |||||
865 | sub is_circular { | ||||
866 | shift->throw_not_implemented; | ||||
867 | } | ||||
868 | |||||
869 | |||||
870 | =head1 Private functions | ||||
871 | |||||
872 | These are some private functions for the PrimarySeqI interface. You do not | ||||
873 | need to implement these functions | ||||
874 | |||||
875 | =head2 _find_orfs_nucleotide | ||||
876 | |||||
877 | Title : _find_orfs_nucleotide | ||||
878 | Usage : | ||||
879 | Function: Finds ORF starting at 1st initiation codon in nucleotide sequence. | ||||
880 | The ORF is not required to have a termination codon. | ||||
881 | Example : | ||||
882 | Returns : a list of string coordinates of ORF locations (0-based half-open), | ||||
883 | sorted descending by length (so that the longest is first) | ||||
884 | as: [ start, end, frame, length ], [ start, end, frame, length ], ... | ||||
885 | Args : Nucleotide sequence, | ||||
886 | CodonTable object, | ||||
887 | (optional) alternative initiation codon (e.g. 'ATA'), | ||||
888 | (optional) boolean that, if true, stops after finding the | ||||
889 | first available ORF | ||||
890 | |||||
891 | =cut | ||||
892 | |||||
893 | sub _find_orfs_nucleotide { | ||||
894 | my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_; | ||||
895 | $sequence = uc $sequence; | ||||
896 | $start_codon = uc $start_codon if $start_codon; | ||||
897 | |||||
898 | my $is_start = $start_codon | ||||
899 | ? sub { shift eq $start_codon } | ||||
900 | : sub { $codon_table->is_start_codon( shift ) }; | ||||
901 | |||||
902 | # stores the begin index of the currently-running ORF in each | ||||
903 | # reading frame | ||||
904 | my @current_orf_start = (-1,-1,-1); | ||||
905 | |||||
906 | #< stores coordinates of longest observed orf (so far) in each | ||||
907 | # reading frame | ||||
908 | my @orfs; | ||||
909 | |||||
910 | # go through each base of the sequence, and each reading frame for each base | ||||
911 | my $seqlen = CORE::length $sequence; | ||||
912 | for( my $j = 0; $j <= $seqlen-3; $j++ ) { | ||||
913 | my $frame = $j % 3; | ||||
914 | |||||
915 | my $this_codon = substr( $sequence, $j, 3 ); | ||||
916 | |||||
917 | # if in an orf and this is either a stop codon or the last in-frame codon in the string | ||||
918 | if ( $current_orf_start[$frame] >= 0 ) { | ||||
919 | if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) { | ||||
920 | # record ORF start, end (half-open), length, and frame | ||||
921 | my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame ); | ||||
922 | my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] ); | ||||
923 | |||||
924 | $self->warn( "Translating partial ORF " | ||||
925 | .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf )) | ||||
926 | .' from end of nucleotide sequence' | ||||
927 | ) | ||||
928 | if $first_only && $is_last_codon_in_frame; | ||||
929 | |||||
930 | return \@this_orf if $first_only; | ||||
931 | push @orfs, \@this_orf; | ||||
932 | $current_orf_start[$frame] = -1; | ||||
933 | } | ||||
934 | } | ||||
935 | # if this is a start codon | ||||
936 | elsif ( $is_start->($this_codon) ) { | ||||
937 | $current_orf_start[$frame] = $j; | ||||
938 | } | ||||
939 | } | ||||
940 | |||||
941 | return sort { $b->[2] <=> $a->[2] } @orfs; | ||||
942 | } | ||||
943 | |||||
944 | |||||
945 | sub _truncate_seq { | ||||
946 | my ($self, $seq) = @_; | ||||
947 | return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq; | ||||
948 | } | ||||
949 | |||||
950 | |||||
951 | sub _orf_sequence { | ||||
952 | my ($self, $seq, $orf ) = @_; | ||||
953 | return '' unless $orf; | ||||
954 | return substr( $seq, $orf->[0], $orf->[2] ) | ||||
955 | } | ||||
956 | |||||
957 | |||||
958 | =head2 _attempt_to_load_Seq | ||||
959 | |||||
960 | Title : _attempt_to_load_Seq | ||||
961 | Usage : | ||||
962 | Function: | ||||
963 | Example : | ||||
964 | Returns : | ||||
965 | Args : | ||||
966 | |||||
967 | =cut | ||||
968 | |||||
969 | sub _attempt_to_load_Seq { | ||||
970 | my ($self) = @_; | ||||
971 | |||||
972 | if( $main::{'Bio::PrimarySeq'} ) { | ||||
973 | return 1; | ||||
974 | } else { | ||||
975 | eval { | ||||
976 | require Bio::PrimarySeq; | ||||
977 | }; | ||||
978 | if( $@ ) { | ||||
979 | my $text = "Bio::PrimarySeq could not be loaded for [$self]\n". | ||||
980 | "This indicates that you are using Bio::PrimarySeqI ". | ||||
981 | "without Bio::PrimarySeq loaded or without providing a ". | ||||
982 | "complete implementation.\nThe most likely problem is that there ". | ||||
983 | "has been a misconfiguration of the bioperl environment\n". | ||||
984 | "Actual exception:\n\n"; | ||||
985 | $self->throw("$text$@\n"); | ||||
986 | return 0; | ||||
987 | } | ||||
988 | return 1; | ||||
989 | } | ||||
990 | } | ||||
991 | |||||
992 | |||||
993 | sub _setup_class { | ||||
994 | # Return name of class and setup some default parameters | ||||
995 | my ($self) = @_; | ||||
996 | my $seqclass; | ||||
997 | if ($self->can_call_new()) { | ||||
998 | $seqclass = ref($self); | ||||
999 | } else { | ||||
1000 | $seqclass = 'Bio::PrimarySeq'; | ||||
1001 | $self->_attempt_to_load_Seq(); | ||||
1002 | } | ||||
1003 | my %opts; | ||||
1004 | if ($seqclass eq 'Bio::PrimarySeq') { | ||||
1005 | # Since sequence is in a Seq object, it has already been validated. | ||||
1006 | # We do not need to validate its trunc(), revcom(), etc | ||||
1007 | $opts{ -direct } = 1; | ||||
1008 | } | ||||
1009 | return $seqclass, \%opts; | ||||
1010 | } | ||||
1011 | |||||
1012 | |||||
1013 | 1 | 4µs | 1; |