Filename | /Users/ap13/perl5/lib/perl5/Bio/LocatableSeq.pm |
Statements | Executed 16 statements in 2.45ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 1.97ms | 2.55ms | BEGIN@106 | Bio::LocatableSeq::
1 | 1 | 1 | 1.03ms | 7.28ms | BEGIN@105 | Bio::LocatableSeq::
1 | 1 | 1 | 15µs | 28µs | BEGIN@103 | Bio::LocatableSeq::
1 | 1 | 1 | 12µs | 98µs | BEGIN@107 | Bio::LocatableSeq::
1 | 1 | 1 | 11µs | 19.7ms | BEGIN@120 | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | _ungapped_len | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | column_from_residue_number | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | end | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | force_nse | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | frameshifts | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | get_nse | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | location_from_column | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | mapping | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | new | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | no_gaps | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | no_sequences | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | num_gaps | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | revcom | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | start | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | strand | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | trunc | Bio::LocatableSeq::
0 | 0 | 0 | 0s | 0s | validate_seq | Bio::LocatableSeq::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # | ||||
2 | # BioPerl module for Bio::LocatableSeq | ||||
3 | # | ||||
4 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
5 | # | ||||
6 | # Cared for by Ewan Birney <birney@ebi.ac.uk> | ||||
7 | # | ||||
8 | # Copyright Ewan Birney | ||||
9 | # | ||||
10 | # You may distribute this module under the same terms as perl itself | ||||
11 | |||||
12 | # POD documentation - main docs before the code | ||||
13 | |||||
14 | =head1 NAME | ||||
15 | |||||
16 | Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it | ||||
17 | that can be projected into a MSA or have coordinates relative to | ||||
18 | another seq. | ||||
19 | |||||
20 | =head1 SYNOPSIS | ||||
21 | |||||
22 | use Bio::LocatableSeq; | ||||
23 | my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT", | ||||
24 | -id => "seq1", | ||||
25 | -start => 1, | ||||
26 | -end => 7); | ||||
27 | |||||
28 | # a normal sequence object | ||||
29 | $locseq->seq(); | ||||
30 | $locseq->id(); | ||||
31 | |||||
32 | # has start,end points | ||||
33 | $locseq->start(); | ||||
34 | $locseq->end(); | ||||
35 | |||||
36 | # inherits off RangeI, so range operations possible | ||||
37 | |||||
38 | =head1 DESCRIPTION | ||||
39 | |||||
40 | The LocatableSeq sequence object was developed mainly because the SimpleAlign | ||||
41 | object requires this functionality, and in the rewrite of the Sequence object we | ||||
42 | had to decide what to do with this. | ||||
43 | |||||
44 | It is, to be honest, not well integrated with the rest of bioperl. For example, | ||||
45 | the trunc() function does not return a LocatableSeq object, as some might have | ||||
46 | thought. Also, the sequence is not a Bio::SeqI, so the location is simply | ||||
47 | inherited from Bio::RangeI and is not stored in a Bio::Location. | ||||
48 | |||||
49 | There are all sorts of nasty gotcha's about interactions between coordinate | ||||
50 | systems when these sort of objects are used. Some mapping now occurs to deal | ||||
51 | with HSP data, however it can probably be integrated in better and most methods | ||||
52 | do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(), | ||||
53 | trunc(), etc.) do not behave as expected and must be used with care. Due to this, | ||||
54 | LocatableSeq functionality is to be refactored in a future BioPerl release. | ||||
55 | However, for alignment functionality it works adequately for the time being. | ||||
56 | |||||
57 | If you do not need alignment functionality, L<Bio::SeqfeatureI>-implementing | ||||
58 | modules may be a suitable alternative to L<Bio::LocatableSeq>. For example, | ||||
59 | L<Bio::SeqFeature::Generic> and L<Bio::SeqFeature::Lite> provide methods to | ||||
60 | attach a sequence to a specific region of a parent sequence and to set other | ||||
61 | useful attributes. | ||||
62 | |||||
63 | =head1 FEEDBACK | ||||
64 | |||||
65 | =head2 Mailing Lists | ||||
66 | |||||
67 | User feedback is an integral part of the evolution of this and other | ||||
68 | Bioperl modules. Send your comments and suggestions preferably to one | ||||
69 | of the Bioperl mailing lists. Your participation is much appreciated. | ||||
70 | |||||
71 | bioperl-l@bioperl.org - General discussion | ||||
72 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
73 | |||||
74 | =head2 Support | ||||
75 | |||||
76 | Please direct usage questions or support issues to the mailing list: | ||||
77 | |||||
78 | I<bioperl-l@bioperl.org> | ||||
79 | |||||
80 | rather than to the module maintainer directly. Many experienced and | ||||
81 | reponsive experts will be able look at the problem and quickly | ||||
82 | address it. Please include a thorough description of the problem | ||||
83 | with code and data examples if at all possible. | ||||
84 | |||||
85 | =head2 Reporting Bugs | ||||
86 | |||||
87 | Report bugs to the Bioperl bug tracking system to help us keep track | ||||
88 | the bugs and their resolution. Bug reports can be submitted via the | ||||
89 | web: | ||||
90 | |||||
91 | https://github.com/bioperl/bioperl-live/issues | ||||
92 | |||||
93 | =head1 APPENDIX | ||||
94 | |||||
95 | The rest of the documentation details each of the object | ||||
96 | methods. Internal methods are usually preceded with a _ | ||||
97 | |||||
98 | =cut | ||||
99 | |||||
- - | |||||
102 | package Bio::LocatableSeq; | ||||
103 | 2 | 23µs | 2 | 40µs | # spent 28µs (15+12) within Bio::LocatableSeq::BEGIN@103 which was called:
# once (15µs+12µs) by Bio::Tools::GFF::BEGIN@151 at line 103 # spent 28µs making 1 call to Bio::LocatableSeq::BEGIN@103
# spent 12µs making 1 call to strict::import |
104 | |||||
105 | 2 | 143µs | 1 | 7.28ms | # spent 7.28ms (1.03+6.24) within Bio::LocatableSeq::BEGIN@105 which was called:
# once (1.03ms+6.24ms) by Bio::Tools::GFF::BEGIN@151 at line 105 # spent 7.28ms making 1 call to Bio::LocatableSeq::BEGIN@105 |
106 | 2 | 179µs | 1 | 2.55ms | # spent 2.55ms (1.97+577µs) within Bio::LocatableSeq::BEGIN@106 which was called:
# once (1.97ms+577µs) by Bio::Tools::GFF::BEGIN@151 at line 106 # spent 2.55ms making 1 call to Bio::LocatableSeq::BEGIN@106 |
107 | 2 | 69µs | 2 | 184µs | # spent 98µs (12+86) within Bio::LocatableSeq::BEGIN@107 which was called:
# once (12µs+86µs) by Bio::Tools::GFF::BEGIN@151 at line 107 # spent 98µs making 1 call to Bio::LocatableSeq::BEGIN@107
# spent 86µs making 1 call to vars::import |
108 | |||||
109 | # The following global variables contain symbols used to represent gaps, | ||||
110 | # frameshifts, residues, and other valid symbols. These are set at compile-time; | ||||
111 | # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see | ||||
112 | # LocatableSeq.t) | ||||
113 | |||||
114 | 1 | 800ns | $GAP_SYMBOLS = '\-\.=~'; | ||
115 | 1 | 300ns | $FRAMESHIFT_SYMBOLS = '\\\/'; | ||
116 | 1 | 300ns | $OTHER_SYMBOLS = '\?'; | ||
117 | 1 | 300ns | $RESIDUE_SYMBOLS = '0-9A-Za-z\*'; | ||
118 | 1 | 1µs | $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS; | ||
119 | |||||
120 | 2 | 2.02ms | 2 | 39.3ms | # spent 19.7ms (11µs+19.6) within Bio::LocatableSeq::BEGIN@120 which was called:
# once (11µs+19.6ms) by Bio::Tools::GFF::BEGIN@151 at line 120 # spent 19.7ms making 1 call to Bio::LocatableSeq::BEGIN@120
# spent 19.6ms making 1 call to base::import |
121 | |||||
122 | |||||
123 | sub new { | ||||
124 | my ($class, @args) = @_; | ||||
125 | my $self = $class->SUPER::new(@args); | ||||
126 | |||||
127 | my ($start,$end,$strand, $mapping, $fs, $nse) = | ||||
128 | $self->_rearrange( [qw(START | ||||
129 | END | ||||
130 | STRAND | ||||
131 | MAPPING | ||||
132 | FRAMESHIFTS | ||||
133 | FORCE_NSE | ||||
134 | )], | ||||
135 | @args); | ||||
136 | |||||
137 | $mapping ||= [1,1]; | ||||
138 | $self->mapping($mapping); | ||||
139 | $nse || 0; | ||||
140 | $self->force_nse($nse); | ||||
141 | defined $fs && $self->frameshifts($fs); | ||||
142 | defined $start && $self->start($start); | ||||
143 | defined $end && $self->end($end); | ||||
144 | defined $strand && $self->strand($strand); | ||||
145 | |||||
146 | return $self; # success - we hope! | ||||
147 | } | ||||
148 | |||||
149 | |||||
150 | =head2 start | ||||
151 | |||||
152 | Title : start | ||||
153 | Usage : $obj->start($newval) | ||||
154 | Function: Get/set the 1-based start position of this sequence in the original | ||||
155 | sequence. '0' means before the original sequence starts. | ||||
156 | Returns : value of start | ||||
157 | Args : newvalue (optional) | ||||
158 | |||||
159 | =cut | ||||
160 | |||||
161 | sub start { | ||||
162 | my $self = shift; | ||||
163 | if( @_ ) { | ||||
164 | my $value = shift; | ||||
165 | $self->{'start'} = $value; | ||||
166 | } | ||||
167 | return $self->{'start'} if defined $self->{'start'}; | ||||
168 | return 1 if $self->seq; | ||||
169 | return; | ||||
170 | } | ||||
171 | |||||
172 | |||||
173 | =head2 end | ||||
174 | |||||
175 | Title : end | ||||
176 | Usage : $obj->end($newval) | ||||
177 | Function: Get/set the 1-based end position of this sequence in the original | ||||
178 | sequence. '0' means before the original sequence starts. | ||||
179 | Returns : value of end | ||||
180 | Args : newvalue (optional) | ||||
181 | Note : although this is a get/set, it checks passed values against the | ||||
182 | calculated end point ( derived from the sequence and based on | ||||
183 | $GAP_SYMBOLS and possible frameshifts() ). If there is no match, | ||||
184 | it will warn and set the proper value. Probably best used for | ||||
185 | debugging proper sequence calculations. | ||||
186 | |||||
187 | =cut | ||||
188 | |||||
189 | sub end { | ||||
190 | my $self = shift; | ||||
191 | if( @_ ) { | ||||
192 | my $value = shift; | ||||
193 | my $st = $self->start; | ||||
194 | # start of 0 usually means the sequence is all gaps but maps to | ||||
195 | # other sequences in an alignment | ||||
196 | if ($self->seq && $st != 0 ) { | ||||
197 | my $len = $self->_ungapped_len; | ||||
198 | my $calend = $st + $len - 1; | ||||
199 | my $id = $self->id || 'unknown'; | ||||
200 | if ($calend != $value) { | ||||
201 | $self->warn("In sequence $id residue count gives end value ". | ||||
202 | "$calend. \nOverriding value [$value] with value $calend for ". | ||||
203 | "Bio::LocatableSeq::end().\n".$self->seq); | ||||
204 | $value = $calend; | ||||
205 | } | ||||
206 | } | ||||
207 | $self->{'end'} = $value; | ||||
208 | } | ||||
209 | |||||
210 | if (defined $self->{'end'}) { | ||||
211 | return $self->{'end'} | ||||
212 | } elsif ( my $len = $self->_ungapped_len) { | ||||
213 | return $len + $self->start - 1; | ||||
214 | } else { | ||||
215 | return; | ||||
216 | } | ||||
217 | } | ||||
218 | |||||
219 | |||||
220 | # changed 08.10.26 to return ungapped length, not the calculated end | ||||
221 | # of the sequence | ||||
222 | sub _ungapped_len { | ||||
223 | my $self = shift; | ||||
224 | return unless my $string = $self->seq; | ||||
225 | my ($map_res, $map_coord) = $self->mapping; | ||||
226 | my $offset = 0; | ||||
227 | if (my %data = $self->frameshifts) { | ||||
228 | map {$offset += $_} values %data; | ||||
229 | } | ||||
230 | $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g; | ||||
231 | return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res); | ||||
232 | } | ||||
233 | |||||
234 | #sub length { | ||||
235 | # my $self = shift; | ||||
236 | # return unless my $string = $self->seq; | ||||
237 | # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g; | ||||
238 | # return CORE::length($string); | ||||
239 | #} | ||||
240 | |||||
241 | |||||
242 | =head2 strand | ||||
243 | |||||
244 | Title : strand | ||||
245 | Usage : $obj->strand($newval) | ||||
246 | Function: return or set the strandedness | ||||
247 | Returns : the value of the strandedness (-1, 0 or 1) | ||||
248 | Args : the value of the strandedness (-1, 0 or 1) | ||||
249 | |||||
250 | =cut | ||||
251 | |||||
252 | sub strand { | ||||
253 | my $self = shift; | ||||
254 | if( @_ ) { | ||||
255 | my $value = shift; | ||||
256 | $self->{'strand'} = $value; | ||||
257 | } | ||||
258 | return $self->{'strand'}; | ||||
259 | } | ||||
260 | |||||
261 | |||||
262 | =head2 mapping | ||||
263 | |||||
264 | Title : mapping | ||||
265 | Usage : $obj->mapping($newval) | ||||
266 | Function: return or set the mapping indices (indicates # symbols/positions in | ||||
267 | the source string mapping to # of coordinate positions) | ||||
268 | Returns : two-element array (# symbols => # coordinate pos) | ||||
269 | Args : two elements (# symbols => # coordinate pos); this can also be | ||||
270 | passed in as an array reference of the two elements (as might be | ||||
271 | passed upon Bio::LocatableSeq instantiation, for instance). | ||||
272 | |||||
273 | =cut | ||||
274 | |||||
275 | sub mapping { | ||||
276 | my $self = shift; | ||||
277 | if( @_ ) { | ||||
278 | my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_; | ||||
279 | $self->throw("Must pass two values (# residues mapped to # positions)") | ||||
280 | if @mapping != 2; | ||||
281 | if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) { | ||||
282 | $self->throw("Mapping values other than 1 or 3 are not currently supported") | ||||
283 | } | ||||
284 | $self->{'_mapping'} = \@mapping; | ||||
285 | } | ||||
286 | $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'}; | ||||
287 | return @{ $self->{'_mapping'} }; | ||||
288 | } | ||||
289 | |||||
290 | |||||
291 | =head2 frameshifts | ||||
292 | |||||
293 | Title : frameshifts | ||||
294 | Usage : $obj->frameshifts($newval) | ||||
295 | Function: get/set the frameshift hash, which contains sequence positions as | ||||
296 | keys and the shift (-2, -1, 1, 2) as the value | ||||
297 | Returns : hash | ||||
298 | Args : hash or hash reference | ||||
299 | |||||
300 | =cut | ||||
301 | |||||
302 | sub frameshifts { | ||||
303 | my $self = shift; | ||||
304 | if( @_ ) { | ||||
305 | if (ref $_[0] eq 'HASH') { | ||||
306 | $self->{_frameshifts} = $_[0]; | ||||
307 | } else { | ||||
308 | # assume this is a full list to be converted to a hash | ||||
309 | $self->{_frameshifts} = \%{@_} # coerce into hash ref | ||||
310 | } | ||||
311 | } | ||||
312 | (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ? | ||||
313 | return %{$self->{_frameshifts}} : return (); | ||||
314 | } | ||||
315 | |||||
316 | |||||
317 | =head2 get_nse | ||||
318 | |||||
319 | Title : get_nse | ||||
320 | Usage : | ||||
321 | Function: read-only name of form id/start-end | ||||
322 | Example : | ||||
323 | Returns : | ||||
324 | Args : | ||||
325 | |||||
326 | =cut | ||||
327 | |||||
328 | sub get_nse { | ||||
329 | my ($self,$char1,$char2) = @_; | ||||
330 | |||||
331 | $char1 ||= "/"; | ||||
332 | $char2 ||= "-"; | ||||
333 | |||||
334 | my ($id, $st, $end, $strand) = ($self->id(), $self->start(), | ||||
335 | $self->end(), $self->strand || 0); | ||||
336 | |||||
337 | if ($self->force_nse) { | ||||
338 | $id ||= ''; | ||||
339 | $st ||= 0; | ||||
340 | $end ||= 0; | ||||
341 | } | ||||
342 | |||||
343 | $self->throw("Attribute id not set") unless defined($id); | ||||
344 | $self->throw("Attribute start not set") unless defined($st); | ||||
345 | $self->throw("Attribute end not set") unless defined($end); | ||||
346 | |||||
347 | if ($strand && $strand == -1) { | ||||
348 | ($st, $end) = ($end, $st); | ||||
349 | } | ||||
350 | |||||
351 | #Stockholm Rfam includes version if present so it is optional | ||||
352 | my $v = $self->version ? '.'.$self->version : ''; | ||||
353 | return join('',$id, $v, $char1, $st, $char2, $end); | ||||
354 | } | ||||
355 | |||||
356 | |||||
357 | =head2 force_nse | ||||
358 | |||||
359 | Title : force_nse | ||||
360 | Usage : $ls->force_nse() | ||||
361 | Function: Boolean which forces get_nse() to build an NSE, regardless | ||||
362 | of whether id(), start(), or end() is set | ||||
363 | Returns : Boolean value | ||||
364 | Args : (optional) Boolean (1 or 0) | ||||
365 | Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0 | ||||
366 | respectively | ||||
367 | |||||
368 | =cut | ||||
369 | |||||
370 | sub force_nse { | ||||
371 | my ($self, $flag) = @_; | ||||
372 | if (defined $flag) { | ||||
373 | $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0); | ||||
374 | } | ||||
375 | return $self->{'_force_nse'}; | ||||
376 | } | ||||
377 | |||||
378 | |||||
379 | =head2 num_gaps | ||||
380 | |||||
381 | Title : num_gaps | ||||
382 | Usage :$self->num_gaps('.') | ||||
383 | Function:Gets number of gaps in the sequence. The count excludes | ||||
384 | leading or trailing gap characters. | ||||
385 | |||||
386 | Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of | ||||
387 | these, '.' and '-' are counted as gap characters unless an | ||||
388 | optional argument specifies one of them. | ||||
389 | |||||
390 | Returns : number of internal gaps in the sequence. | ||||
391 | Args : a gap character (optional) | ||||
392 | Status : Stable | ||||
393 | Note : replaces no_gaps | ||||
394 | |||||
395 | =cut | ||||
396 | |||||
397 | sub num_gaps { | ||||
398 | my ($self,$char) = @_; | ||||
399 | my ($seq, $count) = (undef, 0); | ||||
400 | |||||
401 | # default gap characters | ||||
402 | $char ||= $GAP_SYMBOLS; | ||||
403 | |||||
404 | $self->warn("I hope you know what you are doing setting gap to [$char]") | ||||
405 | unless $char =~ /[$GAP_SYMBOLS]/; | ||||
406 | |||||
407 | $seq = $self->seq; | ||||
408 | return 0 unless $seq; # empty sequence does not have gaps | ||||
409 | |||||
410 | $seq =~ s/^([$char]+)//; | ||||
411 | $seq =~ s/([$char]+)$//; | ||||
412 | while ( $seq =~ /[$char]+/g ) { | ||||
413 | $count++; | ||||
414 | } | ||||
415 | |||||
416 | return $count; | ||||
417 | } | ||||
418 | |||||
419 | |||||
420 | =head2 column_from_residue_number | ||||
421 | |||||
422 | Title : column_from_residue_number | ||||
423 | Usage : $col = $seq->column_from_residue_number($resnumber) | ||||
424 | Function: | ||||
425 | |||||
426 | This function gives the position in the alignment | ||||
427 | (i.e. column number) of the given residue number in the | ||||
428 | sequence. For example, for the sequence | ||||
429 | |||||
430 | Seq1/91-97 AC..DEF.GH | ||||
431 | |||||
432 | column_from_residue_number(94) returns 6. | ||||
433 | |||||
434 | An exception is thrown if the residue number would lie | ||||
435 | outside the length of the aligment | ||||
436 | (e.g. column_from_residue_number( "Seq2", 22 ) | ||||
437 | |||||
438 | Returns : A column number for the position of the | ||||
439 | given residue in the given sequence (1 = first column) | ||||
440 | Args : A residue number in the whole sequence (not just that | ||||
441 | segment of it in the alignment) | ||||
442 | |||||
443 | =cut | ||||
444 | |||||
445 | sub column_from_residue_number { | ||||
446 | my ($self, $resnumber) = @_; | ||||
447 | |||||
448 | $self->throw("Residue number has to be a positive integer, not [$resnumber]") | ||||
449 | unless $resnumber =~ /^\d+$/ and $resnumber > 0; | ||||
450 | |||||
451 | if ($resnumber >= $self->start() and $resnumber <= $self->end()) { | ||||
452 | my @chunks; | ||||
453 | my $column_incr; | ||||
454 | my $current_column; | ||||
455 | my $current_residue = $self->start - 1; | ||||
456 | my $seq = $self->seq; | ||||
457 | my $strand = $self->strand || 0; | ||||
458 | |||||
459 | if ($strand == -1) { | ||||
460 | #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go; | ||||
461 | @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go; | ||||
462 | $column_incr = -1; | ||||
463 | $current_column = (CORE::length $seq) + 1; | ||||
464 | } | ||||
465 | else { | ||||
466 | #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go; | ||||
467 | @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go; | ||||
468 | $column_incr = 1; | ||||
469 | $current_column = 0; | ||||
470 | } | ||||
471 | |||||
472 | while (my $chunk = shift @chunks) { | ||||
473 | #if ($chunk =~ m|^[\.\-]|o) { | ||||
474 | if ($chunk =~ m|^[$GAP_SYMBOLS]|o) { | ||||
475 | $current_column += $column_incr * CORE::length($chunk); | ||||
476 | } | ||||
477 | else { | ||||
478 | if ($current_residue + CORE::length($chunk) < $resnumber) { | ||||
479 | $current_column += $column_incr * CORE::length($chunk); | ||||
480 | $current_residue += CORE::length($chunk); | ||||
481 | } | ||||
482 | else { | ||||
483 | if ($strand == -1) { | ||||
484 | $current_column -= $resnumber - $current_residue; | ||||
485 | } | ||||
486 | else { | ||||
487 | $current_column += $resnumber - $current_residue; | ||||
488 | } | ||||
489 | return $current_column; | ||||
490 | } | ||||
491 | } | ||||
492 | } | ||||
493 | } | ||||
494 | |||||
495 | $self->throw("Could not find residue number $resnumber"); | ||||
496 | |||||
497 | } | ||||
498 | |||||
499 | |||||
500 | =head2 location_from_column | ||||
501 | |||||
502 | Title : location_from_column | ||||
503 | Usage : $loc = $ali->location_from_column($column_number) | ||||
504 | Function: | ||||
505 | |||||
506 | This function gives the residue number for a given position | ||||
507 | in the alignment (i.e. column number) of the given. Gaps | ||||
508 | complicate this process and force the output to be a | ||||
509 | L<Bio::Location::Simple> where values can be undefined. | ||||
510 | For example, for the sequence: | ||||
511 | |||||
512 | Seq/91-96 .AC..DEF.G. | ||||
513 | |||||
514 | location_from_column( 3 ) position 92 | ||||
515 | location_from_column( 4 ) position 92^93 | ||||
516 | location_from_column( 9 ) position 95^96 | ||||
517 | location_from_column( 1 ) position undef | ||||
518 | |||||
519 | An exact position returns a Bio::Location::Simple object | ||||
520 | where where location_type() returns 'EXACT', if a position | ||||
521 | is between bases location_type() returns 'IN-BETWEEN'. | ||||
522 | Column before the first residue returns undef. Note that if | ||||
523 | the position is after the last residue in the alignment, | ||||
524 | that there is no guarantee that the original sequence has | ||||
525 | residues after that position. | ||||
526 | |||||
527 | An exception is thrown if the column number is not within | ||||
528 | the sequence. | ||||
529 | |||||
530 | Returns : Bio::Location::Simple or undef | ||||
531 | Args : A column number | ||||
532 | Throws : If column is not within the sequence | ||||
533 | |||||
534 | See L<Bio::Location::Simple> for more. | ||||
535 | |||||
536 | =cut | ||||
537 | |||||
538 | sub location_from_column { | ||||
539 | my ($self, $column) = @_; | ||||
540 | |||||
541 | $self->throw("Column number has to be a positive integer, not [$column]") | ||||
542 | unless $column =~ /^\d+$/ and $column > 0; | ||||
543 | $self->throw("Column number [$column] is larger than". | ||||
544 | " sequence length [". $self->length. "]") | ||||
545 | unless $column <= $self->length; | ||||
546 | |||||
547 | my ($loc); | ||||
548 | my $s = $self->subseq(1,$column); | ||||
549 | $s =~ s/[^a-zA-Z\*]//g; | ||||
550 | |||||
551 | my $pos = CORE::length $s; | ||||
552 | |||||
553 | my $start = $self->start || 0 ; | ||||
554 | my $strand = $self->strand() || 1; | ||||
555 | my $relative_pos = ($strand == -1) | ||||
556 | ? ($self->end - $pos + 1) | ||||
557 | : ($pos + $start - 1); | ||||
558 | if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) { | ||||
559 | $loc = Bio::Location::Simple->new | ||||
560 | (-start => $relative_pos, | ||||
561 | -end => $relative_pos, | ||||
562 | -strand => 1, | ||||
563 | ); | ||||
564 | } elsif ($pos == 0 and $self->start == 1) { | ||||
565 | } else { | ||||
566 | my ($start,$end) = ($relative_pos, $relative_pos + $strand); | ||||
567 | if ($strand == -1) { | ||||
568 | ($start,$end) = ($end,$start); | ||||
569 | } | ||||
570 | $loc = Bio::Location::Simple->new | ||||
571 | (-start => $start, | ||||
572 | -end => $end, | ||||
573 | -strand => 1, | ||||
574 | -location_type => 'IN-BETWEEN' | ||||
575 | ); | ||||
576 | } | ||||
577 | return $loc; | ||||
578 | } | ||||
579 | |||||
580 | |||||
581 | =head2 revcom | ||||
582 | |||||
583 | Title : revcom | ||||
584 | Usage : $rev = $seq->revcom() | ||||
585 | Function: Produces a new Bio::LocatableSeq object which | ||||
586 | has the reversed complement of the sequence. For protein | ||||
587 | sequences this throws an exception of "Sequence is a | ||||
588 | protein. Cannot revcom" | ||||
589 | |||||
590 | Returns : A new Bio::LocatableSeq object | ||||
591 | Args : none | ||||
592 | |||||
593 | =cut | ||||
594 | |||||
595 | sub revcom { | ||||
596 | my ($self) = @_; | ||||
597 | # since we don't know whether sequences without 1 => 1 correlation can be | ||||
598 | # revcom'd, kick back | ||||
599 | if (grep {$_ != 1} $self->mapping) { | ||||
600 | $self->warn('revcom() not supported for sequences with mapped values of > 1'); | ||||
601 | return; | ||||
602 | } | ||||
603 | my $new = $self->SUPER::revcom; | ||||
604 | $new->strand($self->strand * -1) if $self->strand; | ||||
605 | $new->start($self->start) if $self->start; | ||||
606 | $new->end($self->end) if $self->end; | ||||
607 | return $new; | ||||
608 | } | ||||
609 | |||||
610 | |||||
611 | =head2 trunc | ||||
612 | |||||
613 | Title : trunc | ||||
614 | Usage : $subseq = $myseq->trunc(10,100); | ||||
615 | Function: Provides a truncation of a sequence, | ||||
616 | Returns : a fresh Bio::PrimarySeqI implementing object | ||||
617 | Args : Two integers denoting first and last columns of the | ||||
618 | sequence to be included into sub-sequence. | ||||
619 | |||||
620 | =cut | ||||
621 | |||||
622 | sub trunc { | ||||
623 | my ($self, $start, $end) = @_; | ||||
624 | my $new = $self->SUPER::trunc($start, $end); | ||||
625 | $new->strand($self->strand); | ||||
626 | |||||
627 | # end will be automatically calculated | ||||
628 | $start = $end if $self->strand && $self->strand == -1; | ||||
629 | |||||
630 | $start = $self->location_from_column($start); | ||||
631 | $start ? ($start = $start->end) : ($start = 1); | ||||
632 | $new->start($start) if $start; | ||||
633 | |||||
634 | return $new; | ||||
635 | } | ||||
636 | |||||
637 | |||||
638 | =head2 validate_seq | ||||
639 | |||||
640 | Title : validate_seq | ||||
641 | Usage : if(! $seqobj->validate_seq($seq_str) ) { | ||||
642 | print "sequence $seq_str is not valid for an object of | ||||
643 | alphabet ",$seqobj->alphabet, "\n"; | ||||
644 | } | ||||
645 | Function: Test that the given sequence is valid, i.e. contains only valid | ||||
646 | characters. The allowed characters are all letters (A-Z) and '-','.', | ||||
647 | '*','?','=' and '~'. Spaces are not valid. Note that this | ||||
648 | implementation does not take alphabet() into account. | ||||
649 | Returns : 1 if the supplied sequence string is valid, 0 otherwise. | ||||
650 | Args : - Sequence string to be validated | ||||
651 | - Boolean to throw an error if the sequence is invalid | ||||
652 | |||||
653 | =cut | ||||
654 | |||||
655 | sub validate_seq { | ||||
656 | my ($self, $seqstr, $throw) = @_; | ||||
657 | $seqstr = '' if not defined $seqstr; | ||||
658 | $throw = 0 if not defined $throw ; # 0 for backward compatiblity | ||||
659 | if ( (CORE::length $seqstr > 0 ) && | ||||
660 | ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) { | ||||
661 | if ($throw) { | ||||
662 | $self->throw("Failed validation of sequence '".(defined($self->id) || | ||||
663 | '[unidentified sequence]')."'. Invalid characters were: " . | ||||
664 | join('',($seqstr =~ /([^$MATCHPATTERN]+)/g))); | ||||
665 | } | ||||
666 | return 0; | ||||
667 | } | ||||
668 | return 1; | ||||
669 | } | ||||
670 | |||||
671 | |||||
672 | ################## DEPRECATED METHODS ################## | ||||
673 | |||||
674 | |||||
675 | =head2 no_gap | ||||
676 | |||||
677 | Title : no_gaps | ||||
678 | Usage : $self->no_gaps('.') | ||||
679 | Function : Gets number of gaps in the sequence. The count excludes | ||||
680 | leading or trailing gap characters. | ||||
681 | |||||
682 | Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of | ||||
683 | these, '.' and '-' are counted as gap characters unless an | ||||
684 | optional argument specifies one of them. | ||||
685 | |||||
686 | Returns : number of internal gaps in the sequence. | ||||
687 | Args : a gap character (optional) | ||||
688 | Status : Deprecated (in favor of num_gaps()) | ||||
689 | |||||
690 | =cut | ||||
691 | |||||
692 | sub no_gaps { | ||||
693 | my $self = shift; | ||||
694 | $self->deprecated( -warn_version => 1.0069, | ||||
695 | -throw_version => 1.0075, | ||||
696 | -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead' ); | ||||
697 | return $self->num_gaps(@_); | ||||
698 | } | ||||
699 | |||||
700 | |||||
701 | =head2 no_sequences | ||||
702 | |||||
703 | Title : no_sequences | ||||
704 | Usage : $gaps = $seq->no_sequences | ||||
705 | Function : number of sequence in the sequence alignment | ||||
706 | Returns : integer | ||||
707 | Argument : | ||||
708 | Status : Deprecated (in favor of num_sequences()) | ||||
709 | |||||
710 | =cut | ||||
711 | |||||
712 | sub no_sequences { | ||||
713 | my $self = shift; | ||||
714 | $self->deprecated( -warn_version => 1.0069, | ||||
715 | -throw_version => 1.0075, | ||||
716 | -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead' ); | ||||
717 | return $self->num_sequences(@_); | ||||
718 | } | ||||
719 | |||||
720 | 1 | 8µs | 1; |