Filename | /Users/ap13/pathogens/Roary/lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm |
Statements | Executed 741331 statements in 1.51s |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
8 | 1 | 1 | 877ms | 1.40s | _build_contig_to_ids | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 314ms | 470ms | _build_overlapping_hypothetical_protein_ids | Bio::Roary::ContigsToGeneIDsFromGFF::
197 | 1 | 1 | 1.03ms | 1.03ms | _percent_overlap | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 66µs | 117µs | _build__awk_filter | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 55µs | 55µs | __ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 33µs | 4.18ms | BEGIN@17 | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 10µs | 10µs | BEGIN@18 | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 10µs | 100µs | BEGIN@142 | Bio::Roary::ContigsToGeneIDsFromGFF::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
2 | |||||
3 | # ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig | ||||
4 | |||||
5 | =head1 SYNOPSIS | ||||
6 | |||||
7 | Parse a GFF and efficiently and extract ordered gene ids on each contig | ||||
8 | use Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
9 | |||||
10 | my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( | ||||
11 | gff_file => 'abc.gff' | ||||
12 | ); | ||||
13 | $obj->contig_to_ids; | ||||
14 | |||||
15 | =cut | ||||
16 | |||||
17 | 2 | 48µs | 2 | 8.33ms | # spent 4.18ms (33µs+4.15) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17 which was called:
# once (33µs+4.15ms) by Bio::Roary::OrderGenes::BEGIN@21 at line 17 # spent 4.18ms making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17
# spent 4.15ms making 1 call to Moose::import |
18 | 2 | 596µs | 1 | 10µs | # spent 10µs within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 which was called:
# once (10µs+0s) by Bio::Roary::OrderGenes::BEGIN@21 at line 18 # spent 10µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 |
19 | 1 | 3µs | 1 | 9.12ms | with 'Bio::Roary::ParseGFFAnnotationRole'; # spent 9.12ms making 1 call to Moose::with |
20 | |||||
21 | 1 | 3µs | 1 | 2.13ms | has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids'); # spent 2.13ms making 1 call to Moose::has |
22 | |||||
23 | 1 | 2µs | 1 | 1.86ms | has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids'); # spent 1.86ms making 1 call to Moose::has |
24 | 9 | 47µs | 1 | 1.79ms | # spent 55µs within Bio::Roary::ContigsToGeneIDsFromGFF::__ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] which was called 8 times, avg 7µs/call:
# 8 times (55µs+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::new at line 52 of (eval 25)[Eval/Closure.pm:125], avg 7µs/call # spent 1.79ms making 1 call to Moose::has |
25 | |||||
26 | 1 | 2µs | 1 | 1.62ms | has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10); # spent 1.62ms making 1 call to Moose::has |
27 | |||||
28 | # Manually parse the GFF file because the BioPerl module is too slow | ||||
29 | sub _build_contig_to_ids | ||||
30 | # spent 1.40s (877ms+522ms) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_contig_to_ids which was called 8 times, avg 175ms/call:
# 8 times (877ms+522ms) by Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids at line 15 of (eval 25)[Eval/Closure.pm:125], avg 175ms/call | ||||
31 | 8 | 8µs | my ($self) = @_; | ||
32 | 8 | 3µs | my %contigs_to_ids; | ||
33 | 8 | 4µs | my @genes_annotation; | ||
34 | |||||
35 | 8 | 16.6ms | 16 | 16.1ms | open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file"; # spent 15.5ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:open, avg 1.94ms/call
# spent 589µs making 8 calls to Bio::Roary::ParseGFFAnnotationRole::_gff_fh_input_string, avg 74µs/call |
36 | 8 | 63.3ms | 8 | 63.1ms | while(<$fh>) # spent 63.1ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 7.88ms/call |
37 | { | ||||
38 | 40008 | 8.13ms | chomp; | ||
39 | 40008 | 12.5ms | my $line = $_; | ||
40 | 40008 | 2.31ms | my $id_name; | ||
41 | 40008 | 194ms | 40008 | 94.7ms | if($line =~/ID=["']?([^;"']+)["']?;?/i) # spent 94.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 2µs/call |
42 | { | ||||
43 | $id_name= $1; | ||||
44 | } | ||||
45 | else | ||||
46 | { | ||||
47 | next; | ||||
48 | } | ||||
49 | |||||
50 | 40008 | 122ms | my @annotation_elements = split(/\t/,$line); | ||
51 | # Map gene IDs to the contig | ||||
52 | 40008 | 43.0ms | push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name); | ||
53 | |||||
54 | 40008 | 525ms | 80016 | 314ms | if($line =~/product=["']?([^;,"']+)[,"']?;?/i) # spent 192ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 5µs/call
# spent 123ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 3µs/call |
55 | { | ||||
56 | 40008 | 4.18ms | my %gene_data; | ||
57 | 40008 | 63.5ms | $gene_data{product} = $1; | ||
58 | 40008 | 25.3ms | $gene_data{id_name} = $id_name; | ||
59 | 40008 | 144ms | 59516 | 32.9ms | if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/) # spent 32.9ms making 59516 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 553ns/call |
60 | { | ||||
61 | $gene_data{database_annotation_exists} = 1; | ||||
62 | } | ||||
63 | else | ||||
64 | { | ||||
65 | 4684 | 2.60ms | $gene_data{database_annotation_exists} = 0; | ||
66 | } | ||||
67 | |||||
68 | 40008 | 28.4ms | $gene_data{contig} = $annotation_elements[0]; | ||
69 | 40008 | 17.7ms | $gene_data{start} = $annotation_elements[1]; | ||
70 | 40008 | 17.6ms | $gene_data{end} = $annotation_elements[2]; | ||
71 | 40008 | 39.7ms | push(@genes_annotation,\%gene_data); | ||
72 | } | ||||
73 | |||||
74 | } | ||||
75 | 8 | 332µs | 8 | 286µs | close($fh); # spent 286µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:close, avg 36µs/call |
76 | |||||
77 | 8 | 127µs | 8 | 170µs | $self->_genes_annotation(\@genes_annotation); # spent 170µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 21µs/call |
78 | 8 | 189µs | return \%contigs_to_ids; | ||
79 | } | ||||
80 | |||||
81 | sub _build_overlapping_hypothetical_protein_ids | ||||
82 | # spent 470ms (314+156) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids which was called 8 times, avg 58.7ms/call:
# 8 times (314ms+156ms) by Bio::Roary::ContigsToGeneIDsFromGFF::overlapping_hypothetical_protein_ids at line 12 of (eval 25)[Eval/Closure.pm:125], avg 58.7ms/call | ||||
83 | 8 | 7µs | my ($self) = @_; | ||
84 | 8 | 13µs | 8 | 18µs | $self->contig_to_ids; # spent 18µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids, avg 2µs/call |
85 | |||||
86 | 8 | 2µs | my %overlapping_protein_ids; | ||
87 | |||||
88 | #Checking to see if the current feature is hypotheitical and if the next one has annotation | ||||
89 | 8 | 45.1ms | 40008 | 51.7ms | for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ ) # spent 51.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
90 | { | ||||
91 | 40000 | 41.6ms | 40000 | 49.5ms | my $current_feature = $self->_genes_annotation->[$i]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
92 | 40000 | 48.4ms | 40000 | 49.5ms | my $next_feature = $self->_genes_annotation->[$i+1]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
93 | |||||
94 | 40000 | 24.6ms | next if($current_feature->{database_annotation_exists} == 1); | ||
95 | 4677 | 12.2ms | 4677 | 3.74ms | next unless($current_feature->{product} =~ /hypothetical/i); # spent 3.74ms making 4677 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 800ns/call |
96 | 2596 | 1.26ms | next unless($next_feature->{database_annotation_exists} == 1); | ||
97 | |||||
98 | 1468 | 637µs | my $start_coord = $current_feature->{start} ; | ||
99 | 1468 | 336µs | my $end_coord = $current_feature->{end} ; | ||
100 | 1468 | 393µs | my $comparison_start_coord =$next_feature->{start} ; | ||
101 | 1468 | 288µs | my $comparison_end_coord =$next_feature->{end} ; | ||
102 | 1468 | 1.15ms | if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord ) | ||
103 | { | ||||
104 | 197 | 401µs | 197 | 1.03ms | my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord); # spent 1.03ms making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap, avg 5µs/call |
105 | 197 | 1.85ms | 197 | 517µs | if($percent_overlap >= $self->_min_nucleotide_overlap_percentage) # spent 517µs making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_min_nucleotide_overlap_percentage, avg 3µs/call |
106 | { | ||||
107 | $overlapping_protein_ids{$current_feature->{id_name}}++; | ||||
108 | } | ||||
109 | } | ||||
110 | } | ||||
111 | |||||
112 | 8 | 110µs | return \%overlapping_protein_ids; | ||
113 | } | ||||
114 | |||||
115 | sub _percent_overlap | ||||
116 | # spent 1.03ms within Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap which was called 197 times, avg 5µs/call:
# 197 times (1.03ms+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids at line 104, avg 5µs/call | ||||
117 | 197 | 192µs | my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_; | ||
118 | 197 | 54µs | my $size_of_hypothetical_gene = $end_coord - $start_coord; | ||
119 | |||||
120 | 197 | 36µs | my $lower_bound = $start_coord; | ||
121 | 197 | 59µs | if($comparison_start_coord > $start_coord) | ||
122 | { | ||||
123 | $lower_bound = $comparison_start_coord; | ||||
124 | } | ||||
125 | 197 | 39µs | my $upper_bound = $end_coord; | ||
126 | 197 | 31µs | if($comparison_end_coord < $end_coord ) | ||
127 | { | ||||
128 | $upper_bound = $comparison_end_coord; | ||||
129 | } | ||||
130 | 197 | 616µs | return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene; | ||
131 | } | ||||
132 | |||||
133 | |||||
134 | # spent 117µs (66+51) within Bio::Roary::ContigsToGeneIDsFromGFF::_build__awk_filter which was called 8 times, avg 15µs/call:
# 8 times (66µs+51µs) by Bio::Roary::ContigsToGeneIDsFromGFF::_awk_filter at line 12 of (eval 25)[Eval/Closure.pm:125], avg 15µs/call | ||||
135 | 8 | 4µs | my ($self) = @_; | ||
136 | return | ||||
137 | 8 | 63µs | 8 | 50µs | 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' # spent 50µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_tags_to_filter, avg 6µs/call |
138 | . $self->_tags_to_filter | ||||
139 | . '/) print $1"\t"$4"\t"$5"\t"$9;}\' '; | ||||
140 | } | ||||
141 | |||||
142 | 2 | 47µs | 2 | 190µs | # spent 100µs (10+90) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142 which was called:
# once (10µs+90µs) by Bio::Roary::OrderGenes::BEGIN@21 at line 142 # spent 100µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142
# spent 90µs making 1 call to Moose::unimport |
143 | 1 | 6µs | 2 | 6.68ms | __PACKAGE__->meta->make_immutable; # spent 6.67ms making 1 call to Class::MOP::Class::make_immutable
# spent 15µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::meta |
144 | |||||
145 | 1 | 36µs | 1; |