Filename | /Users/ap13/pathogens/Roary/lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm |
Statements | Executed 741331 statements in 1.51s |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
8 | 1 | 1 | 877ms | 1.40s | _build_contig_to_ids | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 314ms | 470ms | _build_overlapping_hypothetical_protein_ids | Bio::Roary::ContigsToGeneIDsFromGFF::
197 | 1 | 1 | 1.03ms | 1.03ms | _percent_overlap | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 66µs | 117µs | _build__awk_filter | Bio::Roary::ContigsToGeneIDsFromGFF::
8 | 1 | 1 | 55µs | 55µs | __ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 33µs | 4.18ms | BEGIN@17 | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 10µs | 10µs | BEGIN@18 | Bio::Roary::ContigsToGeneIDsFromGFF::
1 | 1 | 1 | 10µs | 100µs | BEGIN@142 | Bio::Roary::ContigsToGeneIDsFromGFF::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
2 | |||||
3 | # ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig | ||||
4 | |||||
5 | =head1 SYNOPSIS | ||||
6 | |||||
7 | Parse a GFF and efficiently and extract ordered gene ids on each contig | ||||
8 | use Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
9 | |||||
10 | my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( | ||||
11 | gff_file => 'abc.gff' | ||||
12 | ); | ||||
13 | $obj->contig_to_ids; | ||||
14 | |||||
15 | =cut | ||||
16 | |||||
17 | 2 | 48µs | 2 | 8.33ms | # spent 4.18ms (33µs+4.15) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17 which was called:
# once (33µs+4.15ms) by Bio::Roary::OrderGenes::BEGIN@21 at line 17 # spent 4.18ms making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17
# spent 4.15ms making 1 call to Moose::import |
18 | 2 | 596µs | 1 | 10µs | # spent 10µs within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 which was called:
# once (10µs+0s) by Bio::Roary::OrderGenes::BEGIN@21 at line 18 # spent 10µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 |
19 | 1 | 3µs | 1 | 9.12ms | with 'Bio::Roary::ParseGFFAnnotationRole'; # spent 9.12ms making 1 call to Moose::with |
20 | |||||
21 | 1 | 3µs | 1 | 2.13ms | has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids'); # spent 2.13ms making 1 call to Moose::has |
22 | |||||
23 | 1 | 2µs | 1 | 1.86ms | has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids'); # spent 1.86ms making 1 call to Moose::has |
24 | 9 | 47µs | 1 | 1.79ms | # spent 55µs within Bio::Roary::ContigsToGeneIDsFromGFF::__ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] which was called 8 times, avg 7µs/call:
# 8 times (55µs+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::new at line 52 of (eval 25)[Eval/Closure.pm:125], avg 7µs/call # spent 1.79ms making 1 call to Moose::has |
25 | |||||
26 | 1 | 2µs | 1 | 1.62ms | has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10); # spent 1.62ms making 1 call to Moose::has |
27 | |||||
28 | # Manually parse the GFF file because the BioPerl module is too slow | ||||
29 | sub _build_contig_to_ids | ||||
30 | # spent 1.40s (877ms+522ms) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_contig_to_ids which was called 8 times, avg 175ms/call:
# 8 times (877ms+522ms) by Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids at line 15 of (eval 25)[Eval/Closure.pm:125], avg 175ms/call | ||||
31 | 64 | 80.5ms | my ($self) = @_; | ||
32 | my %contigs_to_ids; | ||||
33 | my @genes_annotation; | ||||
34 | |||||
35 | 16 | 16.1ms | open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file"; # spent 15.5ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:open, avg 1.94ms/call
# spent 589µs making 8 calls to Bio::Roary::ParseGFFAnnotationRole::_gff_fh_input_string, avg 74µs/call | ||
36 | 8 | 63.1ms | while(<$fh>) # spent 63.1ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 7.88ms/call | ||
37 | { | ||||
38 | 280056 | 906ms | chomp; | ||
39 | my $line = $_; | ||||
40 | my $id_name; | ||||
41 | 40008 | 94.7ms | if($line =~/ID=["']?([^;"']+)["']?;?/i) # spent 94.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 2µs/call | ||
42 | { | ||||
43 | $id_name= $1; | ||||
44 | } | ||||
45 | else | ||||
46 | { | ||||
47 | next; | ||||
48 | } | ||||
49 | |||||
50 | my @annotation_elements = split(/\t/,$line); | ||||
51 | # Map gene IDs to the contig | ||||
52 | push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name); | ||||
53 | |||||
54 | 320064 | 340ms | 80016 | 314ms | if($line =~/product=["']?([^;,"']+)[,"']?;?/i) # spent 192ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 5µs/call
# spent 123ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 3µs/call |
55 | { | ||||
56 | my %gene_data; | ||||
57 | $gene_data{product} = $1; | ||||
58 | $gene_data{id_name} = $id_name; | ||||
59 | 4684 | 2.60ms | 59516 | 32.9ms | if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/) # spent 32.9ms making 59516 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 553ns/call |
60 | { | ||||
61 | $gene_data{database_annotation_exists} = 1; | ||||
62 | } | ||||
63 | else | ||||
64 | { | ||||
65 | $gene_data{database_annotation_exists} = 0; | ||||
66 | } | ||||
67 | |||||
68 | $gene_data{contig} = $annotation_elements[0]; | ||||
69 | $gene_data{start} = $annotation_elements[1]; | ||||
70 | $gene_data{end} = $annotation_elements[2]; | ||||
71 | push(@genes_annotation,\%gene_data); | ||||
72 | } | ||||
73 | |||||
74 | } | ||||
75 | 8 | 286µs | close($fh); # spent 286µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:close, avg 36µs/call | ||
76 | |||||
77 | 8 | 170µs | $self->_genes_annotation(\@genes_annotation); # spent 170µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 21µs/call | ||
78 | return \%contigs_to_ids; | ||||
79 | } | ||||
80 | |||||
81 | sub _build_overlapping_hypothetical_protein_ids | ||||
82 | # spent 470ms (314+156) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids which was called 8 times, avg 58.7ms/call:
# 8 times (314ms+156ms) by Bio::Roary::ContigsToGeneIDsFromGFF::overlapping_hypothetical_protein_ids at line 12 of (eval 25)[Eval/Closure.pm:125], avg 58.7ms/call | ||||
83 | 40 | 45.2ms | my ($self) = @_; | ||
84 | 8 | 18µs | $self->contig_to_ids; # spent 18µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids, avg 2µs/call | ||
85 | |||||
86 | my %overlapping_protein_ids; | ||||
87 | |||||
88 | #Checking to see if the current feature is hypotheitical and if the next one has annotation | ||||
89 | 134613 | 131ms | 40008 | 51.7ms | for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ ) # spent 51.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
90 | { | ||||
91 | 40000 | 49.5ms | my $current_feature = $self->_genes_annotation->[$i]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call | ||
92 | 40000 | 49.5ms | my $next_feature = $self->_genes_annotation->[$i+1]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call | ||
93 | |||||
94 | next if($current_feature->{database_annotation_exists} == 1); | ||||
95 | 4677 | 3.74ms | next unless($current_feature->{product} =~ /hypothetical/i); # spent 3.74ms making 4677 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 800ns/call | ||
96 | next unless($next_feature->{database_annotation_exists} == 1); | ||||
97 | |||||
98 | my $start_coord = $current_feature->{start} ; | ||||
99 | my $end_coord = $current_feature->{end} ; | ||||
100 | my $comparison_start_coord =$next_feature->{start} ; | ||||
101 | my $comparison_end_coord =$next_feature->{end} ; | ||||
102 | 394 | 2.25ms | if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord ) | ||
103 | { | ||||
104 | 197 | 1.03ms | my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord); # spent 1.03ms making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap, avg 5µs/call | ||
105 | 197 | 517µs | if($percent_overlap >= $self->_min_nucleotide_overlap_percentage) # spent 517µs making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_min_nucleotide_overlap_percentage, avg 3µs/call | ||
106 | { | ||||
107 | $overlapping_protein_ids{$current_feature->{id_name}}++; | ||||
108 | } | ||||
109 | } | ||||
110 | } | ||||
111 | |||||
112 | return \%overlapping_protein_ids; | ||||
113 | } | ||||
114 | |||||
115 | sub _percent_overlap | ||||
116 | # spent 1.03ms within Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap which was called 197 times, avg 5µs/call:
# 197 times (1.03ms+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids at line 104, avg 5µs/call | ||||
117 | 1379 | 1.03ms | my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_; | ||
118 | my $size_of_hypothetical_gene = $end_coord - $start_coord; | ||||
119 | |||||
120 | my $lower_bound = $start_coord; | ||||
121 | if($comparison_start_coord > $start_coord) | ||||
122 | { | ||||
123 | $lower_bound = $comparison_start_coord; | ||||
124 | } | ||||
125 | my $upper_bound = $end_coord; | ||||
126 | if($comparison_end_coord < $end_coord ) | ||||
127 | { | ||||
128 | $upper_bound = $comparison_end_coord; | ||||
129 | } | ||||
130 | return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene; | ||||
131 | } | ||||
132 | |||||
133 | |||||
134 | # spent 117µs (66+51) within Bio::Roary::ContigsToGeneIDsFromGFF::_build__awk_filter which was called 8 times, avg 15µs/call:
# 8 times (66µs+51µs) by Bio::Roary::ContigsToGeneIDsFromGFF::_awk_filter at line 12 of (eval 25)[Eval/Closure.pm:125], avg 15µs/call | ||||
135 | 16 | 67µs | my ($self) = @_; | ||
136 | return | ||||
137 | 8 | 50µs | 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' # spent 50µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_tags_to_filter, avg 6µs/call | ||
138 | . $self->_tags_to_filter | ||||
139 | . '/) print $1"\t"$4"\t"$5"\t"$9;}\' '; | ||||
140 | } | ||||
141 | |||||
142 | 2 | 47µs | 2 | 190µs | # spent 100µs (10+90) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142 which was called:
# once (10µs+90µs) by Bio::Roary::OrderGenes::BEGIN@21 at line 142 # spent 100µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142
# spent 90µs making 1 call to Moose::unimport |
143 | 1 | 6µs | 2 | 6.68ms | __PACKAGE__->meta->make_immutable; # spent 6.67ms making 1 call to Class::MOP::Class::make_immutable
# spent 15µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::meta |
144 | |||||
145 | 1 | 36µs | 1; |