Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/QC/Report.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::QC::Report; | |
2 | |
3 # ABSTRACT: generate a report based on kraken output | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 =cut | |
8 | |
9 use Moose; | |
10 use File::Temp; | |
11 use File::Path 'rmtree'; | |
12 use Cwd; | |
13 use File::Basename; | |
14 with 'Bio::Roary::JobRunner::Role'; | |
15 | |
16 has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | |
17 has 'kraken_exec' => ( is => 'ro', isa => 'Str', default => 'kraken' ); | |
18 has 'kraken_report_exec' => ( is => 'ro', isa => 'Str', default => 'kraken-report' ); | |
19 has 'kraken_db' => ( is => 'ro', isa => 'Str', required => 1 ); | |
20 has 'outfile' => ( is => 'rw', isa => 'Str', default => 'qc_report.csv' ); | |
21 has '_kraken_data' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 ); | |
22 has '_header' => ( is => 'rw', isa => 'Str', lazy_build => 1 ); | |
23 has 'kraken_memory' => ( is => 'rw', isa => 'Int', default => 2000 ); | |
24 | |
25 has '_tmp_directory_obj' => ( is => 'rw', lazy_build => 1 ); | |
26 has '_tmp_directory' => ( is => 'rw', lazy_build => 1, isa => 'Str', ); | |
27 | |
28 | |
29 sub _nuc_fasta_filename | |
30 { | |
31 my ($self, $gff) = @_; | |
32 | |
33 my $prefix = basename( $gff, ".gff" ); | |
34 my $outfile = $self->_tmp_directory . "/$prefix.fna"; | |
35 return $outfile; | |
36 } | |
37 | |
38 sub _extract_nuc_fasta_cmd { | |
39 my ($self, $gff) = @_; | |
40 my $outfile = $self->_nuc_fasta_filename($gff); | |
41 my $cmd = "sed -n '/##FASTA/,//p' $gff | grep -v \'##FASTA\' > $outfile"; | |
42 | |
43 return $cmd; | |
44 } | |
45 | |
46 sub _extract_nuc_files_from_all_gffs | |
47 { | |
48 my ($self) = @_; | |
49 my @nuc_files; | |
50 my @commands_to_run; | |
51 for my $input_file(@{$self->input_files}) | |
52 { | |
53 push(@nuc_files,$self->_nuc_fasta_filename($input_file)); | |
54 push(@commands_to_run,$self->_extract_nuc_fasta_cmd($input_file)); | |
55 } | |
56 my $kraken_runner_obj = $self->_job_runner_class->new( | |
57 commands_to_run => \@commands_to_run, | |
58 memory_in_mb => $self->kraken_memory, | |
59 verbose => $self->verbose, | |
60 cpus => $self->cpus | |
61 ); | |
62 $kraken_runner_obj->run(); | |
63 return \@nuc_files; | |
64 } | |
65 | |
66 sub _kraken_cmd { | |
67 my ( $self, $a, $kraken_output ) = @_; | |
68 | |
69 my $kcmd = $self->kraken_exec . | |
70 " --fasta-input ". | |
71 " --preload ". | |
72 " --db " . $self->kraken_db . | |
73 " --output $kraken_output $a > /dev/null 2>&1"; | |
74 return $kcmd; | |
75 } | |
76 | |
77 sub _kraken_report_cmd { | |
78 my ( $self, $k, $report_output ) = @_; | |
79 | |
80 my $krcmd = $self->kraken_report_exec . | |
81 " --db " . $self->kraken_db . | |
82 " $k > $report_output"; | |
83 return $krcmd; | |
84 } | |
85 | |
86 sub _kraken_output_filename | |
87 { | |
88 my ( $self, $assembly ) = @_; | |
89 my $kraken_output = $assembly; | |
90 $kraken_output =~ s/fna$/kraken/; | |
91 return $kraken_output; | |
92 } | |
93 | |
94 sub _run_kraken_on_nuc_files | |
95 { | |
96 my ( $self, $nuc_files ) = @_; | |
97 my @kraken_output_files; | |
98 my @commands_to_run; | |
99 for my $nuc_file(@{$nuc_files}) | |
100 { | |
101 my $kraken_output = $self->_kraken_output_filename($nuc_file); | |
102 push(@kraken_output_files, $kraken_output ); | |
103 push(@commands_to_run, $self->_kraken_cmd( $nuc_file, $kraken_output )); | |
104 } | |
105 | |
106 my $kraken_runner_obj = $self->_job_runner_class->new( | |
107 commands_to_run => \@commands_to_run, | |
108 memory_in_mb => $self->kraken_memory, | |
109 verbose => $self->verbose, | |
110 cpus => $self->cpus | |
111 ); | |
112 $kraken_runner_obj->run(); | |
113 | |
114 for my $filename(@{$nuc_files}) | |
115 { | |
116 unlink($filename); | |
117 } | |
118 | |
119 return \@kraken_output_files; | |
120 } | |
121 | |
122 sub _kraken_report_output_filename | |
123 { | |
124 my ( $self, $assembly ) = @_; | |
125 return $assembly.".report"; | |
126 } | |
127 | |
128 sub _run_kraken_report_on_kraken_files | |
129 { | |
130 my ( $self, $kraken_files ) = @_; | |
131 | |
132 my @kraken_report_output_files; | |
133 my @commands_to_run; | |
134 for my $nuc_file(@{$kraken_files}) | |
135 { | |
136 my $kraken_output = $self->_kraken_report_output_filename($nuc_file); | |
137 push(@kraken_report_output_files, $kraken_output ); | |
138 push(@commands_to_run, $self->_kraken_report_cmd( $nuc_file, $kraken_output )); | |
139 } | |
140 | |
141 my $kraken_runner_obj = $self->_job_runner_class->new( | |
142 commands_to_run => \@commands_to_run, | |
143 memory_in_mb => $self->kraken_memory, | |
144 verbose => $self->verbose, | |
145 cpus => $self->cpus | |
146 ); | |
147 $kraken_runner_obj->run(); | |
148 for my $filename(@{$kraken_files}) | |
149 { | |
150 unlink($filename); | |
151 } | |
152 return \@kraken_report_output_files; | |
153 } | |
154 | |
155 sub _build__kraken_data { | |
156 my $self = shift; | |
157 my $nuc_files = $self->_extract_nuc_files_from_all_gffs(); | |
158 my $kraken_files = $self->_run_kraken_on_nuc_files($nuc_files); | |
159 my $kraken_report_files = $self->_run_kraken_report_on_kraken_files( $kraken_files ); | |
160 | |
161 return $self->_parse_kraken_reports($kraken_report_files); | |
162 } | |
163 | |
164 sub _parse_kraken_reports | |
165 { | |
166 my ( $self, $kraken_report_files ) = @_; | |
167 | |
168 my @report_rows; | |
169 for my $kraken_report(@{$kraken_report_files}) | |
170 { | |
171 push(@report_rows, $self->_parse_kraken_report($kraken_report)); | |
172 } | |
173 | |
174 for my $kraken_report(@{$kraken_report_files}) | |
175 { | |
176 unlink($kraken_report); | |
177 } | |
178 | |
179 return \@report_rows; | |
180 } | |
181 | |
182 sub _parse_kraken_report { | |
183 my ( $self, $kraken_report ) = @_; | |
184 | |
185 # parse report | |
186 open( my $report_fh, '<', $kraken_report ); | |
187 | |
188 my $sample_name = $kraken_report; | |
189 $sample_name =~ s/.report$//; | |
190 $sample_name =~ s/.kraken$//; | |
191 my($sample_base_name, $dirs, $suffix) = fileparse($sample_name); | |
192 | |
193 my ( $top_genus, $top_species ); | |
194 while ( <$report_fh> ){ | |
195 my @parts = split( "\t" ); | |
196 chomp @parts; | |
197 | |
198 $top_genus = $parts[5] if ( (! defined $top_genus) && $parts[3] eq 'G' ); | |
199 $top_species = $parts[5] if ( (! defined $top_species) && $parts[3] eq 'S' ); | |
200 | |
201 last if (defined $top_genus && defined $top_species); | |
202 } | |
203 close($report_fh); | |
204 | |
205 $top_genus ||= "not_found"; | |
206 $top_genus =~ s/^\s+//g; | |
207 $top_species ||= "not_found"; | |
208 $top_species =~ s/^\s+//g; | |
209 | |
210 return [ $sample_base_name, $top_genus, $top_species ]; | |
211 } | |
212 | |
213 | |
214 sub _build__header { | |
215 return join( ',', ( 'Sample', 'Genus', 'Species' ) ); | |
216 } | |
217 | |
218 sub _build__tmp_directory_obj { | |
219 return File::Temp->newdir(DIR => getcwd, CLEANUP => 1 ); | |
220 } | |
221 | |
222 sub _build__tmp_directory { | |
223 my $self = shift; | |
224 return $self->_tmp_directory_obj->dirname(); | |
225 } | |
226 | |
227 sub report { | |
228 my $self = shift; | |
229 | |
230 open( OUTFILE, '>', $self->outfile ); | |
231 print OUTFILE $self->_header . "\n"; | |
232 for my $line ( @{ $self->_kraken_data } ){ | |
233 print OUTFILE join( ',', @{ $line } ) . "\n"; | |
234 } | |
235 close OUTFILE; | |
236 } | |
237 | |
238 | |
239 __PACKAGE__->meta->make_immutable; | |
240 no Moose; | |
241 1; |