0
|
1 package Bio::Roary::QC::Report;
|
|
2
|
|
3 # ABSTRACT: generate a report based on kraken output
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 =cut
|
|
8
|
|
9 use Moose;
|
|
10 use File::Temp;
|
|
11 use File::Path 'rmtree';
|
|
12 use Cwd;
|
|
13 use File::Basename;
|
|
14 with 'Bio::Roary::JobRunner::Role';
|
|
15
|
|
16 has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
|
|
17 has 'kraken_exec' => ( is => 'ro', isa => 'Str', default => 'kraken' );
|
|
18 has 'kraken_report_exec' => ( is => 'ro', isa => 'Str', default => 'kraken-report' );
|
|
19 has 'kraken_db' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
20 has 'outfile' => ( is => 'rw', isa => 'Str', default => 'qc_report.csv' );
|
|
21 has '_kraken_data' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
|
|
22 has '_header' => ( is => 'rw', isa => 'Str', lazy_build => 1 );
|
|
23 has 'kraken_memory' => ( is => 'rw', isa => 'Int', default => 2000 );
|
|
24
|
|
25 has '_tmp_directory_obj' => ( is => 'rw', lazy_build => 1 );
|
|
26 has '_tmp_directory' => ( is => 'rw', lazy_build => 1, isa => 'Str', );
|
|
27
|
|
28
|
|
29 sub _nuc_fasta_filename
|
|
30 {
|
|
31 my ($self, $gff) = @_;
|
|
32
|
|
33 my $prefix = basename( $gff, ".gff" );
|
|
34 my $outfile = $self->_tmp_directory . "/$prefix.fna";
|
|
35 return $outfile;
|
|
36 }
|
|
37
|
|
38 sub _extract_nuc_fasta_cmd {
|
|
39 my ($self, $gff) = @_;
|
|
40 my $outfile = $self->_nuc_fasta_filename($gff);
|
|
41 my $cmd = "sed -n '/##FASTA/,//p' $gff | grep -v \'##FASTA\' > $outfile";
|
|
42
|
|
43 return $cmd;
|
|
44 }
|
|
45
|
|
46 sub _extract_nuc_files_from_all_gffs
|
|
47 {
|
|
48 my ($self) = @_;
|
|
49 my @nuc_files;
|
|
50 my @commands_to_run;
|
|
51 for my $input_file(@{$self->input_files})
|
|
52 {
|
|
53 push(@nuc_files,$self->_nuc_fasta_filename($input_file));
|
|
54 push(@commands_to_run,$self->_extract_nuc_fasta_cmd($input_file));
|
|
55 }
|
|
56 my $kraken_runner_obj = $self->_job_runner_class->new(
|
|
57 commands_to_run => \@commands_to_run,
|
|
58 memory_in_mb => $self->kraken_memory,
|
|
59 verbose => $self->verbose,
|
|
60 cpus => $self->cpus
|
|
61 );
|
|
62 $kraken_runner_obj->run();
|
|
63 return \@nuc_files;
|
|
64 }
|
|
65
|
|
66 sub _kraken_cmd {
|
|
67 my ( $self, $a, $kraken_output ) = @_;
|
|
68
|
|
69 my $kcmd = $self->kraken_exec .
|
|
70 " --fasta-input ".
|
|
71 " --preload ".
|
|
72 " --db " . $self->kraken_db .
|
|
73 " --output $kraken_output $a > /dev/null 2>&1";
|
|
74 return $kcmd;
|
|
75 }
|
|
76
|
|
77 sub _kraken_report_cmd {
|
|
78 my ( $self, $k, $report_output ) = @_;
|
|
79
|
|
80 my $krcmd = $self->kraken_report_exec .
|
|
81 " --db " . $self->kraken_db .
|
|
82 " $k > $report_output";
|
|
83 return $krcmd;
|
|
84 }
|
|
85
|
|
86 sub _kraken_output_filename
|
|
87 {
|
|
88 my ( $self, $assembly ) = @_;
|
|
89 my $kraken_output = $assembly;
|
|
90 $kraken_output =~ s/fna$/kraken/;
|
|
91 return $kraken_output;
|
|
92 }
|
|
93
|
|
94 sub _run_kraken_on_nuc_files
|
|
95 {
|
|
96 my ( $self, $nuc_files ) = @_;
|
|
97 my @kraken_output_files;
|
|
98 my @commands_to_run;
|
|
99 for my $nuc_file(@{$nuc_files})
|
|
100 {
|
|
101 my $kraken_output = $self->_kraken_output_filename($nuc_file);
|
|
102 push(@kraken_output_files, $kraken_output );
|
|
103 push(@commands_to_run, $self->_kraken_cmd( $nuc_file, $kraken_output ));
|
|
104 }
|
|
105
|
|
106 my $kraken_runner_obj = $self->_job_runner_class->new(
|
|
107 commands_to_run => \@commands_to_run,
|
|
108 memory_in_mb => $self->kraken_memory,
|
|
109 verbose => $self->verbose,
|
|
110 cpus => $self->cpus
|
|
111 );
|
|
112 $kraken_runner_obj->run();
|
|
113
|
|
114 for my $filename(@{$nuc_files})
|
|
115 {
|
|
116 unlink($filename);
|
|
117 }
|
|
118
|
|
119 return \@kraken_output_files;
|
|
120 }
|
|
121
|
|
122 sub _kraken_report_output_filename
|
|
123 {
|
|
124 my ( $self, $assembly ) = @_;
|
|
125 return $assembly.".report";
|
|
126 }
|
|
127
|
|
128 sub _run_kraken_report_on_kraken_files
|
|
129 {
|
|
130 my ( $self, $kraken_files ) = @_;
|
|
131
|
|
132 my @kraken_report_output_files;
|
|
133 my @commands_to_run;
|
|
134 for my $nuc_file(@{$kraken_files})
|
|
135 {
|
|
136 my $kraken_output = $self->_kraken_report_output_filename($nuc_file);
|
|
137 push(@kraken_report_output_files, $kraken_output );
|
|
138 push(@commands_to_run, $self->_kraken_report_cmd( $nuc_file, $kraken_output ));
|
|
139 }
|
|
140
|
|
141 my $kraken_runner_obj = $self->_job_runner_class->new(
|
|
142 commands_to_run => \@commands_to_run,
|
|
143 memory_in_mb => $self->kraken_memory,
|
|
144 verbose => $self->verbose,
|
|
145 cpus => $self->cpus
|
|
146 );
|
|
147 $kraken_runner_obj->run();
|
|
148 for my $filename(@{$kraken_files})
|
|
149 {
|
|
150 unlink($filename);
|
|
151 }
|
|
152 return \@kraken_report_output_files;
|
|
153 }
|
|
154
|
|
155 sub _build__kraken_data {
|
|
156 my $self = shift;
|
|
157 my $nuc_files = $self->_extract_nuc_files_from_all_gffs();
|
|
158 my $kraken_files = $self->_run_kraken_on_nuc_files($nuc_files);
|
|
159 my $kraken_report_files = $self->_run_kraken_report_on_kraken_files( $kraken_files );
|
|
160
|
|
161 return $self->_parse_kraken_reports($kraken_report_files);
|
|
162 }
|
|
163
|
|
164 sub _parse_kraken_reports
|
|
165 {
|
|
166 my ( $self, $kraken_report_files ) = @_;
|
|
167
|
|
168 my @report_rows;
|
|
169 for my $kraken_report(@{$kraken_report_files})
|
|
170 {
|
|
171 push(@report_rows, $self->_parse_kraken_report($kraken_report));
|
|
172 }
|
|
173
|
|
174 for my $kraken_report(@{$kraken_report_files})
|
|
175 {
|
|
176 unlink($kraken_report);
|
|
177 }
|
|
178
|
|
179 return \@report_rows;
|
|
180 }
|
|
181
|
|
182 sub _parse_kraken_report {
|
|
183 my ( $self, $kraken_report ) = @_;
|
|
184
|
|
185 # parse report
|
|
186 open( my $report_fh, '<', $kraken_report );
|
|
187
|
|
188 my $sample_name = $kraken_report;
|
|
189 $sample_name =~ s/.report$//;
|
|
190 $sample_name =~ s/.kraken$//;
|
|
191 my($sample_base_name, $dirs, $suffix) = fileparse($sample_name);
|
|
192
|
|
193 my ( $top_genus, $top_species );
|
|
194 while ( <$report_fh> ){
|
|
195 my @parts = split( "\t" );
|
|
196 chomp @parts;
|
|
197
|
|
198 $top_genus = $parts[5] if ( (! defined $top_genus) && $parts[3] eq 'G' );
|
|
199 $top_species = $parts[5] if ( (! defined $top_species) && $parts[3] eq 'S' );
|
|
200
|
|
201 last if (defined $top_genus && defined $top_species);
|
|
202 }
|
|
203 close($report_fh);
|
|
204
|
|
205 $top_genus ||= "not_found";
|
|
206 $top_genus =~ s/^\s+//g;
|
|
207 $top_species ||= "not_found";
|
|
208 $top_species =~ s/^\s+//g;
|
|
209
|
|
210 return [ $sample_base_name, $top_genus, $top_species ];
|
|
211 }
|
|
212
|
|
213
|
|
214 sub _build__header {
|
|
215 return join( ',', ( 'Sample', 'Genus', 'Species' ) );
|
|
216 }
|
|
217
|
|
218 sub _build__tmp_directory_obj {
|
|
219 return File::Temp->newdir(DIR => getcwd, CLEANUP => 1 );
|
|
220 }
|
|
221
|
|
222 sub _build__tmp_directory {
|
|
223 my $self = shift;
|
|
224 return $self->_tmp_directory_obj->dirname();
|
|
225 }
|
|
226
|
|
227 sub report {
|
|
228 my $self = shift;
|
|
229
|
|
230 open( OUTFILE, '>', $self->outfile );
|
|
231 print OUTFILE $self->_header . "\n";
|
|
232 for my $line ( @{ $self->_kraken_data } ){
|
|
233 print OUTFILE join( ',', @{ $line } ) . "\n";
|
|
234 }
|
|
235 close OUTFILE;
|
|
236 }
|
|
237
|
|
238
|
|
239 __PACKAGE__->meta->make_immutable;
|
|
240 no Moose;
|
|
241 1;
|