Mercurial > repos > greg > pima_report
comparison pima_report.py @ 0:0a558f444c98 draft
Uploaded
author | greg |
---|---|
date | Fri, 03 Mar 2023 22:06:23 +0000 |
parents | |
children | 67d0939b56b0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0a558f444c98 |
---|---|
1 import argparse | |
2 import os | |
3 import pandas | |
4 import pypandoc | |
5 import re | |
6 import subprocess | |
7 import sys | |
8 | |
9 from Bio import SeqIO | |
10 from datetime import date | |
11 from mdutils.mdutils import MdUtils | |
12 | |
# Advisory banner printed near the top of every report (preliminary-results
# notice required for potential Bacillus anthracis findings).
CDC_ADVISORY = 'The analysis and report presented here should be treated as preliminary. Please contact the CDC/BDRD with any results regarding _Bacillus anthracis_.'
14 | |
15 | |
class PimaReport:
    """Assemble the PIMA analysis report.

    Gathers read / assembly / contamination / mutation / feature evidence
    supplied by the Galaxy tool, writes a markdown report with mdutils and
    converts it to ``pima_report.pdf`` with pypandoc (weasyprint PDF engine).
    A debug trace of every step is appended to ``process_log.txt``.
    """

    def __init__(self, analysis_name=None, assembly_fasta_file=None, assembly_name=None, feature_bed_files=None, feature_png_files=None,
                 contig_coverage_file=None, dbkey=None, gzipped=None, illumina_fastq_file=None, mutation_regions_bed_file=None,
                 mutation_regions_tsv_files=None, pima_css=None):
        # Debug trace file; every method logs its progress here.
        self.ofh = open("process_log.txt", "w")

        self.ofh.write("analysis_name: %s\n" % str(analysis_name))
        self.ofh.write("assembly_fasta_file: %s\n" % str(assembly_fasta_file))
        self.ofh.write("assembly_name: %s\n" % str(assembly_name))
        self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files))
        self.ofh.write("feature_png_files: %s\n" % str(feature_png_files))
        self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file))
        self.ofh.write("dbkey: %s\n" % str(dbkey))
        self.ofh.write("gzipped: %s\n" % str(gzipped))
        self.ofh.write("illumina_fastq_file: %s\n" % str(illumina_fastq_file))
        self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file))
        self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files))
        self.ofh.write("pima_css: %s\n" % str(pima_css))

        # General
        self.doc = None
        self.report_md = 'pima_report.md'

        # Inputs
        self.analysis_name = analysis_name
        self.assembly_fasta_file = assembly_fasta_file
        self.assembly_name = assembly_name
        self.feature_bed_files = feature_bed_files
        self.feature_png_files = feature_png_files
        self.contig_coverage_file = contig_coverage_file
        self.dbkey = dbkey
        self.gzipped = gzipped
        self.illumina_fastq_file = illumina_fastq_file
        self.mutation_regions_bed_file = mutation_regions_bed_file
        # Expected to be a mapping of TSV file name -> path; add_mutations()
        # looks regions up by '<region>_mutations.tsv'.
        self.mutation_regions_tsv_files = mutation_regions_tsv_files
        self.read_type = 'Illumina'
        self.ont_bases = None
        self.ont_n50 = None
        self.ont_read_count = None
        self.pima_css = pima_css

        # Titles
        self.alignment_title = 'Comparison with reference'
        self.alignment_notes_title = 'Alignment notes'
        self.amr_matrix_title = 'AMR matrix'
        self.assembly_methods_title = 'Assembly'
        self.assembly_notes_title = 'Assembly notes'
        self.basecalling_title = 'Basecalling'
        self.basecalling_methods_title = 'Basecalling'
        self.contamination_methods_title = 'Contamination check'
        self.contig_alignment_title = 'Alignment vs. reference contigs'
        self.feature_title = 'Features found in the assembly'
        self.feature_methods_title = 'Feature annotation'
        self.feature_plot_title = 'Feature annotation plots'
        self.large_indel_title = 'Large insertions & deletions'
        self.methods_title = 'Methods'
        self.mutation_title = 'Mutations found in the sample'
        self.mutation_methods_title = 'Mutation screening'
        self.plasmid_methods_title = 'Plasmid annotation'
        self.reference_methods_title = 'Reference comparison'
        self.snp_indel_title = 'SNPs and small indels'
        self.summary_title = 'Analysis of %s' % analysis_name

        # Methods: a Series of Series — one entry of method sentences per
        # report section, appended to as each section is generated.
        self.methods = pandas.Series(dtype='float64')
        self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64')
        self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64')
        self.methods[self.reference_methods_title] = pandas.Series(dtype='float64')
        self.methods[self.mutation_methods_title] = pandas.Series(dtype='float64')
        self.methods[self.feature_methods_title] = pandas.Series(dtype='float64')
        self.methods[self.plasmid_methods_title] = pandas.Series(dtype='float64')

        # Contamination
        self.kraken_fracs = pandas.Series(dtype=object)

        # Notes
        self.assembly_notes = pandas.Series(dtype=object)
        self.alignment_notes = pandas.Series(dtype=object)
        self.contig_alignment = pandas.Series(dtype=object)

        # Values
        self.assembly_size = 0
        self.contig_info = None
        self.did_flye_ont_fastq = False
        self.did_medaka_ont_assembly = False
        self.feature_hits = pandas.Series(dtype='float64')
        self.illumina_length_mean = 0
        self.illumina_read_count = 0
        self.illumina_bases = 0
        self.mean_coverage = 0
        self.num_assembly_contigs = 0

        # Actions
        self.did_guppy_ont_fast5 = False
        self.did_qcat_ont_fastq = False
        self.info_illumina_fastq()
        self.load_contig_info()

    def run_command(self, command):
        """Run a shell command, returning its stdout split into lines.

        Exits the process with an error message if the command fails.
        """
        self.ofh.write("\nXXXXXX In run_command, command:\n%s\n\n" % str(command))
        try:
            return re.split('\\n', subprocess.check_output(command, shell=True).decode('utf-8'))
        except Exception:
            message = 'Command %s failed: exiting...' % command
            sys.exit(message)

    def format_kmg(self, number, decimals=0):
        """Format a count with a K/M/G magnitude suffix (e.g. 1500 -> '1.5K')."""
        self.ofh.write("\nXXXXXX In format_kmg, number:\n%s\n" % str(number))
        self.ofh.write("XXXXXX In format_kmg, decimals:\n%s\n\n" % str(decimals))
        if number == 0:
            return '0'
        magnitude_powers = [10**9, 10**6, 10**3, 1]
        magnitude_units = ['G', 'M', 'K', '']
        # Default to no scaling so numbers below 1 cannot leave these
        # unbound (the original raised UnboundLocalError for 0 < number < 1).
        magnitude_power = 1
        magnitude_unit = ''
        for i in range(len(magnitude_units)):
            if number >= magnitude_powers[i]:
                magnitude_power = magnitude_powers[i]
                magnitude_unit = magnitude_units[i]
                break
        return ('{:0.' + str(decimals) + 'f}').format(number / magnitude_power) + magnitude_unit

    def load_contig_info(self):
        """Load the per-contig coverage TSV and compute size-weighted mean coverage."""
        self.contig_info = pandas.Series(dtype=object)
        self.contig_info[self.read_type] = pandas.read_csv(self.contig_coverage_file, header=None, index_col=None, sep='\t').sort_values(1, axis=0, ascending=False)
        self.contig_info[self.read_type].columns = ['contig', 'size', 'coverage']
        # Mean coverage weighted by contig size: sum(size * coverage) / sum(size).
        self.mean_coverage = (self.contig_info[self.read_type].iloc[:, 1] * self.contig_info[self.read_type].iloc[:, 2]).sum() / self.contig_info[self.read_type].iloc[:, 1].sum()

    def load_fasta(self, fasta):
        """Return a Series mapping contig id -> Bio.SeqRecord for a FASTA file."""
        sequence = pandas.Series(dtype=object)
        for contig in SeqIO.parse(fasta, 'fasta'):
            sequence[contig.id] = contig
        return sequence

    def load_assembly(self):
        """Load the assembly FASTA, count contigs and format the total size."""
        self.assembly = self.load_fasta(self.assembly_fasta_file)
        self.num_assembly_contigs = len(self.assembly)
        for i in self.assembly:
            self.assembly_size += len(i.seq)
        # assembly_size becomes a display string ('4.5M') from here on.
        self.assembly_size = self.format_kmg(self.assembly_size, decimals=1)

    def info_illumina_fastq(self):
        """Compute mean read length, read count and base count for the Illumina FASTQ."""
        self.ofh.write("\nXXXXXX In info_illumina_fastq\n\n")
        if self.gzipped:
            opener = 'gunzip -c'
        else:
            opener = 'cat'
        # awk emits: mean_length <TAB> read_count <TAB> total_bases
        command = ' '.join([opener,
                            self.illumina_fastq_file,
                            '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\''])
        # Run the pipeline once and reuse the result (the original ran the
        # same command three times).
        output = self.run_command(command)
        self.ofh.write("output:\n%s\n" % str(output))
        fields = re.split('\\t', output[0])
        self.ofh.write("re.split('\\t', self.run_command(command)[0]:\n%s\n" % str(fields))
        values = []
        for i in fields:
            if i == '':
                values.append(float('nan'))
            else:
                values.append(float(i))
        self.ofh.write("values:\n%s\n" % str(values))
        self.ofh.write("values[0]:\n%s\n" % str(values[0]))
        self.illumina_length_mean += values[0]
        self.ofh.write("values[1]:\n%s\n" % str(values[1]))
        self.illumina_read_count += int(values[1])
        self.ofh.write("values[2]:\n%s\n" % str(values[2]))
        self.illumina_bases += int(values[2])
        # The original PIMA code inserts self.illumina_fastq into
        # a list for no apparent reason. We don't do that here.
        # self.illumina_length_mean /= len(self.illumina_fastq)
        self.illumina_length_mean /= 1
        self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1)

    def start_doc(self):
        """Create the markdown document that all add_* methods write into."""
        self.doc = MdUtils(file_name=self.report_md, title='')

    def add_run_information(self):
        """Add the 'Run information' table (inputs, reference, date)."""
        self.ofh.write("\nXXXXXX In add_run_information\n\n")
        self.doc.new_line()
        self.doc.new_header(1, 'Run information')
        # Tables in md.utils are implemented as a wrapping function.
        Table_list = [
            "Category",
            "Information",
            "Date",
            date.today(),
            "ONT FAST5",
            "N/A",
            "ONT FASTQ",
            "N/A",
            "Illumina FASTQ",
            self.wordwrap_markdown(self.analysis_name),
            "Assembly",
            self.wordwrap_markdown(self.assembly_name),
            "Reference",
            self.wordwrap_markdown(self.dbkey),
        ]
        self.doc.new_table(columns=2, rows=7, text=Table_list, text_align='left')
        self.doc.new_line()
        self.doc.new_line()

    def add_ont_library_information(self):
        """Add ONT library statistics; skipped when no ONT data was processed."""
        self.ofh.write("\nXXXXXX In add_ont_library_information\n\n")
        if self.ont_n50 is None:
            return
        self.doc.new_line()
        self.doc.new_header(2, 'ONT library statistics')
        Table_List = [
            "Category",
            "Quantity",
            "ONT N50",
            '{:,}'.format(self.ont_n50),
            "ONT reads",
            '{:,}'.format(self.ont_read_count),
            "ONT bases",
            '{:s}'.format(self.ont_bases),
            "Illumina FASTQ",
            self.wordwrap_markdown(self.illumina_fastq_file),
            "Assembly",
            self.wordwrap_markdown(self.assembly_name),
            "Reference",
            self.wordwrap_markdown(self.dbkey),
        ]
        self.doc.new_table(columns=2, rows=7, text=Table_List, text_align='left')
        self.doc.new_line()

    def add_illumina_library_information(self):
        """Add Illumina library statistics (mean length, reads, bases)."""
        self.ofh.write("\nXXXXXX In add_illumina_library_information\n\n")
        if self.illumina_length_mean is None:
            return
        self.doc.new_line()
        self.doc.new_header(2, 'Illumina library statistics')
        Table_List = [
            "Illumina Info.",
            "Quantity",
            'Illumina mean length',
            '{:.1f}'.format(self.illumina_length_mean),
            'Illumina reads',
            '{:,}'.format(self.illumina_read_count),
            'Illumina bases',
            '{:s}'.format(self.illumina_bases)
        ]
        self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left')

    def add_assembly_information(self):
        """Load the assembly and add contig count / assembly size."""
        self.ofh.write("\nXXXXXX In add_assembly_information\n\n")
        if self.assembly_fasta_file is None:
            return
        self.load_assembly()
        self.doc.new_line()
        self.doc.new_header(2, 'Assembly statistics')
        Table_List = [
            "Category",
            "Information",
            "Contigs",
            str(self.num_assembly_contigs),
            "Assembly size",
            str(self.assembly_size),
        ]
        self.doc.new_table(columns=2, rows=3, text=Table_List, text_align='left')

    def info_ont_fastq(self, fastq_file):
        """Return [n50, read_count, total_bases, read_lengths] for an ONT FASTQ."""
        self.ofh.write("\nXXXXXX In info_ont_fastq, fastq_file:\n%s\n\n" % str(fastq_file))
        if self.gzipped:
            opener = 'gunzip -c'
        else:
            opener = 'cat'
        # First pass: per-read lengths sorted descending, then N50/count/bases.
        command = ' '.join([opener,
                            fastq_file,
                            '| awk \'{getline;print length($0);s += length($1);getline;getline;}END{print "+"s}\'',
                            '| sort -gr',
                            '| awk \'BEGIN{bp = 0;f = 0}',
                            '{if(NR == 1){sub(/+/, "", $1);s=$1}else{bp += $1;if(bp > s / 2 && f == 0){n50 = $1;f = 1}}}',
                            'END{printf "%d\\t%d\\t%d\\n", n50, (NR - 1), s;exit}\''])
        result = list(re.split('\\t', self.run_command(command)[0]))
        if result[1] == '0':
            # NOTE(review): error_out is not defined on this class, so this
            # path would raise AttributeError — confirm intended behavior.
            self.error_out('No ONT reads found')
        ont_n50, ont_read_count, ont_raw_bases = [int(i) for i in result]

        # Second pass: collect all read lengths.
        command = ' '.join([opener,
                            fastq_file,
                            '| awk \'{getline;print length($0);getline;getline;}\''])
        result = self.run_command(command)
        result = list(filter(lambda x: x != '', result))
        ont_read_lengths = [int(i) for i in result]

        return ([ont_n50, ont_read_count, ont_raw_bases, ont_read_lengths])

    def wordwrap_markdown(self, string):
        """Insert <br> breaks so long strings wrap inside markdown table cells.

        Paths are broken at '/' boundaries; other strings are chunked every
        35 characters.  Returns the input unchanged when falsy or short.
        """
        if string:
            if len(string) < 35:
                return string
            else:
                if '/' in string:
                    adjust = string.split('/')
                    out = ''
                    limit = 35
                    for i in adjust:
                        out = out + '/' + i
                        if len(out) > limit:
                            out += '<br>'
                            limit += 35
                    return out
                else:
                    # Step must match the 35-character slice width — the
                    # original stepped by 50 and silently dropped characters.
                    out = [string[i:i + 35] for i in range(0, len(string), 35)]
                    return '<br>'.join(out)
        else:
            return string

    def add_contig_info(self):
        """Add an assembly-coverage table per read type (ONT / Illumina)."""
        self.ofh.write("\nXXXXXX In add_contig_info\n\n")
        if self.contig_info is None:
            return
        for method in ['ONT', 'Illumina']:
            if method not in self.contig_info.index:
                continue
            self.doc.new_line()
            self.doc.new_header(2, 'Assembly coverage by ' + method)
            Table_List = ["Contig", "Length (bp)", "Coverage (X)"]
            formatted = self.contig_info[method].copy()
            formatted.iloc[:, 1] = formatted.iloc[:, 1].apply(lambda x: '{:,}'.format(x))
            for i in range(self.contig_info[method].shape[0]):
                Table_List = Table_List + formatted.iloc[i, :].values.tolist()
            row_count = int(len(Table_List) / 3)
            self.doc.new_table(columns=3, rows=row_count, text=Table_List, text_align='left')

    def add_assembly_notes(self):
        """Add the assembly-notes section header (notes themselves are TODO)."""
        self.ofh.write("\nXXXXXX In add_assembly_notes\n\n")
        if len(self.assembly_notes) == 0:
            return
        self.doc.new_line()
        self.doc.new_line('<div style="page-break-after: always;"></div>')
        self.doc.new_line()
        self.doc.new_header(2, self.assembly_notes_title)

    def add_contamination(self):
        """Add Kraken2 contamination-check tables and record the method used."""
        self.ofh.write("\nXXXXXX In add_contamination\n\n")
        if len(self.kraken_fracs) == 0:
            return
        self.doc.new_line()
        self.doc.new_header(2, 'Contamination check')
        # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
        for read_type, kraken_fracs in self.kraken_fracs.items():
            self.doc.new_line(read_type + ' classifications')
            self.doc.new_line()
            Table_List = ["Percent of Reads", "Reads", "Level", "Label"]
            for index, row in kraken_fracs.iterrows():
                Table_List = Table_List + row.tolist()
            row_count = int(len(Table_List) / 4)
            self.doc.new_table(columns=4, rows=row_count, text=Table_List, text_align='left')
        if self.contamination_methods_title not in self.methods:
            # Seed with an empty Series (the original seeded with '' which
            # has no append/concat support).
            self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64')
        method = 'Kraken2 was used to assign the raw reads into taxa.'
        # Series.append() was removed in pandas 2.0; concat is equivalent.
        self.methods[self.contamination_methods_title] = pandas.concat([self.methods[self.contamination_methods_title], pandas.Series(method)])

    def add_alignment(self):
        """Add reference-alignment tables, notes and per-contig images."""
        self.ofh.write("\nXXXXXX In add_alignment\n\n")
        # TODO: implement the draw_circos function for this.
        if len(self.contig_alignment) > 0:
            alignments = self.contig_alignment
        else:
            return
        self.doc.new_line()
        self.doc.new_header(level=2, title=self.alignment_title)
        self.doc.new_line()
        self.doc.new_header(level=3, title=self.snp_indel_title)
        # Fixed: the original was missing commas after the 'NA' entries, so
        # adjacent literals concatenated and the 2x3 table had 5 cells.
        Table_1 = [
            "Category",
            "Quantity",
            'SNPs',
            'NA',
            'Small indels',
            'NA',
        ]
        self.doc.new_table(columns=2, rows=3, text=Table_1, text_align='left')
        self.doc.new_line('<div style="page-break-after: always;"></div>')
        self.doc.new_line()
        if len(self.alignment_notes) > 0:
            self.doc.new_header(level=3, title=self.alignment_notes_title)
            for note in self.alignment_notes:
                self.doc.new_line(note)
        for contig in alignments.index.tolist():
            contig_title = 'Alignment to %s' % contig
            image_png = alignments[contig]
            self.doc.new_line()
            self.doc.new_header(level=3, title=contig_title)
            self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(image_png)))
            self.doc.new_line('<div style="page-break-after: always;"></div>')
            self.doc.new_line()
        method = 'The genome assembly was aligned against the reference sequencing using dnadiff.'
        self.methods[self.reference_methods_title] = pandas.concat([self.methods[self.reference_methods_title], pandas.Series(method)])

    def add_features(self):
        """Add per-contig feature-hit tables from the best-hit BED files."""
        self.ofh.write("\nXXXXXX In add_features\n\n")
        if len(self.feature_bed_files) == 0:
            return
        for bbf in self.feature_bed_files:
            if os.path.getsize(bbf) > 0:
                best = pandas.read_csv(filepath_or_buffer=bbf, sep='\t', header=None)
                self.feature_hits[os.path.basename(bbf)] = best
        if len(self.feature_hits) == 0:
            return
        self.ofh.write("self.feature_hits: %s\n" % str(self.feature_hits))
        self.doc.new_line()
        self.doc.new_header(level=2, title=self.feature_title)
        for feature_name in self.feature_hits.index.tolist():
            self.ofh.write("feature_name: %s\n" % str(feature_name))
            features = self.feature_hits[feature_name].copy()
            self.ofh.write("features: %s\n" % str(features))
            if features.shape[0] == 0:
                continue
            features.iloc[:, 1] = features.iloc[:, 1].apply(lambda x: '{:,}'.format(x))
            features.iloc[:, 2] = features.iloc[:, 2].apply(lambda x: '{:,}'.format(x))
            self.doc.new_line()
            self.doc.new_header(level=3, title=feature_name)
            for contig in pandas.unique(features.iloc[:, 0]):
                self.ofh.write("contig: %s\n" % str(contig))
                self.doc.new_line(contig)
                contig_features = features.loc[(features.iloc[:, 0] == contig), :]
                self.ofh.write("contig_features: %s\n" % str(contig_features))
                Table_List = ['Start', 'Stop', 'Feature', 'Identity (%)', 'Strand']
                for i in range(contig_features.shape[0]):
                    self.ofh.write("i: %s\n" % str(i))
                    feature = contig_features.iloc[i, :].copy(deep=True)
                    self.ofh.write("feature: %s\n" % str(feature))
                    feature[4] = '{:.3f}'.format(feature[4])
                    Table_List = Table_List + feature[1:].values.tolist()
                self.ofh.write("Table_List: %s\n" % str(Table_List))
                row_count = int(len(Table_List) / 5)
                self.ofh.write("row_count: %s\n" % str(row_count))
                self.doc.new_line()
                # Fixed: the table has 5 columns (the original passed
                # columns=7, which breaks mdutils' columns*rows check).
                self.doc.new_table(columns=5, rows=row_count, text=Table_List, text_align='left')
        blastn_version = 'The genome assembly was queried for features using blastn.'
        bedtools_version = 'Feature hits were clustered using bedtools and the highest scoring hit for each cluster was reported.'
        method = '%s %s' % (blastn_version, bedtools_version)
        self.methods[self.feature_methods_title] = pandas.concat([self.methods[self.feature_methods_title], pandas.Series(method)])

    def add_feature_plots(self):
        """Embed the per-contig feature-annotation PNG plots."""
        self.ofh.write("\nXXXXXX In add_feature_plots\n\n")
        if len(self.feature_png_files) == 0:
            return
        self.doc.new_line()
        self.doc.new_header(level=2, title='Feature Plots')
        self.doc.new_paragraph('Only contigs with features are shown')
        for feature_png_file in self.feature_png_files:
            self.doc.new_line(self.doc.new_inline_image(text='Analysis', path=os.path.abspath(feature_png_file)))

    def add_mutations(self):
        """Add AMR-mutation tables per region and record the methods used."""
        self.ofh.write("\nXXXXXX In add_mutations\n\n")
        if len(self.mutation_regions_tsv_files) == 0:
            return
        try:
            mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False)
        except Exception:
            # Likely an empty file.
            return
        amr_mutations = pandas.Series(dtype=object)
        for region_i in range(mutation_regions.shape[0]):
            region = mutation_regions.iloc[region_i, :]
            region_name = str(region['name'])
            self.ofh.write("Processing mutations for region %s\n" % region_name)
            region_mutations_tsv_name = '%s_mutations.tsv' % region_name
            if region_mutations_tsv_name not in self.mutation_regions_tsv_files:
                continue
            region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name]
            try:
                region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False)
            except Exception:
                region_mutations = pandas.DataFrame()
            if region_mutations.shape[0] == 0:
                continue
            # Figure out what kind of mutations are in this region
            region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index)
            region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel'
            # Fixed: the originals multiplied the *string* by the row count
            # (repeating it N times in every cell) instead of broadcasting
            # one value per row.
            region_mutation_drugs = pandas.Series([region['drug']] * region_mutations.shape[0], name='DRUG', index=region_mutations.index)
            region_notes = pandas.Series([region['note']] * region_mutations.shape[0], name='NOTE', index=region_mutations.index)
            region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1)
            region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']]
            amr_mutations[region['name']] = region_mutations
        # Report the mutations.
        self.doc.new_line()
        self.doc.new_header(level=2, title=self.mutation_title)
        for region_name in amr_mutations.index.tolist():
            region_mutations = amr_mutations[region_name].copy()
            self.doc.new_line()
            self.doc.new_header(level=3, title=region_name)
            if (region_mutations.shape[0] == 0):
                self.doc.append('None')
                continue
            region_mutations.iloc[:, 1] = region_mutations.iloc[:, 1].apply(lambda x: '{:,}'.format(x))
            Table_List = ['Reference contig', 'Position', 'Reference', 'Alternate', 'Drug', 'Note']
            for i in range(region_mutations.shape[0]):
                Table_List = Table_List + region_mutations.iloc[i, [0, 1, 3, 4, 5, 6]].values.tolist()
            row_count = int(len(Table_List) / 6)
            self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left')
        method = '%s reads were mapped to the reference sequence using minimap2.' % self.read_type
        self.methods[self.mutation_methods_title] = pandas.concat([self.methods[self.mutation_methods_title], pandas.Series(method)])
        method = 'Mutations were identified using samtools mpileup and varscan.'
        self.methods[self.mutation_methods_title] = pandas.concat([self.methods[self.mutation_methods_title], pandas.Series(method)])

    def add_amr_matrix(self):
        """Placeholder: AMR-matrix plotting is not yet ported to this tool."""
        self.ofh.write("\nXXXXXX In add_amr_matrix\n\n")
        # Make sure that we have an AMR matrix to plot
        #if not getattr(self.analysis, 'did_draw_amr_matrix', False):
        #    return
        #amr_matrix_png = self.analysis.amr_matrix_png
        #self.doc.new_line()
        #self.doc.new_header(level=2, title=self.amr_matrix_title)
        #self.doc.new_line('AMR genes and mutations with their corresponding drugs.')
        #self.doc.new_line(
        #    self.doc.new_inline_image(
        #        text='AMR genes and mutations with their corresponding drugs',
        #        path=amr_matrix_png
        #    )
        #)
        pass

    def add_large_indels(self):
        """Placeholder: large-indel reporting is not yet ported to this tool."""
        self.ofh.write("\nXXXXXX In add_large_indels\n\n")
        # Make sure we looked for mutations
        #if len(self.analysis.large_indels) == 0:
        #    return
        #large_indels = self.analysis.large_indels
        #self.doc.new_line()
        #self.doc.new_header(level=2, title=self.large_indel_title)
        #for genome in ['Reference insertions', 'Query insertions']:
        #    genome_indels = large_indels[genome].copy()
        #    self.doc.new_line()
        #    self.doc.new_header(level=3, title=genome)
        #    if (genome_indels.shape[0] == 0):
        #        continue
        #    genome_indels.iloc[:, 1] = genome_indels.iloc[:, 1].apply(lambda x: '{:,}'.format(x))
        #    genome_indels.iloc[:, 2] = genome_indels.iloc[:, 2].apply(lambda x: '{:,}'.format(x))
        #    genome_indels.iloc[:, 3] = genome_indels.iloc[:, 3].apply(lambda x: '{:,}'.format(x))
        #    Table_List = [
        #        'Reference contig', 'Start', 'Stop', 'Size (bp)'
        #    ]
        #    for i in range(genome_indels.shape[0]):
        #        Table_List = Table_List + genome_indels.iloc[i, :].values.tolist()
        #    row_count = int(len(Table_List) / 4)
        #    self.doc.new_table(columns=4, rows=row_count, text=Table_List, text_align='left')
        #method = 'Large insertions or deletions were found as the complement of aligned regions using bedtools.'
        #self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(
        #    pandas.Series(method))
        #self.doc.new_line()
        #self.doc.new_line('<div style="page-break-after: always;"></div>')
        #self.doc.new_line()
        pass

    def add_plasmids(self):
        """Placeholder: plasmid reporting is not yet ported to this tool."""
        self.ofh.write("\nXXXXXX In add_plasmids\n\n")
        """
        if not getattr(self.analysis, 'did_call_plasmids', False):
            return
        # Make sure we looked for mutations
        #plasmids = self.analysis.plasmids
        if plasmids is None:
            return
        plasmids = plasmids.copy()
        self.doc.new_line()
        #self.doc.new_header(level=2, title=self.analysis.plasmid_title)
        if (plasmids.shape[0] == 0):
            self.doc.new_line('None')
            return
        plasmids.iloc[:, 3] = plasmids.iloc[:, 3].apply(lambda x: '{:,}'.format(x))
        plasmids.iloc[:, 4] = plasmids.iloc[:, 4].apply(lambda x: '{:,}'.format(x))
        plasmids.iloc[:, 5] = plasmids.iloc[:, 5].apply(lambda x: '{:,}'.format(x))
        Table_List = [
            'Genome contig',
            'Plasmid hit',
            'Plasmid acc.',
            'Contig size',
            'Aligned',
            'Plasmid size'
        ]
        for i in range(plasmids.shape[0]):
            Table_List = Table_List + plasmids.iloc[i, 0:6].values.tolist()
        row_count = int(len(Table_List) / 6)
        self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left')
        method = 'The plasmid reference database was queried against the genome assembly using minimap2.'
        self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method))
        method = 'The resulting SAM was converted to a PSL using a custom version of sam2psl.'
        self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method))
        method = 'Plasmid-to-genome hits were resolved using the pChunks algorithm.'
        self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method))
        """
        pass

    def add_methods(self):
        """Add the Methods section: one sub-header per recorded method group."""
        self.ofh.write("\nXXXXXX In add_methods\n\n")
        self.doc.new_line('<div style="page-break-after: always;"></div>')
        self.doc.new_line()
        if len(self.methods) == 0:
            return
        self.doc.new_line()
        self.doc.new_header(level=2, title=self.methods_title)

        for methods_section in self.methods.index.tolist():
            if self.methods[methods_section] is None or len(self.methods[methods_section]) == 0:
                continue
            self.doc.new_line()
            self.doc.new_header(level=3, title=methods_section)
            self.doc.new_paragraph(' '.join(self.methods[methods_section]))

    def add_summary(self):
        """Add the report's summary: advisory, run info and library/assembly stats."""
        self.ofh.write("\nXXXXXX In add_summary\n\n")
        # Add summary title
        self.doc.new_header(level=1, title=self.summary_title)
        # First section of Summary
        self.doc.new_header(level=1, title='CDC Advisory')
        self.doc.new_paragraph(CDC_ADVISORY)
        self.doc.new_line()
        self.add_run_information()
        self.add_ont_library_information()
        methods = []
        if self.did_guppy_ont_fast5:
            methods += ['ONT reads were basecalled using guppy']
        if self.did_qcat_ont_fastq:
            methods += ['ONT reads were demultiplexed and trimmed using qcat']
        self.methods[self.basecalling_methods_title] = pandas.Series(methods)
        self.add_illumina_library_information()
        self.add_assembly_information()
        self.add_contig_info()
        self.add_assembly_notes()
        if self.did_flye_ont_fastq:
            method = 'ONT reads were assembled using Flye.'
            self.methods[self.assembly_methods_title] = pandas.concat([self.methods[self.assembly_methods_title], pandas.Series(method)])
        if self.did_medaka_ont_assembly:
            # Fixed: capitalization made consistent with the other method
            # sentences, and pandas.series -> pandas.Series (AttributeError).
            method = 'The genome assembly was polished using ONT reads and medaka.'
            self.methods[self.assembly_methods_title] = pandas.concat([self.methods[self.assembly_methods_title], pandas.Series(method)])

    def make_tex(self):
        """Insert the table of contents and write the markdown file to disk."""
        self.doc.new_table_of_contents(table_title='detailed run information', depth=2, marker="tableofcontents")
        text = self.doc.file_data_text
        # Strip mdutils' internal placeholder markers.
        text = text.replace("##--[", "")
        text = text.replace("]--##", "")
        self.doc.file_data_text = text
        self.doc.create_md_file()

    def make_report(self):
        """Build every report section, then convert the markdown to a PDF."""
        self.ofh.write("\nXXXXXX In make_report\n\n")
        self.start_doc()
        self.add_summary()
        self.add_contamination()
        self.add_alignment()
        self.add_features()
        self.add_feature_plots()
        self.add_mutations()
        # TODO stuff working to here...
        self.add_large_indels()
        self.add_plasmids()
        self.add_amr_matrix()
        # self.add_snps()
        self.add_methods()
        self.make_tex()
        # It took me quite a long time to find out that the value of the -t
        # (implied) argument in the following command must be 'html' instead of
        # the more logical 'pdf'. see the answer from snsn in this thread:
        # https://github.com/jessicategner/pypandoc/issues/186
        self.ofh.write("\nXXXXX In make_report, calling pypandoc.convert_file...\n\n")
        pypandoc.convert_file(self.report_md,
                              'html',
                              extra_args=['--pdf-engine=weasyprint', '-V', '-css=%s' % self.pima_css],
                              outputfile='pima_report.pdf')
        self.ofh.close()
686 | |
687 | |
parser = argparse.ArgumentParser()

parser.add_argument('--analysis_name', action='store', dest='analysis_name', help='Sample identifier')
parser.add_argument('--assembly_fasta_file', action='store', dest='assembly_fasta_file', help='Assembly fasta file')
parser.add_argument('--assembly_name', action='store', dest='assembly_name', help='Assembly identifier')
parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files')
parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files')
parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file')
parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome')
parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped')
parser.add_argument('--illumina_fastq_file', action='store', dest='illumina_fastq_file', help='Input sample')
parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BED file')
parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files')
parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stylesheet')

args = parser.parse_args()

# Prepare the features BED files.
feature_bed_files = []
for file_name in sorted(os.listdir(args.feature_bed_dir)):
    file_path = os.path.abspath(os.path.join(args.feature_bed_dir, file_name))
    feature_bed_files.append(file_path)
# Prepare the features PNG files.
feature_png_files = []
for file_name in sorted(os.listdir(args.feature_png_dir)):
    file_path = os.path.abspath(os.path.join(args.feature_png_dir, file_name))
    feature_png_files.append(file_path)
# Prepare the mutation regions TSV files as a mapping of file name to path:
# PimaReport.add_mutations() looks regions up by '<region>_mutations.tsv'
# file name, so a list of absolute paths would never match.  Also fixed the
# copy-paste bug that joined these paths against feature_png_dir instead of
# mutation_regions_dir.
mutation_regions_files = {}
for file_name in sorted(os.listdir(args.mutation_regions_dir)):
    mutation_regions_files[file_name] = os.path.abspath(os.path.join(args.mutation_regions_dir, file_name))

markdown_report = PimaReport(args.analysis_name,
                             args.assembly_fasta_file,
                             args.assembly_name,
                             feature_bed_files,
                             feature_png_files,
                             args.contig_coverage_file,
                             args.dbkey,
                             args.gzipped,
                             args.illumina_fastq_file,
                             args.mutation_regions_bed_file,
                             mutation_regions_files,
                             args.pima_css)
markdown_report.make_report()