Mercurial repository greg/pima_report — comparison of pima_report.py @ changeset 31:a859de7cce94 (draft)
description | Uploaded
author | greg
date | Tue, 27 Jun 2023 13:39:14 +0000
parents | 134a0879d0b6
children | 163260afc1b1
30:134a0879d0b6 | 31:a859de7cce94 |
---|---|
27 illumina_forward_read_file=None, illumina_reverse_read_file=None, kraken2_report_file=None, | 27 illumina_forward_read_file=None, illumina_reverse_read_file=None, kraken2_report_file=None, |
28 kraken2_version=None, lrn_risk_amr_file=None, lrn_risk_blacklist_file=None, lrn_risk_vf_file=None, | 28 kraken2_version=None, lrn_risk_amr_file=None, lrn_risk_blacklist_file=None, lrn_risk_vf_file=None, |
29 minimap2_version=None, mutation_regions_bed_file=None, mutation_regions_tsv_files=None, | 29 minimap2_version=None, mutation_regions_bed_file=None, mutation_regions_tsv_files=None, |
30 ont_file=None, pima_css=None, plasmids_file=None, quast_report_file=None, read_type=None, | 30 ont_file=None, pima_css=None, plasmids_file=None, quast_report_file=None, read_type=None, |
31 reference_insertions_file=None, samtools_version=None, varscan_version=None): | 31 reference_insertions_file=None, samtools_version=None, varscan_version=None): |
32 self.ofh = open("process_log.txt", "w") | |
33 | |
34 self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file)) | |
35 self.ofh.write("amr_matrix_files: %s\n" % str(amr_matrix_files)) | |
36 self.ofh.write("analysis_name: %s\n" % str(analysis_name)) | |
37 self.ofh.write("assembler_version: %s\n" % str(assembler_version)) | |
38 self.ofh.write("assembly_fasta_file: %s\n" % str(assembly_fasta_file)) | |
39 self.ofh.write("assembly_name: %s\n" % str(assembly_name)) | |
40 self.ofh.write("bedtools_version: %s\n" % str(bedtools_version)) | |
41 self.ofh.write("blastn_version: %s\n" % str(blastn_version)) | |
42 self.ofh.write("circos_files: %s\n" % str(circos_files)) | |
43 self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file)) | |
44 self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file)) | |
45 self.ofh.write("dbkey: %s\n" % str(dbkey)) | |
46 self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) | |
47 self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version)) | |
48 self.ofh.write("errors_file: %s\n" % str(errors_file)) | |
49 self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) | |
50 self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) | |
51 self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) | |
52 self.ofh.write("gzipped: %s\n" % str(gzipped)) | |
53 self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file)) | |
54 self.ofh.write("illumina_forward_read_file: %s\n" % str(illumina_forward_read_file)) | |
55 self.ofh.write("illumina_reverse_read_file: %s\n" % str(illumina_reverse_read_file)) | |
56 self.ofh.write("kraken2_report_file: %s\n" % str(kraken2_report_file)) | |
57 self.ofh.write("kraken2_version: %s\n" % str(kraken2_version)) | |
58 self.ofh.write("lrn_risk_amr_file: %s\n" % str(lrn_risk_amr_file)) | |
59 self.ofh.write("lrn_risk_blacklist_file: %s\n" % str(lrn_risk_blacklist_file)) | |
60 self.ofh.write("lrn_risk_vf_file: %s\n" % str(lrn_risk_vf_file)) | |
61 self.ofh.write("minimap2_version: %s\n" % str(minimap2_version)) | |
62 self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file)) | |
63 self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) | |
64 self.ofh.write("ont_file: %s\n" % str(ont_file)) | |
65 self.ofh.write("pima_css: %s\n" % str(pima_css)) | |
66 self.ofh.write("plasmids_file: %s\n" % str(plasmids_file)) | |
67 self.ofh.write("quast_report_file: %s\n" % str(quast_report_file)) | |
68 self.ofh.write("read_type: %s\n" % str(read_type)) | |
69 self.ofh.write("reference_insertions_file: %s\n" % str(reference_insertions_file)) | |
70 self.ofh.write("samtools_version: %s\n" % str(samtools_version)) | |
71 self.ofh.write("varscan_version: %s\n" % str(varscan_version)) | |
72 | 32 |
73 # General | 33 # General |
74 self.doc = None | 34 self.doc = None |
75 self.report_md = 'pima_report.md' | 35 self.report_md = 'pima_report.md' |
76 | 36 |
77 # Inputs | 37 # Inputs |
78 self.amr_deletions_file = amr_deletions_file | 38 self.amr_deletions_file = amr_deletions_file |
79 self.amr_matrix_files = amr_matrix_files | 39 self.amr_matrix_files = amr_matrix_files |
80 self.analysis_name = analysis_name.split('_')[0] | 40 self.analysis_name = analysis_name.split('_')[0] |
81 self.ofh.write("self.analysis_name: %s\n" % str(self.analysis_name)) | |
82 if assembler_version is None: | 41 if assembler_version is None: |
83 self.assembler_version = 'assembler (version unknown)' | 42 self.assembler_version = 'assembler (version unknown)' |
84 else: | 43 else: |
85 if read_type == 'ont': | 44 if read_type == 'ont': |
86 # Assembler is flye. | 45 # Assembler is flye. |
215 self.ont_raw_fastq = None | 174 self.ont_raw_fastq = None |
216 | 175 |
217 # Actions | 176 # Actions |
218 self.did_guppy_ont_fast5 = False | 177 self.did_guppy_ont_fast5 = False |
219 self.did_qcat_ont_fastq = False | 178 self.did_qcat_ont_fastq = False |
220 self.ofh.write("self.read_type: %s\n" % str(self.read_type)) | |
221 if self.read_type == 'ONT': | 179 if self.read_type == 'ONT': |
222 self.info_ont_fastq(self.ont_file) | 180 self.info_ont_fastq(self.ont_file) |
223 else: | 181 else: |
224 self.info_illumina_fastq([self.illumina_forward_read_file, self.illumina_reverse_read_file]) | 182 self.info_illumina_fastq([self.illumina_forward_read_file, self.illumina_reverse_read_file]) |
225 self.load_contig_info() | 183 self.load_contig_info() |
226 | 184 |
227 def run_command(self, command): | 185 def run_command(self, command): |
228 self.ofh.write("\nXXXXXX In run_command, command:\n%s\n\n" % str(command)) | |
229 try: | 186 try: |
230 return re.split('\\n', subprocess.check_output(command, shell=True).decode('utf-8')) | 187 return re.split('\\n', subprocess.check_output(command, shell=True).decode('utf-8')) |
231 except Exception: | 188 except Exception: |
232 message = 'Command %s failed: exiting...' % command | 189 message = 'Command %s failed: exiting...' % command |
233 sys.exit(message) | 190 sys.exit(message) |
234 | 191 |
235 def format_kmg(self, number, decimals=0): | 192 def format_kmg(self, number, decimals=0): |
236 self.ofh.write("\nXXXXXX In format_kmg, number:\n%s\n" % str(number)) | |
237 self.ofh.write("XXXXXX In format_kmg, decimals:\n%s\n\n" % str(decimals)) | |
238 if number == 0: | 193 if number == 0: |
239 return '0' | 194 return '0' |
240 magnitude_powers = [10**9, 10**6, 10**3, 1] | 195 magnitude_powers = [10**9, 10**6, 10**3, 1] |
241 magnitude_units = ['G', 'M', 'K', ''] | 196 magnitude_units = ['G', 'M', 'K', ''] |
242 for i in range(len(magnitude_units)): | 197 for i in range(len(magnitude_units)): |
277 self.assembly = self.load_fasta(self.assembly_fasta_file) | 232 self.assembly = self.load_fasta(self.assembly_fasta_file) |
278 self.num_assembly_contigs = len(self.assembly) | 233 self.num_assembly_contigs = len(self.assembly) |
279 self.assembly_size = self.format_kmg(sum([len(x) for x in self.assembly]), decimals=1) | 234 self.assembly_size = self.format_kmg(sum([len(x) for x in self.assembly]), decimals=1) |
280 | 235 |
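The body of format_kmg is truncated in this hunk; as context, a minimal sketch of the same K/M/G idea, assuming the loop simply selects the largest matching magnitude (an illustration, not the repository's exact body):

```python
# Hedged sketch of the K/M/G formatting idea in format_kmg above: pick the
# largest power of ten that the number reaches and append its unit.
def format_kmg_sketch(number, decimals=0):
    if number == 0:
        return '0'
    magnitude_powers = [10**9, 10**6, 10**3, 1]
    magnitude_units = ['G', 'M', 'K', '']
    for power, unit in zip(magnitude_powers, magnitude_units):
        if number >= power:
            return '{:0.{}f}{}'.format(number / power, decimals, unit)
    return str(number)

# e.g. format_kmg_sketch(5200000, decimals=1) -> '5.2M'
```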
281 def info_illumina_fastq(self, illumina_read_files): | 236 def info_illumina_fastq(self, illumina_read_files): |
282 self.ofh.write("\nXXXXXX In info_illumina_fastq\n\n") | |
283 if self.gzipped: | 237 if self.gzipped: |
284 opener = 'gunzip -c' | 238 opener = 'gunzip -c' |
285 else: | 239 else: |
286 opener = 'cat' | 240 opener = 'cat' |
287 for fastq_file in illumina_read_files: | 241 for fastq_file in illumina_read_files: |
288 command = ' '.join([opener, | 242 command = ' '.join([opener, |
289 fastq_file, | 243 fastq_file, |
290 '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'']) | 244 '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'']) |
291 output = self.run_command(command) | |
292 self.ofh.write("output:\n%s\n" % str(output)) | |
293 self.ofh.write("re.split('\\t', self.run_command(command)[0]:\n%s\n" % str(re.split('\\t', self.run_command(command)[0]))) | |
294 values = [] | 245 values = [] |
295 for i in re.split('\\t', self.run_command(command)[0]): | 246 for i in re.split('\\t', self.run_command(command)[0]): |
296 if i == '': | 247 if i == '': |
297 values.append(float('nan')) | 248 values.append(float('nan')) |
298 else: | 249 else: |
299 values.append(float(i)) | 250 values.append(float(i)) |
300 self.ofh.write("values:\n%s\n" % str(values)) | |
301 self.ofh.write("values[0]:\n%s\n" % str(values[0])) | |
302 self.illumina_length_mean += values[0] | 251 self.illumina_length_mean += values[0] |
303 self.ofh.write("values[1]:\n%s\n" % str(values[1])) | |
304 self.illumina_read_count += int(values[1]) | 252 self.illumina_read_count += int(values[1]) |
305 self.ofh.write("values[2]:\n%s\n" % str(values[2])) | |
306 self.illumina_bases += int(values[2]) | 253 self.illumina_bases += int(values[2]) |
307 self.illumina_length_mean /= 2 | 254 self.illumina_length_mean /= 2 |
308 self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1) | 255 self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1) |
309 | 256 |
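For reference, the awk one-liner above emits three tab-separated values per FASTQ file: mean read length, read count, and total bases, which are accumulated into illumina_length_mean, illumina_read_count and illumina_bases. A hedged pure-Python sketch of the same calculation (the path and gzipped flag are placeholders):

```python
# Hedged Python equivalent of the awk one-liner used in info_illumina_fastq:
# every 4-line FASTQ record contributes its sequence length to the totals.
import gzip

def fastq_stats(path, gzipped=False):
    opener = gzip.open if gzipped else open
    total_bases = 0
    read_count = 0
    with opener(path, 'rt') as fh:
        for i, line in enumerate(fh):
            if i % 4 == 1:  # the sequence line of each record
                total_bases += len(line.strip())
                read_count += 1
    mean_length = total_bases / read_count if read_count else float('nan')
    return mean_length, read_count, total_bases
```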
310 def start_doc(self): | 257 def start_doc(self): |
316 self.doc.new_line() | 263 self.doc.new_line() |
317 self.doc.new_line('<div style="page-break-after: always;"></div>') | 264 self.doc.new_line('<div style="page-break-after: always;"></div>') |
318 self.doc.new_line() | 265 self.doc.new_line() |
319 | 266 |
320 def add_run_information(self): | 267 def add_run_information(self): |
321 self.ofh.write("\nXXXXXX In add_run_information\n\n") | |
322 self.doc.new_line() | 268 self.doc.new_line() |
323 self.doc.new_header(1, 'Run information') | 269 self.doc.new_header(1, 'Run information') |
324 # Tables in md.utils are implemented as a wrapping function. | 270 # Tables in md.utils are implemented as a wrapping function. |
325 Table_list = [ | 271 Table_list = [ |
326 "Category", | 272 "Category", |
343 # FIXME: the following doesn't work. | 289 # FIXME: the following doesn't work. |
344 # self.add_table_of_contents() | 290 # self.add_table_of_contents() |
345 self.doc.new_line() | 291 self.doc.new_line() |
346 | 292 |
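The comment about tables in md.utils refers to the mdutils pattern used throughout these add_* methods: a flat list of cells is passed to new_table() together with explicit column and row counts. A minimal, hedged sketch (file name and cell values are illustrative):

```python
# Hedged illustration of the mdutils table pattern used by the report
# methods: cells are supplied as one flat list, read row by row.
from mdutils.mdutils import MdUtils

doc = MdUtils(file_name='example_report')
table_list = [
    "Category", "Information",       # header row
    "Analysis name", "sample_1",     # illustrative values
    "Reference", "NC_007530.2",
]
doc.new_table(columns=2, rows=3, text=table_list, text_align='left')
doc.create_md_file()
```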
347 def add_ont_library_information(self): | 293 def add_ont_library_information(self): |
348 self.ofh.write("\nXXXXXX In add_ont_library_information\n\n") | |
349 if self.ont_n50 is None: | 294 if self.ont_n50 is None: |
350 return | 295 return |
351 self.doc.new_line() | 296 self.doc.new_line() |
352 self.doc.new_header(2, 'ONT library statistics') | 297 self.doc.new_header(2, 'ONT library statistics') |
353 Table_List = [ | 298 Table_List = [ |
368 ] | 313 ] |
369 self.doc.new_table(columns=2, rows=7, text=Table_List, text_align='left') | 314 self.doc.new_table(columns=2, rows=7, text=Table_List, text_align='left') |
370 self.doc.new_line() | 315 self.doc.new_line() |
371 | 316 |
372 def add_illumina_library_information(self): | 317 def add_illumina_library_information(self): |
373 self.ofh.write("\nXXXXXX In add_illumina_library_information\n\n") | |
374 if self.illumina_length_mean is None: | 318 if self.illumina_length_mean is None: |
375 return | 319 return |
376 self.doc.new_line() | 320 self.doc.new_line() |
377 self.doc.new_header(2, 'Illumina library statistics') | 321 self.doc.new_header(2, 'Illumina library statistics') |
378 Table_List = [ | 322 Table_List = [ |
400 if small_contigs.shape[0] > 0: | 344 if small_contigs.shape[0] > 0: |
401 warning = 'Assembly produced {:d} small contigs ({:s}); assembly may include spurious sequences.'.format(small_contigs.shape[0], ', '.join(small_contigs['contig'])) | 345 warning = 'Assembly produced {:d} small contigs ({:s}); assembly may include spurious sequences.'.format(small_contigs.shape[0], ', '.join(small_contigs['contig'])) |
402 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) | 346 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) |
403 | 347 |
404 def add_assembly_information(self): | 348 def add_assembly_information(self): |
405 self.ofh.write("\nXXXXXX In add_assembly_information\n\n") | |
406 if self.assembly_fasta_file is None: | 349 if self.assembly_fasta_file is None: |
407 return | 350 return |
408 self.load_assembly() | 351 self.load_assembly() |
409 self.doc.new_line() | 352 self.doc.new_line() |
410 self.doc.new_header(2, 'Assembly statistics') | 353 self.doc.new_header(2, 'Assembly statistics') |
417 str(self.assembly_size), | 360 str(self.assembly_size), |
418 ] | 361 ] |
419 self.doc.new_table(columns=2, rows=3, text=Table_List, text_align='left') | 362 self.doc.new_table(columns=2, rows=3, text=Table_List, text_align='left') |
420 | 363 |
421 def info_ont_fastq(self, fastq_file): | 364 def info_ont_fastq(self, fastq_file): |
422 self.ofh.write("\nXXXXXX In info_ont_fastq, fastq_file:\n%s\n\n" % str(fastq_file)) | |
423 opener = 'cat' | 365 opener = 'cat' |
424 if self.gzipped: | 366 if self.gzipped: |
425 opener = 'gunzip -c' | 367 opener = 'gunzip -c' |
426 else: | 368 else: |
427 opener = 'cat' | 369 opener = 'cat' |
467 return '<br>'.join(out) | 409 return '<br>'.join(out) |
468 else: | 410 else: |
469 return string | 411 return string |
470 | 412 |
471 def add_contig_info(self): | 413 def add_contig_info(self): |
472 self.ofh.write("\nXXXXXX In add_contig_info\n\n") | |
473 if self.contig_info is None or self.read_type not in self.contig_info.index: | 414 if self.contig_info is None or self.read_type not in self.contig_info.index: |
474 return | 415 return |
475 self.doc.new_line() | 416 self.doc.new_line() |
476 self.doc.new_header(2, 'Assembly coverage by ' + self.read_type) | 417 self.doc.new_header(2, 'Assembly coverage by ' + self.read_type) |
477 Table_List = ["Contig", "Length (bp)", "Coverage (X)"] | 418 Table_List = ["Contig", "Length (bp)", "Coverage (X)"] |
481 Table_List = Table_List + formatted.iloc[i, :].values.tolist() | 422 Table_List = Table_List + formatted.iloc[i, :].values.tolist() |
482 row_count = int(len(Table_List) / 3) | 423 row_count = int(len(Table_List) / 3) |
483 self.doc.new_table(columns=3, rows=row_count, text=Table_List, text_align='left') | 424 self.doc.new_table(columns=3, rows=row_count, text=Table_List, text_align='left') |
484 | 425 |
485 def add_assembly_notes(self): | 426 def add_assembly_notes(self): |
486 self.ofh.write("\nXXXXXX In add_assembly_notes\n\n") | |
487 if len(self.assembly_notes) == 0: | 427 if len(self.assembly_notes) == 0: |
488 return | 428 return |
489 self.doc.new_line() | 429 self.doc.new_line() |
490 self.doc.new_line('<div style="page-break-after: always;"></div>') | 430 self.doc.new_line('<div style="page-break-after: always;"></div>') |
491 self.doc.new_line() | 431 self.doc.new_line() |
492 self.doc.new_header(2, self.assembly_notes_title) | 432 self.doc.new_header(2, self.assembly_notes_title) |
493 for note in self.assembly_notes: | 433 for note in self.assembly_notes: |
494 self.doc.new_line(note) | 434 self.doc.new_line(note) |
495 | 435 |
496 def add_contamination(self): | 436 def add_contamination(self): |
497 self.ofh.write("\nXXXXXX In add_contamination\n\n") | |
498 if self.kraken2_report_file is None: | 437 if self.kraken2_report_file is None: |
499 return | 438 return |
500 # Read in the Kraken fractions and pull out the useful parts | 439 # Read in the Kraken fractions and pull out the useful parts |
501 kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None) | 440 kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None) |
502 kraken_fracs.index = kraken_fracs.iloc[:, 4].values | 441 kraken_fracs.index = kraken_fracs.iloc[:, 4].values |
520 self.methods[self.contamination_methods_title] = '' | 459 self.methods[self.contamination_methods_title] = '' |
521 method = '%s was used to assign the raw reads into taxa.' % self.kraken2_version.rstrip('report') | 460 method = '%s was used to assign the raw reads into taxa.' % self.kraken2_version.rstrip('report') |
522 self.methods[self.contamination_methods_title] = self.methods[self.contamination_methods_title].append(pandas.Series(method)) | 461 self.methods[self.contamination_methods_title] = self.methods[self.contamination_methods_title].append(pandas.Series(method)) |
523 | 462 |
524 def add_alignment(self): | 463 def add_alignment(self): |
525 self.ofh.write("\nXXXXXX In add_alignment\n\n") | |
526 if self.quast_report_file is not None: | 464 if self.quast_report_file is not None: |
527 # Process quast values. | 465 # Process quast values. |
528 quast_report = pandas.read_csv(self.quast_report_file, header=0, index_col=0, sep='\t') | 466 quast_report = pandas.read_csv(self.quast_report_file, header=0, index_col=0, sep='\t') |
529 quast_mismatches = int(float(quast_report.loc['# mismatches per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.)) | 467 quast_mismatches = int(float(quast_report.loc['# mismatches per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.)) |
530 quast_indels = int(float(quast_report.loc['# indels per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.)) | 468 quast_indels = int(float(quast_report.loc['# indels per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.)) |
553 for circos_file in self.circos_files: | 491 for circos_file in self.circos_files: |
554 contig = os.path.basename(circos_file) | 492 contig = os.path.basename(circos_file) |
555 contig_title = 'Alignment to %s' % contig | 493 contig_title = 'Alignment to %s' % contig |
556 self.doc.new_line() | 494 self.doc.new_line() |
557 self.doc.new_header(level=3, title=contig_title) | 495 self.doc.new_header(level=3, title=contig_title) |
558 self.doc.new_line('Blue color indicates query sequences aligned to the reference sequence, which is shown in yellow') | 496 self.doc.new_line('Blue indicates query sequences aligned to the reference sequence, yellow indicates no alignment') |
559 self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(circos_file))) | 497 self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(circos_file))) |
560 self.doc.new_line('<div style="page-break-after: always;"></div>') | 498 self.doc.new_line('<div style="page-break-after: always;"></div>') |
561 self.doc.new_line() | 499 self.doc.new_line() |
562 if self.dbkey == 'ref_genome': | 500 if self.dbkey == 'ref_genome': |
563 headers = ["* Chromosome - NC_007530.2 Bacillus anthracis str. 'Ames Ancestor', complete sequence", | 501 headers = ["* Chromosome - NC_007530.2 Bacillus anthracis str. 'Ames Ancestor', complete sequence", |
567 self.methods[self.reference_genome_title] = self.methods[self.reference_genome_title].append(pandas.Series(method)) | 505 self.methods[self.reference_genome_title] = self.methods[self.reference_genome_title].append(pandas.Series(method)) |
568 method = 'The genome assembly was aligned against the reference sequence using %s.' % self.dnadiff_version | 506 method = 'The genome assembly was aligned against the reference sequence using %s.' % self.dnadiff_version |
569 self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method)) | 507 self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method)) |
570 | 508 |
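The QUAST-derived counts near the top of add_alignment() convert per-100-kbp rates into absolute estimates by scaling with the total assembly length; a hedged worked example with illustrative numbers:

```python
# Hedged worked example of the per-100-kbp scaling in add_alignment():
# 12.5 mismatches per 100 kbp over a 5,200,000 bp assembly.
rate_per_100kbp = 12.5      # '# mismatches per 100 kbp' (illustrative)
total_length = 5200000      # 'Total length (>= 0 bp)' (illustrative)
estimated_mismatches = int(rate_per_100kbp * (total_length / 100000.))
assert estimated_mismatches == 650
```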
571 def add_features(self): | 509 def add_features(self): |
572 self.ofh.write("\nXXXXXX In add_features\n\n") | |
573 if len(self.feature_bed_files) == 0: | 510 if len(self.feature_bed_files) == 0: |
574 return | 511 return |
575 for bbf in self.feature_bed_files: | 512 for bbf in self.feature_bed_files: |
576 if os.path.getsize(bbf) > 0: | 513 if os.path.getsize(bbf) > 0: |
577 best = pandas.read_csv(filepath_or_buffer=bbf, sep='\t', header=None) | 514 best = pandas.read_csv(filepath_or_buffer=bbf, sep='\t', header=None) |
578 self.feature_hits[os.path.basename(bbf)] = best | 515 self.feature_hits[os.path.basename(bbf)] = best |
579 if len(self.feature_hits) == 0: | 516 if len(self.feature_hits) == 0: |
580 return | 517 return |
581 self.ofh.write("self.feature_hits: %s\n" % str(self.feature_hits)) | |
582 self.doc.new_line() | 518 self.doc.new_line() |
583 self.doc.new_header(level=2, title=self.feature_title) | 519 self.doc.new_header(level=2, title=self.feature_title) |
584 for feature_name in self.feature_hits.index.tolist(): | 520 for feature_name in self.feature_hits.index.tolist(): |
585 self.ofh.write("feature_name: %s\n" % str(feature_name)) | |
586 features = self.feature_hits[feature_name].copy() | 521 features = self.feature_hits[feature_name].copy() |
587 self.ofh.write("features: %s\n" % str(features)) | |
588 if features.shape[0] == 0: | 522 if features.shape[0] == 0: |
589 continue | 523 continue |
590 features.iloc[:, 1] = features.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) | 524 features.iloc[:, 1] = features.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) |
591 features.iloc[:, 2] = features.iloc[:, 2].apply(lambda x: '{:,}'.format(x)) | 525 features.iloc[:, 2] = features.iloc[:, 2].apply(lambda x: '{:,}'.format(x)) |
592 self.doc.new_line() | 526 self.doc.new_line() |
593 self.doc.new_header(level=3, title=feature_name) | 527 self.doc.new_header(level=3, title=feature_name) |
594 if (features.shape[0] == 0): | 528 if (features.shape[0] == 0): |
595 continue | 529 continue |
596 for contig in pandas.unique(features.iloc[:, 0]): | 530 for contig in pandas.unique(features.iloc[:, 0]): |
597 self.ofh.write("contig: %s\n" % str(contig)) | |
598 self.doc.new_line(contig) | 531 self.doc.new_line(contig) |
599 contig_features = features.loc[(features.iloc[:, 0] == contig), :] | 532 contig_features = features.loc[(features.iloc[:, 0] == contig), :] |
600 self.ofh.write("contig_features: %s\n" % str(contig_features)) | |
601 Table_List = ['Start', 'Stop', 'Feature', 'Identity (%)', 'Strand'] | 533 Table_List = ['Start', 'Stop', 'Feature', 'Identity (%)', 'Strand'] |
602 for i in range(contig_features.shape[0]): | 534 for i in range(contig_features.shape[0]): |
603 self.ofh.write("i: %s\n" % str(i)) | |
604 feature = contig_features.iloc[i, :].copy(deep=True) | 535 feature = contig_features.iloc[i, :].copy(deep=True) |
605 self.ofh.write("feature: %s\n" % str(feature)) | |
606 feature[4] = '{:.3f}'.format(feature[4]) | 536 feature[4] = '{:.3f}'.format(feature[4]) |
607 self.ofh.write("feature[1:].values.tolist(): %s\n" % str(feature[1:].values.tolist())) | |
608 Table_List = Table_List + feature[1:].values.tolist() | 537 Table_List = Table_List + feature[1:].values.tolist() |
609 self.ofh.write("Table_List: %s\n" % str(Table_List)) | |
610 row_count = int(len(Table_List) / 5) | 538 row_count = int(len(Table_List) / 5) |
611 self.ofh.write("row_count: %s\n" % str(row_count)) | |
612 self.doc.new_line() | 539 self.doc.new_line() |
613 self.ofh.write("Before new_table, len(Table_List):: %s\n" % str(len(Table_List))) | |
614 self.doc.new_table(columns=5, rows=row_count, text=Table_List, text_align='left') | 540 self.doc.new_table(columns=5, rows=row_count, text=Table_List, text_align='left') |
615 blastn_version = 'The genome assembly was queried for features using %s.' % self.blastn_version | 541 blastn_version = 'The genome assembly was queried for features using %s.' % self.blastn_version |
616 bedtools_version = 'Feature hits were clustered using %s and the highest scoring hit for each cluster was reported.' % self.bedtools_version | 542 bedtools_version = 'Feature hits were clustered using %s and the highest scoring hit for each cluster was reported.' % self.bedtools_version |
617 method = '%s %s' % (blastn_version, bedtools_version) | 543 method = '%s %s' % (blastn_version, bedtools_version) |
618 self.methods[self.feature_methods_title] = self.methods[self.feature_methods_title].append(pandas.Series(method)) | 544 self.methods[self.feature_methods_title] = self.methods[self.feature_methods_title].append(pandas.Series(method)) |
619 | 545 |
620 def add_feature_plots(self): | 546 def add_feature_plots(self): |
621 self.ofh.write("\nXXXXXX In add_feature_plots\n\n") | |
622 if len(self.feature_png_files) == 0: | 547 if len(self.feature_png_files) == 0: |
623 return | 548 return |
624 self.doc.new_line() | 549 self.doc.new_line() |
625 self.doc.new_header(level=2, title='Feature Plots') | 550 self.doc.new_header(level=2, title='Feature Plots') |
626 self.doc.new_paragraph('Only contigs with features are shown') | 551 self.doc.new_paragraph('Only contigs with features are shown') |
627 for feature_png_file in self.feature_png_files: | 552 for feature_png_file in self.feature_png_files: |
628 self.doc.new_line(self.doc.new_inline_image(text='Analysis', path=os.path.abspath(feature_png_file))) | 553 self.doc.new_line(self.doc.new_inline_image(text='Analysis', path=os.path.abspath(feature_png_file))) |
629 | 554 |
630 def add_mutations(self): | 555 def add_mutations(self): |
631 self.ofh.write("\nXXXXXX In add_mutations\n\n") | |
632 if len(self.mutation_regions_tsv_files) == 0: | 556 if len(self.mutation_regions_tsv_files) == 0: |
633 return | 557 return |
634 try: | 558 try: |
635 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) | 559 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) |
636 except Exception: | 560 except Exception: |
638 return | 562 return |
639 amr_mutations = pandas.Series(dtype=object) | 563 amr_mutations = pandas.Series(dtype=object) |
640 for region_i in range(mutation_regions.shape[0]): | 564 for region_i in range(mutation_regions.shape[0]): |
641 region = mutation_regions.iloc[region_i, :] | 565 region = mutation_regions.iloc[region_i, :] |
642 region_name = str(region['name']) | 566 region_name = str(region['name']) |
643 self.ofh.write("Processing mutations for region %s\n" % region_name) | |
644 region_mutations_tsv_name = '%s_mutations.tsv' % region_name | 567 region_mutations_tsv_name = '%s_mutations.tsv' % region_name |
645 if region_mutations_tsv_name not in self.mutation_regions_tsv_files: | 568 if region_mutations_tsv_name not in self.mutation_regions_tsv_files: |
646 continue | 569 continue |
647 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] | 570 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] |
648 try: | 571 try: |
690 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) | 613 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) |
691 method = 'Mutations were identified using %s and %s.' % (self.samtools_version, self.varscan_version) | 614 method = 'Mutations were identified using %s and %s.' % (self.samtools_version, self.varscan_version) |
692 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) | 615 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) |
693 | 616 |
694 def add_amr_matrix(self): | 617 def add_amr_matrix(self): |
695 self.ofh.write("\nXXXXXX In add_amr_matrix\n\n") | |
696 # Make sure that we have an AMR matrix to plot | 618 # Make sure that we have an AMR matrix to plot |
697 if len(self.amr_matrix_files) == 0: | 619 if len(self.amr_matrix_files) == 0: |
698 return | 620 return |
699 self.doc.new_line() | 621 self.doc.new_line() |
700 self.doc.new_header(level=2, title=self.amr_matrix_title) | 622 self.doc.new_header(level=2, title=self.amr_matrix_title) |
701 self.doc.new_line('AMR genes and mutations with their corresponding drugs') | 623 amr_matrix_text = 'AMR genes and mutations with their corresponding drugs: dark blue indicates the presence of a gene/mutation, light blue indicates the absence of a gene/mutation' |
| 624 self.doc.new_line(amr_matrix_text) |
702 for amr_matrix_file in self.amr_matrix_files: | 625 for amr_matrix_file in self.amr_matrix_files: |
703 self.doc.new_line(self.doc.new_inline_image(text='AMR genes and mutations with their corresponding drugs', | 626 self.doc.new_line(self.doc.new_inline_image(text=amr_matrix_text, path=os.path.abspath(amr_matrix_file))) |
704 path=os.path.abspath(amr_matrix_file))) | |
705 | 627 |
706 def add_large_indels(self): | 628 def add_large_indels(self): |
707 self.ofh.write("\nXXXXXX In add_large_indels\n\n") | |
708 large_indels = pandas.Series(dtype='float64') | 629 large_indels = pandas.Series(dtype='float64') |
709 # Pull in insertions. | 630 # Pull in insertions. |
710 try: | 631 try: |
711 reference_insertions = pandas.read_csv(filepath_or_buffer=self.reference_insertions_file, sep='\t', header=None) | 632 reference_insertions = pandas.read_csv(filepath_or_buffer=self.reference_insertions_file, sep='\t', header=None) |
712 except Exception: | 633 except Exception: |
749 self.doc.new_line() | 670 self.doc.new_line() |
750 self.doc.new_line('<div style="page-break-after: always;"></div>') | 671 self.doc.new_line('<div style="page-break-after: always;"></div>') |
751 self.doc.new_line() | 672 self.doc.new_line() |
752 | 673 |
753 def add_lrn_risk_info(self): | 674 def add_lrn_risk_info(self): |
754 self.ofh.write("\nXXXXXX In add_lrn_risk_info\n\n") | |
755 if self.lrn_risk_amr_file is None and self.lrn_risk_blacklist_file is None and self.lrn_risk_vf_file is None: | 675 if self.lrn_risk_amr_file is None and self.lrn_risk_blacklist_file is None and self.lrn_risk_vf_file is None: |
756 return | 676 return |
757 if not os.path.isfile(self.lrn_risk_amr_file) and not os.path.isfile(self.lrn_risk_blacklist_file) and not os.path.isfile(self.lrn_risk_vf_file): | 677 if not os.path.isfile(self.lrn_risk_amr_file) and not os.path.isfile(self.lrn_risk_blacklist_file) and not os.path.isfile(self.lrn_risk_vf_file): |
758 return | 678 return |
759 if os.path.getsize(self.lrn_risk_amr_file) == 0 and os.path.getsize(self.lrn_risk_blacklist_file) == 0 and os.path.getsize(self.lrn_risk_vf_file) == 0: | 679 if os.path.getsize(self.lrn_risk_amr_file) == 0 and os.path.getsize(self.lrn_risk_blacklist_file) == 0 and os.path.getsize(self.lrn_risk_vf_file) == 0: |
830 self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) | 750 self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) |
831 method = 'Plasmid-to-genome hits were resolved using the pChunks algorithm.' | 751 method = 'Plasmid-to-genome hits were resolved using the pChunks algorithm.' |
832 self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) | 752 self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) |
833 | 753 |
834 def add_methods(self): | 754 def add_methods(self): |
835 self.ofh.write("\nXXXXXX In add_methods\n\n") | |
836 if len(self.methods) == 0: | 755 if len(self.methods) == 0: |
837 return | 756 return |
838 self.doc.new_line() | 757 self.doc.new_line() |
839 self.doc.new_header(level=2, title=self.methods_title) | 758 self.doc.new_header(level=2, title=self.methods_title) |
840 for methods_section in self.methods.index.tolist(): | 759 for methods_section in self.methods.index.tolist(): |
845 self.doc.new_paragraph(' '.join(self.methods[methods_section])) | 764 self.doc.new_paragraph(' '.join(self.methods[methods_section])) |
846 self.doc.new_line('<div style="page-break-after: always;"></div>') | 765 self.doc.new_line('<div style="page-break-after: always;"></div>') |
847 self.doc.new_line() | 766 self.doc.new_line() |
848 | 767 |
849 def add_summary(self): | 768 def add_summary(self): |
850 self.ofh.write("\nXXXXXX In add_summary\n\n") | |
851 # Add summary title | 769 # Add summary title |
852 self.doc.new_header(level=1, title=self.summary_title) | 770 self.doc.new_header(level=1, title=self.summary_title) |
853 # First section of Summary | 771 # First section of Summary |
854 self.doc.new_header(level=1, title='CDC Advisory') | 772 self.doc.new_header(level=1, title='CDC Advisory') |
855 self.doc.new_paragraph(CDC_ADVISORY) | 773 self.doc.new_paragraph(CDC_ADVISORY) |
891 text = text.replace("]--##", "") | 809 text = text.replace("]--##", "") |
892 self.doc.file_data_text = text | 810 self.doc.file_data_text = text |
893 self.doc.create_md_file() | 811 self.doc.create_md_file() |
894 | 812 |
895 def make_report(self): | 813 def make_report(self): |
896 self.ofh.write("\nXXXXXX In make_report\n\n") | |
897 self.start_doc() | 814 self.start_doc() |
898 self.add_summary() | 815 self.add_summary() |
899 self.add_contamination() | 816 self.add_contamination() |
900 self.add_alignment() | 817 self.add_alignment() |
901 self.add_features() | 818 self.add_features() |
910 self.make_tex() | 827 self.make_tex() |
911 # It took me quite a long time to find out that the value of the -t | 828 # It took me quite a long time to find out that the value of the -t |
912 # (implied) argument in the following command must be 'html' instead of | 829 # (implied) argument in the following command must be 'html' instead of |
913 # the more logical 'pdf'. see the answer from snsn in this thread: | 830 # the more logical 'pdf'. see the answer from snsn in this thread: |
914 # https://github.com/jessicategner/pypandoc/issues/186 | 831 # https://github.com/jessicategner/pypandoc/issues/186 |
915 self.ofh.write("\nXXXXX In make_report, calling pypandoc.convert_file...\n\n") | |
916 pypandoc.convert_file(self.report_md, | 832 pypandoc.convert_file(self.report_md, |
917 'html', | 833 'html', |
918 extra_args=['--pdf-engine=weasyprint', '-V', '-css=%s' % self.pima_css], | 834 extra_args=['--pdf-engine=weasyprint', '-V', '-css=%s' % self.pima_css], |
919 outputfile='pima_report.pdf') | 835 outputfile='pima_report.pdf') |
920 self.ofh.close() | |
921 | 836 |
922 | 837 |
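The pandoc quirk noted in make_report() (the target format must be 'html', with the PDF produced via the outputfile name and the weasyprint engine) can be reproduced in isolation. A minimal, hedged sketch with placeholder file names:

```python
# Hedged standalone illustration of the pypandoc/weasyprint call in
# make_report(): the 'to' format is 'html', not 'pdf'; weasyprint then
# renders that HTML into the requested .pdf output file.
import pypandoc

pypandoc.convert_file(
    'pima_report.md',                         # placeholder input
    'html',                                   # must be 'html', not 'pdf'
    extra_args=['--pdf-engine=weasyprint'],
    outputfile='pima_report.pdf',             # placeholder output
)
```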
923 parser = argparse.ArgumentParser() | 838 parser = argparse.ArgumentParser() |
924 | 839 |
925 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', help='AMR deletions BED file') | 840 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', help='AMR deletions BED file') |