comparison pima_report.py @ 8:66c7cdf7e93f draft

Uploaded
author greg
date Thu, 09 Mar 2023 18:09:29 +0000
parents 9213359bfda8
children 9d608c530bbc
comparison
equal deleted inserted replaced
7:9213359bfda8 8:66c7cdf7e93f
107 self.plasmid_title = 'Plasmid annotation' 107 self.plasmid_title = 'Plasmid annotation'
108 self.reference_methods_title = 'Reference comparison' 108 self.reference_methods_title = 'Reference comparison'
109 self.snp_indel_title = 'SNPs and small indels' 109 self.snp_indel_title = 'SNPs and small indels'
110 self.summary_title = 'Analysis of %s' % analysis_name 110 self.summary_title = 'Analysis of %s' % analysis_name
111 111
112 # Contamination
113 self.kraken_fracs = pandas.Series(dtype=object)
114
115 # Methods 112 # Methods
116 self.methods = pandas.Series(dtype='float64') 113 self.methods = pandas.Series(dtype='float64')
117 self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64') 114 self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64')
118 self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64') 115 self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64')
119 self.methods[self.reference_methods_title] = pandas.Series(dtype='float64') 116 self.methods[self.reference_methods_title] = pandas.Series(dtype='float64')
184 warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type 181 warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type
185 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) 182 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning))
186 # See if some contigs have anolously low coverage. 183 # See if some contigs have anolously low coverage.
187 fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage 184 fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage
188 low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :] 185 low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :]
189 if low_coverage.shape[0] >= 0 : 186 if low_coverage.shape[0] >= 0:
190 for contig_i in range(low_coverage.shape[0]): 187 for contig_i in range(low_coverage.shape[0]):
191 warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type 188 warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type
192 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) 189 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning))
193 190
194 def load_fasta(self, fasta): 191 def load_fasta(self, fasta):
304 'Illumina bases', 301 'Illumina bases',
305 '{:s}'.format(self.illumina_bases) 302 '{:s}'.format(self.illumina_bases)
306 ] 303 ]
307 self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left') 304 self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left')
308 305
309 def evaluate_assembly(self) : 306 def evaluate_assembly(self):
310 assembly_info = pandas.read_csv(self.compute_sequence_length_file, sep='\t', header=None) 307 assembly_info = pandas.read_csv(self.compute_sequence_length_file, sep='\t', header=None)
311 assembly_info.columns = ['contig', 'length'] 308 assembly_info.columns = ['contig', 'length']
312 self.contig_sizes = assembly_info 309 self.contig_sizes = assembly_info
313 # Take a look at the number of contigs, their sizes, 310 # Take a look at the number of contigs, their sizes,
314 # and circularity. Warn if things don't look good. 311 # and circularity. Warn if things don't look good.
419 def add_contamination(self): 416 def add_contamination(self):
420 self.ofh.write("\nXXXXXX In add_contamination\n\n") 417 self.ofh.write("\nXXXXXX In add_contamination\n\n")
421 if self.kraken2_report_file is None: 418 if self.kraken2_report_file is None:
422 return 419 return
423 # Read in the Kraken fractions and pull out the useful parts 420 # Read in the Kraken fractions and pull out the useful parts
424 self.kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None) 421 kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None)
425 self.kraken_fracs.index = self.kraken_fracs.iloc[:, 4].values 422 kraken_fracs.index = kraken_fracs.iloc[:, 4].values
426 self.kraken_fracs = self.kraken_fracs.loc[self.kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] 423 kraken_fracs = kraken_fracs.loc[kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :]
427 self.kraken_fracs = self.kraken_fracs.loc[(self.kraken_fracs.iloc[:, 0] >= 1) | (self.kraken_fracs.iloc[:, 3] == 'U'), :] 424 kraken_fracs = kraken_fracs.loc[(kraken_fracs.iloc[:, 0] >= 1) | (kraken_fracs.iloc[:, 3] == 'U'), :]
428 self.kraken_fracs = self.kraken_fracs.iloc[:, [0, 1, 3, 5]] 425 kraken_fracs = kraken_fracs.iloc[:, [0, 1, 3, 5]]
429 self.kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] 426 kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa']
430 self.kraken_fracs['Fraction'] = (self.kraken_fracs['Fraction'] / 100).round(4) 427 kraken_fracs['Fraction'] = (kraken_fracs['Fraction'] / 100).round(4)
431 self.kraken_fracs.sort_values(by='Fraction', inplace=True, ascending=False) 428 kraken_fracs.sort_values(by='Fraction', inplace=True, ascending=False)
432 self.kraken_fracs['Taxa'] = self.kraken_fracs['Taxa'].str.lstrip() 429 kraken_fracs['Taxa'] = kraken_fracs['Taxa'].str.lstrip()
433 self.doc.new_line() 430 self.doc.new_line()
434 self.doc.new_header(2, 'Contamination check') 431 self.doc.new_header(2, 'Contamination check')
435 for read_type, kraken_fracs in self.kraken_fracs.iteritems(): 432 for read_type, kraken_fracs in kraken_fracs.iteritems():
436 self.doc.new_line(self.read_type + ' classifications') 433 self.doc.new_line(self.read_type + ' classifications')
437 self.doc.new_line() 434 self.doc.new_line()
438 Table_List = ["Percent of Reads", "Reads", "Level", "Label"] 435 Table_List = ["Percent of Reads", "Reads", "Level", "Label"]
439 for index, row in kraken_fracs.iterrows(): 436 for index, row in kraken_fracs.iterrows():
440 Table_List = Table_List + row.tolist() 437 Table_List = Table_List + row.tolist()
544 541
545 def add_mutations(self): 542 def add_mutations(self):
546 self.ofh.write("\nXXXXXX In add_mutations\n\n") 543 self.ofh.write("\nXXXXXX In add_mutations\n\n")
547 if len(self.mutation_regions_tsv_files) == 0: 544 if len(self.mutation_regions_tsv_files) == 0:
548 return 545 return
549 try : 546 try:
550 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) 547 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False)
551 except Exception: 548 except Exception:
552 # Likely an empty file. 549 # Likely an empty file.
553 return 550 return
554 # TODO: this is the only place where reference_genome is used, 551 # TODO: this is the only place where reference_genome is used,
581 self.ofh.write("Processing mutations for region %s\n" % region_name) 578 self.ofh.write("Processing mutations for region %s\n" % region_name)
582 region_mutations_tsv_name = '%s_mutations.tsv' % region_name 579 region_mutations_tsv_name = '%s_mutations.tsv' % region_name
583 if region_mutations_tsv_name not in self.mutation_regions_tsv_files: 580 if region_mutations_tsv_name not in self.mutation_regions_tsv_files:
584 continue 581 continue
585 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] 582 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name]
586 try : 583 try:
587 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False) 584 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False)
588 except Exception: 585 except Exception:
589 region_mutations = pandas.DataFrame() 586 region_mutations = pandas.DataFrame()
590 if region_mutations.shape[0] == 0: 587 if region_mutations.shape[0] == 0:
591 continue 588 continue
685 self.doc.new_line() 682 self.doc.new_line()
686 self.doc.new_line('<div style="page-break-after: always;"></div>') 683 self.doc.new_line('<div style="page-break-after: always;"></div>')
687 self.doc.new_line() 684 self.doc.new_line()
688 685
689 def add_plasmids(self): 686 def add_plasmids(self):
690 try : 687 try:
691 plasmids = pandas.read_csv(filepath_or_buffer=self.plasmids_file, sep='\t', header=0) 688 plasmids = pandas.read_csv(filepath_or_buffer=self.plasmids_file, sep='\t', header=0)
692 except Exception: 689 except Exception:
693 return 690 return
694 plasmids = plasmids.copy() 691 plasmids = plasmids.copy()
695 self.doc.new_line() 692 self.doc.new_line()