Mercurial > repos > greg > pima_report
comparison pima_report.py @ 8:66c7cdf7e93f draft
Uploaded
author | greg |
---|---|
date | Thu, 09 Mar 2023 18:09:29 +0000 |
parents | 9213359bfda8 |
children | 9d608c530bbc |
comparison
equal
deleted
inserted
replaced
7:9213359bfda8 | 8:66c7cdf7e93f |
---|---|
107 self.plasmid_title = 'Plasmid annotation' | 107 self.plasmid_title = 'Plasmid annotation' |
108 self.reference_methods_title = 'Reference comparison' | 108 self.reference_methods_title = 'Reference comparison' |
109 self.snp_indel_title = 'SNPs and small indels' | 109 self.snp_indel_title = 'SNPs and small indels' |
110 self.summary_title = 'Analysis of %s' % analysis_name | 110 self.summary_title = 'Analysis of %s' % analysis_name |
111 | 111 |
112 # Contamination | |
113 self.kraken_fracs = pandas.Series(dtype=object) | |
114 | |
115 # Methods | 112 # Methods |
116 self.methods = pandas.Series(dtype='float64') | 113 self.methods = pandas.Series(dtype='float64') |
117 self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64') | 114 self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64') |
118 self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64') | 115 self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64') |
119 self.methods[self.reference_methods_title] = pandas.Series(dtype='float64') | 116 self.methods[self.reference_methods_title] = pandas.Series(dtype='float64') |
184 warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type | 181 warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type |
185 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) | 182 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) |
186 # See if some contigs have anomalously low coverage. | 183 # See if some contigs have anomalously low coverage. |
187 fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage | 184 fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage |
188 low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :] | 185 low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :] |
189 if low_coverage.shape[0] >= 0 : | 186 if low_coverage.shape[0] >= 0: |
190 for contig_i in range(low_coverage.shape[0]): | 187 for contig_i in range(low_coverage.shape[0]): |
191 warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type | 188 warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type |
192 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) | 189 self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) |
193 | 190 |
194 def load_fasta(self, fasta): | 191 def load_fasta(self, fasta): |
304 'Illumina bases', | 301 'Illumina bases', |
305 '{:s}'.format(self.illumina_bases) | 302 '{:s}'.format(self.illumina_bases) |
306 ] | 303 ] |
307 self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left') | 304 self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left') |
308 | 305 |
309 def evaluate_assembly(self) : | 306 def evaluate_assembly(self): |
310 assembly_info = pandas.read_csv(self.compute_sequence_length_file, sep='\t', header=None) | 307 assembly_info = pandas.read_csv(self.compute_sequence_length_file, sep='\t', header=None) |
311 assembly_info.columns = ['contig', 'length'] | 308 assembly_info.columns = ['contig', 'length'] |
312 self.contig_sizes = assembly_info | 309 self.contig_sizes = assembly_info |
313 # Take a look at the number of contigs, their sizes, | 310 # Take a look at the number of contigs, their sizes, |
314 # and circularity. Warn if things don't look good. | 311 # and circularity. Warn if things don't look good. |
419 def add_contamination(self): | 416 def add_contamination(self): |
420 self.ofh.write("\nXXXXXX In add_contamination\n\n") | 417 self.ofh.write("\nXXXXXX In add_contamination\n\n") |
421 if self.kraken2_report_file is None: | 418 if self.kraken2_report_file is None: |
422 return | 419 return |
423 # Read in the Kraken fractions and pull out the useful parts | 420 # Read in the Kraken fractions and pull out the useful parts |
424 self.kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None) | 421 kraken_fracs = pandas.read_csv(self.kraken2_report_file, delimiter='\t', header=None) |
425 self.kraken_fracs.index = self.kraken_fracs.iloc[:, 4].values | 422 kraken_fracs.index = kraken_fracs.iloc[:, 4].values |
426 self.kraken_fracs = self.kraken_fracs.loc[self.kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] | 423 kraken_fracs = kraken_fracs.loc[kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] |
427 self.kraken_fracs = self.kraken_fracs.loc[(self.kraken_fracs.iloc[:, 0] >= 1) | (self.kraken_fracs.iloc[:, 3] == 'U'), :] | 424 kraken_fracs = kraken_fracs.loc[(kraken_fracs.iloc[:, 0] >= 1) | (kraken_fracs.iloc[:, 3] == 'U'), :] |
428 self.kraken_fracs = self.kraken_fracs.iloc[:, [0, 1, 3, 5]] | 425 kraken_fracs = kraken_fracs.iloc[:, [0, 1, 3, 5]] |
429 self.kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] | 426 kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] |
430 self.kraken_fracs['Fraction'] = (self.kraken_fracs['Fraction'] / 100).round(4) | 427 kraken_fracs['Fraction'] = (kraken_fracs['Fraction'] / 100).round(4) |
431 self.kraken_fracs.sort_values(by='Fraction', inplace=True, ascending=False) | 428 kraken_fracs.sort_values(by='Fraction', inplace=True, ascending=False) |
432 self.kraken_fracs['Taxa'] = self.kraken_fracs['Taxa'].str.lstrip() | 429 kraken_fracs['Taxa'] = kraken_fracs['Taxa'].str.lstrip() |
433 self.doc.new_line() | 430 self.doc.new_line() |
434 self.doc.new_header(2, 'Contamination check') | 431 self.doc.new_header(2, 'Contamination check') |
435 for read_type, kraken_fracs in self.kraken_fracs.iteritems(): | 432 for read_type, kraken_fracs in kraken_fracs.iteritems(): |
436 self.doc.new_line(self.read_type + ' classifications') | 433 self.doc.new_line(self.read_type + ' classifications') |
437 self.doc.new_line() | 434 self.doc.new_line() |
438 Table_List = ["Percent of Reads", "Reads", "Level", "Label"] | 435 Table_List = ["Percent of Reads", "Reads", "Level", "Label"] |
439 for index, row in kraken_fracs.iterrows(): | 436 for index, row in kraken_fracs.iterrows(): |
440 Table_List = Table_List + row.tolist() | 437 Table_List = Table_List + row.tolist() |
544 | 541 |
545 def add_mutations(self): | 542 def add_mutations(self): |
546 self.ofh.write("\nXXXXXX In add_mutations\n\n") | 543 self.ofh.write("\nXXXXXX In add_mutations\n\n") |
547 if len(self.mutation_regions_tsv_files) == 0: | 544 if len(self.mutation_regions_tsv_files) == 0: |
548 return | 545 return |
549 try : | 546 try: |
550 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) | 547 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) |
551 except Exception: | 548 except Exception: |
552 # Likely an empty file. | 549 # Likely an empty file. |
553 return | 550 return |
554 # TODO: this is the only place where reference_genome is used, | 551 # TODO: this is the only place where reference_genome is used, |
581 self.ofh.write("Processing mutations for region %s\n" % region_name) | 578 self.ofh.write("Processing mutations for region %s\n" % region_name) |
582 region_mutations_tsv_name = '%s_mutations.tsv' % region_name | 579 region_mutations_tsv_name = '%s_mutations.tsv' % region_name |
583 if region_mutations_tsv_name not in self.mutation_regions_tsv_files: | 580 if region_mutations_tsv_name not in self.mutation_regions_tsv_files: |
584 continue | 581 continue |
585 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] | 582 region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] |
586 try : | 583 try: |
587 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False) | 584 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False) |
588 except Exception: | 585 except Exception: |
589 region_mutations = pandas.DataFrame() | 586 region_mutations = pandas.DataFrame() |
590 if region_mutations.shape[0] == 0: | 587 if region_mutations.shape[0] == 0: |
591 continue | 588 continue |
685 self.doc.new_line() | 682 self.doc.new_line() |
686 self.doc.new_line('<div style="page-break-after: always;"></div>') | 683 self.doc.new_line('<div style="page-break-after: always;"></div>') |
687 self.doc.new_line() | 684 self.doc.new_line() |
688 | 685 |
689 def add_plasmids(self): | 686 def add_plasmids(self): |
690 try : | 687 try: |
691 plasmids = pandas.read_csv(filepath_or_buffer=self.plasmids_file, sep='\t', header=0) | 688 plasmids = pandas.read_csv(filepath_or_buffer=self.plasmids_file, sep='\t', header=0) |
692 except Exception: | 689 except Exception: |
693 return | 690 return |
694 plasmids = plasmids.copy() | 691 plasmids = plasmids.copy() |
695 self.doc.new_line() | 692 self.doc.new_line() |