diff vsnp_statistics.py @ 11:6b3b0f5858e6 (draft, default, tip)
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit c38fd63f7980c70390d104a73ba4c72b266444c3
| author | iuc |
|---|---|
| date | Fri, 10 Jun 2022 06:11:08 +0000 |
| parents | 25714108bb22 |
| children | |
```diff
--- a/vsnp_statistics.py	Mon Dec 06 18:28:04 2021 +0000
+++ b/vsnp_statistics.py	Fri Jun 10 06:11:08 2022 +0000
@@ -1,25 +1,29 @@
 #!/usr/bin/env python
 
 import argparse
-import gzip
 import os
-from functools import partial
-
-import numpy
-import pandas
-from Bio import SeqIO
 
 
 class Statistics:
 
-    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
-        self.reference = reference
-        self.fastq_file = fastq_file
+    def __init__(self, file_name, file_size, seq_type, num_seqs, sum_len, min_len, avg_len,
+                 max_len, q1, q2, q3, sum_gap, n50, pass_q20, pass_q30, read_quality_average):
+        self.file_name = file_name
         self.file_size = file_size
-        self.total_reads = total_reads
-        self.mean_read_length = mean_read_length
-        self.mean_read_quality = mean_read_quality
-        self.reads_passing_q30 = reads_passing_q30
+        self.seq_type = seq_type
+        self.num_seqs = num_seqs
+        self.sum_len = sum_len
+        self.min_len = min_len
+        self.avg_len = avg_len
+        self.max_len = max_len
+        self.q1 = q1
+        self.q2 = q2
+        self.q3 = q3
+        self.sum_gap = sum_gap
+        self.n50 = n50
+        self.pass_q20 = pass_q20
+        self.pass_q30 = pass_q30
+        self.read_quality_average = read_quality_average
 
 
 def nice_size(size):
@@ -43,75 +47,19 @@
     return '??? bytes'
 
 
-def get_statistics(dbkey, fastq_file, gzipped):
-    sampling_size = 10000
-    # Read fastq_file into a data fram to
-    # get the phred quality scores.
-    _open = partial(gzip.open, mode='rt') if gzipped else open
-    with _open(fastq_file) as fh:
-        identifiers = []
-        seqs = []
-        letter_annotations = []
-        for seq_record in SeqIO.parse(fh, "fastq"):
-            identifiers.append(seq_record.id)
-            seqs.append(seq_record.seq)
-            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-    # Convert lists to Pandas series.
-    s1 = pandas.Series(identifiers, name='id')
-    s2 = pandas.Series(seqs, name='seq')
-    # Gather Series into a data frame.
-    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-    # Starting at row 3, keep every 4 row
-    # random sample specified number of rows.
-    file_size = nice_size(os.path.getsize(fastq_file))
-    total_reads = len(seqs)
-    # Mean Read Length
-    if sampling_size > total_reads:
-        sampling_size = total_reads
-    try:
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-    except ValueError:
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
-    dict_mean = {}
-    list_length = []
-    i = 0
-    for id, seq, in fastq_df.iterrows():
-        dict_mean[id] = numpy.mean(letter_annotations[i])
-        list_length.append(len(seq.array[0]))
-        i += 1
-    mean_read_length = '%.1f' % numpy.mean(list_length)
-    # Mean Read Quality
-    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-    mean_read_quality = '%.1f' % df_mean['ave'].mean()
-    # Reads Passing Q30
-    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
-                       mean_read_quality, reads_passing_q30)
-    return stats
-
-
-def accrue_statistics(dbkey, read1, read2, gzipped):
-    read1_stats = get_statistics(dbkey, read1, gzipped)
-    if read2 is None:
-        read2_stats = None
-    else:
-        read2_stats = get_statistics(dbkey, read2, gzipped)
-    return read1_stats, read2_stats
-
-
-def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+def output_statistics(read1_stats, read2_stats, output_file):
     paired_reads = read2_stats is not None
     if paired_reads:
-        columns = ['Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
-                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
-                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
-                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
-                   'Good SNP Count', 'Reference']
+        columns = ['R1 FASTQ', 'R1 File Size', 'R1 Read Count', 'R1 Length Sum', 'R1 Min Length',
+                   'R1 Ave Length', 'R1 Max Length', 'R1 Q1', 'R1 Q2', 'R1 Q3', 'R1 Sum Gap',
+                   'R1 N50', 'R1 Passing Q20', 'R1 Passing Q30', 'R1 Read Quality Ave', 'R2 FASTQ',
+                   'R2 File Size', 'R2 Read Count', 'R2 Length Sum', 'R2 Min Length', 'R2 Ave Length',
+                   'R2 Max Length', 'R2 Q1', 'R2 Q2', 'R2 Q3', 'R2 Sum Gap', 'R2 N50', 'R2 Passing Q20',
+                   'R2 Passing Q30', 'R2 Read Quality Ave']
     else:
-        columns = ['FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count', 'Reference']
+        columns = ['FASTQ', 'File Size', 'Read Count', 'Length Sum', 'Min Length', 'Ave Length',
+                   'Max Length', 'Q1', 'Q2', 'Q3', 'Sum Gap', 'N50', 'Passing Q20', 'Passing Q30',
+                   'Read Quality Ave']
     with open(output_file, "w") as outfh:
         # Make sure the header starts with a # so
         # MultiQC can properly handle the output.
@@ -119,97 +67,102 @@
         line_items = []
         # Get the current stats and associated files.
         # Get and output the statistics.
-        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_name)
         line_items.append(read1_stats.file_size)
-        if paired_reads:
-            line_items.append(read1_stats.total_reads)
-        line_items.append(read1_stats.mean_read_length)
-        line_items.append(read1_stats.mean_read_quality)
-        line_items.append(read1_stats.reads_passing_q30)
-        if paired_reads:
-            line_items.append(read2_stats.fastq_file)
-            line_items.append(read2_stats.file_size)
-            line_items.append(read2_stats.total_reads)
-            line_items.append(read2_stats.mean_read_length)
-            line_items.append(read2_stats.mean_read_quality)
-            line_items.append(read2_stats.reads_passing_q30)
-        # Total Reads
+        line_items.append(read1_stats.num_seqs)
+        line_items.append(read1_stats.sum_len)
+        line_items.append(read1_stats.min_len)
+        line_items.append(read1_stats.avg_len)
+        line_items.append(read1_stats.max_len)
+        line_items.append(read1_stats.q1)
+        line_items.append(read1_stats.q2)
+        line_items.append(read1_stats.q3)
+        line_items.append(read1_stats.sum_gap)
+        line_items.append(read1_stats.n50)
+        line_items.append(read1_stats.pass_q20)
+        line_items.append(read1_stats.pass_q30)
+        line_items.append(read1_stats.read_quality_average)
         if paired_reads:
-            total_reads = read1_stats.total_reads + read2_stats.total_reads
-        else:
-            total_reads = read1_stats.total_reads
-        line_items.append(total_reads)
-        # All Mapped Reads
-        all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        line_items.append(all_mapped_reads)
-        line_items.append(unmapped_reads)
-        # Unmapped Reads Percentage of Total
-        if unmapped_reads > 0:
-            unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
-        else:
-            unmapped_reads_percentage = 0
-        line_items.append(unmapped_reads_percentage)
-        # Reference with Coverage
-        ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        line_items.append(ref_with_coverage)
-        line_items.append(avg_depth_of_coverage)
-        line_items.append(good_snp_count)
-        line_items.append(read1_stats.reference)
+            line_items.append(read2_stats.file_name)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.num_seqs)
+            line_items.append(read2_stats.sum_len)
+            line_items.append(read2_stats.min_len)
+            line_items.append(read2_stats.avg_len)
+            line_items.append(read2_stats.max_len)
+            line_items.append(read2_stats.q1)
+            line_items.append(read2_stats.q2)
+            line_items.append(read2_stats.q3)
+            line_items.append(read2_stats.sum_gap)
+            line_items.append(read2_stats.n50)
+            line_items.append(read2_stats.pass_q20)
+            line_items.append(read2_stats.pass_q30)
+            line_items.append(read2_stats.read_quality_average)
         outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
 
 
-def process_idxstats_file(idxstats_file):
-    all_mapped_reads = 0
-    unmapped_reads = 0
-    with open(idxstats_file, "r") as fh:
+def get_statistics(fastq_file, seqkit_stats_file, seqkit_fx2tab_file):
+    file_size = nice_size(os.path.getsize(fastq_file))
+    # SeqKit statistics.
+    with open(seqkit_stats_file, "r") as fh:
+        # This is a 2-line file
         for i, line in enumerate(fh):
-            line = line.rstrip('\r\n')
-            items = line.split("\t")
             if i == 0:
-                # NC_002945.4 4349904 213570 4047
-                all_mapped_reads = int(items[2])
-            elif i == 1:
-                # * 0 0 82774
-                unmapped_reads = int(items[3])
-    return all_mapped_reads, unmapped_reads
-
-
-def process_metrics_file(metrics_file):
-    ref_with_coverage = '0%'
-    avg_depth_of_coverage = 0
-    good_snp_count = 0
-    with open(metrics_file, "r") as ifh:
-        for i, line in enumerate(ifh):
-            if i == 0:
-                # Skip comments.
+                # Skip header
                 continue
             line = line.rstrip('\r\n')
            items = line.split("\t")
-            if i == 1:
-                # MarkDuplicates 10.338671 98.74%
-                ref_with_coverage = items[3]
-                avg_depth_of_coverage = items[2]
-            elif i == 2:
-                # VCFfilter 611
-                good_snp_count = items[1]
-    return ref_with_coverage, avg_depth_of_coverage, good_snp_count
+            file_name = fastq_file
+            seq_type = items[2]
+            num_seqs = items[3]
+            sum_len = items[4]
+            min_len = items[5]
+            avg_len = items[6]
+            max_len = items[7]
+            q1 = items[8]
+            q2 = items[9]
+            q3 = items[10]
+            sum_gap = items[11]
+            n50 = items[12]
+            try:
+                pass_q20 = items[13]
+            except IndexError:
+                pass_q20 = 0
+            try:
+                pass_q30 = items[14]
+            except IndexError:
+                pass_q30 = 0
+    # Average read quality is not normalized on length.
+    avg_sum = 0
+    with open(seqkit_fx2tab_file, "r") as fh:
+        for i, line in enumerate(fh):
+            if i == 0:
+                # Skip header
                continue
+            line = line.rstrip('\r\n')
+            items = line.split("\t")
+            avg_sum += float(items[3])
+    read_quality_average = "{:.2f}".format(avg_sum / float(i - 1))
+    return Statistics(file_name, file_size, seq_type, num_seqs, sum_len, min_len, avg_len,
+                      max_len, q1, q2, q3, sum_gap, n50, pass_q20, pass_q30, read_quality_average)
 
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
-parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
-parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--read1_seqkit_stats', action='store', dest='read1_seqkit_stats', help='Output of SeqKit statistics for forward read')
+parser.add_argument('--read2_seqkit_stats', action='store', dest='read2_seqkit_stats', required=False, default=None, help='Output of SeqKit statistics for reverse read')
+parser.add_argument('--read1_seqkit_fx2tab', action='store', dest='read1_seqkit_fx2tab', help='Output of SeqKit fx2tab for forward read')
+parser.add_argument('--read2_seqkit_fx2tab', action='store', dest='read2_seqkit_fx2tab', required=False, default=None, help='Output of SeqKit fx2tab for reverse read')
 
 args = parser.parse_args()
 
-stats_list = []
-idxstats_files = []
-metrics_files = []
-# Accumulate inputs.
-read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
-output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
+read1_stats = get_statistics(args.read1, args.read1_seqkit_stats, args.read1_seqkit_fx2tab)
+if args.read2 is None:
+    read2_stats = None
+else:
+    read2_stats = get_statistics(args.read2, args.read2_seqkit_stats, args.read2_seqkit_fx2tab)
+
+output_statistics(read1_stats, read2_stats, args.output)
```
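For readers tracing the new parsing logic: `get_statistics()` now indexes a single tab-separated data row by column position (`items[2]` through `items[14]`). Those positions line up with the extended tabular output of SeqKit (presumably `seqkit stats -a -T`: a header row, then one data row per file with file, format, type, num_seqs, sum_len, min_len, avg_len, max_len, Q1, Q2, Q3, sum_gap, N50, Q20(%), Q30(%)). Below is a minimal standalone sketch of that index-to-field mapping; it is not part of the tool, and the helper name and example values are invented for illustration:

```python
# Sketch only: assumes the stats file is `seqkit stats -a -T` output
# (a header row plus one tab-separated data row per input file).
SEQKIT_STATS_FIELDS = {
    2: 'seq_type', 3: 'num_seqs', 4: 'sum_len', 5: 'min_len', 6: 'avg_len',
    7: 'max_len', 8: 'q1', 9: 'q2', 10: 'q3', 11: 'sum_gap', 12: 'n50',
    13: 'pass_q20', 14: 'pass_q30',
}


def parse_seqkit_stats_row(row):
    # Hypothetical helper: map one data row to the Statistics attribute
    # names, falling back to 0 when the trailing quality columns are
    # missing, mirroring the tool's IndexError handling.
    items = row.rstrip('\r\n').split('\t')
    return {name: items[idx] if idx < len(items) else 0
            for idx, name in SEQKIT_STATS_FIELDS.items()}


# Invented example row, for illustration only.
row = '\t'.join(['sample_R1.fastq.gz', 'FASTQ', 'DNA', '10000', '1500000',
                 '35', '150.0', '151', '148', '150', '151', '0', '151',
                 '98.5', '95.2'])
print(parse_seqkit_stats_row(row))
```

The `0` fallback matters because SeqKit only reports the Q20/Q30 percentages for inputs that carry quality values, so a shorter row is tolerated rather than treated as an error.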