comparison vsnp_statistics.py @ 1:9ac0b1d5560d draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 2a94c64d6c7236550bf483d2ffc4e86248c63aab"
author iuc
date Tue, 16 Nov 2021 20:11:30 +0000
parents ec6e02f4eab7
children 4535ad8b74f3
--- vsnp_statistics.py    0:ec6e02f4eab7
+++ vsnp_statistics.py    1:9ac0b1d5560d
@@ -1,16 +1,27 @@
 #!/usr/bin/env python
 
 import argparse
-import csv
 import gzip
 import os
 from functools import partial
 
 import numpy
 import pandas
 from Bio import SeqIO
+
+
+class Statistics:
+
+    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
+        self.reference = reference
+        self.fastq_file = fastq_file
+        self.file_size = file_size
+        self.total_reads = total_reads
+        self.mean_read_length = mean_read_length
+        self.mean_read_quality = mean_read_quality
+        self.reads_passing_q30 = reads_passing_q30
 
 
 def nice_size(size):
     # Returns a readably formatted string with the size
     words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
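The Statistics class added above is a plain attribute container. For illustration only (this is not part of the commit), an equivalent container written with Python's standard dataclasses module would look like the following sketch; the field types are assumptions based on how the values are built in get_statistics below (most are preformatted strings):

    from dataclasses import dataclass


    @dataclass
    class Statistics:
        # Per-FASTQ metrics; field names mirror the __init__
        # parameters of the class added in this changeset.
        reference: str          # dbkey of the reference genome
        fastq_file: str         # base name of the FASTQ file
        file_size: str          # human-readable size from nice_size()
        total_reads: int        # number of reads parsed from the FASTQ
        mean_read_length: str   # formatted '%.1f' string
        mean_read_quality: str  # formatted '%.1f' string
        reads_passing_q30: str  # formatted fraction of sampled reads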
@@ -30,85 +41,123 @@
                 return "%s%d bytes" % (prefix, size)
             return "%s%.1f %s" % (prefix, size, word)
     return '??? bytes'
 
 
-def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
-    # Produce an Excel spreadsheet that
-    # contains a row for each sample.
-    columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-               'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-               'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
-    data_frames = []
-    for i, fastq_file in enumerate(fastq_files):
-        idxstats_file = idxstats_files[i]
-        metrics_file = metrics_files[i]
-        file_name_base = os.path.basename(fastq_file)
-        # Read fastq_file into a data frame.
-        _open = partial(gzip.open, mode='rt') if gzipped else open
-        with _open(fastq_file) as fh:
-            identifiers = []
-            seqs = []
-            letter_annotations = []
-            for seq_record in SeqIO.parse(fh, "fastq"):
-                identifiers.append(seq_record.id)
-                seqs.append(seq_record.seq)
-                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-        # Convert lists to Pandas series.
-        s1 = pandas.Series(identifiers, name='id')
-        s2 = pandas.Series(seqs, name='seq')
-        # Gather Series into a data frame.
-        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-        total_reads = int(len(fastq_df.index) / 4)
-        current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
-        # Reference
-        current_sample_df.at[file_name_base, 'Reference'] = dbkey
-        # File Size
-        current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
-        # Mean Read Length
-        sampling_size = 10000
-        if sampling_size > total_reads:
-            sampling_size = total_reads
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-        dict_mean = {}
-        list_length = []
-        i = 0
-        for id, seq, in fastq_df.iterrows():
-            dict_mean[id] = numpy.mean(letter_annotations[i])
-            list_length.append(len(seq.array[0]))
-            i += 1
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
-        # Mean Read Quality
-        df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
-        # Reads Passing Q30
-        reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-        current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+def get_statistics(dbkey, fastq_file, gzipped):
+    sampling_size = 10000
+    # Read fastq_file into a data frame to
+    # get the phred quality scores.
+    _open = partial(gzip.open, mode='rt') if gzipped else open
+    with _open(fastq_file) as fh:
+        identifiers = []
+        seqs = []
+        letter_annotations = []
+        for seq_record in SeqIO.parse(fh, "fastq"):
+            identifiers.append(seq_record.id)
+            seqs.append(seq_record.seq)
+            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+    # Convert lists to Pandas series.
+    s1 = pandas.Series(identifiers, name='id')
+    s2 = pandas.Series(seqs, name='seq')
+    # Gather Series into a data frame.
+    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
+    # Starting at row 3, keep every 4th row, then
+    # randomly sample the specified number of rows.
+    file_size = nice_size(os.path.getsize(fastq_file))
+    total_reads = len(seqs)
+    # Mean Read Length
+    if sampling_size > total_reads:
+        sampling_size = total_reads
+    try:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    except ValueError:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
+    dict_mean = {}
+    list_length = []
+    i = 0
+    for id, seq, in fastq_df.iterrows():
+        dict_mean[id] = numpy.mean(letter_annotations[i])
+        list_length.append(len(seq.array[0]))
+        i += 1
+    mean_read_length = '%.1f' % numpy.mean(list_length)
+    # Mean Read Quality
+    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+    mean_read_quality = '%.1f' % df_mean['ave'].mean()
+    # Reads Passing Q30
+    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
+    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
+                       mean_read_quality, reads_passing_q30)
+    return stats
+
+
+def accrue_statistics(dbkey, read1, read2, gzipped):
+    read1_stats = get_statistics(dbkey, read1, gzipped)
+    if read2 is None:
+        read2_stats = None
+    else:
+        read2_stats = get_statistics(dbkey, read2, gzipped)
+    return read1_stats, read2_stats
+
+
+def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+    paired_reads = read2_stats is not None
+    if paired_reads:
+        columns = ['Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
+                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
+                   'Good SNP Count', 'Reference']
+    else:
+        columns = ['FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count', 'Reference']
+    with open(output_file, "w") as outfh:
+        # Make sure the header starts with a # so
+        # MultiQC can properly handle the output.
+        outfh.write("# %s\n" % "\t".join(columns))
+        line_items = []
+        # Get the current stats and associated files,
+        # then output the statistics.
+        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_size)
+        if paired_reads:
+            line_items.append(read1_stats.total_reads)
+        line_items.append(read1_stats.mean_read_length)
+        line_items.append(read1_stats.mean_read_quality)
+        line_items.append(read1_stats.reads_passing_q30)
+        if paired_reads:
+            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.total_reads)
+            line_items.append(read2_stats.mean_read_length)
+            line_items.append(read2_stats.mean_read_quality)
+            line_items.append(read2_stats.reads_passing_q30)
         # Total Reads
-        current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        if paired_reads:
+            total_reads = read1_stats.total_reads + read2_stats.total_reads
+        else:
+            total_reads = read1_stats.total_reads
+        line_items.append(total_reads)
         # All Mapped Reads
         all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
-        # Unmapped Reads
-        current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        line_items.append(all_mapped_reads)
+        line_items.append(unmapped_reads)
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
             unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
-        current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        line_items.append(unmapped_reads_percentage)
         # Reference with Coverage
         ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
-        # Average Depth of Coverage
-        current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
-        # Good SNP Count
-        current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
-        data_frames.append(current_sample_df)
-    output_df = pandas.concat(data_frames)
-    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
+        line_items.append(ref_with_coverage)
+        line_items.append(avg_depth_of_coverage)
+        line_items.append(good_snp_count)
+        line_items.append(read1_stats.reference)
+        outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
 
 
 def process_idxstats_file(idxstats_file):
     all_mapped_reads = 0
     unmapped_reads = 0
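The key behavioral change in get_statistics is the new try/except around pandas sample(): fastq_df.iloc[3::4] keeps only every fourth row, so even after capping sampling_size at total_reads the requested sample can exceed the rows available, and pandas raises ValueError when replace=False (the default). The commit falls back to sampling with replacement in that case. A minimal standalone sketch of that behavior (illustrative values, not from the repository):

    import pandas

    df = pandas.DataFrame({'seq': list('ACGTACGTAC')})  # 10 rows
    subset = df.iloc[3::4]  # rows 3 and 7 -> only 2 rows remain
    try:
        # Raises ValueError: cannot take a larger sample than
        # population when 'replace=False'.
        sampled = subset.sample(5)
    except ValueError:
        # Sampling with replacement allows n > len(subset).
        sampled = subset.sample(5, replace=True)
    print(len(sampled))  # 5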
@@ -148,46 +197,19 @@
 
 parser = argparse.ArgumentParser()
 
 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
-parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
-parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
-parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
-parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
 
 args = parser.parse_args()
 
-fastq_files = []
+stats_list = []
 idxstats_files = []
 metrics_files = []
 # Accumulate inputs.
-if args.read1 is not None:
-    # The inputs are not dataset collections, so
-    # read1, read2 (possibly) and vsnp_azc will also
-    # not be None.
-    fastq_files.append(args.read1)
-    idxstats_files.append(args.samtools_idxstats)
-    metrics_files.append(args.vsnp_azc)
-    if args.read2 is not None:
-        fastq_files.append(args.read2)
-        idxstats_files.append(args.samtools_idxstats)
-        metrics_files.append(args.vsnp_azc)
-else:
-    for file_name in sorted(os.listdir(args.input_reads_dir)):
-        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
-        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-        if args.list_paired:
-            # Add the idxstats file for reverse.
-            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_metrics_dir)):
-        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-        if args.list_paired:
-            # Add the metrics file for reverse.
-            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
+output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
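process_idxstats_file and process_metrics_file are untouched by this changeset, so their bodies are collapsed in the comparison view. For orientation only, here is a hypothetical parser consistent with the call site above (all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)): samtools idxstats emits tab-separated lines of reference name, sequence length, mapped read count and unmapped read count, ending with a '*' line for unplaced reads. This sketch is not the repository's implementation:

    def parse_idxstats_sketch(idxstats_file):
        # Hypothetical stand-in for process_idxstats_file, whose real
        # body is collapsed (unchanged) in this comparison.
        all_mapped_reads = 0
        unmapped_reads = 0
        with open(idxstats_file) as fh:
            for line in fh:
                ref_name, seq_length, mapped, unmapped = line.rstrip('\n').split('\t')
                all_mapped_reads += int(mapped)
                unmapped_reads += int(unmapped)
        return all_mapped_reads, unmapped_reads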