comparison: vsnp_statistics.py @ 9:40b97055bb99 (draft, default, tip)

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit c38fd63f7980c70390d104a73ba4c72b266444c3

author    iuc
date      Fri, 10 Jun 2022 06:08:02 +0000
parents   6dc6dd4666e3
children  (none)
--- vsnp_statistics.py	(8:18b59c38017e)
+++ vsnp_statistics.py	(9:40b97055bb99)
@@ -1,27 +1,31 @@
 #!/usr/bin/env python
 
 import argparse
-import gzip
 import os
-from functools import partial
-
-import numpy
-import pandas
-from Bio import SeqIO
 
 
 class Statistics:
 
-    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
-        self.reference = reference
-        self.fastq_file = fastq_file
-        self.file_size = file_size
-        self.total_reads = total_reads
-        self.mean_read_length = mean_read_length
-        self.mean_read_quality = mean_read_quality
-        self.reads_passing_q30 = reads_passing_q30
+    def __init__(self, file_name, file_size, seq_type, num_seqs, sum_len, min_len, avg_len,
+                 max_len, q1, q2, q3, sum_gap, n50, pass_q20, pass_q30, read_quality_average):
+        self.file_name = file_name
+        self.file_size = file_size
+        self.seq_type = seq_type
+        self.num_seqs = num_seqs
+        self.sum_len = sum_len
+        self.min_len = min_len
+        self.avg_len = avg_len
+        self.max_len = max_len
+        self.q1 = q1
+        self.q2 = q2
+        self.q3 = q3
+        self.sum_gap = sum_gap
+        self.n50 = n50
+        self.pass_q20 = pass_q20
+        self.pass_q30 = pass_q30
+        self.read_quality_average = read_quality_average
 
 
 def nice_size(size):
     # Returns a readably formatted string with the size
     words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
@@ -41,175 +45,124 @@
             return "%s%d bytes" % (prefix, size)
         return "%s%.1f %s" % (prefix, size, word)
     return '??? bytes'
 
 
-def get_statistics(dbkey, fastq_file, gzipped):
-    sampling_size = 10000
-    # Read fastq_file into a data fram to
-    # get the phred quality scores.
-    _open = partial(gzip.open, mode='rt') if gzipped else open
-    with _open(fastq_file) as fh:
-        identifiers = []
-        seqs = []
-        letter_annotations = []
-        for seq_record in SeqIO.parse(fh, "fastq"):
-            identifiers.append(seq_record.id)
-            seqs.append(seq_record.seq)
-            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-    # Convert lists to Pandas series.
-    s1 = pandas.Series(identifiers, name='id')
-    s2 = pandas.Series(seqs, name='seq')
-    # Gather Series into a data frame.
-    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-    # Starting at row 3, keep every 4 row
-    # random sample specified number of rows.
-    file_size = nice_size(os.path.getsize(fastq_file))
-    total_reads = len(seqs)
-    # Mean Read Length
-    if sampling_size > total_reads:
-        sampling_size = total_reads
-    try:
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-    except ValueError:
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
-    dict_mean = {}
-    list_length = []
-    i = 0
-    for id, seq, in fastq_df.iterrows():
-        dict_mean[id] = numpy.mean(letter_annotations[i])
-        list_length.append(len(seq.array[0]))
-        i += 1
-    mean_read_length = '%.1f' % numpy.mean(list_length)
-    # Mean Read Quality
-    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-    mean_read_quality = '%.1f' % df_mean['ave'].mean()
-    # Reads Passing Q30
-    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
-                       mean_read_quality, reads_passing_q30)
-    return stats
-
-
-def accrue_statistics(dbkey, read1, read2, gzipped):
-    read1_stats = get_statistics(dbkey, read1, gzipped)
-    if read2 is None:
-        read2_stats = None
-    else:
-        read2_stats = get_statistics(dbkey, read2, gzipped)
-    return read1_stats, read2_stats
-
-
-def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+def output_statistics(read1_stats, read2_stats, output_file):
     paired_reads = read2_stats is not None
     if paired_reads:
-        columns = ['Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
-                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
-                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
-                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
-                   'Good SNP Count', 'Reference']
+        columns = ['R1 FASTQ', 'R1 File Size', 'R1 Read Count', 'R1 Length Sum', 'R1 Min Length',
+                   'R1 Ave Length', 'R1 Max Length', 'R1 Q1', 'R1 Q2', 'R1 Q3', 'R1 Sum Gap',
+                   'R1 N50', 'R1 Passing Q20', 'R1 Passing Q30', 'R1 Read Quality Ave', 'R2 FASTQ',
+                   'R2 File Size', 'R2 Read Count', 'R2 Length Sum', 'R2 Min Length', 'R2 Ave Length',
+                   'R2 Max Length', 'R2 Q1', 'R2 Q2', 'R2 Q3', 'R2 Sum Gap', 'R2 N50', 'R2 Passing Q20',
+                   'R2 Passing Q30', 'R2 Read Quality Ave']
     else:
-        columns = ['FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count', 'Reference']
+        columns = ['FASTQ', 'File Size', 'Read Count', 'Length Sum', 'Min Length', 'Ave Length',
+                   'Max Length', 'Q1', 'Q2', 'Q3', 'Sum Gap', 'N50', 'Passing Q20', 'Passing Q30',
+                   'Read Quality Ave']
     with open(output_file, "w") as outfh:
         # Make sure the header starts with a # so
         # MultiQC can properly handle the output.
         outfh.write("%s\n" % "\t".join(columns))
         line_items = []
         # Get the current stats and associated files.
         # Get and output the statistics.
-        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_name)
         line_items.append(read1_stats.file_size)
+        line_items.append(read1_stats.num_seqs)
+        line_items.append(read1_stats.sum_len)
+        line_items.append(read1_stats.min_len)
+        line_items.append(read1_stats.avg_len)
+        line_items.append(read1_stats.max_len)
+        line_items.append(read1_stats.q1)
+        line_items.append(read1_stats.q2)
+        line_items.append(read1_stats.q3)
+        line_items.append(read1_stats.sum_gap)
+        line_items.append(read1_stats.n50)
+        line_items.append(read1_stats.pass_q20)
+        line_items.append(read1_stats.pass_q30)
+        line_items.append(read1_stats.read_quality_average)
         if paired_reads:
-            line_items.append(read1_stats.total_reads)
-        line_items.append(read1_stats.mean_read_length)
-        line_items.append(read1_stats.mean_read_quality)
-        line_items.append(read1_stats.reads_passing_q30)
-        if paired_reads:
-            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_name)
             line_items.append(read2_stats.file_size)
-            line_items.append(read2_stats.total_reads)
-            line_items.append(read2_stats.mean_read_length)
-            line_items.append(read2_stats.mean_read_quality)
-            line_items.append(read2_stats.reads_passing_q30)
-        # Total Reads
-        if paired_reads:
-            total_reads = read1_stats.total_reads + read2_stats.total_reads
-        else:
-            total_reads = read1_stats.total_reads
-        line_items.append(total_reads)
-        # All Mapped Reads
-        all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        line_items.append(all_mapped_reads)
-        line_items.append(unmapped_reads)
-        # Unmapped Reads Percentage of Total
-        if unmapped_reads > 0:
-            unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
-        else:
-            unmapped_reads_percentage = 0
-        line_items.append(unmapped_reads_percentage)
-        # Reference with Coverage
-        ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        line_items.append(ref_with_coverage)
-        line_items.append(avg_depth_of_coverage)
-        line_items.append(good_snp_count)
-        line_items.append(read1_stats.reference)
+            line_items.append(read2_stats.num_seqs)
+            line_items.append(read2_stats.sum_len)
+            line_items.append(read2_stats.min_len)
+            line_items.append(read2_stats.avg_len)
+            line_items.append(read2_stats.max_len)
+            line_items.append(read2_stats.q1)
+            line_items.append(read2_stats.q2)
+            line_items.append(read2_stats.q3)
+            line_items.append(read2_stats.sum_gap)
+            line_items.append(read2_stats.n50)
+            line_items.append(read2_stats.pass_q20)
+            line_items.append(read2_stats.pass_q30)
+            line_items.append(read2_stats.read_quality_average)
         outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
 
 
-def process_idxstats_file(idxstats_file):
-    all_mapped_reads = 0
-    unmapped_reads = 0
-    with open(idxstats_file, "r") as fh:
+def get_statistics(fastq_file, seqkit_stats_file, seqkit_fx2tab_file):
+    file_size = nice_size(os.path.getsize(fastq_file))
+    # SeqKit statistics.
+    with open(seqkit_stats_file, "r") as fh:
+        # This is a 2-line file
         for i, line in enumerate(fh):
-            line = line.rstrip('\r\n')
-            items = line.split("\t")
             if i == 0:
-                # NC_002945.4 4349904 213570 4047
-                all_mapped_reads = int(items[2])
-            elif i == 1:
-                # * 0 0 82774
-                unmapped_reads = int(items[3])
-    return all_mapped_reads, unmapped_reads
-
-
-def process_metrics_file(metrics_file):
-    ref_with_coverage = '0%'
-    avg_depth_of_coverage = 0
-    good_snp_count = 0
-    with open(metrics_file, "r") as ifh:
-        for i, line in enumerate(ifh):
-            if i == 0:
-                # Skip comments.
+                # Skip header
                 continue
             line = line.rstrip('\r\n')
             items = line.split("\t")
-            if i == 1:
-                # MarkDuplicates 10.338671 98.74%
-                ref_with_coverage = items[3]
-                avg_depth_of_coverage = items[2]
-            elif i == 2:
-                # VCFfilter 611
-                good_snp_count = items[1]
-    return ref_with_coverage, avg_depth_of_coverage, good_snp_count
+            file_name = fastq_file
+            seq_type = items[2]
+            num_seqs = items[3]
+            sum_len = items[4]
+            min_len = items[5]
+            avg_len = items[6]
+            max_len = items[7]
+            q1 = items[8]
+            q2 = items[9]
+            q3 = items[10]
+            sum_gap = items[11]
+            n50 = items[12]
+            try:
+                pass_q20 = items[13]
+            except IndexError:
+                pass_q20 = 0
+            try:
+                pass_q30 = items[14]
+            except IndexError:
+                pass_q30 = 0
+    # Average read quality is not normalized on length.
+    avg_sum = 0
+    with open(seqkit_fx2tab_file, "r") as fh:
+        for i, line in enumerate(fh):
+            if i == 0:
+                # Skip header
+                continue
+            line = line.rstrip('\r\n')
+            items = line.split("\t")
+            avg_sum += float(items[3])
+    read_quality_average = "{:.2f}".format(avg_sum / float(i - 1))
+    return Statistics(file_name, file_size, seq_type, num_seqs, sum_len, min_len, avg_len,
+                      max_len, q1, q2, q3, sum_gap, n50, pass_q20, pass_q30, read_quality_average)
 
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
-parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
-parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--read1_seqkit_stats', action='store', dest='read1_seqkit_stats', help='Output of SeqKit statistics for forward read')
+parser.add_argument('--read2_seqkit_stats', action='store', dest='read2_seqkit_stats', required=False, default=None, help='Output of SeqKit statistics for reverse read')
+parser.add_argument('--read1_seqkit_fx2tab', action='store', dest='read1_seqkit_fx2tab', help='Output of SeqKit fx2tab for forward read')
+parser.add_argument('--read2_seqkit_fx2tab', action='store', dest='read2_seqkit_fx2tab', required=False, default=None, help='Output of SeqKit fx2tab for reverse read')
 
 args = parser.parse_args()
 
-stats_list = []
-idxstats_files = []
-metrics_files = []
-# Accumulate inputs.
-read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
-output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
+read1_stats = get_statistics(args.read1, args.read1_seqkit_stats, args.read1_seqkit_fx2tab)
+if args.read2 is None:
+    read2_stats = None
+else:
+    read2_stats = get_statistics(args.read2, args.read2_seqkit_stats, args.read2_seqkit_fx2tab)
+
+output_statistics(read1_stats, read2_stats, args.output)
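
For reviewers who want to exercise the new interface locally, here is a minimal smoke-test sketch. It is not part of the changeset: the SeqKit flag combinations (`stats -a -T`; `fx2tab -H -n -l -g -q`, chosen so the mean read quality should land in the fourth column that `get_statistics()` reads) and all file names are assumptions for illustration; the Galaxy tool wrapper, which is not shown on this page, is what actually generates these inputs.

#!/usr/bin/env python
# Hypothetical smoke test for the new vsnp_statistics.py CLI.
# All file names and seqkit flag choices below are assumptions,
# not taken from this changeset.
import subprocess


def run(cmd, outfile=None):
    # Run a command, optionally redirecting stdout to a file.
    if outfile is None:
        subprocess.run(cmd, check=True)
    else:
        with open(outfile, "w") as fh:
            subprocess.run(cmd, stdout=fh, check=True)


for read in ("read1", "read2"):
    fastq = "%s.fastq.gz" % read
    # 'seqkit stats -a -T' writes a two-line TSV (header + one data row);
    # in recent SeqKit releases its columns line up with the indexes
    # get_statistics() reads (type=items[2] through Q30(%)=items[14]).
    run(["seqkit", "stats", "-a", "-T", fastq], "%s_seqkit_stats.tsv" % read)
    # Assumed fx2tab flags: a per-read table whose fourth column holds the
    # mean base quality consumed as float(items[3]).
    run(["seqkit", "fx2tab", "-H", "-n", "-l", "-g", "-q", fastq],
        "%s_seqkit_fx2tab.tsv" % read)

run(["python", "vsnp_statistics.py",
     "--read1", "read1.fastq.gz", "--read2", "read2.fastq.gz",
     "--read1_seqkit_stats", "read1_seqkit_stats.tsv",
     "--read2_seqkit_stats", "read2_seqkit_stats.tsv",
     "--read1_seqkit_fx2tab", "read1_seqkit_fx2tab.tsv",
     "--read2_seqkit_fx2tab", "read2_seqkit_fx2tab.tsv",
     "--output", "statistics.tsv"])

One design note: the try/except IndexError around items[13] and items[14] suggests the script tolerates SeqKit releases whose `stats -a` output lacks the Q20(%) and Q30(%) columns, defaulting both to 0 in that case.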