comparison vsnp_statistics.py @ 0:ec6e02f4eab7 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
author iuc
date Tue, 16 Nov 2021 08:26:58 +0000
parents
children 9ac0b1d5560d
comparison
equal deleted inserted replaced
-1:000000000000 0:ec6e02f4eab7
1 #!/usr/bin/env python
2
3 import argparse
4 import csv
5 import gzip
6 import os
7 from functools import partial
8
9 import numpy
10 import pandas
11 from Bio import SeqIO
12
13
14 def nice_size(size):
15 # Returns a readably formatted string with the size
16 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
17 prefix = ''
18 try:
19 size = float(size)
20 if size < 0:
21 size = abs(size)
22 prefix = '-'
23 except Exception:
24 return '??? bytes'
25 for ind, word in enumerate(words):
26 step = 1024 ** (ind + 1)
27 if step > size:
28 size = size / float(1024 ** ind)
29 if word == 'bytes': # No decimals for bytes
30 return "%s%d bytes" % (prefix, size)
31 return "%s%.1f %s" % (prefix, size, word)
32 return '??? bytes'
33
34
35 def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
36 # Produce an Excel spreadsheet that
37 # contains a row for each sample.
38 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
39 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
40 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
41 data_frames = []
42 for i, fastq_file in enumerate(fastq_files):
43 idxstats_file = idxstats_files[i]
44 metrics_file = metrics_files[i]
45 file_name_base = os.path.basename(fastq_file)
46 # Read fastq_file into a data frame.
47 _open = partial(gzip.open, mode='rt') if gzipped else open
48 with _open(fastq_file) as fh:
49 identifiers = []
50 seqs = []
51 letter_annotations = []
52 for seq_record in SeqIO.parse(fh, "fastq"):
53 identifiers.append(seq_record.id)
54 seqs.append(seq_record.seq)
55 letter_annotations.append(seq_record.letter_annotations["phred_quality"])
56 # Convert lists to Pandas series.
57 s1 = pandas.Series(identifiers, name='id')
58 s2 = pandas.Series(seqs, name='seq')
59 # Gather Series into a data frame.
60 fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
61 total_reads = int(len(fastq_df.index) / 4)
62 current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
63 # Reference
64 current_sample_df.at[file_name_base, 'Reference'] = dbkey
65 # File Size
66 current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
67 # Mean Read Length
68 sampling_size = 10000
69 if sampling_size > total_reads:
70 sampling_size = total_reads
71 fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
72 dict_mean = {}
73 list_length = []
74 i = 0
75 for id, seq, in fastq_df.iterrows():
76 dict_mean[id] = numpy.mean(letter_annotations[i])
77 list_length.append(len(seq.array[0]))
78 i += 1
79 current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
80 # Mean Read Quality
81 df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
82 current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
83 # Reads Passing Q30
84 reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
85 reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
86 current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
87 # Total Reads
88 current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
89 # All Mapped Reads
90 all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
91 current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
92 # Unmapped Reads
93 current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
94 # Unmapped Reads Percentage of Total
95 if unmapped_reads > 0:
96 unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
97 else:
98 unmapped_reads_percentage = 0
99 current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
100 # Reference with Coverage
101 ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
102 current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
103 # Average Depth of Coverage
104 current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
105 # Good SNP Count
106 current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
107 data_frames.append(current_sample_df)
108 output_df = pandas.concat(data_frames)
109 output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
110
111
112 def process_idxstats_file(idxstats_file):
113 all_mapped_reads = 0
114 unmapped_reads = 0
115 with open(idxstats_file, "r") as fh:
116 for i, line in enumerate(fh):
117 line = line.rstrip('\r\n')
118 items = line.split("\t")
119 if i == 0:
120 # NC_002945.4 4349904 213570 4047
121 all_mapped_reads = int(items[2])
122 elif i == 1:
123 # * 0 0 82774
124 unmapped_reads = int(items[3])
125 return all_mapped_reads, unmapped_reads
126
127
128 def process_metrics_file(metrics_file):
129 ref_with_coverage = '0%'
130 avg_depth_of_coverage = 0
131 good_snp_count = 0
132 with open(metrics_file, "r") as ifh:
133 for i, line in enumerate(ifh):
134 if i == 0:
135 # Skip comments.
136 continue
137 line = line.rstrip('\r\n')
138 items = line.split("\t")
139 if i == 1:
140 # MarkDuplicates 10.338671 98.74%
141 ref_with_coverage = items[3]
142 avg_depth_of_coverage = items[2]
143 elif i == 2:
144 # VCFfilter 611
145 good_snp_count = items[1]
146 return ref_with_coverage, avg_depth_of_coverage, good_snp_count
147
148
149 parser = argparse.ArgumentParser()
150
151 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
152 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
153 parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
154 parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
155 parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
156 parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
157 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
158 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
159 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
160 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
161 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
162
163 args = parser.parse_args()
164
165 fastq_files = []
166 idxstats_files = []
167 metrics_files = []
168 # Accumulate inputs.
169 if args.read1 is not None:
170 # The inputs are not dataset collections, so
171 # read1, read2 (possibly) and vsnp_azc will also
172 # not be None.
173 fastq_files.append(args.read1)
174 idxstats_files.append(args.samtools_idxstats)
175 metrics_files.append(args.vsnp_azc)
176 if args.read2 is not None:
177 fastq_files.append(args.read2)
178 idxstats_files.append(args.samtools_idxstats)
179 metrics_files.append(args.vsnp_azc)
180 else:
181 for file_name in sorted(os.listdir(args.input_reads_dir)):
182 fastq_files.append(os.path.join(args.input_reads_dir, file_name))
183 for file_name in sorted(os.listdir(args.input_idxstats_dir)):
184 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
185 if args.list_paired:
186 # Add the idxstats file for reverse.
187 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
188 for file_name in sorted(os.listdir(args.input_metrics_dir)):
189 metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
190 if args.list_paired:
191 # Add the metrics file for reverse.
192 metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
193 output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)