Mercurial > repos > greg > vsnp_statistics
comparison vsnp_statistics.py @ 4:2d6c6b01319e draft
Uploaded
author | greg |
---|---|
date | Sun, 03 Jan 2021 15:47:28 +0000 |
parents | 321a8259e3f9 |
children | d0fbdeaaa488 |
comparison
equal
deleted
inserted
replaced
3:321a8259e3f9 | 4:2d6c6b01319e |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import gzip | 4 import gzip |
5 import os | |
6 import shutil | |
7 | |
5 import numpy | 8 import numpy |
6 import os | |
7 import pandas | 9 import pandas |
8 import shutil | 10 |
9 | |
10 INPUT_IDXSTATS_DIR = 'input_idxstats' | |
11 INPUT_METRICS_DIR = 'input_metrics' | |
12 INPUT_READS_DIR = 'input_reads' | |
13 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', | 11 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', |
14 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', | 12 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', |
15 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', | 13 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', |
16 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', | 14 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', |
17 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', | 15 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', |
24 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', | 22 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', |
25 ' ': '1'} | 23 ' ': '1'} |
26 | 24 |
27 | 25 |
28 def fastq_to_df(fastq_file, gzipped): | 26 def fastq_to_df(fastq_file, gzipped): |
29 if gzipped.lower() == "true": | 27 if gzipped: |
30 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") | 28 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") |
31 else: | 29 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^") |
32 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^") | |
33 | |
34 | |
35 def get_base_file_name(file_path): | |
36 base_file_name = os.path.basename(file_path) | |
37 if base_file_name.find(".") > 0: | |
38 # Eliminate the extension. | |
39 return os.path.splitext(base_file_name)[0] | |
40 elif base_file_name.find("_") > 0: | |
41 # The dot extension was likely changed to | |
42 # the " character. | |
43 items = base_file_name.split("_") | |
44 return "_".join(items[0:-1]) | |
45 else: | |
46 return base_file_name | |
47 | 30 |
48 | 31 |
49 def nice_size(size): | 32 def nice_size(size): |
50 # Returns a readably formatted string with the size | 33 # Returns a readably formatted string with the size |
51 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] | 34 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] |
65 return "%s%d bytes" % (prefix, size) | 48 return "%s%d bytes" % (prefix, size) |
66 return "%s%.1f %s" % (prefix, size, word) | 49 return "%s%.1f %s" % (prefix, size, word) |
67 return '??? bytes' | 50 return '??? bytes' |
68 | 51 |
69 | 52 |
70 def output_statistics(reads_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): | 53 def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): |
71 # Produce an Excel spreadsheet that | 54 # Produce an Excel spreadsheet that |
72 # contains a row for each sample. | 55 # contains a row for each sample. |
73 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', | 56 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', |
74 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', | 57 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', |
75 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] | 58 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] |
76 data_frames = [] | 59 data_frames = [] |
77 for i, fastq_file in enumerate(reads_files): | 60 for i, fastq_file in enumerate(fastq_files): |
78 idxstats_file = idxstats_files[i] | 61 idxstats_file = idxstats_files[i] |
79 metrics_file = metrics_files[i] | 62 metrics_file = metrics_files[i] |
80 file_name_base = os.path.basename(fastq_file) | 63 file_name_base = os.path.basename(fastq_file) |
81 # Read fastq_file into a data frame. | 64 # Read fastq_file into a data frame. |
82 fastq_df = fastq_to_df(fastq_file, gzipped) | 65 fastq_df = fastq_to_df(fastq_file, gzipped) |
169 # VCFfilter 611 | 152 # VCFfilter 611 |
170 good_snp_count = items[1] | 153 good_snp_count = items[1] |
171 return ref_with_coverage, avg_depth_of_coverage, good_snp_count | 154 return ref_with_coverage, avg_depth_of_coverage, good_snp_count |
172 | 155 |
173 | 156 |
174 if __name__ == '__main__': | 157 parser = argparse.ArgumentParser() |
175 parser = argparse.ArgumentParser() | 158 |
176 | 159 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') |
177 parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read') | 160 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped') |
178 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') | 161 parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory') |
179 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') | 162 parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory') |
180 parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped') | 163 parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory') |
181 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', required=False, default=None, help='Output of samtools_idxstats') | 164 parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads') |
182 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') | 165 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') |
183 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', required=False, default=None, help='Output of vsnp_add_zero_coverage') | 166 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read') |
184 | 167 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') |
185 args = parser.parse_args() | 168 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats') |
186 print("args:\n%s\n" % str(args)) | 169 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage') |
187 | 170 |
188 reads_files = [] | 171 args = parser.parse_args() |
189 idxstats_files = [] | 172 |
190 metrics_files = [] | 173 fastq_files = [] |
191 # Accumulate inputs. | 174 idxstats_files = [] |
192 if args.read1 is not None: | 175 metrics_files = [] |
193 # The inputs are not dataset collections, so | 176 # Accumulate inputs. |
194 # read1, read2 (possibly) and vsnp_azc will also | 177 if args.read1 is not None: |
195 # not be None. | 178 # The inputs are not dataset collections, so |
196 reads_files.append(args.read1) | 179 # read1, read2 (possibly) and vsnp_azc will also |
180 # not be None. | |
181 fastq_files.append(args.read1) | |
182 idxstats_files.append(args.samtools_idxstats) | |
183 metrics_files.append(args.vsnp_azc) | |
184 if args.read2 is not None: | |
185 fastq_files.append(args.read2) | |
197 idxstats_files.append(args.samtools_idxstats) | 186 idxstats_files.append(args.samtools_idxstats) |
198 metrics_files.append(args.vsnp_azc) | 187 metrics_files.append(args.vsnp_azc) |
199 if args.read2 is not None: | 188 else: |
200 reads_files.append(args.read2) | 189 for file_name in sorted(os.listdir(args.input_reads_dir)): |
201 idxstats_files.append(args.samtools_idxstats) | 190 fastq_files.append(os.path.join(args.input_reads_dir, file_name)) |
202 metrics_files.append(args.vsnp_azc) | 191 for file_name in sorted(os.listdir(args.input_idxstats_dir)): |
203 else: | 192 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) |
204 for file_name in sorted(os.listdir(INPUT_READS_DIR)): | 193 if args.list_paired: |
205 file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name)) | 194 # Add the idxstats file for reverse. |
206 reads_files.append(file_path) | 195 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) |
207 base_file_name = get_base_file_name(file_path) | 196 for file_name in sorted(os.listdir(args.input_metrics_dir)): |
208 for file_name in sorted(os.listdir(INPUT_IDXSTATS_DIR)): | 197 metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) |
209 file_path = os.path.abspath(os.path.join(INPUT_IDXSTATS_DIR, file_name)) | 198 if args.list_paired: |
210 idxstats_files.append(file_path) | 199 # Add the metrics file for reverse. |
211 for file_name in sorted(os.listdir(INPUT_METRICS_DIR)): | 200 metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) |
212 file_path = os.path.abspath(os.path.join(INPUT_METRICS_DIR, file_name)) | 201 output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey) |
213 metrics_files.append(file_path) | |
214 output_statistics(reads_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey) |