comparison vsnp_get_snps.py @ 3:14285a94fb13 draft
Uploaded
author: greg
date: Sun, 03 Jan 2021 16:06:33 +0000
parents: ee4ef1fc23c6
children: 5e4595b9f63c
2:7471707d3fb4 (before) | 3:14285a94fb13 (after)
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 # Collect quality parsimonious SNPs from vcf files and output alignment files in fasta format. | 3 # Collect quality parsimonious SNPs from vcf files |
4 # and output alignment files in fasta format. | |
4 | 5 |
5 import argparse | 6 import argparse |
6 import multiprocessing | 7 import multiprocessing |
7 import os | 8 import os |
8 import pandas | |
9 import queue | 9 import queue |
10 import shutil | 10 import shutil |
11 import sys | 11 import sys |
12 import time | 12 import time |
13 import vcf | |
14 from collections import OrderedDict | 13 from collections import OrderedDict |
15 from datetime import datetime | 14 from datetime import datetime |
16 | 15 |
17 ALL_VCFS_DIR = 'all_vcf' | 16 import pandas |
18 INPUT_VCF_DIR = 'input_vcf_dir' | 17 import vcf |
19 OUTPUT_JSON_AVG_MQ_DIR = 'output_json_avg_mq_dir' | |
20 OUTPUT_JSON_SNPS_DIR = 'output_json_snps_dir' | |
21 OUTPUT_SNPS_DIR = 'output_snps_dir' | |
22 | 18 |
23 | 19 |
24 def get_time_stamp(): | 20 def get_time_stamp(): |
25 return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S') | 21 return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S') |
26 | 22 |
38 | 34 |
39 | 35 |
40 def setup_all_vcfs(vcf_files, vcf_dirs): | 36 def setup_all_vcfs(vcf_files, vcf_dirs): |
41 # Create the all_vcfs directory and link | 37 # Create the all_vcfs directory and link |
42 # all input vcf files into it for processing. | 38 # all input vcf files into it for processing. |
43 os.makedirs(ALL_VCFS_DIR) | 39 all_vcfs_dir = 'all_vcf' |
44 vcf_dirs.append(ALL_VCFS_DIR) | 40 os.makedirs(all_vcfs_dir) |
41 vcf_dirs.append(all_vcfs_dir) | |
45 for vcf_file in vcf_files: | 42 for vcf_file in vcf_files: |
46 file_name_base = os.path.basename(vcf_file) | 43 file_name_base = os.path.basename(vcf_file) |
47 dst_file = os.path.join(ALL_VCFS_DIR, file_name_base) | 44 dst_file = os.path.join(all_vcfs_dir, file_name_base) |
48 os.symlink(vcf_file, dst_file) | 45 os.symlink(vcf_file, dst_file) |
49 return vcf_dirs | 46 return vcf_dirs |
50 | 47 |
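A minimal standalone sketch of the linking pattern in setup_all_vcfs above: input vcf files are symlinked, not copied, into a shared all_vcf directory so the "all" group is processed like any Excel-defined group. File names here are hypothetical.

```python
import os

# Hypothetical inputs; the tool collects these from --input_vcf_dir.
vcf_files = [os.path.abspath(p) for p in ("sample1.vcf", "sample2.vcf")]

all_vcfs_dir = "all_vcf"
os.makedirs(all_vcfs_dir, exist_ok=True)
for vcf_file in vcf_files:
    dst_file = os.path.join(all_vcfs_dir, os.path.basename(vcf_file))
    # Link rather than copy so the inputs are not duplicated on disk.
    os.symlink(vcf_file, dst_file)
```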
51 | 48 |
52 class SnpFinder: | 49 class SnpFinder: |
53 | 50 |
54 def __init__(self, num_files, reference, excel_grouper_file, | 51 def __init__(self, num_files, dbkey, input_excel, all_isolates, ac, min_mq, quality_score_n_threshold, min_quality_score, input_vcf_dir, output_json_avg_mq_dir, output_json_snps_dir, output_snps_dir, output_summary): |
55 all_isolates, ac, mq_val, n_threshold, qual_threshold, output_summary): | 52 # Allele count |
56 self.ac = ac | 53 self.ac = ac |
54 # Create a group that will contain all isolates. | |
57 self.all_isolates = all_isolates | 55 self.all_isolates = all_isolates |
56 # Evolving positions dictionary. | |
58 self.all_positions = None | 57 self.all_positions = None |
59 # Filter based on the contents of an Excel file. | 58 # Isolate groups. |
60 self.excel_grouper_file = excel_grouper_file | |
61 # Use Genbank file | |
62 self.groups = [] | 59 self.groups = [] |
63 # This will be populated from the columns | 60 # Excel file for grouping. |
64 # in the Excel filter file if it is used. | 61 self.input_excel = input_excel |
65 self.mq_val = mq_val | 62 # Directory of input zero coverage vcf files. |
66 self.n_threshold = n_threshold | 63 self.input_vcf_dir = input_vcf_dir |
67 self.qual_threshold = qual_threshold | 64 # Minimum map quality value. |
68 self.reference = reference | 65 self.min_mq = min_mq |
66 # Minimum quality score value. | |
67 self.min_quality_score = min_quality_score | |
68 # Number of input zero coverage vcf files. | |
69 self.num_files = num_files | |
70 # Output directory for json average mq files. | |
71 self.output_json_avg_mq_dir = output_json_avg_mq_dir | |
72 # Output directory for json snps files. | |
73 self.output_json_snps_dir = output_json_snps_dir | |
74 # Output directory for snps files. | |
75 self.output_snps_dir = output_snps_dir | |
76 # Quality score N threshold value. | |
77 self.quality_score_n_threshold = quality_score_n_threshold | |
78 self.dbkey = dbkey | |
69 self.start_time = get_time_stamp() | 79 self.start_time = get_time_stamp() |
70 self.summary_str = "" | 80 self.summary_str = "" |
71 self.timer_start = datetime.now() | 81 self.timer_start = datetime.now() |
72 self.num_files = num_files | |
73 self.initiate_summary(output_summary) | 82 self.initiate_summary(output_summary) |
74 | 83 |
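The constructor now receives every threshold from the command line; the values that revision 2 hard-coded (ac=2, mq_val=56, n_threshold=50, qual_threshold=150, visible near the end of this diff) would translate into a call like this hypothetical sketch:

```python
# Hypothetical instantiation; paths and dbkey are illustrative only.
snp_finder = SnpFinder(
    num_files=10,
    dbkey="AF2122",                      # assumed Galaxy genome build
    input_excel=None,
    all_isolates=True,
    ac=2,                                # allele count
    min_mq=56,                           # minimum map quality
    quality_score_n_threshold=50,
    min_quality_score=150,
    input_vcf_dir="input_vcf_dir",
    output_json_avg_mq_dir="output_json_avg_mq_dir",
    output_json_snps_dir="output_json_snps_dir",
    output_snps_dir="output_snps_dir",
    output_summary="summary.html",
)
```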
75 def append_to_summary(self, html_str): | 84 def append_to_summary(self, html_str): |
85 # Append a string to the html summary output file. | |
76 self.summary_str = "%s%s" % (self.summary_str, html_str) | 86 self.summary_str = "%s%s" % (self.summary_str, html_str) |
77 | 87 |
78 def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix): | 88 def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix): |
89 # Categorize input files into closely related | |
90 # isolate groups based on discovered SNPs, and | |
91 # return a group dictionary. | |
79 sample_groups_list = [] | 92 sample_groups_list = [] |
80 table_name = self.get_base_file_name(filename) | 93 table_name = self.get_sample_name(filename) |
81 try: | 94 defining_snp = False |
82 defining_snp = False | 95 # Absolute positions in set union of two lists. |
83 # Absolute positions in set union of two lists. | 96 for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())): |
84 for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())): | 97 group = defining_snps[abs_position] |
85 group = defining_snps[abs_position] | 98 sample_groups_list.append(group) |
99 self.check_add_group(group) | |
100 if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0: | |
101 table_name = self.get_sample_name(filename) | |
102 table_name = '%s<font color="red">[[MIXED]]</font>' % table_name | |
103 self.copy_file(filename, group) | |
104 defining_snp = True | |
105 if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()): | |
106 for abs_position in list(inverted_defining_snps.keys()): | |
107 group = inverted_defining_snps[abs_position] | |
86 sample_groups_list.append(group) | 108 sample_groups_list.append(group) |
87 self.check_add_group(group) | 109 self.check_add_group(group) |
88 if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0: | |
89 table_name = self.get_base_file_name(filename) | |
90 table_name = '%s<font color="red">[[MIXED]]</font>' % table_name | |
91 self.copy_file(filename, group) | 110 self.copy_file(filename, group) |
92 defining_snp = True | 111 defining_snp = True |
93 if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()): | 112 if defining_snp: |
94 for abs_position in list(inverted_defining_snps.keys()): | 113 samples_groups_dict[table_name] = sorted(sample_groups_list) |
95 group = inverted_defining_snps[abs_position] | 114 else: |
96 sample_groups_list.append(group) | 115 samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>'] |
97 self.check_add_group(group) | |
98 self.copy_file(filename, group) | |
99 defining_snp = True | |
100 if defining_snp: | |
101 samples_groups_dict[table_name] = sorted(sample_groups_list) | |
102 else: | |
103 samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>'] | |
104 except TypeError as e: | |
105 msg = "<br/>Error processing file %s to generate samples_groups_dict: %s<br/>" % (filename, str(e)) | |
106 self.append_to_summary(msg) | |
107 samples_groups_dict[table_name] = [msg] | |
108 return samples_groups_dict | 116 return samples_groups_dict |
109 | 117 |
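The core of bin_input_files is plain set algebra over dictionary keys; a self-contained sketch with hypothetical positions:

```python
# A sample joins a group when any of that group's defining positions
# appears among the sample's discovered or mixed positions.
defining_snps = {"chrom:1295549": "groupA"}     # hypothetical
found_positions = {"chrom:1295549": "G"}        # hypothetical
found_positions_mix = {}

hits = defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())
sample_groups_list = sorted(defining_snps[pos] for pos in hits)
print(sample_groups_list)  # ['groupA']
```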
110 def check_add_group(self, group): | 118 def check_add_group(self, group): |
119 # Add a group if it is not already in the list. |
111 if group not in self.groups: | 120 if group not in self.groups: |
112 self.groups.append(group) | 121 self.groups.append(group) |
113 | 122 |
114 def copy_file(self, filename, dir): | 123 def copy_file(self, filename, dir): |
115 if not os.path.exists(dir): | 124 if not os.path.exists(dir): |
116 os.makedirs(dir) | 125 os.makedirs(dir) |
117 shutil.copy(filename, dir) | 126 shutil.copy(filename, dir) |
118 | 127 |
119 def decide_snps(self, filename): | 128 def decide_snps(self, filename): |
120 positions_dict = self.all_positions | |
121 # Find the SNPs in a vcf file to produce a pandas data | 129 # Find the SNPs in a vcf file to produce a pandas data |
122 # frame and a dictionary containing sample map qualities. | 130 # frame and a dictionary containing sample map qualities. |
131 positions_dict = self.all_positions | |
123 sample_map_qualities = {} | 132 sample_map_qualities = {} |
124 # Eliminate the path. | 133 # Eliminate the path. |
125 file_name_base = self.get_base_file_name(filename) | 134 file_name_base = self.get_sample_name(filename) |
126 vcf_reader = vcf.Reader(open(filename, 'r')) | 135 vcf_reader = vcf.Reader(open(filename, 'r')) |
127 sample_dict = {} | 136 sample_dict = {} |
128 for record in vcf_reader: | 137 for record in vcf_reader: |
129 alt = str(record.ALT[0]) | 138 alt = str(record.ALT[0]) |
130 record_position = "%s:%s" % (str(record.CHROM), str(record.POS)) | 139 record_position = "%s:%s" % (str(record.CHROM), str(record.POS)) |
131 if record_position in positions_dict: | 140 if record_position in positions_dict: |
132 if alt == "None": | 141 if alt == "None": |
133 sample_dict.update({record_position: "-"}) | 142 sample_dict.update({record_position: "-"}) |
134 else: | 143 else: |
135 # Not sure this is the best place to capture MQM average | |
136 # may be faster after parsimony SNPs are decided, but | |
137 # then it will require opening the files again. | |
138 # On rare occasions MQM gets called "NaN", thus passing | 144 # On rare occasions MQM gets called "NaN", thus passing |
139 # a string when a number is expected when calculating average. | 145 # a string when a number is expected when calculating average. |
140 mq_val = self.get_mq_val(record.INFO, filename) | 146 mq_val = self.get_mq_val(record.INFO, filename) |
141 if str(mq_val).lower() not in ["nan"]: | 147 if str(mq_val).lower() not in ["nan"]: |
142 sample_map_qualities.update({record_position: mq_val}) | 148 sample_map_qualities.update({record_position: mq_val}) |
143 # Add parameters here to change what each vcf represents. | 149 if len(alt) == 1: |
144 # SNP is represented in table, now how will the vcf represent | |
145 # the called position alt != "None", which means a deletion | |
146 # as alt is not record.FILTER, or rather passed. | |
147 len_alt = len(alt) | |
148 if len_alt == 1: | |
149 qual_val = self.val_as_int(record.QUAL) | 150 qual_val = self.val_as_int(record.QUAL) |
150 ac = record.INFO['AC'][0] | 151 ac = record.INFO['AC'][0] |
151 ref = str(record.REF[0]) | 152 ref = str(record.REF[0]) |
152 if ac == 2 and qual_val > self.n_threshold: | 153 if ac == 2 and qual_val > self.quality_score_n_threshold: |
154 # Add the SNP to a group. | |
153 sample_dict.update({record_position: alt}) | 155 sample_dict.update({record_position: alt}) |
154 elif ac == 1 and qual_val > self.n_threshold: | 156 elif ac == 1 and qual_val > self.quality_score_n_threshold: |
157 # The position is ambiguous. | |
155 alt_ref = "%s%s" % (alt, ref) | 158 alt_ref = "%s%s" % (alt, ref) |
156 if alt_ref == "AG": | 159 if alt_ref == "AG": |
157 sample_dict.update({record_position: "R"}) | 160 sample_dict.update({record_position: "R"}) |
158 elif alt_ref == "CT": | 161 elif alt_ref == "CT": |
159 sample_dict.update({record_position: "Y"}) | 162 sample_dict.update({record_position: "Y"}) |
179 sample_dict.update({record_position: "M"}) | 182 sample_dict.update({record_position: "M"}) |
180 else: | 183 else: |
181 sample_dict.update({record_position: "N"}) | 184 sample_dict.update({record_position: "N"}) |
182 # Poor calls | 185 # Poor calls |
183 elif qual_val <= 50: | 186 elif qual_val <= 50: |
187 # Call the reference allele. | |
184 # Do not coerce record.REF[0] to a string! | 188 # Do not coerce record.REF[0] to a string! |
185 sample_dict.update({record_position: record.REF[0]}) | 189 sample_dict.update({record_position: record.REF[0]}) |
186 elif qual_val <= self.n_threshold: | 190 elif qual_val <= self.quality_score_n_threshold: |
187 sample_dict.update({record_position: "N"}) | 191 sample_dict.update({record_position: "N"}) |
188 else: | 192 else: |
189 # Insurance -- Will still report on a possible | 193 # Insurance -- Will still report on a possible |
190 # SNP even if missed with above statement | 194 # SNP even if missed with above statements. |
191 # Do not coerce record.REF[0] to a string! | 195 # Do not coerce record.REF[0] to a string! |
192 sample_dict.update({record_position: record.REF[0]}) | 196 sample_dict.update({record_position: record.REF[0]}) |
193 # Merge dictionaries and order | 197 # Merge dictionaries and order |
194 merge_dict = {} | 198 merge_dict = {} |
195 # abs_pos:REF | |
196 merge_dict.update(positions_dict) | 199 merge_dict.update(positions_dict) |
197 # abs_pos:ALT replacing all_positions, because keys must be unique | |
198 merge_dict.update(sample_dict) | 200 merge_dict.update(sample_dict) |
199 sample_df = pandas.DataFrame(merge_dict, index=[file_name_base]) | 201 sample_df = pandas.DataFrame(merge_dict, index=[file_name_base]) |
200 return sample_df, file_name_base, sample_map_qualities | 202 return sample_df, file_name_base, sample_map_qualities |
201 | 203 |
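The AC=1 branch of decide_snps reports ambiguous positions with IUPAC two-base codes; the diff shows only the R, Y, and M branches, but the chain presumably covers the standard table, which condenses to a lookup like this sketch:

```python
# Standard IUPAC two-base ambiguity codes; frozenset makes the
# lookup order-insensitive (AG and GA both map to R).
IUPAC = {
    frozenset("AG"): "R", frozenset("CT"): "Y", frozenset("GC"): "S",
    frozenset("AT"): "W", frozenset("GT"): "K", frozenset("AC"): "M",
}

def ambiguity_code(alt, ref):
    # Fall back to "N" for unrecognized pairs, as the script does.
    return IUPAC.get(frozenset(alt + ref), "N")

print(ambiguity_code("A", "G"))  # R
```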
202 def df_to_fasta(self, parsimonious_df, group): | 204 def df_to_fasta(self, parsimonious_df, group): |
203 # Generate SNP alignment file from the parsimonious_df | 205 # Generate SNP alignment file from |
204 # data frame. | 206 # the parsimonious_df data frame. |
205 snps_file = os.path.join(OUTPUT_SNPS_DIR, "%s.fasta" % group) | 207 snps_file = os.path.join(self.output_snps_dir, "%s.fasta" % group) |
206 test_duplicates = [] | 208 test_duplicates = [] |
207 has_sequence_data = False | 209 has_sequence_data = False |
208 for index, row in parsimonious_df.iterrows(): | 210 for index, row in parsimonious_df.iterrows(): |
209 for pos in row: | 211 for pos in row: |
210 if len(pos) > 0: | 212 if len(pos) > 0: |
223 | 225 |
224 def find_initial_positions(self, filename): | 226 def find_initial_positions(self, filename): |
225 # Find SNP positions in a vcf file. | 227 # Find SNP positions in a vcf file. |
226 found_positions = {} | 228 found_positions = {} |
227 found_positions_mix = {} | 229 found_positions_mix = {} |
228 try: | 230 vcf_reader = vcf.Reader(open(filename, 'r')) |
229 vcf_reader = vcf.Reader(open(filename, 'r')) | 231 for record in vcf_reader: |
230 try: | 232 qual_val = self.val_as_int(record.QUAL) |
231 for record in vcf_reader: | 233 chrom = record.CHROM |
232 qual_val = self.val_as_int(record.QUAL) | 234 position = record.POS |
233 chrom = record.CHROM | 235 absolute_position = "%s:%s" % (str(chrom), str(position)) |
234 position = record.POS | 236 alt = str(record.ALT[0]) |
235 absolute_position = "%s:%s" % (str(chrom), str(position)) | 237 if alt != "None": |
236 alt = str(record.ALT[0]) | 238 mq_val = self.get_mq_val(record.INFO, filename) |
237 if alt != "None": | 239 ac = record.INFO['AC'][0] |
238 mq_val = self.get_mq_val(record.INFO, filename) | 240 if ac == self.ac and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq: |
239 ac = record.INFO['AC'][0] | 241 found_positions.update({absolute_position: record.REF}) |
240 len_ref = len(record.REF) | 242 if ac == 1 and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq: |
241 if ac == self.ac and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val: | 243 found_positions_mix.update({absolute_position: record.REF}) |
242 found_positions.update({absolute_position: record.REF}) | 244 return found_positions, found_positions_mix |
243 if ac == 1 and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val: | |
244 found_positions_mix.update({absolute_position: record.REF}) | |
245 return found_positions, found_positions_mix | |
246 except (ZeroDivisionError, ValueError, UnboundLocalError, TypeError) as e: | |
247 self.append_to_summar("<br/>Error parsing record in file %s: %s<br/>" % (filename, str(e))) | |
248 return {'': ''}, {'': ''} | |
249 except (SyntaxError, AttributeError) as e: | |
250 self.append_to_summary("<br/>Error attempting to read file %s: %s<br/>" % (filename, str(e))) | |
251 return {'': ''}, {'': ''} | |
252 | 245 |
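The per-record gate in find_initial_positions combines four tests; a standalone sketch with hypothetical record values and the defaults noted above:

```python
# Thresholds as passed on the command line (revision 2 defaults assumed).
ac_wanted, min_quality_score, min_mq = 2, 150, 56

record = {"AC": 2, "REF": "A", "QUAL": 853, "MQ": 61.0}  # hypothetical
passes = (
    record["AC"] == ac_wanted
    and len(record["REF"]) == 1          # single-base reference only
    and record["QUAL"] > min_quality_score
    and record["MQ"] > min_mq
)
print(passes)  # True
```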
253 def gather_and_filter(self, prefilter_df, mq_averages, group_dir): | 246 def gather_and_filter(self, prefilter_df, mq_averages, group_dir): |
254 # Group a data frame of SNPs. | 247 # Group a data frame of SNPs. |
255 if self.excel_grouper_file is None: | 248 if self.input_excel is None: |
256 filtered_all_df = prefilter_df | 249 filtered_all_df = prefilter_df |
257 sheet_names = None | 250 sheet_names = None |
258 else: | 251 else: |
259 # Filter positions to be removed from all. | 252 # Filter positions to be removed from all. |
260 xl = pandas.ExcelFile(self.excel_grouper_file) | 253 xl = pandas.ExcelFile(self.input_excel) |
261 sheet_names = xl.sheet_names | 254 sheet_names = xl.sheet_names |
262 # Use the first column to filter "all" positions. | 255 # Use the first column to filter "all" positions. |
263 exclusion_list_all = self.get_position_list(sheet_names, 0) | 256 exclusion_list_all = self.get_position_list(sheet_names, 0) |
264 exclusion_list_group = self.get_position_list(sheet_names, group_dir) | 257 exclusion_list_group = self.get_position_list(sheet_names, group_dir) |
265 exclusion_list = exclusion_list_all + exclusion_list_group | 258 exclusion_list = exclusion_list_all + exclusion_list_group |
266 # Filters for all applied. | 259 # Filters for all applied. |
267 filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore') | 260 filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore') |
268 json_snps_file = os.path.join(OUTPUT_JSON_SNPS_DIR, "%s.json" % group_dir) | 261 json_snps_file = os.path.join(self.output_json_snps_dir, "%s.json" % group_dir) |
269 parsimonious_df = self.get_parsimonious_df(filtered_all_df) | 262 parsimonious_df = self.get_parsimonious_df(filtered_all_df) |
270 samples_number, columns = parsimonious_df.shape | 263 samples_number, columns = parsimonious_df.shape |
271 if samples_number >= 4: | 264 if samples_number >= 4: |
265 # Sufficient samples have been found | |
266 # to build a phylogenetic tree. | |
272 has_sequence_data = self.df_to_fasta(parsimonious_df, group_dir) | 267 has_sequence_data = self.df_to_fasta(parsimonious_df, group_dir) |
273 if has_sequence_data: | 268 if has_sequence_data: |
274 json_avg_mq_file = os.path.join(OUTPUT_JSON_AVG_MQ_DIR, "%s.json" % group_dir) | 269 json_avg_mq_file = os.path.join(self.output_json_avg_mq_dir, "%s.json" % group_dir) |
275 mq_averages.to_json(json_avg_mq_file, orient='split') | 270 mq_averages.to_json(json_avg_mq_file, orient='split') |
276 parsimonious_df.to_json(json_snps_file, orient='split') | 271 parsimonious_df.to_json(json_snps_file, orient='split') |
277 else: | 272 else: |
278 msg = "<br/>No sequence data" | 273 msg = "<br/>No sequence data" |
279 if group_dir is not None: | 274 if group_dir is not None: |
283 msg = "<br/>Too few samples to build tree" | 278 msg = "<br/>Too few samples to build tree" |
284 if group_dir is not None: | 279 if group_dir is not None: |
285 msg = "%s for group: %s" % (msg, group_dir) | 280 msg = "%s for group: %s" % (msg, group_dir) |
286 self.append_to_summary("%s<br/>\n" % msg) | 281 self.append_to_summary("%s<br/>\n" % msg) |
287 | 282 |
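Position filtering in gather_and_filter is a column drop on the SNP data frame; a minimal sketch (positions are hypothetical):

```python
import pandas

# Positions named in the Excel file are dropped as columns;
# errors='ignore' tolerates positions never observed in this group.
prefilter_df = pandas.DataFrame(
    {"chrom:100": ["A"], "chrom:200": ["G"]}, index=["sample1"]
)
exclusion_list = ["chrom:200", "chrom:999"]
filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors="ignore")
print(list(filtered_all_df.columns))  # ['chrom:100']
```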
288 def get_base_file_name(self, file_path): | 283 def get_sample_name(self, file_path): |
284 # Return the sample part of a file name. | |
289 base_file_name = os.path.basename(file_path) | 285 base_file_name = os.path.basename(file_path) |
290 if base_file_name.find(".") > 0: | 286 if base_file_name.find(".") > 0: |
291 # Eliminate the extension. | 287 # Eliminate the extension. |
292 return os.path.splitext(base_file_name)[0] | 288 return os.path.splitext(base_file_name)[0] |
293 elif base_file_name.find("_") > 0: | 289 return base_file_name |
294 # The dot extension was likely changed to | |
295 # the " character. | |
296 items = base_file_name.split("_") | |
297 return "_".join(items[0:-1]) | |
298 else: | |
299 return base_file_name | |
300 | 290 |
301 def get_mq_val(self, record_info, filename): | 291 def get_mq_val(self, record_info, filename): |
302 # Get the MQ (gatk) or MQM (freebayes) value | 292 # Get the MQ (gatk) or MQM (freebayes) value |
303 # from the record.INFO component of the vcf file. | 293 # from the record.INFO component of the vcf file. |
304 try: | 294 try: |
331 | 321 |
332 def get_position_list(self, sheet_names, group): | 322 def get_position_list(self, sheet_names, group): |
333 # Get a list of positions defined by an excel file. | 323 # Get a list of positions defined by an excel file. |
334 exclusion_list = [] | 324 exclusion_list = [] |
335 try: | 325 try: |
336 filter_to_all = pandas.read_excel(self.excel_grouper_file, header=1, usecols=[group]) | 326 filter_to_all = pandas.read_excel(self.input_excel, header=1, usecols=[group]) |
337 for value in filter_to_all.values: | 327 for value in filter_to_all.values: |
338 value = str(value[0]) | 328 value = str(value[0]) |
339 if "-" not in value.split(":")[-1]: | 329 if "-" not in value.split(":")[-1]: |
340 exclusion_list.append(value) | 330 exclusion_list.append(value) |
341 elif "-" in value: | 331 elif "-" in value: |
346 value = sequence_range.split("-") | 336 value = sequence_range.split("-") |
347 for position in range(int(value[0].replace(',', '')), int(value[1].replace(',', '')) + 1): | 337 for position in range(int(value[0].replace(',', '')), int(value[1].replace(',', '')) + 1): |
348 exclusion_list.append(chrom + ":" + str(position)) | 338 exclusion_list.append(chrom + ":" + str(position)) |
349 return exclusion_list | 339 return exclusion_list |
350 except ValueError: | 340 except ValueError: |
351 exclusion_list = [] | 341 return [] |
352 return exclusion_list | |
353 | 342 |
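get_position_list expands range entries such as chrom:1,000-1,002 into individual absolute positions; a self-contained sketch of that expansion:

```python
# Hypothetical Excel cell value with a comma-formatted range.
value = "chrom:1,000-1,002"
chrom, sequence_range = value.split(":")
start, end = (int(v.replace(",", "")) for v in sequence_range.split("-"))
exclusion_list = [chrom + ":" + str(p) for p in range(start, end + 1)]
print(exclusion_list)  # ['chrom:1000', 'chrom:1001', 'chrom:1002']
```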
354 def get_snps(self, task_queue, timeout): | 343 def get_snps(self, task_queue, timeout): |
355 while True: | 344 while True: |
356 try: | 345 try: |
357 group_dir = task_queue.get(block=True, timeout=timeout) | 346 group_dir = task_queue.get(block=True, timeout=timeout) |
358 except queue.Empty: | 347 except queue.Empty: |
359 break | 348 break |
360 # Parse all vcf files to accumulate SNPs into a | 349 # Parse all vcf files to accumulate |
361 # data frame. | 350 # the SNPs into a data frame. |
362 positions_dict = {} | 351 positions_dict = {} |
363 group_files = [] | 352 group_files = [] |
364 for file_name in os.listdir(os.path.abspath(group_dir)): | 353 for file_name in os.listdir(os.path.abspath(group_dir)): |
365 file_path = os.path.abspath(os.path.join(group_dir, file_name)) | 354 file_path = os.path.abspath(os.path.join(group_dir, file_name)) |
366 group_files.append(file_path) | 355 group_files.append(file_path) |
367 for file_name in group_files: | 356 for file_name in group_files: |
368 try: | 357 found_positions, found_positions_mix = self.find_initial_positions(file_name) |
369 found_positions, found_positions_mix = self.find_initial_positions(file_name) | 358 positions_dict.update(found_positions) |
370 positions_dict.update(found_positions) | |
371 except Exception as e: | |
372 self.append_to_summary("Error updating the positions_dict dictionary when processing file %s:\n%s\n" % (file_name, str(e))) | |
373 # Order before adding to file to match | 359 # Order before adding to file to match |
374 # with ordering of individual samples. | 360 # with ordering of individual samples. |
375 # all_positions is abs_pos:REF | 361 # all_positions is abs_pos:REF |
376 self.all_positions = OrderedDict(sorted(positions_dict.items())) | 362 self.all_positions = OrderedDict(sorted(positions_dict.items())) |
377 ref_positions_df = pandas.DataFrame(self.all_positions, index=['root']) | 363 ref_positions_df = pandas.DataFrame(self.all_positions, index=['root']) |
392 self.gather_and_filter(prefilter_df, mq_averages, group_dir) | 378 self.gather_and_filter(prefilter_df, mq_averages, group_dir) |
393 task_queue.task_done() | 379 task_queue.task_done() |
394 | 380 |
395 def group_vcfs(self, vcf_files): | 381 def group_vcfs(self, vcf_files): |
396 # Parse an excel file to produce a | 382 # Parse an excel file to produce a |
397 # grouping dictionary for filtering SNPs. | 383 # grouping dictionary for SNPs. |
398 xl = pandas.ExcelFile(self.excel_grouper_file) | 384 xl = pandas.ExcelFile(self.input_excel) |
399 sheet_names = xl.sheet_names | 385 sheet_names = xl.sheet_names |
400 ws = pandas.read_excel(self.excel_grouper_file, sheet_name=sheet_names[0]) | 386 ws = pandas.read_excel(self.input_excel, sheet_name=sheet_names[0]) |
401 defining_snps = ws.iloc[0] | 387 defining_snps = ws.iloc[0] |
402 defsnp_iterator = iter(defining_snps.iteritems()) | 388 defsnp_iterator = iter(defining_snps.iteritems()) |
403 next(defsnp_iterator) | 389 next(defsnp_iterator) |
404 defining_snps = {} | 390 defining_snps = {} |
405 inverted_defining_snps = {} | 391 inverted_defining_snps = {} |
427 def initiate_summary(self, output_summary): | 413 def initiate_summary(self, output_summary): |
428 # Output summary file handle. | 414 # Output summary file handle. |
429 self.append_to_summary('<html>\n') | 415 self.append_to_summary('<html>\n') |
430 self.append_to_summary('<head></head>\n') | 416 self.append_to_summary('<head></head>\n') |
431 self.append_to_summary('<body style=\"font-size:12px;">') | 417 self.append_to_summary('<body style=\"font-size:12px;">') |
432 self.append_to_summary("<b>Time started:</b> %s<br/>" % str(get_time_stamp())) | 418 self.append_to_summary("<b>Time started:</b> %s<br/>" % get_time_stamp()) |
433 self.append_to_summary("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files) | 419 self.append_to_summary("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files) |
434 self.append_to_summary("<b>Reference:</b> %s<br/>" % str(self.reference)) | 420 self.append_to_summary("<b>Reference:</b> %s<br/>" % self.dbkey) |
435 self.append_to_summary("<b>All isolates:</b> %s<br/>" % str(self.all_isolates)) | 421 self.append_to_summary("<b>All isolates:</b> %s<br/>" % str(self.all_isolates)) |
436 | 422 |
437 def return_val(self, val, index=0): | 423 def return_val(self, val, index=0): |
438 # Handle element and single-element list values. | 424 # Handle element and single-element list values. |
439 if isinstance(val, list): | 425 if isinstance(val, list): |
448 # val is likely None here. | 434 # val is likely None here. |
449 return 0 | 435 return 0 |
450 | 436 |
451 | 437 |
452 if __name__ == '__main__': | 438 if __name__ == '__main__': |
439 | |
453 parser = argparse.ArgumentParser() | 440 parser = argparse.ArgumentParser() |
454 | 441 |
455 parser.add_argument('--all_isolates', action='store', dest='all_isolates', required=False, default="No", help='Create table with all isolates'), | 442 parser.add_argument('--ac', action='store', dest='ac', type=int, help='Allele count value'), |
456 parser.add_argument('--excel_grouper_file', action='store', dest='excel_grouper_file', required=False, default=None, help='Optional Excel filter file'), | 443 parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', required=False, default=False, help='Create table with all isolates'), |
444 parser.add_argument('--input_excel', action='store', dest='input_excel', required=False, default=None, help='Optional Excel filter file'), | |
445 parser.add_argument('--input_vcf_dir', action='store', dest='input_vcf_dir', help='Input vcf directory'), | |
446 parser.add_argument('--min_mq', action='store', dest='min_mq', type=int, help='Minimum map quality value'), | |
447 parser.add_argument('--min_quality_score', action='store', dest='min_quality_score', type=int, help='Minimum quality score value'), | |
448 parser.add_argument('--output_json_avg_mq_dir', action='store', dest='output_json_avg_mq_dir', help='Output json average mq directory'), | |
449 parser.add_argument('--output_json_snps_dir', action='store', dest='output_json_snps_dir', help='Output json snps directory'), | |
450 parser.add_argument('--output_snps_dir', action='store', dest='output_snps_dir', help='Output snps directory'), | |
457 parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file'), | 451 parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file'), |
458 parser.add_argument('--reference', action='store', dest='reference', help='Reference file'), | 452 parser.add_argument('--processes', action='store', dest='processes', type=int, help='Configured processes for job'), |
459 parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') | 453 parser.add_argument('--quality_score_n_threshold', action='store', dest='quality_score_n_threshold', type=int, help='Minimum quality score N value for alleles'), |
454 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Galaxy genome build dbkey'), | |
460 | 455 |
461 args = parser.parse_args() | 456 args = parser.parse_args() |
462 | 457 |
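All thresholds are now arguments, so an invocation mirroring the old hard-coded defaults would look like this hypothetical sketch (shown via parse_args for clarity; in practice the Galaxy tool wrapper supplies the real values):

```python
# Illustrative only; directory names match the defaults used in this diff.
argv = [
    "--ac", "2", "--min_mq", "56",
    "--quality_score_n_threshold", "50", "--min_quality_score", "150",
    "--input_vcf_dir", "input_vcf_dir",
    "--output_json_avg_mq_dir", "output_json_avg_mq_dir",
    "--output_json_snps_dir", "output_json_snps_dir",
    "--output_snps_dir", "output_snps_dir",
    "--output_summary", "summary.html",
    "--processes", "8", "--dbkey", "AF2122", "--all_isolates",
]
args = parser.parse_args(argv)
```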
463 # Initializations - TODO: should these be passed in as command line args? | 458 # Build the list of all input zero coverage vcf |
464 ac = 2 | 459 # files, both the samples and the "database". |
465 mq_val = 56 | |
466 n_threshold = 50 | |
467 qual_threshold = 150 | |
468 | |
469 # Build the list of sample vcf files for the current run. | |
470 vcf_files = [] | 460 vcf_files = [] |
471 for file_name in os.listdir(INPUT_VCF_DIR): | 461 for file_name in os.listdir(args.input_vcf_dir): |
472 file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name)) | 462 file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name)) |
473 vcf_files.append(file_path) | 463 vcf_files.append(file_path) |
474 | 464 |
475 multiprocessing.set_start_method('spawn') | 465 multiprocessing.set_start_method('spawn') |
476 queue1 = multiprocessing.JoinableQueue() | 466 queue1 = multiprocessing.JoinableQueue() |
477 num_files = len(vcf_files) | 467 num_files = len(vcf_files) |
478 cpus = set_num_cpus(num_files, args.processes) | 468 cpus = set_num_cpus(num_files, args.processes) |
479 # Set a timeout for get()s in the queue. | 469 # Set a timeout for get()s in the queue. |
480 timeout = 0.05 | 470 timeout = 0.05 |
481 | 471 |
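The job-splitting pattern pairs a JoinableQueue with workers that exit on a get() timeout, as in get_snps above; a self-contained sketch of the same pattern:

```python
import multiprocessing
import queue

def worker(task_queue, timeout):
    # Drain the queue; an empty get() after the timeout ends the worker.
    while True:
        try:
            group_dir = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            break
        print("processing", group_dir)
        task_queue.task_done()

if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")
    q = multiprocessing.JoinableQueue()
    for d in ("all_vcf", "groupA"):      # hypothetical group directories
        q.put(d)
    procs = [multiprocessing.Process(target=worker, args=(q, 0.05)) for _ in range(2)]
    for p in procs:
        p.start()
    q.join()                             # wait until every task_done()
    for p in procs:
        p.join()
```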
482 # Initialize the snp_finder object. | 472 # Initialize the snp_finder object. |
483 snp_finder = SnpFinder(num_files, args.reference, args.excel_grouper_file, args.all_isolates, | 473 snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary) |
484 ac, mq_val, n_threshold, qual_threshold, args.output_summary) | 474 |
485 | 475 # Define and make the set of directories into which the input_zc_vcf |
486 # Initialize the set of directories containiing vcf files for analysis. | 476 # files will be placed. Selected input values (e.g., the use of |
477 # an Excel file for grouping and filtering, creating a group with | |
478 # all isolates) are used to define the directories. | |
487 vcf_dirs = [] | 479 vcf_dirs = [] |
488 if args.excel_grouper_file is None: | 480 if args.input_excel is None: |
489 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) | 481 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) |
490 else: | 482 else: |
491 if args.all_isolates.lower() == "yes": | 483 if args.all_isolates: |
492 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) | 484 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) |
493 # Parse the Excel file to determine groups for filtering. | 485 # Parse the Excel file to determine groups for filtering. |
494 snp_finder.group_vcfs(vcf_files) | 486 snp_finder.group_vcfs(vcf_files) |
495 # Append the list of group directories created by | 487 # Append the list of group directories created by |
496 # the above call to the set of directories containing | 488 # the above call to the set of directories containing |
497 # vcf files for analysis | 489 # vcf files for analysis. |
498 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups] | 490 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups] |
499 vcf_dirs.extend(group_dirs) | 491 vcf_dirs.extend(group_dirs) |
500 | 492 |
501 # Populate the queue for job splitting. | 493 # Populate the queue for job splitting. |
502 for vcf_dir in vcf_dirs: | 494 for vcf_dir in vcf_dirs: |