comparison vsnp_get_snps.py @ 3:14285a94fb13 draft

Uploaded
author greg
date Sun, 03 Jan 2021 16:06:33 +0000
parents ee4ef1fc23c6
children 5e4595b9f63c
comparison
comparing revision 2:7471707d3fb4 (left) with 3:14285a94fb13 (right)
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 # Collect quality parsimonious SNPs from vcf files and output alignment files in fasta format. 3 # Collect quality parsimonious SNPs from vcf files
4 # and output alignment files in fasta format.
4 5
5 import argparse 6 import argparse
6 import multiprocessing 7 import multiprocessing
7 import os 8 import os
8 import pandas
9 import queue 9 import queue
10 import shutil 10 import shutil
11 import sys 11 import sys
12 import time 12 import time
13 import vcf
14 from collections import OrderedDict 13 from collections import OrderedDict
15 from datetime import datetime 14 from datetime import datetime
16 15
17 ALL_VCFS_DIR = 'all_vcf' 16 import pandas
18 INPUT_VCF_DIR = 'input_vcf_dir' 17 import vcf
19 OUTPUT_JSON_AVG_MQ_DIR = 'output_json_avg_mq_dir'
20 OUTPUT_JSON_SNPS_DIR = 'output_json_snps_dir'
21 OUTPUT_SNPS_DIR = 'output_snps_dir'
22 18
23 19
24 def get_time_stamp(): 20 def get_time_stamp():
25 return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S') 21 return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S')
26 22
38 34
39 35
40 def setup_all_vcfs(vcf_files, vcf_dirs): 36 def setup_all_vcfs(vcf_files, vcf_dirs):
41 # Create the all_vcfs directory and link 37 # Create the all_vcfs directory and link
42 # all input vcf files into it for processing. 38 # all input vcf files into it for processing.
43 os.makedirs(ALL_VCFS_DIR) 39 all_vcfs_dir = 'all_vcf'
44 vcf_dirs.append(ALL_VCFS_DIR) 40 os.makedirs(all_vcfs_dir)
41 vcf_dirs.append(all_vcfs_dir)
45 for vcf_file in vcf_files: 42 for vcf_file in vcf_files:
46 file_name_base = os.path.basename(vcf_file) 43 file_name_base = os.path.basename(vcf_file)
47 dst_file = os.path.join(ALL_VCFS_DIR, file_name_base) 44 dst_file = os.path.join(all_vcfs_dir, file_name_base)
48 os.symlink(vcf_file, dst_file) 45 os.symlink(vcf_file, dst_file)
49 return vcf_dirs 46 return vcf_dirs
50 47
51 48
52 class SnpFinder: 49 class SnpFinder:
53 50
54 def __init__(self, num_files, reference, excel_grouper_file, 51 def __init__(self, num_files, dbkey, input_excel, all_isolates, ac, min_mq, quality_score_n_threshold, min_quality_score, input_vcf_dir, output_json_avg_mq_dir, output_json_snps_dir, output_snps_dir, output_summary):
55 all_isolates, ac, mq_val, n_threshold, qual_threshold, output_summary): 52 # Allele count
56 self.ac = ac 53 self.ac = ac
54 # Create a group that will contain all isolates.
57 self.all_isolates = all_isolates 55 self.all_isolates = all_isolates
56 # Evolving positions dictionary.
58 self.all_positions = None 57 self.all_positions = None
59 # Filter based on the contents of an Excel file. 58 # Isolate groups.
60 self.excel_grouper_file = excel_grouper_file
61 # Use Genbank file
62 self.groups = [] 59 self.groups = []
63 # This will be populated from the columns 60 # Excel file for grouping.
64 # in the Excel filter file if it is used. 61 self.input_excel = input_excel
65 self.mq_val = mq_val 62 # Directory of input zero coverage vcf files.
66 self.n_threshold = n_threshold 63 self.input_vcf_dir = input_vcf_dir
67 self.qual_threshold = qual_threshold 64 # Minimum map quality value.
68 self.reference = reference 65 self.min_mq = min_mq
66 # Minimum quality score value.
67 self.min_quality_score = min_quality_score
68 # Number of input zero coverage vcf files.
69 self.num_files = num_files
70 # Output directory for json average mq files.
71 self.output_json_avg_mq_dir = output_json_avg_mq_dir
72 # Output directory for json snps files.
73 self.output_json_snps_dir = output_json_snps_dir
74 # Output directory for snps files.
75 self.output_snps_dir = output_snps_dir
76 # Quality score N threshold value.
77 self.quality_score_n_threshold = quality_score_n_threshold
78 self.dbkey = dbkey
69 self.start_time = get_time_stamp() 79 self.start_time = get_time_stamp()
70 self.summary_str = "" 80 self.summary_str = ""
71 self.timer_start = datetime.now() 81 self.timer_start = datetime.now()
72 self.num_files = num_files
73 self.initiate_summary(output_summary) 82 self.initiate_summary(output_summary)
74 83
75 def append_to_summary(self, html_str): 84 def append_to_summary(self, html_str):
85 # Append a string to the html summary output file.
76 self.summary_str = "%s%s" % (self.summary_str, html_str) 86 self.summary_str = "%s%s" % (self.summary_str, html_str)
77 87
78 def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix): 88 def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix):
89 # Categorize input files into closely related
90 # isolate groups based on discovered SNPs, and
91 # return a group dictionary.
79 sample_groups_list = [] 92 sample_groups_list = []
80 table_name = self.get_base_file_name(filename) 93 table_name = self.get_sample_name(filename)
81 try: 94 defining_snp = False
82 defining_snp = False 95 # Defining SNP positions that intersect the union of found positions.
83 # Defining SNP positions that intersect the union of found positions. 96 for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())):
84 for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())): 97 group = defining_snps[abs_position]
85 group = defining_snps[abs_position] 98 sample_groups_list.append(group)
99 self.check_add_group(group)
100 if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0:
101 table_name = self.get_sample_name(filename)
102 table_name = '%s<font color="red">[[MIXED]]</font>' % table_name
103 self.copy_file(filename, group)
104 defining_snp = True
105 if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()):
106 for abs_position in list(inverted_defining_snps.keys()):
107 group = inverted_defining_snps[abs_position]
86 sample_groups_list.append(group) 108 sample_groups_list.append(group)
87 self.check_add_group(group) 109 self.check_add_group(group)
88 if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0:
89 table_name = self.get_base_file_name(filename)
90 table_name = '%s<font color="red">[[MIXED]]</font>' % table_name
91 self.copy_file(filename, group) 110 self.copy_file(filename, group)
92 defining_snp = True 111 defining_snp = True
93 if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()): 112 if defining_snp:
94 for abs_position in list(inverted_defining_snps.keys()): 113 samples_groups_dict[table_name] = sorted(sample_groups_list)
95 group = inverted_defining_snps[abs_position] 114 else:
96 sample_groups_list.append(group) 115 samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
97 self.check_add_group(group)
98 self.copy_file(filename, group)
99 defining_snp = True
100 if defining_snp:
101 samples_groups_dict[table_name] = sorted(sample_groups_list)
102 else:
103 samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
104 except TypeError as e:
105 msg = "<br/>Error processing file %s to generate samples_groups_dict: %s<br/>" % (filename, str(e))
106 self.append_to_summary(msg)
107 samples_groups_dict[table_name] = [msg]
108 return samples_groups_dict 116 return samples_groups_dict
109 117
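[editor's note] The binning above turns on two set expressions; a minimal worked sketch of that logic follows (the position keys and group names are hypothetical, not from the tool's data):

    # Hypothetical illustration of the set logic in bin_input_files.
    defining_snps = {'NC_002945.4:1849': 'groupA'}
    inverted_defining_snps = {'NC_002945.4:3920': 'groupB'}
    found_positions = {'NC_002945.4:1849': 'G'}
    found_positions_mix = {}
    # A defining position was found in this sample: the intersection with
    # the union of quality and mixed positions is non-empty, so the
    # sample is binned into groupA.
    hits = defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())
    assert sorted(defining_snps[p] for p in hits) == ['groupA']
    # Inverted defining SNPs bin a sample only when none of its found
    # positions intersect the inverted set; here that also holds, so the
    # sample would be binned into groupB as well.
    assert not set(inverted_defining_snps).intersection(found_positions.keys() | found_positions_mix.keys())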
110 def check_add_group(self, group): 118 def check_add_group(self, group):
119 # Add a group if it is not already in the list.
111 if group not in self.groups: 120 if group not in self.groups:
112 self.groups.append(group) 121 self.groups.append(group)
113 122
114 def copy_file(self, filename, dir): 123 def copy_file(self, filename, dir):
115 if not os.path.exists(dir): 124 if not os.path.exists(dir):
116 os.makedirs(dir) 125 os.makedirs(dir)
117 shutil.copy(filename, dir) 126 shutil.copy(filename, dir)
118 127
119 def decide_snps(self, filename): 128 def decide_snps(self, filename):
120 positions_dict = self.all_positions
121 # Find the SNPs in a vcf file to produce a pandas data 129 # Find the SNPs in a vcf file to produce a pandas data
122 # frame and a dictionary containing sample map qualities. 130 # frame and a dictionary containing sample map qualities.
131 positions_dict = self.all_positions
123 sample_map_qualities = {} 132 sample_map_qualities = {}
124 # Eliminate the path. 133 # Eliminate the path.
125 file_name_base = self.get_base_file_name(filename) 134 file_name_base = self.get_sample_name(filename)
126 vcf_reader = vcf.Reader(open(filename, 'r')) 135 vcf_reader = vcf.Reader(open(filename, 'r'))
127 sample_dict = {} 136 sample_dict = {}
128 for record in vcf_reader: 137 for record in vcf_reader:
129 alt = str(record.ALT[0]) 138 alt = str(record.ALT[0])
130 record_position = "%s:%s" % (str(record.CHROM), str(record.POS)) 139 record_position = "%s:%s" % (str(record.CHROM), str(record.POS))
131 if record_position in positions_dict: 140 if record_position in positions_dict:
132 if alt == "None": 141 if alt == "None":
133 sample_dict.update({record_position: "-"}) 142 sample_dict.update({record_position: "-"})
134 else: 143 else:
135 # Not sure this is the best place to capture MQM average
136 # may be faster after parsimony SNPs are decided, but
137 # then it will require opening the files again.
138 # On rare occasions MQM gets called "NaN", thus passing 144 # On rare occasions MQM gets called "NaN", thus passing
139 # a string when a number is expected when calculating average. 145 # a string when a number is expected when calculating average.
140 mq_val = self.get_mq_val(record.INFO, filename) 146 mq_val = self.get_mq_val(record.INFO, filename)
141 if str(mq_val).lower() not in ["nan"]: 147 if str(mq_val).lower() not in ["nan"]:
142 sample_map_qualities.update({record_position: mq_val}) 148 sample_map_qualities.update({record_position: mq_val})
143 # Add parameters here to change what each vcf represents. 149 if len(alt) == 1:
144 # SNP is represented in table, now how will the vcf represent
145 # the called position alt != "None", which means a deletion
146 # as alt is not record.FILTER, or rather passed.
147 len_alt = len(alt)
148 if len_alt == 1:
149 qual_val = self.val_as_int(record.QUAL) 150 qual_val = self.val_as_int(record.QUAL)
150 ac = record.INFO['AC'][0] 151 ac = record.INFO['AC'][0]
151 ref = str(record.REF[0]) 152 ref = str(record.REF[0])
152 if ac == 2 and qual_val > self.n_threshold: 153 if ac == 2 and qual_val > self.quality_score_n_threshold:
154 # Add the SNP to a group.
153 sample_dict.update({record_position: alt}) 155 sample_dict.update({record_position: alt})
154 elif ac == 1 and qual_val > self.n_threshold: 156 elif ac == 1 and qual_val > self.quality_score_n_threshold:
157 # The position is ambiguous.
155 alt_ref = "%s%s" % (alt, ref) 158 alt_ref = "%s%s" % (alt, ref)
156 if alt_ref == "AG": 159 if alt_ref == "AG":
157 sample_dict.update({record_position: "R"}) 160 sample_dict.update({record_position: "R"})
158 elif alt_ref == "CT": 161 elif alt_ref == "CT":
159 sample_dict.update({record_position: "Y"}) 162 sample_dict.update({record_position: "Y"})
179 sample_dict.update({record_position: "M"}) 182 sample_dict.update({record_position: "M"})
180 else: 183 else:
181 sample_dict.update({record_position: "N"}) 184 sample_dict.update({record_position: "N"})
182 # Poor calls 185 # Poor calls
183 elif qual_val <= 50: 186 elif qual_val <= 50:
187 # Call the reference allele.
184 # Do not coerce record.REF[0] to a string! 188 # Do not coerce record.REF[0] to a string!
185 sample_dict.update({record_position: record.REF[0]}) 189 sample_dict.update({record_position: record.REF[0]})
186 elif qual_val <= self.n_threshold: 190 elif qual_val <= self.quality_score_n_threshold:
187 sample_dict.update({record_position: "N"}) 191 sample_dict.update({record_position: "N"})
188 else: 192 else:
189 # Insurance -- Will still report on a possible 193 # Insurance -- Will still report on a possible
190 # SNP even if missed with above statement 194 # SNP even if missed with above statements.
191 # Do not coerce record.REF[0] to a string! 195 # Do not coerce record.REF[0] to a string!
192 sample_dict.update({record_position: record.REF[0]}) 196 sample_dict.update({record_position: record.REF[0]})
193 # Merge dictionaries and order 197 # Merge dictionaries and order
194 merge_dict = {} 198 merge_dict = {}
195 # abs_pos:REF
196 merge_dict.update(positions_dict) 199 merge_dict.update(positions_dict)
197 # abs_pos:ALT replacing all_positions, because keys must be unique
198 merge_dict.update(sample_dict) 200 merge_dict.update(sample_dict)
199 sample_df = pandas.DataFrame(merge_dict, index=[file_name_base]) 201 sample_df = pandas.DataFrame(merge_dict, index=[file_name_base])
200 return sample_df, file_name_base, sample_map_qualities 202 return sample_df, file_name_base, sample_map_qualities
201 203
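[editor's note] The ambiguity branch above (ac == 1) maps the concatenated alt and ref bases to a standard IUPAC code; most of the table is elided in this view. A compact equivalent is sketched below; it is not the changeset's code:

    # Standard IUPAC two-base ambiguity codes, keyed on alt + ref as the
    # script builds the key; both orders are listed here for completeness,
    # though the elided branch may enumerate them differently.
    IUPAC = {
        'AG': 'R', 'GA': 'R',
        'CT': 'Y', 'TC': 'Y',
        'GC': 'S', 'CG': 'S',
        'AT': 'W', 'TA': 'W',
        'GT': 'K', 'TG': 'K',
        'AC': 'M', 'CA': 'M',
    }
    sample_dict[record_position] = IUPAC.get(alt_ref, 'N')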
202 def df_to_fasta(self, parsimonious_df, group): 204 def df_to_fasta(self, parsimonious_df, group):
203 # Generate SNP alignment file from the parsimonious_df 205 # Generate SNP alignment file from
204 # data frame. 206 # the parsimonious_df data frame.
205 snps_file = os.path.join(OUTPUT_SNPS_DIR, "%s.fasta" % group) 207 snps_file = os.path.join(self.output_snps_dir, "%s.fasta" % group)
206 test_duplicates = [] 208 test_duplicates = []
207 has_sequence_data = False 209 has_sequence_data = False
208 for index, row in parsimonious_df.iterrows(): 210 for index, row in parsimonious_df.iterrows():
209 for pos in row: 211 for pos in row:
210 if len(pos) > 0: 212 if len(pos) > 0:
223 225
224 def find_initial_positions(self, filename): 226 def find_initial_positions(self, filename):
225 # Find SNP positions in a vcf file. 227 # Find SNP positions in a vcf file.
226 found_positions = {} 228 found_positions = {}
227 found_positions_mix = {} 229 found_positions_mix = {}
228 try: 230 vcf_reader = vcf.Reader(open(filename, 'r'))
229 vcf_reader = vcf.Reader(open(filename, 'r')) 231 for record in vcf_reader:
230 try: 232 qual_val = self.val_as_int(record.QUAL)
231 for record in vcf_reader: 233 chrom = record.CHROM
232 qual_val = self.val_as_int(record.QUAL) 234 position = record.POS
233 chrom = record.CHROM 235 absolute_position = "%s:%s" % (str(chrom), str(position))
234 position = record.POS 236 alt = str(record.ALT[0])
235 absolute_position = "%s:%s" % (str(chrom), str(position)) 237 if alt != "None":
236 alt = str(record.ALT[0]) 238 mq_val = self.get_mq_val(record.INFO, filename)
237 if alt != "None": 239 ac = record.INFO['AC'][0]
238 mq_val = self.get_mq_val(record.INFO, filename) 240 if ac == self.ac and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq:
239 ac = record.INFO['AC'][0] 241 found_positions.update({absolute_position: record.REF})
240 len_ref = len(record.REF) 242 if ac == 1 and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq:
241 if ac == self.ac and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val: 243 found_positions_mix.update({absolute_position: record.REF})
242 found_positions.update({absolute_position: record.REF}) 244 return found_positions, found_positions_mix
243 if ac == 1 and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val:
244 found_positions_mix.update({absolute_position: record.REF})
245 return found_positions, found_positions_mix
246 except (ZeroDivisionError, ValueError, UnboundLocalError, TypeError) as e:
247 self.append_to_summary("<br/>Error parsing record in file %s: %s<br/>" % (filename, str(e)))
248 return {'': ''}, {'': ''}
249 except (SyntaxError, AttributeError) as e:
250 self.append_to_summary("<br/>Error attempting to read file %s: %s<br/>" % (filename, str(e)))
251 return {'': ''}, {'': ''}
252 245
253 def gather_and_filter(self, prefilter_df, mq_averages, group_dir): 246 def gather_and_filter(self, prefilter_df, mq_averages, group_dir):
254 # Group a data frame of SNPs. 247 # Group a data frame of SNPs.
255 if self.excel_grouper_file is None: 248 if self.input_excel is None:
256 filtered_all_df = prefilter_df 249 filtered_all_df = prefilter_df
257 sheet_names = None 250 sheet_names = None
258 else: 251 else:
259 # Filter positions to be removed from all. 252 # Filter positions to be removed from all.
260 xl = pandas.ExcelFile(self.excel_grouper_file) 253 xl = pandas.ExcelFile(self.input_excel)
261 sheet_names = xl.sheet_names 254 sheet_names = xl.sheet_names
262 # Use the first column to filter "all" positions. 255 # Use the first column to filter "all" positions.
263 exclusion_list_all = self.get_position_list(sheet_names, 0) 256 exclusion_list_all = self.get_position_list(sheet_names, 0)
264 exclusion_list_group = self.get_position_list(sheet_names, group_dir) 257 exclusion_list_group = self.get_position_list(sheet_names, group_dir)
265 exclusion_list = exclusion_list_all + exclusion_list_group 258 exclusion_list = exclusion_list_all + exclusion_list_group
266 # Filters for all applied. 259 # Filters for all applied.
267 filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore') 260 filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore')
268 json_snps_file = os.path.join(OUTPUT_JSON_SNPS_DIR, "%s.json" % group_dir) 261 json_snps_file = os.path.join(self.output_json_snps_dir, "%s.json" % group_dir)
269 parsimonious_df = self.get_parsimonious_df(filtered_all_df) 262 parsimonious_df = self.get_parsimonious_df(filtered_all_df)
270 samples_number, columns = parsimonious_df.shape 263 samples_number, columns = parsimonious_df.shape
271 if samples_number >= 4: 264 if samples_number >= 4:
265 # Sufficient samples have been found
266 # to build a phylogenetic tree.
272 has_sequence_data = self.df_to_fasta(parsimonious_df, group_dir) 267 has_sequence_data = self.df_to_fasta(parsimonious_df, group_dir)
273 if has_sequence_data: 268 if has_sequence_data:
274 json_avg_mq_file = os.path.join(OUTPUT_JSON_AVG_MQ_DIR, "%s.json" % group_dir) 269 json_avg_mq_file = os.path.join(self.output_json_avg_mq_dir, "%s.json" % group_dir)
275 mq_averages.to_json(json_avg_mq_file, orient='split') 270 mq_averages.to_json(json_avg_mq_file, orient='split')
276 parsimonious_df.to_json(json_snps_file, orient='split') 271 parsimonious_df.to_json(json_snps_file, orient='split')
277 else: 272 else:
278 msg = "<br/>No sequence data" 273 msg = "<br/>No sequence data"
279 if group_dir is not None: 274 if group_dir is not None:
283 msg = "<br/>Too few samples to build tree" 278 msg = "<br/>Too few samples to build tree"
284 if group_dir is not None: 279 if group_dir is not None:
285 msg = "%s for group: %s" % (msg, group_dir) 280 msg = "%s for group: %s" % (msg, group_dir)
286 self.append_to_summary("%s<br/>\n" % msg) 281 self.append_to_summary("%s<br/>\n" % msg)
287 282
288 def get_base_file_name(self, file_path): 283 def get_sample_name(self, file_path):
284 # Return the sample part of a file name.
289 base_file_name = os.path.basename(file_path) 285 base_file_name = os.path.basename(file_path)
290 if base_file_name.find(".") > 0: 286 if base_file_name.find(".") > 0:
291 # Eliminate the extension. 287 # Eliminate the extension.
292 return os.path.splitext(base_file_name)[0] 288 return os.path.splitext(base_file_name)[0]
293 elif base_file_name.find("_") > 0: 289 return base_file_name
294 # The dot extension was likely changed to
295 # the "_" character.
296 items = base_file_name.split("_")
297 return "_".join(items[0:-1])
298 else:
299 return base_file_name
300 290
301 def get_mq_val(self, record_info, filename): 291 def get_mq_val(self, record_info, filename):
302 # Get the MQ (gatk) or MQM (freebayes) value 292 # Get the MQ (gatk) or MQM (freebayes) value
303 # from the record.INFO component of the vcf file. 293 # from the record.INFO component of the vcf file.
304 try: 294 try:
331 321
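[editor's note] The body of get_mq_val is elided in this view. As background, freebayes reports the mean mapping quality of alternate-allele observations as MQM, while gatk reports RMS mapping quality as MQ; a minimal sketch of such a lookup (an assumption, not the elided body):

    # Sketch only: prefer MQM (freebayes), fall back to MQ (gatk); both
    # can arrive as a bare value or a single-element list, which the
    # class's return_val() helper exists to normalize.
    def get_mq_val_sketch(record_info):
        val = record_info.get('MQM', record_info.get('MQ'))
        return val[0] if isinstance(val, list) else val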
332 def get_position_list(self, sheet_names, group): 322 def get_position_list(self, sheet_names, group):
333 # Get a list of positions defined by an excel file. 323 # Get a list of positions defined by an excel file.
334 exclusion_list = [] 324 exclusion_list = []
335 try: 325 try:
336 filter_to_all = pandas.read_excel(self.excel_grouper_file, header=1, usecols=[group]) 326 filter_to_all = pandas.read_excel(self.input_excel, header=1, usecols=[group])
337 for value in filter_to_all.values: 327 for value in filter_to_all.values:
338 value = str(value[0]) 328 value = str(value[0])
339 if "-" not in value.split(":")[-1]: 329 if "-" not in value.split(":")[-1]:
340 exclusion_list.append(value) 330 exclusion_list.append(value)
341 elif "-" in value: 331 elif "-" in value:
346 value = sequence_range.split("-") 336 value = sequence_range.split("-")
347 for position in range(int(value[0].replace(',', '')), int(value[1].replace(',', '')) + 1): 337 for position in range(int(value[0].replace(',', '')), int(value[1].replace(',', '')) + 1):
348 exclusion_list.append(chrom + ":" + str(position)) 338 exclusion_list.append(chrom + ":" + str(position))
349 return exclusion_list 339 return exclusion_list
350 except ValueError: 340 except ValueError:
351 exclusion_list = [] 341 return []
352 return exclusion_list
353 342
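[editor's note] A worked example of the expansion logic above, using hypothetical coordinates: a cell holding a bare position is appended as-is, while a range is split on "-" and expanded, with commas stripped from the bounds before the int() conversion.

    # 'NC_002945.4:4200'        -> ['NC_002945.4:4200']
    # 'NC_002945.4:1,000-1,003' -> four entries, 1000 through 1003
    chrom, sequence_range = 'NC_002945.4:1,000-1,003'.split(':')
    start, end = sequence_range.split('-')
    expanded = [chrom + ':' + str(p)
                for p in range(int(start.replace(',', '')), int(end.replace(',', '')) + 1)]
    assert expanded[0] == 'NC_002945.4:1000' and expanded[-1] == 'NC_002945.4:1003'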
354 def get_snps(self, task_queue, timeout): 343 def get_snps(self, task_queue, timeout):
355 while True: 344 while True:
356 try: 345 try:
357 group_dir = task_queue.get(block=True, timeout=timeout) 346 group_dir = task_queue.get(block=True, timeout=timeout)
358 except queue.Empty: 347 except queue.Empty:
359 break 348 break
360 # Parse all vcf files to accumulate SNPs into a 349 # Parse all vcf files to accumulate
361 # data frame. 350 # the SNPs into a data frame.
362 positions_dict = {} 351 positions_dict = {}
363 group_files = [] 352 group_files = []
364 for file_name in os.listdir(os.path.abspath(group_dir)): 353 for file_name in os.listdir(os.path.abspath(group_dir)):
365 file_path = os.path.abspath(os.path.join(group_dir, file_name)) 354 file_path = os.path.abspath(os.path.join(group_dir, file_name))
366 group_files.append(file_path) 355 group_files.append(file_path)
367 for file_name in group_files: 356 for file_name in group_files:
368 try: 357 found_positions, found_positions_mix = self.find_initial_positions(file_name)
369 found_positions, found_positions_mix = self.find_initial_positions(file_name) 358 positions_dict.update(found_positions)
370 positions_dict.update(found_positions)
371 except Exception as e:
372 self.append_to_summary("Error updating the positions_dict dictionary when processing file %s:\n%s\n" % (file_name, str(e)))
373 # Order before adding to file to match 359 # Order before adding to file to match
374 # with ordering of individual samples. 360 # with ordering of individual samples.
375 # all_positions is abs_pos:REF 361 # all_positions is abs_pos:REF
376 self.all_positions = OrderedDict(sorted(positions_dict.items())) 362 self.all_positions = OrderedDict(sorted(positions_dict.items()))
377 ref_positions_df = pandas.DataFrame(self.all_positions, index=['root']) 363 ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
392 self.gather_and_filter(prefilter_df, mq_averages, group_dir) 378 self.gather_and_filter(prefilter_df, mq_averages, group_dir)
393 task_queue.task_done() 379 task_queue.task_done()
394 380
395 def group_vcfs(self, vcf_files): 381 def group_vcfs(self, vcf_files):
396 # Parse an excel file to produce a 382 # Parse an excel file to produce a
397 # grouping dictionary for filtering SNPs. 383 # grouping dictionary for SNPs.
398 xl = pandas.ExcelFile(self.excel_grouper_file) 384 xl = pandas.ExcelFile(self.input_excel)
399 sheet_names = xl.sheet_names 385 sheet_names = xl.sheet_names
400 ws = pandas.read_excel(self.excel_grouper_file, sheet_name=sheet_names[0]) 386 ws = pandas.read_excel(self.input_excel, sheet_name=sheet_names[0])
401 defining_snps = ws.iloc[0] 387 defining_snps = ws.iloc[0]
402 defsnp_iterator = iter(defining_snps.iteritems()) 388 defsnp_iterator = iter(defining_snps.iteritems())
403 next(defsnp_iterator) 389 next(defsnp_iterator)
404 defining_snps = {} 390 defining_snps = {}
405 inverted_defining_snps = {} 391 inverted_defining_snps = {}
427 def initiate_summary(self, output_summary): 413 def initiate_summary(self, output_summary):
428 # Output summary file handle. 414 # Output summary file handle.
429 self.append_to_summary('<html>\n') 415 self.append_to_summary('<html>\n')
430 self.append_to_summary('<head></head>\n') 416 self.append_to_summary('<head></head>\n')
431 self.append_to_summary('<body style=\"font-size:12px;">') 417 self.append_to_summary('<body style=\"font-size:12px;">')
432 self.append_to_summary("<b>Time started:</b> %s<br/>" % str(get_time_stamp())) 418 self.append_to_summary("<b>Time started:</b> %s<br/>" % get_time_stamp())
433 self.append_to_summary("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files) 419 self.append_to_summary("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files)
434 self.append_to_summary("<b>Reference:</b> %s<br/>" % str(self.reference)) 420 self.append_to_summary("<b>Reference:</b> %s<br/>" % self.dbkey)
435 self.append_to_summary("<b>All isolates:</b> %s<br/>" % str(self.all_isolates)) 421 self.append_to_summary("<b>All isolates:</b> %s<br/>" % str(self.all_isolates))
436 422
437 def return_val(self, val, index=0): 423 def return_val(self, val, index=0):
438 # Handle element and single-element list values. 424 # Handle element and single-element list values.
439 if isinstance(val, list): 425 if isinstance(val, list):
448 # val is likely None here. 434 # val is likely None here.
449 return 0 435 return 0
450 436
451 437
452 if __name__ == '__main__': 438 if __name__ == '__main__':
439
453 parser = argparse.ArgumentParser() 440 parser = argparse.ArgumentParser()
454 441
455 parser.add_argument('--all_isolates', action='store', dest='all_isolates', required=False, default="No", help='Create table with all isolates'), 442 parser.add_argument('--ac', action='store', dest='ac', type=int, help='Allele count value'),
456 parser.add_argument('--excel_grouper_file', action='store', dest='excel_grouper_file', required=False, default=None, help='Optional Excel filter file'), 443 parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', required=False, default=False, help='Create table with all isolates'),
444 parser.add_argument('--input_excel', action='store', dest='input_excel', required=False, default=None, help='Optional Excel filter file'),
445 parser.add_argument('--input_vcf_dir', action='store', dest='input_vcf_dir', help='Input vcf directory'),
446 parser.add_argument('--min_mq', action='store', dest='min_mq', type=int, help='Minimum map quality value'),
447 parser.add_argument('--min_quality_score', action='store', dest='min_quality_score', type=int, help='Minimum quality score value'),
448 parser.add_argument('--output_json_avg_mq_dir', action='store', dest='output_json_avg_mq_dir', help='Output json average mq directory'),
449 parser.add_argument('--output_json_snps_dir', action='store', dest='output_json_snps_dir', help='Output json snps directory'),
450 parser.add_argument('--output_snps_dir', action='store', dest='output_snps_dir', help='Output snps directory'),
457 parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file'), 451 parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file'),
458 parser.add_argument('--reference', action='store', dest='reference', help='Reference file'), 452 parser.add_argument('--processes', action='store', dest='processes', type=int, help='Configured processes for job'),
459 parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') 453 parser.add_argument('--quality_score_n_threshold', action='store', dest='quality_score_n_threshold', type=int, help='Minimum quality score N value for alleles'),
454 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Galaxy genome build dbkey'),
460 455
461 args = parser.parse_args() 456 args = parser.parse_args()
462 457
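[editor's note] With the constants now exposed as arguments, a hypothetical invocation might look like the following; the threshold values mirror what the previous revision hard-coded below, the directory names mirror the constants it removed, and the dbkey value is illustrative only:

    python vsnp_get_snps.py \
        --ac 2 \
        --min_mq 56 \
        --quality_score_n_threshold 50 \
        --min_quality_score 150 \
        --input_vcf_dir 'input_vcf_dir' \
        --output_json_avg_mq_dir 'output_json_avg_mq_dir' \
        --output_json_snps_dir 'output_json_snps_dir' \
        --output_snps_dir 'output_snps_dir' \
        --output_summary 'output_summary.html' \
        --processes 4 \
        --dbkey 'Mbovis_AF212297' \
        --all_isolates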
463 # Initializations - TODO: should these be passed in as command line args? 458 # Build the list of all input zero coverage vcf
464 ac = 2 459 # files, both the samples and the "database".
465 mq_val = 56
466 n_threshold = 50
467 qual_threshold = 150
468
469 # Build the list of sample vcf files for the current run.
470 vcf_files = [] 460 vcf_files = []
471 for file_name in os.listdir(INPUT_VCF_DIR): 461 for file_name in os.listdir(args.input_vcf_dir):
472 file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name)) 462 file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name))
473 vcf_files.append(file_path) 463 vcf_files.append(file_path)
474 464
475 multiprocessing.set_start_method('spawn') 465 multiprocessing.set_start_method('spawn')
476 queue1 = multiprocessing.JoinableQueue() 466 queue1 = multiprocessing.JoinableQueue()
477 num_files = len(vcf_files) 467 num_files = len(vcf_files)
478 cpus = set_num_cpus(num_files, args.processes) 468 cpus = set_num_cpus(num_files, args.processes)
479 # Set a timeout for get()s in the queue. 469 # Set a timeout for get()s in the queue.
480 timeout = 0.05 470 timeout = 0.05
481 471
482 # Initialize the snp_finder object. 472 # Initialize the snp_finder object.
483 snp_finder = SnpFinder(num_files, args.reference, args.excel_grouper_file, args.all_isolates, 473 snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary)
484 ac, mq_val, n_threshold, qual_threshold, args.output_summary) 474
485 475 # Define and make the set of directories into which the input_zc_vcf
486 # Initialize the set of directories containing vcf files for analysis. 476 # an Excel file for grouping and filtering, creating a group with
477 # an Excel file for grouping and filtering, creating a group with
478 # all isolates) are used to define the directories.
487 vcf_dirs = [] 479 vcf_dirs = []
488 if args.excel_grouper_file is None: 480 if args.input_excel is None:
489 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) 481 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
490 else: 482 else:
491 if args.all_isolates.lower() == "yes": 483 if args.all_isolates:
492 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs) 484 vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
493 # Parse the Excel file to determine groups for filtering. 485 # Parse the Excel file to determine groups for filtering.
494 snp_finder.group_vcfs(vcf_files) 486 snp_finder.group_vcfs(vcf_files)
495 # Append the list of group directories created by 487 # Append the list of group directories created by
496 # the above call to the set of directories containing 488 # the above call to the set of directories containing
497 # vcf files for analysis 489 # vcf files for analysis.
498 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups] 490 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups]
499 vcf_dirs.extend(group_dirs) 491 vcf_dirs.extend(group_dirs)
500 492
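[editor's note] The lines below populate the queue that the workers drain; the worker start-up itself falls outside this excerpt. A self-contained illustration of the spawn/JoinableQueue pattern that get_snps() participates in (the worker body and counts are stand-ins, not the changeset's code):

    import multiprocessing
    import queue

    def worker(task_queue, timeout):
        # Mirrors the get_snps() loop: block on get() with a timeout,
        # exit on queue.Empty, acknowledge each item with task_done().
        while True:
            try:
                item = task_queue.get(block=True, timeout=timeout)
            except queue.Empty:
                break
            print('processing', item)  # stand-in for the SNP gathering
            task_queue.task_done()

    if __name__ == '__main__':
        multiprocessing.set_start_method('spawn')
        q = multiprocessing.JoinableQueue()
        for d in ['all_vcf', 'groupA']:  # stand-ins for vcf_dirs
            q.put(d)
        procs = [multiprocessing.Process(target=worker, args=(q, 0.05))
                 for _ in range(2)]
        for p in procs:
            p.start()
        q.join()  # returns once every queued item is task_done()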
501 # Populate the queue for job splitting. 493 # Populate the queue for job splitting.
502 for vcf_dir in vcf_dirs: 494 for vcf_dir in vcf_dirs: