comparison pedify.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:64e75e21466e
1 import sys
2 import csv
3 import argparse
4
5 DEBUG = 0
6
7 REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column',
8 'sample_id_column', 'chromosome_column', 'position_column',
9 'variant_id_column']
10
11 GENOTYPE_DICT = {
12 "'0/0": "hom_ref",
13 "'0/1": "het",
14 "'1/1": "hom_alt",
15 "'1/2": "tri_allelic"
16 }
17
18 GENOTYPE_TO_NUMERIC = {
19 "'0/0": 0,
20 "'0/1": 1,
21 "'1/1": 2,
22 "'1/2": 2
23 }
24
25 class PedConverter:
26 def __init__(self):
27 self.cfg = None
28 self.samples = {}
29 self.sites = {}
30 self.xsamples = []
31
32 def verify_column_names(self, datafile_header):
33 # check all the config column names actually exist in the data file
34 for col in self.cfg.col_names.values():
35 if col not in datafile_header:
36 print "The '{}' column was not found in the datafile! Double check your config file is correct. Exiting...".format(
37 col)
38 sys.exit(1)
39
40 def verify_filters(self, datafile_header):
41 # print warning messages if filters invalid
42 all_filters = self.cfg.nfilters.copy()
43 all_filters.update(self.cfg.sfilters)
44 for key in all_filters:
45 col_name = all_filters[key]["col_name"]
46 if col_name not in datafile_header:
47 print "Warning! The '{}' filter was not applied as the datafile does not contain the column '{}'".format(
48 key, col_name)
49
50 def read_config_file(self, cfname):
51 self.cfg = ConfigSettings()
52 rc = self.cfg.parse_config_file(cfname)
53 return rc
54
55 def read_data_file(self, dfname):
56 if (self.cfg == None) or (not self.cfg.is_valid()):
57 return 1
58
59 datafile = open(dfname, 'r')
60 dreader = csv.DictReader(datafile, delimiter='\t')
61 # verify datafile data matches config file
62 self.verify_column_names(dreader.fieldnames)
63 self.verify_filters(dreader.fieldnames)
64 all_sample_ids = set()
65 i = 0
66
67 for row in dreader:
68 failed_filters = self.filter_all(row)
69 sample_key = row[self.cfg.col_names['sample_id_column']]
70 all_sample_ids.add(sample_key)
71 if not failed_filters:
72 # add to sample dict
73 # key is a tuple made up of which chromosome the snp is found on
74 # and the position on the chromosome itself
75 SNP_key = (row[self.cfg.col_names['chromosome_column']], int(row[self.cfg.col_names['position_column']]))
76 genotype = row[self.cfg.col_names['genotype_column']]
77
78 # create a dictionary for each sample (person); each person is associated
79 # with another dictionary of all the SNPs found in that person
80 if sample_key not in self.samples:
81 self.samples[sample_key] = {SNP_key: genotype}
82 else:
83 self.samples[sample_key][SNP_key] = genotype
84
85 # create a dict of all the sites where SNPs exist
86 if SNP_key not in self.sites:
87 # generate arbitrary ID's if there is no pre-existing ID for the SNP
88 if row[self.cfg.col_names['variant_id_column']] == '.':
89 SNP_id = "SNP_" + str(i)
90 else:
91 SNP_id = row[self.cfg.col_names['variant_id_column']]
92
93 SNP_data = {'ref_col': row[self.cfg.col_names['reference_column']],
94 'alt_col': row[self.cfg.col_names['alternate_column']],
95 'SNP_id': SNP_id}
96 self.sites[SNP_key] = SNP_data
97 i += 1
98
99 # make sure every sample contains a genotype for every SNP found
100 for sample_key in self.samples:
101 this_sample = self.samples[sample_key]
102 for SNP_key in self.sites:
103 if SNP_key not in this_sample:
104 this_sample[SNP_key] = "'0/0"
105 datafile.close()
106
107 # get list of samples which were filtered out
108 self.xsamples = list(all_sample_ids.difference(set(self.samples.keys())))
109 return 0
110
111 # returns key of the specific filter/s that failed
112 def filter_numeric(self, row):
113 failed_filters = set()
114 for key in self.cfg.nfilters.keys():
115 nfilter = self.cfg.nfilters[key]
116 cutoff = float(nfilter["cutoff"])
117 op = nfilter["op"]
118 col_name = nfilter["col_name"]
119 if col_name in row.keys():
120 cv = float(row[col_name])
121 if not string_as_operator(cv, cutoff, op):
122 failed_filters.add(key)
123
124 return failed_filters
125
126 # returns key of the specific filter/s that failed
127 def filter_string(self, row):
128 failed_filters = set()
129 for key in self.cfg.sfilters.keys():
130 sfilter = self.cfg.sfilters[key]
131 col_name = sfilter["col_name"]
132 if col_name in row.keys():
133 cs = row[col_name]
134 af = sfilter['accept_flag']
135 ef = sfilter['exact_flag']
136 patterns = sfilter['patterns']
137 if ef:
138 if af:
139 passed = False
140 for p in patterns:
141 if p == cs:
142 passed = True
143 break
144 if passed == False:
145 failed_filters.add(key)
146 else:
147 for p in patterns:
148 if p == cs:
149 failed_filters.add(key)
150 break
151 else:
152 if af:
153 passed = False
154 for p in patterns:
155 if p in cs:
156 passed = True
157 break
158 if passed == False:
159 failed_filters.add(key)
160 else:
161 for p in patterns:
162 if p in cs:
163 failed_filters.add(key)
164 break
165
166 return failed_filters
167
168 def filter_all(self, row):
169 return self.filter_numeric(row).union(self.filter_string(row))
170
171 def create_ped_file(self, filename, numeric=False):
172 output = ""
173
174 sorted_sample_keys = sorted(self.samples.keys())
175 for sample_key in sorted_sample_keys:
176 this_sample = self.samples[sample_key]
177 sorted_site_keys = sorted(this_sample.keys())
178 variants = []
179 if numeric:
180 pef = sample_key
181 else:
182 pef = self.create_ped_entry_front(sample_key)
183
184 for SNP_key in sorted_site_keys:
185 genotype = this_sample[SNP_key]
186 if numeric == True:
187 variants.append(str(GENOTYPE_TO_NUMERIC[genotype]))
188 else:
189 variants.append(genotype_to_bases(genotype, self.sites[SNP_key]))
190
191 output += "{}\t{}\n".format(pef, '\t'.join(variants))
192
193 pedfile = open(filename, 'w')
194 pedfile.write(output)
195 pedfile.close()
196
197 def create_ped_entry_front(self, sample_id):
198 if self.cfg.control_info["control_tag"]["tag"] in sample_id:
199 group = 2
200 elif self.cfg.control_info["cases_tag"]["tag"] in sample_id:
201 group = 1
202 else:
203 group = 1
204
205 entry = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
206 sample_id,
207 sample_id,
208 sample_id + "_F",
209 sample_id + "_M",
210 2,
211 group)
212
213 return entry
214
215 def create_map_file(self, filename):
216 output = ""
217 for SNP_key in sorted(self.sites.keys()):
218 chrom = SNP_key[0]
219 SNP_id = self.sites[SNP_key]['SNP_id']
220 posn = SNP_key[1]
221 output += "{}\t{}\t{}\n".format(chrom, SNP_id, str(posn))
222
223 mapfile = open(filename, 'w')
224 mapfile.write(output)
225 mapfile.close()
226
227 def create_excluded_samples_file(self, filename):
228 xsfile = open(filename, 'w')
229 xsfile.write('\n'.join(self.xsamples))
230 xsfile.close()
231
232 class ConfigSettings:
233
234 SECTIONS = [
235 "#control",
236 "#column_names",
237 "#numeric_filters",
238 "#string_filters"
239 ]
240
241 def __init__(self):
242 self.control_info = {}
243 self.col_names = {}
244 self.nfilters = {}
245 self.sfilters = {}
246
247 def parse_config_file(self, cfname):
248 cffile = open(cfname, 'r')
249 section = None
250 rc = 0
251
252 for line in cffile:
253 # clean trailing/leading whitespace/newlines
254 line = line.strip()
255 # set section flag
256 if line[0] == '#':
257 if line in ConfigSettings.SECTIONS:
258 section = line
259 else:
260 continue
261 else:
262 # fill up config dicts
263 if section == "#control":
264 (key, col_name, tag) = line.split(',')
265 self.control_info[key] = {'col_name': col_name, 'tag': tag}
266 elif section == "#column_names":
267 (key, col_name) = line.split(',')
268 self.col_names[key] = col_name
269 elif section == "#numeric_filters":
270 (key, col_name, op, cutoff) = line.split(',')
271 self.add_numeric_filter(key, col_name, op, float(cutoff))
272 elif section == "#string_filters":
273 (key, col_name, exact_flag, accept_flag) = line.split(',')
274 patterns = next(cffile).strip().split(',')
275 self.add_string_filter(key, col_name, exact_flag, accept_flag, patterns)
276 else:
277 rc = 2
278 break
279
280 cffile.close()
281 if rc != 0:
282 return rc
283 if not self.is_valid():
284 rc = 3
285 return rc
286
287
288 def is_valid(self):
289 for k in REQ_KEYS:
290 if k not in self.col_names.keys():
291 return False
292 return True
293
294 def add_numeric_filter(self, key, col_name, op, cutoff):
295 self.nfilters[key] = {
296 'col_name': col_name,
297 'op': op,
298 'cutoff': cutoff
299 }
300
301 def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns):
302 if exact_flag == "exact":
303 ef = True
304 elif exact_flag == "not_exact":
305 ef = False
306 else:
307 return False
308
309 if accept_flag == "accept":
310 af = True
311 elif accept_flag == "reject":
312 af = False
313 else:
314 return False
315
316 self.sfilters[key] = {
317 'col_name': col_name,
318 'exact_flag': ef,
319 'accept_flag': af,
320 'patterns': patterns
321 }
322
323 def __str__(self):
324 rv = "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format(
325 self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters)
326 return rv
327
328
329 ### Utility ###
330 def string_as_operator(arg1, arg2, op):
331 if op == "==":
332 return arg1 == arg2
333 elif op == ">":
334 return arg1 > arg2
335 elif op == "<":
336 return arg1 < arg2
337 elif op == "<=":
338 return arg1 <= arg2
339 elif op == ">=":
340 return arg1 >= arg2
341
342 def genotype_to_bases(genotype, SNPdata):
343 bases = ""
344 if genotype in GENOTYPE_DICT:
345 gtype = GENOTYPE_DICT[genotype]
346 if gtype == "hom_ref":
347 bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col'])
348 elif gtype == "hom_alt":
349 bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col'])
350 elif gtype == "het":
351 bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col'])
352 elif gtype == "tri_allelic":
353 aa_col = SNPdata['alt_col']
354 if len(aa_col) > 1:
355 # arbitrarily choose the first one
356 alt_allele = aa_col[0]
357 else:
358 alt_allele = aa_col
359 bases = "{} {}".format(alt_allele, alt_allele)
360 else:
361 print genotype
362 print "Unrecognized genotype!"
363 sys.exit(1)
364 return bases
365
366 ### Main ###
367 def main():
368 # argument parsing
369 parser = argparse.ArgumentParser()
370 parser.add_argument("dfname", help="name of input data file")
371 parser.add_argument("cfname", help="name of input configuration file")
372 parser.add_argument("pfname", help="name of output ped file")
373 parser.add_argument("mfname", help="name of output map file")
374 parser.add_argument("xsname", help="name of output file containing exact IDs of samples who were excluded")
375 args = parser.parse_args()
376
377 pc = PedConverter()
378 # read in config file
379 rc = pc.read_config_file(args.cfname)
380 if rc == 0:
381 print 'config file read successfully'
382 else:
383 print 'failed to read in config file successfully. Error code: {}'.format(rc)
384 # read in data file
385 rc = pc.read_data_file(args.dfname)
386 if rc == 0:
387 print 'data file read successfully'
388 else:
389 print 'failed to read in data file successfully. Error code: {}'.format(rc)
390 pc.create_ped_file(args.pfname, numeric=True)
391 pc.create_map_file(args.mfname)
392 pc.create_excluded_samples_file(args.xsname)
393
394 if __name__ == "__main__":
395 main()