Mercurial > repos > pmac > iterativepca
comparison pedify.py @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:64e75e21466e |
---|---|
1 import sys | |
2 import csv | |
3 import argparse | |
4 | |
# Debugging flag.  NOTE(review): never consulted anywhere in this file.
DEBUG = 0

# Config keys that must all appear in the #column_names section of the
# config file for a ConfigSettings instance to be considered valid.
REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column',
            'sample_id_column', 'chromosome_column', 'position_column',
            'variant_id_column']

# Genotype call string -> descriptive label.  The leading apostrophe is part
# of the genotype strings as they appear in the datafile (presumably added
# upstream to stop spreadsheet software reinterpreting them — confirm
# against the actual data).
GENOTYPE_DICT = {
    "'0/0": "hom_ref",
    "'0/1": "het",
    "'1/1": "hom_alt",
    "'1/2": "tri_allelic"
}

# Genotype call string -> numeric code written when a numeric .ped file is
# requested; note "'1/2" (tri-allelic) is collapsed to 2.
GENOTYPE_TO_NUMERIC = {
    "'0/0": 0,
    "'0/1": 1,
    "'1/1": 2,
    "'1/2": 2
}
24 | |
class PedConverter:
    """Converts a tab-delimited variant datafile into PLINK-style .ped/.map
    output, using the column mapping and row filters described by a
    ConfigSettings instance.

    Workflow: read_config_file() -> read_data_file() -> create_*_file().
    """

    def __init__(self):
        self.cfg = None       # ConfigSettings; populated by read_config_file()
        self.samples = {}     # sample_id -> {(chrom, pos): genotype string}
        self.sites = {}       # (chrom, pos) -> {'ref_col', 'alt_col', 'SNP_id'}
        self.xsamples = []    # sample ids excluded because every row failed a filter

    def verify_column_names(self, datafile_header):
        """Exit(1) if any configured column name is missing from the header."""
        # check all the config column names actually exist in the data file
        for col in self.cfg.col_names.values():
            if col not in datafile_header:
                # single-argument print() is valid in both Python 2 and 3
                print("The '{}' column was not found in the datafile! Double check your config file is correct. Exiting...".format(
                    col))
                sys.exit(1)

    def verify_filters(self, datafile_header):
        """Warn (but continue) about filters that reference missing columns."""
        all_filters = self.cfg.nfilters.copy()
        all_filters.update(self.cfg.sfilters)
        for key in all_filters:
            col_name = all_filters[key]["col_name"]
            if col_name not in datafile_header:
                print("Warning! The '{}' filter was not applied as the datafile does not contain the column '{}'".format(
                    key, col_name))

    def read_config_file(self, cfname):
        """Parse the config file *cfname*; returns its error code (0 = ok)."""
        self.cfg = ConfigSettings()
        rc = self.cfg.parse_config_file(cfname)
        return rc

    def read_data_file(self, dfname):
        """Populate self.samples / self.sites / self.xsamples from *dfname*.

        Returns 1 when no valid config has been loaded yet, else 0.
        """
        # Fix: identity comparison with None instead of '=='.
        if self.cfg is None or not self.cfg.is_valid():
            return 1

        all_sample_ids = set()
        i = 0  # counter used to synthesize IDs for unnamed SNPs

        # Fix: file handle managed by a context manager (was open/close).
        with open(dfname, 'r') as datafile:
            dreader = csv.DictReader(datafile, delimiter='\t')
            # verify datafile data matches config file
            self.verify_column_names(dreader.fieldnames)
            self.verify_filters(dreader.fieldnames)

            for row in dreader:
                failed_filters = self.filter_all(row)
                sample_key = row[self.cfg.col_names['sample_id_column']]
                all_sample_ids.add(sample_key)
                if not failed_filters:
                    # key: (chromosome the SNP is found on, position on it)
                    SNP_key = (row[self.cfg.col_names['chromosome_column']],
                               int(row[self.cfg.col_names['position_column']]))
                    genotype = row[self.cfg.col_names['genotype_column']]

                    # each sample (person) maps to a dict of its SNP genotypes
                    self.samples.setdefault(sample_key, {})[SNP_key] = genotype

                    # record each distinct site once
                    if SNP_key not in self.sites:
                        # generate arbitrary ID's if there is no pre-existing
                        # ID ('.') for the SNP
                        if row[self.cfg.col_names['variant_id_column']] == '.':
                            SNP_id = "SNP_" + str(i)
                        else:
                            SNP_id = row[self.cfg.col_names['variant_id_column']]

                        self.sites[SNP_key] = {
                            'ref_col': row[self.cfg.col_names['reference_column']],
                            'alt_col': row[self.cfg.col_names['alternate_column']],
                            'SNP_id': SNP_id}
                # NOTE(review): i advances on every row, not only for new
                # sites, so synthesized SNP ids are unique but not
                # consecutive — confirm that is acceptable downstream.
                i += 1

        # make sure every sample contains a genotype for every SNP found;
        # missing calls default to homozygous reference
        for sample_key in self.samples:
            this_sample = self.samples[sample_key]
            for SNP_key in self.sites:
                if SNP_key not in this_sample:
                    this_sample[SNP_key] = "'0/0"

        # samples whose every row was filtered out
        self.xsamples = list(all_sample_ids.difference(set(self.samples.keys())))
        return 0

    def filter_numeric(self, row):
        """Return the set of numeric-filter keys that *row* fails."""
        failed_filters = set()
        for key in self.cfg.nfilters.keys():
            nfilter = self.cfg.nfilters[key]
            cutoff = float(nfilter["cutoff"])
            op = nfilter["op"]
            col_name = nfilter["col_name"]
            # filters on missing columns are skipped (warned about earlier)
            if col_name in row:
                cv = float(row[col_name])
                if not string_as_operator(cv, cutoff, op):
                    failed_filters.add(key)

        return failed_filters

    def filter_string(self, row):
        """Return the set of string-filter keys that *row* fails.

        exact_flag selects whole-field equality vs substring containment;
        accept_flag selects "must match a pattern" vs "must match none".
        """
        failed_filters = set()
        for key in self.cfg.sfilters.keys():
            sfilter = self.cfg.sfilters[key]
            col_name = sfilter["col_name"]
            if col_name not in row:
                # filters on missing columns are skipped
                continue
            cs = row[col_name]
            af = sfilter['accept_flag']
            ef = sfilter['exact_flag']
            patterns = sfilter['patterns']
            if ef:
                matched = any(p == cs for p in patterns)
            else:
                matched = any(p in cs for p in patterns)
            # accept-filters fail when nothing matched; reject-filters fail
            # when anything matched
            if matched != af:
                failed_filters.add(key)

        return failed_filters

    def filter_all(self, row):
        """Union of numeric- and string-filter failures for *row*."""
        return self.filter_numeric(row).union(self.filter_string(row))

    def create_ped_file(self, filename, numeric=False):
        """Write the .ped file.

        numeric=True writes 0/1/2 dosage codes with a bare sample id;
        numeric=False writes base pairs behind the 6 pedigree columns.
        """
        lines = []

        for sample_key in sorted(self.samples.keys()):
            this_sample = self.samples[sample_key]
            if numeric:
                pef = sample_key
            else:
                pef = self.create_ped_entry_front(sample_key)

            variants = []
            for SNP_key in sorted(this_sample.keys()):
                genotype = this_sample[SNP_key]
                if numeric:
                    variants.append(str(GENOTYPE_TO_NUMERIC[genotype]))
                else:
                    variants.append(genotype_to_bases(genotype, self.sites[SNP_key]))

            lines.append("{}\t{}\n".format(pef, '\t'.join(variants)))

        # Fix: context-managed handle; single join instead of O(n^2) '+='
        with open(filename, 'w') as pedfile:
            pedfile.write(''.join(lines))

    def create_ped_entry_front(self, sample_id):
        """Build the 6 leading PED columns for *sample_id*.

        NOTE(review): controls get phenotype 2 while both cases and untagged
        samples get 1 — confirm the untagged default is intended.
        """
        if self.cfg.control_info["control_tag"]["tag"] in sample_id:
            group = 2
        elif self.cfg.control_info["cases_tag"]["tag"] in sample_id:
            group = 1
        else:
            group = 1

        # family id, individual id, father id, mother id, sex code, phenotype
        entry = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
            sample_id,
            sample_id,
            sample_id + "_F",
            sample_id + "_M",
            2,
            group)

        return entry

    def create_map_file(self, filename):
        """Write the .map file: one 'chrom<TAB>id<TAB>position' line per site,
        sorted by (chrom, position).

        NOTE(review): a PLINK .map usually carries a genetic-distance column
        too; this three-column layout is preserved as-is.
        """
        lines = []
        for SNP_key in sorted(self.sites.keys()):
            chrom, posn = SNP_key
            SNP_id = self.sites[SNP_key]['SNP_id']
            lines.append("{}\t{}\t{}\n".format(chrom, SNP_id, str(posn)))

        with open(filename, 'w') as mapfile:
            mapfile.write(''.join(lines))

    def create_excluded_samples_file(self, filename):
        """Write the ids of fully-filtered-out samples, one per line."""
        with open(filename, 'w') as xsfile:
            xsfile.write('\n'.join(self.xsamples))
231 | |
class ConfigSettings:
    """Parsed representation of a pedify configuration file.

    The file is split into '#'-headed sections (see SECTIONS); each data
    line is a comma-separated record whose layout depends on the section.
    """

    # recognized section headers; unknown '#' lines act as comments
    SECTIONS = [
        "#control",
        "#column_names",
        "#numeric_filters",
        "#string_filters"
    ]

    def __init__(self):
        self.control_info = {}  # key -> {'col_name': ..., 'tag': ...}
        self.col_names = {}     # logical column key -> datafile column name
        self.nfilters = {}      # numeric filters keyed by filter name
        self.sfilters = {}      # string filters keyed by filter name

    def parse_config_file(self, cfname):
        """Parse *cfname* into the four config dicts.

        Returns 0 on success, 2 for a data line that precedes any known
        section header, 3 when a required column-name key is missing.
        """
        section = None
        rc = 0

        # Fix: context-managed file handle (was open/close).
        with open(cfname, 'r') as cffile:
            for line in cffile:
                # clean trailing/leading whitespace/newlines
                line = line.strip()
                # Fix: skip blank lines — indexing line[0] used to raise
                # IndexError on an empty line.
                if not line:
                    continue
                # set section flag; unrecognized '#' lines are skipped
                if line[0] == '#':
                    if line in ConfigSettings.SECTIONS:
                        section = line
                elif section == "#control":
                    (key, col_name, tag) = line.split(',')
                    self.control_info[key] = {'col_name': col_name, 'tag': tag}
                elif section == "#column_names":
                    (key, col_name) = line.split(',')
                    self.col_names[key] = col_name
                elif section == "#numeric_filters":
                    (key, col_name, op, cutoff) = line.split(',')
                    self.add_numeric_filter(key, col_name, op, float(cutoff))
                elif section == "#string_filters":
                    (key, col_name, exact_flag, accept_flag) = line.split(',')
                    # the pattern list occupies the following line
                    patterns = next(cffile).strip().split(',')
                    self.add_string_filter(key, col_name, exact_flag, accept_flag, patterns)
                else:
                    # data encountered before any recognized section header
                    rc = 2
                    break

        if rc != 0:
            return rc
        if not self.is_valid():
            rc = 3
        return rc

    def is_valid(self):
        """True when every key in REQ_KEYS has been configured."""
        for k in REQ_KEYS:
            if k not in self.col_names:
                return False
        return True

    def add_numeric_filter(self, key, col_name, op, cutoff):
        """Register a numeric filter: rows pass when '<col> <op> <cutoff>'."""
        self.nfilters[key] = {
            'col_name': col_name,
            'op': op,
            'cutoff': cutoff
        }

    def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns):
        """Register a string filter.

        exact_flag is "exact" / "not_exact"; accept_flag is "accept" /
        "reject".  Returns False (registering nothing) on an unrecognized
        flag, otherwise None.
        """
        if exact_flag == "exact":
            ef = True
        elif exact_flag == "not_exact":
            ef = False
        else:
            return False

        if accept_flag == "accept":
            af = True
        elif accept_flag == "reject":
            af = False
        else:
            return False

        self.sfilters[key] = {
            'col_name': col_name,
            'exact_flag': ef,
            'accept_flag': af,
            'patterns': patterns
        }

    def __str__(self):
        rv = "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format(
            self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters)
        return rv
327 | |
328 | |
329 ### Utility ### | |
def string_as_operator(arg1, arg2, op):
    """Compare *arg1* and *arg2* using the comparison named by *op*.

    Supported names: "==", ">", "<", "<=", ">=".  Returns the boolean
    result, or None when *op* is not a supported name.
    """
    comparisons = {
        "==": lambda a, b: a == b,
        ">": lambda a, b: a > b,
        "<": lambda a, b: a < b,
        "<=": lambda a, b: a <= b,
        ">=": lambda a, b: a >= b,
    }
    if op in comparisons:
        return comparisons[op](arg1, arg2)
341 | |
def genotype_to_bases(genotype, SNPdata):
    """Translate a genotype call (e.g. "'0/1") into a space-separated pair
    of allele bases taken from the site's ref/alt columns.

    Returns "" for genotype strings not present in GENOTYPE_DICT; exits the
    program if GENOTYPE_DICT maps to a label this chain does not handle.
    """
    bases = ""
    if genotype in GENOTYPE_DICT:
        gtype = GENOTYPE_DICT[genotype]
        if gtype == "hom_ref":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col'])
        elif gtype == "hom_alt":
            bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col'])
        elif gtype == "het":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col'])
        elif gtype == "tri_allelic":
            aa_col = SNPdata['alt_col']
            if len(aa_col) > 1:
                # arbitrarily choose the first character.  NOTE(review): for
                # comma-separated alts like "G,T" this picks "G" — confirm
                # that is the intended tie-break.
                alt_allele = aa_col[0]
            else:
                alt_allele = aa_col
            bases = "{} {}".format(alt_allele, alt_allele)
        else:
            # Fix: single-argument print() works on both Python 2 and 3
            # (the old 'print x' statements were Python-2-only syntax).
            print(genotype)
            print("Unrecognized genotype!")
            sys.exit(1)
    return bases
365 | |
366 ### Main ### | |
def main():
    """CLI driver: parse arguments, run the conversion, write the outputs."""
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("dfname", help="name of input data file")
    parser.add_argument("cfname", help="name of input configuration file")
    parser.add_argument("pfname", help="name of output ped file")
    parser.add_argument("mfname", help="name of output map file")
    parser.add_argument("xsname", help="name of output file containing exact IDs of samples who were excluded")
    args = parser.parse_args()

    pc = PedConverter()
    # read in config file
    rc = pc.read_config_file(args.cfname)
    if rc == 0:
        print('config file read successfully')
    else:
        print('failed to read in config file successfully. Error code: {}'.format(rc))
        # Fix: abort instead of carrying on with an unusable config
        # (the original fell through and wrote output files anyway).
        sys.exit(rc)
    # read in data file
    rc = pc.read_data_file(args.dfname)
    if rc == 0:
        print('data file read successfully')
    else:
        print('failed to read in data file successfully. Error code: {}'.format(rc))
        sys.exit(rc)
    pc.create_ped_file(args.pfname, numeric=True)
    pc.create_map_file(args.mfname)
    pc.create_excluded_samples_file(args.xsname)

if __name__ == "__main__":
    main()