import sys
import csv
import argparse

DEBUG = 0

REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column',
            'sample_id_column', 'chromosome_column', 'position_column',
            'variant_id_column']

GENOTYPE_DICT = {
    "'0/0": "hom_ref",
    "'0/1": "het",
    "'1/1": "hom_alt",
    "'1/2": "tri_allelic"
}

GENOTYPE_TO_NUMERIC = {
    "'0/0": 0,
    "'0/1": 1,
    "'1/1": 2,
    "'1/2": 2
}
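# Note: the genotype keys above keep the leading apostrophe exactly as the genotype
# strings appear in the input data, so a value of "'0/1" maps to "het" (and to 1 in
# the numeric encoding).
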
class PedConverter:
    def __init__(self):
        self.cfg = None
        self.samples = {}
        self.sites = {}
        self.xsamples = []

    def verify_column_names(self, datafile_header):
        # check that every column named in the config file actually exists in the data file
        for col in self.cfg.col_names.values():
            if col not in datafile_header:
                print("The '{}' column was not found in the datafile! "
                      "Double-check that your config file is correct. Exiting...".format(col))
                sys.exit(1)

    def verify_filters(self, datafile_header):
        # print a warning for each filter that references a column missing from the data file
        all_filters = self.cfg.nfilters.copy()
        all_filters.update(self.cfg.sfilters)
        for key in all_filters:
            col_name = all_filters[key]["col_name"]
            if col_name not in datafile_header:
                print("Warning! The '{}' filter was not applied as the datafile "
                      "does not contain the column '{}'".format(key, col_name))
    def read_config_file(self, cfname):
        self.cfg = ConfigSettings()
        rc = self.cfg.parse_config_file(cfname)
        return rc
    def read_data_file(self, dfname):
        if (self.cfg is None) or (not self.cfg.is_valid()):
            return 1

        datafile = open(dfname, 'r')
        dreader = csv.DictReader(datafile, delimiter='\t')
        # verify that the datafile matches the config file
        self.verify_column_names(dreader.fieldnames)
        self.verify_filters(dreader.fieldnames)
        all_sample_ids = set()
        i = 0

        for row in dreader:
            failed_filters = self.filter_all(row)
            sample_key = row[self.cfg.col_names['sample_id_column']]
            all_sample_ids.add(sample_key)
            if not failed_filters:
                # add to the sample dict; the key is a tuple of the chromosome the SNP
                # is found on and the position on that chromosome
                SNP_key = (row[self.cfg.col_names['chromosome_column']],
                           int(row[self.cfg.col_names['position_column']]))
                genotype = row[self.cfg.col_names['genotype_column']]

                # create a dictionary for each sample (person); each person is associated
                # with another dictionary of all the SNPs found in that person
                if sample_key not in self.samples:
                    self.samples[sample_key] = {SNP_key: genotype}
                else:
                    self.samples[sample_key][SNP_key] = genotype

                # create a dict of all the sites where SNPs exist
                if SNP_key not in self.sites:
                    # generate an arbitrary ID if there is no pre-existing ID for the SNP
                    if row[self.cfg.col_names['variant_id_column']] == '.':
                        SNP_id = "SNP_" + str(i)
                    else:
                        SNP_id = row[self.cfg.col_names['variant_id_column']]

                    SNP_data = {'ref_col': row[self.cfg.col_names['reference_column']],
                                'alt_col': row[self.cfg.col_names['alternate_column']],
                                'SNP_id': SNP_id}
                    self.sites[SNP_key] = SNP_data
            # row counter, used to generate IDs for SNPs without one
            i += 1

        # make sure every sample contains a genotype for every SNP found
        for sample_key in self.samples:
            this_sample = self.samples[sample_key]
            for SNP_key in self.sites:
                if SNP_key not in this_sample:
                    this_sample[SNP_key] = "'0/0"
        datafile.close()

        # get the list of samples that were filtered out entirely
        self.xsamples = list(all_sample_ids.difference(set(self.samples.keys())))
        return 0
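    # After a successful read_data_file() call the converter holds data shaped like
    # the following (all values are illustrative only):
    #   self.samples  = {'SAMPLE_A': {('chr1', 12345): "'0/1", ...}, ...}
    #   self.sites    = {('chr1', 12345): {'ref_col': 'A', 'alt_col': 'G', 'SNP_id': 'rs123'}, ...}
    #   self.xsamples = ['SAMPLE_B', ...]  # samples none of whose rows passed the filters
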
    # returns the keys of any numeric filters that failed
    def filter_numeric(self, row):
        failed_filters = set()
        for key in self.cfg.nfilters.keys():
            nfilter = self.cfg.nfilters[key]
            cutoff = float(nfilter["cutoff"])
            op = nfilter["op"]
            col_name = nfilter["col_name"]
            if col_name in row.keys():
                cv = float(row[col_name])
                if not string_as_operator(cv, cutoff, op):
                    failed_filters.add(key)

        return failed_filters
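    # For example (illustrative filter only), a numeric filter stored as
    #   {'col_name': 'QUAL', 'op': '>=', 'cutoff': 30.0}
    # fails on any row whose QUAL value does not satisfy "value >= 30.0",
    # since a row must satisfy the configured comparison to pass.
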
    # returns the keys of any string filters that failed
    def filter_string(self, row):
        failed_filters = set()
        for key in self.cfg.sfilters.keys():
            sfilter = self.cfg.sfilters[key]
            col_name = sfilter["col_name"]
            if col_name in row.keys():
                cs = row[col_name]
                af = sfilter['accept_flag']
                ef = sfilter['exact_flag']
                patterns = sfilter['patterns']
                if ef:
                    # exact matching: compare the whole field against each pattern
                    if af:
                        passed = False
                        for p in patterns:
                            if p == cs:
                                passed = True
                                break
                        if not passed:
                            failed_filters.add(key)
                    else:
                        for p in patterns:
                            if p == cs:
                                failed_filters.add(key)
                                break
                else:
                    # substring matching: look for each pattern anywhere in the field
                    if af:
                        passed = False
                        for p in patterns:
                            if p in cs:
                                passed = True
                                break
                        if not passed:
                            failed_filters.add(key)
                    else:
                        for p in patterns:
                            if p in cs:
                                failed_filters.add(key)
                                break

        return failed_filters
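    # The four string-filter modes behave as follows:
    #   exact + accept:     the field must equal one of the patterns, or the filter fails
    #   exact + reject:     the field must equal none of the patterns, or the filter fails
    #   substring + accept: some pattern must occur within the field, or the filter fails
    #   substring + reject: no pattern may occur within the field, or the filter fails
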
    def filter_all(self, row):
        return self.filter_numeric(row).union(self.filter_string(row))
    def create_ped_file(self, filename, numeric=False):
        output = ""

        sorted_sample_keys = sorted(self.samples.keys())
        for sample_key in sorted_sample_keys:
            this_sample = self.samples[sample_key]
            sorted_site_keys = sorted(this_sample.keys())
            variants = []
            if numeric:
                pef = sample_key
            else:
                pef = self.create_ped_entry_front(sample_key)

            for SNP_key in sorted_site_keys:
                genotype = this_sample[SNP_key]
                if numeric:
                    variants.append(str(GENOTYPE_TO_NUMERIC[genotype]))
                else:
                    variants.append(genotype_to_bases(genotype, self.sites[SNP_key]))

            output += "{}\t{}\n".format(pef, '\t'.join(variants))

        pedfile = open(filename, 'w')
        pedfile.write(output)
        pedfile.close()
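    # Each output row is the "front" fields followed by one tab-separated entry per
    # site. For a hypothetical sample 'S1' with three sites, numeric mode produces:
    #   S1    0    1    2
    # In non-numeric mode each genotype column becomes a base pair such as "A G".
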
    def create_ped_entry_front(self, sample_id):
        if self.cfg.control_info["control_tag"]["tag"] in sample_id:
            group = 2
        elif self.cfg.control_info["cases_tag"]["tag"] in sample_id:
            group = 1
        else:
            group = 1

        entry = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
            sample_id,
            sample_id,
            sample_id + "_F",
            sample_id + "_M",
            2,
            group)

        return entry
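    # The six "front" fields follow the conventional PED column order: family ID,
    # individual ID, paternal ID, maternal ID, sex code (hard-coded to 2 here), and
    # phenotype/group. For a hypothetical sample 'S1' this produces:
    #   S1    S1    S1_F    S1_M    2    <group>
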
    def create_map_file(self, filename):
        output = ""
        for SNP_key in sorted(self.sites.keys()):
            chrom = SNP_key[0]
            SNP_id = self.sites[SNP_key]['SNP_id']
            posn = SNP_key[1]
            output += "{}\t{}\t{}\n".format(chrom, SNP_id, str(posn))

        mapfile = open(filename, 'w')
        mapfile.write(output)
        mapfile.close()
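    # Each map row is "<chromosome>\t<SNP ID>\t<position>", e.g. (illustrative values):
    #   chr1    rs123    12345
    # Note this writes three columns; a standard PLINK .map file also carries a
    # genetic-distance column between the SNP ID and the position.
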
    def create_excluded_samples_file(self, filename):
        xsfile = open(filename, 'w')
        xsfile.write('\n'.join(self.xsamples))
        xsfile.close()
class ConfigSettings:

    SECTIONS = [
        "#control",
        "#column_names",
        "#numeric_filters",
        "#string_filters"
    ]

    def __init__(self):
        self.control_info = {}
        self.col_names = {}
        self.nfilters = {}
        self.sfilters = {}

    def parse_config_file(self, cfname):
        cffile = open(cfname, 'r')
        section = None
        rc = 0

        for line in cffile:
            # clean trailing/leading whitespace/newlines
            line = line.strip()
            # skip blank lines, which would otherwise be misparsed
            if not line:
                continue
            # set the section flag
            if line[0] == '#':
                if line in ConfigSettings.SECTIONS:
                    section = line
                else:
                    continue
            else:
                # fill up the config dicts
                if section == "#control":
                    (key, col_name, tag) = line.split(',')
                    self.control_info[key] = {'col_name': col_name, 'tag': tag}
                elif section == "#column_names":
                    (key, col_name) = line.split(',')
                    self.col_names[key] = col_name
                elif section == "#numeric_filters":
                    (key, col_name, op, cutoff) = line.split(',')
                    self.add_numeric_filter(key, col_name, op, float(cutoff))
                elif section == "#string_filters":
                    (key, col_name, exact_flag, accept_flag) = line.split(',')
                    # the patterns for a string filter live on the following line
                    patterns = next(cffile).strip().split(',')
                    self.add_string_filter(key, col_name, exact_flag, accept_flag, patterns)
                else:
                    rc = 2
                    break

        cffile.close()
        if rc != 0:
            return rc
        if not self.is_valid():
            rc = 3
        return rc
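    # A minimal sketch of the expected config layout, reconstructed from the parser
    # above (the column names and filter values are illustrative only):
    #
    #   #control
    #   control_tag,sample,CONTROL
    #   cases_tag,sample,CASE
    #   #column_names
    #   sample_id_column,sample
    #   genotype_column,gt
    #   reference_column,ref
    #   alternate_column,alt
    #   chromosome_column,chrom
    #   position_column,pos
    #   variant_id_column,id
    #   #numeric_filters
    #   quality,qual,>=,30
    #   #string_filters
    #   effect,effect,not_exact,accept
    #   missense,stop_gained
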
    def is_valid(self):
        for k in REQ_KEYS:
            if k not in self.col_names.keys():
                return False
        return True

    def add_numeric_filter(self, key, col_name, op, cutoff):
        self.nfilters[key] = {
            'col_name': col_name,
            'op': op,
            'cutoff': cutoff
        }
    def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns):
        if exact_flag == "exact":
            ef = True
        elif exact_flag == "not_exact":
            ef = False
        else:
            return False

        if accept_flag == "accept":
            af = True
        elif accept_flag == "reject":
            af = False
        else:
            return False

        self.sfilters[key] = {
            'col_name': col_name,
            'exact_flag': ef,
            'accept_flag': af,
            'patterns': patterns
        }

    def __str__(self):
        rv = "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format(
            self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters)
        return rv
### Utility ###
def string_as_operator(arg1, arg2, op):
    if op == "==":
        return arg1 == arg2
    elif op == ">":
        return arg1 > arg2
    elif op == "<":
        return arg1 < arg2
    elif op == "<=":
        return arg1 <= arg2
    elif op == ">=":
        return arg1 >= arg2
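# For example, string_as_operator(35.0, 30.0, ">=") returns True. An unrecognised
# operator string falls through and returns None, which filter_numeric treats as a
# failed filter.
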
def genotype_to_bases(genotype, SNPdata):
    bases = ""
    if genotype in GENOTYPE_DICT:
        gtype = GENOTYPE_DICT[genotype]
        if gtype == "hom_ref":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col'])
        elif gtype == "hom_alt":
            bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col'])
        elif gtype == "het":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col'])
        elif gtype == "tri_allelic":
            aa_col = SNPdata['alt_col']
            if len(aa_col) > 1:
                # arbitrarily choose the first alternate allele
                alt_allele = aa_col[0]
            else:
                alt_allele = aa_col
            bases = "{} {}".format(alt_allele, alt_allele)
    else:
        print(genotype)
        print("Unrecognized genotype!")
        sys.exit(1)
    return bases
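# For example (illustrative site data), the genotype "'0/1" at a site with
# {'ref_col': 'A', 'alt_col': 'G'} is rendered as the base pair "A G".
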
### Main ###
def main():
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("dfname", help="name of input data file")
    parser.add_argument("cfname", help="name of input configuration file")
    parser.add_argument("pfname", help="name of output ped file")
    parser.add_argument("mfname", help="name of output map file")
    parser.add_argument("xsname", help="name of output file containing the exact IDs of samples that were excluded")
    args = parser.parse_args()

    pc = PedConverter()
    # read in the config file
    rc = pc.read_config_file(args.cfname)
    if rc == 0:
        print('config file read successfully')
    else:
        print('failed to read in config file successfully. Error code: {}'.format(rc))
        sys.exit(rc)
    # read in the data file
    rc = pc.read_data_file(args.dfname)
    if rc == 0:
        print('data file read successfully')
    else:
        print('failed to read in data file successfully. Error code: {}'.format(rc))
        sys.exit(rc)
    pc.create_ped_file(args.pfname, numeric=True)
    pc.create_map_file(args.mfname)
    pc.create_excluded_samples_file(args.xsname)
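# Example invocation (script and file names are hypothetical):
#   python ped_converter.py variants.tsv config.txt out.ped out.map excluded_samples.txt
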
if __name__ == "__main__":
    main()