annotate pedify.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64e75e21466e Uploaded
pmac
parents:
diff changeset
1 import sys
64e75e21466e Uploaded
pmac
parents:
diff changeset
2 import csv
64e75e21466e Uploaded
pmac
parents:
diff changeset
3 import argparse
64e75e21466e Uploaded
pmac
parents:
diff changeset
4
64e75e21466e Uploaded
pmac
parents:
diff changeset
# Debug switch; not referenced by any code visible in this file.
DEBUG = 0

# Keys that must appear in the "#column_names" section of the config file
# (checked by ConfigSettings.is_valid).
REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column',
            'sample_id_column', 'chromosome_column', 'position_column',
            'variant_id_column']

# Genotype string -> zygosity label used by genotype_to_bases().
# NOTE: the leading apostrophe is part of every key; genotype values read
# from the data file are looked up verbatim (presumably a spreadsheet
# text-escape in the input -- confirm against real data).
GENOTYPE_DICT = {
    "'0/0": "hom_ref",
    "'0/1": "het",
    "'1/1": "hom_alt",
    "'1/2": "tri_allelic",
}

# Genotype string -> numeric code written by create_ped_file(numeric=True).
# "'1/2" is given the same code as "'1/1".
GENOTYPE_TO_NUMERIC = {
    "'0/0": 0,
    "'0/1": 1,
    "'1/1": 2,
    "'1/2": 2,
}
64e75e21466e Uploaded
pmac
parents:
diff changeset
24
64e75e21466e Uploaded
pmac
parents:
diff changeset
class PedConverter:
    """Read a tab-delimited variant table and write ped/map style outputs.

    Attributes:
        cfg: settings object installed by read_config_file(); None until then.
        samples: dict of sample id -> {(chromosome, position): genotype}.
        sites: dict of (chromosome, position) -> dict with the keys
            'ref_col', 'alt_col' and 'SNP_id'.
        xsamples: sorted list of sample ids that appear in the data file but
            for which no row passed the filters.
    """

    def __init__(self):
        self.cfg = None
        self.samples = {}
        self.sites = {}
        self.xsamples = []

    def verify_column_names(self, datafile_header):
        """Exit with status 1 if a configured column is missing from the header.

        Args:
            datafile_header: sequence of column names found in the data file.
        """
        for col in self.cfg.col_names.values():
            if col not in datafile_header:
                print("The '{}' column was not found in the datafile! Double check your config file is correct. Exiting...".format(
                    col))
                sys.exit(1)

    def verify_filters(self, datafile_header):
        """Print a warning for every filter whose column is not in the header.

        Args:
            datafile_header: sequence of column names found in the data file.
        """
        # Merge into a copy so the configured filter dicts are not modified.
        all_filters = self.cfg.nfilters.copy()
        all_filters.update(self.cfg.sfilters)
        for key in all_filters:
            col_name = all_filters[key]["col_name"]
            if col_name not in datafile_header:
                print("Warning! The '{}' filter was not applied as the datafile does not contain the column '{}'".format(
                    key, col_name))

    def read_config_file(self, cfname):
        """Parse the config file `cfname` into self.cfg.

        Returns:
            The return code of ConfigSettings.parse_config_file (0 on success).
        """
        self.cfg = ConfigSettings()
        return self.cfg.parse_config_file(cfname)

    def read_data_file(self, dfname):
        """Load the tab-delimited data file `dfname` into samples and sites.

        Rows that fail any filter are skipped, but their sample id is still
        remembered so that samples with no surviving rows end up in
        self.xsamples. After reading, every sample is given the genotype
        "'0/0" for each recorded site it has no row for.

        Returns:
            1 if no valid configuration is loaded, 0 otherwise.
        """
        if self.cfg is None or not self.cfg.is_valid():
            return 1

        cols = self.cfg.col_names
        all_sample_ids = set()
        next_id = 0

        # 'with' guarantees the file is closed even if a row fails to parse.
        with open(dfname, 'r') as datafile:
            dreader = csv.DictReader(datafile, delimiter='\t')
            # fieldnames is None for an empty file; treat that as "no columns"
            # so the missing-column message is reported instead of a TypeError.
            header = dreader.fieldnames or []
            # verify datafile data matches config file
            self.verify_column_names(header)
            self.verify_filters(header)

            for row in dreader:
                failed_filters = self.filter_all(row)
                sample_key = row[cols['sample_id_column']]
                all_sample_ids.add(sample_key)
                if failed_filters:
                    continue

                # key is a tuple made up of which chromosome the snp is found
                # on and the position on the chromosome itself
                SNP_key = (row[cols['chromosome_column']],
                           int(row[cols['position_column']]))
                genotype = row[cols['genotype_column']]

                # each sample (person) maps to a dict of all the SNPs found
                # in that person
                self.samples.setdefault(sample_key, {})[SNP_key] = genotype

                # record every site where a SNP exists
                if SNP_key not in self.sites:
                    variant_id = row[cols['variant_id_column']]
                    # generate arbitrary ID's if there is no pre-existing ID
                    if variant_id == '.':
                        SNP_id = "SNP_" + str(next_id)
                    else:
                        SNP_id = variant_id

                    self.sites[SNP_key] = {
                        'ref_col': row[cols['reference_column']],
                        'alt_col': row[cols['alternate_column']],
                        'SNP_id': SNP_id,
                    }
                    # NOTE(review): the original indentation was lost; the
                    # counter is assumed to advance once per newly recorded
                    # site. Generated ids are unique either way -- confirm.
                    next_id += 1

        # make sure every sample contains a genotype for every SNP found
        for this_sample in self.samples.values():
            for SNP_key in self.sites:
                this_sample.setdefault(SNP_key, "'0/0")

        # samples which were filtered out entirely (sorted for stable output)
        self.xsamples = sorted(all_sample_ids.difference(self.samples))
        return 0

    def filter_numeric(self, row):
        """Return the set of numeric-filter keys that `row` fails.

        A filter whose column is absent from `row` is skipped. The column
        value is converted with float(), so a non-numeric value raises
        ValueError.
        """
        failed_filters = set()
        for key, nfilter in self.cfg.nfilters.items():
            cutoff = float(nfilter["cutoff"])
            op = nfilter["op"]
            col_name = nfilter["col_name"]
            if col_name in row:
                cv = float(row[col_name])
                if not string_as_operator(cv, cutoff, op):
                    failed_filters.add(key)

        return failed_filters

    def filter_string(self, row):
        """Return the set of string-filter keys that `row` fails.

        For each filter, a pattern matches when it equals the column value
        (exact_flag True) or is a substring of it (exact_flag False). An
        "accept" filter fails when no pattern matches; a "reject" filter
        fails when any pattern matches. Filters whose column is absent from
        `row` are skipped.
        """
        failed_filters = set()
        for key, sfilter in self.cfg.sfilters.items():
            col_name = sfilter["col_name"]
            if col_name not in row:
                continue
            cs = row[col_name]
            if sfilter['exact_flag']:
                matched = any(p == cs for p in sfilter['patterns'])
            else:
                matched = any(p in cs for p in sfilter['patterns'])
            # accept: must match; reject: must not match
            if matched != bool(sfilter['accept_flag']):
                failed_filters.add(key)

        return failed_filters

    def filter_all(self, row):
        """Return the keys of every numeric or string filter that `row` fails."""
        return self.filter_numeric(row).union(self.filter_string(row))

    def create_ped_file(self, filename, numeric=False):
        """Write one tab-separated line per sample to `filename`.

        Samples and, within each sample, sites are written in sorted order.

        Args:
            filename: path of the output file (overwritten).
            numeric: when true, each line starts with the bare sample id and
                genotypes are written as GENOTYPE_TO_NUMERIC codes; otherwise
                the line starts with create_ped_entry_front() and genotypes
                are written as base pairs via genotype_to_bases().
        """
        lines = []
        for sample_key in sorted(self.samples):
            this_sample = self.samples[sample_key]
            if numeric:
                pef = sample_key
            else:
                pef = self.create_ped_entry_front(sample_key)

            variants = []
            for SNP_key in sorted(this_sample):
                genotype = this_sample[SNP_key]
                if numeric:
                    variants.append(str(GENOTYPE_TO_NUMERIC[genotype]))
                else:
                    variants.append(genotype_to_bases(genotype, self.sites[SNP_key]))

            lines.append("{}\t{}\n".format(pef, '\t'.join(variants)))

        with open(filename, 'w') as pedfile:
            pedfile.write("".join(lines))

    def create_ped_entry_front(self, sample_id):
        """Return the six leading tab-separated fields for `sample_id`.

        The fields are: sample_id, sample_id, sample_id + "_F",
        sample_id + "_M", the constant 2, and a group code. The group code is
        2 when the configured control tag occurs in the sample id and 1
        otherwise.
        """
        # NOTE(review): PLINK's PED convention is 1 = control/unaffected and
        # 2 = case/affected, and sex 2 = female; here controls get 2 and the
        # fifth field is always 2 -- confirm this coding is intended.
        if self.cfg.control_info["control_tag"]["tag"] in sample_id:
            group = 2
        elif self.cfg.control_info["cases_tag"]["tag"] in sample_id:
            group = 1
        else:
            group = 1

        return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
            sample_id,
            sample_id,
            sample_id + "_F",
            sample_id + "_M",
            2,
            group)

    def create_map_file(self, filename):
        """Write "chromosome<TAB>SNP id<TAB>position" per site, in sorted order."""
        lines = []
        for SNP_key in sorted(self.sites):
            chrom, posn = SNP_key[0], SNP_key[1]
            SNP_id = self.sites[SNP_key]['SNP_id']
            lines.append("{}\t{}\t{}\n".format(chrom, SNP_id, str(posn)))

        with open(filename, 'w') as mapfile:
            mapfile.write("".join(lines))

    def create_excluded_samples_file(self, filename):
        """Write the excluded sample ids to `filename`, one per line."""
        with open(filename, 'w') as xsfile:
            xsfile.write('\n'.join(self.xsamples))
64e75e21466e Uploaded
pmac
parents:
diff changeset
231
64e75e21466e Uploaded
pmac
parents:
diff changeset
class ConfigSettings:
    """Settings parsed from a sectioned, comma-separated config file.

    A line starting with '#' either names a section (one of SECTIONS) or is
    ignored as a comment. Every other non-blank line is an entry of the
    current section:

        #control          key,col_name,tag
        #column_names     key,col_name
        #numeric_filters  key,col_name,op,cutoff
        #string_filters   key,col_name,exact_flag,accept_flag
                          followed by one line of comma-separated patterns

    Attributes:
        control_info: dict of key -> {'col_name': ..., 'tag': ...}.
        col_names: dict of key -> data-file column name.
        nfilters: dict of key -> {'col_name', 'op', 'cutoff'}.
        sfilters: dict of key -> {'col_name', 'exact_flag', 'accept_flag',
            'patterns'}.
    """

    SECTIONS = [
        "#control",
        "#column_names",
        "#numeric_filters",
        "#string_filters"
    ]

    def __init__(self):
        self.control_info = {}
        self.col_names = {}
        self.nfilters = {}
        self.sfilters = {}

    def parse_config_file(self, cfname):
        """Read the config file `cfname` into this object.

        Returns:
            0 on success;
            2 if an entry appears outside a known section or is malformed
              (wrong number of fields, non-numeric cutoff, unknown string
              filter flag, or a string filter without its pattern line);
            3 if parsing succeeded but a required column key is missing.
        """
        section = None
        rc = 0

        # 'with' closes the file on every exit path, including exceptions.
        with open(cfname, 'r') as cffile:
            for line in cffile:
                # clean trailing/leading whitespace/newlines
                line = line.strip()
                if not line:
                    # blank lines carry no information (and line[0] would fail)
                    continue
                if line[0] == '#':
                    # section header; any other '#' line is a comment
                    if line in ConfigSettings.SECTIONS:
                        section = line
                    continue

                # fill up config dicts
                try:
                    if section == "#control":
                        (key, col_name, tag) = line.split(',')
                        self.control_info[key] = {'col_name': col_name, 'tag': tag}
                    elif section == "#column_names":
                        (key, col_name) = line.split(',')
                        self.col_names[key] = col_name
                    elif section == "#numeric_filters":
                        (key, col_name, op, cutoff) = line.split(',')
                        self.add_numeric_filter(key, col_name, op, float(cutoff))
                    elif section == "#string_filters":
                        (key, col_name, exact_flag, accept_flag) = line.split(',')
                        # the patterns live on the line after the filter entry
                        pattern_line = next(cffile, None)
                        if pattern_line is None:
                            rc = 2
                            break
                        patterns = pattern_line.strip().split(',')
                        if not self.add_string_filter(key, col_name, exact_flag,
                                                      accept_flag, patterns):
                            # an unknown flag would otherwise silently drop
                            # the filter and let unfiltered rows through
                            rc = 2
                            break
                    else:
                        rc = 2
                        break
                except ValueError:
                    # wrong field count or a cutoff that is not a number
                    rc = 2
                    break

        if rc != 0:
            return rc
        if not self.is_valid():
            rc = 3
        return rc

    def is_valid(self):
        """Return True when every key in REQ_KEYS has a configured column name."""
        return all(k in self.col_names for k in REQ_KEYS)

    def add_numeric_filter(self, key, col_name, op, cutoff):
        """Register (or replace) the numeric filter `key`.

        Args:
            key: name of the filter.
            col_name: data-file column the filter reads.
            op: comparison operator as a string (see string_as_operator).
            cutoff: value the column is compared against.
        """
        self.nfilters[key] = {
            'col_name': col_name,
            'op': op,
            'cutoff': cutoff
        }

    def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns):
        """Register (or replace) the string filter `key`.

        Args:
            key: name of the filter.
            col_name: data-file column the filter reads.
            exact_flag: "exact" or "not_exact".
            accept_flag: "accept" or "reject".
            patterns: list of pattern strings.

        Returns:
            True if the filter was stored, False (nothing stored) if either
            flag is not one of its two allowed values.
        """
        if exact_flag == "exact":
            ef = True
        elif exact_flag == "not_exact":
            ef = False
        else:
            return False

        if accept_flag == "accept":
            af = True
        elif accept_flag == "reject":
            af = False
        else:
            return False

        self.sfilters[key] = {
            'col_name': col_name,
            'exact_flag': ef,
            'accept_flag': af,
            'patterns': patterns
        }
        return True

    def __str__(self):
        return "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format(
            self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters)
64e75e21466e Uploaded
pmac
parents:
diff changeset
327
64e75e21466e Uploaded
pmac
parents:
diff changeset
328
64e75e21466e Uploaded
pmac
parents:
diff changeset
329 ### Utility ###
64e75e21466e Uploaded
pmac
parents:
diff changeset
def string_as_operator(arg1, arg2, op):
    """Apply the comparison named by the string `op` to arg1 and arg2.

    Args:
        arg1: left operand.
        arg2: right operand.
        op: one of "==", "!=", ">", "<", "<=", ">=".

    Returns:
        The boolean result of `arg1 <op> arg2`, or None when `op` is not a
        recognised operator (callers that negate the result therefore treat
        an unknown operator as a failed comparison).
    """
    if op == "==":
        return arg1 == arg2
    if op == "!=":
        return arg1 != arg2
    if op == ">":
        return arg1 > arg2
    if op == "<":
        return arg1 < arg2
    if op == "<=":
        return arg1 <= arg2
    if op == ">=":
        return arg1 >= arg2
    return None
64e75e21466e Uploaded
pmac
parents:
diff changeset
341
64e75e21466e Uploaded
pmac
parents:
diff changeset
def genotype_to_bases(genotype, SNPdata):
    """Translate a genotype string into a space-separated pair of alleles.

    Args:
        genotype: a key of GENOTYPE_DICT (e.g. "'0/1").
        SNPdata: dict with the reference allele under 'ref_col' and the
            alternate allele(s) under 'alt_col'.

    Returns:
        "ref ref" for hom_ref, "alt alt" for hom_alt, "ref alt" for het and,
        for tri_allelic, the first alternate allele twice.

    Prints the genotype and exits with status 1 if it is not in
    GENOTYPE_DICT.
    """
    bases = ""
    if genotype in GENOTYPE_DICT:
        gtype = GENOTYPE_DICT[genotype]
        if gtype == "hom_ref":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col'])
        elif gtype == "hom_alt":
            bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col'])
        elif gtype == "het":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col'])
        elif gtype == "tri_allelic":
            # arbitrarily choose the first alternate allele.
            # Taking the first *character* truncated multi-base alleles
            # ("AT,G" -> "A"); take the first list entry instead.
            # NOTE(review): assumes alternates are comma-separated as in
            # VCF's ALT column -- confirm against the input format.
            alt_allele = SNPdata['alt_col'].split(',')[0]
            bases = "{} {}".format(alt_allele, alt_allele)
    else:
        print(genotype)
        print("Unrecognized genotype!")
        sys.exit(1)
    return bases
64e75e21466e Uploaded
pmac
parents:
diff changeset
365
64e75e21466e Uploaded
pmac
parents:
diff changeset
366 ### Main ###
64e75e21466e Uploaded
pmac
parents:
diff changeset
def main():
    """Command-line entry point: convert a data file to ped/map outputs.

    Positional arguments: data file, config file, output ped file, output
    map file, and output file listing the excluded sample ids. The ped file
    is written in numeric mode.

    Exits with the reader's non-zero return code if the config file or the
    data file cannot be read, instead of going on to write empty outputs.
    """
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("dfname", help="name of input data file")
    parser.add_argument("cfname", help="name of input configuration file")
    parser.add_argument("pfname", help="name of output ped file")
    parser.add_argument("mfname", help="name of output map file")
    parser.add_argument("xsname", help="name of output file containing exact IDs of samples who were excluded")
    args = parser.parse_args()

    pc = PedConverter()

    # read in config file
    rc = pc.read_config_file(args.cfname)
    if rc != 0:
        print('failed to read in config file successfully. Error code: {}'.format(rc))
        sys.exit(rc)
    print('config file read successfully')

    # read in data file
    rc = pc.read_data_file(args.dfname)
    if rc != 0:
        print('failed to read in data file successfully. Error code: {}'.format(rc))
        sys.exit(rc)
    print('data file read successfully')

    pc.create_ped_file(args.pfname, numeric=True)
    pc.create_map_file(args.mfname)
    pc.create_excluded_samples_file(args.xsname)


if __name__ == "__main__":
    main()