iterativepca: comparison of pedify.py @ 0:cb54350e76ae (draft, default, tip)
| field | value |
|---|---|
| description | Uploaded |
| author | jason-ellul |
| date | Wed, 01 Jun 2016 03:24:56 -0400 |
| parents | |
| children | |
| parent revision | this revision |
|---|---|
| -1:000000000000 | 0:cb54350e76ae |
```python
import sys
import csv
import argparse

DEBUG = 0

REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column',
            'sample_id_column', 'chromosome_column', 'position_column',
            'variant_id_column']

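# Genotype strings in the expected datafile apparently carry a leading
# apostrophe (e.g. "'0/1"), so the lookup keys below include it too; missing
# genotypes are later filled in as "'0/0" in the same style.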
GENOTYPE_DICT = {
    "'0/0": "hom_ref",
    "'0/1": "het",
    "'1/1": "hom_alt",
    "'1/2": "tri_allelic"
}

GENOTYPE_TO_NUMERIC = {
    "'0/0": 0,
    "'0/1": 1,
    "'1/1": 2,
    "'1/2": 2
}
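
# PedConverter reads the config (parsed by ConfigSettings below) and a
# tab-delimited variant datafile, applies the configured numeric and string
# filters, and writes PED, MAP and excluded-sample output files.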
class PedConverter:
    def __init__(self):
        self.cfg = None
        self.samples = {}
        self.sites = {}
        self.xsamples = []

    def verify_column_names(self, datafile_header):
        # check all the config column names actually exist in the data file
        for col in self.cfg.col_names.values():
            if col not in datafile_header:
                print "The '{}' column was not found in the datafile! Double check your config file is correct. Exiting...".format(
                    col)
                sys.exit(1)

    def verify_filters(self, datafile_header):
        # print warning messages if filters invalid
        all_filters = self.cfg.nfilters.copy()
        all_filters.update(self.cfg.sfilters)
        for key in all_filters:
            col_name = all_filters[key]["col_name"]
            if col_name not in datafile_header:
                print "Warning! The '{}' filter was not applied as the datafile does not contain the column '{}'".format(
                    key, col_name)

    def read_config_file(self, cfname):
        self.cfg = ConfigSettings()
        rc = self.cfg.parse_config_file(cfname)
        return rc

    def read_data_file(self, dfname):
        if (self.cfg == None) or (not self.cfg.is_valid()):
            return 1

        datafile = open(dfname, 'r')
        dreader = csv.DictReader(datafile, delimiter='\t')
        # verify datafile data matches config file
        self.verify_column_names(dreader.fieldnames)
        self.verify_filters(dreader.fieldnames)
        all_sample_ids = set()
        i = 0

        for row in dreader:
            failed_filters = self.filter_all(row)
            sample_key = row[self.cfg.col_names['sample_id_column']]
            all_sample_ids.add(sample_key)
            if not failed_filters:
                # add to sample dict
                # key is a tuple made up of which chromosome the snp is found on
                # and the position on the chromosome itself
                SNP_key = (row[self.cfg.col_names['chromosome_column']], int(row[self.cfg.col_names['position_column']]))
                genotype = row[self.cfg.col_names['genotype_column']]

                # create a dictionary for each sample (person); each person is associated
                # with another dictionary of all the SNPs found in that person
                if sample_key not in self.samples:
                    self.samples[sample_key] = {SNP_key: genotype}
                else:
                    self.samples[sample_key][SNP_key] = genotype

                # create a dict of all the sites where SNPs exist
                if SNP_key not in self.sites:
                    # generate arbitrary ID's if there is no pre-existing ID for the SNP
                    if row[self.cfg.col_names['variant_id_column']] == '.':
                        SNP_id = "SNP_" + str(i)
                    else:
                        SNP_id = row[self.cfg.col_names['variant_id_column']]

                    SNP_data = {'ref_col': row[self.cfg.col_names['reference_column']],
                                'alt_col': row[self.cfg.col_names['alternate_column']],
                                'SNP_id': SNP_id}
                    self.sites[SNP_key] = SNP_data
            i += 1

        # make sure every sample contains a genotype for every SNP found
        for sample_key in self.samples:
            this_sample = self.samples[sample_key]
            for SNP_key in self.sites:
                if SNP_key not in this_sample:
                    this_sample[SNP_key] = "'0/0"
        datafile.close()

        # get list of samples which were filtered out
        self.xsamples = list(all_sample_ids.difference(set(self.samples.keys())))
        return 0
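
    # After read_data_file the two structures look roughly like this
    # (hypothetical values):
    #   self.samples = {'SAMPLE_01': {('1', 12345): "'0/1", ...}, ...}
    #   self.sites   = {('1', 12345): {'ref_col': 'A', 'alt_col': 'G', 'SNP_id': 'rs123'}, ...}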
    # returns key of the specific filter/s that failed
    def filter_numeric(self, row):
        failed_filters = set()
        for key in self.cfg.nfilters.keys():
            nfilter = self.cfg.nfilters[key]
            cutoff = float(nfilter["cutoff"])
            op = nfilter["op"]
            col_name = nfilter["col_name"]
            if col_name in row.keys():
                cv = float(row[col_name])
                if not string_as_operator(cv, cutoff, op):
                    failed_filters.add(key)

        return failed_filters

    # returns key of the specific filter/s that failed
    def filter_string(self, row):
        failed_filters = set()
        for key in self.cfg.sfilters.keys():
            sfilter = self.cfg.sfilters[key]
            col_name = sfilter["col_name"]
            if col_name in row.keys():
                cs = row[col_name]
                af = sfilter['accept_flag']
                ef = sfilter['exact_flag']
                patterns = sfilter['patterns']
                if ef:
                    if af:
                        passed = False
                        for p in patterns:
                            if p == cs:
                                passed = True
                                break
                        if passed == False:
                            failed_filters.add(key)
                    else:
                        for p in patterns:
                            if p == cs:
                                failed_filters.add(key)
                                break
                else:
                    if af:
                        passed = False
                        for p in patterns:
                            if p in cs:
                                passed = True
                                break
                        if passed == False:
                            failed_filters.add(key)
                    else:
                        for p in patterns:
                            if p in cs:
                                failed_filters.add(key)
                                break

        return failed_filters

    def filter_all(self, row):
        return self.filter_numeric(row).union(self.filter_string(row))
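
    # A numeric filter keeps a row only when "value <op> cutoff" holds, e.g. a
    # hypothetical filter ('QUAL', '>=', 30.0) fails rows whose QUAL is below 30.
    # String filters match patterns exactly or as substrings and either accept
    # or reject matching rows, depending on their flags.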
    def create_ped_file(self, filename, numeric=False):
        output = ""

        sorted_sample_keys = sorted(self.samples.keys())
        for sample_key in sorted_sample_keys:
            this_sample = self.samples[sample_key]
            sorted_site_keys = sorted(this_sample.keys())
            variants = []
            if numeric:
                pef = sample_key
            else:
                pef = self.create_ped_entry_front(sample_key)

            for SNP_key in sorted_site_keys:
                genotype = this_sample[SNP_key]
                if numeric == True:
                    variants.append(str(GENOTYPE_TO_NUMERIC[genotype]))
                else:
                    variants.append(genotype_to_bases(genotype, self.sites[SNP_key]))

            output += "{}\t{}\n".format(pef, '\t'.join(variants))

        pedfile = open(filename, 'w')
        pedfile.write(output)
        pedfile.close()

    def create_ped_entry_front(self, sample_id):
        if self.cfg.control_info["control_tag"]["tag"] in sample_id:
            group = 2
        elif self.cfg.control_info["cases_tag"]["tag"] in sample_id:
            group = 1
        else:
            group = 1

        entry = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
            sample_id,
            sample_id,
            sample_id + "_F",
            sample_id + "_M",
            2,
            group)

        return entry
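
    # The six leading PED fields written above are family ID, individual ID,
    # father ID, mother ID, sex and phenotype/group; parent IDs are synthesised
    # from the sample ID and sex is always written as 2.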
    def create_map_file(self, filename):
        output = ""
        for SNP_key in sorted(self.sites.keys()):
            chrom = SNP_key[0]
            SNP_id = self.sites[SNP_key]['SNP_id']
            posn = SNP_key[1]
            output += "{}\t{}\t{}\n".format(chrom, SNP_id, str(posn))

        mapfile = open(filename, 'w')
        mapfile.write(output)
        mapfile.close()

    def create_excluded_samples_file(self, filename):
        xsfile = open(filename, 'w')
        xsfile.write('\n'.join(self.xsamples))
        xsfile.close()
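
# A hypothetical config file consistent with ConfigSettings.parse_config_file
# below (actual column names, tags and filters depend on the datafile; note
# that the parser does not tolerate blank lines):
#
#   #control
#   control_tag,SAMPLE,CONTROL
#   cases_tag,SAMPLE,CASE
#   #column_names
#   genotype_column,GT
#   reference_column,REF
#   alternate_column,ALT
#   sample_id_column,SAMPLE
#   chromosome_column,CHROM
#   position_column,POS
#   variant_id_column,ID
#   #numeric_filters
#   quality,QUAL,>=,30
#   #string_filters
#   func,FUNC,not_exact,accept
#   exonic,splicing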
class ConfigSettings:

    SECTIONS = [
        "#control",
        "#column_names",
        "#numeric_filters",
        "#string_filters"
    ]

    def __init__(self):
        self.control_info = {}
        self.col_names = {}
        self.nfilters = {}
        self.sfilters = {}

    def parse_config_file(self, cfname):
        cffile = open(cfname, 'r')
        section = None
        rc = 0

        for line in cffile:
            # clean trailing/leading whitespace/newlines
            line = line.strip()
            # set section flag
            if line[0] == '#':
                if line in ConfigSettings.SECTIONS:
                    section = line
                else:
                    continue
            else:
                # fill up config dicts
                if section == "#control":
                    (key, col_name, tag) = line.split(',')
                    self.control_info[key] = {'col_name': col_name, 'tag': tag}
                elif section == "#column_names":
                    (key, col_name) = line.split(',')
                    self.col_names[key] = col_name
                elif section == "#numeric_filters":
                    (key, col_name, op, cutoff) = line.split(',')
                    self.add_numeric_filter(key, col_name, op, float(cutoff))
                elif section == "#string_filters":
                    (key, col_name, exact_flag, accept_flag) = line.split(',')
                    patterns = next(cffile).strip().split(',')
                    self.add_string_filter(key, col_name, exact_flag, accept_flag, patterns)
                else:
                    rc = 2
                    break

        cffile.close()
        if rc != 0:
            return rc
        if not self.is_valid():
            rc = 3
        return rc

    def is_valid(self):
        for k in REQ_KEYS:
            if k not in self.col_names.keys():
                return False
        return True

    def add_numeric_filter(self, key, col_name, op, cutoff):
        self.nfilters[key] = {
            'col_name': col_name,
            'op': op,
            'cutoff': cutoff
        }

    def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns):
        if exact_flag == "exact":
            ef = True
        elif exact_flag == "not_exact":
            ef = False
        else:
            return False

        if accept_flag == "accept":
            af = True
        elif accept_flag == "reject":
            af = False
        else:
            return False

        self.sfilters[key] = {
            'col_name': col_name,
            'exact_flag': ef,
            'accept_flag': af,
            'patterns': patterns
        }

    def __str__(self):
        rv = "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format(
            self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters)
        return rv


### Utility ###
def string_as_operator(arg1, arg2, op):
    if op == "==":
        return arg1 == arg2
    elif op == ">":
        return arg1 > arg2
    elif op == "<":
        return arg1 < arg2
    elif op == "<=":
        return arg1 <= arg2
    elif op == ">=":
        return arg1 >= arg2
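# Note: string_as_operator returns None for an unrecognised operator, which
# filter_numeric treats as a failed comparison.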

def genotype_to_bases(genotype, SNPdata):
    bases = ""
    if genotype in GENOTYPE_DICT:
        gtype = GENOTYPE_DICT[genotype]
        if gtype == "hom_ref":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col'])
        elif gtype == "hom_alt":
            bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col'])
        elif gtype == "het":
            bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col'])
        elif gtype == "tri_allelic":
            aa_col = SNPdata['alt_col']
            if len(aa_col) > 1:
                # arbitrarily choose the first one
                alt_allele = aa_col[0]
            else:
                alt_allele = aa_col
            bases = "{} {}".format(alt_allele, alt_allele)
    else:
        print genotype
        print "Unrecognized genotype!"
        sys.exit(1)
    return bases

### Main ###
def main():
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("dfname", help="name of input data file")
    parser.add_argument("cfname", help="name of input configuration file")
    parser.add_argument("pfname", help="name of output ped file")
    parser.add_argument("mfname", help="name of output map file")
    parser.add_argument("xsname", help="name of output file containing exact IDs of samples who were excluded")
    args = parser.parse_args()

    pc = PedConverter()
    # read in config file
    rc = pc.read_config_file(args.cfname)
    if rc == 0:
        print 'config file read successfully'
    else:
        print 'failed to read in config file successfully. Error code: {}'.format(rc)
    # read in data file
    rc = pc.read_data_file(args.dfname)
    if rc == 0:
        print 'data file read successfully'
    else:
        print 'failed to read in data file successfully. Error code: {}'.format(rc)
    pc.create_ped_file(args.pfname, numeric=True)
    pc.create_map_file(args.mfname)
    pc.create_excluded_samples_file(args.xsname)

if __name__ == "__main__":
    main()
```
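
For reference, a minimal sketch of driving the converter from Python rather than through the command line; the file names are hypothetical and the module itself is Python 2.

```python
# Minimal usage sketch (hypothetical file names).
from pedify import PedConverter

pc = PedConverter()
if pc.read_config_file("pedify.cfg") != 0:
    raise SystemExit("could not parse config file")
if pc.read_data_file("variants.tsv") != 0:
    raise SystemExit("could not read data file")

pc.create_ped_file("cohort.ped", numeric=True)   # 0/1/2 genotype codes
pc.create_map_file("cohort.map")                 # chromosome, SNP id, position
pc.create_excluded_samples_file("excluded.txt")  # samples removed by filters
```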
