Mercurial > repos > jaredgk > ppp_vcf_to_ima
comparison vcf_reader_func.py @ 0:1ad9c146ebb4 draft
Uploaded
| author | jaredgk |
|---|---|
| date | Fri, 19 Oct 2018 14:36:02 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1ad9c146ebb4 |
|---|---|
| 1 import sys | |
| 2 import pysam | |
| 3 import logging | |
| 4 import struct | |
| 5 from random import sample | |
| 6 from collections import defaultdict | |
| 7 import os | |
| 8 import gzip | |
| 9 | |
| 10 def checkIfGzip(filename): | |
| 11 try: | |
| 12 gf = gzip.open(filename) | |
| 13 gl = gf.readline() | |
| 14 gf.close() | |
| 15 vcf_check = b'##fileformat=VCF' | |
| 16 if gl[0:3] == b'BCF': | |
| 17 return 'bcf' | |
| 18 elif gl[:len(vcf_check)] == vcf_check: | |
| 19 return checkHeader(filename) | |
| 20 else: | |
| 21 return 'other' | |
| 22 except: | |
| 23 return 'nozip' | |
| 24 | |
| 25 def checkHeader(filename): | |
| 26 f = open(filename,'rb') | |
| 27 l = f.readline() | |
| 28 f.close() | |
| 29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00' | |
| 30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff' | |
| 31 GZF_HEADER=b'\x1f\x8b' | |
| 32 if l[:len(BGZF_HEADER)] == BGZF_HEADER: | |
| 33 return 'bgzip' | |
| 34 if l[:len(GZF_HEADER)] == GZF_HEADER: | |
| 35 return 'gzip' | |
| 36 return 'nozip' | |
| 37 | |
| 38 def checkFormat(vcfname): | |
| 39 """Checks header of given file for compression type | |
| 40 | |
| 41 | |
| 42 Given a filename, opens file and reads first line to check if | |
| 43 file has BGZF or GZIP header. May be extended to check for BCF format | |
| 44 | |
| 45 Parameters | |
| 46 ---------- | |
| 47 filename : str | |
| 48 Name of file to be checked | |
| 49 | |
| 50 Returns | |
| 51 ------- | |
| 52 extension : str {'bgzip','gzip','vcf','other'} | |
| 53 File extension as indicated by header | |
| 54 | |
| 55 """ | |
| 56 typ = checkIfGzip(vcfname) | |
| 57 if typ != 'nozip': | |
| 58 return typ | |
| 59 f = open(vcfname) | |
| 60 l = f.readline() | |
| 61 f.close() | |
| 62 VCF_TAG='##fileformat=VCF' | |
| 63 if l[:len(VCF_TAG)] == VCF_TAG: | |
| 64 return 'vcf' | |
| 65 return 'other' | |
| 66 | |
| 67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False): | |
| 68 dr = None | |
| 69 pos = record.pos | |
| 70 c = record.chrom | |
| 71 if record.alts is None: | |
| 72 return False | |
| 73 if add_chr: | |
| 74 c = 'chr'+record.chrom | |
| 75 if record.ref == 'C' and 'T' in record.alts: | |
| 76 seq = fasta_ref.fetch(c,pos-1,pos+1) | |
| 77 if seq[0].upper() != 'C': | |
| 78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0])) | |
| 79 #raise Exception("checkIfCpG function not lining up properly") | |
| 80 if seq[1].upper() == 'G': | |
| 81 return True | |
| 82 return False | |
| 83 elif record.ref == 'G' and 'A' in record.alts: | |
| 84 seq = fasta_ref.fetch(c,pos-2,pos) | |
| 85 if seq[1].upper() != 'G': | |
| 86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1])) | |
| 87 #raise Exception("checkIfCpg function not lining up on negative strand") | |
| 88 if seq[0].upper() == 'C': | |
| 89 return True | |
| 90 return False | |
| 91 return False | |
| 92 | |
| 93 def checkForDuplicates(rec_list,pass_list): | |
| 94 for i in range(len(rec_list)-1): | |
| 95 if rec_list[i].pos == rec_list[i+1].pos: | |
| 96 pass_list[i] = False | |
| 97 pass_list[i+1] = False | |
| 98 | |
| 99 def checkForMultiallele(rec_list,pass_list): | |
| 100 for i in range(len(rec_list)): | |
| 101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos: | |
| 102 pass_list[i] = False | |
| 103 pass_list[i+1] = False | |
| 104 if len(rec_list[i].alleles) > 2: | |
| 105 pass_list[i] = False | |
| 106 | |
| 107 def flipChrom(chrom): | |
| 108 if chrom[0:3] == 'chr': | |
| 109 return chrom[0:3] | |
| 110 return 'chr'+chrom | |
| 111 | |
| 112 def getAlleleCountDict(rec): | |
| 113 alleles = defaultdict(int) | |
| 114 total_sites = 0 | |
| 115 missing_inds = 0 | |
| 116 for j in range(len(rec.samples)): | |
| 117 samp = rec.samples[j] | |
| 118 if None in samp.alleles: | |
| 119 missing_inds += 1 | |
| 120 for k in range(len(samp.alleles)): | |
| 121 b = samp.alleles[k] | |
| 122 if b is not None: | |
| 123 alleles[b] += 1 | |
| 124 total_sites+=1 | |
| 125 return alleles, total_sites, missing_inds | |
| 126 | |
| 127 def isInvariant(rec): | |
| 128 alleles, total_sites, missing_inds = getAlleleCountDict(rec) | |
| 129 if len(alleles) == 1: | |
| 130 return True | |
| 131 return False | |
| 132 | |
| 133 def isInformative(rec, mincount=2, alleles=None): | |
| 134 count = 0 | |
| 135 if alleles is None: | |
| 136 alleles, total_sites, missing_inds = getAlleleCountDict(rec) | |
| 137 if len(alleles) != 2: | |
| 138 return False | |
| 139 i1,i2 = alleles.keys() | |
| 140 return (alleles[i1] >= mincount and alleles[i2] >= mincount) | |
| 141 | |
| 142 def getPassSites(record_list, remove_cpg=False, remove_indels=True, | |
| 143 remove_multiallele=True, remove_missing=0, | |
| 144 inform_level=2, fasta_ref=None): | |
| 145 pass_list = [True for r in record_list] | |
| 146 if remove_cpg == True and fasta_ref is None: | |
| 147 raise Exception("CpG removal requires a reference") | |
| 148 if inform_level > 2 or inform_level < 0: | |
| 149 raise Exception("Inform level %d must be between 0 and 2" % inform_level) | |
| 150 if remove_multiallele: | |
| 151 checkForMultiallele(record_list,pass_list) | |
| 152 for i in range(len(record_list)): | |
| 153 rec = record_list[i] | |
| 154 if remove_indels and not checkRecordIsSnp(rec): | |
| 155 pass_list[i] = False | |
| 156 if remove_cpg and checkIfCpG(rec,fasta_ref): | |
| 157 pass_list[i] = False | |
| 158 alleles,total_sites,missing_inds = getAlleleCountDict(rec) | |
| 159 if remove_missing != -1 and missing_inds > int(remove_missing): | |
| 160 pass_list[i] = False | |
| 161 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles): | |
| 162 pass_list[i] = False | |
| 163 #pp = zip([r.pos for r in record_list],pass_list) | |
| 164 #for ppp in pp: | |
| 165 # logging.info(ppp) | |
| 166 return pass_list | |
| 167 | |
| 168 | |
| 169 def filterSites(record_list, remove_cpg=False, remove_indels=True, | |
| 170 remove_multiallele=True, remove_missing=0, inform_level=2, | |
| 171 fasta_ref=None): | |
| 172 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref) | |
| 173 out_list = [] | |
| 174 for i in range(len(pass_list)): | |
| 175 if pass_list[i]: | |
| 176 out_list.append(record_list[i]) | |
| 177 return out_list | |
| 178 | |
| 179 | |
| 180 | |
| 181 class VcfReader(): | |
| 182 def __init__(self, vcfname, compress_flag=False, subsamp_num=None, | |
| 183 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False): | |
| 184 | |
| 185 ext = checkFormat(vcfname) | |
| 186 if ext in ['gzip','other'] : | |
| 187 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
| 188 'uncompressed or zipped with bgzip' % vcfname)) | |
| 189 self.file_uncompressed = (ext == 'vcf') | |
| 190 self.reader_uncompressed = (self.file_uncompressed and not compress_flag) | |
| 191 self.popmodel = None | |
| 192 self.popkeys = None | |
| 193 if popmodel is not None and use_allpop: | |
| 194 raise Exception("Popmodel and allpop cannot both be specified") | |
| 195 if compress_flag and file_uncompressed: | |
| 196 vcfname = compressVcf(vcfname) | |
| 197 if subsamp_num is not None: | |
| 198 if subsamp_list is not None: | |
| 199 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 200 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
| 201 elif subsamp_fn is not None: | |
| 202 if subsamp_list is not None: | |
| 203 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 204 subsamp_file = open(subsamp_fn,'r') | |
| 205 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
| 206 subsamp_file.close() | |
| 207 | |
| 208 if index is None: | |
| 209 self.reader = pysam.VariantFile(vcfname) | |
| 210 else: | |
| 211 self.reader = pysam.VariantFile(vcfname, index_filename=index) | |
| 212 if popmodel is not None: | |
| 213 self.popmodel = popmodel | |
| 214 popsamp_list = popmodel.inds | |
| 215 self.reader.subset_samples(popsamp_list) | |
| 216 self.setPopIdx() | |
| 217 if use_allpop: | |
| 218 self.setAllPop() | |
| 219 if subsamp_list is not None: | |
| 220 logging.debug('Subsampling %d individuals from VCF file' % | |
| 221 (len(subsamp_list))) | |
| 222 self.reader.subset_samples(subsamp_list) | |
| 223 self.prev_last_rec = next(self.reader) | |
| 224 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr') | |
| 225 | |
| 226 def fetch(self, chrom=None, start=None, end=None): | |
| 227 return self.reader.fetch(chrom, start, end) | |
| 228 | |
| 229 def getRecordList(self, region=None, chrom=None, start=None, | |
| 230 end=None): | |
| 231 if self.reader_uncompressed: | |
| 232 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom) | |
| 233 return ret | |
| 234 else: | |
| 235 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom) | |
| 236 | |
| 237 def setPopIdx(self): | |
| 238 self.popkeys = {} | |
| 239 sample_names = [l for l in self.reader.header.samples] | |
| 240 for p in self.popmodel.pop_list: | |
| 241 self.popkeys[p] = [] | |
| 242 for ind in self.popmodel.ind_dict[p]: | |
| 243 self.popkeys[p].append(sample_names.index(ind)) | |
| 244 | |
| 245 def close(self): | |
| 246 self.reader.close() | |
| 247 | |
| 248 def setAllPop(self): | |
| 249 self.popkeys = {'ALL':[]} | |
| 250 for i in range(len(self.reader.header.samples)): | |
| 251 self.popkeys['ALL'].append(i) | |
| 252 | |
| 253 def modChrom(c,vcf_chr): | |
| 254 if c is None: | |
| 255 return None | |
| 256 if vcf_chr and c[:3] != 'chr': | |
| 257 return 'chr'+c | |
| 258 if not vcf_chr and c[:3] == 'chr': | |
| 259 return c[3:] | |
| 260 return c | |
| 261 | |
| 262 def getRecordList(vcf_reader, region=None, chrom=None, start=None, | |
| 263 end=None, add_chr=False): | |
| 264 """Returns list for use in subsampling from input file""" | |
| 265 if region is not None: | |
| 266 c = modChrom(region.chrom,add_chr) | |
| 267 var_sites = vcf_reader.fetch(c, region.start, region.end) | |
| 268 else: | |
| 269 c = modChrom(chrom,add_chr) | |
| 270 var_sites = vcf_reader.fetch(c, start, end) | |
| 271 lst = [] | |
| 272 for rec in var_sites: | |
| 273 lst.append(rec) | |
| 274 return lst | |
| 275 | |
| 276 | |
| 277 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None, | |
| 278 start=None, end=None, add_chr=False): | |
| 279 """Method for getting record list from unzipped VCF file. | |
| 280 | |
| 281 This method will sequentially look through a VCF file until it finds | |
| 282 the given `start` position on `chrom`, then add all records to a list | |
| 283 until the `end` position has been reached. Note that `prev_last_rec` | |
| 284 must be kept track of externally to ensure that if consecutive regions | |
| 285 are called, the record of the first variant outside the first region | |
| 286 is not lost. | |
| 287 | |
| 288 Parameters | |
| 289 ---------- | |
| 290 vcf_reader : pysam VariantFile object | |
| 291 VCF reader initialized from other function | |
| 292 region : region object | |
| 293 Region object with start and end coordinates of region of interest. | |
| 294 prev_last_rec : VariantRecord object | |
| 295 Variable with last record read from VcfReader. Stored here so that | |
| 296 if two adjacent regions are called, the overflow record from the | |
| 297 first region is still included in the next region | |
| 298 | |
| 299 Returns | |
| 300 ------- | |
| 301 lst : list | |
| 302 List of records in given gene region | |
| 303 prev_last_rec : VariantRecord object | |
| 304 First record after target region, for use in next call | |
| 305 | |
| 306 """ | |
| 307 lst = [] | |
| 308 if region is None: | |
| 309 lst.append(prev_last_rec) | |
| 310 for rec in vcf_reader: | |
| 311 lst.append(rec) | |
| 312 return lst, lst[-1] | |
| 313 | |
| 314 if (prev_last_rec is not None and | |
| 315 region.containsRecord(prev_last_rec) == 'in'): | |
| 316 lst.append(prev_last_rec) | |
| 317 elif (prev_last_rec is not None and | |
| 318 region.containsRecord(prev_last_rec) == 'after'): | |
| 319 return [] | |
| 320 rec = next(vcf_reader,None) | |
| 321 if rec is None: | |
| 322 return lst,None | |
| 323 place = region.containsRecord(rec) | |
| 324 while rec is not None and place != 'after': | |
| 325 if place == 'in': | |
| 326 lst.append(rec) | |
| 327 rec = next(vcf_reader,None) | |
| 328 if rec is None: | |
| 329 break | |
| 330 place = region.containsRecord(rec) | |
| 331 prev_last_rec = rec | |
| 332 return lst, prev_last_rec | |
| 333 | |
| 334 | |
| 335 def checkRecordIsSnp(rec): | |
| 336 """Checks if this record is a single nucleotide variant, returns bool.""" | |
| 337 logging.info(str(rec.pos)) | |
| 338 if len(rec.ref) != 1: | |
| 339 return False | |
| 340 if rec.alts is None: | |
| 341 return False | |
| 342 for allele in rec.alts: | |
| 343 if len(allele) != 1: | |
| 344 return False | |
| 345 return True | |
| 346 | |
| 347 | |
| 348 def getSubsampleList(vcfname, ss_count): | |
| 349 """Returns a list of the first `ss_count` individuals in `vcfname` | |
| 350 | |
| 351 """ | |
| 352 | |
| 353 vcf_o = pysam.VariantFile(vcfname) | |
| 354 rec = next(vcf_o) | |
| 355 vcf_o.close() | |
| 356 lst = [] | |
| 357 for samp in rec.samples: | |
| 358 lst.append(samp) | |
| 359 return lst[:int(ss_count)] | |
| 360 | |
| 361 | |
| 362 def compressVcf(vcfname,forceflag=False,remove=False): | |
| 363 """Runs bgzip and tabix on input VCF file. | |
| 364 | |
| 365 Using the pysam library, this function runs the bgzip and tabix utilities | |
| 366 on the given input file. By default, this will not overwrite an existing | |
| 367 zipped file, but will overwrite an existing index. `remove` can be set to | |
| 368 delete the unzipped file. | |
| 369 | |
| 370 Parameters | |
| 371 ---------- | |
| 372 vcfname : str | |
| 373 Name of uncompressed VCF file | |
| 374 forceflag : bool (False) | |
| 375 If true, will overwrite (vcfname).gz if it exists | |
| 376 remove : bool (False) | |
| 377 If true, will delete uncompressed source file | |
| 378 | |
| 379 Returns | |
| 380 ------- | |
| 381 cvcfname : str | |
| 382 Filepath to compressed VCF file | |
| 383 """ | |
| 384 cvcfname = vcfname+".gz" | |
| 385 pysam.tabix_compress(vcfname,cvcfname,force=forceflag) | |
| 386 pysam.tabix_index(cvcfname,preset="vcf",force=True) | |
| 387 if remove: | |
| 388 os.remove(vcfname) | |
| 389 return cvcfname | |
| 390 | |
| 391 def vcfRegionName(prefix, region, ext, oneidx=False, | |
| 392 halfopen=True, sep='-'): | |
| 393 chrom = region.toStr(halfopen, oneidx, sep) | |
| 394 return prefix+'_'+chrom+'.'+ext | |
| 395 | |
| 396 def getRecordsInRegion(region, record_list): | |
| 397 sub_list = [] | |
| 398 for i in range(len(record_list)): | |
| 399 loc = region.containsRecord(record_list[i]) | |
| 400 if loc == "in": | |
| 401 sub_list.append(record_list[i]) | |
| 402 elif loc == "after": | |
| 403 break | |
| 404 return sub_list | |
| 405 | |
| 406 | |
| 407 | |
| 408 | |
| 409 | |
| 410 #def getVcfReader(args): | |
| 411 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None, | |
| 412 subsamp_fn=None, subsamp_list=None, index=None): | |
| 413 """Returns a reader for a given input VCF file. | |
| 414 | |
| 415 Given a filename, filetype, compression option, and optional Subsampling | |
| 416 options, will return a pysam.VariantFile object for iteration and | |
| 417 a flag as to whether this file is compressed or uncompressed. | |
| 418 | |
| 419 Parameters | |
| 420 ---------- | |
| 421 vcfname : str | |
| 422 Filename for VCF file. The extension of this file will be used to | |
| 423 determine whether it is compressed or not unless `var_ext` is set. | |
| 424 var_ext : str (None) | |
| 425 Extension for VCF file if it is not included in the filename. | |
| 426 compress_flag : bool (False) | |
| 427 If filetype is uncompressed and this is set to true, will run | |
| 428 compressVcf function. | |
| 429 subsamp_num : int (None) | |
| 430 If set, will randomly select `subsamp_num` individuals (not | |
| 431 genotypes) from the input VCF file and return a reader with | |
| 432 only those data. | |
| 433 subsamp_fn : str (None) | |
| 434 If set, will return a reader with only data from the samples listed | |
| 435 in the file provided. Cannot be used with other subsampling options. | |
| 436 subsamp_list : list (None) | |
| 437 If set, will return reader with records containing only | |
| 438 individuals named in the list. Cannot be used with other subsampling | |
| 439 options. | |
| 440 | |
| 441 Returns | |
| 442 ------- | |
| 443 vcf_reader : pysam.VariantFile | |
| 444 A reader that can be iterated through for variant records. If | |
| 445 compressed, it will be able to use the pysam fetch method, otherwise | |
| 446 it must be read through sequentially | |
| 447 reader_uncompressed : bool | |
| 448 If True, VCF reader is uncompressed. This means the fetch method | |
| 449 cannot be used and region access must be done using the | |
| 450 "getRecordListUnzipped" method. | |
| 451 | |
| 452 """ | |
| 453 ext = checkFormat(vcfname) | |
| 454 if ext in ['gzip','other'] : | |
| 455 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
| 456 'uncompressed or zipped with bgzip' % vcfname)) | |
| 457 file_uncompressed = (ext == 'vcf') | |
| 458 reader_uncompressed = (file_uncompressed and not compress_flag) | |
| 459 if compress_flag and file_uncompressed: | |
| 460 vcfname = compressVcf(vcfname) | |
| 461 #subsamp_list = None | |
| 462 if subsamp_num is not None: | |
| 463 if subsamp_list is not None: | |
| 464 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 465 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
| 466 elif subsamp_fn is not None: | |
| 467 if subsamp_list is not None: | |
| 468 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 469 subsamp_file = open(subsamp_fn,'r') | |
| 470 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
| 471 subsamp_file.close() | |
| 472 if index is None: | |
| 473 vcf_reader = pysam.VariantFile(vcfname) | |
| 474 else: | |
| 475 vcf_reader = pysam.VariantFile(vcfname, index_filename=index) | |
| 476 if subsamp_list is not None: | |
| 477 logging.debug('Subsampling %d individuals from VCF file' % | |
| 478 (len(subsamp_list))) | |
| 479 vcf_reader.subset_samples(subsamp_list) | |
| 480 return vcf_reader, reader_uncompressed |
