comparison vcf_reader_func.py @ 0:1ad9c146ebb4 draft

Uploaded
author jaredgk
date Fri, 19 Oct 2018 14:36:02 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1ad9c146ebb4
1 import sys
2 import pysam
3 import logging
4 import struct
5 from random import sample
6 from collections import defaultdict
7 import os
8 import gzip
9
10 def checkIfGzip(filename):
11 try:
12 gf = gzip.open(filename)
13 gl = gf.readline()
14 gf.close()
15 vcf_check = b'##fileformat=VCF'
16 if gl[0:3] == b'BCF':
17 return 'bcf'
18 elif gl[:len(vcf_check)] == vcf_check:
19 return checkHeader(filename)
20 else:
21 return 'other'
22 except:
23 return 'nozip'
24
25 def checkHeader(filename):
26 f = open(filename,'rb')
27 l = f.readline()
28 f.close()
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
31 GZF_HEADER=b'\x1f\x8b'
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
33 return 'bgzip'
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
35 return 'gzip'
36 return 'nozip'
37
38 def checkFormat(vcfname):
39 """Checks header of given file for compression type
40
41
42 Given a filename, opens file and reads first line to check if
43 file has BGZF or GZIP header. May be extended to check for BCF format
44
45 Parameters
46 ----------
47 filename : str
48 Name of file to be checked
49
50 Returns
51 -------
52 extension : str {'bgzip','gzip','vcf','other'}
53 File extension as indicated by header
54
55 """
56 typ = checkIfGzip(vcfname)
57 if typ != 'nozip':
58 return typ
59 f = open(vcfname)
60 l = f.readline()
61 f.close()
62 VCF_TAG='##fileformat=VCF'
63 if l[:len(VCF_TAG)] == VCF_TAG:
64 return 'vcf'
65 return 'other'
66
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
68 dr = None
69 pos = record.pos
70 c = record.chrom
71 if record.alts is None:
72 return False
73 if add_chr:
74 c = 'chr'+record.chrom
75 if record.ref == 'C' and 'T' in record.alts:
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
77 if seq[0].upper() != 'C':
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
79 #raise Exception("checkIfCpG function not lining up properly")
80 if seq[1].upper() == 'G':
81 return True
82 return False
83 elif record.ref == 'G' and 'A' in record.alts:
84 seq = fasta_ref.fetch(c,pos-2,pos)
85 if seq[1].upper() != 'G':
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
87 #raise Exception("checkIfCpg function not lining up on negative strand")
88 if seq[0].upper() == 'C':
89 return True
90 return False
91 return False
92
93 def checkForDuplicates(rec_list,pass_list):
94 for i in range(len(rec_list)-1):
95 if rec_list[i].pos == rec_list[i+1].pos:
96 pass_list[i] = False
97 pass_list[i+1] = False
98
99 def checkForMultiallele(rec_list,pass_list):
100 for i in range(len(rec_list)):
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
102 pass_list[i] = False
103 pass_list[i+1] = False
104 if len(rec_list[i].alleles) > 2:
105 pass_list[i] = False
106
107 def flipChrom(chrom):
108 if chrom[0:3] == 'chr':
109 return chrom[0:3]
110 return 'chr'+chrom
111
112 def getAlleleCountDict(rec):
113 alleles = defaultdict(int)
114 total_sites = 0
115 missing_inds = 0
116 for j in range(len(rec.samples)):
117 samp = rec.samples[j]
118 if None in samp.alleles:
119 missing_inds += 1
120 for k in range(len(samp.alleles)):
121 b = samp.alleles[k]
122 if b is not None:
123 alleles[b] += 1
124 total_sites+=1
125 return alleles, total_sites, missing_inds
126
127 def isInvariant(rec):
128 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
129 if len(alleles) == 1:
130 return True
131 return False
132
133 def isInformative(rec, mincount=2, alleles=None):
134 count = 0
135 if alleles is None:
136 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
137 if len(alleles) != 2:
138 return False
139 i1,i2 = alleles.keys()
140 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
141
142 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
143 remove_multiallele=True, remove_missing=0,
144 inform_level=2, fasta_ref=None):
145 pass_list = [True for r in record_list]
146 if remove_cpg == True and fasta_ref is None:
147 raise Exception("CpG removal requires a reference")
148 if inform_level > 2 or inform_level < 0:
149 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
150 if remove_multiallele:
151 checkForMultiallele(record_list,pass_list)
152 for i in range(len(record_list)):
153 rec = record_list[i]
154 if remove_indels and not checkRecordIsSnp(rec):
155 pass_list[i] = False
156 if remove_cpg and checkIfCpG(rec,fasta_ref):
157 pass_list[i] = False
158 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
159 if remove_missing != -1 and missing_inds > int(remove_missing):
160 pass_list[i] = False
161 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
162 pass_list[i] = False
163 #pp = zip([r.pos for r in record_list],pass_list)
164 #for ppp in pp:
165 # logging.info(ppp)
166 return pass_list
167
168
169 def filterSites(record_list, remove_cpg=False, remove_indels=True,
170 remove_multiallele=True, remove_missing=0, inform_level=2,
171 fasta_ref=None):
172 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
173 out_list = []
174 for i in range(len(pass_list)):
175 if pass_list[i]:
176 out_list.append(record_list[i])
177 return out_list
178
179
180
181 class VcfReader():
182 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
183 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
184
185 ext = checkFormat(vcfname)
186 if ext in ['gzip','other'] :
187 raise Exception(('Input file %s is gzip-formatted, must be either '
188 'uncompressed or zipped with bgzip' % vcfname))
189 self.file_uncompressed = (ext == 'vcf')
190 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
191 self.popmodel = None
192 self.popkeys = None
193 if popmodel is not None and use_allpop:
194 raise Exception("Popmodel and allpop cannot both be specified")
195 if compress_flag and file_uncompressed:
196 vcfname = compressVcf(vcfname)
197 if subsamp_num is not None:
198 if subsamp_list is not None:
199 raise Exception('Multiple subsampling options called in getVcfReader')
200 subsamp_list = getSubsampleList(vcfname, subsamp_num)
201 elif subsamp_fn is not None:
202 if subsamp_list is not None:
203 raise Exception('Multiple subsampling options called in getVcfReader')
204 subsamp_file = open(subsamp_fn,'r')
205 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
206 subsamp_file.close()
207
208 if index is None:
209 self.reader = pysam.VariantFile(vcfname)
210 else:
211 self.reader = pysam.VariantFile(vcfname, index_filename=index)
212 if popmodel is not None:
213 self.popmodel = popmodel
214 popsamp_list = popmodel.inds
215 self.reader.subset_samples(popsamp_list)
216 self.setPopIdx()
217 if use_allpop:
218 self.setAllPop()
219 if subsamp_list is not None:
220 logging.debug('Subsampling %d individuals from VCF file' %
221 (len(subsamp_list)))
222 self.reader.subset_samples(subsamp_list)
223 self.prev_last_rec = next(self.reader)
224 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
225
226 def fetch(self, chrom=None, start=None, end=None):
227 return self.reader.fetch(chrom, start, end)
228
229 def getRecordList(self, region=None, chrom=None, start=None,
230 end=None):
231 if self.reader_uncompressed:
232 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
233 return ret
234 else:
235 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
236
237 def setPopIdx(self):
238 self.popkeys = {}
239 sample_names = [l for l in self.reader.header.samples]
240 for p in self.popmodel.pop_list:
241 self.popkeys[p] = []
242 for ind in self.popmodel.ind_dict[p]:
243 self.popkeys[p].append(sample_names.index(ind))
244
245 def close(self):
246 self.reader.close()
247
248 def setAllPop(self):
249 self.popkeys = {'ALL':[]}
250 for i in range(len(self.reader.header.samples)):
251 self.popkeys['ALL'].append(i)
252
253 def modChrom(c,vcf_chr):
254 if c is None:
255 return None
256 if vcf_chr and c[:3] != 'chr':
257 return 'chr'+c
258 if not vcf_chr and c[:3] == 'chr':
259 return c[3:]
260 return c
261
262 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
263 end=None, add_chr=False):
264 """Returns list for use in subsampling from input file"""
265 if region is not None:
266 c = modChrom(region.chrom,add_chr)
267 var_sites = vcf_reader.fetch(c, region.start, region.end)
268 else:
269 c = modChrom(chrom,add_chr)
270 var_sites = vcf_reader.fetch(c, start, end)
271 lst = []
272 for rec in var_sites:
273 lst.append(rec)
274 return lst
275
276
277 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
278 start=None, end=None, add_chr=False):
279 """Method for getting record list from unzipped VCF file.
280
281 This method will sequentially look through a VCF file until it finds
282 the given `start` position on `chrom`, then add all records to a list
283 until the `end` position has been reached. Note that `prev_last_rec`
284 must be kept track of externally to ensure that if consecutive regions
285 are called, the record of the first variant outside the first region
286 is not lost.
287
288 Parameters
289 ----------
290 vcf_reader : pysam VariantFile object
291 VCF reader initialized from other function
292 region : region object
293 Region object with start and end coordinates of region of interest.
294 prev_last_rec : VariantRecord object
295 Variable with last record read from VcfReader. Stored here so that
296 if two adjacent regions are called, the overflow record from the
297 first region is still included in the next region
298
299 Returns
300 -------
301 lst : list
302 List of records in given gene region
303 prev_last_rec : VariantRecord object
304 First record after target region, for use in next call
305
306 """
307 lst = []
308 if region is None:
309 lst.append(prev_last_rec)
310 for rec in vcf_reader:
311 lst.append(rec)
312 return lst, lst[-1]
313
314 if (prev_last_rec is not None and
315 region.containsRecord(prev_last_rec) == 'in'):
316 lst.append(prev_last_rec)
317 elif (prev_last_rec is not None and
318 region.containsRecord(prev_last_rec) == 'after'):
319 return []
320 rec = next(vcf_reader,None)
321 if rec is None:
322 return lst,None
323 place = region.containsRecord(rec)
324 while rec is not None and place != 'after':
325 if place == 'in':
326 lst.append(rec)
327 rec = next(vcf_reader,None)
328 if rec is None:
329 break
330 place = region.containsRecord(rec)
331 prev_last_rec = rec
332 return lst, prev_last_rec
333
334
335 def checkRecordIsSnp(rec):
336 """Checks if this record is a single nucleotide variant, returns bool."""
337 logging.info(str(rec.pos))
338 if len(rec.ref) != 1:
339 return False
340 if rec.alts is None:
341 return False
342 for allele in rec.alts:
343 if len(allele) != 1:
344 return False
345 return True
346
347
348 def getSubsampleList(vcfname, ss_count):
349 """Returns a list of the first `ss_count` individuals in `vcfname`
350
351 """
352
353 vcf_o = pysam.VariantFile(vcfname)
354 rec = next(vcf_o)
355 vcf_o.close()
356 lst = []
357 for samp in rec.samples:
358 lst.append(samp)
359 return lst[:int(ss_count)]
360
361
362 def compressVcf(vcfname,forceflag=False,remove=False):
363 """Runs bgzip and tabix on input VCF file.
364
365 Using the pysam library, this function runs the bgzip and tabix utilities
366 on the given input file. By default, this will not overwrite an existing
367 zipped file, but will overwrite an existing index. `remove` can be set to
368 delete the unzipped file.
369
370 Parameters
371 ----------
372 vcfname : str
373 Name of uncompressed VCF file
374 forceflag : bool (False)
375 If true, will overwrite (vcfname).gz if it exists
376 remove : bool (False)
377 If true, will delete uncompressed source file
378
379 Returns
380 -------
381 cvcfname : str
382 Filepath to compressed VCF file
383 """
384 cvcfname = vcfname+".gz"
385 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
386 pysam.tabix_index(cvcfname,preset="vcf",force=True)
387 if remove:
388 os.remove(vcfname)
389 return cvcfname
390
391 def vcfRegionName(prefix, region, ext, oneidx=False,
392 halfopen=True, sep='-'):
393 chrom = region.toStr(halfopen, oneidx, sep)
394 return prefix+'_'+chrom+'.'+ext
395
396 def getRecordsInRegion(region, record_list):
397 sub_list = []
398 for i in range(len(record_list)):
399 loc = region.containsRecord(record_list[i])
400 if loc == "in":
401 sub_list.append(record_list[i])
402 elif loc == "after":
403 break
404 return sub_list
405
406
407
408
409
410 #def getVcfReader(args):
411 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
412 subsamp_fn=None, subsamp_list=None, index=None):
413 """Returns a reader for a given input VCF file.
414
415 Given a filename, filetype, compression option, and optional Subsampling
416 options, will return a pysam.VariantFile object for iteration and
417 a flag as to whether this file is compressed or uncompressed.
418
419 Parameters
420 ----------
421 vcfname : str
422 Filename for VCF file. The extension of this file will be used to
423 determine whether it is compressed or not unless `var_ext` is set.
424 var_ext : str (None)
425 Extension for VCF file if it is not included in the filename.
426 compress_flag : bool (False)
427 If filetype is uncompressed and this is set to true, will run
428 compressVcf function.
429 subsamp_num : int (None)
430 If set, will randomly select `subsamp_num` individuals (not
431 genotypes) from the input VCF file and return a reader with
432 only those data.
433 subsamp_fn : str (None)
434 If set, will return a reader with only data from the samples listed
435 in the file provided. Cannot be used with other subsampling options.
436 subsamp_list : list (None)
437 If set, will return reader with records containing only
438 individuals named in the list. Cannot be used with other subsampling
439 options.
440
441 Returns
442 -------
443 vcf_reader : pysam.VariantFile
444 A reader that can be iterated through for variant records. If
445 compressed, it will be able to use the pysam fetch method, otherwise
446 it must be read through sequentially
447 reader_uncompressed : bool
448 If True, VCF reader is uncompressed. This means the fetch method
449 cannot be used and region access must be done using the
450 "getRecordListUnzipped" method.
451
452 """
453 ext = checkFormat(vcfname)
454 if ext in ['gzip','other'] :
455 raise Exception(('Input file %s is gzip-formatted, must be either '
456 'uncompressed or zipped with bgzip' % vcfname))
457 file_uncompressed = (ext == 'vcf')
458 reader_uncompressed = (file_uncompressed and not compress_flag)
459 if compress_flag and file_uncompressed:
460 vcfname = compressVcf(vcfname)
461 #subsamp_list = None
462 if subsamp_num is not None:
463 if subsamp_list is not None:
464 raise Exception('Multiple subsampling options called in getVcfReader')
465 subsamp_list = getSubsampleList(vcfname, subsamp_num)
466 elif subsamp_fn is not None:
467 if subsamp_list is not None:
468 raise Exception('Multiple subsampling options called in getVcfReader')
469 subsamp_file = open(subsamp_fn,'r')
470 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
471 subsamp_file.close()
472 if index is None:
473 vcf_reader = pysam.VariantFile(vcfname)
474 else:
475 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
476 if subsamp_list is not None:
477 logging.debug('Subsampling %d individuals from VCF file' %
478 (len(subsamp_list)))
479 vcf_reader.subset_samples(subsamp_list)
480 return vcf_reader, reader_uncompressed