annotate vcf_reader_func.py @ 5:009cf225ef96 draft default tip

Uploaded
author jaredgk
date Wed, 24 Oct 2018 15:34:54 -0400
parents 1ad9c146ebb4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
1 import sys
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
2 import pysam
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
3 import logging
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
4 import struct
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
5 from random import sample
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
6 from collections import defaultdict
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
7 import os
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
8 import gzip
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
9
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
10 def checkIfGzip(filename):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
11 try:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
12 gf = gzip.open(filename)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
13 gl = gf.readline()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
14 gf.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
15 vcf_check = b'##fileformat=VCF'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
16 if gl[0:3] == b'BCF':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
17 return 'bcf'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
18 elif gl[:len(vcf_check)] == vcf_check:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
19 return checkHeader(filename)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
20 else:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
21 return 'other'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
22 except:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
23 return 'nozip'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
24
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
25 def checkHeader(filename):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
26 f = open(filename,'rb')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
27 l = f.readline()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
28 f.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
31 GZF_HEADER=b'\x1f\x8b'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
33 return 'bgzip'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
35 return 'gzip'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
36 return 'nozip'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
37
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
38 def checkFormat(vcfname):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
39 """Checks header of given file for compression type
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
40
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
41
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
42 Given a filename, opens file and reads first line to check if
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
43 file has BGZF or GZIP header. May be extended to check for BCF format
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
44
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
45 Parameters
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
46 ----------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
47 filename : str
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
48 Name of file to be checked
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
49
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
50 Returns
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
51 -------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
52 extension : str {'bgzip','gzip','vcf','other'}
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
53 File extension as indicated by header
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
54
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
55 """
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
56 typ = checkIfGzip(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
57 if typ != 'nozip':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
58 return typ
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
59 f = open(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
60 l = f.readline()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
61 f.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
62 VCF_TAG='##fileformat=VCF'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
63 if l[:len(VCF_TAG)] == VCF_TAG:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
64 return 'vcf'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
65 return 'other'
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
66
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
68 dr = None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
69 pos = record.pos
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
70 c = record.chrom
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
71 if record.alts is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
72 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
73 if add_chr:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
74 c = 'chr'+record.chrom
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
75 if record.ref == 'C' and 'T' in record.alts:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
77 if seq[0].upper() != 'C':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
79 #raise Exception("checkIfCpG function not lining up properly")
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
80 if seq[1].upper() == 'G':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
81 return True
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
82 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
83 elif record.ref == 'G' and 'A' in record.alts:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
84 seq = fasta_ref.fetch(c,pos-2,pos)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
85 if seq[1].upper() != 'G':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
87 #raise Exception("checkIfCpg function not lining up on negative strand")
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
88 if seq[0].upper() == 'C':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
89 return True
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
90 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
91 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
92
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
93 def checkForDuplicates(rec_list,pass_list):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
94 for i in range(len(rec_list)-1):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
95 if rec_list[i].pos == rec_list[i+1].pos:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
96 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
97 pass_list[i+1] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
98
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
99 def checkForMultiallele(rec_list,pass_list):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
100 for i in range(len(rec_list)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
102 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
103 pass_list[i+1] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
104 if len(rec_list[i].alleles) > 2:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
105 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
106
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
107 def flipChrom(chrom):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
108 if chrom[0:3] == 'chr':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
109 return chrom[0:3]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
110 return 'chr'+chrom
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
111
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
112 def getAlleleCountDict(rec):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
113 alleles = defaultdict(int)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
114 total_sites = 0
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
115 missing_inds = 0
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
116 for j in range(len(rec.samples)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
117 samp = rec.samples[j]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
118 if None in samp.alleles:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
119 missing_inds += 1
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
120 for k in range(len(samp.alleles)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
121 b = samp.alleles[k]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
122 if b is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
123 alleles[b] += 1
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
124 total_sites+=1
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
125 return alleles, total_sites, missing_inds
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
126
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
127 def isInvariant(rec):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
128 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
129 if len(alleles) == 1:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
130 return True
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
131 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
132
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
133 def isInformative(rec, mincount=2, alleles=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
134 count = 0
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
135 if alleles is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
136 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
137 if len(alleles) != 2:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
138 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
139 i1,i2 = alleles.keys()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
140 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
141
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
142 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
143 remove_multiallele=True, remove_missing=0,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
144 inform_level=2, fasta_ref=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
145 pass_list = [True for r in record_list]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
146 if remove_cpg == True and fasta_ref is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
147 raise Exception("CpG removal requires a reference")
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
148 if inform_level > 2 or inform_level < 0:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
149 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
150 if remove_multiallele:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
151 checkForMultiallele(record_list,pass_list)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
152 for i in range(len(record_list)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
153 rec = record_list[i]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
154 if remove_indels and not checkRecordIsSnp(rec):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
155 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
156 if remove_cpg and checkIfCpG(rec,fasta_ref):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
157 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
158 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
159 if remove_missing != -1 and missing_inds > int(remove_missing):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
160 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
161 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
162 pass_list[i] = False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
163 #pp = zip([r.pos for r in record_list],pass_list)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
164 #for ppp in pp:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
165 # logging.info(ppp)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
166 return pass_list
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
167
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
168
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
169 def filterSites(record_list, remove_cpg=False, remove_indels=True,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
170 remove_multiallele=True, remove_missing=0, inform_level=2,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
171 fasta_ref=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
172 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
173 out_list = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
174 for i in range(len(pass_list)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
175 if pass_list[i]:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
176 out_list.append(record_list[i])
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
177 return out_list
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
178
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
179
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
180
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
181 class VcfReader():
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
182 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
183 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
184
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
185 ext = checkFormat(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
186 if ext in ['gzip','other'] :
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
187 raise Exception(('Input file %s is gzip-formatted, must be either '
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
188 'uncompressed or zipped with bgzip' % vcfname))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
189 self.file_uncompressed = (ext == 'vcf')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
190 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
191 self.popmodel = None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
192 self.popkeys = None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
193 if popmodel is not None and use_allpop:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
194 raise Exception("Popmodel and allpop cannot both be specified")
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
195 if compress_flag and file_uncompressed:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
196 vcfname = compressVcf(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
197 if subsamp_num is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
198 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
199 raise Exception('Multiple subsampling options called in getVcfReader')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
200 subsamp_list = getSubsampleList(vcfname, subsamp_num)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
201 elif subsamp_fn is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
202 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
203 raise Exception('Multiple subsampling options called in getVcfReader')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
204 subsamp_file = open(subsamp_fn,'r')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
205 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
206 subsamp_file.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
207
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
208 if index is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
209 self.reader = pysam.VariantFile(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
210 else:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
211 self.reader = pysam.VariantFile(vcfname, index_filename=index)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
212 if popmodel is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
213 self.popmodel = popmodel
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
214 popsamp_list = popmodel.inds
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
215 self.reader.subset_samples(popsamp_list)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
216 self.setPopIdx()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
217 if use_allpop:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
218 self.setAllPop()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
219 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
220 logging.debug('Subsampling %d individuals from VCF file' %
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
221 (len(subsamp_list)))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
222 self.reader.subset_samples(subsamp_list)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
223 self.prev_last_rec = next(self.reader)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
224 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
225
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
226 def fetch(self, chrom=None, start=None, end=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
227 return self.reader.fetch(chrom, start, end)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
228
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
229 def getRecordList(self, region=None, chrom=None, start=None,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
230 end=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
231 if self.reader_uncompressed:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
232 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
233 return ret
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
234 else:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
235 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
236
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
237 def setPopIdx(self):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
238 self.popkeys = {}
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
239 sample_names = [l for l in self.reader.header.samples]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
240 for p in self.popmodel.pop_list:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
241 self.popkeys[p] = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
242 for ind in self.popmodel.ind_dict[p]:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
243 self.popkeys[p].append(sample_names.index(ind))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
244
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
245 def close(self):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
246 self.reader.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
247
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
248 def setAllPop(self):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
249 self.popkeys = {'ALL':[]}
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
250 for i in range(len(self.reader.header.samples)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
251 self.popkeys['ALL'].append(i)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
252
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
253 def modChrom(c,vcf_chr):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
254 if c is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
255 return None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
256 if vcf_chr and c[:3] != 'chr':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
257 return 'chr'+c
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
258 if not vcf_chr and c[:3] == 'chr':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
259 return c[3:]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
260 return c
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
261
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
262 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
263 end=None, add_chr=False):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
264 """Returns list for use in subsampling from input file"""
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
265 if region is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
266 c = modChrom(region.chrom,add_chr)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
267 var_sites = vcf_reader.fetch(c, region.start, region.end)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
268 else:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
269 c = modChrom(chrom,add_chr)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
270 var_sites = vcf_reader.fetch(c, start, end)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
271 lst = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
272 for rec in var_sites:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
273 lst.append(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
274 return lst
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
275
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
276
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
277 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
278 start=None, end=None, add_chr=False):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
279 """Method for getting record list from unzipped VCF file.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
280
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
281 This method will sequentially look through a VCF file until it finds
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
282 the given `start` position on `chrom`, then add all records to a list
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
283 until the `end` position has been reached. Note that `prev_last_rec`
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
284 must be kept track of externally to ensure that if consecutive regions
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
285 are called, the record of the first variant outside the first region
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
286 is not lost.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
287
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
288 Parameters
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
289 ----------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
290 vcf_reader : pysam VariantFile object
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
291 VCF reader initialized from other function
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
292 region : region object
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
293 Region object with start and end coordinates of region of interest.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
294 prev_last_rec : VariantRecord object
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
295 Variable with last record read from VcfReader. Stored here so that
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
296 if two adjacent regions are called, the overflow record from the
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
297 first region is still included in the next region
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
298
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
299 Returns
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
300 -------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
301 lst : list
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
302 List of records in given gene region
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
303 prev_last_rec : VariantRecord object
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
304 First record after target region, for use in next call
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
305
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
306 """
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
307 lst = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
308 if region is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
309 lst.append(prev_last_rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
310 for rec in vcf_reader:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
311 lst.append(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
312 return lst, lst[-1]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
313
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
314 if (prev_last_rec is not None and
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
315 region.containsRecord(prev_last_rec) == 'in'):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
316 lst.append(prev_last_rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
317 elif (prev_last_rec is not None and
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
318 region.containsRecord(prev_last_rec) == 'after'):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
319 return []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
320 rec = next(vcf_reader,None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
321 if rec is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
322 return lst,None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
323 place = region.containsRecord(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
324 while rec is not None and place != 'after':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
325 if place == 'in':
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
326 lst.append(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
327 rec = next(vcf_reader,None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
328 if rec is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
329 break
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
330 place = region.containsRecord(rec)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
331 prev_last_rec = rec
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
332 return lst, prev_last_rec
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
333
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
334
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
335 def checkRecordIsSnp(rec):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
336 """Checks if this record is a single nucleotide variant, returns bool."""
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
337 logging.info(str(rec.pos))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
338 if len(rec.ref) != 1:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
339 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
340 if rec.alts is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
341 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
342 for allele in rec.alts:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
343 if len(allele) != 1:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
344 return False
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
345 return True
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
346
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
347
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
348 def getSubsampleList(vcfname, ss_count):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
349 """Returns a list of the first `ss_count` individuals in `vcfname`
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
350
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
351 """
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
352
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
353 vcf_o = pysam.VariantFile(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
354 rec = next(vcf_o)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
355 vcf_o.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
356 lst = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
357 for samp in rec.samples:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
358 lst.append(samp)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
359 return lst[:int(ss_count)]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
360
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
361
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
362 def compressVcf(vcfname,forceflag=False,remove=False):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
363 """Runs bgzip and tabix on input VCF file.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
364
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
365 Using the pysam library, this function runs the bgzip and tabix utilities
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
366 on the given input file. By default, this will not overwrite an existing
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
367 zipped file, but will overwrite an existing index. `remove` can be set to
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
368 delete the unzipped file.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
369
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
370 Parameters
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
371 ----------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
372 vcfname : str
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
373 Name of uncompressed VCF file
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
374 forceflag : bool (False)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
375 If true, will overwrite (vcfname).gz if it exists
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
376 remove : bool (False)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
377 If true, will delete uncompressed source file
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
378
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
379 Returns
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
380 -------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
381 cvcfname : str
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
382 Filepath to compressed VCF file
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
383 """
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
384 cvcfname = vcfname+".gz"
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
385 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
386 pysam.tabix_index(cvcfname,preset="vcf",force=True)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
387 if remove:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
388 os.remove(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
389 return cvcfname
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
390
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
391 def vcfRegionName(prefix, region, ext, oneidx=False,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
392 halfopen=True, sep='-'):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
393 chrom = region.toStr(halfopen, oneidx, sep)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
394 return prefix+'_'+chrom+'.'+ext
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
395
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
396 def getRecordsInRegion(region, record_list):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
397 sub_list = []
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
398 for i in range(len(record_list)):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
399 loc = region.containsRecord(record_list[i])
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
400 if loc == "in":
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
401 sub_list.append(record_list[i])
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
402 elif loc == "after":
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
403 break
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
404 return sub_list
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
405
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
406
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
407
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
408
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
409
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
410 #def getVcfReader(args):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
411 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
412 subsamp_fn=None, subsamp_list=None, index=None):
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
413 """Returns a reader for a given input VCF file.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
414
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
415 Given a filename, filetype, compression option, and optional Subsampling
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
416 options, will return a pysam.VariantFile object for iteration and
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
417 a flag as to whether this file is compressed or uncompressed.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
418
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
419 Parameters
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
420 ----------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
421 vcfname : str
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
422 Filename for VCF file. The extension of this file will be used to
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
423 determine whether it is compressed or not unless `var_ext` is set.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
424 var_ext : str (None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
425 Extension for VCF file if it is not included in the filename.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
426 compress_flag : bool (False)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
427 If filetype is uncompressed and this is set to true, will run
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
428 compressVcf function.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
429 subsamp_num : int (None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
430 If set, will randomly select `subsamp_num` individuals (not
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
431 genotypes) from the input VCF file and return a reader with
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
432 only those data.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
433 subsamp_fn : str (None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
434 If set, will return a reader with only data from the samples listed
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
435 in the file provided. Cannot be used with other subsampling options.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
436 subsamp_list : list (None)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
437 If set, will return reader with records containing only
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
438 individuals named in the list. Cannot be used with other subsampling
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
439 options.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
440
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
441 Returns
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
442 -------
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
443 vcf_reader : pysam.VariantFile
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
444 A reader that can be iterated through for variant records. If
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
445 compressed, it will be able to use the pysam fetch method, otherwise
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
446 it must be read through sequentially
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
447 reader_uncompressed : bool
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
448 If True, VCF reader is uncompressed. This means the fetch method
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
449 cannot be used and region access must be done using the
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
450 "getRecordListUnzipped" method.
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
451
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
452 """
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
453 ext = checkFormat(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
454 if ext in ['gzip','other'] :
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
455 raise Exception(('Input file %s is gzip-formatted, must be either '
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
456 'uncompressed or zipped with bgzip' % vcfname))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
457 file_uncompressed = (ext == 'vcf')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
458 reader_uncompressed = (file_uncompressed and not compress_flag)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
459 if compress_flag and file_uncompressed:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
460 vcfname = compressVcf(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
461 #subsamp_list = None
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
462 if subsamp_num is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
463 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
464 raise Exception('Multiple subsampling options called in getVcfReader')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
465 subsamp_list = getSubsampleList(vcfname, subsamp_num)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
466 elif subsamp_fn is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
467 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
468 raise Exception('Multiple subsampling options called in getVcfReader')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
469 subsamp_file = open(subsamp_fn,'r')
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
470 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
471 subsamp_file.close()
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
472 if index is None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
473 vcf_reader = pysam.VariantFile(vcfname)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
474 else:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
475 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
476 if subsamp_list is not None:
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
477 logging.debug('Subsampling %d individuals from VCF file' %
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
478 (len(subsamp_list)))
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
479 vcf_reader.subset_samples(subsamp_list)
1ad9c146ebb4 Uploaded
jaredgk
parents:
diff changeset
480 return vcf_reader, reader_uncompressed