Mercurial > repos > jjohnson > pileup_to_vcf
comparison pileup_to_vcf.py @ 0:3890f8ba0e4d
Uploaded
author | jjohnson |
---|---|
date | Mon, 18 Feb 2013 11:32:50 -0500 |
parents | |
children | bb0ac51f1b02 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3890f8ba0e4d |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2012, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # Jesse Erdmann | |
12 # | |
13 #------------------------------------------------------------------------------ | |
14 """ | |
15 | |
16 """ | |
17 Generate a VCF file from a samtools pileup | |
18 filtering on read coverage, base call quality, and allele frequency | |
19 | |
20 Pileup Format | |
21 http://samtools.sourceforge.net/pileup.shtml | |
22 Columns: | |
23 chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities | |
24 | |
25 VCF Format | |
26 http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 | |
27 The header line names the 8 fixed, mandatory columns. These columns are as follows: | |
28 CHROM POS ID REF ALT QUAL FILTER INFO | |
29 """ | |
30 | |
31 import sys,re,os.path | |
32 import optparse | |
33 from optparse import OptionParser | |
34 | |
35 vcf_header = """\ | |
36 ##fileformat=VCFv4.0 | |
37 ##source=pileup_to_vcf.pyV1.0 | |
38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\"> | |
39 ##INFO=<ID=$SAF_header,Number=.,Type=Float,Description=\"Specific Allele Frequency\"> | |
40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\"> | |
41 ##FILTER=<ID=$SAF_header,Description=\"Allele frequency of at least %s with base quality minimum %d\"> | |
42 #CHROM POS ID REF ALT QUAL FILTER INFO\ | |
43 """ | |
44 | |
45 def __main__(): | |
46 #Parse Command Line | |
47 parser = optparse.OptionParser() | |
48 # files | |
49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' ) | |
50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) | |
51 # filters | |
52 parser.add_option( '-c', '--min_coverage', type='int', default='0', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 0]' ) | |
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' ) | |
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) | |
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) | |
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) | |
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) | |
58 # select columns | |
59 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' ) | |
60 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' ) | |
61 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' ) | |
62 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' ) | |
63 parser.add_option( '-B', '--base_call_col', type='int', default='5', dest='base_call_col', help='The ordinal position (starting with 1) of the base call column' ) | |
64 parser.add_option( '-Q', '--base_qual_col', type='int', default='6', dest='base_qual_col', help='The ordinal position (starting with 1) of the base quality column' ) | |
65 # debug | |
66 parser.add_option( '-d', '--debug', action="store_true", dest='debug', default=False, help='Print Debug messages' ) | |
67 # | |
68 (options, args) = parser.parse_args() | |
69 | |
70 debug = options.debug if options.debug else False | |
71 | |
72 #Coverage must have at least a total depth of min_coverage to be considered | |
73 min_coverage = options.min_coverage | |
74 #Count depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality | |
75 dp_as = options.report_depth | |
76 #Base qualities must be at least the min_base_quality value to be counted | |
77 min_base_quality = options.min_base_quality | |
78 #Coverage must be more than this pct in support of the variant, e.g. variant support / total coverage > min_allele_freq | |
79 min_allele_freq = options.min_allele_freq | |
80 #Allow multiple variants to be reported for a position, min_allele_freq must be less than 0.5 for this to have effect. | |
81 allow_multiples = options.allow_multiples | |
82 # Whether to report indels | |
83 report_indels = not options.snps_only | |
84 | |
85 # Check for 1-based column options, and subtract 1 for python array index | |
86 #Which column (count starting with 1) contains the chromosome (Default assumes 6-col pileup or 12-col filtered pileup) | |
87 chrom_col = options.chrom_col - 1 | |
88 #Which column (count starting with 1) contains the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
89 pos_col = options.pos_col - 1 | |
90 #Which column (count starting with 1) contains the reference base (Default assumes 6-col pileup or 12-col filtered pileup) | |
91 ref_col = options.ref_col - 1 | |
92 #Which column (count starting with 1) contains the total coverage for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
93 coverage_col = options.coverage_col - 1 | |
94 #Which column (count starting with 1) contains the base calls for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
95 base_call_col = options.base_call_col - 1 | |
96 #Which column (count starting with 1) contains the base call qualities for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
97 base_qual_col = options.base_qual_col - 1 | |
98 | |
99 # pileup input | |
100 if options.input != None: | |
101 try: | |
102 inputPath = os.path.abspath(options.input) | |
103 inputFile = open(inputPath, 'r') | |
104 except Exception, e: | |
105 print >> sys.stderr, "failed: %s" % e | |
106 exit(2) | |
107 else: | |
108 inputFile = sys.stdin | |
109 # vcf output | |
110 if options.output != None: | |
111 try: | |
112 outputPath = os.path.abspath(options.output) | |
113 outputFile = open(outputPath, 'w') | |
114 except Exception, e: | |
115 print >> sys.stderr, "failed: %s" % e | |
116 exit(3) | |
117 else: | |
118 outputFile = sys.stdout | |
119 | |
120 indel_len_pattern = '([1-9][0-9]*)' | |
121 ref_skip_pattern = '[<>]' | |
122 | |
123 SAF_header = "SAF"; | |
124 | |
125 print >> outputFile, vcf_header % (min_coverage,min_allele_freq,min_base_quality) | |
126 | |
127 try: | |
128 for linenum,line in enumerate(inputFile): | |
129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) | |
130 fields = line.split('\t') | |
131 adp = int(fields[coverage_col]) # count of all reads spanning ref base | |
132 # Quick check for coverage | |
133 if adp < min_coverage : | |
134 continue | |
135 bases = fields[base_call_col] | |
136 if dp_as != 'all' : | |
137 m = re.findall(ref_skip_pattern,bases) | |
138 # Quick check for coverage, pileup coverage - reference skips | |
139 if adp -len(m) < min_coverage: | |
140 continue | |
141 quals = fields[base_qual_col] | |
142 chrom = fields[chrom_col] | |
143 pos = int(fields[pos_col]) | |
144 ref = fields[ref_col] | |
145 deletions = dict() | |
146 max_deletion = 0 | |
147 insertions = dict() | |
148 snps = {'A':0,'C':0,'G':0,'T':0,'N':0} | |
149 bi = 0 # bases index | |
150 qi = 0 # quals index | |
151 rdp = 0 # read coverage depth | |
152 qdp = 0 # read coverage depth with base call quality passing minimum | |
153 variants = dict() # variant -> count | |
154 while bi < len(bases): | |
155 base = bases[bi] | |
156 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) | |
157 bi += 1 | |
158 if base in '<>' : #reference skip (between paired reads) | |
159 pass; | |
160 elif base in '.,' : # match reference on forward/reverse strand | |
161 rdp += 1 | |
162 qual = ord(quals[qi]) -33 | |
163 qi += 1 | |
164 # count match | |
165 if qual >= min_base_quality: | |
166 qdp += 1 | |
167 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand | |
168 rdp += 1 | |
169 qual = ord(quals[qi]) -33 | |
170 qi += 1 | |
171 # record snp variant | |
172 if qual >= min_base_quality: | |
173 qdp += 1 | |
174 snps[base.upper()] += 1 | |
175 elif base in '+-' : # indel | |
176 rdp += 1 | |
177 qdp += 1 # no related quality score, so count it | |
178 m = re.match(indel_len_pattern,bases[bi:]) | |
179 indel_len_str = m.group(0) # get the indel len string | |
180 bi += len(indel_len_str) # increment the index by the len of the len string | |
181 indel_len = int(indel_len_str) | |
182 indel = bases[bi:bi+indel_len].upper() # get the indel bases | |
183 bi += indel_len # increment the index by the len of the indel | |
184 if base == '+': | |
185 # record insertion variant | |
186 if indel in insertions: | |
187 insertions[indel] += 1 | |
188 else: | |
189 insertions[indel] = 1 | |
190 elif base == '-': | |
191 # record deletion variant | |
192 max_deletion = max(indel_len,max_deletion) | |
193 if indel in deletions: | |
194 deletions[indel] += 1 | |
195 else: | |
196 deletions[indel] = 1 | |
197 elif base == '^' : #start of read (followed by read quality char) | |
198 read_mapping_qual = ord(bases[bi]) - 33 | |
199 bi += 1 | |
200 elif base == '$' : #end of read | |
201 pass # no associated quality in either bases or quals | |
202 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp | |
203 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) | |
204 # output any variants that meet the depth coverage requirement | |
205 vcf_pos = pos | |
206 vcf_ref = ref | |
207 suffix = '' | |
208 alts = [] | |
209 safs = [] | |
210 # If we have any deletions to report, need to modify the ref string | |
211 if debug: print >> sys.stderr, "max deletions: %s" % deletions | |
212 if report_indels: | |
213 for k,v in deletions.items(): | |
214 saf = v/float(dp) | |
215 if saf >= min_allele_freq: | |
216 if len(k) > len(suffix): | |
217 suffix = k | |
218 vcf_ref = (ref + suffix).upper() | |
219 if debug: print >> sys.stderr, "snps: %s" % snps | |
220 for k,v in snps.items(): | |
221 if debug: print >> sys.stderr, "snp: %s %d" % (k,v) | |
222 saf = v/float(dp) | |
223 if saf >= min_allele_freq: | |
224 alts.append(k + suffix) | |
225 safs.append(saf) | |
226 if debug: print >> sys.stderr, "insertions: %s" % insertions | |
227 if report_indels: | |
228 for k,v in insertions.items(): | |
229 saf = v/float(dp) | |
230 if saf >= min_allele_freq: | |
231 alts.append(ref + k + suffix) | |
232 safs.append(saf) | |
233 if debug: print >> sys.stderr, "deletions: %s" % deletions | |
234 for k,v in deletions.items(): | |
235 saf = v/float(dp) | |
236 if saf >= min_allele_freq: | |
237 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this | |
238 safs.append(saf) | |
239 if len(alts) > 0: | |
240 vcf_id = "." | |
241 vcf_qual = "." | |
242 vcf_filter = "PASS" | |
243 # if not allow_multiples, report only the most freq alt | |
244 if not allow_multiples and len(alts) > 1: | |
245 max_saf = max(safs) | |
246 for i,v in enumerate(safs): | |
247 if v == max_saf: | |
248 vcf_alt = alts[i] | |
249 info = "SAF=%f;DP=%d" % (max_saf,dp) | |
250 # if allow_multiples | |
251 else: | |
252 vcf_alt = ','.join(alts) | |
253 isafs = [] | |
254 for saf in safs: | |
255 isafs.append("SAF=%f" % saf) | |
256 info = "%s;DP=%d" % (','.join(isafs),dp) | |
257 print >> outputFile,"%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info) | |
258 except Exception, e: | |
259 print >> sys.stderr, "failed: %s" % e | |
260 exit(1) | |
261 | |
262 if __name__ == "__main__": __main__() | |
263 |