0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2012, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 # Jesse Erdmann
|
|
12 #
|
|
13 #------------------------------------------------------------------------------
|
|
14 """
|
|
15
|
|
16 """
|
|
17 Generate a VCF file from a samtools pileup
|
|
18 filtering on read coverage, base call quality, and allele frequency
|
|
19
|
|
20 Pileup Format
|
|
21 http://samtools.sourceforge.net/pileup.shtml
|
|
22 Columns:
|
|
23 chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities
|
|
24
|
|
25 VCF Format
|
|
26 http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
|
|
27 The header line names the 8 fixed, mandatory columns. These columns are as follows:
|
|
28 CHROM POS ID REF ALT QUAL FILTER INFO
|
|
29 """
|
|
30
|
|
31 import sys,re,os.path
|
|
32 import optparse
|
|
33 from optparse import OptionParser
|
|
34
|
|
35 vcf_header = """\
|
|
36 ##fileformat=VCFv4.0
|
|
37 ##source=pileup_to_vcf.pyV1.0
|
|
38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">
|
|
39 ##INFO=<ID=$SAF_header,Number=.,Type=Float,Description=\"Specific Allele Frequency\">
|
|
40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\">
|
|
41 ##FILTER=<ID=$SAF_header,Description=\"Allele frequency of at least %s with base quality minimum %d\">
|
|
42 #CHROM POS ID REF ALT QUAL FILTER INFO\
|
|
43 """
|
|
44
|
|
45 def __main__():
|
|
46 #Parse Command Line
|
|
47 parser = optparse.OptionParser()
|
|
48 # files
|
|
49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' )
|
|
50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
|
|
51 # filters
|
|
52 parser.add_option( '-c', '--min_coverage', type='int', default='0', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 0]' )
|
|
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' )
|
|
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
|
|
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
|
|
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
|
|
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' )
|
|
58 # select columns
|
|
59 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' )
|
|
60 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' )
|
|
61 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' )
|
|
62 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' )
|
|
63 parser.add_option( '-B', '--base_call_col', type='int', default='5', dest='base_call_col', help='The ordinal position (starting with 1) of the base call column' )
|
|
64 parser.add_option( '-Q', '--base_qual_col', type='int', default='6', dest='base_qual_col', help='The ordinal position (starting with 1) of the base quality column' )
|
|
65 # debug
|
|
66 parser.add_option( '-d', '--debug', action="store_true", dest='debug', default=False, help='Print Debug messages' )
|
|
67 #
|
|
68 (options, args) = parser.parse_args()
|
|
69
|
|
70 debug = options.debug if options.debug else False
|
|
71
|
|
72 #Coverage must have at least a total depth of min_coverage to be considered
|
|
73 min_coverage = options.min_coverage
|
|
74 #Count depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality
|
|
75 dp_as = options.report_depth
|
|
76 #Base qualities must be at least the min_base_quality value to be counted
|
|
77 min_base_quality = options.min_base_quality
|
|
78 #Coverage must be more than this pct in support of the variant, e.g. variant support / total coverage > min_allele_freq
|
|
79 min_allele_freq = options.min_allele_freq
|
|
80 #Allow multiple variants to be reported for a position, min_allele_freq must be less than 0.5 for this to have effect.
|
|
81 allow_multiples = options.allow_multiples
|
|
82 # Whether to report indels
|
|
83 report_indels = not options.snps_only
|
|
84
|
|
85 # Check for 1-based column options, and subtract 1 for python array index
|
|
86 #Which column (count starting with 1) contains the chromosome (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
87 chrom_col = options.chrom_col - 1
|
|
88 #Which column (count starting with 1) contains the position (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
89 pos_col = options.pos_col - 1
|
|
90 #Which column (count starting with 1) contains the reference base (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
91 ref_col = options.ref_col - 1
|
|
92 #Which column (count starting with 1) contains the total coverage for the position (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
93 coverage_col = options.coverage_col - 1
|
|
94 #Which column (count starting with 1) contains the base calls for the position (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
95 base_call_col = options.base_call_col - 1
|
|
96 #Which column (count starting with 1) contains the base call qualities for the position (Default assumes 6-col pileup or 12-col filtered pileup)
|
|
97 base_qual_col = options.base_qual_col - 1
|
|
98
|
|
99 # pileup input
|
|
100 if options.input != None:
|
|
101 try:
|
|
102 inputPath = os.path.abspath(options.input)
|
|
103 inputFile = open(inputPath, 'r')
|
|
104 except Exception, e:
|
|
105 print >> sys.stderr, "failed: %s" % e
|
|
106 exit(2)
|
|
107 else:
|
|
108 inputFile = sys.stdin
|
|
109 # vcf output
|
|
110 if options.output != None:
|
|
111 try:
|
|
112 outputPath = os.path.abspath(options.output)
|
|
113 outputFile = open(outputPath, 'w')
|
|
114 except Exception, e:
|
|
115 print >> sys.stderr, "failed: %s" % e
|
|
116 exit(3)
|
|
117 else:
|
|
118 outputFile = sys.stdout
|
|
119
|
|
120 indel_len_pattern = '([1-9][0-9]*)'
|
|
121 ref_skip_pattern = '[<>]'
|
|
122
|
|
123 SAF_header = "SAF";
|
|
124
|
|
125 print >> outputFile, vcf_header % (min_coverage,min_allele_freq,min_base_quality)
|
|
126
|
|
127 try:
|
|
128 for linenum,line in enumerate(inputFile):
|
|
129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line)
|
|
130 fields = line.split('\t')
|
|
131 adp = int(fields[coverage_col]) # count of all reads spanning ref base
|
|
132 # Quick check for coverage
|
|
133 if adp < min_coverage :
|
|
134 continue
|
|
135 bases = fields[base_call_col]
|
|
136 if dp_as != 'all' :
|
|
137 m = re.findall(ref_skip_pattern,bases)
|
|
138 # Quick check for coverage, pileup coverage - reference skips
|
|
139 if adp -len(m) < min_coverage:
|
|
140 continue
|
|
141 quals = fields[base_qual_col]
|
|
142 chrom = fields[chrom_col]
|
|
143 pos = int(fields[pos_col])
|
|
144 ref = fields[ref_col]
|
|
145 deletions = dict()
|
|
146 max_deletion = 0
|
|
147 insertions = dict()
|
|
148 snps = {'A':0,'C':0,'G':0,'T':0,'N':0}
|
|
149 bi = 0 # bases index
|
|
150 qi = 0 # quals index
|
|
151 rdp = 0 # read coverage depth
|
|
152 qdp = 0 # read coverage depth with base call quality passing minimum
|
|
153 variants = dict() # variant -> count
|
|
154 while bi < len(bases):
|
|
155 base = bases[bi]
|
|
156 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33)
|
|
157 bi += 1
|
|
158 if base in '<>' : #reference skip (between paired reads)
|
|
159 pass;
|
|
160 elif base in '.,' : # match reference on forward/reverse strand
|
|
161 rdp += 1
|
|
162 qual = ord(quals[qi]) -33
|
|
163 qi += 1
|
|
164 # count match
|
|
165 if qual >= min_base_quality:
|
|
166 qdp += 1
|
|
167 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand
|
|
168 rdp += 1
|
|
169 qual = ord(quals[qi]) -33
|
|
170 qi += 1
|
|
171 # record snp variant
|
|
172 if qual >= min_base_quality:
|
|
173 qdp += 1
|
|
174 snps[base.upper()] += 1
|
|
175 elif base in '+-' : # indel
|
|
176 rdp += 1
|
|
177 qdp += 1 # no related quality score, so count it
|
|
178 m = re.match(indel_len_pattern,bases[bi:])
|
|
179 indel_len_str = m.group(0) # get the indel len string
|
|
180 bi += len(indel_len_str) # increment the index by the len of the len string
|
|
181 indel_len = int(indel_len_str)
|
|
182 indel = bases[bi:bi+indel_len].upper() # get the indel bases
|
|
183 bi += indel_len # increment the index by the len of the indel
|
|
184 if base == '+':
|
|
185 # record insertion variant
|
|
186 if indel in insertions:
|
|
187 insertions[indel] += 1
|
|
188 else:
|
|
189 insertions[indel] = 1
|
|
190 elif base == '-':
|
|
191 # record deletion variant
|
|
192 max_deletion = max(indel_len,max_deletion)
|
|
193 if indel in deletions:
|
|
194 deletions[indel] += 1
|
|
195 else:
|
|
196 deletions[indel] = 1
|
|
197 elif base == '^' : #start of read (followed by read quality char)
|
|
198 read_mapping_qual = ord(bases[bi]) - 33
|
|
199 bi += 1
|
|
200 elif base == '$' : #end of read
|
|
201 pass # no associated quality in either bases or quals
|
|
202 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp
|
|
203 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions)
|
|
204 # output any variants that meet the depth coverage requirement
|
|
205 vcf_pos = pos
|
|
206 vcf_ref = ref
|
|
207 suffix = ''
|
|
208 alts = []
|
|
209 safs = []
|
|
210 # If we have any deletions to report, need to modify the ref string
|
|
211 if debug: print >> sys.stderr, "max deletions: %s" % deletions
|
|
212 if report_indels:
|
|
213 for k,v in deletions.items():
|
|
214 saf = v/float(dp)
|
|
215 if saf >= min_allele_freq:
|
|
216 if len(k) > len(suffix):
|
|
217 suffix = k
|
|
218 vcf_ref = (ref + suffix).upper()
|
|
219 if debug: print >> sys.stderr, "snps: %s" % snps
|
|
220 for k,v in snps.items():
|
|
221 if debug: print >> sys.stderr, "snp: %s %d" % (k,v)
|
|
222 saf = v/float(dp)
|
|
223 if saf >= min_allele_freq:
|
|
224 alts.append(k + suffix)
|
|
225 safs.append(saf)
|
|
226 if debug: print >> sys.stderr, "insertions: %s" % insertions
|
|
227 if report_indels:
|
|
228 for k,v in insertions.items():
|
|
229 saf = v/float(dp)
|
|
230 if saf >= min_allele_freq:
|
|
231 alts.append(ref + k + suffix)
|
|
232 safs.append(saf)
|
|
233 if debug: print >> sys.stderr, "deletions: %s" % deletions
|
|
234 for k,v in deletions.items():
|
|
235 saf = v/float(dp)
|
|
236 if saf >= min_allele_freq:
|
|
237 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this
|
|
238 safs.append(saf)
|
|
239 if len(alts) > 0:
|
|
240 vcf_id = "."
|
|
241 vcf_qual = "."
|
|
242 vcf_filter = "PASS"
|
|
243 # if not allow_multiples, report only the most freq alt
|
|
244 if not allow_multiples and len(alts) > 1:
|
|
245 max_saf = max(safs)
|
|
246 for i,v in enumerate(safs):
|
|
247 if v == max_saf:
|
|
248 vcf_alt = alts[i]
|
|
249 info = "SAF=%f;DP=%d" % (max_saf,dp)
|
|
250 # if allow_multiples
|
|
251 else:
|
|
252 vcf_alt = ','.join(alts)
|
|
253 isafs = []
|
|
254 for saf in safs:
|
|
255 isafs.append("SAF=%f" % saf)
|
|
256 info = "%s;DP=%d" % (','.join(isafs),dp)
|
|
257 print >> outputFile,"%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info)
|
|
258 except Exception, e:
|
|
259 print >> sys.stderr, "failed: %s" % e
|
|
260 exit(1)
|
|
261
|
|
262 if __name__ == "__main__": __main__()
|
|
263
|