annotate pileup_to_vcf.py @ 9:c0a6e8f595ec default tip

Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
author Jim Johnson <jj@umn.edu>
date Thu, 11 Apr 2013 10:28:10 -0500
parents e77ab15bbce9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
2 """
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
3 #
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
4 #------------------------------------------------------------------------------
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
5 # University of Minnesota
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
6 # Copyright 2012, Regents of the University of Minnesota
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
7 #------------------------------------------------------------------------------
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
8 # Author:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
9 #
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
10 # James E Johnson
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
11 # Jesse Erdmann
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
12 #
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
13 #------------------------------------------------------------------------------
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
14 """
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
15
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
16 """
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
17 Generate a VCF file from a samtools pileup
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
18 filtering on read coverage, base call quality, and allele frequency
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
19
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
20 Pileup Format
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
21 http://samtools.sourceforge.net/pileup.shtml
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
22 Columns:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
23 chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
24
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
25 VCF Format
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
26 http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
27 The header line names the 8 fixed, mandatory columns. These columns are as follows:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
28 CHROM POS ID REF ALT QUAL FILTER INFO
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
29 """
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
30
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
31 import sys,re,os.path
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
32 import optparse
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
33 from optparse import OptionParser
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
34
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
35 vcf_header = """\
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
36 ##fileformat=VCFv4.0
9
c0a6e8f595ec Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents: 7
diff changeset
37 ##source=pileup_to_vcf.pyV1.2
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">
7
e77ab15bbce9 Fix VCF header
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
39 ##INFO=<ID=SAF,Number=.,Type=Float,Description=\"Specific Allele Frequency\">
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\">
7
e77ab15bbce9 Fix VCF header
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
41 ##FILTER=<ID=SAF,Description=\"Allele frequency of at least %s with base quality minimum %d\">
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
42 #CHROM POS ID REF ALT QUAL FILTER INFO\
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
43 """
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
44
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
45 def __main__():
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
46 #Parse Command Line
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
47 parser = optparse.OptionParser()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
48 # files
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
51 # filters
1
bb0ac51f1b02 Handle delection char: *
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' )
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' )
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' )
9
c0a6e8f595ec Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents: 7
diff changeset
58 # ID to use
c0a6e8f595ec Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents: 7
diff changeset
59 parser.add_option( '-I', '--id', dest='id', default=None, help='The value for the VCF ID field' )
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
60 # select columns
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
61 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
62 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
63 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
64 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
65 parser.add_option( '-B', '--base_call_col', type='int', default='5', dest='base_call_col', help='The ordinal position (starting with 1) of the base call column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
66 parser.add_option( '-Q', '--base_qual_col', type='int', default='6', dest='base_qual_col', help='The ordinal position (starting with 1) of the base quality column' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
67 # debug
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
68 parser.add_option( '-d', '--debug', action="store_true", dest='debug', default=False, help='Print Debug messages' )
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
69 #
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
70 (options, args) = parser.parse_args()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
71
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
72 debug = options.debug if options.debug else False
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
73
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
74 #Coverage must have at least a total depth of min_coverage to be considered
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
75 min_coverage = options.min_coverage
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
76 #Count depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
77 dp_as = options.report_depth
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
78 #Base qualities must be at least the min_base_quality value to be counted
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
79 min_base_quality = options.min_base_quality
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
80 #Coverage must be more than this pct in support of the variant, e.g. variant support / total coverage > min_allele_freq
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
81 min_allele_freq = options.min_allele_freq
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
82 #Allow multiple variants to be reported for a position, min_allele_freq must be less than 0.5 for this to have effect.
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
83 allow_multiples = options.allow_multiples
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
84 # Whether to report indels
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
85 report_indels = not options.snps_only
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
86
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
87 # Check for 1-based column options, and subtract 1 for python array index
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
88 #Which column (count starting with 1) contains the chromosome (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
89 chrom_col = options.chrom_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
90 #Which column (count starting with 1) contains the position (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
91 pos_col = options.pos_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
92 #Which column (count starting with 1) contains the reference base (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
93 ref_col = options.ref_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
94 #Which column (count starting with 1) contains the total coverage for the position (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
95 coverage_col = options.coverage_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
96 #Which column (count starting with 1) contains the base calls for the position (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
97 base_call_col = options.base_call_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
98 #Which column (count starting with 1) contains the base call qualities for the position (Default assumes 6-col pileup or 12-col filtered pileup)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
99 base_qual_col = options.base_qual_col - 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
100
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
101 # pileup input
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
102 if options.input != None:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
103 try:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
104 inputPath = os.path.abspath(options.input)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
105 inputFile = open(inputPath, 'r')
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
106 except Exception, e:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
107 print >> sys.stderr, "failed: %s" % e
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
108 exit(2)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
109 else:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
110 inputFile = sys.stdin
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
111 # vcf output
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
112 if options.output != None:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
113 try:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
114 outputPath = os.path.abspath(options.output)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
115 outputFile = open(outputPath, 'w')
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
116 except Exception, e:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
117 print >> sys.stderr, "failed: %s" % e
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
118 exit(3)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
119 else:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
120 outputFile = sys.stdout
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
121
9
c0a6e8f595ec Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents: 7
diff changeset
122 vcf_id = options.id if options.id else "."
c0a6e8f595ec Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents: 7
diff changeset
123
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
124 indel_len_pattern = '([1-9][0-9]*)'
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
125 ref_skip_pattern = '[<>]'
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
126
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
127 SAF_header = "SAF";
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
128
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
129 print >> outputFile, vcf_header % (min_coverage,min_allele_freq,min_base_quality)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
130
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
131 try:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
132 for linenum,line in enumerate(inputFile):
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
133 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
134 fields = line.split('\t')
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
135 sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
136 # Quick check for coverage
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
137 if dp_as != 'all' and sdp < min_coverage :
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
138 continue
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
139 bases = fields[base_call_col]
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
140 m = re.findall(ref_skip_pattern,bases)
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
141 # Quick check for coverage, pileup coverage - reference skips
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
142 rdp = sdp - (len(m) if m else 0)
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
143 if dp_as == 'ref' or dp_as == 'qual':
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
144 if rdp < min_coverage:
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
145 continue
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
146 quals = fields[base_qual_col]
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
147 chrom = fields[chrom_col]
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
148 pos = int(fields[pos_col])
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
149 ref = fields[ref_col]
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
150 deletions = dict()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
151 max_deletion = 0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
152 insertions = dict()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
153 snps = {'A':0,'C':0,'G':0,'T':0,'N':0}
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
154 bi = 0 # bases index
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
155 qi = 0 # quals index
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
156 adp = 0 # read coverage depth
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
157 qdp = 0 # read coverage depth with base call quality passing minimum
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
158 variants = dict() # variant -> count
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
159 mc = 0 # match count
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
160 while bi < len(bases):
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
161 base = bases[bi]
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
162 # if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33)
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
163 bi += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
164 if base in '<>' : #reference skip (between paired reads)
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
165 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
3
aa76c8dd97e6 Reference skips have a base quality score
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
166 qi += 1
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
167 elif base in '.,' : # match reference on forward/reverse strand
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
168 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
169 mc += 1
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
170 adp += 1
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
171 qual = ord(quals[qi]) -33
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
172 qi += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
173 # count match
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
174 if qual >= min_base_quality:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
175 qdp += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
176 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
177 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
178 adp += 1
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
179 qual = ord(quals[qi]) -33
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
180 qi += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
181 # record snp variant
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
182 if qual >= min_base_quality:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
183 qdp += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
184 snps[base.upper()] += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
185 elif base in '+-' : # indel
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
186 adp += 1
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
187 m = re.match(indel_len_pattern,bases[bi:])
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
188 indel_len_str = m.group(0) # get the indel len string
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
189 bi += len(indel_len_str) # increment the index by the len of the len string
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
190 indel_len = int(indel_len_str)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
191 indel = bases[bi:bi+indel_len].upper() # get the indel bases
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
192 if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-2,base,indel)
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
193 bi += indel_len # increment the index by the len of the indel
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
194 if base == '+':
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
195 # record insertion variant
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
196 if indel in insertions:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
197 insertions[indel] += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
198 else:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
199 insertions[indel] = 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
200 elif base == '-':
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
201 # record deletion variant
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
202 max_deletion = max(indel_len,max_deletion)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
203 if indel in deletions:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
204 deletions[indel] += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
205 else:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
206 deletions[indel] = 1
1
bb0ac51f1b02 Handle delection char: *
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
207 elif base == '*' : # a deletion from a prior base
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
208 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
1
bb0ac51f1b02 Handle delection char: *
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
209 qi += 1
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
210 elif base == '^' : #start of read (followed by read quality char)
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
211 if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-1,base,bases[bi])
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
212 read_mapping_qual = ord(bases[bi]) - 33
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
213 bi += 1
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
214 elif base == '$' : #end of read
4
fafa105e5f58 Fix base quality
Jim Johnson <jj@umn.edu>
parents: 3
diff changeset
215 if debug: print >> sys.stderr, "%3d\t%s" % (bi-1,base)
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
216 pass # no associated quality in either bases or quals
2
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
217 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp
d6de2d1f4af9 Fix read depth reporting
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
218 if debug: print >> sys.stderr, "match count: %d" % mc
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
219 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions)
1
bb0ac51f1b02 Handle delection char: *
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
220 if dp < max(1,min_coverage) :
bb0ac51f1b02 Handle delection char: *
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
221 continue
0
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
222 # output any variants that meet the depth coverage requirement
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
223 vcf_pos = pos
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
224 vcf_ref = ref
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
225 suffix = ''
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
226 alts = []
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
227 safs = []
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
228 # If we have any deletions to report, need to modify the ref string
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
229 if debug: print >> sys.stderr, "max deletions: %s" % deletions
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
230 if report_indels:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
231 for k,v in deletions.items():
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
232 saf = v/float(dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
233 if saf >= min_allele_freq:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
234 if len(k) > len(suffix):
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
235 suffix = k
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
236 vcf_ref = (ref + suffix).upper()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
237 if debug: print >> sys.stderr, "snps: %s" % snps
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
238 for k,v in snps.items():
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
239 if debug: print >> sys.stderr, "snp: %s %d" % (k,v)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
240 saf = v/float(dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
241 if saf >= min_allele_freq:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
242 alts.append(k + suffix)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
243 safs.append(saf)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
244 if debug: print >> sys.stderr, "insertions: %s" % insertions
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
245 if report_indels:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
246 for k,v in insertions.items():
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
247 saf = v/float(dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
248 if saf >= min_allele_freq:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
249 alts.append(ref + k + suffix)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
250 safs.append(saf)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
251 if debug: print >> sys.stderr, "deletions: %s" % deletions
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
252 for k,v in deletions.items():
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
253 saf = v/float(dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
254 if saf >= min_allele_freq:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
255 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
256 safs.append(saf)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
257 if len(alts) > 0:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
258 vcf_qual = "."
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
259 vcf_filter = "PASS"
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
260 # if not allow_multiples, report only the most freq alt
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
261 if not allow_multiples and len(alts) > 1:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
262 max_saf = max(safs)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
263 for i,v in enumerate(safs):
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
264 if v == max_saf:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
265 vcf_alt = alts[i]
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
266 info = "SAF=%f;DP=%d" % (max_saf,dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
267 # if allow_multiples
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
268 else:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
269 vcf_alt = ','.join(alts)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
270 isafs = []
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
271 for saf in safs:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
272 isafs.append("SAF=%f" % saf)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
273 info = "%s;DP=%d" % (','.join(isafs),dp)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
274 print >> outputFile,"%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
275 except Exception, e:
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
276 print >> sys.stderr, "failed: %s" % e
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
277 exit(1)
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
278
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
279 if __name__ == "__main__": __main__()
3890f8ba0e4d Uploaded
jjohnson
parents:
diff changeset
280