comparison pileup_to_vcf.py @ 2:d6de2d1f4af9

Fix read depth reporting
author Jim Johnson <jj@umn.edu>
date Sun, 03 Mar 2013 20:38:43 -0600
parents bb0ac51f1b02
children aa76c8dd97e6
comparison
equal deleted inserted replaced
1:bb0ac51f1b02 2:d6de2d1f4af9
48 # files 48 # files
49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' ) 49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' )
50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) 50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
51 # filters 51 # filters
52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' ) 52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' )
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' ) 53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' )
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) 54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) 55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) 56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) 57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' )
58 # select columns 58 # select columns
126 126
127 try: 127 try:
128 for linenum,line in enumerate(inputFile): 128 for linenum,line in enumerate(inputFile):
129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) 129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line)
130 fields = line.split('\t') 130 fields = line.split('\t')
131 adp = int(fields[coverage_col]) # count of all reads spanning ref base 131 sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column
132 # Quick check for coverage 132 # Quick check for coverage
133 if adp < min_coverage : 133 if dp_as != 'all' and sdp < min_coverage :
134 continue 134 continue
135 bases = fields[base_call_col] 135 bases = fields[base_call_col]
136 if dp_as != 'all' : 136 m = re.findall(ref_skip_pattern,bases)
137 m = re.findall(ref_skip_pattern,bases) 137 # Quick check for coverage, pileup coverage - reference skips
138 # Quick check for coverage, pileup coverage - reference skips 138 rdp = sdp - (len(m) if m else 0)
139 if adp -len(m) < min_coverage: 139 if dp_as == 'ref' or dp_as == 'qual':
140 if rdp < min_coverage:
140 continue 141 continue
141 quals = fields[base_qual_col] 142 quals = fields[base_qual_col]
142 chrom = fields[chrom_col] 143 chrom = fields[chrom_col]
143 pos = int(fields[pos_col]) 144 pos = int(fields[pos_col])
144 ref = fields[ref_col] 145 ref = fields[ref_col]
146 max_deletion = 0 147 max_deletion = 0
147 insertions = dict() 148 insertions = dict()
148 snps = {'A':0,'C':0,'G':0,'T':0,'N':0} 149 snps = {'A':0,'C':0,'G':0,'T':0,'N':0}
149 bi = 0 # bases index 150 bi = 0 # bases index
150 qi = 0 # quals index 151 qi = 0 # quals index
151 rdp = 0 # read coverage depth 152 adp = 0 # read coverage depth
152 qdp = 0 # read coverage depth with base call quality passing minimum 153 qdp = 0 # read coverage depth with base call quality passing minimum
153 variants = dict() # variant -> count 154 variants = dict() # variant -> count
155 mc = 0 # match count
154 while bi < len(bases): 156 while bi < len(bases):
155 base = bases[bi] 157 base = bases[bi]
156 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) 158 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33)
157 bi += 1 159 bi += 1
158 if base in '<>' : #reference skip (between paired reads) 160 if base in '<>' : #reference skip (between paired reads)
159 pass; 161 pass;
160 elif base in '.,' : # match reference on forward/reverse strand 162 elif base in '.,' : # match reference on forward/reverse strand
161 rdp += 1 163 mc += 1
164 adp += 1
162 qual = ord(quals[qi]) -33 165 qual = ord(quals[qi]) -33
163 qi += 1 166 qi += 1
164 # count match 167 # count match
165 if qual >= min_base_quality: 168 if qual >= min_base_quality:
166 qdp += 1 169 qdp += 1
167 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand 170 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand
168 rdp += 1 171 adp += 1
169 qual = ord(quals[qi]) -33 172 qual = ord(quals[qi]) -33
170 qi += 1 173 qi += 1
171 # record snp variant 174 # record snp variant
172 if qual >= min_base_quality: 175 if qual >= min_base_quality:
173 qdp += 1 176 qdp += 1
174 snps[base.upper()] += 1 177 snps[base.upper()] += 1
175 elif base in '+-' : # indel 178 elif base in '+-' : # indel
176 rdp += 1 179 adp += 1
177 qdp += 1 # no related quality score, so count it
178 m = re.match(indel_len_pattern,bases[bi:]) 180 m = re.match(indel_len_pattern,bases[bi:])
179 indel_len_str = m.group(0) # get the indel len string 181 indel_len_str = m.group(0) # get the indel len string
180 bi += len(indel_len_str) # increment the index by the len of the len string 182 bi += len(indel_len_str) # increment the index by the len of the len string
181 indel_len = int(indel_len_str) 183 indel_len = int(indel_len_str)
182 indel = bases[bi:bi+indel_len].upper() # get the indel bases 184 indel = bases[bi:bi+indel_len].upper() # get the indel bases
199 elif base == '^' : #start of read (followed by read quality char) 201 elif base == '^' : #start of read (followed by read quality char)
200 read_mapping_qual = ord(bases[bi]) - 33 202 read_mapping_qual = ord(bases[bi]) - 33
201 bi += 1 203 bi += 1
202 elif base == '$' : #end of read 204 elif base == '$' : #end of read
203 pass # no associated quality in either bases or quals 205 pass # no associated quality in either bases or quals
204 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp 206 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp
207 if debug: print >> sys.stderr, "match count: %d" % mc
205 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) 208 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions)
206 if dp < max(1,min_coverage) : 209 if dp < max(1,min_coverage) :
207 continue 210 continue
208 # output any variants that meet the depth coverage requirement 211 # output any variants that meet the depth coverage requirement
209 vcf_pos = pos 212 vcf_pos = pos