Mercurial > repos > jjohnson > pileup_to_vcf
comparison pileup_to_vcf.py @ 2:d6de2d1f4af9
Fix read depth reporting
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Sun, 03 Mar 2013 20:38:43 -0600 |
parents | bb0ac51f1b02 |
children | aa76c8dd97e6 |
comparison
equal
deleted
inserted
replaced
1:bb0ac51f1b02 | 2:d6de2d1f4af9 |
---|---|
48 # files | 48 # files |
49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' ) | 49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' ) |
50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) | 50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) |
51 # filters | 51 # filters |
52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' ) | 52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' ) |
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' ) | 53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' ) |
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) | 54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) |
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) | 55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) |
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) | 56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) |
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) | 57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) |
58 # select columns | 58 # select columns |
126 | 126 |
127 try: | 127 try: |
128 for linenum,line in enumerate(inputFile): | 128 for linenum,line in enumerate(inputFile): |
129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) | 129 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) |
130 fields = line.split('\t') | 130 fields = line.split('\t') |
131 adp = int(fields[coverage_col]) # count of all reads spanning ref base | 131 sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column |
132 # Quick check for coverage | 132 # Quick check for coverage |
133 if adp < min_coverage : | 133 if dp_as != 'all' and sdp < min_coverage : |
134 continue | 134 continue |
135 bases = fields[base_call_col] | 135 bases = fields[base_call_col] |
136 if dp_as != 'all' : | 136 m = re.findall(ref_skip_pattern,bases) |
137 m = re.findall(ref_skip_pattern,bases) | 137 # Quick check for coverage, pileup coverage - reference skips |
138 # Quick check for coverage, pileup coverage - reference skips | 138 rdp = sdp - (len(m) if m else 0) |
139 if adp -len(m) < min_coverage: | 139 if dp_as == 'ref' or dp_as == 'qual': |
140 if rdp < min_coverage: | |
140 continue | 141 continue |
141 quals = fields[base_qual_col] | 142 quals = fields[base_qual_col] |
142 chrom = fields[chrom_col] | 143 chrom = fields[chrom_col] |
143 pos = int(fields[pos_col]) | 144 pos = int(fields[pos_col]) |
144 ref = fields[ref_col] | 145 ref = fields[ref_col] |
146 max_deletion = 0 | 147 max_deletion = 0 |
147 insertions = dict() | 148 insertions = dict() |
148 snps = {'A':0,'C':0,'G':0,'T':0,'N':0} | 149 snps = {'A':0,'C':0,'G':0,'T':0,'N':0} |
149 bi = 0 # bases index | 150 bi = 0 # bases index |
150 qi = 0 # quals index | 151 qi = 0 # quals index |
151 rdp = 0 # read coverage depth | 152 adp = 0 # read coverage depth |
152 qdp = 0 # read coverage depth with base call quality passing minimum | 153 qdp = 0 # read coverage depth with base call quality passing minimum |
153 variants = dict() # variant -> count | 154 variants = dict() # variant -> count |
155 mc = 0 # match count | |
154 while bi < len(bases): | 156 while bi < len(bases): |
155 base = bases[bi] | 157 base = bases[bi] |
156 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) | 158 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) |
157 bi += 1 | 159 bi += 1 |
158 if base in '<>' : #reference skip (between paired reads) | 160 if base in '<>' : #reference skip (between paired reads) |
159 pass; | 161 pass; |
160 elif base in '.,' : # match reference on forward/reverse strand | 162 elif base in '.,' : # match reference on forward/reverse strand |
161 rdp += 1 | 163 mc += 1 |
164 adp += 1 | |
162 qual = ord(quals[qi]) -33 | 165 qual = ord(quals[qi]) -33 |
163 qi += 1 | 166 qi += 1 |
164 # count match | 167 # count match |
165 if qual >= min_base_quality: | 168 if qual >= min_base_quality: |
166 qdp += 1 | 169 qdp += 1 |
167 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand | 170 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand |
168 rdp += 1 | 171 adp += 1 |
169 qual = ord(quals[qi]) -33 | 172 qual = ord(quals[qi]) -33 |
170 qi += 1 | 173 qi += 1 |
171 # record snp variant | 174 # record snp variant |
172 if qual >= min_base_quality: | 175 if qual >= min_base_quality: |
173 qdp += 1 | 176 qdp += 1 |
174 snps[base.upper()] += 1 | 177 snps[base.upper()] += 1 |
175 elif base in '+-' : # indel | 178 elif base in '+-' : # indel |
176 rdp += 1 | 179 adp += 1 |
177 qdp += 1 # no related quality score, so count it | |
178 m = re.match(indel_len_pattern,bases[bi:]) | 180 m = re.match(indel_len_pattern,bases[bi:]) |
179 indel_len_str = m.group(0) # get the indel len string | 181 indel_len_str = m.group(0) # get the indel len string |
180 bi += len(indel_len_str) # increment the index by the len of the len string | 182 bi += len(indel_len_str) # increment the index by the len of the len string |
181 indel_len = int(indel_len_str) | 183 indel_len = int(indel_len_str) |
182 indel = bases[bi:bi+indel_len].upper() # get the indel bases | 184 indel = bases[bi:bi+indel_len].upper() # get the indel bases |
199 elif base == '^' : #start of read (followed by read quality char) | 201 elif base == '^' : #start of read (followed by read quality char) |
200 read_mapping_qual = ord(bases[bi]) - 33 | 202 read_mapping_qual = ord(bases[bi]) - 33 |
201 bi += 1 | 203 bi += 1 |
202 elif base == '$' : #end of read | 204 elif base == '$' : #end of read |
203 pass # no associated quality in either bases or quals | 205 pass # no associated quality in either bases or quals |
204 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp | 206 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp |
207 if debug: print >> sys.stderr, "match count: %d" % mc | |
205 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) | 208 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) |
206 if dp < max(1,min_coverage) : | 209 if dp < max(1,min_coverage) : |
207 continue | 210 continue |
208 # output any variants that meet the depth coverage requirement | 211 # output any variants that meet the depth coverage requirement |
209 vcf_pos = pos | 212 vcf_pos = pos |