Mercurial > repos > jjohnson > pileup_to_vcf
annotate pileup_to_vcf.py @ 9:c0a6e8f595ec default tip
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
| author | Jim Johnson <jj@umn.edu> |
|---|---|
| date | Thu, 11 Apr 2013 10:28:10 -0500 |
| parents | e77ab15bbce9 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2012, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # Jesse Erdmann | |
| 12 # | |
| 13 #------------------------------------------------------------------------------ | |
| 14 """ | |
| 15 | |
| 16 """ | |
| 17 Generate a VCF file from a samtools pileup | |
| 18 filtering on read coverage, base call quality, and allele frequency | |
| 19 | |
| 20 Pileup Format | |
| 21 http://samtools.sourceforge.net/pileup.shtml | |
| 22 Columns: | |
| 23 chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities | |
| 24 | |
| 25 VCF Format | |
| 26 http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 | |
| 27 The header line names the 8 fixed, mandatory columns. These columns are as follows: | |
| 28 CHROM POS ID REF ALT QUAL FILTER INFO | |
| 29 """ | |
| 30 | |
| 31 import sys,re,os.path | |
| 32 import optparse | |
| 33 from optparse import OptionParser | |
| 34 | |
| 35 vcf_header = """\ | |
| 36 ##fileformat=VCFv4.0 | |
|
9
c0a6e8f595ec
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents:
7
diff
changeset
|
37 ##source=pileup_to_vcf.pyV1.2 |
| 0 | 38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\"> |
| 7 | 39 ##INFO=<ID=SAF,Number=.,Type=Float,Description=\"Specific Allele Frequency\"> |
| 0 | 40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\"> |
| 7 | 41 ##FILTER=<ID=SAF,Description=\"Allele frequency of at least %s with base quality minimum %d\"> |
| 0 | 42 #CHROM POS ID REF ALT QUAL FILTER INFO\ |
| 43 """ | |
| 44 | |
| 45 def __main__(): | |
| 46 #Parse Command Line | |
| 47 parser = optparse.OptionParser() | |
| 48 # files | |
| 49 parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' ) | |
| 50 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) | |
| 51 # filters | |
| 1 | 52 parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' ) |
| 2 | 53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' ) |
| 0 | 54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) |
| 55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) | |
| 56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) | |
| 57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) | |
|
9
c0a6e8f595ec
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents:
7
diff
changeset
|
58 # ID to use |
|
c0a6e8f595ec
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents:
7
diff
changeset
|
59 parser.add_option( '-I', '--id', dest='id', default=None, help='The value for the VCF ID field' ) |
| 0 | 60 # select columns |
| 61 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' ) | |
| 62 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' ) | |
| 63 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' ) | |
| 64 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' ) | |
| 65 parser.add_option( '-B', '--base_call_col', type='int', default='5', dest='base_call_col', help='The ordinal position (starting with 1) of the base call column' ) | |
| 66 parser.add_option( '-Q', '--base_qual_col', type='int', default='6', dest='base_qual_col', help='The ordinal position (starting with 1) of the base quality column' ) | |
| 67 # debug | |
| 68 parser.add_option( '-d', '--debug', action="store_true", dest='debug', default=False, help='Print Debug messages' ) | |
| 69 # | |
| 70 (options, args) = parser.parse_args() | |
| 71 | |
| 72 debug = options.debug if options.debug else False | |
| 73 | |
| 74 #Coverage must have at least a total depth of min_coverage to be considered | |
| 75 min_coverage = options.min_coverage | |
| 76 #Count depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality | |
| 77 dp_as = options.report_depth | |
| 78 #Base qualities must be at least the min_base_quality value to be counted | |
| 79 min_base_quality = options.min_base_quality | |
| 80 #Coverage must be more than this pct in support of the variant, e.g. variant support / total coverage > min_allele_freq | |
| 81 min_allele_freq = options.min_allele_freq | |
| 82 #Allow multiple variants to be reported for a position, min_allele_freq must be less than 0.5 for this to have effect. | |
| 83 allow_multiples = options.allow_multiples | |
| 84 # Whether to report indels | |
| 85 report_indels = not options.snps_only | |
| 86 | |
| 87 # Check for 1-based column options, and subtract 1 for python array index | |
| 88 #Which column (count starting with 1) contains the chromosome (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 89 chrom_col = options.chrom_col - 1 | |
| 90 #Which column (count starting with 1) contains the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 91 pos_col = options.pos_col - 1 | |
| 92 #Which column (count starting with 1) contains the reference base (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 93 ref_col = options.ref_col - 1 | |
| 94 #Which column (count starting with 1) contains the total coverage for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 95 coverage_col = options.coverage_col - 1 | |
| 96 #Which column (count starting with 1) contains the base calls for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 97 base_call_col = options.base_call_col - 1 | |
| 98 #Which column (count starting with 1) contains the base call qualities for the position (Default assumes 6-col pileup or 12-col filtered pileup) | |
| 99 base_qual_col = options.base_qual_col - 1 | |
| 100 | |
| 101 # pileup input | |
| 102 if options.input != None: | |
| 103 try: | |
| 104 inputPath = os.path.abspath(options.input) | |
| 105 inputFile = open(inputPath, 'r') | |
| 106 except Exception, e: | |
| 107 print >> sys.stderr, "failed: %s" % e | |
| 108 exit(2) | |
| 109 else: | |
| 110 inputFile = sys.stdin | |
| 111 # vcf output | |
| 112 if options.output != None: | |
| 113 try: | |
| 114 outputPath = os.path.abspath(options.output) | |
| 115 outputFile = open(outputPath, 'w') | |
| 116 except Exception, e: | |
| 117 print >> sys.stderr, "failed: %s" % e | |
| 118 exit(3) | |
| 119 else: | |
| 120 outputFile = sys.stdout | |
| 121 | |
|
9
c0a6e8f595ec
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents:
7
diff
changeset
|
122 vcf_id = options.id if options.id else "." |
|
c0a6e8f595ec
Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
Jim Johnson <jj@umn.edu>
parents:
7
diff
changeset
|
123 |
| 0 | 124 indel_len_pattern = '([1-9][0-9]*)' |
| 125 ref_skip_pattern = '[<>]' | |
| 126 | |
| 127 SAF_header = "SAF"; | |
| 128 | |
| 129 print >> outputFile, vcf_header % (min_coverage,min_allele_freq,min_base_quality) | |
| 130 | |
| 131 try: | |
| 132 for linenum,line in enumerate(inputFile): | |
| 133 if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) | |
| 134 fields = line.split('\t') | |
| 2 | 135 sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column |
| 0 | 136 # Quick check for coverage |
| 2 | 137 if dp_as != 'all' and sdp < min_coverage : |
| 0 | 138 continue |
| 139 bases = fields[base_call_col] | |
| 2 | 140 m = re.findall(ref_skip_pattern,bases) |
| 141 # Quick check for coverage, pileup coverage - reference skips | |
| 142 rdp = sdp - (len(m) if m else 0) | |
| 143 if dp_as == 'ref' or dp_as == 'qual': | |
| 144 if rdp < min_coverage: | |
| 0 | 145 continue |
| 146 quals = fields[base_qual_col] | |
| 147 chrom = fields[chrom_col] | |
| 148 pos = int(fields[pos_col]) | |
| 149 ref = fields[ref_col] | |
| 150 deletions = dict() | |
| 151 max_deletion = 0 | |
| 152 insertions = dict() | |
| 153 snps = {'A':0,'C':0,'G':0,'T':0,'N':0} | |
| 154 bi = 0 # bases index | |
| 155 qi = 0 # quals index | |
| 2 | 156 adp = 0 # read coverage depth |
| 0 | 157 qdp = 0 # read coverage depth with base call quality passing minimum |
| 158 variants = dict() # variant -> count | |
| 2 | 159 mc = 0 # match count |
| 0 | 160 while bi < len(bases): |
| 161 base = bases[bi] | |
| 4 | 162 # if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) |
| 0 | 163 bi += 1 |
| 164 if base in '<>' : #reference skip (between paired reads) | |
| 4 | 165 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33) |
|
3
aa76c8dd97e6
Reference skips have a base quality score
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
166 qi += 1 |
| 0 | 167 elif base in '.,' : # match reference on forward/reverse strand |
| 4 | 168 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33) |
| 2 | 169 mc += 1 |
| 170 adp += 1 | |
| 0 | 171 qual = ord(quals[qi]) -33 |
| 172 qi += 1 | |
| 173 # count match | |
| 174 if qual >= min_base_quality: | |
| 175 qdp += 1 | |
| 176 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand | |
| 4 | 177 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33) |
| 2 | 178 adp += 1 |
| 0 | 179 qual = ord(quals[qi]) -33 |
| 180 qi += 1 | |
| 181 # record snp variant | |
| 182 if qual >= min_base_quality: | |
| 183 qdp += 1 | |
| 184 snps[base.upper()] += 1 | |
| 185 elif base in '+-' : # indel | |
| 2 | 186 adp += 1 |
| 0 | 187 m = re.match(indel_len_pattern,bases[bi:]) |
| 188 indel_len_str = m.group(0) # get the indel len string | |
| 189 bi += len(indel_len_str) # increment the index by the len of the len string | |
| 190 indel_len = int(indel_len_str) | |
| 191 indel = bases[bi:bi+indel_len].upper() # get the indel bases | |
| 4 | 192 if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-2,base,indel) |
| 0 | 193 bi += indel_len # increment the index by the len of the indel |
| 194 if base == '+': | |
| 195 # record insertion variant | |
| 196 if indel in insertions: | |
| 197 insertions[indel] += 1 | |
| 198 else: | |
| 199 insertions[indel] = 1 | |
| 200 elif base == '-': | |
| 201 # record deletion variant | |
| 202 max_deletion = max(indel_len,max_deletion) | |
| 203 if indel in deletions: | |
| 204 deletions[indel] += 1 | |
| 205 else: | |
| 206 deletions[indel] = 1 | |
| 1 | 207 elif base == '*' : # a deletion from a prior base |
| 4 | 208 if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33) |
| 1 | 209 qi += 1 |
| 0 | 210 elif base == '^' : #start of read (followed by read quality char) |
| 4 | 211 if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-1,base,bases[bi]) |
| 0 | 212 read_mapping_qual = ord(bases[bi]) - 33 |
| 213 bi += 1 | |
| 214 elif base == '$' : #end of read | |
| 4 | 215 if debug: print >> sys.stderr, "%3d\t%s" % (bi-1,base) |
| 0 | 216 pass # no associated quality in either bases or quals |
| 2 | 217 dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp |
| 218 if debug: print >> sys.stderr, "match count: %d" % mc | |
| 0 | 219 if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) |
| 1 | 220 if dp < max(1,min_coverage) : |
| 221 continue | |
| 0 | 222 # output any variants that meet the depth coverage requirement |
| 223 vcf_pos = pos | |
| 224 vcf_ref = ref | |
| 225 suffix = '' | |
| 226 alts = [] | |
| 227 safs = [] | |
| 228 # If we have any deletions to report, need to modify the ref string | |
| 229 if debug: print >> sys.stderr, "max deletions: %s" % deletions | |
| 230 if report_indels: | |
| 231 for k,v in deletions.items(): | |
| 232 saf = v/float(dp) | |
| 233 if saf >= min_allele_freq: | |
| 234 if len(k) > len(suffix): | |
| 235 suffix = k | |
| 236 vcf_ref = (ref + suffix).upper() | |
| 237 if debug: print >> sys.stderr, "snps: %s" % snps | |
| 238 for k,v in snps.items(): | |
| 239 if debug: print >> sys.stderr, "snp: %s %d" % (k,v) | |
| 240 saf = v/float(dp) | |
| 241 if saf >= min_allele_freq: | |
| 242 alts.append(k + suffix) | |
| 243 safs.append(saf) | |
| 244 if debug: print >> sys.stderr, "insertions: %s" % insertions | |
| 245 if report_indels: | |
| 246 for k,v in insertions.items(): | |
| 247 saf = v/float(dp) | |
| 248 if saf >= min_allele_freq: | |
| 249 alts.append(ref + k + suffix) | |
| 250 safs.append(saf) | |
| 251 if debug: print >> sys.stderr, "deletions: %s" % deletions | |
| 252 for k,v in deletions.items(): | |
| 253 saf = v/float(dp) | |
| 254 if saf >= min_allele_freq: | |
| 255 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this | |
| 256 safs.append(saf) | |
| 257 if len(alts) > 0: | |
| 258 vcf_qual = "." | |
| 259 vcf_filter = "PASS" | |
| 260 # if not allow_multiples, report only the most freq alt | |
| 261 if not allow_multiples and len(alts) > 1: | |
| 262 max_saf = max(safs) | |
| 263 for i,v in enumerate(safs): | |
| 264 if v == max_saf: | |
| 265 vcf_alt = alts[i] | |
| 266 info = "SAF=%f;DP=%d" % (max_saf,dp) | |
| 267 # if allow_multiples | |
| 268 else: | |
| 269 vcf_alt = ','.join(alts) | |
| 270 isafs = [] | |
| 271 for saf in safs: | |
| 272 isafs.append("SAF=%f" % saf) | |
| 273 info = "%s;DP=%d" % (','.join(isafs),dp) | |
| 274 print >> outputFile,"%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info) | |
| 275 except Exception, e: | |
| 276 print >> sys.stderr, "failed: %s" % e | |
| 277 exit(1) | |
| 278 | |
| 279 if __name__ == "__main__": __main__() | |
| 280 |
