comparison pileup_to_vcf.py @ 9:c0a6e8f595ec default tip

Add option to set VCF ID field value, this can be used to ID germline variants for SnpSift
author Jim Johnson <jj@umn.edu>
date Thu, 11 Apr 2013 10:28:10 -0500
parents e77ab15bbce9
children
comparison
equal deleted inserted replaced
8:07cd87e94fbe 9:c0a6e8f595ec
32 import optparse 32 import optparse
33 from optparse import OptionParser 33 from optparse import OptionParser
34 34
35 vcf_header = """\ 35 vcf_header = """\
36 ##fileformat=VCFv4.0 36 ##fileformat=VCFv4.0
37 ##source=pileup_to_vcf.pyV1.1 37 ##source=pileup_to_vcf.pyV1.2
38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\"> 38 ##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">
39 ##INFO=<ID=SAF,Number=.,Type=Float,Description=\"Specific Allele Frequency\"> 39 ##INFO=<ID=SAF,Number=.,Type=Float,Description=\"Specific Allele Frequency\">
40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\"> 40 ##FILTER=<ID=DP,Description=\"Minimum depth of %s\">
41 ##FILTER=<ID=SAF,Description=\"Allele frequency of at least %s with base quality minimum %d\"> 41 ##FILTER=<ID=SAF,Description=\"Allele frequency of at least %s with base quality minimum %d\">
42 #CHROM POS ID REF ALT QUAL FILTER INFO\ 42 #CHROM POS ID REF ALT QUAL FILTER INFO\
53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' ) 53 parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' )
54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) 54 parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) 55 parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) 56 parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' ) 57 parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' )
58 # ID to use
59 parser.add_option( '-I', '--id', dest='id', default=None, help='The value for the VCF ID field' )
58 # select columns 60 # select columns
59 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' ) 61 parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' )
60 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' ) 62 parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' )
61 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' ) 63 parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base column' )
62 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' ) 64 parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' )
114 except Exception, e: 116 except Exception, e:
115 print >> sys.stderr, "failed: %s" % e 117 print >> sys.stderr, "failed: %s" % e
116 exit(3) 118 exit(3)
117 else: 119 else:
118 outputFile = sys.stdout 120 outputFile = sys.stdout
121
122 vcf_id = options.id if options.id else "."
119 123
120 indel_len_pattern = '([1-9][0-9]*)' 124 indel_len_pattern = '([1-9][0-9]*)'
121 ref_skip_pattern = '[<>]' 125 ref_skip_pattern = '[<>]'
122 126
123 SAF_header = "SAF"; 127 SAF_header = "SAF";
249 saf = v/float(dp) 253 saf = v/float(dp)
250 if saf >= min_allele_freq: 254 if saf >= min_allele_freq:
251 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this 255 alts.append(vcf_ref[:len(vcf_ref) - len(k)]) # TODO alt will be a substring of vcf_ref, test this
252 safs.append(saf) 256 safs.append(saf)
253 if len(alts) > 0: 257 if len(alts) > 0:
254 vcf_id = "."
255 vcf_qual = "." 258 vcf_qual = "."
256 vcf_filter = "PASS" 259 vcf_filter = "PASS"
257 # if not allow_multiples, report only the most freq alt 260 # if not allow_multiples, report only the most freq alt
258 if not allow_multiples and len(alts) > 1: 261 if not allow_multiples and len(alts) > 1:
259 max_saf = max(safs) 262 max_saf = max(safs)