Mercurial > repos > devteam > vcf_annotate
comparison annotate.py @ 0:b001b50f2009 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 27 Jan 2014 09:28:06 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b001b50f2009 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os.path | |
| 4 import sys | |
| 5 import optparse | |
| 6 | |
| 7 import vcfClass | |
| 8 from vcfClass import * | |
| 9 | |
| 10 import tools | |
| 11 from tools import * | |
| 12 | |
| 13 if __name__ == "__main__": | |
| 14 main() | |
| 15 | |
| 16 # Check that the reference and alternate in the dbsnp vcf file match those | |
| 17 # from the input vcf file. | |
| 18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation): | |
| 19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \ | |
| 20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt | |
| 21 | |
| 22 allelesAgree = True | |
| 23 if vcfRef.lower() != dbsnpRef.lower(): | |
| 24 if vcfRef.lower() != dbsnpAlt.lower(): | |
| 25 #print >> sys.stderr, text | |
| 26 allelesAgree = False | |
| 27 else: | |
| 28 if vcfAlt.lower() != dbsnpAlt.lower(): | |
| 29 #print >> sys.stderr, text | |
| 30 allelesAgree = False | |
| 31 | |
| 32 return allelesAgree | |
| 33 | |
| 34 # Intersect two vcf files. It is assumed that the two files are | |
| 35 # sorted by genomic coordinates and the reference sequences are | |
| 36 # in the same order. | |
| 37 def annotateVcf(v, d, outputFile, annotation): | |
| 38 success1 = v.getRecord() | |
| 39 success2 = d.getRecord() | |
| 40 currentReferenceSequence = v.referenceSequence | |
| 41 | |
| 42 # Finish when the end of the first file has been reached. | |
| 43 while success1: | |
| 44 | |
| 45 # If the end of the dbsnp vcf file is reached, write out the | |
| 46 # remaining records from the vcf file. | |
| 47 if not success2: | |
| 48 outputFile.write(v.record) | |
| 49 success1 = v.getRecord() | |
| 50 | |
| 51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence: | |
| 52 if v.position == d.position: | |
| 53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation) | |
| 54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo() | |
| 55 elif annotation == "hapmap": | |
| 56 if allelesAgree: v.info += ";HM3" | |
| 57 else: v.info += ";HM3A" | |
| 58 record = v.buildRecord(False) | |
| 59 outputFile.write(record) | |
| 60 | |
| 61 success1 = v.getRecord() | |
| 62 success2 = d.getRecord() | |
| 63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | |
| 64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | |
| 65 else: | |
| 66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | |
| 67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | |
| 68 | |
| 69 # If the last record for a reference sequence is the same for both vcf | |
| 70 # files, they will both have referenceSequences different from the | |
| 71 # current reference sequence. Change the reference sequence to reflect | |
| 72 # this and proceed. | |
| 73 else: | |
| 74 if v.referenceSequence != d.referenceSequence: | |
| 75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." | |
| 76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:" | |
| 77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence | |
| 78 exit(1) | |
| 79 currentReferenceSequence = v.referenceSequence | |
| 80 | |
| 81 def main(): | |
| 82 | |
| 83 # Parse the command line options | |
| 84 usage = "Usage: vcfPytools.py annotate [options]" | |
| 85 parser = optparse.OptionParser(usage = usage) | |
| 86 parser.add_option("-i", "--in", | |
| 87 action="store", type="string", | |
| 88 dest="vcfFile", help="input vcf files") | |
| 89 parser.add_option("-d", "--dbsnp", | |
| 90 action="store", type="string", | |
| 91 dest="dbsnpFile", help="input dbsnp vcf file") | |
| 92 parser.add_option("-m", "--hapmap", | |
| 93 action="store", type="string", | |
| 94 dest="hapmapFile", help="input hapmap vcf file") | |
| 95 parser.add_option("-o", "--out", | |
| 96 action="store", type="string", | |
| 97 dest="output", help="output vcf file") | |
| 98 | |
| 99 (options, args) = parser.parse_args() | |
| 100 | |
| 101 # Check that a single vcf file is given. | |
| 102 if options.vcfFile == None: | |
| 103 parser.print_help() | |
| 104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation." | |
| 105 exit(1) | |
| 106 | |
| 107 # Check that either a hapmap or a dbsnp vcf file is included. | |
| 108 if options.dbsnpFile == None and options.hapmapFile == None: | |
| 109 parser.print_help() | |
| 110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)." | |
| 111 exit(1) | |
| 112 elif options.dbsnpFile != None and options.hapmapFile != None: | |
| 113 parser.print_help() | |
| 114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)." | |
| 115 exit(1) | |
| 116 | |
| 117 # Set the output file to stdout if no output file was specified. | |
| 118 outputFile, writeOut = setOutput(options.output) # tools.py | |
| 119 | |
| 120 v = vcf() # Define vcf object. | |
| 121 d = vcf() # Define dbsnp/hapmap vcf object. | |
| 122 if options.dbsnpFile: | |
| 123 d.dbsnpVcf = True | |
| 124 annotationFile = options.dbsnpFile | |
| 125 annotation = "dbsnp" | |
| 126 elif options.hapmapFile: | |
| 127 d.hapmapVcf = True | |
| 128 annotationFile = options.hapmapFile | |
| 129 annotation = "hapmap" | |
| 130 | |
| 131 # Open the vcf files. | |
| 132 v.openVcf(options.vcfFile) | |
| 133 d.openVcf(annotationFile) | |
| 134 | |
| 135 # Read in the header information. | |
| 136 v.parseHeader(options.vcfFile, writeOut) | |
| 137 d.parseHeader(annotationFile, writeOut) | |
| 138 | |
| 139 # Add an extra line to the vcf header to indicate the file used for | |
| 140 # performing dbsnp annotation. | |
| 141 taskDescriptor = "##vcfPytools=annotated vcf file with " | |
| 142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile | |
| 143 elif options.hapmapFile: | |
| 144 taskDescriptor += "hapmap file " + options.hapmapFile | |
| 145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \ | |
| 146 options.hapmapFile + "\">" | |
| 147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \ | |
| 148 ", determined from file " + options.hapmapFile + "\">" | |
| 149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py | |
| 150 | |
| 151 # Annotate the vcf file. | |
| 152 annotateVcf(v, d, outputFile, annotation) | |
| 153 | |
| 154 # Check that the input files had the same list of reference sequences. | |
| 155 # If not, it is possible that there were some problems. | |
| 156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py | |
| 157 | |
| 158 # Close the vcf files. | |
| 159 v.closeVcf(options.vcfFile) | |
| 160 d.closeVcf(annotationFile) | |
| 161 | |
| 162 # End the program. | |
| 163 return 0 |
