| 0 | 1 #!/usr/bin/python | 
|  | 2 | 
|  | 3 import os.path | 
|  | 4 import sys | 
|  | 5 import optparse | 
|  | 6 | 
|  | 7 import vcfClass | 
|  | 8 from vcfClass import * | 
|  | 9 | 
|  | 10 import tools | 
|  | 11 from tools import * | 
|  | 12 | 
|  | 13 if __name__ == "__main__": | 
|  | 14   main() | 
|  | 15 | 
|  | 16 # Check that the reference and alternate in the dbsnp vcf file match those | 
|  | 17 # from the input vcf file. | 
|  | 18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation): | 
|  | 19   text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \ | 
|  | 20          vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt | 
|  | 21 | 
|  | 22   allelesAgree = True | 
|  | 23   if vcfRef.lower() != dbsnpRef.lower(): | 
|  | 24     if vcfRef.lower() != dbsnpAlt.lower(): | 
|  | 25       #print >> sys.stderr, text | 
|  | 26       allelesAgree = False | 
|  | 27   else: | 
|  | 28     if vcfAlt.lower() != dbsnpAlt.lower(): | 
|  | 29       #print >> sys.stderr, text | 
|  | 30       allelesAgree = False | 
|  | 31 | 
|  | 32   return allelesAgree | 
|  | 33 | 
|  | 34 # Intersect two vcf files.  It is assumed that the two files are | 
|  | 35 # sorted by genomic coordinates and the reference sequences are | 
|  | 36 # in the same order. | 
|  | 37 def annotateVcf(v, d, outputFile, annotation): | 
|  | 38   success1 = v.getRecord() | 
|  | 39   success2 = d.getRecord() | 
|  | 40   currentReferenceSequence = v.referenceSequence | 
|  | 41 | 
|  | 42 # Finish when the end of the first file has been reached. | 
|  | 43   while success1: | 
|  | 44 | 
|  | 45 # If the end of the dbsnp vcf file is reached, write out the | 
|  | 46 # remaining records from the vcf file. | 
|  | 47     if not success2: | 
|  | 48       outputFile.write(v.record) | 
|  | 49       success1 = v.getRecord() | 
|  | 50 | 
|  | 51     if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence: | 
|  | 52       if v.position == d.position: | 
|  | 53         allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation) | 
|  | 54         if annotation == "dbsnp": v.rsid = d.getDbsnpInfo() | 
|  | 55         elif annotation == "hapmap": | 
|  | 56           if allelesAgree: v.info += ";HM3" | 
|  | 57           else: v.info += ";HM3A" | 
|  | 58         record = v.buildRecord(False) | 
|  | 59         outputFile.write(record) | 
|  | 60 | 
|  | 61         success1 = v.getRecord() | 
|  | 62         success2 = d.getRecord() | 
|  | 63       elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | 
|  | 64       elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | 
|  | 65     else: | 
|  | 66       if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | 
|  | 67       elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | 
|  | 68 | 
|  | 69 # If the last record for a reference sequence is the same for both vcf | 
|  | 70 # files, they will both have referenceSequences different from the | 
|  | 71 # current reference sequence.  Change the reference sequence to reflect | 
|  | 72 # this and proceed. | 
|  | 73       else: | 
|  | 74         if v.referenceSequence != d.referenceSequence: | 
|  | 75           print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." | 
|  | 76           print >> sys.stderr, "Check that both files contain records for the following reference sequences:" | 
|  | 77           print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence | 
|  | 78           exit(1) | 
|  | 79       currentReferenceSequence = v.referenceSequence | 
|  | 80 | 
|  | 81 def main(): | 
|  | 82 | 
|  | 83 # Parse the command line options | 
|  | 84   usage = "Usage: vcfPytools.py annotate [options]" | 
|  | 85   parser = optparse.OptionParser(usage = usage) | 
|  | 86   parser.add_option("-i", "--in", | 
|  | 87                     action="store", type="string", | 
|  | 88                     dest="vcfFile", help="input vcf files") | 
|  | 89   parser.add_option("-d", "--dbsnp", | 
|  | 90                     action="store", type="string", | 
|  | 91                     dest="dbsnpFile", help="input dbsnp vcf file") | 
|  | 92   parser.add_option("-m", "--hapmap", | 
|  | 93                     action="store", type="string", | 
|  | 94                     dest="hapmapFile", help="input hapmap vcf file") | 
|  | 95   parser.add_option("-o", "--out", | 
|  | 96                     action="store", type="string", | 
|  | 97                     dest="output", help="output vcf file") | 
|  | 98 | 
|  | 99   (options, args) = parser.parse_args() | 
|  | 100 | 
|  | 101 # Check that a single  vcf file is given. | 
|  | 102   if options.vcfFile == None: | 
|  | 103     parser.print_help() | 
|  | 104     print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation." | 
|  | 105     exit(1) | 
|  | 106 | 
|  | 107 # Check that either a hapmap or a dbsnp vcf file is included. | 
|  | 108   if options.dbsnpFile == None and options.hapmapFile == None: | 
|  | 109     parser.print_help() | 
|  | 110     print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)." | 
|  | 111     exit(1) | 
|  | 112   elif options.dbsnpFile != None and options.hapmapFile != None: | 
|  | 113     parser.print_help() | 
|  | 114     print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)." | 
|  | 115     exit(1) | 
|  | 116 | 
|  | 117 # Set the output file to stdout if no output file was specified. | 
|  | 118   outputFile, writeOut = setOutput(options.output) # tools.py | 
|  | 119 | 
|  | 120   v = vcf() # Define vcf object. | 
|  | 121   d = vcf() # Define dbsnp/hapmap vcf object. | 
|  | 122   if options.dbsnpFile: | 
|  | 123     d.dbsnpVcf = True | 
|  | 124     annotationFile = options.dbsnpFile | 
|  | 125     annotation = "dbsnp" | 
|  | 126   elif options.hapmapFile: | 
|  | 127     d.hapmapVcf = True | 
|  | 128     annotationFile = options.hapmapFile | 
|  | 129     annotation = "hapmap" | 
|  | 130 | 
|  | 131 # Open the vcf files. | 
|  | 132   v.openVcf(options.vcfFile) | 
|  | 133   d.openVcf(annotationFile) | 
|  | 134 | 
|  | 135 # Read in the header information. | 
|  | 136   v.parseHeader(options.vcfFile, writeOut) | 
|  | 137   d.parseHeader(annotationFile, writeOut) | 
|  | 138 | 
|  | 139 # Add an extra line to the vcf header to indicate the file used for | 
|  | 140 # performing dbsnp annotation. | 
|  | 141   taskDescriptor = "##vcfPytools=annotated vcf file with " | 
|  | 142   if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile | 
|  | 143   elif options.hapmapFile: | 
|  | 144     taskDescriptor += "hapmap file " + options.hapmapFile | 
|  | 145     v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \ | 
|  | 146                                 options.hapmapFile + "\">" | 
|  | 147     v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \ | 
|  | 148                                  ", determined from file " + options.hapmapFile + "\">" | 
|  | 149   writeHeader(outputFile, v, False, taskDescriptor) # tools.py | 
|  | 150 | 
|  | 151 # Annotate the vcf file. | 
|  | 152   annotateVcf(v, d, outputFile, annotation) | 
|  | 153 | 
|  | 154 # Check that the input files had the same list of reference sequences. | 
|  | 155 # If not, it is possible that there were some problems. | 
|  | 156   checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py | 
|  | 157 | 
|  | 158 # Close the vcf files. | 
|  | 159   v.closeVcf(options.vcfFile) | 
|  | 160   d.closeVcf(annotationFile) | 
|  | 161 | 
|  | 162 # End the program. | 
|  | 163   return 0 |