annotate tools/vcf_tools/annotate.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import os.path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 import optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 import vcfClass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 from vcfClass import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 import tools
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 from tools import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 # Check that the reference and alternate in the dbsnp vcf file match those
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 # from the input vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 allelesAgree = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 if vcfRef.lower() != dbsnpRef.lower():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 if vcfRef.lower() != dbsnpAlt.lower():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 #print >> sys.stderr, text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 allelesAgree = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 if vcfAlt.lower() != dbsnpAlt.lower():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 #print >> sys.stderr, text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 allelesAgree = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 return allelesAgree
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 # Intersect two vcf files. It is assumed that the two files are
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 # sorted by genomic coordinates and the reference sequences are
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 # in the same order.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 def annotateVcf(v, d, outputFile, annotation):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 success1 = v.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 success2 = d.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 currentReferenceSequence = v.referenceSequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 # Finish when the end of the first file has been reached.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 while success1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 # If the end of the dbsnp vcf file is reached, write out the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 # remaining records from the vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 if not success2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 outputFile.write(v.record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 success1 = v.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 if v.position == d.position:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 elif annotation == "hapmap":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 if allelesAgree: v.info += ";HM3"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 else: v.info += ";HM3A"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 record = v.buildRecord(False)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 outputFile.write(record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 success1 = v.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 success2 = d.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 # If the last record for a reference sequence is the same for both vcf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 # files, they will both have referenceSequences different from the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 # current reference sequence. Change the reference sequence to reflect
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 # this and proceed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 if v.referenceSequence != d.referenceSequence:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 currentReferenceSequence = v.referenceSequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 # Parse the command line options
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 usage = "Usage: vcfPytools.py annotate [options]"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 parser = optparse.OptionParser(usage = usage)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 parser.add_option("-i", "--in",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 dest="vcfFile", help="input vcf files")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 parser.add_option("-d", "--dbsnp",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 dest="dbsnpFile", help="input dbsnp vcf file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 parser.add_option("-m", "--hapmap",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 dest="hapmapFile", help="input hapmap vcf file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 parser.add_option("-o", "--out",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 dest="output", help="output vcf file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 (options, args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 # Check that a single vcf file is given.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if options.vcfFile == None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 # Check that either a hapmap or a dbsnp vcf file is included.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 if options.dbsnpFile == None and options.hapmapFile == None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 elif options.dbsnpFile != None and options.hapmapFile != None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 # Set the output file to stdout if no output file was specified.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 outputFile, writeOut = setOutput(options.output) # tools.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 v = vcf() # Define vcf object.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 d = vcf() # Define dbsnp/hapmap vcf object.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 if options.dbsnpFile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 d.dbsnpVcf = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 annotationFile = options.dbsnpFile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 annotation = "dbsnp"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 elif options.hapmapFile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 d.hapmapVcf = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 annotationFile = options.hapmapFile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 annotation = "hapmap"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 # Open the vcf files.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 v.openVcf(options.vcfFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 d.openVcf(annotationFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 # Read in the header information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 v.parseHeader(options.vcfFile, writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 d.parseHeader(annotationFile, writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 # Add an extra line to the vcf header to indicate the file used for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 # performing dbsnp annotation.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 taskDescriptor = "##vcfPytools=annotated vcf file with "
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 elif options.hapmapFile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 taskDescriptor += "hapmap file " + options.hapmapFile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 options.hapmapFile + "\">"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 ", determined from file " + options.hapmapFile + "\">"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 # Annotate the vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 annotateVcf(v, d, outputFile, annotation)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 # Check that the input files had the same list of reference sequences.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 # If not, it is possible that there were some problems.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 # Close the vcf files.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 v.closeVcf(options.vcfFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 d.closeVcf(annotationFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 # End the program.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 return 0