annotate tools/vcf_tools/vcfClass.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import os.path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 import re
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 class vcf:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 def __init__(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 # Header info.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 self.filename = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 self.hasHeader = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 self.headerText = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 self.headerTitles = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 #self.headerInfoText = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 #self.headerFormatText = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 # Store the info and format tags as well as the lines that describe
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 # them in a dictionary.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 self.numberDataSets = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 self.includedDataSets = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 self.infoHeaderTags = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 self.infoHeaderString = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 self.formatHeaderTags = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 self.formatHeaderString = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 # Genotype information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 self.genotypes = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 self.infoField = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 # Reference sequence information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 self.referenceSequences = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 self.referenceSequenceList = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 self.referenceSequence = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 # Record information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 self.position = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 self.samplesList = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 # Determine which fields to process.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 self.processInfo = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 self.processGenotypes = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 self.dbsnpVcf = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 self.hapmapVcf = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 # Open a vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 def openVcf(self, filename):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 if filename == "stdin":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 self.filehandle = sys.stdin
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 self.filename = "stdin"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 try: self.filehandle = open(filename,"r")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 except IOError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 print >> sys.stderr, "Failed to find file: ",filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 self.filename = os.path.abspath(filename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 # Parse the vcf header.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 def parseHeader(self, filename, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 while self.getHeaderLine(filename, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 # Determine the type of information in the header line.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 def getHeaderLine(self, filename, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 self.headerLine = self.filehandle.readline().rstrip("\n")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 elif self.headerLine.startswith("##"): success = self.headerAdditional()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 else: success = self.noHeader(filename, writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 return success
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 # Read information on an info field from the header line.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 def headerInfo(self, writeOut, lineType):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 tag = self.headerLine.split("=",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 tagID = (tag[1].split("ID=",1))[1].split(",",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 # Check if this info field has already been defined.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 # Determine the number of entries, entry type and description.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 except IndexError: tagDescription = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 # Check that the number of fields associated with the tag is either
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 # an integer or a '.' to indicate variable number of entries.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 if tagNumber == ".": tagNumber = "variable"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 try: tagNumber = int(tagNumber)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if lineType == "info":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 self.infoHeaderString[tagID] = self.headerLine
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 if lineType == "format":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 self.formatHeaderString[tagID] = self.headerLine
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 # Check to see if the records contain information from multiple different
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 # sources. If vcfPytools has been used to find the intersection or union
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 # of two vcf files, the records may have been merged to keep all the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 # information available. If this is the case, there will be a ##FILE line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 # for each set of information in the file. The order of these files needs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 # to be maintained.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 def headerFiles(self, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 filename = fileID[1].split("\"",2)[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 try: fileID = int(fileID[0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 print >> sys.stderr, "File ID in ##FILE entry must be an integer."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 print >> sys.stderr, self.headerLine
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 if self.includedDataSets.has_key(fileID):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 print >> sys.stderr, "\nERROR: file " + self.filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 self.includedDataSets[fileID] = filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 # Set the number of files with information in this vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if fileID > self.numberDataSets: self.numberDataSets = fileID
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 # Read additional information contained in the header.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 def headerAdditional(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 self.headerText += self.headerLine + "\n"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 # Read in the column titles to check that all standard fields
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 # are present and read in all the samples.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 def headerTitleString(self, filename, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 self.headerTitles = self.headerLine + "\n"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 # Strip the end of line character from the last infoFields entry.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 infoFields = self.headerLine.split("\t")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 if len(infoFields) > 8:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 self.samplesList = infoFields[9:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 self.genotypes = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 elif len(infoFields) == 8:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 print self.headerLine, len(infoFields)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 print >> sys.stderr, "Not all vcf standard fields are available."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 return False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 # If there is no header in the vcf file, close and reopen the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 # file so that the first line is avaiable for parsing as a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 # vcf record.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 def noHeader(self, filename, writeOut):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 if writeOut: print >> sys.stdout, "No header lines present in", filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 self.hasHeader = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 self.closeVcf(filename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 self.openVcf(filename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 return False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 # Check that info fields exist.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 def checkInfoFields(self, tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 if self.infoHeaderTags.has_key(tag) == False:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 # Get the next line of information from the vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 def getRecord(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 self.record = self.filehandle.readline()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 if not self.record: return False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 # Set up and execute a regular expression match.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 recordMatch = recordRe.match(self.record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 if recordMatch == None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 print >> sys.stderr, "Unable to resolve vcf record.\n"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 print >> sys.stderr, self.record
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194 self.referenceSequence = recordMatch.group(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 try: self.position = int(recordMatch.group(2))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 text = "variant position is not an integer"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198 self.generalError(text, "", None)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199 self.rsid = recordMatch.group(3)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200 self.ref = recordMatch.group(4)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201 self.alt = recordMatch.group(5)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202 self.quality = recordMatch.group(6)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203 self.filters = recordMatch.group(7)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 self.info = recordMatch.group(8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205 self.genotypeString = recordMatch.group(9)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 self.infoTags = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208 # Check that the quality is an integer or a float. If not, set the quality
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 # to zero.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 try: self.quality = float(self.quality)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 except ValueError: self.quality = float(0.)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 # If recordMatch.group(9) is not the end of line character, there is
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214 # genotype information with this record.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 if self.genotypeString != "\n": self.hasGenotypes = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216 else: self.hasGenotypes = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 # Add the reference sequence to the dictionary. If it didn't previously
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219 # exist append the reference sequence to the end of the list as well.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 # This ensures that the order in which the reference sequences appeared
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 # in the header can be preserved.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222 if self.referenceSequence not in self.referenceSequences:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 self.referenceSequences[self.referenceSequence] = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224 self.referenceSequenceList.append(self.referenceSequence)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226 # Check for multiple alternate alleles.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 self.alternateAlleles = self.alt.split(",")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228 self.numberAlternateAlleles = len(self.alternateAlleles)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230 # If required, process the info and genotypes.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
231 if self.processInfo: self.processInfoFields()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
232 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
233
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
234 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
235
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
236 # Process the info string.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
237 def processInfoFields(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
238
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
239 # First break the info string into its constituent elements.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
240 infoEntries = self.info.split(";")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
241
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
242 # As long as some info fields exist, place them into a dictionary.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
243 for entry in infoEntries:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
244 infoEntry = entry.split("=")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
245
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
246 # If the entry is a flag, there will be no equals and the length of
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
247 # infoEntry will be 1. In this case, set the dictionary entry to the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
248 # whole entry. If the vcf file has undergone a union or intersection
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
249 # operation and contains the information from multiple files, this may
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
250 # be a '/' seperate list of flags and so cannot be set to a Boolean value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
251 # yet.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
252 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
253 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
254
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
255 # Process the genotype formats and values.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
256 def processGenotypeFields(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
257 genotypeEntries = self.genotypeString.split("\t")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
258 self.genotypeFormatString = genotypeEntries[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
259 self.genotypes = list(genotypeEntries[2:])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
260 self.genotypeFormats = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
261 self.genotypeFields = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
262 self.genotypeFormats = self.genotypeFormatString.split(":")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
263
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
264 # Check that the number of genotype fields is equal to the number of samples
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
265 if len(self.samplesList) != len(self.genotypes):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
266 text = "The number of genotypes is different to the number of samples"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
267 self.generalError(text, "", "")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
268
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
269 # Add the genotype information to a dictionary.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
270 for i in range( len(self.samplesList) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
271 genotypeInfo = self.genotypes[i].split(":")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
272 self.genotypeFields[ self.samplesList[i] ] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
273
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
274 # Check that there are as many fields as in the format field. If not, this must
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
275 # be because the information is not known. In this case, it is permitted that
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
276 # the genotype information is either . or ./.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
277 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
278 self.genotypeFields[ self.samplesList[i] ] = "."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
279 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
280 if len(self.genotypeFormats) != len(genotypeInfo):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
281 text = "The number of genotype fields is different to the number specified in the format string"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
282 self.generalError(text, "sample", self.samplesList[i])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
283
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
284 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
285
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
286 # Parse through the vcf file until the correct reference sequence is
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
287 # encountered and the position is greater than or equal to that requested.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
288 def parseVcf(self, referenceSequence, position, writeOut, outputFile):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
289 success = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
290 if self.referenceSequence != referenceSequence:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
291 while self.referenceSequence != referenceSequence and success:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
292 if writeOut: outputFile.write(self.record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
293 success = self.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
294
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
295 while self.referenceSequence == referenceSequence and self.position < position and success:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
296 if writeOut: outputFile.write(self.record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
297 success = self.getRecord()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
298
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
299 return success
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
300
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
301 # Get the information for a specific info tag. Also check that it contains
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
302 # the correct number and type of entries.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
303 def getInfo(self, tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
304 result = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
305
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
306 # Check if the tag exists in the header information. If so,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
307 # determine the number and type of entries asscoiated with this
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
308 # tag.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
309 if self.infoHeaderTags.has_key(tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
310 infoNumber = self.infoHeaderTags[tag][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
311 infoType = self.infoHeaderTags[tag][1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
312 numberValues = infoNumber
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
313
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
314 # First check that the tag exists in the information string. Then split
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
315 # the entry on commas. For flag entries, do not perform the split.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
316 if self.infoTags.has_key(tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
317 if numberValues == 0 and infoType == "Flag": result = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
318 elif numberValues != 0 and infoType == "Flag":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
319 print >> sys.stderr, "ERROR"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
320 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
321 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
322 fields = self.infoTags[tag].split(",")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
323 if len(fields) != numberValues:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
324 text = "Unexpected number of entries"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
325 self.generalError(text, "information tag", tag)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
326
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
327 for i in range(infoNumber):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
328 try: result.append(fields[i])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
329 except IndexError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
330 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
331 self.generalError(text, "tag:", tag)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
332 else: numberValues = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
333
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
334 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
335 text = "information field does not have a definition in the header"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
336 self.generalError(text, "tag", tag)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
337
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
338 return numberValues, infoType, result
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
339
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
340 # Get the genotype information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
341 def getGenotypeInfo(self, sample, tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
342 result = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
343 if self.formatHeaderTags.has_key(tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
344 infoNumber = self.formatHeaderTags[tag][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
345 infoType = self.formatHeaderTags[tag][1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
346 numberValues = infoNumber
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
347
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
348 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
349 numberValues = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
350 result = "."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
351 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
352 if self.genotypeFields[sample].has_key(tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
353 if tag == "GT":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
354 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
355 text = "Unexected number of characters in genotype (GT) field"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
356 self.generalError(text, "sample", sample)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
357
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
358 # If a diploid call, check whether or not the genotype is phased.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
359 elif len(self.genotypeFields[sample][tag]) == 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
360 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
361 result.append( self.genotypeFields[sample][tag][0] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
362 result.append( self.genotypeFields[sample][tag][2] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
363 elif len(self.genotypeFields[sample][tag]) == 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
364 result.append( self.genotypeFields[sample][tag][0] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
365 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
366 fields = self.genotypeFields[sample][tag].split(",")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
367 if len(fields) != numberValues:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
368 text = "Unexpected number of characters in " + tag + " field"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
369 self.generalError(text, "sample", sample)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
370
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
371 for i in range(infoNumber): result.append(fields[i])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
372 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
373 text = "genotype field does not have a definition in the header"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
374 self.generalError(text, "tag", tag)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
375
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
376 return numberValues, result
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
377
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
378 # Parse the dbsnp entry. If the entry conforms to the required variant type,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
379 # return the dbsnp rsid value, otherwise ".".
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
380 def getDbsnpInfo(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
381
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
382 # First check that the variant class (VC) is listed as SNP.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
383 vc = self.info.split("VC=",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
384 if vc[1].find(";") != -1: snp = vc[1].split(";",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
385 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
386 snp = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
387 snp.append(vc[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
388
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
389 if snp[0].lower() == "snp": rsid = self.rsid
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
390 else: rsid = "."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
391
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
392 return rsid
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
393
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
394 # Build a new vcf record.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
395 def buildRecord(self, removeGenotypes):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
396 record = self.referenceSequence + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
397 str(self.position) + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
398 self.rsid + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
399 self.ref + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
400 self.alt + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
401 str(self.quality) + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
402 self.filters + "\t" + \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
403 self.info
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
404
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
405 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
406
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
407 record += "\n"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
408
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
409 return record
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
410
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
411 # Close the vcf file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
412 def closeVcf(self, filename):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
413 self.filehandle.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
414
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
415 # Define error messages for different handled errors.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
416 def generalError(self, text, field, fieldValue):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
417 print >> sys.stderr, "\nError encountered when attempting to read:"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
418 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
419 print >> sys.stderr, "\tposition :\t\t", self.position
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
420 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
421 print >> sys.stderr, "\n", text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
422 exit(1)