0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os.path
|
|
4 import sys
|
|
5 import re
|
|
6
|
|
7 class vcf:
|
|
8 def __init__(self):
|
|
9
|
|
10 # Header info.
|
|
11 self.filename = ""
|
|
12 self.hasHeader = True
|
|
13 self.headerText = ""
|
|
14 self.headerTitles = ""
|
|
15 #self.headerInfoText = ""
|
|
16 #self.headerFormatText = ""
|
|
17
|
|
18 # Store the info and format tags as well as the lines that describe
|
|
19 # them in a dictionary.
|
|
20 self.numberDataSets = 0
|
|
21 self.includedDataSets = {}
|
|
22 self.infoHeaderTags = {}
|
|
23 self.infoHeaderString = {}
|
|
24 self.formatHeaderTags = {}
|
|
25 self.formatHeaderString = {}
|
|
26
|
|
27 # Genotype information.
|
|
28 self.genotypes = False
|
|
29 self.infoField = {}
|
|
30
|
|
31 # Reference sequence information.
|
|
32 self.referenceSequences = {}
|
|
33 self.referenceSequenceList = []
|
|
34 self.referenceSequence = ""
|
|
35
|
|
36 # Record information.
|
|
37 self.position = -1
|
|
38 self.samplesList = []
|
|
39
|
|
40 # Determine which fields to process.
|
|
41 self.processInfo = False
|
|
42 self.processGenotypes = False
|
|
43 self.dbsnpVcf = False
|
|
44 self.hapmapVcf = False
|
|
45
|
|
46 # Open a vcf file.
|
|
47 def openVcf(self, filename):
|
|
48 if filename == "stdin":
|
|
49 self.filehandle = sys.stdin
|
|
50 self.filename = "stdin"
|
|
51 else:
|
|
52 try: self.filehandle = open(filename,"r")
|
|
53 except IOError:
|
|
54 print >> sys.stderr, "Failed to find file: ",filename
|
|
55 exit(1)
|
|
56 self.filename = os.path.abspath(filename)
|
|
57
|
|
58 # Parse the vcf header.
|
|
59 def parseHeader(self, filename, writeOut):
|
|
60 while self.getHeaderLine(filename, writeOut):
|
|
61 continue
|
|
62
|
|
63 # Determine the type of information in the header line.
|
|
64 def getHeaderLine(self, filename, writeOut):
|
|
65 self.headerLine = self.filehandle.readline().rstrip("\n")
|
|
66 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
|
|
67 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
|
|
68 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
|
|
69 elif self.headerLine.startswith("##"): success = self.headerAdditional()
|
|
70 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
|
|
71 else: success = self.noHeader(filename, writeOut)
|
|
72
|
|
73 return success
|
|
74
|
|
75 # Read information on an info field from the header line.
|
|
76 def headerInfo(self, writeOut, lineType):
|
|
77 tag = self.headerLine.split("=",1)
|
|
78 tagID = (tag[1].split("ID=",1))[1].split(",",1)
|
|
79
|
|
80 # Check if this info field has already been defined.
|
|
81 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
|
|
82 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
|
|
83 exit(1)
|
|
84
|
|
85 # Determine the number of entries, entry type and description.
|
|
86 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
|
|
87 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
|
|
88 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
|
|
89 except IndexError: tagDescription = ""
|
|
90 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
|
|
91
|
|
92 # Check that the number of fields associated with the tag is either
|
|
93 # an integer or a '.' to indicate variable number of entries.
|
|
94 if tagNumber == ".": tagNumber = "variable"
|
|
95 else:
|
|
96 try: tagNumber = int(tagNumber)
|
|
97 except ValueError:
|
|
98 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID
|
|
99 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
|
|
100 exit(1)
|
|
101
|
|
102 if lineType == "info":
|
|
103 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
|
|
104 self.infoHeaderString[tagID] = self.headerLine
|
|
105 if lineType == "format":
|
|
106 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
|
|
107 self.formatHeaderString[tagID] = self.headerLine
|
|
108
|
|
109 return True
|
|
110
|
|
111 # Check to see if the records contain information from multiple different
|
|
112 # sources. If vcfPytools has been used to find the intersection or union
|
|
113 # of two vcf files, the records may have been merged to keep all the
|
|
114 # information available. If this is the case, there will be a ##FILE line
|
|
115 # for each set of information in the file. The order of these files needs
|
|
116 # to be maintained.
|
|
117 def headerFiles(self, writeOut):
|
|
118 fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
|
|
119 filename = fileID[1].split("\"",2)[1]
|
|
120 try: fileID = int(fileID[0])
|
|
121 except ValueError:
|
|
122 print >> sys.stderr, "File ID in ##FILE entry must be an integer."
|
|
123 print >> sys.stderr, self.headerLine
|
|
124 exit(1)
|
|
125 if self.includedDataSets.has_key(fileID):
|
|
126 print >> sys.stderr, "\nERROR: file " + self.filename
|
|
127 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
|
|
128 exit(1)
|
|
129 self.includedDataSets[fileID] = filename
|
|
130
|
|
131 # Set the number of files with information in this vcf file.
|
|
132 if fileID > self.numberDataSets: self.numberDataSets = fileID
|
|
133
|
|
134 return True
|
|
135
|
|
136 # Read additional information contained in the header.
|
|
137 def headerAdditional(self):
|
|
138 self.headerText += self.headerLine + "\n"
|
|
139
|
|
140 return True
|
|
141
|
|
142 # Read in the column titles to check that all standard fields
|
|
143 # are present and read in all the samples.
|
|
144 def headerTitleString(self, filename, writeOut):
|
|
145 self.headerTitles = self.headerLine + "\n"
|
|
146
|
|
147 # Strip the end of line character from the last infoFields entry.
|
|
148 infoFields = self.headerLine.split("\t")
|
|
149 if len(infoFields) > 8:
|
|
150 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
|
|
151 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
|
|
152 self.samplesList = infoFields[9:]
|
|
153 self.genotypes = True
|
|
154 elif len(infoFields) == 8:
|
|
155 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available."
|
|
156 else:
|
|
157 print self.headerLine, len(infoFields)
|
|
158 print >> sys.stderr, "Not all vcf standard fields are available."
|
|
159 exit(1)
|
|
160
|
|
161 return False
|
|
162
|
|
163 # If there is no header in the vcf file, close and reopen the
|
|
164 # file so that the first line is avaiable for parsing as a
|
|
165 # vcf record.
|
|
166 def noHeader(self, filename, writeOut):
|
|
167 if writeOut: print >> sys.stdout, "No header lines present in", filename
|
|
168 self.hasHeader = False
|
|
169 self.closeVcf(filename)
|
|
170 self.openVcf(filename)
|
|
171
|
|
172 return False
|
|
173
|
|
174 # Check that info fields exist.
|
|
175 def checkInfoFields(self, tag):
|
|
176 if self.infoHeaderTags.has_key(tag) == False:
|
|
177 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
|
|
178 exit(1)
|
|
179
|
|
180 # Get the next line of information from the vcf file.
|
|
181 def getRecord(self):
|
|
182 self.record = self.filehandle.readline()
|
|
183 if not self.record: return False
|
|
184
|
|
185 # Set up and execute a regular expression match.
|
|
186 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
|
|
187 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
|
|
188 recordMatch = recordRe.match(self.record)
|
|
189 if recordMatch == None:
|
|
190 print >> sys.stderr, "Unable to resolve vcf record.\n"
|
|
191 print >> sys.stderr, self.record
|
|
192 exit(1)
|
|
193
|
|
194 self.referenceSequence = recordMatch.group(1)
|
|
195 try: self.position = int(recordMatch.group(2))
|
|
196 except ValueError:
|
|
197 text = "variant position is not an integer"
|
|
198 self.generalError(text, "", None)
|
|
199 self.rsid = recordMatch.group(3)
|
|
200 self.ref = recordMatch.group(4)
|
|
201 self.alt = recordMatch.group(5)
|
|
202 self.quality = recordMatch.group(6)
|
|
203 self.filters = recordMatch.group(7)
|
|
204 self.info = recordMatch.group(8)
|
|
205 self.genotypeString = recordMatch.group(9)
|
|
206 self.infoTags = {}
|
|
207
|
|
208 # Check that the quality is an integer or a float. If not, set the quality
|
|
209 # to zero.
|
|
210 try: self.quality = float(self.quality)
|
|
211 except ValueError: self.quality = float(0.)
|
|
212
|
|
213 # If recordMatch.group(9) is not the end of line character, there is
|
|
214 # genotype information with this record.
|
|
215 if self.genotypeString != "\n": self.hasGenotypes = True
|
|
216 else: self.hasGenotypes = False
|
|
217
|
|
218 # Add the reference sequence to the dictionary. If it didn't previously
|
|
219 # exist append the reference sequence to the end of the list as well.
|
|
220 # This ensures that the order in which the reference sequences appeared
|
|
221 # in the header can be preserved.
|
|
222 if self.referenceSequence not in self.referenceSequences:
|
|
223 self.referenceSequences[self.referenceSequence] = True
|
|
224 self.referenceSequenceList.append(self.referenceSequence)
|
|
225
|
|
226 # Check for multiple alternate alleles.
|
|
227 self.alternateAlleles = self.alt.split(",")
|
|
228 self.numberAlternateAlleles = len(self.alternateAlleles)
|
|
229
|
|
230 # If required, process the info and genotypes.
|
|
231 if self.processInfo: self.processInfoFields()
|
|
232 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
|
|
233
|
|
234 return True
|
|
235
|
|
236 # Process the info string.
|
|
237 def processInfoFields(self):
|
|
238
|
|
239 # First break the info string into its constituent elements.
|
|
240 infoEntries = self.info.split(";")
|
|
241
|
|
242 # As long as some info fields exist, place them into a dictionary.
|
|
243 for entry in infoEntries:
|
|
244 infoEntry = entry.split("=")
|
|
245
|
|
246 # If the entry is a flag, there will be no equals and the length of
|
|
247 # infoEntry will be 1. In this case, set the dictionary entry to the
|
|
248 # whole entry. If the vcf file has undergone a union or intersection
|
|
249 # operation and contains the information from multiple files, this may
|
|
250 # be a '/' seperate list of flags and so cannot be set to a Boolean value
|
|
251 # yet.
|
|
252 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
|
|
253 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
|
|
254
|
|
255 # Process the genotype formats and values.
|
|
256 def processGenotypeFields(self):
|
|
257 genotypeEntries = self.genotypeString.split("\t")
|
|
258 self.genotypeFormatString = genotypeEntries[1]
|
|
259 self.genotypes = list(genotypeEntries[2:])
|
|
260 self.genotypeFormats = {}
|
|
261 self.genotypeFields = {}
|
|
262 self.genotypeFormats = self.genotypeFormatString.split(":")
|
|
263
|
|
264 # Check that the number of genotype fields is equal to the number of samples
|
|
265 if len(self.samplesList) != len(self.genotypes):
|
|
266 text = "The number of genotypes is different to the number of samples"
|
|
267 self.generalError(text, "", "")
|
|
268
|
|
269 # Add the genotype information to a dictionary.
|
|
270 for i in range( len(self.samplesList) ):
|
|
271 genotypeInfo = self.genotypes[i].split(":")
|
|
272 self.genotypeFields[ self.samplesList[i] ] = {}
|
|
273
|
|
274 # Check that there are as many fields as in the format field. If not, this must
|
|
275 # be because the information is not known. In this case, it is permitted that
|
|
276 # the genotype information is either . or ./.
|
|
277 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
|
|
278 self.genotypeFields[ self.samplesList[i] ] = "."
|
|
279 else:
|
|
280 if len(self.genotypeFormats) != len(genotypeInfo):
|
|
281 text = "The number of genotype fields is different to the number specified in the format string"
|
|
282 self.generalError(text, "sample", self.samplesList[i])
|
|
283
|
|
284 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
|
|
285
|
|
286 # Parse through the vcf file until the correct reference sequence is
|
|
287 # encountered and the position is greater than or equal to that requested.
|
|
288 def parseVcf(self, referenceSequence, position, writeOut, outputFile):
|
|
289 success = True
|
|
290 if self.referenceSequence != referenceSequence:
|
|
291 while self.referenceSequence != referenceSequence and success:
|
|
292 if writeOut: outputFile.write(self.record)
|
|
293 success = self.getRecord()
|
|
294
|
|
295 while self.referenceSequence == referenceSequence and self.position < position and success:
|
|
296 if writeOut: outputFile.write(self.record)
|
|
297 success = self.getRecord()
|
|
298
|
|
299 return success
|
|
300
|
|
301 # Get the information for a specific info tag. Also check that it contains
|
|
302 # the correct number and type of entries.
|
|
303 def getInfo(self, tag):
|
|
304 result = []
|
|
305
|
|
306 # Check if the tag exists in the header information. If so,
|
|
307 # determine the number and type of entries asscoiated with this
|
|
308 # tag.
|
|
309 if self.infoHeaderTags.has_key(tag):
|
|
310 infoNumber = self.infoHeaderTags[tag][0]
|
|
311 infoType = self.infoHeaderTags[tag][1]
|
|
312 numberValues = infoNumber
|
|
313
|
|
314 # First check that the tag exists in the information string. Then split
|
|
315 # the entry on commas. For flag entries, do not perform the split.
|
|
316 if self.infoTags.has_key(tag):
|
|
317 if numberValues == 0 and infoType == "Flag": result = True
|
|
318 elif numberValues != 0 and infoType == "Flag":
|
|
319 print >> sys.stderr, "ERROR"
|
|
320 exit(1)
|
|
321 else:
|
|
322 fields = self.infoTags[tag].split(",")
|
|
323 if len(fields) != numberValues:
|
|
324 text = "Unexpected number of entries"
|
|
325 self.generalError(text, "information tag", tag)
|
|
326
|
|
327 for i in range(infoNumber):
|
|
328 try: result.append(fields[i])
|
|
329 except IndexError:
|
|
330 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
|
|
331 self.generalError(text, "tag:", tag)
|
|
332 else: numberValues = 0
|
|
333
|
|
334 else:
|
|
335 text = "information field does not have a definition in the header"
|
|
336 self.generalError(text, "tag", tag)
|
|
337
|
|
338 return numberValues, infoType, result
|
|
339
|
|
340 # Get the genotype information.
|
|
341 def getGenotypeInfo(self, sample, tag):
|
|
342 result = []
|
|
343 if self.formatHeaderTags.has_key(tag):
|
|
344 infoNumber = self.formatHeaderTags[tag][0]
|
|
345 infoType = self.formatHeaderTags[tag][1]
|
|
346 numberValues = infoNumber
|
|
347
|
|
348 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
|
|
349 numberValues = 0
|
|
350 result = "."
|
|
351 else:
|
|
352 if self.genotypeFields[sample].has_key(tag):
|
|
353 if tag == "GT":
|
|
354 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
|
|
355 text = "Unexected number of characters in genotype (GT) field"
|
|
356 self.generalError(text, "sample", sample)
|
|
357
|
|
358 # If a diploid call, check whether or not the genotype is phased.
|
|
359 elif len(self.genotypeFields[sample][tag]) == 3:
|
|
360 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
|
|
361 result.append( self.genotypeFields[sample][tag][0] )
|
|
362 result.append( self.genotypeFields[sample][tag][2] )
|
|
363 elif len(self.genotypeFields[sample][tag]) == 3:
|
|
364 result.append( self.genotypeFields[sample][tag][0] )
|
|
365 else:
|
|
366 fields = self.genotypeFields[sample][tag].split(",")
|
|
367 if len(fields) != numberValues:
|
|
368 text = "Unexpected number of characters in " + tag + " field"
|
|
369 self.generalError(text, "sample", sample)
|
|
370
|
|
371 for i in range(infoNumber): result.append(fields[i])
|
|
372 else:
|
|
373 text = "genotype field does not have a definition in the header"
|
|
374 self.generalError(text, "tag", tag)
|
|
375
|
|
376 return numberValues, result
|
|
377
|
|
378 # Parse the dbsnp entry. If the entry conforms to the required variant type,
|
|
379 # return the dbsnp rsid value, otherwise ".".
|
|
380 def getDbsnpInfo(self):
|
|
381
|
|
382 # First check that the variant class (VC) is listed as SNP.
|
|
383 vc = self.info.split("VC=",1)
|
|
384 if vc[1].find(";") != -1: snp = vc[1].split(";",1)
|
|
385 else:
|
|
386 snp = []
|
|
387 snp.append(vc[1])
|
|
388
|
|
389 if snp[0].lower() == "snp": rsid = self.rsid
|
|
390 else: rsid = "."
|
|
391
|
|
392 return rsid
|
|
393
|
|
394 # Build a new vcf record.
|
|
395 def buildRecord(self, removeGenotypes):
|
|
396 record = self.referenceSequence + "\t" + \
|
|
397 str(self.position) + "\t" + \
|
|
398 self.rsid + "\t" + \
|
|
399 self.ref + "\t" + \
|
|
400 self.alt + "\t" + \
|
|
401 str(self.quality) + "\t" + \
|
|
402 self.filters + "\t" + \
|
|
403 self.info
|
|
404
|
|
405 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
|
|
406
|
|
407 record += "\n"
|
|
408
|
|
409 return record
|
|
410
|
|
411 # Close the vcf file.
|
|
412 def closeVcf(self, filename):
|
|
413 self.filehandle.close()
|
|
414
|
|
415 # Define error messages for different handled errors.
|
|
416 def generalError(self, text, field, fieldValue):
|
|
417 print >> sys.stderr, "\nError encountered when attempting to read:"
|
|
418 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
|
|
419 print >> sys.stderr, "\tposition :\t\t", self.position
|
|
420 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
|
|
421 print >> sys.stderr, "\n", text
|
|
422 exit(1)
|