| 0 | 1 #!/usr/bin/python | 
|  | 2 | 
|  | 3 import os.path | 
|  | 4 import sys | 
|  | 5 import re | 
|  | 6 | 
|  | 7 class vcf: | 
|  | 8   def __init__(self): | 
|  | 9 | 
|  | 10 # Header info. | 
|  | 11     self.filename = "" | 
|  | 12     self.hasHeader = True | 
|  | 13     self.headerText = "" | 
|  | 14     self.headerTitles = "" | 
|  | 15     self.vcfFormat = "" | 
|  | 16     #self.headerInfoText = "" | 
|  | 17     #self.headerFormatText = "" | 
|  | 18 | 
|  | 19 # Store the info and format tags as well as the lines that describe | 
|  | 20 # them in a dictionary. | 
|  | 21     self.numberDataSets = 0 | 
|  | 22     self.includedDataSets = {} | 
|  | 23     self.infoHeaderTags = {} | 
|  | 24     self.infoHeaderString = {} | 
|  | 25     self.formatHeaderTags = {} | 
|  | 26     self.formatHeaderString = {} | 
|  | 27 | 
|  | 28 # Genotype information. | 
|  | 29     self.genotypes = False | 
|  | 30     self.infoField = {} | 
|  | 31 | 
|  | 32 # Reference sequence information. | 
|  | 33     self.referenceSequences = {} | 
|  | 34     self.referenceSequenceList = [] | 
|  | 35     self.referenceSequence = "" | 
|  | 36 | 
|  | 37 # Record information. | 
|  | 38     self.position = -1 | 
|  | 39     self.samplesList = [] | 
|  | 40 | 
|  | 41 # Determine which fields to process. | 
|  | 42     self.processInfo = False | 
|  | 43     self.processGenotypes = False | 
|  | 44     self.dbsnpVcf = False | 
|  | 45     self.hapmapVcf = False | 
|  | 46 | 
|  | 47 # Open a vcf file. | 
|  | 48   def openVcf(self, filename): | 
|  | 49     if filename == "stdin": | 
|  | 50       self.filehandle = sys.stdin | 
|  | 51       self.filename = "stdin" | 
|  | 52     else: | 
|  | 53       try: self.filehandle = open(filename,"r") | 
|  | 54       except IOError: | 
|  | 55         print >> sys.stderr, "Failed to find file: ",filename | 
|  | 56         exit(1) | 
|  | 57       self.filename = os.path.abspath(filename) | 
|  | 58 | 
|  | 59 # Parse the vcf header. | 
|  | 60   def parseHeader(self, filename, writeOut): | 
|  | 61     while self.getHeaderLine(filename, writeOut): | 
|  | 62       continue | 
|  | 63 | 
|  | 64 # Determine the type of information in the header line. | 
|  | 65   def getHeaderLine(self, filename, writeOut): | 
|  | 66     self.headerLine = self.filehandle.readline().rstrip("\n") | 
|  | 67     if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat() | 
|  | 68     if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info") | 
|  | 69     elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format") | 
|  | 70     elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut) | 
|  | 71     elif self.headerLine.startswith("##"): success = self.headerAdditional() | 
|  | 72     elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut) | 
|  | 73     else: success = self.noHeader(filename, writeOut) | 
|  | 74 | 
|  | 75     return success | 
|  | 76 | 
|  | 77 # Read VCF format | 
|  | 78   def getvcfFormat(self): | 
|  | 79       try: | 
|  | 80           self.vcfFormat = self.headerLine.split("=",1)[1] | 
|  | 81           self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string | 
|  | 82       except IndexError: | 
|  | 83           print >> sys.stderr, "\nError parsing the fileformat" | 
|  | 84           print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine | 
|  | 85           exit(1) | 
|  | 86       return True | 
|  | 87 | 
|  | 88 | 
|  | 89 # Read information on an info field from the header line. | 
|  | 90   def headerInfo(self, writeOut, lineType): | 
|  | 91     tag = self.headerLine.split("=",1) | 
|  | 92     tagID = (tag[1].split("ID=",1))[1].split(",",1) | 
|  | 93 | 
|  | 94 # Check if this info field has already been defined. | 
|  | 95     if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])): | 
|  | 96       print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header." | 
|  | 97       exit(1) | 
|  | 98 | 
|  | 99 # Determine the number of entries, entry type and description. | 
|  | 100     tagNumber = (tagID[1].split("Number=",1))[1].split(",",1) | 
|  | 101     tagType = (tagNumber[1].split("Type=",1))[1].split(",",1) | 
|  | 102     try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0] | 
|  | 103     except IndexError: tagDescription = "" | 
|  | 104     tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0] | 
|  | 105 | 
|  | 106 # Check that the number of fields associated with the tag is either | 
|  | 107 # an integer or a '.' to indicate variable number of entries. | 
|  | 108     if tagNumber == ".": tagNumber = "variable" | 
|  | 109     else: | 
|  | 110       if self.vcfFormat<4.1: | 
|  | 111 | 
|  | 112         try: | 
|  | 113             tagNumber = int(tagNumber) | 
|  | 114 | 
|  | 115         except ValueError: | 
|  | 116             print >> sys.stderr, "\nError parsing header.  Problem with info tag:", tagID | 
|  | 117             print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'" | 
|  | 118             exit(1) | 
|  | 119 | 
|  | 120     if lineType == "info": | 
|  | 121       self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription | 
|  | 122       self.infoHeaderString[tagID] = self.headerLine | 
|  | 123     if lineType == "format": | 
|  | 124       self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription | 
|  | 125       self.formatHeaderString[tagID] = self.headerLine | 
|  | 126 | 
|  | 127     return True | 
|  | 128 | 
|  | 129 # Check to see if the records contain information from multiple different | 
|  | 130 # sources.  If vcfPytools has been used to find the intersection or union | 
|  | 131 # of two vcf files, the records may have been merged to keep all the | 
|  | 132 # information available.  If this is the case, there will be a ##FILE line | 
|  | 133 # for each set of information in the file.  The order of these files needs | 
|  | 134 # to be maintained. | 
|  | 135   def headerFiles(self, writeOut): | 
|  | 136     fileID = (self.headerLine.split("ID=",1))[1].split(",",1) | 
|  | 137     filename = fileID[1].split("\"",2)[1] | 
|  | 138     try: fileID = int(fileID[0]) | 
|  | 139     except ValueError: | 
|  | 140       print >> sys.stderr, "File ID in ##FILE entry must be an integer." | 
|  | 141       print >> sys.stderr, self.headerLine | 
|  | 142       exit(1) | 
|  | 143     if self.includedDataSets.has_key(fileID): | 
|  | 144       print >> sys.stderr, "\nERROR: file " + self.filename | 
|  | 145       print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values." | 
|  | 146       exit(1) | 
|  | 147     self.includedDataSets[fileID] = filename | 
|  | 148 | 
|  | 149 # Set the number of files with information in this vcf file. | 
|  | 150     if fileID > self.numberDataSets: self.numberDataSets = fileID | 
|  | 151 | 
|  | 152     return True | 
|  | 153 | 
|  | 154 # Read additional information contained in the header. | 
|  | 155   def headerAdditional(self): | 
|  | 156     self.headerText += self.headerLine + "\n" | 
|  | 157 | 
|  | 158     return True | 
|  | 159 | 
|  | 160 # Read in the column titles to check that all standard fields | 
|  | 161 # are present and read in all the samples. | 
|  | 162   def headerTitleString(self, filename, writeOut): | 
|  | 163     self.headerTitles = self.headerLine + "\n" | 
|  | 164 | 
|  | 165 # Strip the end of line character from the last infoFields entry. | 
|  | 166     infoFields = self.headerLine.split("\t") | 
|  | 167     if len(infoFields) > 8: | 
|  | 168 #      if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename | 
|  | 169 #      elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename | 
|  | 170       self.samplesList = infoFields[9:] | 
|  | 171       self.genotypes = True | 
|  | 172     elif len(infoFields) == 8: | 
|  | 173       if writeOut: print >> sys.stdout, "No samples present in the header.  No genotype information available." | 
|  | 174     else: | 
|  | 175       print self.headerLine, len(infoFields) | 
|  | 176       print >> sys.stderr, "Not all vcf standard fields are available." | 
|  | 177       exit(1) | 
|  | 178 | 
|  | 179     return False | 
|  | 180 | 
|  | 181 # If there is no header in the vcf file, close and reopen the | 
|  | 182 # file so that the first line is avaiable for parsing as a | 
|  | 183 # vcf record. | 
|  | 184   def noHeader(self, filename, writeOut): | 
|  | 185     if writeOut: print >> sys.stdout, "No header lines present in", filename | 
|  | 186     self.hasHeader = False | 
|  | 187     self.closeVcf(filename) | 
|  | 188     self.openVcf(filename) | 
|  | 189 | 
|  | 190     return False | 
|  | 191 | 
|  | 192 # Check that info fields exist. | 
|  | 193   def checkInfoFields(self, tag): | 
|  | 194     if self.infoHeaderTags.has_key(tag) == False: | 
|  | 195       print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header." | 
|  | 196       exit(1) | 
|  | 197 | 
|  | 198 # Get the next line of information from the vcf file. | 
|  | 199   def getRecord(self): | 
|  | 200     self.record = self.filehandle.readline() | 
|  | 201     if not self.record: return False | 
|  | 202 | 
|  | 203 # Set up and execute a regular expression match. | 
|  | 204     recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$") | 
|  | 205     #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$") | 
|  | 206     recordMatch = recordRe.match(self.record) | 
|  | 207     if recordMatch == None: | 
|  | 208       print >> sys.stderr, "Unable to resolve vcf record.\n" | 
|  | 209       print >> sys.stderr, self.record | 
|  | 210       exit(1) | 
|  | 211 | 
|  | 212     self.referenceSequence = recordMatch.group(1) | 
|  | 213     try: self.position = int(recordMatch.group(2)) | 
|  | 214     except ValueError: | 
|  | 215       text = "variant position is not an integer" | 
|  | 216       self.generalError(text, "", None) | 
|  | 217     self.rsid       = recordMatch.group(3) | 
|  | 218     self.ref        = recordMatch.group(4) | 
|  | 219     self.alt        = recordMatch.group(5) | 
|  | 220     self.quality    = recordMatch.group(6) | 
|  | 221     self.filters    = recordMatch.group(7) | 
|  | 222     self.info       = recordMatch.group(8) | 
|  | 223     self.genotypeString = recordMatch.group(9) | 
|  | 224     self.infoTags   = {} | 
|  | 225 | 
|  | 226 # Check that the quality is an integer or a float.  If not, set the quality | 
|  | 227 # to zero. | 
|  | 228     try: self.quality = float(self.quality) | 
|  | 229     except ValueError: self.quality = float(0.) | 
|  | 230 | 
|  | 231 # If recordMatch.group(9) is not the end of line character, there is | 
|  | 232 # genotype information with this record. | 
|  | 233     if self.genotypeString != "\n": self.hasGenotypes = True | 
|  | 234     else: self.hasGenotypes = False | 
|  | 235 | 
|  | 236 # Add the reference sequence to the dictionary.  If it didn't previously | 
|  | 237 # exist append the reference sequence to the end of the list as well. | 
|  | 238 # This ensures that the order in which the reference sequences appeared | 
|  | 239 # in the header can be preserved. | 
|  | 240     if self.referenceSequence not in self.referenceSequences: | 
|  | 241       self.referenceSequences[self.referenceSequence] = True | 
|  | 242       self.referenceSequenceList.append(self.referenceSequence) | 
|  | 243 | 
|  | 244 # Check for multiple alternate alleles. | 
|  | 245     self.alternateAlleles = self.alt.split(",") | 
|  | 246     self.numberAlternateAlleles = len(self.alternateAlleles) | 
|  | 247 | 
|  | 248 # If required, process the info and genotypes. | 
|  | 249     if self.processInfo: self.processInfoFields() | 
|  | 250     if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields() | 
|  | 251 | 
|  | 252     return True | 
|  | 253 | 
|  | 254 # Process the info string. | 
|  | 255   def processInfoFields(self): | 
|  | 256 | 
|  | 257 # First break the info string into its constituent elements. | 
|  | 258     infoEntries = self.info.split(";") | 
|  | 259 | 
|  | 260 # As long as some info fields exist, place them into a dictionary. | 
|  | 261     for entry in infoEntries: | 
|  | 262       infoEntry = entry.split("=") | 
|  | 263 | 
|  | 264 # If the entry is a flag, there will be no equals and the length of | 
|  | 265 # infoEntry will be 1.  In this case, set the dictionary entry to the | 
|  | 266 # whole entry.  If the vcf file has undergone a union or intersection | 
|  | 267 # operation and contains the information from multiple files, this may | 
|  | 268 # be a '/' seperate list of flags and so cannot be set to a Boolean value | 
|  | 269 # yet. | 
|  | 270       if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0] | 
|  | 271       elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1] | 
|  | 272 | 
|  | 273 # Process the genotype formats and values. | 
|  | 274   def processGenotypeFields(self): | 
|  | 275     genotypeEntries = self.genotypeString.split("\t") | 
|  | 276     self.genotypeFormatString = genotypeEntries[1] | 
|  | 277     self.genotypes = list(genotypeEntries[2:]) | 
|  | 278     self.genotypeFormats = {} | 
|  | 279     self.genotypeFields = {} | 
|  | 280     self.genotypeFormats = self.genotypeFormatString.split(":") | 
|  | 281 | 
|  | 282 # Check that the number of genotype fields is equal to the number of samples | 
|  | 283     if len(self.samplesList) != len(self.genotypes): | 
|  | 284       text = "The number of genotypes is different to the number of samples" | 
|  | 285       self.generalError(text, "", "") | 
|  | 286 | 
|  | 287 # Add the genotype information to a dictionary. | 
|  | 288     for i in range( len(self.samplesList) ): | 
|  | 289       genotypeInfo = self.genotypes[i].split(":") | 
|  | 290       self.genotypeFields[ self.samplesList[i] ] = {} | 
|  | 291 | 
|  | 292 # Check that there are as many fields as in the format field.  If not, this must | 
|  | 293 # be because the information is not known.  In this case, it is permitted that | 
|  | 294 # the genotype information is either . or ./. | 
|  | 295       if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo): | 
|  | 296         self.genotypeFields[ self.samplesList[i] ] = "." | 
|  | 297       else: | 
|  | 298         if len(self.genotypeFormats) != len(genotypeInfo): | 
|  | 299           text = "The number of genotype fields is different to the number specified in the format string" | 
|  | 300           self.generalError(text, "sample", self.samplesList[i]) | 
|  | 301 | 
|  | 302         for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j] | 
|  | 303 | 
|  | 304 # Parse through the vcf file until the correct reference sequence is | 
|  | 305 # encountered and the position is greater than or equal to that requested. | 
|  | 306   def parseVcf(self, referenceSequence, position, writeOut, outputFile): | 
|  | 307     success = True | 
|  | 308     if self.referenceSequence != referenceSequence: | 
|  | 309       while self.referenceSequence != referenceSequence and success: | 
|  | 310         if writeOut: outputFile.write(self.record) | 
|  | 311         success = self.getRecord() | 
|  | 312 | 
|  | 313     while self.referenceSequence == referenceSequence and self.position < position and success: | 
|  | 314       if writeOut: outputFile.write(self.record) | 
|  | 315       success = self.getRecord() | 
|  | 316 | 
|  | 317     return success | 
|  | 318 | 
|  | 319 # Get the information for a specific info tag.  Also check that it contains | 
|  | 320 # the correct number and type of entries. | 
|  | 321   def getInfo(self, tag): | 
|  | 322     result = [] | 
|  | 323 | 
|  | 324 # Check if the tag exists in the header information.  If so, | 
|  | 325 # determine the number and type of entries asscoiated with this | 
|  | 326 # tag. | 
|  | 327     if self.infoHeaderTags.has_key(tag): | 
|  | 328       infoNumber = self.infoHeaderTags[tag][0] | 
|  | 329       infoType = self.infoHeaderTags[tag][1] | 
|  | 330       numberValues = infoNumber | 
|  | 331 | 
|  | 332 # First check that the tag exists in the information string.  Then split | 
|  | 333 # the entry on commas.  For flag entries, do not perform the split. | 
|  | 334       if self.infoTags.has_key(tag): | 
|  | 335         if numberValues == 0 and infoType == "Flag": result = True | 
|  | 336         elif numberValues != 0 and infoType == "Flag": | 
|  | 337           print >> sys.stderr, "ERROR" | 
|  | 338           exit(1) | 
|  | 339         else: | 
|  | 340           fields = self.infoTags[tag].split(",") | 
|  | 341           if len(fields) != numberValues: | 
|  | 342             text = "Unexpected number of entries" | 
|  | 343             self.generalError(text, "information tag", tag) | 
|  | 344 | 
|  | 345           for i in range(infoNumber): | 
|  | 346             try: result.append(fields[i]) | 
|  | 347             except IndexError: | 
|  | 348               text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0] | 
|  | 349               self.generalError(text, "tag:", tag) | 
|  | 350       else: numberValues = 0 | 
|  | 351 | 
|  | 352     else: | 
|  | 353       text = "information field does not have a definition in the header" | 
|  | 354       self.generalError(text, "tag", tag) | 
|  | 355 | 
|  | 356     return numberValues, infoType, result | 
|  | 357 | 
|  | 358 # Get the genotype information. | 
|  | 359   def getGenotypeInfo(self, sample, tag): | 
|  | 360     result = [] | 
|  | 361     if self.formatHeaderTags.has_key(tag): | 
|  | 362       infoNumber = self.formatHeaderTags[tag][0] | 
|  | 363       infoType = self.formatHeaderTags[tag][1] | 
|  | 364       numberValues = infoNumber | 
|  | 365 | 
|  | 366       if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1: | 
|  | 367         numberValues = 0 | 
|  | 368         result = "." | 
|  | 369       else: | 
|  | 370         if self.genotypeFields[sample].has_key(tag): | 
|  | 371           if tag == "GT": | 
|  | 372             if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1: | 
|  | 373               text = "Unexected number of characters in genotype (GT) field" | 
|  | 374               self.generalError(text, "sample", sample) | 
|  | 375 | 
|  | 376 # If a diploid call, check whether or not the genotype is phased. | 
|  | 377             elif len(self.genotypeFields[sample][tag]) == 3: | 
|  | 378               self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False | 
|  | 379               result.append( self.genotypeFields[sample][tag][0] ) | 
|  | 380               result.append( self.genotypeFields[sample][tag][2] ) | 
|  | 381             elif len(self.genotypeFields[sample][tag]) == 3: | 
|  | 382               result.append( self.genotypeFields[sample][tag][0] ) | 
|  | 383           else: | 
|  | 384             fields = self.genotypeFields[sample][tag].split(",") | 
|  | 385             if len(fields) != numberValues: | 
|  | 386               text = "Unexpected number of characters in " + tag + " field" | 
|  | 387               self.generalError(text, "sample", sample) | 
|  | 388 | 
|  | 389             for i in range(infoNumber): result.append(fields[i]) | 
|  | 390     else: | 
|  | 391       text = "genotype field does not have a definition in the header" | 
|  | 392       self.generalError(text, "tag", tag) | 
|  | 393 | 
|  | 394     return numberValues, result | 
|  | 395 | 
|  | 396 # Parse the dbsnp entry.  If the entry conforms to the required variant type, | 
|  | 397 # return the dbsnp rsid value, otherwise ".". | 
|  | 398   def getDbsnpInfo(self): | 
|  | 399 | 
|  | 400 # First check that the variant class (VC) is listed as SNP. | 
|  | 401     vc = self.info.split("VC=",1) | 
|  | 402     if vc[1].find(";") != -1: snp = vc[1].split(";",1) | 
|  | 403     else: | 
|  | 404       snp = [] | 
|  | 405       snp.append(vc[1]) | 
|  | 406 | 
|  | 407     if snp[0].lower() == "snp": rsid = self.rsid | 
|  | 408     else: rsid = "." | 
|  | 409 | 
|  | 410     return rsid | 
|  | 411 | 
|  | 412 # Build a new vcf record. | 
|  | 413   def buildRecord(self, removeGenotypes): | 
|  | 414     record = self.referenceSequence + "\t" + \ | 
|  | 415                 str(self.position) + "\t" + \ | 
|  | 416                 self.rsid + "\t" + \ | 
|  | 417                 self.ref + "\t" + \ | 
|  | 418                 self.alt + "\t" + \ | 
|  | 419                 str(self.quality) + "\t" + \ | 
|  | 420                 self.filters + "\t" + \ | 
|  | 421                 self.info | 
|  | 422 | 
|  | 423     if self.hasGenotypes and not removeGenotypes: record += self.genotypeString | 
|  | 424 | 
|  | 425     record += "\n" | 
|  | 426 | 
|  | 427     return record | 
|  | 428 | 
|  | 429 # Close the vcf file. | 
|  | 430   def closeVcf(self, filename): | 
|  | 431     self.filehandle.close() | 
|  | 432 | 
|  | 433 # Define error messages for different handled errors. | 
|  | 434   def generalError(self, text, field, fieldValue): | 
|  | 435     print >> sys.stderr, "\nError encountered when attempting to read:" | 
|  | 436     print >> sys.stderr, "\treference sequence :\t", self.referenceSequence | 
|  | 437     print >> sys.stderr, "\tposition :\t\t", self.position | 
|  | 438     if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue | 
|  | 439     print >> sys.stderr,  "\n", text | 
|  | 440     exit(1) |