sharplabtool: tools/vcf_tools/tools.py annotate

annotate tools/vcf_tools/tools.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

rev	line source
0 9071e359b9a3 Uploaded xuebing parents: diff changeset	1 #!/usr/bin/python
9071e359b9a3 Uploaded xuebing parents: diff changeset	2
9071e359b9a3 Uploaded xuebing parents: diff changeset	3 import os.path
9071e359b9a3 Uploaded xuebing parents: diff changeset	4 import sys
9071e359b9a3 Uploaded xuebing parents: diff changeset	5 import vcfPytools
9071e359b9a3 Uploaded xuebing parents: diff changeset	6 from vcfPytools import __version__
9071e359b9a3 Uploaded xuebing parents: diff changeset	7
def setOutput(output):
    """Choose where output records are written.

    Args:
        output: path of the file to write to, or None to write to stdout.

    Returns:
        A tuple (outputFile, writeOut).  When output is None, outputFile is
        sys.stdout and writeOut is False.  Otherwise outputFile is a file
        object freshly opened for writing at the absolute form of the path
        and writeOut is True.  The file is returned open; this function
        does not close it.
    """
    # Identity test: None is a singleton, and '==' can be intercepted by an
    # argument type that overrides __eq__.
    if output is None:
        return sys.stdout, False

    return open(os.path.abspath(output), 'w'), True
9071e359b9a3 Uploaded xuebing parents: diff changeset	19
def setVcfPriority(priorityFile, vcfFiles):
    """Translate the requested priority file into a numeric priority code.

    Args:
        priorityFile: None, the name of one of the two input vcf files, or
            the word "merge" (matched case-insensitively).
        vcfFiles: sequence whose first two entries are the input vcf files.

    Returns:
        0 when no priority file was given, 1 when it is vcfFiles[0], 2 when
        it is vcfFiles[1], and 3 when it is "merge".

    Raises:
        SystemExit: with status 1 (after a message on stderr) for any other
            value of priorityFile.
    """
    if priorityFile is None:
        return 0

    # File names are tested before the "merge" keyword, so an input file
    # that happens to be called "merge" still selects that file.
    if priorityFile == vcfFiles[0]:
        return 1
    if priorityFile == vcfFiles[1]:
        return 2
    if priorityFile.lower() == "merge":
        return 3

    sys.stderr.write("vcf file give priority must be one of the two input vcf files or merge.\n")
    # sys.exit rather than the exit() helper, which is only injected by the
    # site module and is meant for interactive sessions.
    sys.exit(1)
9071e359b9a3 Uploaded xuebing parents: diff changeset	31
def _mergeTagDefinitions(firstTags, secondTags, mergedTags, secondStrings, mergedStrings):
    """Fold the second file's tag definitions into the merged header.

    Tags present in both files must agree on the first two entries of their
    definition; otherwise the program exits with status 1.  Tags present
    only in the second file are added to mergedTags, together with their
    header string (when the second file has one) in mergedStrings.
    """
    for tag in secondTags:
        if tag in firstTags:
            if firstTags[tag][0] != secondTags[tag][0] or \
               firstTags[tag][1] != secondTags[tag][1]:
                sys.stderr.write("Input vcf files have different definitions for " + tag + " field.\n")
                sys.exit(1)
        else:
            mergedTags[tag] = secondTags[tag]
            # Carry the header line across as well, so that the merged
            # header explains every tag that can appear in the output.
            if tag in secondStrings:
                mergedStrings.setdefault(tag, secondStrings[tag])


def mergeHeaders(v1, v2, v3):
    """Populate v3 with a header that combines the headers of v1 and v2.

    If the union or intersection of two vcf files is being performed and
    the output vcf file is to contain the information from both files, the
    headers need to be merged to ensure that all info and format entries
    have an explanation.

    v3 starts as a copy of v1's header fields.  Info and format tags that
    only v2 defines are then added.  Finally the ordered list of data sets
    is extended: v1's own filename if v1 holds no data sets yet, followed
    by v2's filename or, if v2 already holds several data sets, each of
    those in order.  For instance, if the first file already has two sets
    of data and the second has one, a merged value reads AB=3/2/4, where 3
    and 2 come from the first file and 4 from the second.

    Args:
        v1: first vcf object (read only).
        v2: second vcf object (read only).
        v3: vcf object whose header attributes are overwritten.

    Raises:
        SystemExit: with status 1 if either input has no header, or if a
            tag defined in both files differs in the first two entries of
            its definition (presumably number and type -- confirm against
            the vcf class).
    """
    # Both headers are required: without them the tag definitions cannot be
    # checked for compatibility.
    if not v1.hasHeader or not v2.hasHeader:
        sys.stderr.write("Both vcf files must have a header in order to merge data sets.\n")
        sys.exit(1)

    v3.infoHeaderTags = v1.infoHeaderTags.copy()
    v3.formatHeaderTags = v1.formatHeaderTags.copy()
    v3.numberDataSets = v1.numberDataSets
    v3.includedDataSets = v1.includedDataSets.copy()
    v3.headerText = v1.headerText
    v3.headerTitles = v1.headerTitles
    v3.infoHeaderString = v1.infoHeaderString.copy()
    v3.formatHeaderString = v1.formatHeaderString.copy()

    # Merge the info and format field descriptions.
    _mergeTagDefinitions(v1.infoHeaderTags, v2.infoHeaderTags, v3.infoHeaderTags,
                         v2.infoHeaderString, v3.infoHeaderString)
    _mergeTagDefinitions(v1.formatHeaderTags, v2.formatHeaderTags, v3.formatHeaderTags,
                         v2.formatHeaderString, v3.formatHeaderString)

    # Data sets are numbered from 1.  A file with numberDataSets == 0 is a
    # plain vcf file and contributes itself as a single data set.
    if v1.numberDataSets == 0:
        v3.numberDataSets += 1
        v3.includedDataSets[v3.numberDataSets] = v1.filename

    if v2.numberDataSets == 0:
        secondDataSets = [v2.filename]
    else:
        secondDataSets = [v2.includedDataSets[i] for i in range(1, v2.numberDataSets + 1)]

    for dataSet in secondDataSets:
        v3.numberDataSets += 1
        v3.includedDataSets[v3.numberDataSets] = dataSet
9071e359b9a3 Uploaded xuebing parents: diff changeset	94
def checkDataSets(v1, v2):
    """Abort if either input vcf file already holds multiple data sets.

    An input holds multiple data sets when earlier intersection or union
    calculations retained all information.  Terminating here ensures that
    the origin of the data doesn't get confused.  This function does not
    look at the priority itself; presumably the caller only invokes it
    when the priority is not 'merge' -- confirm against the call sites.

    Args:
        v1: first vcf object; only numberDataSets is read.
        v2: second vcf object; only numberDataSets is read.

    Raises:
        SystemExit: with status 1 when the two numberDataSets values do not
            sum to zero.
    """
    if v1.numberDataSets + v2.numberDataSets == 0:
        return

    sys.stderr.write(
        "\nERROR:\n"
        "input vcf file(s) contain data sets from multiple vcf files.\n"
        "Further intersection or union operations must include --priority-file merge\n"
        "Other tools may be incompatible with this format.\n"
    )
    sys.exit(1)
9071e359b9a3 Uploaded xuebing parents: diff changeset	107
def writeHeader(outputFile, v, removeGenotypes, taskDescriptor):
    """Write the vcf header to outputFile.

    The header is written in this order: the header text, the task
    descriptor line, the info header strings, the format header strings,
    one ##FILE line per included data set, and finally the column titles.

    Args:
        outputFile: writable file-like object.
        v: vcf object supplying the header fields.  If v.hasHeader is
            false, v.headerText and v.headerTitles are overwritten with a
            minimal VCFv4.0 header before writing.
        removeGenotypes: when true, only the first eight column titles are
            written, dropping any that follow.
        taskDescriptor: line describing the task that produced the file.
    """
    if not v.hasHeader:
        v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
        v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"

    if v.headerText != "":
        outputFile.write(v.headerText)
    outputFile.write(str(taskDescriptor) + "\n")
    for tag in v.infoHeaderString:
        outputFile.write(str(v.infoHeaderString[tag]) + "\n")
    for tag in v.formatHeaderString:
        outputFile.write(str(v.formatHeaderString[tag]) + "\n")

    # Write out a list of files indicating which data set belongs to which
    # file.  Data sets are numbered from 1; the range is empty when there
    # are none.
    for i in range(1, v.numberDataSets + 1):
        outputFile.write("##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">\n")

    if removeGenotypes:
        # Slicing keeps at most the first eight columns and, unlike fixed
        # indexing, does not fail on a title line that is shorter than that.
        columns = v.headerTitles.rstrip("\n").split("\t")
        outputFile.write("\t".join(columns[:8]) + "\n")
    else:
        outputFile.write(v.headerTitles)
9071e359b9a3 Uploaded xuebing parents: diff changeset	132
def checkReferenceSequenceLists(list1, list2):
    """Warn on stderr if the two reference sequence lists are not identical.

    If the lists have a different number or order of entries, the results
    may not be as expected.  This only warns; it never stops the program
    and returns nothing.

    Args:
        list1: reference sequences observed in the first input file.
        list2: reference sequences observed in the second input file.
    """
    if len(list1) != len(list2):
        warning = "WARNING: Input files contain a different number of reference sequences."
    elif list1 != list2:
        warning = "WARNING: Input files contain different or differently ordered reference sequences."
    else:
        return

    sys.stderr.write(warning + "\n")
    sys.stderr.write("Results may not be as expected.\n")
    sys.stderr.write("Ensure that input files have the same reference sequences in the same order.\n")
    sys.stderr.write("Reference sequence lists observed were:\n\t" + str(list1) + "\n\t" + str(list2) + "\n")
9071e359b9a3 Uploaded xuebing parents: diff changeset	148
def _qualityAsNumber(quality):
    """Return quality as a float, or -inf if it is missing or not numeric."""
    try:
        return float(quality)
    except (TypeError, ValueError):
        return float("-inf")


def writeVcfRecord(priority, v1, v2, outputFile):
    """Write out one vcf record chosen or built from v1 and v2.

    The record written depends on the value of 'priority' and could
    therefore be the record from either of the vcf files, or a combination
    of them.

    Args:
        priority: 0 writes the record with the higher quality (v1 on a
            tie), 1 writes v1's record, 2 writes v2's record, 3 writes a
            merged record.
        v1: first vcf object, positioned on the record to write.
        v2: second vcf object, positioned on the record to write.
        outputFile: writable file-like object.

    Raises:
        SystemExit: with status 1 for any other priority value.
    """
    if priority == 0:
        # Compare numerically: the quality values are joined as strings
        # below, and string comparison would rank "9" above "10".
        if _qualityAsNumber(v1.quality) >= _qualityAsNumber(v2.quality):
            outputFile.write(v1.record)
        else:
            outputFile.write(v2.record)
    elif priority == 1:
        outputFile.write(v1.record)
    elif priority == 2:
        outputFile.write(v2.record)
    elif priority == 3:
        # Define the missing entry values (depends on the number of data
        # sets in the file): one "." per data set, and at least one.
        missingEntry1 = "/".join(["."] * max(v1.numberDataSets, 1))
        missingEntry2 = "/".join(["."] * max(v2.numberDataSets, 1))

        # Tags of the second file that have not been paired with a tag of
        # the first file yet.
        secondOnly = v2.infoTags.copy()

        # Build up the info field.  Tags whose header type is "flag" are
        # left out of the merged record.
        fields = []
        for tag in v1.infoTags:
            if tag in secondOnly:
                secondValue = v2.infoTags[tag]
                del secondOnly[tag]
            else:
                secondValue = missingEntry2
            if v1.infoHeaderTags[tag][1].lower() != "flag":
                fields.append(tag + "=" + v1.infoTags[tag] + "/" + secondValue)

        # Now include the info tags that are not populated in the first
        # vcf file.
        for tag in secondOnly:
            if v2.infoHeaderTags[tag][1].lower() != "flag":
                fields.append(tag + "=" + missingEntry1 + "/" + v2.infoTags[tag])

        # Build the complete record.  The location fields come from v1;
        # the filter column is always written as ".".
        record = "\t".join([
            v1.referenceSequence,
            str(v1.position),
            v1.rsid,
            v1.ref,
            v1.alt + "/" + v2.alt,
            str(v1.quality) + "/" + str(v2.quality),
            ".",
            ";".join(fields),
        ])
        outputFile.write(record + "\n")
    else:
        sys.stderr.write("Unknown file priority.\n")
        sys.exit(1)

Mercurial > repos > xuebing > sharplabtool

annotate tools/vcf_tools/tools.py @ 0:9071e359b9a3