0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os.path
|
|
4 import sys
|
|
5 import vcfPytools
|
|
6 from vcfPytools import __version__
|
|
7
|
|
8 # Determine whether to output to a file or stdout.
|
|
9 def setOutput(output):
|
|
10 if output == None:
|
|
11 outputFile = sys.stdout
|
|
12 writeOut = False
|
|
13 else:
|
|
14 output = os.path.abspath(output)
|
|
15 outputFile = open(output, 'w')
|
|
16 writeOut = True
|
|
17
|
|
18 return outputFile, writeOut
|
|
19
|
|
20 # Determine which file has priority for writing out records.
|
|
21 def setVcfPriority(priorityFile, vcfFiles):
|
|
22 if priorityFile == None: priority = 0
|
|
23 elif priorityFile == vcfFiles[0]: priority = 1
|
|
24 elif priorityFile == vcfFiles[1]: priority = 2
|
|
25 elif priorityFile.lower() == "merge": priority = 3
|
|
26 else:
|
|
27 print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."
|
|
28 exit(1)
|
|
29
|
|
30 return priority
|
|
31
|
|
32 # If the union or intersection of two vcf files is being performed
|
|
33 # and the output vcf file is to contain the information from both
|
|
34 # files, the headers need to be merged to ensure that all info and
|
|
35 # format entries have an explanation.
|
|
36 def mergeHeaders(v1, v2, v3):
|
|
37
|
|
38 # If either file does not have a header, terminate the program.
|
|
39 # In order to merge the headers, the different fields must be
|
|
40 # checked to ensure the files are compatible.
|
|
41 if not v1.hasHeader or not v2.hasHeader:
|
|
42 print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
|
|
43 exit(1)
|
|
44
|
|
45 v3.infoHeaderTags = v1.infoHeaderTags.copy()
|
|
46 v3.formatHeaderTags = v1.formatHeaderTags.copy()
|
|
47 v3.numberDataSets = v1.numberDataSets
|
|
48 v3.includedDataSets = v1.includedDataSets.copy()
|
|
49 v3.headerText = v1.headerText
|
|
50 v3.headerTitles = v1.headerTitles
|
|
51 v3.infoHeaderString = v1.infoHeaderString.copy()
|
|
52 v3.formatHeaderString = v1.formatHeaderString.copy()
|
|
53
|
|
54 # Merge the info field descriptions.
|
|
55 for tag in v2.infoHeaderTags:
|
|
56 if v1.infoHeaderTags.has_key(tag):
|
|
57 if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
|
|
58 v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
|
|
59 print v1.infoHeaderTags[tag][0]
|
|
60 print v1.infoHeaderTags[tag][1]
|
|
61 print v1.infoHeaderTags[tag][2]
|
|
62 print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
|
|
63 exit(1)
|
|
64 else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
|
|
65
|
|
66 # Merge the format field descriptions.
|
|
67 for tag in v2.formatHeaderTags:
|
|
68 if v1.formatHeaderTags.has_key(tag):
|
|
69 if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
|
|
70 v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
|
|
71 print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
|
|
72 exit(1)
|
|
73 else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
|
|
74
|
|
75 # Now check to see if the vcf files contain information from multiple
|
|
76 # records themselves and create an ordered list in which the data
|
|
77 # will appear in the file. For instance, of the first file has
|
|
78 # already got two sets of data and is being intersected with a file
|
|
79 # with one set of data, the order of data in the new vcf file will be
|
|
80 # the two sets from the first file followed by the second, e.g.
|
|
81 # AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
|
|
82 # value of AC from the second vcf. The header will have a ##FILE for
|
|
83 # each of the three files, so the origin if the data can be recovered.
|
|
84 if v1.numberDataSets == 0:
|
|
85 v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
|
|
86 v3.numberDataSets += 1
|
|
87 if v2.numberDataSets == 0:
|
|
88 v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
|
|
89 v3.numberDataSets += 1
|
|
90 else:
|
|
91 for i in range(1, v2.numberDataSets + 1):
|
|
92 v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
|
|
93 v3.numberDataSets += 1
|
|
94
|
|
95 # If either of the input files contain multiple data sets (e.g. multiple
|
|
96 # vcf files have undergone intersection or union calculations and all
|
|
97 # information has been retained) and the priority isn't set to 'merge',
|
|
98 # terminate the program. This is to ensure that the origin of the data
|
|
99 # doesn't get confused.
|
|
100 def checkDataSets(v1, v2):
|
|
101 if v1.numberDataSets + v2.numberDataSets != 0:
|
|
102 print >> sys.stderr, "\nERROR:"
|
|
103 print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
|
|
104 print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
|
|
105 print >> sys.stderr, "Other tools may be incompatible with this format."
|
|
106 exit(1)
|
|
107
|
|
108 # Write the header to file.
|
|
109 def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):
|
|
110 if not v.hasHeader:
|
|
111 v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
|
|
112 v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
|
|
113 outputFile.write(v.headerText) if v.headerText != "" else None
|
|
114 print >> outputFile, taskDescriptor
|
|
115 for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
|
|
116 for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
|
|
117
|
|
118 # Write out a list of files indicating which data set belongs to which file.
|
|
119 if v.numberDataSets != 0:
|
|
120 for i in range(1, v.numberDataSets + 1):
|
|
121 print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
|
|
122
|
|
123 if removeGenotypes:
|
|
124 line = v.headerTitles.rstrip("\n").split("\t")
|
|
125 newHeaderTitles = line[0]
|
|
126 for i in range(1,8):
|
|
127 newHeaderTitles = newHeaderTitles + "\t" + line[i]
|
|
128 newHeaderTitles = newHeaderTitles + "\n"
|
|
129 outputFile.write( newHeaderTitles )
|
|
130 else:
|
|
131 outputFile.write( v.headerTitles )
|
|
132
|
|
133 # Check that the two reference sequence lists are identical.
|
|
134 # If there are a different number or order, the results may
|
|
135 # not be as expected.
|
|
136 def checkReferenceSequenceLists(list1, list2):
|
|
137 errorMessage = False
|
|
138 if len(list1) != len(list2):
|
|
139 print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
|
|
140 errorMessage = True
|
|
141 elif list1 != list2:
|
|
142 print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
|
|
143 errorMessage = True
|
|
144 if errorMessage:
|
|
145 print >> sys.stderr, "Results may not be as expected."
|
|
146 print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
|
|
147 print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
|
|
148
|
|
149 # Write out a vcf record to file. The record written depends on the
|
|
150 # value of 'priority' and could therefore be the record from either
|
|
151 # of the vcf files, or a combination of them.
|
|
152
|
|
153 def writeVcfRecord(priority, v1, v2, outputFile):
|
|
154 if priority == 0:
|
|
155 if v1.quality >= v2.quality: outputFile.write(v1.record)
|
|
156 else: outputFile.write(v2.record)
|
|
157 elif priority == 1: outputFile.write(v1.record)
|
|
158 elif priority == 2: outputFile.write(v2.record)
|
|
159 elif priority == 3:
|
|
160
|
|
161 # Define the missing entry values (depends on the number of data sets
|
|
162 # in the file).
|
|
163 info = ""
|
|
164 missingEntry1 = missingEntry2 = "."
|
|
165 for i in range(1, v1.numberDataSets): missingEntry1 += "/."
|
|
166 for i in range(1, v2.numberDataSets): missingEntry2 += "/."
|
|
167 secondList = v2.infoTags.copy()
|
|
168
|
|
169 # Build up the info field.
|
|
170 for tag in v1.infoTags:
|
|
171 if secondList.has_key(tag):
|
|
172 if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
|
|
173 del secondList[tag]
|
|
174 else:
|
|
175 if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
|
|
176
|
|
177 # Now include the info tags that are not populated in the first vcf file.
|
|
178 for tag in secondList:
|
|
179 if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
|
|
180
|
|
181 # Build the complete record.
|
|
182 info = info.rstrip(";")
|
|
183 record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
|
|
184 v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
|
|
185 print >> outputFile, record
|
|
186 else:
|
|
187 print >> sys.sterr, "Unknown file priority."
|
|
188 exit(1)
|