0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os.path
|
|
4 import sys
|
|
5 import optparse
|
|
6
|
|
7 import vcfClass
|
|
8 from vcfClass import *
|
|
9
|
|
10 import tools
|
|
11 from tools import *
|
|
12
|
|
13 if __name__ == "__main__":
|
|
14 main()
|
|
15
|
|
16 def main():
|
|
17
|
|
18 # Parse the command line options
|
|
19 usage = "Usage: vcfPytools.py extract [options]"
|
|
20 parser = optparse.OptionParser(usage = usage)
|
|
21 parser.add_option("-i", "--in",
|
|
22 action="store", type="string",
|
|
23 dest="vcfFile", help="input vcf file (stdin for piped vcf)")
|
|
24 parser.add_option("-o", "--out",
|
|
25 action="store", type="string",
|
|
26 dest="output", help="output validation file")
|
|
27 parser.add_option("-s", "--reference-sequence",
|
|
28 action="store", type="string",
|
|
29 dest="referenceSequence", help="extract records from this reference sequence")
|
|
30 parser.add_option("-r", "--region",
|
|
31 action="store", type="string",
|
|
32 dest="region", help="extract records from this region")
|
|
33 parser.add_option("-q", "--keep-quality",
|
|
34 action="append", type="string", nargs=2,
|
|
35 dest="keepQuality", help="keep records containing this quality")
|
|
36 parser.add_option("-k", "--keep-info",
|
|
37 action="append", type="string",
|
|
38 dest="infoKeep", help="keep records containing this info field")
|
|
39 parser.add_option("-d", "--discard-info",
|
|
40 action="append", type="string",
|
|
41 dest="infoDiscard", help="discard records containing this info field")
|
|
42 parser.add_option("-p", "--pass-filter",
|
|
43 action="store_true", default=False,
|
|
44 dest="passFilter", help="discard records whose filter field is not PASS")
|
|
45
|
|
46 (options, args) = parser.parse_args()
|
|
47
|
|
48 # Check that a vcf file is given.
|
|
49 if options.vcfFile == None:
|
|
50 parser.print_help()
|
|
51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
|
|
52 exit(1)
|
|
53
|
|
54 # Check that either a reference sequence or region is specified,
|
|
55 # but not both if not dealing with info fields.
|
|
56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
|
|
57 if not options.referenceSequence and not options.region:
|
|
58 parser.print_help()
|
|
59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
|
|
60 print >> sys.stderr, "if not extracting records based on info strings."
|
|
61 exit(1)
|
|
62 if options.referenceSequence and options.region:
|
|
63 parser.print_help()
|
|
64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
|
|
65 exit(1)
|
|
66
|
|
67 # If a region was supplied, check the format.
|
|
68 if options.region:
|
|
69 if options.region.find(":") == -1 or options.region.find("..") == -1:
|
|
70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end."
|
|
71 exit(1)
|
|
72 regionList = options.region.split(":",1)
|
|
73 referenceSequence = regionList[0]
|
|
74 try: start = int(regionList[1].split("..")[0])
|
|
75 except ValueError:
|
|
76 print >> sys.stderr, "region start coordinate is not an integer"
|
|
77 exit(1)
|
|
78 try: end = int(regionList[1].split("..")[1])
|
|
79 except ValueError:
|
|
80 print >> sys.stderr, "region end coordinate is not an integer"
|
|
81 exit(1)
|
|
82
|
|
83 # Ensure that discard-info and keep-info haven't both been defined.
|
|
84 if options.infoKeep and options.infoDiscard:
|
|
85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
|
|
86 exit(1)
|
|
87
|
|
88 # If the --keep-quality argument is used, check that a value and a logical
|
|
89 # argument are supplied and that the logical argument is valid.
|
|
90
|
|
91 if options.keepQuality:
|
|
92 for value, logic in options.keepQuality:
|
|
93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
|
|
94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
|
|
95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
|
|
96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
|
|
97 exit(1)
|
|
98 try: qualityValue = float(value)
|
|
99 except ValueError:
|
|
100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
|
|
101 print >> sys.stderr, "Quality value must be an integer or float value."
|
|
102 exit(1)
|
|
103 qualityLogic = logic
|
|
104
|
|
105 # Set the output file to stdout if no output file was specified.
|
|
106 outputFile, writeOut = setOutput(options.output)
|
|
107
|
|
108 v = vcf() # Define vcf object.
|
|
109
|
|
110 # Set process info to True if info strings need to be parsed.
|
|
111 if options.infoKeep or options.infoDiscard: v.processInfo = True
|
|
112
|
|
113 # Open the file.
|
|
114 v.openVcf(options.vcfFile)
|
|
115
|
|
116 # Read in the header information.
|
|
117 v.parseHeader(options.vcfFile, writeOut)
|
|
118 taskDescriptor = "##vcfPytools=extract data"
|
|
119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
|
|
120
|
|
121 # Read through all the entries and write out records in the correct
|
|
122 # reference sequence.
|
|
123 while v.getRecord():
|
|
124 writeRecord = True
|
|
125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
|
|
126 elif options.region:
|
|
127 if v.referenceSequence != referenceSequence: writeRecord = False
|
|
128 elif v.position < start or v.position > end: writeRecord = False
|
|
129
|
|
130 # Only consider these fields if the record is contained within the
|
|
131 # specified region.
|
|
132 if options.infoKeep and writeRecord:
|
|
133 for tag in options.infoKeep:
|
|
134 if v.infoTags.has_key(tag):
|
|
135 writeRecord = True
|
|
136 break
|
|
137 if not v.infoTags.has_key(tag): writeRecord = False
|
|
138 if options.infoDiscard and writeRecord:
|
|
139 for tag in options.infoDiscard:
|
|
140 if v.infoTags.has_key(tag): writeRecord = False
|
|
141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
|
|
142 if options.keepQuality:
|
|
143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
|
|
144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
|
|
145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
|
|
146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
|
|
147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
|
|
148
|
|
149 if writeRecord: outputFile.write(v.record)
|
|
150
|
|
151 # Close the file.
|
|
152 v.closeVcf(options.vcfFile)
|
|
153
|
|
154 # Terminate the program cleanly.
|
|
155 return 0
|