annotate tools/vcf_tools/extract.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import os.path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 import optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 import vcfClass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 from vcfClass import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 import tools
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 from tools import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 # Parse the command line options
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 usage = "Usage: vcfPytools.py extract [options]"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 parser = optparse.OptionParser(usage = usage)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 parser.add_option("-i", "--in",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 dest="vcfFile", help="input vcf file (stdin for piped vcf)")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 parser.add_option("-o", "--out",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 dest="output", help="output validation file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 parser.add_option("-s", "--reference-sequence",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 dest="referenceSequence", help="extract records from this reference sequence")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 parser.add_option("-r", "--region",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 action="store", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 dest="region", help="extract records from this region")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 parser.add_option("-q", "--keep-quality",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 action="append", type="string", nargs=2,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 dest="keepQuality", help="keep records containing this quality")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 parser.add_option("-k", "--keep-info",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 action="append", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 dest="infoKeep", help="keep records containing this info field")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 parser.add_option("-d", "--discard-info",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 action="append", type="string",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 dest="infoDiscard", help="discard records containing this info field")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 parser.add_option("-p", "--pass-filter",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 action="store_true", default=False,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 dest="passFilter", help="discard records whose filter field is not PASS")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 (options, args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 # Check that a vcf file is given.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 if options.vcfFile == None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 # Check that either a reference sequence or region is specified,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 # but not both if not dealing with info fields.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 if not options.referenceSequence and not options.region:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 print >> sys.stderr, "if not extracting records based on info strings."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 if options.referenceSequence and options.region:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 parser.print_help()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 # If a region was supplied, check the format.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 if options.region:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 if options.region.find(":") == -1 or options.region.find("..") == -1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 regionList = options.region.split(":",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 referenceSequence = regionList[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 try: start = int(regionList[1].split("..")[0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 print >> sys.stderr, "region start coordinate is not an integer"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 try: end = int(regionList[1].split("..")[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 print >> sys.stderr, "region end coordinate is not an integer"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 # Ensure that discard-info and keep-info haven't both been defined.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if options.infoKeep and options.infoDiscard:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 # If the --keep-quality argument is used, check that a value and a logical
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 # argument are supplied and that the logical argument is valid.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 if options.keepQuality:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 for value, logic in options.keepQuality:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 try: qualityValue = float(value)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 print >> sys.stderr, "Quality value must be an integer or float value."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 qualityLogic = logic
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 # Set the output file to stdout if no output file was specified.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 outputFile, writeOut = setOutput(options.output)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 v = vcf() # Define vcf object.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 # Set process info to True if info strings need to be parsed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 if options.infoKeep or options.infoDiscard: v.processInfo = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 # Open the file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 v.openVcf(options.vcfFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 # Read in the header information.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 v.parseHeader(options.vcfFile, writeOut)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 taskDescriptor = "##vcfPytools=extract data"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 # Read through all the entries and write out records in the correct
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 # reference sequence.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 while v.getRecord():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 writeRecord = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 elif options.region:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 if v.referenceSequence != referenceSequence: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 elif v.position < start or v.position > end: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 # Only consider these fields if the record is contained within the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 # specified region.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if options.infoKeep and writeRecord:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 for tag in options.infoKeep:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 if v.infoTags.has_key(tag):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 writeRecord = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 if not v.infoTags.has_key(tag): writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 if options.infoDiscard and writeRecord:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 for tag in options.infoDiscard:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 if v.infoTags.has_key(tag): writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 if options.keepQuality:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 if writeRecord: outputFile.write(v.record)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 # Close the file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 v.closeVcf(options.vcfFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 # Terminate the program cleanly.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 return 0