Mercurial > repos > xuebing > sharplabtool
diff tools/vcf_tools/intersect.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vcf_tools/intersect.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,181 @@ +#!/usr/bin/python + +import os.path +import sys +import optparse + +import bedClass +from bedClass import * + +import vcfClass +from vcfClass import * + +import tools +from tools import * + +if __name__ == "__main__": + main() + +# Intersect two vcf files. It is assumed that the two files are +# sorted by genomic coordinates and the reference sequences are +# in the same order. +def intersectVcf(v1, v2, priority, outputFile): + success1 = v1.getRecord() + success2 = v2.getRecord() + currentReferenceSequence = v1.referenceSequence + +# As soon as the end of either file is reached, there can be no +# more intersecting SNPs, so terminate. + while success1 and success2: + if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence: + if v1.position == v2.position: + writeVcfRecord(priority, v1, v2, outputFile) + success1 = v1.getRecord() + success2 = v2.getRecord() + elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) + elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) + else: + if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) + elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) + +# If the last record for a reference sequence is the same for both vcf +# files, they will both have referenceSequences different from the +# current reference sequence. Change the reference sequence to reflect +# this and proceed. + else: + if v1.referenceSequence != v2.referenceSequence: + print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." + print >> sys.stderr, "Check that both files contain records for the following reference sequences:" + print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence + exit(1) + currentReferenceSequence = v1.referenceSequence + +# Intersect a vcf file and a bed file. It is assumed that the +# two files are sorted by genomic coordinates and the reference +# sequences are in the same order. +def intersectVcfBed(v, b, outputFile): + successb = b.getRecord() + successv = v.getRecord() + currentReferenceSequence = v.referenceSequence + +# As soon as the end of the first file is reached, there are no +# more intersections and the program can terminate. + while successv: + if v.referenceSequence == b.referenceSequence: + if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None) + elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position) + else: + outputFile.write(v.record) + successv = v.getRecord() + else: + if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None) + if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position) + currentReferenceSequence = v.referenceSequence + +def main(): + +# Parse the command line options + usage = "Usage: vcfPytools.py intersect [options]" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-i", "--in", + action="append", type="string", + dest="vcfFiles", help="input vcf files") + parser.add_option("-b", "--bed", + action="store", type="string", + dest="bedFile", help="input bed vcf file") + parser.add_option("-o", "--out", + action="store", type="string", + dest="output", help="output vcf file") + parser.add_option("-f", "--priority-file", + action="store", type="string", + dest="priorityFile", help="output records from this vcf file") + + (options, args) = parser.parse_args() + +# Check that a single vcf file is given. + if options.vcfFiles == None: + parser.print_help() + print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection." + exit(1) + elif len(options.vcfFiles) > 2: + parser.print_help() + print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection." + exit(1) + elif len(options.vcfFiles) == 1 and not options.bedFile: + parser.print_help() + print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection." + exit(1) + +# Set the output file to stdout if no output file was specified. + outputFile, writeOut = setOutput(options.output) # tools.py + +# If intersecting with a bed file, call the bed intersection routine. + if options.bedFile: + v = vcf() # Define vcf object. + b = bed() # Define bed object. + +# Open the files. + v.openVcf(options.vcfFiles[0]) + b.openBed(options.bedFile) + +# Read in the header information. + v.parseHeader(options.vcfFiles[0], writeOut) + taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile + writeHeader(outputFile, v, False, taskDescriptor) # tools.py + +# Intersect the vcf file with the bed file. + intersectVcfBed(v, b, outputFile) + +# Check that the input files had the same list of reference sequences. +# If not, it is possible that there were some problems. + checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py + +# Close the files. + v.closeVcf(options.vcfFiles[0]) + b.closeBed(options.bedFile) + + else: + priority = setVcfPriority(options.priorityFile, options.vcfFiles) + v1 = vcf() # Define vcf object. + v2 = vcf() # Define vcf object. + +# Open the vcf files. + v1.openVcf(options.vcfFiles[0]) + v2.openVcf(options.vcfFiles[1]) + +# Read in the header information. + v1.parseHeader(options.vcfFiles[0], writeOut) + v2.parseHeader(options.vcfFiles[1], writeOut) + if priority == 3: + v3 = vcf() # Generate a new vcf object that will contain the header information of the new file. + mergeHeaders(v1, v2, v3) # tools.py + v1.processInfo = True + v2.processInfo = True + else: checkDataSets(v1, v2) + + #print v1.samplesList + #print v2.samplesList + +# Check that the header for the two files contain the same samples. + if v1.samplesList != v2.samplesList: + print >> sys.stderr, "vcf files contain different samples (or sample order)." + exit(1) + else: + taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename + if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor) + elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py + else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py + +# Intersect the two vcf files. + intersectVcf(v1, v2, priority, outputFile) + +# Check that the input files had the same list of reference sequences. +# If not, it is possible that there were some problems. + checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py + +# Close the vcf files. + v1.closeVcf(options.vcfFiles[0]) + v2.closeVcf(options.vcfFiles[1]) + +# End the program. + return 0