annotate commons/core/parsing/VarscanToVCF.py @ 44:5f796c5c579f

Uploaded
author m-zytnicki
date Wed, 18 Sep 2013 08:32:38 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 # Copyright INRA (Institut National de la Recherche Agronomique)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 # http://www.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 # http://urgi.versailles.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 # This software is governed by the CeCILL license under French law and
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 # abiding by the rules of distribution of free software. You can use,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 # modify and/ or redistribute the software under the terms of the CeCILL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 # license as circulated by CEA, CNRS and INRIA at the following URL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 # "http://www.cecill.info".
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 # As a counterpart to the access to the source code and rights to copy,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 # modify and redistribute granted by the license, users are provided only
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 # with a limited warranty and the software's author, the holder of the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 # economic rights, and the successive licensors have only limited
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 # liability.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 # In this respect, the user's attention is drawn to the risks associated
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 # with loading, using, modifying and/or developing or reproducing the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 # software by the user in light of its specific status of free software,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 # that may mean that it is complicated to manipulate, and that also
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 # therefore means that it is reserved for developers and experienced
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 # professionals having in-depth computer knowledge. Users are therefore
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 # encouraged to load and test the software's suitability as regards their
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 # requirements in conditions enabling the security of their systems and/or
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 # data to be ensured and, more generally, to use and operate it in the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 # same conditions as regards security.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 # The fact that you are presently reading this means that you have had
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 # knowledge of the CeCILL license and that you accept its terms.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 import math
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 from commons.core.LoggerFactory import LoggerFactory
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 from commons.core.utils.RepetOptionParser import RepetOptionParser
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 from commons.core.utils.FileUtils import FileUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 from commons.core.parsing.VarscanFile import VarscanFile
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 from commons.core.seq.Bioseq import Bioseq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 LOG_DEPTH = "core.parsing"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 ##Reference launcher implementation
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 class VarscanToVCF(object):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 def __init__(self, varscanFileName = "", vcfFileName = "", doClean = False, verbosity = 0):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47 self._varscanFileName = varscanFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 self.setvcfFileName(vcfFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49 self._doClean = doClean
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50 self._verbosity = verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 self._vcfRevision = "VCFv4.1"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 self._vcfHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 def setAttributesFromCmdLine(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 description = "Conver Varscan file to VCF file."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 epilog = "\t$ python VarscanToVCF.py -i varscanFileName -v 2"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 parser = RepetOptionParser(description = description, epilog = epilog)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 parser.add_option("-i", "--Varscan", dest = "varscanFileName", action = "store", type = "string", help = "input Varscan file name [compulsory] [format: varscan2.2.8]", default = "")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 parser.add_option("-o", "--vcfFileName",dest = "vcfFileName", action = "store", type = "string", help = "vcfFileName file name [default: <input>.vcf]", default = "")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 options = parser.parse_args()[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 self._setAttributesFromOptions(options)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68 def _setAttributesFromOptions(self, options):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 self.setvarscanFileName(options.varscanFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 self.setvcfFileName(options.vcfFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 self.setDoClean(options.doClean)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 self.setVerbosity(options.verbosity)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74 def setvarscanFileName(self, varscanFileName):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 self._varscanFileName = varscanFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77 def setvcfFileName(self, vcfFileName):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 if vcfFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 self._vcfFileName = "%s.vcf" % self._varscanFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81 self._vcfFileName = vcfFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 def setDoClean(self, doClean):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 self._doClean = doClean
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 def setVerbosity(self, verbosity):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 self._verbosity = verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89 def _checkOptions(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 if self._varscanFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 self._logAndRaise("ERROR: Missing input file name")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93 if not FileUtils.isRessourceExists(self._varscanFileName):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 self._logAndRaise("ERROR: Input Varscan file '%s' does not exist!" % self._varscanFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96 def _logAndRaise(self, errorMsg):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 self._log.error(errorMsg)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 raise Exception(errorMsg)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100 def _convertVarscanLineToVCFRecord(self, varscanLine, lineNumber):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101 iVarscanFile = VarscanFile()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102 iVarscanFile.setTypeOfVarscanFile("Varscan_2_2_8")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 iVarscanHit = iVarscanFile.createVarscanObjectFromLine(varscanLine, lineNumber)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 Chrom = iVarscanHit.getChrom()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 Pos = int(iVarscanHit.getPosition())
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 #ID = str(lineNumber)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107 ID = "."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 Ref = iVarscanHit.getRef()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109 Alt = iVarscanHit.getVar()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 Qual = -10*math.log10(float(iVarscanHit.getPValue()))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 Filter = "."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 AF = float(iVarscanHit.getVarFreq()[:-1])/100
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 DP = int(iVarscanHit.getReadsRef()) + int(iVarscanHit.getReadsVar())
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 RBQ = iVarscanHit.getQualRef()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 ABQ = iVarscanHit.getQualVar()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 #MQ = iVarscanHit.getMapQualRef()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 Info = ";".join(["AF=%.4f" %AF,"DP=%d" %DP,"RBQ=%s" %RBQ, "ABQ=%s" %ABQ])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 allel = Bioseq().getATGCNFromIUPACandATGCN(iVarscanHit.getCns(), Ref)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120 if allel != Alt:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121 self._log.warning("'VarAllele' attribute of Varscan file line '%d' was not correct. Correcting using '%s' instead of '%s'." % (lineNumber, allel, Alt))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 Alt = allel
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 vcfLine = "%s\t%s\t%s\t%s\t%s\t%.9f\t%s\t%s\n" % (Chrom, Pos, ID, Ref, Alt, Qual, Filter, Info)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 return vcfLine
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127 def run(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 LoggerFactory.setLevel(self._log, self._verbosity)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 self._checkOptions()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 self._log.info("START Varscan To VCF")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 self._log.debug("Input file name: %s" % self._varscanFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 with open(self._vcfFileName, "w") as fVCF:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 fVCF.write("##fileformat=%s\n" % self._vcfRevision)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135 fVCF.write("%s\n" % self._vcfHeader)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137 with open(self._varscanFileName, "r") as fVarscan:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 lineNumber = 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139 line = fVarscan.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140 while line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 if line[0] != "#" and "Chrom\tPosition\tRef\tCons" not in line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 vcfLine = self._convertVarscanLineToVCFRecord(line, lineNumber)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143 fVCF.write(vcfLine)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144 line = fVarscan.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 lineNumber += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147 self._log.info("END Varscan To VCF")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149 if __name__ == "__main__":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150 iLaunch = VarscanToVCF()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151 iLaunch.setAttributesFromCmdLine()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152 iLaunch.run()