comparison commons/core/parsing/VarscanToVCF.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 import math
34 from commons.core.LoggerFactory import LoggerFactory
35 from commons.core.utils.RepetOptionParser import RepetOptionParser
36 from commons.core.utils.FileUtils import FileUtils
37 from commons.core.parsing.VarscanFile import VarscanFile
38 from commons.core.seq.Bioseq import Bioseq
39
40 LOG_DEPTH = "core.parsing"
41
42 ##Reference launcher implementation
43 #
44 class VarscanToVCF(object):
45
46 def __init__(self, varscanFileName = "", vcfFileName = "", doClean = False, verbosity = 0):
47 self._varscanFileName = varscanFileName
48 self.setvcfFileName(vcfFileName)
49 self._doClean = doClean
50 self._verbosity = verbosity
51
52 self._vcfRevision = "VCFv4.1"
53 self._vcfHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
54
55 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
56
57 def setAttributesFromCmdLine(self):
58 description = "Conver Varscan file to VCF file."
59 epilog = "\t$ python VarscanToVCF.py -i varscanFileName -v 2"
60 parser = RepetOptionParser(description = description, epilog = epilog)
61 parser.add_option("-i", "--Varscan", dest = "varscanFileName", action = "store", type = "string", help = "input Varscan file name [compulsory] [format: varscan2.2.8]", default = "")
62 parser.add_option("-o", "--vcfFileName",dest = "vcfFileName", action = "store", type = "string", help = "vcfFileName file name [default: <input>.vcf]", default = "")
63 parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
64 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
65 options = parser.parse_args()[0]
66 self._setAttributesFromOptions(options)
67
68 def _setAttributesFromOptions(self, options):
69 self.setvarscanFileName(options.varscanFileName)
70 self.setvcfFileName(options.vcfFileName)
71 self.setDoClean(options.doClean)
72 self.setVerbosity(options.verbosity)
73
74 def setvarscanFileName(self, varscanFileName):
75 self._varscanFileName = varscanFileName
76
77 def setvcfFileName(self, vcfFileName):
78 if vcfFileName == "":
79 self._vcfFileName = "%s.vcf" % self._varscanFileName
80 else:
81 self._vcfFileName = vcfFileName
82
83 def setDoClean(self, doClean):
84 self._doClean = doClean
85
86 def setVerbosity(self, verbosity):
87 self._verbosity = verbosity
88
89 def _checkOptions(self):
90 if self._varscanFileName == "":
91 self._logAndRaise("ERROR: Missing input file name")
92 else:
93 if not FileUtils.isRessourceExists(self._varscanFileName):
94 self._logAndRaise("ERROR: Input Varscan file '%s' does not exist!" % self._varscanFileName)
95
96 def _logAndRaise(self, errorMsg):
97 self._log.error(errorMsg)
98 raise Exception(errorMsg)
99
100 def _convertVarscanLineToVCFRecord(self, varscanLine, lineNumber):
101 iVarscanFile = VarscanFile()
102 iVarscanFile.setTypeOfVarscanFile("Varscan_2_2_8")
103 iVarscanHit = iVarscanFile.createVarscanObjectFromLine(varscanLine, lineNumber)
104 Chrom = iVarscanHit.getChrom()
105 Pos = int(iVarscanHit.getPosition())
106 #ID = str(lineNumber)
107 ID = "."
108 Ref = iVarscanHit.getRef()
109 Alt = iVarscanHit.getVar()
110 Qual = -10*math.log10(float(iVarscanHit.getPValue()))
111 Filter = "."
112 AF = float(iVarscanHit.getVarFreq()[:-1])/100
113 DP = int(iVarscanHit.getReadsRef()) + int(iVarscanHit.getReadsVar())
114 RBQ = iVarscanHit.getQualRef()
115 ABQ = iVarscanHit.getQualVar()
116 #MQ = iVarscanHit.getMapQualRef()
117 Info = ";".join(["AF=%.4f" %AF,"DP=%d" %DP,"RBQ=%s" %RBQ, "ABQ=%s" %ABQ])
118
119 allel = Bioseq().getATGCNFromIUPACandATGCN(iVarscanHit.getCns(), Ref)
120 if allel != Alt:
121 self._log.warning("'VarAllele' attribute of Varscan file line '%d' was not correct. Correcting using '%s' instead of '%s'." % (lineNumber, allel, Alt))
122 Alt = allel
123
124 vcfLine = "%s\t%s\t%s\t%s\t%s\t%.9f\t%s\t%s\n" % (Chrom, Pos, ID, Ref, Alt, Qual, Filter, Info)
125 return vcfLine
126
127 def run(self):
128 LoggerFactory.setLevel(self._log, self._verbosity)
129 self._checkOptions()
130 self._log.info("START Varscan To VCF")
131 self._log.debug("Input file name: %s" % self._varscanFileName)
132
133 with open(self._vcfFileName, "w") as fVCF:
134 fVCF.write("##fileformat=%s\n" % self._vcfRevision)
135 fVCF.write("%s\n" % self._vcfHeader)
136
137 with open(self._varscanFileName, "r") as fVarscan:
138 lineNumber = 1
139 line = fVarscan.readline()
140 while line:
141 if line[0] != "#" and "Chrom\tPosition\tRef\tCons" not in line:
142 vcfLine = self._convertVarscanLineToVCFRecord(line, lineNumber)
143 fVCF.write(vcfLine)
144 line = fVarscan.readline()
145 lineNumber += 1
146
147 self._log.info("END Varscan To VCF")
148
149 if __name__ == "__main__":
150 iLaunch = VarscanToVCF()
151 iLaunch.setAttributesFromCmdLine()
152 iLaunch.run()