Mercurial > repos > yufei-luo > s_mart
diff commons/core/parsing/CrossSsrAndBesMappedByBlatToGff.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/CrossSsrAndBesMappedByBlatToGff.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,197 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import os
import optparse
from commons.core.parsing.SsrParser import SsrParser
from commons.core.parsing.BlatParser import BlatParser


class CrossSsrAndBesMappedByBlatToGff(object):
    """Cross SSR results with Blat hits of the same BES and write them as GFF3.

    SSR hits are grouped by BES name; for every Blat hit whose query name is
    one of these BES names, one GFF3 line is written per SSR of that BES, with
    the SSR positions converted to positions on the Blat target sequence.
    """

    # Number of lines skipped at the top of the Blat file before the first hit.
    _BLAT_HEADER_LINE_COUNT = 5

    def __init__(self):
        self._inputFileSSR = ''
        self._inputFileBlat = ''
        self._outputFileGFF = ''
        # Set here so that the GFF line builder can be used before
        # checkOptions() has been called.
        self._methodName = None

    def setAttributesFromCmdLine(self):
        """Parse the command line and store the result in self.options."""
        usage = (
            '\nThis Script Launch CrossSsrAndBesMappedByBlatToGff.\n\n'
            'Example 1: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3\n'
            'Example 2: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3 -n muscadine:filtre1\n\n'
        )

        parser = optparse.OptionParser(usage=usage, version="CrossSsrAndBesMappedByBlatToGff.py v1.0")
        parser.add_option('-s', '--ssr', dest='inputSSR', help='SSR Input File Name [Format: tabular]', default=None)
        parser.add_option('-b', '--blat', dest='inputBLAT', help='Blat Input File Name [Format: tabular]', default=None)
        parser.add_option('-o', '--output', dest='output', help='Output File Name [Format: GFF3]', default=None)
        parser.add_option('-n', '--methodName', dest='methodName', help='Method name in col. 3 [Default: None]', default=None)
        options, _args = parser.parse_args()
        self.options = options

    def checkOptions(self):
        """Validate self.options and copy them into the instance attributes.

        Raises Exception when a mandatory option is missing or when an input
        file does not exist.
        """
        # The option defaults are None, so test truthiness: this covers both
        # a missing option (None) and an explicitly empty value ('').
        if not self.options.inputSSR:
            raise Exception("ERROR: No SSR file specified for -s !")
        if not os.path.exists(self.options.inputSSR):
            raise Exception("ERROR: SSR Input File doesn't exist !")
        self._inputFileSSR = self.options.inputSSR

        if not self.options.inputBLAT:
            raise Exception("ERROR: No Blat file specified for -b !")
        if not os.path.exists(self.options.inputBLAT):
            raise Exception("ERROR: Blat Input File doesn't exist !")
        self._inputFileBlat = self.options.inputBLAT

        if not self.options.output:
            raise Exception("ERROR: No Output file specified for -o !")
        self._outputFileGFF = self.options.output

        self._methodName = self.options.methodName

    def run(self):
        """Check the options, then write one GFF3 line per (Blat hit, SSR) pair."""
        self.checkOptions()
        self._createGFFOutputFile()

        dictSsrParser = self.createDictOfSsrParser({})

        with open(self._inputFileBlat, 'r') as blatFile:
            for _ in range(self._BLAT_HEADER_LINE_COUNT):
                blatFile.readline()

            # Line numbers are 1-based and passed to the parser with each line.
            numberLine = self._BLAT_HEADER_LINE_COUNT + 1
            blatLine = blatFile.readline()
            # Reading stops at end of file or at the first empty line.
            while blatLine != '' and blatLine != '\n':
                thisBlatHit = BlatParser()
                thisBlatHit.setAttributesFromString(blatLine, numberLine)

                if thisBlatHit.getQName() in dictSsrParser:
                    lLinesToPrint = self.createListOfGFFLinesForThisBesWithSSR(thisBlatHit, dictSsrParser)
                    self._printGFFLinesToOutputFile(lLinesToPrint)

                blatLine = blatFile.readline()
                numberLine += 1

    def createDictOfSsrParser(self, dictSsrParser):
        """Return a dict mapping each BES name to the list of its SsrParser hits.

        The dictSsrParser argument is ignored (a new dict is always built); it
        is kept only so that existing callers keep working.
        """
        dictSsrParser = {}
        with open(self._inputFileSSR, 'r') as ssrFile:
            ssrFile.readline()  # the first line is skipped as a header
            numberLine = 2
            line = ssrFile.readline()
            # Reading stops at end of file or at the first empty line.
            while line != '' and line != '\n':
                thisSSRHit = SsrParser()
                thisSSRHit.setAttributesFromString(line, numberLine)
                dictSsrParser.setdefault(thisSSRHit.getBesName(), []).append(thisSSRHit)

                line = ssrFile.readline()
                numberLine += 1

        return dictSsrParser

    def createListOfGFFLinesForThisBesWithSSR(self, BlatHitObject, dictSsrParser):
        """Return the GFF3 lines (one per SSR) for the BES of the given Blat hit.

        BlatHitObject -- Blat hit whose query name is a key of dictSsrParser
        dictSsrParser -- dict as returned by createDictOfSsrParser()
        """
        listGffLines = []

        besNameToKeep = BlatHitObject.getQName()
        # Values of the Blat hit are the same for every SSR of this BES.
        targetName = BlatHitObject.getTName()
        targetStart = BlatHitObject.getTStart()
        targetEnd = BlatHitObject.getTEnd()
        strand = BlatHitObject.getStrand()

        if self._methodName:
            featureType = '%s:SSR' % self._methodName
        else:
            featureType = 'SSR'

        for SSRHitObject in dictSsrParser[besNameToKeep]:
            # NOTE(review): on the '-' strand the converted start is larger
            # than the converted end whenever ssr_start < ssr_end, whereas
            # GFF3 expects start <= end. Left unchanged; confirm whether
            # downstream consumers rely on the current column order.
            posSSRStart = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrStart(), targetStart, targetEnd, strand)
            posSSREnd = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrEnd(), targetStart, targetEnd, strand)
            ssrSeq = self.getSsrSeq(SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber())

            attributes = 'ID=SSR_%s_%s;Name=SSR_%s_%s;bes_name=%s;bes_size=%s;bes_matchstart=%s;bes_matchend=%s;bes_redundancy=%s;ssr_type=%s;ssr_motif=%s;ssr_motif_number=%s;ssr_start=%s;ssr_end=%s;muscadine_seq=%s' % (
                besNameToKeep, SSRHitObject.getBesRedundancy(),
                besNameToKeep, SSRHitObject.getBesRedundancy(),
                besNameToKeep, BlatHitObject.getQSize(),
                BlatHitObject.getQStart(), BlatHitObject.getQEnd(),
                SSRHitObject.getBesRedundancy(), SSRHitObject.getSsrNbNucleotides(),
                SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber(),
                SSRHitObject.getSsrStart(), SSRHitObject.getSsrEnd(), ssrSeq)
            gffLine = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                targetName, 'CrossSsrAndBesAlignedByBlat', featureType,
                posSSRStart, posSSREnd, '.', strand, '.', attributes)
            listGffLines.append(gffLine)

        return listGffLines

    def convertSSRPositionsToChromPositions(self, ssrPos, chromPosStart, chromPosEnd, strand):
        """Convert a position on the BES into a position on the Blat target.

        On '+' the position is counted from chromPosStart, on '-' it is
        counted backwards from chromPosEnd. Raises ValueError for any other
        strand value.
        """
        if strand == '+':
            return int(chromPosStart) + int(ssrPos) - 1
        if strand == '-':
            return int(chromPosEnd) - int(ssrPos) + 1
        raise ValueError("ERROR: unknown strand '%s' (expected '+' or '-')" % strand)

    def getSsrSeq(self, motif, nbMotif):
        """Return the motif repeated nbMotif times."""
        return motif * int(nbMotif)

    def _createGFFOutputFile(self):
        """Create (or truncate) the output file and write the GFF3 header."""
        with open(self._outputFileGFF, 'w') as gffFile:
            gffFile.write("##gff-version 3\n")

    def _printGFFLinesToOutputFile(self, lLinesToPrint):
        """Append the given lines to the output file."""
        with open(self._outputFileGFF, 'a') as gffFile:
            gffFile.writelines(lLinesToPrint)


if __name__ == '__main__':
    iCrossSsrAndBesMappedByBlatToGff = CrossSsrAndBesMappedByBlatToGff()
    iCrossSsrAndBesMappedByBlatToGff.setAttributesFromCmdLine()
    iCrossSsrAndBesMappedByBlatToGff.run()