Mercurial > repos > yufei-luo > s_mart
diff commons/tools/ChangeSequenceHeaders.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/ChangeSequenceHeaders.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,524 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import sys +import getopt +from commons.core.coord.Align import Align +from commons.core.coord.Path import Path +from commons.core.coord.Match import Match + + + +class ChangeSequenceHeaders( object ): + + def __init__( self, name="ChangeSequenceHeaders", inFile="", format="", step=0, prefix="seq", outFile="",linkFile="", whichCluster = "", replace_query=True, replace_subject=True, verbosity=0): + self._name = name + self._inFile = inFile + self._format = format + self._step = step + self._prefix = prefix + self._linkFile = linkFile + self._whichCluster = whichCluster + self._outFile = outFile + self.replace_query = replace_query + self.replace_subject = replace_subject + self._verbose = verbosity + + + def help( self ): + print + print "usage: %s.py [ options ]" % ( self._name ) + print "options:" + print " -h: this help" + print " -i: name of the input file" + print " -f: format of the input file (fasta/newick/align/path/tab)" + print " -s: step (1: shorten headers / 2: retrieve initial headers)" + print " -p: prefix of new headers (with '-s 1', default='seq')" + print " -l: name of the 'link' file (with '-s 2', format=map)" + print " -w: header formatting type (A: after LTRharvest, B: for ClusterConsensus, not specified: no change)" + print " -o: name of the output file (default=inFile+'.newH'/'.initH')" + print + + + def setAttributesFromCmdLine( self ): + try: + opts, args = getopt.getopt(sys.argv[1:],"hi:f:s:p:l:w:o:v:") + except getopt.GetoptError, err: + sys.stderr.write( "%s\n" % ( str(err) ) ) + self.help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); sys.exit(0) + elif o == "-i": + self.setInputFile( a ) + elif o == "-f": + self.setFormat( a ) + elif o == "-s": + self.setStep( a ) + elif o == "-p": + self.setPrefix( a ) + elif o == "-l": + self.setLinkFile( a ) + elif o == "-w": + self.setWhichcluster( a ) + elif o == "-o": + self.setOutputFile( a ) + elif o == "-v": + self.setVerbosityLevel( a ) + + + def setInputFile( self, inFile ): + self._inFile = inFile + + def setFormat( self, format ): + self._format = format + + def setPrefix( self, prefix ): + self._prefix = prefix + + def setStep( self, step ): + self._step = int(step) + + def setLinkFile( self, linkFile ): + self._linkFile = linkFile + + def setWhichcluster( self, whichCluster ): + self._whichCluster = whichCluster + + def setOutputFile( self, outFile ): + self._outFile = outFile + + def setVerbosityLevel( self, verbose ): + self._verbose = int(verbose) + + + def checkAttributes( self ): + if self._inFile == "": + sys.stderr.write( "ERROR: missing input file name (-i)\n" ) + self.help(); sys.exit(1) + if not os.path.exists( self._inFile ): + sys.stderr.write( "ERROR: input file doesn't exist (-i)\n" ) + self.help(); sys.exit(1) + if self._format not in ["fasta","newick","align","path","tab","axt","lastz", "psl","chain"]: + sys.stderr.write( "ERROR: unrecognized format '%s' (-f)\n" ) + self.help(); sys.exit(1) + if self._step not in [1,2]: + sys.stderr.write( "ERROR: missing step (-s)\n" ) + self.help(); sys.exit(1) + if self._step == 1 and self._prefix == "": + sys.stderr.write( "ERROR: missing prefix (-p)\n" ) + self.help(); sys.exit(1) + if self._step == 2: + if self._linkFile == "": + sys.stderr.write( "ERROR: missing link file name (-l)\n" ) + self.help(); sys.exit(1) + if not os.path.exists( self._linkFile ): + sys.stderr.write( "ERROR: link file doesn't exist (-l)\n" ) + self.help(); sys.exit(1) + if self._whichCluster not in ["A", "B", ""]: + sys.stderr.write( "ERROR: formatting type not available (-w option): %s\n" % self._whichCluster) + self.help(); sys.exit(1) + if self._outFile == "": + if self._step == 1: + self._outFile = "%s.newH" % ( self._inFile ) + elif self._step == 2: + self._outFile = "%s.initH" % ( self._inFile ) + + + def shortenSequenceHeadersForFastaFile( self ): + if self._verbose > 0: + print "shorten sequence headers for fasta file..." + sys.stdout.flush() + if self._verbose > 1: + print "save sequences in '%s'" %( self._outFile ) + inFileHandler = open( self._inFile, "r" ) + linkFileHandler = open( self._linkFile, "w" ) + outFileHandler = open( self._outFile, "w" ) + countSeq = 0 + lengthSeq = 0 + while True: + line = inFileHandler.readline() + if line == "": + break + if line[0] == ">": + countSeq += 1 + newHeader = "%s%i" % ( self._prefix, countSeq ) + if self._verbose > 1: + print "initial '%s' -> new '%s'" % ( line[1:-1], newHeader ) + outFileHandler.write( ">%s\n" % ( newHeader ) ) + if lengthSeq != 0: + linkFileHandler.write( "\t%i\t%i\n" % ( 1, lengthSeq ) ) + lengthSeq = 0 + linkFileHandler.write( "%s\t%s" % ( newHeader, line[1:-1] ) ) + else: + lengthSeq += len( line.replace("\n","") ) + outFileHandler.write( line ) + linkFileHandler.write( "\t%i\t%i\n" % ( 1, lengthSeq ) ) + inFileHandler.close() + linkFileHandler.close() + outFileHandler.close() + if self._verbose > 0: + print "nb of sequences: %i" % ( countSeq ) + + + def getLinksNewToInitialHeaders( self ): + if self._verbose > 0: + print "retrieve the links new->initial headers" + sys.stdout.flush() + dNew2Init = {} + linkFileHandler = open( self._linkFile,"r" ) + while True: + line = linkFileHandler.readline() + if line == "": + break + tokens = line.split("\t") + if len(tokens) == 4: + dNew2Init[ tokens[0] ] = tokens[1] + elif len(tokens) == 2: + dNew2Init[ tokens[0] ] = tokens[1].split("\n")[0] + else: + sys.stderr.write( "ERROR: link file is badly formatted\n" ) + sys.exit(1) + linkFileHandler.close() + if self._verbose > 0: print "nb of links: %i" % ( len(dNew2Init.keys()) ); sys.stdout.flush() + return dNew2Init + + + def retrieveInitialSequenceHeadersForFastaFile( self, dNew2Init ): + if self._verbose > 0: + print "retrieve initial headers for fasta file" + sys.stdout.flush() + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + countSeq = 0 + while True: + line = inFileHandler.readline() + if line == "": + break + if line[0] == ">": + if self._whichCluster == "": + initHeader = line[1:-1] + newHeader = dNew2Init[initHeader] + else: + tokens = line[1:-1].split("_") + initHeader = dNew2Init[tokens[1]] + + pattern = "" + if "BlastclustCluster" in tokens[0]: + pattern = "Blc" + if "MCLCluster" in tokens[0]: + pattern = "MCL" + + if self._whichCluster == "A": + newHeader = "%s_%s" % (tokens[0], initHeader) + elif self._whichCluster == "B": + classif = initHeader.split("_")[0] + consensusName = "_".join(initHeader.split("_")[1:]) + clusterId = tokens[0].split("Cluster")[1].split("Mb")[0] + newHeader = "%s_%s%s_%s" % (classif, pattern, clusterId, consensusName) + + outFileHandler.write( ">%s\n" % newHeader ) + else: + outFileHandler.write( line ) + inFileHandler.close() + outFileHandler.close() + if self._verbose > 0: + print "nb of sequences: %i" % ( countSeq ) + + + def retrieveInitialSequenceHeadersForNewickFile( self, dNew2Init ): + inF = open( self._inFile, "r" ) + lines = inF.readlines() + inF.close() + line = "".join(lines) #.replace(";","").replace("\n","") + outF = open( self._outFile, "w" ) + for newH in dNew2Init.keys(): + line = line.replace( newH+":", dNew2Init[newH].replace(" ","_").replace("::","-").replace(":","-").replace(",","-")+":" ) + outF.write( line ) + outF.close() + + + def retrieveInitialSequenceHeadersForAlignFile( self, dNew2Init ): + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + a = Align() + while True: + line = inFileHandler.readline() + if line == "": + break + a.setFromTuple( line.split("\t") ) + nameToBeReplaced = a.range_query.seqname + if dNew2Init.has_key( nameToBeReplaced ): + a.range_query.seqname = dNew2Init[ nameToBeReplaced ] + nameToBeReplaced = a.range_subject.seqname + if dNew2Init.has_key( nameToBeReplaced ): + a.range_subject.seqname = dNew2Init[ nameToBeReplaced ] + a.write( outFileHandler ) + inFileHandler.close() + outFileHandler.close() + + + def retrieveInitialSequenceHeadersForPathFile( self, dNew2Init ): + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + p = Path() + while True: + line = inFileHandler.readline() + if line == "": + break + p.setFromTuple( line.split("\t") ) + nameToBeReplaced = p.range_query.seqname + if dNew2Init.has_key( nameToBeReplaced ): + p.range_query.seqname = dNew2Init[ nameToBeReplaced ] + nameToBeReplaced = p.range_subject.seqname + if dNew2Init.has_key( nameToBeReplaced ): + p.range_subject.seqname = dNew2Init[ nameToBeReplaced ] + p.write( outFileHandler ) + inFileHandler.close() + outFileHandler.close() + + + def retrieveInitialSequenceHeadersForMatchFile( self, dNew2Init ): + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + m = Match() + while True: + line = inFileHandler.readline() + if line == "": + break + if line[0:10] == "query.name": + continue + m.setFromTuple( line.split("\t") ) + nameToBeReplaced = m.range_query.seqname + if dNew2Init.has_key( nameToBeReplaced ): + m.range_query.seqname = dNew2Init[ nameToBeReplaced ] + nameToBeReplaced = m.range_subject.seqname + if dNew2Init.has_key( nameToBeReplaced ): + m.range_subject.seqname = dNew2Init[ nameToBeReplaced ] + m.write( outFileHandler ) + inFileHandler.close() + outFileHandler.close() + + def retrieveInitialSequenceHeadersForAxtFile( self, dNew2Init): + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + while True: + try: + line = inFileHandler.next() + except StopIteration: + break + + if line == "" or not "seq" in line: + outFileHandler.write(line) + else : + elems = line.split(" ") + try: + subject_seqname = elems[1] + if self.replace_subject : + nameToBeReplaced = elems[1] + if dNew2Init.has_key( nameToBeReplaced ): + subject_seqname = dNew2Init[nameToBeReplaced] + subject_seqname = subject_seqname.strip('\n').strip('\r') + + query_seqname = elems[4] + if self.replace_query: + nameToBeReplaced = elems[4] + if dNew2Init.has_key( nameToBeReplaced ): + query_seqname = dNew2Init[nameToBeReplaced] + query_seqname = query_seqname.strip('\n').strip('\r') + + modedelems = [ elems[0], subject_seqname, elems[2], elems[3], query_seqname, elems[5], elems[6], elems[7], elems[8]] + newLine = " ".join(modedelems) + outFileHandler.write("%s\n" % newLine) + if self._verbose >0 : + print("query", query_seqname, "subject", subject_seqname) + print("Output axt_line : line %s " % newLine) + except: pass + inFileHandler.close() + outFileHandler.close() + + def retrieveInitialSequenceHeadersForPslFile( self, dNew2Init): + + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + while True: + try: + line = inFileHandler.next() + except StopIteration: + break + + if line == "" or not "seq" in line: + outFileHandler.write(line) + else : + elems = line.split() + try: + subject_seqname = elems[13] + if self.replace_subject : + nameToBeReplaced = elems[13] + if dNew2Init.has_key( nameToBeReplaced ): + subject_seqname = dNew2Init[nameToBeReplaced] + subject_seqname = subject_seqname.strip('\n').strip('\r') + + query_seqname = elems[9] + if self.replace_query: + nameToBeReplaced = elems[9] + if dNew2Init.has_key( nameToBeReplaced ): + query_seqname = dNew2Init[nameToBeReplaced] + query_seqname = query_seqname.strip('\n').strip('\r') + + modedelems =elems[0:9]+[query_seqname]+elems[10:13]+[subject_seqname]+elems[14:21] + #modedelems = [ elems[0], elems[1], elems[2], elems[3], elems[4], elems[5], elems[6], elems[7], elems[8], query_seqname, ] + #modedelems = [ elems[0], subject_seqname, elems[2], elems[3], query_seqname, elems[5], elems[6], elems[7], elems[8]] + newLine = "\t".join(modedelems) + outFileHandler.write("%s\n" % newLine) + if self._verbose >0 : + print("query", query_seqname, "subject", subject_seqname) + print("Output psl_line : line %s " % newLine) + except: pass + sys.stdout.flush() + inFileHandler.close() + outFileHandler.close() + + + def retrieveInitialSequenceHeadersForLastZFile( self, dNew2Init): + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + while True: + try: + line = inFileHandler.next() + except StopIteration: + break + #score, name1, strand1, size1, zstart1, end1, name2, strand2, size2, zstart2, end2, identity, coverage + + if line == "" or not "seq" in line: + outFileHandler.write(line) + else : + elems = line.split("\t") + try: + subject_seqname = elems[1] + if self.replace_subject : + nameToBeReplaced = elems[1] + if dNew2Init.has_key( nameToBeReplaced ): + subject_seqname = dNew2Init[nameToBeReplaced] + subject_seqname = subject_seqname.strip('\n').strip('\r') + + query_seqname = elems[6] + if self.replace_query: + nameToBeReplaced = elems[6] + if dNew2Init.has_key( nameToBeReplaced ): + query_seqname = dNew2Init[nameToBeReplaced] + query_seqname = query_seqname.strip('\n').strip('\r') + + modedelems = [ elems[0], subject_seqname, elems[2], elems[3], elems[4], elems[5], query_seqname, elems[7], elems[8],elems[9],elems[10], elems[11], elems[12],elems[13],elems[14].strip('\n').strip('\r')] + newLine = "\t".join(modedelems) + outFileHandler.write("%s\n" % newLine) + if self._verbose >0 : + print("query", query_seqname, "subject", subject_seqname) + print("Output lastz_line : line %s " % newLine) + except: pass + inFileHandler.close() + outFileHandler.close() + + def retrieveInitialSequenceHeadersForChainFile( self, dNew2Init): + #format: chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id + inFileHandler = open( self._inFile, "r" ) + outFileHandler = open( self._outFile, "w" ) + while True: + try: + line = inFileHandler.next() + except StopIteration: + break + if line == "" or not "seq" in line: + outFileHandler.write(line) + else : + elems = line.split(" ") + try: + subject_seqname = elems[2] + if self.replace_subject : + nameToBeReplaced = elems[2] + if dNew2Init.has_key( nameToBeReplaced ): + subject_seqname = dNew2Init[nameToBeReplaced] + subject_seqname = subject_seqname.strip('\n').strip('\r') + + query_seqname = elems[7] + if self.replace_query: + nameToBeReplaced = elems[7] + if dNew2Init.has_key( nameToBeReplaced ): + query_seqname = dNew2Init[nameToBeReplaced] + query_seqname = query_seqname.strip('\n').strip('\r') + + modedelems = elems[:] + modedelems[2] = subject_seqname + modedelems[7] = query_seqname + newLine = " ".join(modedelems) + outFileHandler.write("%s\n" % newLine) + except: pass + + inFileHandler.close() + outFileHandler.close() + + + def run( self ): + self.checkAttributes() + if self._step == 1: + if self._linkFile == "": + self._linkFile = "%s.newHlink" % ( self._inFile ) + if self._format == "fasta": + self.shortenSequenceHeadersForFastaFile() + if self._step == 2: + dNew2Init = self.getLinksNewToInitialHeaders() + if self._format == "fasta": + self.retrieveInitialSequenceHeadersForFastaFile( dNew2Init ) + elif self._format == "newick": + self.retrieveInitialSequenceHeadersForNewickFile( dNew2Init ) + elif self._format == "align": + self.retrieveInitialSequenceHeadersForAlignFile( dNew2Init ) + elif self._format == "path": + self.retrieveInitialSequenceHeadersForPathFile( dNew2Init ) + elif self._format == "axt": + self.retrieveInitialSequenceHeadersForAxtFile( dNew2Init) + elif self._format == "psl": + self.retrieveInitialSequenceHeadersForPslFile( dNew2Init) + elif self._format == "lastz": + self.retrieveInitialSequenceHeadersForLastZFile(dNew2Init) + elif self._format == "chain": + self.retrieveInitialSequenceHeadersForChainFile(dNew2Init) + elif self._format in [ "tab", "match" ]: + self.retrieveInitialSequenceHeadersForMatchFile( dNew2Init ) + + +if __name__ == "__main__": + i = ChangeSequenceHeaders() + i.setAttributesFromCmdLine() + i.run() \ No newline at end of file