Mercurial > repos > yufei-luo > s_mart
view commons/core/seq/Bioseq.py @ 40:cd852f3e04ab
Uploaded
author | m-zytnicki |
---|---|
date | Thu, 30 May 2013 03:03:34 -0400 |
parents | 44d5973c188c |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import sys import string import re import random import cStringIO from commons.core.coord.Map import Map from commons.core.checker.RepetException import RepetException DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] ) IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']) ## Record a sequence with its header # class Bioseq( object ): header = "" sequence = "" ## constructor # # @param name the header of sequence # @param seq sequence (DNA, RNA, protein) # def __init__( self, name="", seq="" ): self.header = name self.sequence = seq ## Equal operator # def __eq__( self, o ): if self.header==o.header and self.sequence==o.sequence: return True return False ## overload __repr__ # def __repr__( self ): return "%s;%s" % ( self.header, self.sequence ) ## set attribute header # # @param header a string # def setHeader( self, header ): self.header = header ## get attribute header # # @return header def getHeader(self): return self.header ## set attribute sequence # # @param sequence a string # def setSequence( self, sequence ): self.sequence = sequence def getSequence(self): return self.sequence ## reset # def reset( self ): self.setHeader( "" ) self.setSequence( "" ) ## Test if bioseq is empty # def isEmpty( self ): return self.header == "" and self.sequence == "" ## Reverse the sequence # def reverse( self ): tmp = self.sequence self.sequence = tmp[::-1] ## Turn the sequence into its complement # Force upper case letters # @warning: old name in pyRepet.Bioseq realComplement # def complement( self ): complement = "" self.upCase() for i in xrange(0,len(self.sequence),1): if self.sequence[i] == "A": complement += "T" elif self.sequence[i] == "T": complement += "A" elif self.sequence[i] == "C": complement += "G" elif self.sequence[i] == "G": complement += "C" elif self.sequence[i] == "M": complement += "K" elif self.sequence[i] == "R": complement += "Y" elif self.sequence[i] == "W": complement += "W" elif self.sequence[i] == "S": complement += "S" elif self.sequence[i] == "Y": complement += "R" elif self.sequence[i] == "K": complement += "M" elif self.sequence[i] == "V": complement += "B" elif self.sequence[i] == "H": complement += "D" elif self.sequence[i] == "D": complement += "H" elif self.sequence[i] == "B": complement += "V" elif self.sequence[i] == "N": complement += "N" elif self.sequence[i] == "-": complement += "-" else: print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] ) complement += "N" self.sequence = complement ## Reverse and complement the sequence # # Force upper case letters # @warning: old name in pyRepet.Bioseq : complement # def reverseComplement( self ): self.reverse() self.complement() ## Remove gap in the sequence # def cleanGap(self): self.sequence = self.sequence.replace("-","") ## Copy current Bioseq Instance # # @return: a Bioseq instance, a copy of current sequence. # def copyBioseqInstance(self): seq = Bioseq() seq.sequence = self.sequence seq.header = self.header return seq ## Add phase information after the name of sequence in header # # @param phase integer representing phase (1, 2, 3, -1, -2, -3) # def setFrameInfoOnHeader(self, phase): if " " in self.header: name, desc = self.header.split(" ", 1) name = name + "_" + str(phase) self.header = name + " " + desc else: self.header = self.header + "_" + str(phase) ## Fill Bioseq attributes with fasta file # # @param faFileHandler file handler of a fasta file # def read( self, faFileHandler ): line = faFileHandler.readline() if line == "": self.header = None self.sequence = None return while line == "\n": line = faFileHandler.readline() if line[0] == '>': self.header = string.rstrip(line[1:]) else: print "error, line is",string.rstrip(line) return line = " " seq = cStringIO.StringIO() while line: prev_pos = faFileHandler.tell() line = faFileHandler.readline() if line == "": break if line[0] == '>': faFileHandler.seek( prev_pos ) break seq.write( string.rstrip(line) ) self.sequence = seq.getvalue() ## Create a subsequence with a modified header # # @param s integer start a required subsequence # @param e integer end a required subsequence # # @return a Bioseq instance, a subsequence of current sequence # def subseq( self, s, e=0 ): if e == 0 : e=len( self.sequence ) if s > e : print "error: start must be < or = to end" return if s <= 0 : print "error: start must be > 0" return sub = Bioseq() sub.header = self.header + " fragment " + str(s) + ".." + str(e) sub.sequence = self.sequence[(s-1):e] return sub ## Get the nucleotide or aminoacid at the given position # # @param pos integer nucleotide or aminoacid position # # @return a string # def getNtFromPosition(self, pos): result = None if not (pos < 1 or pos > self.getLength()): result = self.sequence[pos - 1] return result ## Print in stdout the Bioseq in fasta format with 60 characters lines # # @param l length of required sequence default is whole sequence # def view(self,l=0): print '>'+self.header i=0 if(l==0): l=len(self.sequence) seq=self.sequence[0:l] while i<len(seq): print seq[i:i+60] i=i+60 ## Get length of sequence # # @param avoidN boolean don't count 'N' nucleotides # # @return length of current sequence # def getLength( self, countN = True ): if countN: return len(self.sequence) else: return len(self.sequence) - self.countNt('N') ## Return the proportion of a specific character # # @param nt character that we want to count # def propNt( self, nt ): return self.countNt( nt ) / float( self.getLength() ) ## Count occurrence of specific character # # @param nt character that we want to count # # @return nb of occurrences # def countNt( self, nt ): return self.sequence.count( nt ) ## Count occurrence of each nucleotide in current seq # # @return a dict, keys are nucleotides, values are nb of occurrences # def countAllNt( self ): dNt2Count = {} for nt in ["A","T","G","C","N"]: dNt2Count[ nt ] = self.countNt( nt ) return dNt2Count ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found # # @param size integer required length word # def occ_word( self, size ): occ = {} if size == 0: return occ,0 nbword = 0 srch = re.compile('[^ATGC]+') wordlist = self._createWordList( size ) for i in wordlist: occ[i] = 0 lenseq = len(self.sequence) i = 0 while i < lenseq-size+1: word = self.sequence[i:i+size].upper() m = srch.search(word) if m == None: occ[word] = occ[word]+1 nbword = nbword + 1 i = i + 1 else: i = i + m.end(0) return occ, nbword ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size # # @param size integer required length word # def freq_word( self, size ): dOcc, nbWords = self.occ_word( size ) freq = {} for word in dOcc.keys(): freq[word] = float(dOcc[word]) / nbWords return freq ## Find ORF in each phase # # @return: a dict, keys are phases, values are stop codon positions. # def findORF (self): orf = {0:[],1:[],2:[]} length = len (self.sequence) for i in xrange(0,length): triplet = self.sequence[i:i+3] if ( triplet == "TAA" or triplet == "TAG" or triplet == "TGA"): phase = i % 3 orf[phase].append(i) return orf ## Convert the sequence into upper case # def upCase( self ): self.sequence = self.sequence.upper() ## Convert the sequence into lower case # def lowCase( self ): self.sequence = self.sequence.lower() ## Extract the cluster of the fragment (output from Grouper) # # @return cluster id (string) # def getClusterID( self ): data = self.header.split() return data[0].split("Cl")[1] ## Extract the group of the sequence (output from Grouper) # # @return group id (string) # def getGroupID( self ): data = self.header.split() return data[0].split("Gr")[1].split("Cl")[0] ## Get the header of the full sequence (output from Grouper) # # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203' # @return header (string) # def getHeaderFullSeq( self ): data = self.header.split() return data[1] ## Get the strand of the fragment (output from Grouper) # # @return: strand (+ or -) # def getFragStrand( self ): data = self.header.split() coord = data[3].split("..") if int(coord[0]) < int(coord[-1]): return "+" else: return "-" ## Get A, T, G, C or N from an IUPAC letter # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] # # @return A, T, G, C or N # def getATGCNFromIUPAC( self, nt ): subset = ["A","T","G","C","N"] if nt in subset: return nt elif nt == "U": return "T" elif nt == "R": return random.choice( "AG" ) elif nt == "Y": return random.choice( "CT" ) elif nt == "M": return random.choice( "CA" ) elif nt == "K": return random.choice( "TG" ) elif nt == "W": return random.choice( "TA" ) elif nt == "S": return random.choice( "CG" ) elif nt == "B": return random.choice( "CTG" ) elif nt == "D": return random.choice( "ATG" ) elif nt == "H": return random.choice( "ATC" ) elif nt == "V": return random.choice( "ACG" ) else: return "N" ## Get nucleotide from an IUPAC letter and a nucleotide # Works only for IUPAC code with two possibilities ['R','Y','M','K','W','S'] # Examples: # Y and C returns T # Y and T returns C # B and C throws RepetException # # @return A, T, G, C # def getATGCNFromIUPACandATGCN(self, IUPACCode, nt): if IUPACCode == "R": possibleNt = set(["A", "G"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() elif IUPACCode == "Y": possibleNt = set(["C", "T"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() elif IUPACCode == "M": possibleNt = set(["A", "C"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() elif IUPACCode == "K": possibleNt = set(["T", "G"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() elif IUPACCode == "W": possibleNt = set(["A", "T"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() elif IUPACCode == "S": possibleNt = set(["C", "G"]) if nt not in possibleNt: raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) return (possibleNt - set(nt)).pop() else: raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt)) def getSeqWithOnlyATGCN( self ): newSeq = "" for nt in self.sequence: newSeq += self.getATGCNFromIUPAC( nt ) return newSeq ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents # def partialIUPAC( self ): self.sequence = self.getSeqWithOnlyATGCN() ## Remove non Unix end-of-line symbols, if any # def checkEOF( self ): symbol = "\r" # corresponds to '^M' from Windows if symbol in self.sequence: print "WARNING: Windows EOF removed in '%s'" % ( self.header ) sys.stdout.flush() newSeq = self.sequence.replace( symbol, "" ) self.sequence = newSeq ## Write Bioseq instance into a fasta file handler # # @param faFileHandler file handler of a fasta file # def write( self, faFileHandler ): faFileHandler.write( ">%s\n" % ( self.header ) ) self.writeSeqInFasta( faFileHandler ) ## Write only the sequence of Bioseq instance into a fasta file handler # # @param faFileHandler file handler of a fasta file # def writeSeqInFasta( self, faFileHandler ): i = 0 while i < self.getLength(): faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) ) i += 60 ## Append Bioseq instance to a fasta file # # @param faFile name of a fasta file as a string # @param mode 'write' or 'append' # def save( self, faFile, mode="a" ): faFileHandler = open( faFile, mode ) self.write( faFileHandler ) faFileHandler.close() ## Append Bioseq instance to a fasta file # # @param faFile name of a fasta file as a string # def appendBioseqInFile( self, faFile ): self.save( faFile, "a" ) ## Write Bioseq instance into a fasta file handler # # @param faFileHandler file handler on a file with writing right # def writeABioseqInAFastaFile( self, faFileHandler ): self.write( faFileHandler ) ## Write Bioseq instance with other header into a fasta file handler # # @param faFileHandler file handler on a file with writing right # @param otherHeader a string representing a new header (without the > and the \n) # def writeWithOtherHeader( self, faFileHandler, otherHeader ): self.header = otherHeader self.write( faFileHandler ) ## Append Bioseq header and Bioseq sequence in a fasta file # # @param faFileHandler file handler on a file with writing right # @param otherHeader a string representing a new header (without the > and the \n) # def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ): self.writeWithOtherHeader( faFileHandler, otherHeader ) ## get the list of Maps corresponding to seq without gap # # @warning This method was called getMap() in pyRepet.Bioseq # @return a list of Map object # def getLMapWhithoutGap( self ): lMaps = [] countSite = 1 countSubseq = 1 inGap = False startMap = -1 endMap = -1 # initialize with the first site if self.sequence[0] == "-": inGap = True else: startMap = countSite # for each remaining site for site in self.sequence[1:]: countSite += 1 # if it is a gap if site == "-": # if this is the beginning of a gap, record the previous subsequence if inGap == False: inGap = True endMap = countSite - 1 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) countSubseq += 1 # if it is NOT a gap if site != "-": # if it is the end of a gap, begin the next subsequence if inGap == True: inGap = False startMap = countSite # if it is the last site if countSite == self.getLength(): endMap = countSite lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) return lMaps ## get the percentage of GC # # @return a percentage # def getGCpercentage( self ): tmpSeq = self.getSeqWithOnlyATGCN() nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) return 100 * nbGC / float( self.getLength() ) ## get the percentage of GC of a sequence without counting N in sequence length # # @return a percentage # def getGCpercentageInSequenceWithoutCountNInLength(self): tmpSeq = self.getSeqWithOnlyATGCN() nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) return 100 * nbGC / float( self.getLength() - self.countNt("N") ) ## get the 5 prime subsequence of a given length at the given position # # @param position integer # @param flankLength integer subsequence length # @return a sequence string # def get5PrimeFlank(self, position, flankLength): if(position == 1): return "" else: startOfFlank = 1 endOfFlank = position -1 if((position - flankLength) > 0): startOfFlank = position - flankLength else: startOfFlank = 1 return self.subseq(startOfFlank, endOfFlank).sequence ## get the 3 prime subsequence of a given length at the given position # In the case of indels, the polymorphism length can be specified # # @param position integer # @param flankLength integer subsequence length # @param polymLength integer polymorphism length # @return a sequence string # def get3PrimeFlank(self, position, flankLength, polymLength = 1): if((position + polymLength) > len( self.sequence )): return "" else: startOfFlank = position + polymLength if((position+polymLength+flankLength) > len( self.sequence )): endOfFlank = len( self.sequence ) else: endOfFlank = position+polymLength+flankLength-1 return self.subseq(startOfFlank, endOfFlank).sequence def _createWordList(self,size,l=['A','T','G','C']): if size == 1 : return l else: l2 = [] for i in l: for j in ['A','T','G','C']: l2.append( i + j ) return self._createWordList(size-1,l2) def removeSymbol( self, symbol ): tmp = self.sequence.replace( symbol, "" ) self.sequence = tmp