s_mart: commons/core/seq/AlignedBioseqDB.py comparison

comparison commons/core/seq/AlignedBioseqDB.py @ 36:44d5973c188c

Uploaded

author	m-zytnicki
date	Tue, 30 Apr 2013 15:02:29 -0400
parents
children

comparison

equal deleted inserted replaced

-:d94018ca4ada
+:44d5973c188c
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+import sys
+from commons.core.seq.BioseqDB import BioseqDB
+from commons.core.seq.Bioseq import Bioseq
+from commons.core.coord.Align import Align
+from commons.core.coord.Range import Range
+from commons.core.stat.Stat import Stat
+from math import log
+## Multiple Sequence Alignment Representation
+#
+#
+class AlignedBioseqDB( BioseqDB ):
+def __init__( self, name="" ):
+BioseqDB.__init__( self, name )
+seqLength = self.getLength()
+if self.getSize() > 1:
+for bs in self.db[1:]:
+if bs.getLength() != seqLength:
+print "ERROR: aligned sequences have different length"
+## Get length of the alignment
+#
+# @return length
+# @warning name before migration was 'length'
+#
+def getLength( self ):
+length = 0
+if self.db != []:
+length = self.db[0].getLength()
+return length
+## Get the true length of a given sequence (without gaps)
+#
+# @param header string header of the sequence to analyze
+# @return length integer
+# @warning  name before migration was 'true_length'
+#
+def getSeqLengthWithoutGaps( self, header ):
+bs = self.fetch( header )
+count = 0
+for pos in xrange(0,len(bs.sequence)):
+if bs.sequence[pos] != "-":
+count += 1
+return count
+def cleanMSA( self ):
+#TODO: Refactoring
+"""clean the MSA"""
+i2del = []
+# for each sequence in the MSA
+for seqi in xrange(0,self.getSize()):
+if seqi in i2del:
+continue
+#define it as the reference
+ref = self.db[seqi].sequence
+refHeader = self.db[seqi].header
+# for each following sequence
+for seq_next in xrange(seqi+1,self.getSize()):
+if seq_next in i2del:
+continue
+keep = 0
+# for each position along the MSA
+for posx in xrange(0,self.getLength()):
+seq = self.db[seq_next].sequence
+if seq[posx] != '-' and ref[posx] != '-':
+keep = 1
+break
+seqHeader = self.db[seq_next].header
+# if there is at least one gap between the ref seq and the other seq
+# keep track of the shortest by recording it in "i2del"
+if keep == 0:
+if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader):
+if seqi not in i2del:
+i2del.append( seqi )
+else:
+if seq_next not in i2del:
+i2del.append( seq_next )
+# delete from the MSA each seq present in the list "i2del"
+for i in reversed(sorted(set(i2del))):
+del self.db[i]
+self.idx = {}
+count = 0
+for i in self.db:
+self.idx[i.header] = count
+count += 1
+## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
+#
+# @return: list of dico whose keys are symbols and values are their occurrences
+#
+def getListOccPerSite( self ):
+lOccPerSite = []   # list of dictionaries, one per position on the sequence
+n = 0    # nb of sequences parsed from the input file
+firstSeq = True
+# for each sequence in the bank
+for bs in self.db:
+if bs.sequence == None:
+break
+n += 1
+# if it is the first to be parsed, create a dico at each site
+if firstSeq:
+for i in xrange(0,len(bs.sequence)):
+lOccPerSite.append( {} )
+firstSeq = False
+# for each site, add its nucleotide
+for i in xrange(0,len(bs.sequence)):
+nuc = bs.sequence[i].upper()
+if lOccPerSite[i].has_key( nuc ):
+lOccPerSite[i][nuc] += 1
+else:
+lOccPerSite[i][nuc] = 1
+return lOccPerSite
+#TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
+## Make a consensus from the MSA
+#
+# @param minNbNt: minimum nb of nucleotides to edit a consensus
+# @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
+# @param verbose: level of information sent to stdout (default=0/1)
+# @return: consensus
+#
+def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):
+maxPropN = 0.40  # discard consensus if more than 40% of N's
+nbInSeq = self.getSize()
+if verbose > 0:
+print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
+if nbInSeq < 2:
+print "ERROR: can't make a consensus with less than 2 sequences"
+sys.exit(1)
+if minNbNt >= nbInSeq:
+minNbNt = nbInSeq - 1
+print "minNbNt=%i" % ( minNbNt )
+if minPropNt >= 1.0:
+print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
+sys.exit(1)
+lOccPerSite = self.getListOccPerSite()
+nbSites = len(lOccPerSite)
+if verbose > 0:
+print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()
+seqConsensus = ""
+# for each site (i.e. each column of the MSA)
+nbRmvColumns = 0
+countSites = 0
+for dNt2Occ in lOccPerSite:
+countSites += 1
+if verbose > 1:
+print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
+nbSites )
+sys.stdout.flush()
+occMaxNt = 0   # occurrences of the predominant nucleotide at this site
+lBestNt = []
+nbNt = 0   # total nb of A, T, G and C (no gap)
+# for each distinct symbol at this site (A, T, G, C, N, -,...)
+for j in dNt2Occ.keys():
+if j != "-":
+nbNt += dNt2Occ[j]
+if verbose > 1:
+print "%s: %i" % ( j, dNt2Occ[j] )
+if dNt2Occ[j] > occMaxNt:
+occMaxNt = dNt2Occ[j]
+lBestNt = [ j ]
+elif dNt2Occ[j] == occMaxNt:
+lBestNt.append( j )
+if nbNt == 0:   # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
+nbRmvColumns += 1
+if len( lBestNt ) >= 1:
+bestNt = lBestNt[0]
+# if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
+if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
+bestNt = "N"
+if int(nbNt) >= int(minNbNt):
+seqConsensus += bestNt
+if verbose > 1:
+print "-> %s" % ( bestNt )
+if nbRmvColumns:
+if nbRmvColumns == 1:
+print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
+else:
+print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
+sys.stdout.flush()
+if seqConsensus == "":
+print "WARNING: no consensus can be built (no sequence left)"
+return
+propN = seqConsensus.count("N") / float(len(seqConsensus))
+if propN >= maxPropN:
+print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
+return
+elif propN >= maxPropN * 0.5:
+print "WARNING: %i%% of N's" % ( propN * 100 )
+consensus = Bioseq()
+consensus.sequence = seqConsensus
+if isHeaderSAtannot:
+header = self.db[0].header
+pyramid = header.split("Gr")[1].split("Cl")[0]
+pile = header.split("Cl")[1].split(" ")[0]
+consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
+else:
+consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )
+if verbose > 0:
+statEntropy = self.getEntropy( verbose - 1 )
+print "entropy: %s" % ( statEntropy.stringQuantiles() )
+sys.stdout.flush()
+return consensus
+## Get the entropy of the whole multiple alignment (only for A, T, G and C)
+#
+# @param verbose level of verbosity
+#
+# @return statistics about the entropy of the MSA
+#
+def getEntropy( self, verbose=0 ):
+stats = Stat()
+# get the occurrences of symbols at each site
+lOccPerSite = self.getListOccPerSite()
+countSite = 0
+# for each site
+for dSymbol2Occ in lOccPerSite:
+countSite += 1
+# count the number of nucleotides (A, T, G and C, doesn't count gap '-')
+nbNt = 0
+dATGC2Occ = {}
+for base in ["A","T","G","C"]:
+dATGC2Occ[ base ] = 0.0
+for nt in dSymbol2Occ.keys():
+if nt != "-":
+nbNt += dSymbol2Occ[ nt ]
+checkedNt = self.getATGCNFromIUPAC( nt )
+if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
+dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
+else:   # for 'N'
+if dSymbol2Occ.has_key( checkedNt ):
+dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+if verbose > 2:
+for base in dATGC2Occ.keys():
+print "%s: %i" % ( base, dATGC2Occ[ base ] )
+# compute the entropy for the site
+entropySite = 0.0
+for nt in dATGC2Occ.keys():
+entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
+if verbose > 1:
+print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
+stats.add( entropySite )
+return stats
+## Get A, T, G, C or N from an IUPAC letter
+#  IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
+#
+# @return A, T, G, C or N
+#
+def getATGCNFromIUPAC( self, nt ):
+iBs = Bioseq()
+return iBs.getATGCNFromIUPAC( nt )
+## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
+#
+def computeEntropy( self, nbOcc, nbNt ):
+if nbOcc == 0.0:
+return 0.0
+else:
+freq = nbOcc / float(nbNt)
+return - freq * log(freq) / log(2)
+## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
+#
+def saveAsBinaryMatrix( self, outFile ):
+outFileHandler = open( outFile, "w" )
+for bs in self.db:
+string = "%s" % ( bs.header )
+for nt in bs.sequence:
+if nt != "-":
+string += "\t%i" % ( 1 )
+else:
+string += "\t%i" % ( 0 )
+outFileHandler.write( "%s\n" % ( string ) )
+outFileHandler.close()
+## Return a list of Align instances corresponding to the aligned regions (without gaps)
+#
+# @param query string header of the sequence considered as query
+# @param subject string header of the sequence considered as subject
+#
+def getAlignList( self, query, subject ):
+lAligns = []
+alignQ = self.fetch( query ).sequence
+alignS = self.fetch( subject ).sequence
+createNewAlign = True
+indexAlign = 0
+indexQ = 0
+indexS = 0
+while indexAlign < len(alignQ):
+if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
+indexQ += 1
+indexS += 1
+if createNewAlign:
+iAlign = Align( Range( query, indexQ, indexQ ),
+Range( subject, indexS, indexS ),
+0,
+int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
+int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
+lAligns.append( iAlign )
+createNewAlign = False
+else:
+lAligns[-1].range_query.end += 1
+lAligns[-1].range_subject.end += 1
+lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
+lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
+else:
+if not createNewAlign:
+lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
+createNewAlign = True
+if alignQ[ indexAlign ] != "-":
+indexQ += 1
+elif alignS[ indexAlign ] != "-":
+indexS += 1
+indexAlign += 1
+if not createNewAlign:
+lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
+return lAligns
+def removeGaps(self):
+for iBs in self.db:
+iBs.removeSymbol( "-" )
+## Compute mean per cent identity for MSA.
+# First sequence in MSA is considered as reference sequence.
+#
+#
+def computeMeanPcentIdentity(self):
+seqRef = self.db[0]
+sumPcentIdentity = 0
+for seq in self.db[1:]:
+pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
+sumPcentIdentity = sumPcentIdentity + pcentIdentity
+nbSeq = len(self.db[1:])
+meanPcentIdentity = round (sumPcentIdentity/nbSeq)
+return meanPcentIdentity
+def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
+indexOnSeqRef = 0
+sumIdentity = 0
+for nuclSeq in seq.sequence:
+nuclRef = seqRef.sequence[indexOnSeqRef]
+if nuclRef != "-" and nuclRef == nuclSeq:
+sumIdentity = sumIdentity + 1
+indexOnSeqRef = indexOnSeqRef + 1
+return float(sumIdentity) / float(seqRef.getLength()) * 100

Mercurial > repos > yufei-luo > s_mart

comparison commons/core/seq/AlignedBioseqDB.py @ 36:44d5973c188c