Mercurial > repos > yufei-luo > s_mart
diff commons/core/seq/AlignedBioseqDB.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- a/commons/core/seq/AlignedBioseqDB.py Mon Apr 22 11:11:10 2013 -0400 +++ b/commons/core/seq/AlignedBioseqDB.py Mon Apr 29 03:20:15 2013 -0400 @@ -78,7 +78,51 @@ count += 1 return count - + def cleanMSA( self ): + #TODO: Refactoring + """clean the MSA""" + i2del = [] + + # for each sequence in the MSA + for seqi in xrange(0,self.getSize()): + if seqi in i2del: + continue + #define it as the reference + ref = self.db[seqi].sequence + refHeader = self.db[seqi].header + # for each following sequence + for seq_next in xrange(seqi+1,self.getSize()): + if seq_next in i2del: + continue + keep = 0 + # for each position along the MSA + for posx in xrange(0,self.getLength()): + seq = self.db[seq_next].sequence + if seq[posx] != '-' and ref[posx] != '-': + keep = 1 + break + seqHeader = self.db[seq_next].header + # if there is at least one gap between the ref seq and the other seq + # keep track of the shortest by recording it in "i2del" + if keep == 0: + + if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader): + if seqi not in i2del: + i2del.append( seqi ) + else: + if seq_next not in i2del: + i2del.append( seq_next ) + + # delete from the MSA each seq present in the list "i2del" + for i in reversed(sorted(set(i2del))): + del self.db[i] + + self.idx = {} + count = 0 + for i in self.db: + self.idx[i.header] = count + count += 1 + ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site # # @return: list of dico whose keys are symbols and values are their occurrences