diff commons/core/seq/AlignedBioseqDB.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- a/commons/core/seq/AlignedBioseqDB.py	Mon Apr 22 11:11:10 2013 -0400
+++ b/commons/core/seq/AlignedBioseqDB.py	Mon Apr 29 03:20:15 2013 -0400
@@ -78,7 +78,51 @@
                 count += 1
         return count
     
-    
+    def cleanMSA( self ):
+        #TODO: Refactoring
+        """clean the MSA"""
+        i2del = []
+
+        # for each sequence in the MSA
+        for seqi in xrange(0,self.getSize()):
+            if seqi in i2del:
+                continue
+            #define it as the reference
+            ref = self.db[seqi].sequence
+            refHeader = self.db[seqi].header
+            # for each following sequence
+            for seq_next in xrange(seqi+1,self.getSize()):
+                if seq_next in i2del:
+                    continue
+                keep = 0
+                # for each position along the MSA
+                for posx in xrange(0,self.getLength()):
+                    seq = self.db[seq_next].sequence
+                    if seq[posx] != '-' and ref[posx] != '-':
+                        keep = 1
+                        break
+                seqHeader = self.db[seq_next].header
+                # if there is at least one gap between the ref seq and the other seq
+                # keep track of the shortest by recording it in "i2del"
+                if keep == 0:
+                    
+                    if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader):
+                        if seqi not in i2del:
+                            i2del.append( seqi )
+                    else:
+                        if seq_next not in i2del:
+                            i2del.append( seq_next )
+
+        # delete from the MSA each seq present in the list "i2del"
+        for i in reversed(sorted(set(i2del))):
+            del self.db[i]
+
+        self.idx = {}
+        count = 0
+        for i in self.db:
+            self.idx[i.header] = count
+            count += 1
+
     ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
     #
     # @return: list of dico whose keys are symbols and values are their occurrences