diff commons/core/seq/FastaUtils.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- a/commons/core/seq/FastaUtils.py	Mon Apr 22 11:11:10 2013 -0400
+++ b/commons/core/seq/FastaUtils.py	Mon Apr 29 03:20:15 2013 -0400
@@ -36,6 +36,7 @@
 import shutil
 import re
 import glob
+from operator import itemgetter
 from commons.core.seq.BioseqDB import BioseqDB
 from commons.core.seq.Bioseq import Bioseq
 from commons.core.coord.MapUtils import MapUtils
@@ -1091,7 +1092,7 @@
                 iSequence.setName(newHeader)
                 f.write(iSequence.printFasta())
               
-    @staticmethod      
+    @staticmethod
     def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
         dHeader2ClusterClusterMember = {}
         clusterId = 0
@@ -1112,7 +1113,7 @@
                 print "%i clusters" % clusterId
         return dHeader2ClusterClusterMember, clusterId
     
-    @staticmethod     
+    @staticmethod
     def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
         """
         Write a map file from fasta output of clustering tool.
@@ -1140,4 +1141,57 @@
         fileDb.close()
         fileMap.close()
         print "saved in %s" % outMapFileName
- 
\ No newline at end of file
+
+    @staticmethod
+    def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
+        outFormat = outFormat.lower()
+        if outFormat in ["gff", "gff3"]:
+            outFormat = "gff3"
+        else:
+            outFormat = "map"
+            
+        lTNstretches = []
+        if nbN != 0:
+            iBSDB = BioseqDB(fastaFileName)
+            for iBS in iBSDB.db:
+                nbNFound = 0
+                start = 1
+                pos = 1
+                lastPos = 0
+                
+                while pos <= iBS.getLength():
+                    if nbNFound == 0:
+                        start = pos
+                        
+                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
+                        nbNFound += 1
+                        pos += 1
+                        lastPos = pos
+                    
+                    if pos - lastPos >= nbN:
+                        if nbNFound >= nbN:
+                            lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
+                        nbNFound = 0
+                    pos += 1
+                
+                if nbNFound >= nbN:
+                    lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
+    
+            lTNstretches.sort(key = itemgetter(0, 1, 2))
+        
+        if outFileName == "":
+            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
+        
+        with open(outFileName, "w") as fH:
+            if outFormat == "gff3":
+                fH.write("##gff-version 3\n")
+            for item in lTNstretches:
+                seq = item[0]
+                start = item[1]
+                end = item[2]
+                if outFormat == "gff3":
+                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
+                else:
+                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
+                
+                
\ No newline at end of file