Mercurial > repos > yufei-luo > s_mart
diff commons/core/seq/FastaUtils.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- a/commons/core/seq/FastaUtils.py Mon Apr 22 11:11:10 2013 -0400 +++ b/commons/core/seq/FastaUtils.py Mon Apr 29 03:20:15 2013 -0400 @@ -36,6 +36,7 @@ import shutil import re import glob +from operator import itemgetter from commons.core.seq.BioseqDB import BioseqDB from commons.core.seq.Bioseq import Bioseq from commons.core.coord.MapUtils import MapUtils @@ -1091,7 +1092,7 @@ iSequence.setName(newHeader) f.write(iSequence.printFasta()) - @staticmethod + @staticmethod def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): dHeader2ClusterClusterMember = {} clusterId = 0 @@ -1112,7 +1113,7 @@ print "%i clusters" % clusterId return dHeader2ClusterClusterMember, clusterId - @staticmethod + @staticmethod def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): """ Write a map file from fasta output of clustering tool. @@ -1140,4 +1141,57 @@ fileDb.close() fileMap.close() print "saved in %s" % outMapFileName - \ No newline at end of file + + @staticmethod + def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): + outFormat = outFormat.lower() + if outFormat in ["gff", "gff3"]: + outFormat = "gff3" + else: + outFormat = "map" + + lTNstretches = [] + if nbN != 0: + iBSDB = BioseqDB(fastaFileName) + for iBS in iBSDB.db: + nbNFound = 0 + start = 1 + pos = 1 + lastPos = 0 + + while pos <= iBS.getLength(): + if nbNFound == 0: + start = pos + + while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: + nbNFound += 1 + pos += 1 + lastPos = pos + + if pos - lastPos >= nbN: + if nbNFound >= nbN: + lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) + nbNFound = 0 + pos += 1 + + if nbNFound >= nbN: + lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) + + lTNstretches.sort(key = itemgetter(0, 1, 2)) + + if outFileName == "": + outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) + + with open(outFileName, "w") as fH: + if outFormat == "gff3": + fH.write("##gff-version 3\n") + for item in lTNstretches: + seq = item[0] + start = item[1] + end = item[2] + if outFormat == "gff3": + fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) + else: + fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end)) + + \ No newline at end of file