annotate commons/core/seq/FastaUtils.py @ 58:5f5c9b74c2dd

Uploaded
author m-zytnicki
date Fri, 07 Feb 2014 11:53:36 -0500
parents 44d5973c188c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
36
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
2 # http://www.inra.fr
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
4 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
9 # "http://www.cecill.info".
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
10 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
15 # liability.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
16 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
26 # same conditions as regards security.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
27 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
30
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
31
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
32 import os
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
33 import sys
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
34 import string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
35 import math
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
36 import shutil
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
37 import re
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
38 import glob
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
39 from operator import itemgetter
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
40 from commons.core.seq.BioseqDB import BioseqDB
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
41 from commons.core.seq.Bioseq import Bioseq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
42 from commons.core.coord.MapUtils import MapUtils
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
43 from commons.core.coord.Range import Range
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
44 from commons.core.checker.CheckerUtils import CheckerUtils
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
45 from commons.core.launcher.LauncherUtils import LauncherUtils
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
46 from commons.core.coord.ConvCoord import ConvCoord
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
47 from commons.core.parsing.FastaParser import FastaParser
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
48
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
49
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
50 ## Static methods for fasta file manipulation
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
51 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
52 class FastaUtils( object ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
53
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
54 ## Count the number of sequences in the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
55 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
56 # @param inFile name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
57 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
58 # @return integer number of sequences in the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
59 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
60 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
61 def dbSize( inFile ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
62 nbSeq = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
63 inFileHandler = open( inFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
64 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
65 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
66 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
67 nbSeq = nbSeq + 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
68 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
69 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
70
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
71 return nbSeq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
72
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
73
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
74 ## Compute the cumulative sequence length in the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
75 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
76 # @param inFile handler of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
77 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
78 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
79 def dbCumLength( inFile ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
80 cumLength = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
81 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
82 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
83 if line[0] != ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
84 cumLength += len(string.rstrip(line))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
85 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
86
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
87 return cumLength
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
88
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
89
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
90 ## Return a list with the length of each sequence in the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
91 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
92 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
93 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
94 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
95 def dbLengths( inFile ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
96 lLengths = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
97 inFileHandler = open( inFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
98 currentLength = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
99 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
100 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
101 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
102 if currentLength != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
103 lLengths.append( currentLength )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
104 currentLength = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
105 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
106 currentLength += len(line[:-1])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
107 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
108 lLengths.append( currentLength )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
109 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
110 return lLengths
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
111
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
112
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
113 ## Retrieve the sequence headers present in the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
114 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
115 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
116 # @param verbose integer level of verbosity
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
117 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
118 # @return list of sequence headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
119 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
120 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
121 def dbHeaders( inFile, verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
122 lHeaders = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
123
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
124 inFileHandler = open( inFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
125 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
126 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
127 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
128 lHeaders.append( string.rstrip(line[1:]) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
129 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
130 print string.rstrip(line[1:])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
131 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
132 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
133
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
134 return lHeaders
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
135
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
136
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
137 ## Cut a data bank into chunks according to the input parameters
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
138 # If a sequence is shorter than the threshold, it is only renamed (not cut)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
139 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
140 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
141 # @param chkLgth string chunk length (in bp, default=200000)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
142 # @param chkOver string chunk overlap (in bp, default=10000)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
143 # @param wordN string N stretch word length (default=11, 0 for no detection)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
144 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
145 # @param clean boolean remove 'cut' and 'Nstretch' files
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
146 # @param verbose integer (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
147 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
148 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
149 def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
150 nbSeq = FastaUtils.dbSize( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
151 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
152 print "cut the %i input sequences with cutterDB..." % ( nbSeq )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
153 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
154
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
155 prg = "cutterDB"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
156 cmd = prg
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
157 cmd += " -l %s" % ( chkLgth )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
158 cmd += " -o %s" %( chkOver )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
159 cmd += " -w %s" % ( wordN )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
160 cmd += " %s" % ( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
161 returnStatus = os.system( cmd )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
162 if returnStatus != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
163 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
164 sys.stderr.write( "%s\n" % ( msg ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
165 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
166
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
167 nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
168 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
169 print "done (%i chunks)" % ( nbChunks )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
170 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
171
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
172 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
173 print "rename the headers..."
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
174 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
175
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
176 if outFilePrefix == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
177 outFastaName = inFileName + "_chunks.fa"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
178 outMapName = inFileName + "_chunks.map"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
179 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
180 outFastaName = outFilePrefix + ".fa"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
181 outMapName = outFilePrefix + ".map"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
182
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
183 inFile = open( "%s_cut" % ( inFileName ), "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
184 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
185
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
186 outFasta = open( outFastaName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
187 outMap = open( outMapName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
188
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
189 # read line after line (no need for big RAM) and change the sequence headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
190 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
191
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
192 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
193 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
194 print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
195 data = line[:-1].split(" ")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
196 seqID = data[0].split(">")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
197 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
198 oldHeader = data[2]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
199 seqStart = data[4].split("..")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
200 seqEnd = data[4].split("..")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
201 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
202 outFasta.write( ">%s\n" % ( newHeader ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
203
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
204 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
205 outFasta.write( line.upper() )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
206
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
207 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
208
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
209 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
210 outFasta.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
211 outMap.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
212
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
213 if clean == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
214 os.remove(inFileName + "_cut")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
215 os.remove(inFileName + ".Nstretch.map")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
216
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
217
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
218 ## Split the input fasta file in several output files
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
219 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
220 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
221 # @param nbSeqPerBatch integer number of sequences per output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
222 # @param newDir boolean put the sequences in a new directory called 'batches'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
223 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
224 # @param prefix prefix in output file name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
225 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
226 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
227 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
228 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
229 if not os.path.exists( inFile ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
230 msg = "ERROR: file '%s' doesn't exist" % ( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
231 sys.stderr.write( "%s\n" % ( msg ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
232 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
233
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
234 nbSeq = FastaUtils.dbSize( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
235
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
236 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
237 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
238 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
239 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
240
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
241 if nbSeqPerBatch > 1 and useSeqHeader:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
242 useSeqHeader = False
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
243
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
244 if newDir == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
245 if os.path.exists( "batches" ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
246 shutil.rmtree( "batches" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
247 os.mkdir( "batches" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
248 os.chdir( "batches" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
249 os.system( "ln -s ../%s ." % ( inFile ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
250
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
251 inFileHandler = open( inFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
252 inFileHandler.seek( 0, 0 )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
253 countBatch = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
254 countSeq = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
255 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
256 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
257 if line == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
258 break
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
259 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
260 countSeq += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
261 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
262 if "outFile" in locals():
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
263 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
264 countBatch += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
265 if nbSeqPerBatch == 1 and useSeqHeader:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
266 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
267 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
268 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
269 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
270 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
271 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
272 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
273 outFile.write( line )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
274 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
275 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
276
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
277 if newDir == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
278 os.remove( os.path.basename( inFile ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
279 os.chdir( ".." )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
280
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
281
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
282 ## Split the input fasta file in several output files
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
283 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
284 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
285 # @param maxSize integer max cumulative length for each output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
286 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
287 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
288 def splitFastaFileInBatches(inFileName, maxSize = 200000):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
289 iBioseqDB = BioseqDB(inFileName)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
290 lHeadersSizeTuples = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
291 for iBioseq in iBioseqDB.db:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
292 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
293
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
294 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
295 os.mkdir("batches")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
296 os.chdir("batches")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
297
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
298 iterator = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
299 for lHeader in lHeadersList :
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
300 iterator += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
301 with open("batch_%s.fa" % iterator, 'w') as f :
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
302 for header in lHeader :
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
303 iBioseq = iBioseqDB.fetch(header)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
304 iBioseq.write(f)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
305 os.chdir("..")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
306
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
307
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
308 ## Split the input fasta file in several output files according to their cluster identifier
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
309 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
310 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
311 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
312 # @param simplifyHeader boolean simplify the headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
313 # @param createDir boolean put the sequences in different directories
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
314 # @param outPrefix string prefix of the output files (default='seqCluster')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
315 # @param verbose integer (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
316 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
317 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
318 def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
319 if not os.path.exists( inFileName ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
320 print "ERROR: %s doesn't exist" % ( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
321 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
322
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
323 inFile = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
324
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
325 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
326 if line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
327 name = line.split(" ")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
328 if "Cluster" in name:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
329 clusterID = name.split("Cluster")[1].split("Mb")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
330 seqID = name.split("Mb")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
331 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
332 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
333 if "Q" in name.split("Gr")[0]:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
334 seqID = name.split("Gr")[0].split("MbQ")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
335 elif "S" in name:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
336 seqID = name.split("Gr")[0].split("MbS")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
337 sClusterIDs = set( [ clusterID ] )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
338 if simplifyHeader == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
339 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
340 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
341 header = line[1:-1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
342 if createDir == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
343 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
344 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
345 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
346 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
347 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
348 outFile.write( ">%s\n" % ( header ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
349 prevClusterID = clusterID
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
350
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
351 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
352 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
353 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
354 name = line.split(" ")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
355 if "Cluster" in name:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
356 clusterID = name.split("Cluster")[1].split("Mb")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
357 seqID = name.split("Mb")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
358 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
359 clusterID = name.split("Cl")[0].split("Gr")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
360 if "Q" in name.split("Gr")[0]:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
361 seqID = name.split("Gr")[0].split("MbQ")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
362 elif "S" in name:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
363 seqID = name.split("Gr")[0].split("MbS")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
364
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
365 if clusterID != prevClusterID:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
366 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
367
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
368 if simplifyHeader == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
369 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
370 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
371 header = line[1:-1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
372
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
373 if createDir == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
374 os.chdir( ".." )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
375 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
376 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
377 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
378
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
379 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
380 if not os.path.exists( outFileName ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
381 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
382 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
383 if clusterID != prevClusterID:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
384 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
385 outFile = open( outFileName, "a" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
386 outFile.write( ">%s\n" % ( header ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
387 prevClusterID = clusterID
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
388 sClusterIDs.add( clusterID )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
389
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
390 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
391 outFile.write( line )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
392
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
393 line = inFile.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
394
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
395 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
396 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
397 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
398
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
399 if createDir == True:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
400 os.chdir("..")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
401 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
402 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
403
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
404
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
405 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
406 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
407 # @param len_min integer length sequence criterion to filter
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
408 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
409 # @param verbose integer (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
410 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
411 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
412 def dbLengthFilter( len_min, inFileName, verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
413 file_db = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
414 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
415 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
416 seq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
417 numseq = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
418 nbsave = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
419
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
420 seq.read( file_db )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
421 while seq.sequence:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
422 l = seq.getLength()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
423 numseq = numseq + 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
424 if l >= len_min:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
425 seq.write( file_dbSup )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
426 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
427 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
428 nbsave=nbsave+1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
429 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
430 seq.write( file_dbInf )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
431 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
432 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
433 nbsave=nbsave+1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
434 seq.read( file_db )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
435
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
436 file_db.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
437 file_dbInf.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
438 file_dbSup.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
439 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
440 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
441
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
442
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
443 ## Extract the longest sequences from a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
444 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
445 # @param num integer maximum number of sequences in the output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
446 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
447 # @param outFileName string name of the output fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
448 # @param minThresh integer minimum length threshold (default=0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
449 # @param verbose integer (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
450 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
451 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
452 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
453 bsDB = BioseqDB( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
454 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
455 print "nb of input sequences: %i" % ( bsDB.getSize() )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
456
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
457 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
458 outFileName = inFileName + ".best" + str(num)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
459 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
460
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
461 if bsDB.getSize()==0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
462 return 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
463
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
464 num = int(num)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
465 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
466 print "keep the %i longest sequences" % ( num )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
467 if minThresh > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
468 print "with length > %i bp" % ( minThresh )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
469 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
470
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
471 # retrieve the length of each input sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
472 tmpLSeqLgth = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
473 seqNum = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
474 for bs in bsDB.db:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
475 seqNum += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
476 tmpLSeqLgth.append( bs.getLength() )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
477 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
478 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
479 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
480
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
481 # sort the lengths
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
482 tmpLSeqLgth.sort()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
483 tmpLSeqLgth.reverse()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
484
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
485 # select the longest
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
486 lSeqLgth = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
487 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
488 if tmpLSeqLgth[i] >= minThresh:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
489 lSeqLgth.append( tmpLSeqLgth[i] )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
490 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
491 print "selected max length: %i" % ( max(lSeqLgth) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
492 print "selected min length: %i" % ( min(lSeqLgth) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
493 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
494
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
495 # save the longest
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
496 inFile = open( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
497 seqNum = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
498 nbSave = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
499 for bs in bsDB.db:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
500 seqNum += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
501 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
502 bs.write( outFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
503 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
504 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
505 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
506 nbSave += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
507 if nbSave == num:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
508 break
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
509 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
510 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
511 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
512 print nbSave, "saved sequences in ", outFileName
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
513 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
514
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
515 return 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
516
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
517
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
518 ## Extract all the sequence headers from a fasta file and write them in a new fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
519 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
520 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
521 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
522 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
523 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
524 def dbExtractSeqHeaders( inFileName, outFileName="" ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
525 lHeaders = FastaUtils.dbHeaders( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
526
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
527 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
528 outFileName = inFileName + ".headers"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
529
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
530 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
531 for i in lHeaders:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
532 outFile.write( i + "\n" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
533 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
534
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
535 return 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
536
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
537
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
538 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
539 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
540 # @param pattern regular expression to search in headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
541 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
542 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
543 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
544 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
545 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
546 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
547 if pattern == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
548 return
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
549
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
550 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
551 outFileName = inFileName + '.extracted'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
552 outFile = open( outFileName, 'w' )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
553
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
554 patternTosearch = re.compile( pattern )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
555 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
556 bioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
557 savedBioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
558 inFile = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
559 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
560 while bioseq.sequence:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
561 bioseqNb = bioseqNb + 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
562 m = patternTosearch.search( bioseq.header )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
563 if m:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
564 bioseq.write( outFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
565 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
566 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
567 savedBioseqNb = savedBioseqNb + 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
568 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
569 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
570
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
571 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
572
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
573 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
574 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
575
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
576
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
577 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
578 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
579 # @param patternFileName string file containing regular expression to search in headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
580 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
581 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
582 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
583 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
584 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
585 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
586
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
587 if patternFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
588 print "ERROR: no file of pattern"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
589 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
590
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
591 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
592 bioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
593 savedBioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
594 lHeaders = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
595
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
596 inFile = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
597 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
598 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
599 lHeaders.append( bioseq.header )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
600 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
601 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
602
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
603 lHeadersToKeep = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
604 patternFile = open( patternFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
605 for pattern in patternFile:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
606 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
607 print "pattern: ",pattern[:-1]; sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
608
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
609 patternToSearch = re.compile(pattern[:-1])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
610 for h in lHeaders:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
611 if patternToSearch.search(h):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
612 lHeadersToKeep.append(h)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
613 patternFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
614
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
615 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
616 outFileName = inFileName + ".extracted"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
617 outFile=open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
618
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
619 inFile = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
620 bioseq.read(inFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
621 while bioseq.sequence:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
622 bioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
623 if bioseq.header in lHeadersToKeep:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
624 bioseq.write(outFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
625 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
626 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
627 savedBioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
628 bioseq.read(inFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
629 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
630
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
631 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
632
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
633 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
634 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
635
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
636
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
637 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
638 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
639 # @param pattern regular expression to search in headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
640 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
641 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
642 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
643 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
644 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
645 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
646 if pattern == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
647 return
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
648
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
649 patternToSearch = re.compile(pattern)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
650
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
651 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
652 outFileName = inFileName + '.cleaned'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
653 outFile = open(outFileName,'w')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
654
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
655 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
656 bioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
657 savedBioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
658 inFile = open(inFileName)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
659 bioseq.read(inFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
660 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
661 bioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
662 if not patternToSearch.search(bioseq.header):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
663 bioseq.write(outFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
664 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
665 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
666 savedBioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
667 bioseq.read(inFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
668 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
669
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
670 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
671
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
672 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
673 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
674
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
675
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
676 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
677 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
678 # @param patternFileName string file containing regular expression to search in headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
679 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
680 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
681 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
682 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
683 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
684 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
685 if patternFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
686 print "ERROR: no file of pattern"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
687 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
688
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
689 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
690 bioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
691 savedBioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
692 lHeaders = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
693 inFile = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
694 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
695 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
696 bioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
697 lHeaders.append( bioseq.header )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
698 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
699 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
700
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
701 patternFile = open( patternFileName, "r")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
702 lHeadersToRemove = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
703 for pattern in patternFile:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
704 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
705 print "pattern: ",pattern[:-1]; sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
706
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
707 patternToSearch = re.compile( pattern[:-1] )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
708 for h in lHeaders:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
709 if patternToSearch.search(h):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
710 lHeadersToRemove.append(h)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
711 patternFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
712
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
713 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
714 outFileName = inFileName + '.cleaned'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
715 outFile = open( outFileName, 'w' )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
716
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
717 bioseqNum = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
718 inFile=open( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
719 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
720 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
721 bioseqNum += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
722 if bioseq.header not in lHeadersToRemove:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
723 bioseq.write( outFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
724 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
725 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
726 savedBioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
727 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
728 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
729
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
730 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
731
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
732 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
733 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
734
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
735
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
736 ## Find sequence's ORFs from a fasta file and write them in a tab file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
737 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
738 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
739 # @param orfMaxNb integer Select orfMaxNb best ORFs
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
740 # @param orfMinLength integer Keep ORFs with length > orfMinLength
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
741 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
742 # @param verbose integer verbosity level (default = 0)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
743 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
744 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
745 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
746 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
747 outFileName = inFileName + ".ORF.map"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
748 outFile = open( outFileName, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
749
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
750 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
751 bioseqNb = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
752
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
753 inFile = open( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
754 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
755 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
756 bioseq.upCase()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
757 bioseqNb += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
758 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
759 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
760
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
761 orf = bioseq.findORF()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
762 bestOrf = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
763 for i in orf.keys():
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
764 orfLen = len(orf[i])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
765 for j in xrange(1, orfLen):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
766 start = orf[i][j-1] + 4
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
767 end = orf[i][j] + 3
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
768 if end - start >= orfMinLength:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
769 bestOrf.append( ( end-start, i+1, start, end ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
770
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
771 bioseq.reverseComplement()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
772
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
773 orf = bioseq.findORF()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
774 seqLen = bioseq.getLength()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
775 for i in orf.keys():
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
776 orfLen = len(orf[i])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
777 for j in xrange(1, orfLen):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
778 start = seqLen - orf[i][j-1] - 3
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
779 end = seqLen - orf[i][j] - 2
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
780 if start - end >= orfMinLength:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
781 bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
782
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
783 bestOrf.sort()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
784 bestOrf.reverse()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
785 bestOrfNb = len(bestOrf)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
786 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
787 bestOrfNb = orfMaxNb
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
788 for i in xrange(0, bestOrfNb):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
789 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
790 print bestOrf[i]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
791 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
792 "|"+str(bestOrf[i][0]),bioseq.header,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
793 bestOrf[i][2],bestOrf[i][3]))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
794 bioseq.read( inFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
795
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
796 inFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
797 outFile.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
798
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
799 return 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
800
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
801
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
802 ## Sort sequences by increasing length (write a new file)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
803 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
804 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
805 # @param outFileName string name of the output fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
806 # @param verbose integer verbosity level
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
807 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
808 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
809 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
810 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
811 print "sort sequences by increasing length"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
812 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
813 if not os.path.exists( inFileName ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
814 print "ERROR: file '%s' doesn't exist" % ( inFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
815 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
816
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
817 # read each seq one by one
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
818 # save them in distinct temporary files
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
819 # with their length in the name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
820 inFileHandler = open( inFileName, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
821 countSeq = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
822 bs = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
823 bs.read( inFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
824 while bs.header:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
825 countSeq += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
826 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
827 bs.appendBioseqInFile( tmpFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
828 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
829 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
830 bs.header = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
831 bs.sequence = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
832 bs.read( inFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
833 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
834
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
835 # sort temporary file names
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
836 # concatenate them into the output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
837 if os.path.exists( outFileName ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
838 os.remove( outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
839 lFiles = glob.glob( "*bp_*nb" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
840 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
841 for fileName in lFiles:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
842 cmd = "cat %s >> %s" % ( fileName, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
843 returnValue = os.system( cmd )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
844 if returnValue != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
845 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
846 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
847 os.remove( fileName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
848
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
849 return 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
850
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
851
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
852 ## Sort sequences by header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
853 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
854 # @param inFileName string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
855 # @param outFileName string name of the output fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
856 # @param verbose integer verbosity level
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
857 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
858 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
859 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
860 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
861 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
862 iBioseqDB = BioseqDB(inFileName)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
863 f = open(outFileName, "w")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
864 lHeaders = sorted(iBioseqDB.getHeaderList())
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
865 for header in lHeaders:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
866 iBioseq = iBioseqDB.fetch(header)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
867 iBioseq.write(f)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
868 f.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
869
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
870
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
871 ## Return a dictionary which keys are the headers and values the length of the sequences
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
872 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
873 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
874 # @param verbose integer verbosity level
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
875 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
876 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
877 def getLengthPerHeader( inFile, verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
878 dHeader2Length = {}
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
879
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
880 inFileHandler = open( inFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
881 currentSeqHeader = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
882 currentSeqLength = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
883 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
884 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
885 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
886 if currentSeqHeader != "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
887 dHeader2Length[ currentSeqHeader ] = currentSeqLength
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
888 currentSeqLength = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
889 currentSeqHeader = line[1:-1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
890 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
891 print "current header: %s" % ( currentSeqHeader )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
892 sys.stdout.flush()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
893 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
894 currentSeqLength += len( line.replace("\n","") )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
895 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
896 dHeader2Length[ currentSeqHeader ] = currentSeqLength
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
897 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
898
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
899 return dHeader2Length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
900
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
901
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
902 ## Convert headers from a fasta file having chunk coordinates
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
903 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
904 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
905 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
906 # @param outFile string name of the output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
907 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
908 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
909 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
910 inFileHandler = open(inFile, "r")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
911 outFileHandler = open(outFile, "w")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
912 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
913 iConvCoord = ConvCoord()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
914 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
915 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
916 if line[0] == ">":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
917 if "{Fragment}" in line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
918 chkName = line.split(" ")[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
919 chrName = dChunk2Map[chkName].seqname
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
920 lCoordPairs = line.split(" ")[3].split(",")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
921 lRangesOnChk = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
922 for i in lCoordPairs:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
923 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
924 lRangesOnChk.append(iRange)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
925 lRangesOnChr = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
926 for iRange in lRangesOnChk:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
927 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
928 newHeader = line[1:-1].split(" ")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
929 newHeader += " %s" % chrName
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
930 newHeader += " {Fragment}"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
931 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
932 for iRange in lRangesOnChr[1:]:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
933 newHeader += ",%i..%i" % (iRange.start, iRange.end)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
934 outFileHandler.write(">%s\n" % newHeader)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
935 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
936 chkName = line.split("_")[1].split(" ")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
937 chrName = dChunk2Map[chkName].seqname
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
938 coords = line[line.find("[")+1 : line.find("]")]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
939 start = int(coords.split(",")[0])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
940 end = int(coords.split(",")[1])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
941 iRangeOnChk = Range(chkName, start, end)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
942 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
943 newHeader = line[1:-1].split("_")[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
944 newHeader += " %s" % chrName
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
945 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
946 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
947 outFileHandler.write(">%s\n" % newHeader)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
948 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
949 outFileHandler.write(line)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
950 line = inFileHandler.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
951 inFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
952 outFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
953
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
954
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
955 ## Convert a fasta file to a length file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
956 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
957 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
958 # @param outFile string name of the output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
959 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
960 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
961 def convertFastaToLength(inFile, outFile = ""):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
962 if outFile == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
963 outFile = "%s.length" % inFile
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
964
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
965 if inFile != "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
966 with open(inFile, "r") as inFH:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
967 with open(outFile, "w") as outFH:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
968 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
969 bioseq.read(inFH)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
970 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
971 seqLen = bioseq.getLength()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
972 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
973 bioseq.read(inFH)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
974
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
975
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
976 ## Convert a fasta file to a seq file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
977 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
978 # @param inFile string name of the input fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
979 # @param outFile string name of the output file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
980 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
981 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
982 def convertFastaToSeq(inFile, outFile = ""):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
983 if outFile == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
984 outFile = "%s.seq" % inFile
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
985
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
986 if inFile != "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
987 with open(inFile, "r") as inFH:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
988 with open(outFile, "w") as outFH:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
989 bioseq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
990 bioseq.read(inFH)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
991 while bioseq.sequence != None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
992 seqLen = bioseq.getLength()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
993 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
994 bioseq.sequence, bioseq.header, seqLen))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
995 bioseq.read(inFH)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
996
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
997
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
998 ## Splice an input fasta file using coordinates in a Map file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
999 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1000 # @note the coordinates should be merged beforehand!
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1001 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1002 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1003 def spliceFromCoords( genomeFile, coordFile, obsFile ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1004 genomeFileHandler = open( genomeFile, "r" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1005 obsFileHandler = open( obsFile, "w" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1006 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1007
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1008 bs = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1009 bs.read( genomeFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1010 while bs.sequence:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1011 if dChr2Maps.has_key( bs.header ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1012 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1013 splicedSeq = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1014 currentSite = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1015 for iMap in lCoords:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1016 minSplice = iMap.getMin() - 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1017 if minSplice > currentSite:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1018 splicedSeq += bs.sequence[ currentSite : minSplice ]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1019 currentSite = iMap.getMax()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1020 splicedSeq += bs.sequence[ currentSite : ]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1021 bs.sequence = splicedSeq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1022 bs.write( obsFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1023 bs.read( genomeFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1024
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1025 genomeFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1026 obsFileHandler.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1027
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1028
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1029 ## Shuffle input sequences (single file or files in a directory)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1030 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1031 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1032 def dbShuffle( inData, outData, verbose=0 ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1033 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1034 prg = "esl-shuffle"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1035 else : prg = "shuffle"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1036 genericCmd = prg + " -d INPUT > OUTPUT"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1037 if os.path.isfile( inData ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1038 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1039 print "shuffle input file '%s'" % inData
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1040 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1041 print cmd
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1042 returnStatus = os.system( cmd )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1043 if returnStatus != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1044 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1045 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1046
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1047 elif os.path.isdir( inData ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1048 if verbose > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1049 print "shuffle files in input directory '%s'" % inData
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1050 if os.path.exists( outData ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1051 shutil.rmtree( outData )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1052 os.mkdir( outData )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1053 lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1054 nbFastaFiles = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1055 for inputFile in lInputFiles:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1056 nbFastaFiles += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1057 if verbose > 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1058 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1059 fastaBaseName = os.path.basename( inputFile )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1060 prefix, extension = os.path.splitext( fastaBaseName )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1061 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1062 returnStatus = os.system( cmd )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1063 if returnStatus != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1064 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1065 sys.exit(1)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1066
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1067
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1068 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1069 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1070 # @param inClusterFileName string input cluster file name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1071 # @param inFastaFileName string input fasta file name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1072 # @param outFileName string output file name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1073 # @param verbosity integer verbosity
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1074 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1075 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1076 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1077 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1078 iFastaParser = FastaParser(inFastaFileName)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1079 with open(outFileName, "w") as f:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1080 for iSequence in iFastaParser.getIterator():
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1081
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1082 header = iSequence.getName()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1083 if dHeader2ClusterClusterMember.get(header):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1084 cluster = dHeader2ClusterClusterMember[header][0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1085 member = dHeader2ClusterClusterMember[header][1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1086 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1087 clusterIdForSingletonCluster += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1088 cluster = clusterIdForSingletonCluster
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1089 member = 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1090
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1091 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1092 iSequence.setName(newHeader)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1093 f.write(iSequence.printFasta())
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1094
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1095 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1096 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1097 dHeader2ClusterClusterMember = {}
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1098 clusterId = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1099 with open(inClusterFileName) as f:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1100 line = f.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1101 while line:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1102 lineWithoutLastChar = line.rstrip()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1103 lHeaders = lineWithoutLastChar.split("\t")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1104 clusterId += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1105 if verbosity > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1106 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1107 memberId = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1108 for header in lHeaders:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1109 memberId += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1110 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1111 line = f.readline()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1112 if verbosity > 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1113 print "%i clusters" % clusterId
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1114 return dHeader2ClusterClusterMember, clusterId
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1115
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1116 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1117 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1118 """
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1119 Write a map file from fasta output of clustering tool.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1120 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1121 """
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1122 if not outMapFileName:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1123 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1124
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1125 fileDb = open(fastaFileNameFromClustering , "r")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1126 fileMap = open(outMapFileName, "w")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1127 seq = Bioseq()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1128 numseq = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1129 while 1:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1130 seq.read(fileDb)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1131 if seq.sequence == None:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1132 break
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1133 numseq = numseq + 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1134 ID = seq.header.split(' ')[0].split('_')[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1135 chunk = seq.header.split(' ')[0].split('_')[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1136 start = seq.header.split(' ')[-1].split(',')[0][1:]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1137 end = seq.header.split(' ')[-1].split(',')[1][:-1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1138 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1139 fileMap.write(line + "\n")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1140
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1141 fileDb.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1142 fileMap.close()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1143 print "saved in %s" % outMapFileName
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1144
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1145 @staticmethod
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1146 def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1147 outFormat = outFormat.lower()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1148 if outFormat in ["gff", "gff3"]:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1149 outFormat = "gff3"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1150 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1151 outFormat = "map"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1152
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1153 lTNstretches = []
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1154 if nbN != 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1155 iBSDB = BioseqDB(fastaFileName)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1156 for iBS in iBSDB.db:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1157 nbNFound = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1158 start = 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1159 pos = 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1160 lastPos = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1161
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1162 while pos <= iBS.getLength():
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1163 if nbNFound == 0:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1164 start = pos
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1165
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1166 while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1167 nbNFound += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1168 pos += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1169 lastPos = pos
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1170
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1171 if pos - lastPos >= nbN:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1172 if nbNFound >= nbN:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1173 lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1174 nbNFound = 0
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1175 pos += 1
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1176
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1177 if nbNFound >= nbN:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1178 lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1179
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1180 lTNstretches.sort(key = itemgetter(0, 1, 2))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1181
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1182 if outFileName == "":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1183 outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1184
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1185 with open(outFileName, "w") as fH:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1186 if outFormat == "gff3":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1187 fH.write("##gff-version 3\n")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1188 for item in lTNstretches:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1189 seq = item[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1190 start = item[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1191 end = item[2]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1192 if outFormat == "gff3":
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1193 fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1194 else:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1195 fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1196
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1197