annotate commons/core/seq/FastaUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
2 # http://www.inra.fr
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
4 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
9 # "http://www.cecill.info".
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
10 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
15 # liability.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
16 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
26 # same conditions as regards security.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
27 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
30
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
31
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
32 import os
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
33 import sys
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
34 import string
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
35 import math
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
36 import shutil
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
37 import re
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
38 import glob
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
39 from commons.core.seq.BioseqDB import BioseqDB
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
40 from commons.core.seq.Bioseq import Bioseq
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
41 from commons.core.coord.MapUtils import MapUtils
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
42 from commons.core.coord.Range import Range
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
43 from commons.core.checker.CheckerUtils import CheckerUtils
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
44 from commons.core.launcher.LauncherUtils import LauncherUtils
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
45 from commons.core.coord.ConvCoord import ConvCoord
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
46 from commons.core.parsing.FastaParser import FastaParser
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
47
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
48
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
49 ## Static methods for fasta file manipulation
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
50 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
51 class FastaUtils( object ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
52
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
53 ## Count the number of sequences in the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
54 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
55 # @param inFile name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
56 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
57 # @return integer number of sequences in the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
58 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
59 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
60 def dbSize( inFile ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
61 nbSeq = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
62 inFileHandler = open( inFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
63 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
64 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
65 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
66 nbSeq = nbSeq + 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
67 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
68 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
69
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
70 return nbSeq
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
71
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
72
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
73 ## Compute the cumulative sequence length in the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
74 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
75 # @param inFile handler of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
76 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
77 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
78 def dbCumLength( inFile ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
79 cumLength = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
80 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
81 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
82 if line[0] != ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
83 cumLength += len(string.rstrip(line))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
84 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
85
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
86 return cumLength
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
87
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
88
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
89 ## Return a list with the length of each sequence in the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
90 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
91 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
92 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
93 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
94 def dbLengths( inFile ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
95 lLengths = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
96 inFileHandler = open( inFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
97 currentLength = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
98 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
99 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
100 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
101 if currentLength != 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
102 lLengths.append( currentLength )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
103 currentLength = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
104 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
105 currentLength += len(line[:-1])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
106 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
107 lLengths.append( currentLength )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
108 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
109 return lLengths
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
110
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
111
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
112 ## Retrieve the sequence headers present in the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
113 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
114 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
115 # @param verbose integer level of verbosity
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
116 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
117 # @return list of sequence headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
118 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
119 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
120 def dbHeaders( inFile, verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
121 lHeaders = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
122
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
123 inFileHandler = open( inFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
124 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
125 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
126 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
127 lHeaders.append( string.rstrip(line[1:]) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
128 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
129 print string.rstrip(line[1:])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
130 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
131 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
132
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
133 return lHeaders
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
134
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
135
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
136 ## Cut a data bank into chunks according to the input parameters
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
137 # If a sequence is shorter than the threshold, it is only renamed (not cut)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
138 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
139 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
140 # @param chkLgth string chunk length (in bp, default=200000)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
141 # @param chkOver string chunk overlap (in bp, default=10000)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
142 # @param wordN string N stretch word length (default=11, 0 for no detection)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
143 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
144 # @param clean boolean remove 'cut' and 'Nstretch' files
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
145 # @param verbose integer (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
146 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
147 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
148 def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
149 nbSeq = FastaUtils.dbSize( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
150 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
151 print "cut the %i input sequences with cutterDB..." % ( nbSeq )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
152 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
153
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
154 prg = "cutterDB"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
155 cmd = prg
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
156 cmd += " -l %s" % ( chkLgth )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
157 cmd += " -o %s" %( chkOver )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
158 cmd += " -w %s" % ( wordN )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
159 cmd += " %s" % ( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
160 returnStatus = os.system( cmd )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
161 if returnStatus != 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
162 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
163 sys.stderr.write( "%s\n" % ( msg ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
164 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
165
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
166 nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
167 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
168 print "done (%i chunks)" % ( nbChunks )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
169 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
170
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
171 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
172 print "rename the headers..."
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
173 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
174
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
175 if outFilePrefix == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
176 outFastaName = inFileName + "_chunks.fa"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
177 outMapName = inFileName + "_chunks.map"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
178 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
179 outFastaName = outFilePrefix + ".fa"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
180 outMapName = outFilePrefix + ".map"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
181
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
182 inFile = open( "%s_cut" % ( inFileName ), "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
183 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
184
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
185 outFasta = open( outFastaName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
186 outMap = open( outMapName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
187
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
188 # read line after line (no need for big RAM) and change the sequence headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
189 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
190
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
191 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
192 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
193 print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
194 data = line[:-1].split(" ")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
195 seqID = data[0].split(">")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
196 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
197 oldHeader = data[2]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
198 seqStart = data[4].split("..")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
199 seqEnd = data[4].split("..")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
200 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
201 outFasta.write( ">%s\n" % ( newHeader ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
202
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
203 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
204 outFasta.write( line.upper() )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
205
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
206 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
207
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
208 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
209 outFasta.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
210 outMap.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
211
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
212 if clean == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
213 os.remove(inFileName + "_cut")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
214 os.remove(inFileName + ".Nstretch.map")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
215
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
216
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
217 ## Split the input fasta file in several output files
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
218 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
219 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
220 # @param nbSeqPerBatch integer number of sequences per output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
221 # @param newDir boolean put the sequences in a new directory called 'batches'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
222 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
223 # @param prefix prefix in output file name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
224 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
225 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
226 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
227 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
228 if not os.path.exists( inFile ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
229 msg = "ERROR: file '%s' doesn't exist" % ( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
230 sys.stderr.write( "%s\n" % ( msg ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
231 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
232
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
233 nbSeq = FastaUtils.dbSize( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
234
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
235 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
236 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
237 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
238 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
239
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
240 if nbSeqPerBatch > 1 and useSeqHeader:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
241 useSeqHeader = False
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
242
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
243 if newDir == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
244 if os.path.exists( "batches" ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
245 shutil.rmtree( "batches" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
246 os.mkdir( "batches" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
247 os.chdir( "batches" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
248 os.system( "ln -s ../%s ." % ( inFile ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
249
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
250 inFileHandler = open( inFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
251 inFileHandler.seek( 0, 0 )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
252 countBatch = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
253 countSeq = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
254 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
255 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
256 if line == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
257 break
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
258 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
259 countSeq += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
260 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
261 if "outFile" in locals():
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
262 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
263 countBatch += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
264 if nbSeqPerBatch == 1 and useSeqHeader:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
265 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
266 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
267 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
268 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
269 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
270 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
271 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
272 outFile.write( line )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
273 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
274 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
275
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
276 if newDir == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
277 os.remove( os.path.basename( inFile ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
278 os.chdir( ".." )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
279
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
280
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
281 ## Split the input fasta file in several output files
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
282 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
283 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
284 # @param maxSize integer max cumulative length for each output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
285 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
286 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
287 def splitFastaFileInBatches(inFileName, maxSize = 200000):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
288 iBioseqDB = BioseqDB(inFileName)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
289 lHeadersSizeTuples = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
290 for iBioseq in iBioseqDB.db:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
291 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
292
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
293 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
294 os.mkdir("batches")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
295 os.chdir("batches")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
296
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
297 iterator = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
298 for lHeader in lHeadersList :
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
299 iterator += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
300 with open("batch_%s.fa" % iterator, 'w') as f :
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
301 for header in lHeader :
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
302 iBioseq = iBioseqDB.fetch(header)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
303 iBioseq.write(f)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
304 os.chdir("..")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
305
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
306
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
307 ## Split the input fasta file in several output files according to their cluster identifier
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
308 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
309 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
310 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
311 # @param simplifyHeader boolean simplify the headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
312 # @param createDir boolean put the sequences in different directories
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
313 # @param outPrefix string prefix of the output files (default='seqCluster')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
314 # @param verbose integer (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
315 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
316 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
317 def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
318 if not os.path.exists( inFileName ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
319 print "ERROR: %s doesn't exist" % ( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
320 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
321
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
322 inFile = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
323
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
324 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
325 if line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
326 name = line.split(" ")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
327 if "Cluster" in name:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
328 clusterID = name.split("Cluster")[1].split("Mb")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
329 seqID = name.split("Mb")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
330 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
331 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
332 if "Q" in name.split("Gr")[0]:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
333 seqID = name.split("Gr")[0].split("MbQ")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
334 elif "S" in name:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
335 seqID = name.split("Gr")[0].split("MbS")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
336 sClusterIDs = set( [ clusterID ] )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
337 if simplifyHeader == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
338 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
339 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
340 header = line[1:-1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
341 if createDir == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
342 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
343 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
344 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
345 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
346 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
347 outFile.write( ">%s\n" % ( header ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
348 prevClusterID = clusterID
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
349
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
350 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
351 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
352 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
353 name = line.split(" ")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
354 if "Cluster" in name:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
355 clusterID = name.split("Cluster")[1].split("Mb")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
356 seqID = name.split("Mb")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
357 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
358 clusterID = name.split("Cl")[0].split("Gr")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
359 if "Q" in name.split("Gr")[0]:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
360 seqID = name.split("Gr")[0].split("MbQ")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
361 elif "S" in name:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
362 seqID = name.split("Gr")[0].split("MbS")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
363
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
364 if clusterID != prevClusterID:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
365 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
366
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
367 if simplifyHeader == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
368 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
369 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
370 header = line[1:-1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
371
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
372 if createDir == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
373 os.chdir( ".." )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
374 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
375 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
376 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
377
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
378 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
379 if not os.path.exists( outFileName ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
380 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
381 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
382 if clusterID != prevClusterID:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
383 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
384 outFile = open( outFileName, "a" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
385 outFile.write( ">%s\n" % ( header ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
386 prevClusterID = clusterID
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
387 sClusterIDs.add( clusterID )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
388
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
389 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
390 outFile.write( line )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
391
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
392 line = inFile.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
393
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
394 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
395 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
396 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
397
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
398 if createDir == True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
399 os.chdir("..")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
400 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
401 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
402
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
403
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
404 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
405 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
406 # @param len_min integer length sequence criterion to filter
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
407 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
408 # @param verbose integer (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
409 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
410 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
411 def dbLengthFilter( len_min, inFileName, verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
412 file_db = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
413 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
414 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
415 seq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
416 numseq = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
417 nbsave = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
418
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
419 seq.read( file_db )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
420 while seq.sequence:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
421 l = seq.getLength()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
422 numseq = numseq + 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
423 if l >= len_min:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
424 seq.write( file_dbSup )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
425 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
426 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
427 nbsave=nbsave+1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
428 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
429 seq.write( file_dbInf )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
430 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
431 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
432 nbsave=nbsave+1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
433 seq.read( file_db )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
434
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
435 file_db.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
436 file_dbInf.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
437 file_dbSup.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
438 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
439 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
440
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
441
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
442 ## Extract the longest sequences from a fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
443 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
444 # @param num integer maximum number of sequences in the output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
445 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
446 # @param outFileName string name of the output fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
447 # @param minThresh integer minimum length threshold (default=0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
448 # @param verbose integer (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
449 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
450 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
451 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
452 bsDB = BioseqDB( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
453 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
454 print "nb of input sequences: %i" % ( bsDB.getSize() )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
455
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
456 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
457 outFileName = inFileName + ".best" + str(num)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
458 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
459
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
460 if bsDB.getSize()==0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
461 return 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
462
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
463 num = int(num)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
464 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
465 print "keep the %i longest sequences" % ( num )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
466 if minThresh > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
467 print "with length > %i bp" % ( minThresh )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
468 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
469
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
470 # retrieve the length of each input sequence
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
471 tmpLSeqLgth = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
472 seqNum = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
473 for bs in bsDB.db:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
474 seqNum += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
475 tmpLSeqLgth.append( bs.getLength() )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
476 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
477 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
478 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
479
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
480 # sort the lengths
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
481 tmpLSeqLgth.sort()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
482 tmpLSeqLgth.reverse()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
483
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
484 # select the longest
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
485 lSeqLgth = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
486 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
487 if tmpLSeqLgth[i] >= minThresh:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
488 lSeqLgth.append( tmpLSeqLgth[i] )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
489 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
490 print "selected max length: %i" % ( max(lSeqLgth) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
491 print "selected min length: %i" % ( min(lSeqLgth) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
492 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
493
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
494 # save the longest
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
495 inFile = open( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
496 seqNum = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
497 nbSave = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
498 for bs in bsDB.db:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
499 seqNum += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
500 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
501 bs.write( outFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
502 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
503 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
504 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
505 nbSave += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
506 if nbSave == num:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
507 break
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
508 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
509 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
510 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
511 print nbSave, "saved sequences in ", outFileName
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
512 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
513
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
514 return 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
515
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
516
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
517 ## Extract all the sequence headers from a fasta file and write them in a new fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
518 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
519 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
520 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
521 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
522 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
523 def dbExtractSeqHeaders( inFileName, outFileName="" ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
524 lHeaders = FastaUtils.dbHeaders( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
525
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
526 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
527 outFileName = inFileName + ".headers"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
528
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
529 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
530 for i in lHeaders:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
531 outFile.write( i + "\n" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
532 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
533
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
534 return 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
535
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
536
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
537 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
538 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
539 # @param pattern regular expression to search in headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
540 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
541 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
542 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
543 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
544 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
545 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
546 if pattern == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
547 return
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
548
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
549 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
550 outFileName = inFileName + '.extracted'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
551 outFile = open( outFileName, 'w' )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
552
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
553 patternTosearch = re.compile( pattern )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
554 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
555 bioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
556 savedBioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
557 inFile = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
558 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
559 while bioseq.sequence:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
560 bioseqNb = bioseqNb + 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
561 m = patternTosearch.search( bioseq.header )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
562 if m:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
563 bioseq.write( outFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
564 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
565 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
566 savedBioseqNb = savedBioseqNb + 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
567 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
568 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
569
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
570 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
571
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
572 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
573 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
574
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
575
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
576 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
577 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
578 # @param patternFileName string file containing regular expression to search in headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
579 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
580 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
581 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
582 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
583 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
584 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
585
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
586 if patternFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
587 print "ERROR: no file of pattern"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
588 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
589
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
590 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
591 bioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
592 savedBioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
593 lHeaders = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
594
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
595 inFile = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
596 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
597 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
598 lHeaders.append( bioseq.header )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
599 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
600 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
601
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
602 lHeadersToKeep = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
603 patternFile = open( patternFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
604 for pattern in patternFile:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
605 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
606 print "pattern: ",pattern[:-1]; sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
607
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
608 patternToSearch = re.compile(pattern[:-1])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
609 for h in lHeaders:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
610 if patternToSearch.search(h):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
611 lHeadersToKeep.append(h)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
612 patternFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
613
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
614 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
615 outFileName = inFileName + ".extracted"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
616 outFile=open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
617
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
618 inFile = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
619 bioseq.read(inFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
620 while bioseq.sequence:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
621 bioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
622 if bioseq.header in lHeadersToKeep:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
623 bioseq.write(outFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
624 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
625 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
626 savedBioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
627 bioseq.read(inFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
628 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
629
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
630 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
631
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
632 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
633 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
634
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
635
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
636 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
637 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
638 # @param pattern regular expression to search in headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
639 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
640 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
641 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
642 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
643 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
644 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
645 if pattern == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
646 return
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
647
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
648 patternToSearch = re.compile(pattern)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
649
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
650 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
651 outFileName = inFileName + '.cleaned'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
652 outFile = open(outFileName,'w')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
653
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
654 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
655 bioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
656 savedBioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
657 inFile = open(inFileName)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
658 bioseq.read(inFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
659 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
660 bioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
661 if not patternToSearch.search(bioseq.header):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
662 bioseq.write(outFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
663 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
664 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
665 savedBioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
666 bioseq.read(inFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
667 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
668
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
669 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
670
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
671 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
672 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
673
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
674
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
675 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
676 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
677 # @param patternFileName string file containing regular expression to search in headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
678 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
679 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
680 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
681 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
682 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
683 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
684 if patternFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
685 print "ERROR: no file of pattern"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
686 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
687
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
688 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
689 bioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
690 savedBioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
691 lHeaders = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
692 inFile = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
693 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
694 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
695 bioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
696 lHeaders.append( bioseq.header )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
697 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
698 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
699
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
700 patternFile = open( patternFileName, "r")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
701 lHeadersToRemove = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
702 for pattern in patternFile:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
703 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
704 print "pattern: ",pattern[:-1]; sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
705
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
706 patternToSearch = re.compile( pattern[:-1] )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
707 for h in lHeaders:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
708 if patternToSearch.search(h):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
709 lHeadersToRemove.append(h)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
710 patternFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
711
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
712 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
713 outFileName = inFileName + '.cleaned'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
714 outFile = open( outFileName, 'w' )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
715
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
716 bioseqNum = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
717 inFile=open( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
718 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
719 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
720 bioseqNum += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
721 if bioseq.header not in lHeadersToRemove:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
722 bioseq.write( outFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
723 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
724 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
725 savedBioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
726 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
727 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
728
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
729 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
730
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
731 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
732 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
733
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
734
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
735 ## Find sequence's ORFs from a fasta file and write them in a tab file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
736 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
737 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
738 # @param orfMaxNb integer Select orfMaxNb best ORFs
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
739 # @param orfMinLength integer Keep ORFs with length > orfMinLength
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
740 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
741 # @param verbose integer verbosity level (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
742 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
743 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
744 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
745 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
746 outFileName = inFileName + ".ORF.map"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
747 outFile = open( outFileName, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
748
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
749 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
750 bioseqNb = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
751
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
752 inFile = open( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
753 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
754 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
755 bioseq.upCase()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
756 bioseqNb += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
757 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
758 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
759
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
760 orf = bioseq.findORF()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
761 bestOrf = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
762 for i in orf.keys():
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
763 orfLen = len(orf[i])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
764 for j in xrange(1, orfLen):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
765 start = orf[i][j-1] + 4
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
766 end = orf[i][j] + 3
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
767 if end - start >= orfMinLength:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
768 bestOrf.append( ( end-start, i+1, start, end ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
769
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
770 bioseq.reverseComplement()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
771
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
772 orf = bioseq.findORF()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
773 seqLen = bioseq.getLength()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
774 for i in orf.keys():
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
775 orfLen = len(orf[i])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
776 for j in xrange(1, orfLen):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
777 start = seqLen - orf[i][j-1] - 3
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
778 end = seqLen - orf[i][j] - 2
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
779 if start - end >= orfMinLength:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
780 bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
781
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
782 bestOrf.sort()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
783 bestOrf.reverse()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
784 bestOrfNb = len(bestOrf)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
785 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
786 bestOrfNb = orfMaxNb
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
787 for i in xrange(0, bestOrfNb):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
788 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
789 print bestOrf[i]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
790 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
791 "|"+str(bestOrf[i][0]),bioseq.header,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
792 bestOrf[i][2],bestOrf[i][3]))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
793 bioseq.read( inFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
794
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
795 inFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
796 outFile.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
797
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
798 return 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
799
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
800
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
801 ## Sort sequences by increasing length (write a new file)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
802 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
803 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
804 # @param outFileName string name of the output fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
805 # @param verbose integer verbosity level
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
806 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
807 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
808 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
809 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
810 print "sort sequences by increasing length"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
811 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
812 if not os.path.exists( inFileName ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
813 print "ERROR: file '%s' doesn't exist" % ( inFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
814 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
815
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
816 # read each seq one by one
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
817 # save them in distinct temporary files
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
818 # with their length in the name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
819 inFileHandler = open( inFileName, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
820 countSeq = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
821 bs = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
822 bs.read( inFileHandler )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
823 while bs.header:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
824 countSeq += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
825 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
826 bs.appendBioseqInFile( tmpFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
827 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
828 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
829 bs.header = ""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
830 bs.sequence = ""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
831 bs.read( inFileHandler )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
832 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
833
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
834 # sort temporary file names
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
835 # concatenate them into the output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
836 if os.path.exists( outFileName ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
837 os.remove( outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
838 lFiles = glob.glob( "*bp_*nb" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
839 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
840 for fileName in lFiles:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
841 cmd = "cat %s >> %s" % ( fileName, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
842 returnValue = os.system( cmd )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
843 if returnValue != 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
844 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
845 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
846 os.remove( fileName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
847
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
848 return 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
849
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
850
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
851 ## Sort sequences by header
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
852 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
853 # @param inFileName string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
854 # @param outFileName string name of the output fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
855 # @param verbose integer verbosity level
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
856 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
857 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
858 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
859 if outFileName == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
860 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
861 iBioseqDB = BioseqDB(inFileName)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
862 f = open(outFileName, "w")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
863 lHeaders = sorted(iBioseqDB.getHeaderList())
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
864 for header in lHeaders:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
865 iBioseq = iBioseqDB.fetch(header)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
866 iBioseq.write(f)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
867 f.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
868
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
869
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
870 ## Return a dictionary which keys are the headers and values the length of the sequences
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
871 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
872 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
873 # @param verbose integer verbosity level
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
874 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
875 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
876 def getLengthPerHeader( inFile, verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
877 dHeader2Length = {}
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
878
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
879 inFileHandler = open( inFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
880 currentSeqHeader = ""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
881 currentSeqLength = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
882 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
883 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
884 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
885 if currentSeqHeader != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
886 dHeader2Length[ currentSeqHeader ] = currentSeqLength
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
887 currentSeqLength = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
888 currentSeqHeader = line[1:-1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
889 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
890 print "current header: %s" % ( currentSeqHeader )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
891 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
892 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
893 currentSeqLength += len( line.replace("\n","") )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
894 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
895 dHeader2Length[ currentSeqHeader ] = currentSeqLength
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
896 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
897
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
898 return dHeader2Length
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
899
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
900
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
901 ## Convert headers from a fasta file having chunk coordinates
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
902 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
903 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
904 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
905 # @param outFile string name of the output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
906 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
907 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
908 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
909 inFileHandler = open(inFile, "r")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
910 outFileHandler = open(outFile, "w")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
911 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
912 iConvCoord = ConvCoord()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
913 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
914 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
915 if line[0] == ">":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
916 if "{Fragment}" in line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
917 chkName = line.split(" ")[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
918 chrName = dChunk2Map[chkName].seqname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
919 lCoordPairs = line.split(" ")[3].split(",")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
920 lRangesOnChk = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
921 for i in lCoordPairs:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
922 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
923 lRangesOnChk.append(iRange)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
924 lRangesOnChr = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
925 for iRange in lRangesOnChk:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
926 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
927 newHeader = line[1:-1].split(" ")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
928 newHeader += " %s" % chrName
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
929 newHeader += " {Fragment}"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
930 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
931 for iRange in lRangesOnChr[1:]:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
932 newHeader += ",%i..%i" % (iRange.start, iRange.end)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
933 outFileHandler.write(">%s\n" % newHeader)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
934 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
935 chkName = line.split("_")[1].split(" ")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
936 chrName = dChunk2Map[chkName].seqname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
937 coords = line[line.find("[")+1 : line.find("]")]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
938 start = int(coords.split(",")[0])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
939 end = int(coords.split(",")[1])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
940 iRangeOnChk = Range(chkName, start, end)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
941 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
942 newHeader = line[1:-1].split("_")[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
943 newHeader += " %s" % chrName
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
944 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
945 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
946 outFileHandler.write(">%s\n" % newHeader)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
947 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
948 outFileHandler.write(line)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
949 line = inFileHandler.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
950 inFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
951 outFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
952
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
953
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
954 ## Convert a fasta file to a length file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
955 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
956 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
957 # @param outFile string name of the output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
958 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
959 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
960 def convertFastaToLength(inFile, outFile = ""):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
961 if outFile == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
962 outFile = "%s.length" % inFile
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
963
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
964 if inFile != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
965 with open(inFile, "r") as inFH:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
966 with open(outFile, "w") as outFH:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
967 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
968 bioseq.read(inFH)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
969 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
970 seqLen = bioseq.getLength()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
971 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
972 bioseq.read(inFH)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
973
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
974
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
975 ## Convert a fasta file to a seq file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
976 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
977 # @param inFile string name of the input fasta file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
978 # @param outFile string name of the output file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
979 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
980 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
981 def convertFastaToSeq(inFile, outFile = ""):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
982 if outFile == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
983 outFile = "%s.seq" % inFile
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
984
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
985 if inFile != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
986 with open(inFile, "r") as inFH:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
987 with open(outFile, "w") as outFH:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
988 bioseq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
989 bioseq.read(inFH)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
990 while bioseq.sequence != None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
991 seqLen = bioseq.getLength()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
992 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
993 bioseq.sequence, bioseq.header, seqLen))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
994 bioseq.read(inFH)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
995
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
996
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
997 ## Splice an input fasta file using coordinates in a Map file
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
998 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
999 # @note the coordinates should be merged beforehand!
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1000 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1001 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1002 def spliceFromCoords( genomeFile, coordFile, obsFile ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1003 genomeFileHandler = open( genomeFile, "r" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1004 obsFileHandler = open( obsFile, "w" )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1005 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1006
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1007 bs = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1008 bs.read( genomeFileHandler )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1009 while bs.sequence:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1010 if dChr2Maps.has_key( bs.header ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1011 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1012 splicedSeq = ""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1013 currentSite = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1014 for iMap in lCoords:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1015 minSplice = iMap.getMin() - 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1016 if minSplice > currentSite:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1017 splicedSeq += bs.sequence[ currentSite : minSplice ]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1018 currentSite = iMap.getMax()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1019 splicedSeq += bs.sequence[ currentSite : ]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1020 bs.sequence = splicedSeq
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1021 bs.write( obsFileHandler )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1022 bs.read( genomeFileHandler )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1023
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1024 genomeFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1025 obsFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1026
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1027
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1028 ## Shuffle input sequences (single file or files in a directory)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1029 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1030 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1031 def dbShuffle( inData, outData, verbose=0 ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1032 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1033 prg = "esl-shuffle"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1034 else : prg = "shuffle"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1035 genericCmd = prg + " -d INPUT > OUTPUT"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1036 if os.path.isfile( inData ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1037 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1038 print "shuffle input file '%s'" % inData
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1039 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1040 print cmd
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1041 returnStatus = os.system( cmd )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1042 if returnStatus != 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1043 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1044 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1045
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1046 elif os.path.isdir( inData ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1047 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1048 print "shuffle files in input directory '%s'" % inData
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1049 if os.path.exists( outData ):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1050 shutil.rmtree( outData )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1051 os.mkdir( outData )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1052 lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1053 nbFastaFiles = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1054 for inputFile in lInputFiles:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1055 nbFastaFiles += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1056 if verbose > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1057 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1058 fastaBaseName = os.path.basename( inputFile )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1059 prefix, extension = os.path.splitext( fastaBaseName )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1060 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1061 returnStatus = os.system( cmd )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1062 if returnStatus != 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1063 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1064 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1065
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1066
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1067 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1068 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1069 # @param inClusterFileName string input cluster file name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1070 # @param inFastaFileName string input fasta file name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1071 # @param outFileName string output file name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1072 # @param verbosity integer verbosity
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1073 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1074 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1075 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1076 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1077 iFastaParser = FastaParser(inFastaFileName)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1078 with open(outFileName, "w") as f:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1079 for iSequence in iFastaParser.getIterator():
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1080
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1081 header = iSequence.getName()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1082 if dHeader2ClusterClusterMember.get(header):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1083 cluster = dHeader2ClusterClusterMember[header][0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1084 member = dHeader2ClusterClusterMember[header][1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1085 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1086 clusterIdForSingletonCluster += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1087 cluster = clusterIdForSingletonCluster
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1088 member = 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1089
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1090 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1091 iSequence.setName(newHeader)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1092 f.write(iSequence.printFasta())
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1093
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1094 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1095 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1096 dHeader2ClusterClusterMember = {}
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1097 clusterId = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1098 with open(inClusterFileName) as f:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1099 line = f.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1100 while line:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1101 lineWithoutLastChar = line.rstrip()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1102 lHeaders = lineWithoutLastChar.split("\t")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1103 clusterId += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1104 if verbosity > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1105 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1106 memberId = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1107 for header in lHeaders:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1108 memberId += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1109 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1110 line = f.readline()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1111 if verbosity > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1112 print "%i clusters" % clusterId
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1113 return dHeader2ClusterClusterMember, clusterId
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1114
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1115 @staticmethod
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1116 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1117 """
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1118 Write a map file from fasta output of clustering tool.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1119 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1120 """
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1121 if not outMapFileName:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1122 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1123
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1124 fileDb = open(fastaFileNameFromClustering , "r")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1125 fileMap = open(outMapFileName, "w")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1126 seq = Bioseq()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1127 numseq = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1128 while 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1129 seq.read(fileDb)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1130 if seq.sequence == None:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1131 break
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1132 numseq = numseq + 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1133 ID = seq.header.split(' ')[0].split('_')[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1134 chunk = seq.header.split(' ')[0].split('_')[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1135 start = seq.header.split(' ')[-1].split(',')[0][1:]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1136 end = seq.header.split(' ')[-1].split(',')[1][:-1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1137 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1138 fileMap.write(line + "\n")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1139
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1140 fileDb.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1141 fileMap.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1142 print "saved in %s" % outMapFileName
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1143