Mercurial > repos > yufei-luo > s_mart
comparison smart_toolShed/commons/core/coord/MatchUtils.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0f8dcca02ed |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 import math | |
32 import os | |
33 import sys | |
34 from commons.core.coord.Match import Match | |
35 from commons.core.checker.RepetException import RepetException | |
36 | |
37 ## Static methods for the manipulation of Match instances | |
38 # | |
39 class MatchUtils ( object ): | |
40 | |
41 ## Return a list with Match instances from the given file | |
42 # | |
43 # @param inFile name of a file in the Match format | |
44 # @return a list of Match instances | |
45 # | |
46 def getMatchListFromFile(inFile ): | |
47 lMatchInstances = [] | |
48 inFileHandler = open( inFile, "r" ) | |
49 while True: | |
50 line = inFileHandler.readline() | |
51 if line == "": | |
52 break | |
53 if line[0:10] == "query.name": | |
54 continue | |
55 m = Match() | |
56 m.setFromString( line ) | |
57 lMatchInstances.append( m ) | |
58 inFileHandler.close() | |
59 return lMatchInstances | |
60 | |
61 getMatchListFromFile = staticmethod( getMatchListFromFile ) | |
62 | |
63 ## Split a Match list in several Match lists according to the subject | |
64 # | |
65 # @param lMatches a list of Match instances | |
66 # @return a dictionary which keys are subject names and values Match lists | |
67 # | |
68 def getDictOfListsWithSubjectAsKey( lMatches ): | |
69 dSubject2MatchList = {} | |
70 for iMatch in lMatches: | |
71 if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ): | |
72 dSubject2MatchList[ iMatch.range_subject.seqname ] = [] | |
73 dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch ) | |
74 return dSubject2MatchList | |
75 | |
76 getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey ) | |
77 | |
78 ## Split a Match list in several Match lists according to the query | |
79 # | |
80 # @param lMatches a list of Match instances | |
81 # @return a dictionary which keys are query names and values Match lists | |
82 # | |
83 def getDictOfListsWithQueryAsKey ( lMatches ): | |
84 dQuery2MatchList = {} | |
85 for iMatch in lMatches: | |
86 if not dQuery2MatchList.has_key( iMatch.range_query.seqname ): | |
87 dQuery2MatchList[ iMatch.range_query.seqname ] = [] | |
88 dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch ) | |
89 return dQuery2MatchList | |
90 | |
91 getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey ) | |
92 | |
93 ## Write Match instances contained in the given list | |
94 # | |
95 # @param lMatches a list of Match instances | |
96 # @param fileName name of the file to write the Match instances | |
97 # @param mode the open mode of the file ""w"" or ""a"" | |
98 # | |
99 def writeListInFile( lMatches, fileName, mode="w", header=None ): | |
100 fileHandler = open( fileName, mode ) | |
101 if header: | |
102 fileHandler.write( header ) | |
103 for iMatch in lMatches: | |
104 iMatch.write( fileHandler ) | |
105 fileHandler.close() | |
106 | |
107 writeListInFile = staticmethod( writeListInFile ) | |
108 | |
109 ## Give path id list from a list of Match instances | |
110 # | |
111 # @param lMatch list of Match instances | |
112 # | |
113 # @return lId integer list | |
114 # | |
115 def getIdListFromMatchList(lMatch): | |
116 lId = [] | |
117 for iMatch in lMatch: | |
118 lId.append(iMatch.id) | |
119 return lId | |
120 | |
121 getIdListFromMatchList = staticmethod(getIdListFromMatchList) | |
122 | |
123 ## Remove duplicated matches in a match list | |
124 # ## replace old PyRepet.MatchDB.rmvDoublons() | |
125 # @param lMatch list of Match instances | |
126 # | |
127 # @return lMatchesUniq match unique list | |
128 # | |
129 def rmvDuplicateMatches(lMatch): | |
130 lMatchesUniq = [] | |
131 for match in lMatch: | |
132 if len(lMatchesUniq) == 0: | |
133 lMatchesUniq.append( match ) | |
134 else: | |
135 nbDoublons = 0 | |
136 for m in lMatchesUniq: | |
137 if match.isDoublonWith( m ): | |
138 nbDoublons += 1 | |
139 if nbDoublons == 0: | |
140 lMatchesUniq.append( match ) | |
141 | |
142 for match1 in lMatchesUniq: | |
143 for match2 in lMatchesUniq: | |
144 if match1.id != match2.id: | |
145 if match1.isDoublonWith( match2 ): | |
146 raise RepetException ( "*** Error: doublon not removed" ) | |
147 return lMatchesUniq | |
148 | |
149 rmvDuplicateMatches = staticmethod(rmvDuplicateMatches) | |
150 | |
151 ## Return the list of queries 'included' in subjects when two different databanks are used. | |
152 ##replace old pyRepet.MatchDB.filterDiffQrySbj() | |
153 # | |
154 # @param iBioseqDB bioseqDB databank of queries | |
155 # | |
156 # @param thresIdentity float identity threshold | |
157 # | |
158 # @param thresLength float length threshold | |
159 # | |
160 # @param verbose int verbosity | |
161 # | |
162 # @return lMatches match list to keep according to length and identity thresholds | |
163 #TODO: don't take into account match for sequence against itself. To do ? | |
164 def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ): | |
165 if verbose > 0: | |
166 print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush() | |
167 | |
168 thresIdentityPerc = math.floor( thresIdentity*100 ) | |
169 lQryToKeep = [] | |
170 dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) | |
171 | |
172 for seqH in iBioseqDB.idx.keys(): | |
173 # keep it if it has no match | |
174 if not dQry2Matches.has_key( seqH ): | |
175 if seqH not in lQryToKeep: | |
176 lQryToKeep.append( seqH ) | |
177 else: | |
178 isConditionsMet = False | |
179 for match in dQry2Matches[ seqH ]: | |
180 # check if they are above the thresholds | |
181 if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: | |
182 isConditionsMet = True | |
183 break | |
184 if not isConditionsMet and seqH not in lQryToKeep: | |
185 lQryToKeep.append( seqH ) | |
186 return lQryToKeep | |
187 | |
188 filterDiffQrySbj = staticmethod(filterDiffQrySbj) | |
189 | |
190 ## Count the number of distinct matches involved in at least one match above the thresholds. | |
191 ##replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() | |
192 # @param thresIdentity float identity threshold | |
193 # | |
194 # @param thresLength float length threshold | |
195 # | |
196 def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ): | |
197 thresIdentityPerc = math.floor( thresIdentity*100 ) | |
198 countSbj = 0 | |
199 if whatToCount.lower() == "query": | |
200 dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) | |
201 else: | |
202 dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) | |
203 | |
204 for qry in dMatches.keys(): | |
205 countMatch = 0 | |
206 for match in dMatches[ qry ]: | |
207 | |
208 if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength: | |
209 countMatch += 1 | |
210 if countMatch > 0: | |
211 countSbj += 1 | |
212 return countSbj | |
213 | |
214 getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh) | |
215 | |
216 ## Convert a 'match' file (output from Matcher) into an 'align' file | |
217 ## replace old parser.tab2align | |
218 # | |
219 # @param inFileName a string input file name | |
220 # | |
221 def convertMatchFileToAlignFile(inFileName): | |
222 basename = os.path.splitext(inFileName)[0] | |
223 outFileName = "%s.align" % basename | |
224 outFile = open(outFileName, "w") | |
225 | |
226 lMatches = MatchUtils.getMatchListFromFile(inFileName) | |
227 | |
228 for match in lMatches: | |
229 string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() ) | |
230 outFile.write( string ) | |
231 | |
232 outFile.close() | |
233 | |
234 convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile) | |
235 | |
236 ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) | |
237 # Use coverage on query for arc value | |
238 # | |
239 # @param matchFileName string input match file name | |
240 # @param outFileName string output abc file name | |
241 # @param coverage float query coverage filter threshold | |
242 # | |
243 @staticmethod | |
244 def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): | |
245 with open(matchFileName) as inF: | |
246 with open(outFileName, "w") as outF: | |
247 inF.readline() | |
248 inLine = inF.readline() | |
249 while inLine: | |
250 splittedLine = inLine.split("\t") | |
251 if float(splittedLine[4]) >= coverage: | |
252 outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) | |
253 outLine += "\n" | |
254 outF.write(outLine) | |
255 inLine = inF.readline() | |
256 | |
257 ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. | |
258 ## replace old parser.tabnum2id | |
259 # | |
260 # @param fileName a string input file name | |
261 # @param outputFileName a string output file name (optional) | |
262 # | |
263 def generateMatchFileWithNewPathId(fileName, outputFileName=None): | |
264 if outputFileName is None: | |
265 outFile = open(fileName, "w") | |
266 else: | |
267 outFile = open(outputFileName, "w") | |
268 outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") | |
269 | |
270 lMatches = MatchUtils.getMatchListFromFile(fileName) | |
271 count = 1 | |
272 dMatchKeyIdcount = {} | |
273 | |
274 for match in lMatches: | |
275 key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() | |
276 if not key_id in dMatchKeyIdcount.keys(): | |
277 newPath = count | |
278 count += 1 | |
279 dMatchKeyIdcount[ key_id ] = newPath | |
280 else: | |
281 newPath = dMatchKeyIdcount[ key_id ] | |
282 | |
283 match.id = newPath | |
284 outFile.write( match.toString()+"\n" ) | |
285 outFile.close() | |
286 | |
287 generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId) | |
288 |