Mercurial > repos > george-weingart > maaslin
Comparison of src/merge_metadata.py at revision 8:e9677425c6c3 (default tip)
Commit message: "Updated the structure of the libraries"
author: george.weingart@gmail.com
date: Mon, 09 Feb 2015 12:17:40 -0500
parents: e0b5980139d9
children: (none)
Diff legend: equal / deleted / inserted / replaced — comparing 7:c72e14eabb08 against 8:e9677425c6c3.
1 #!/usr/bin/env python | |
2 ##################################################################################### | |
3 #Copyright (C) <2012> | |
4 # | |
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
6 #this software and associated documentation files (the "Software"), to deal in the | |
7 #Software without restriction, including without limitation the rights to use, copy, | |
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
9 #and to permit persons to whom the Software is furnished to do so, subject to | |
10 #the following conditions: | |
11 # | |
12 #The above copyright notice and this permission notice shall be included in all copies | |
13 #or substantial portions of the Software. | |
14 # | |
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
21 # | |
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
23 # authored by the Huttenhower lab at the Harvard School of Public Health | |
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
25 ##################################################################################### | |
26 """ | |
27 Examples | |
28 ~~~~~~~~ | |
29 | |
30 ``metadata.txt``:: | |
31 | |
32 - Y Z | |
33 a 1 x | |
34 b 0 y | |
35 c z | |
36 | |
37 ``data.pcl``:: | |
38 | |
39 - a b c | |
40 A|B 1 2 3 | |
41 A|C 4 5 6 | |
42 D|E 7 8 9 | |
43 | |
44 ``Examples``:: | |
45 | |
46 $ merge_metadata.py metadata.txt < data.pcl | |
47 sample a b c | |
48 Y 1 0 | |
49 Z x y z | |
50 A 0.416667 0.466667 0.5 | |
51 A|B 0.0833333 0.133333 0.166667 | |
52 A|C 0.333333 0.333333 0.333333 | |
53 D|E 0.583333 0.533333 0.5 | |
54 | |
55 $ merge_metadata.py metadata.txt -t 0 < data.pcl | |
56 sample a b c | |
57 Y 1 0 | |
58 Z x y z | |
59 A|B 0.0833333 0.133333 0.166667 | |
60 A|C 0.333333 0.333333 0.333333 | |
61 D|E 0.583333 0.533333 0.5 | |
62 | |
63 $ merge_metadata.py metadata.txt -t 1 < data.pcl | |
64 sample a b c | |
65 Y 1 0 | |
66 Z x y z | |
67 A 0.416667 0.466667 0.5 | |
68 D 0.583333 0.533333 0.5 | |
69 | |
70 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl | |
71 sample a b c | |
72 Y 1 0 | |
73 Z x y z | |
74 A|B 1 2 3 | |
75 A|C 4 5 6 | |
76 D|E 7 8 9 | |
77 | |
78 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl | |
79 sample b c | |
80 Y 0 - | |
81 Z y z | |
82 A|B 0.133333 0.166667 | |
83 A|C 0.333333 0.333333 | |
84 D|E 0.533333 0.5 | |
85 | |
86 $ merge_metadata.py -t 0 < data.pcl | |
87 sample a b c | |
88 A|B 1 2 3 | |
89 A|C 4 5 6 | |
90 D|E 7 8 9 | |
91 | |
92 .. testsetup:: | |
93 | |
94 from merge_metadata import * | |
95 """ | |
96 | |
97 import argparse | |
98 import blist | |
99 import csv | |
100 import re | |
101 import sys | |
102 | |
# Normalization target used by merge_metadata: when normalizing, each data
# column is multiplied by (c_dTarget / column sum), so columns sum to this.
c_dTarget = 1.0
# When True, normalized values are rounded to the nearest integer before output.
c_fRound = False
105 | |
class CClade:
    """A node in a pipe-delimited taxonomy tree (e.g. "A|B|C").

    Each node holds an optional per-sample value vector (``m_adValues``)
    and a dictionary of child nodes keyed by the next clade-name
    component (``m_hashChildren``).
    """

    def __init__( self ):
        # Children keyed by the next clade-name component, e.g. "B" under "A".
        self.m_hashChildren = {}
        # Per-sample value vector; None until set() or impute() fills it in.
        self.m_adValues = None

    def get( self, astrClade ):
        """Return (creating intermediate nodes on demand) the descendant
        addressed by the sequence of clade names ``astrClade``; an empty
        sequence returns this node itself."""
        if not astrClade:
            return self
        return self.m_hashChildren.setdefault(
            astrClade[0], CClade( ) ).get( astrClade[1:] )

    def set( self, adValues ):
        """Store a copy of ``adValues`` on this node, mapping falsy entries
        (None, "", 0) to 0."""
        # NOTE: originally a blist.blist; a plain list behaves identically for
        # the indexing/slicing/equality operations used here, and drops the
        # unmaintained third-party dependency.
        self.m_adValues = [0] * len( adValues )
        for i, d in enumerate( adValues ):
            if d:
                self.m_adValues[i] = d

    def impute( self ):
        """Recursively fill in missing internal-node vectors as the
        element-wise sum of their children's vectors; return this node's
        (possibly just-computed) vector."""
        if not self.m_adValues:
            for pChild in self.m_hashChildren.values( ):
                adChild = pChild.impute( )
                if self.m_adValues:
                    # Accumulate subsequent children element-wise.
                    for i in range( len( adChild or [] ) ):
                        if adChild[i]:
                            self.m_adValues[i] += adChild[i]
                elif adChild:
                    # First child with data seeds this node's vector (copied,
                    # so later normalization never aliases the child's list).
                    self.m_adValues = adChild[:]

        return self.m_adValues

    def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ):
        """Recursively collect clade vectors into ``hashValues``.

        Returns the set of heights (distances from the deepest leaves) seen
        at this node; heights drive negative, from-the-bottom targeting.
        """
        # Positive/zero targets are resolved top-down by depth: 0 = everything,
        # fLeaves = exactly that depth, otherwise everything up to that depth.
        fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) )
        iDepth += 1
        setiRet = set( )
        if self.m_hashChildren:
            for strChild, pChild in self.m_hashChildren.items( ):
                setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves )
            # This node is one level above everything its children reported.
            setiRet = set( ( i + 1 ) for i in setiRet )
        else:
            setiRet.add( 0 )
        if iTarget < 0:
            # Negative targets count from the leaves: -1 = leaves, -2 = their
            # parents, and so on.
            if fLeaves:
                fHit = -( iTarget + 1 ) in setiRet
            else:
                fHit = -( iTarget + 1 ) <= max( setiRet )
        if astrClade and self.m_adValues and fHit:
            hashValues["|".join( astrClade )] = self.m_adValues
        return setiRet

    def freeze( self, hashValues, iTarget, fLeaves ):
        """Public entry point for _freeze: collect clade name -> vector pairs
        into ``hashValues`` according to the taxonomy depth ``iTarget``."""
        self._freeze( hashValues, iTarget, [], 0, fLeaves )

    def _repr( self, strClade ):
        """Build a compact "<name values children...>" debug rendering."""
        strRet = "<"
        if strClade:
            strRet += "%s %s" % (strClade, self.m_adValues)
            if self.m_hashChildren:
                strRet += " "
        if self.m_hashChildren:
            strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) )

        return ( strRet + ">" )

    def __repr__( self ):

        return self._repr( "" )
178 | |
# Disabled ad-hoc smoke test for CClade, kept inert as a bare string literal.
# NOTE(review): if re-enabled, freeze() now takes a third fLeaves argument
# that this snippet does not pass — confirm before running.
"""
pTree = CClade( )
pTree.get( ("A", "B") ).set( [1, 2, 3] )
pTree.get( ("A", "C") ).set( [4, 5, 6] )
pTree.get( ("D", "E") ).set( [7, 8, 9] )
iTaxa = 0
if iTaxa:
    pTree.impute( )
hashFeatures = {}
pTree.freeze( hashFeatures, iTaxa )
print( pTree )
print( hashFeatures )
sys.exit( 0 )
#"""
193 | |
def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ):
    """
    Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it.
    A pipe-delimited taxonomy hierarchy can also be dynamically added or removed.

    :param aastrMetadata: Split lines from which metadata are read.
    :type aastrMetadata: collection of string collections
    :param aastrData: Split lines from which data are read.
    :type aastrData: collection of string collections
    :param ostm: Output stream to which joined rows are written.
    :type ostm: output stream
    :param fNormalize: If true, divide data values by column sums.
    :type fNormalize: bool
    :param strMissing: Representation for missing metadata values.
    :type strMissing: str
    :param astrExclude: Lines from which excluded IDs are read.
    :type astrExclude: collection of strings
    :param dMin: Minimum fraction of maximum value for per-column quality control.
    :type dMin: float
    :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change
    :type iTaxa: int
    :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0
    :type fLeaves: bool

    Metadata are optional; if not provided, data will be optionally normalized or its taxonomy
    modified as requested.  Metadata are provided one row per sample, data one column per
    sample, both files tab-delimited text with one header row and one header column.

    Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding
    metadata IDs are given missing values.  Missing data values are always treated (and output)
    as zero.

    Per-column quality control is performed if the requested minimum fraction is greater than
    zero.  Specifically, for each column i, the row j containing the maximum value d is
    identified.  If d is less than the minimum fraction of row j's maximum value over all columns,
    the entire column i is removed.

    A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of
    the form A|B|C.  All parent clades are computed by default, e.g. A|B and A, save when
    they would be identical to a more specific child clade.  Negative values are counted from the
    bottom (right) of the hierarchy rather than the top.  The special value of 0 deactivates
    hierarchy calculation.

    >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")]
    >>> aastrData = [s.split( ) for s in ( \
        "- a b c", \
        "A|B 1 2 3", \
        "A|C 4 5 6", \
        "D|E 7 8 9")]
    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    Y 1 0
    Z x y z
    A 0.416667 0.466667 0.5
    A|B 0.0833333 0.133333 0.166667
    A|C 0.333333 0.333333 0.333333
    D|E 0.583333 0.533333 0.5

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    Y 1 0
    Z x y z
    A|B 0.0833333 0.133333 0.166667
    A|C 0.333333 0.333333 0.333333
    D|E 0.583333 0.533333 0.5

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    Y 1 0
    Z x y z
    A|B 0.0833333 0.133333 0.166667
    A|C 0.333333 0.333333 0.333333
    D|E 0.583333 0.533333 0.5

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    Y 1 0
    Z x y z
    A 0.416667 0.466667 0.5
    D 0.583333 0.533333 0.5

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    Y 1 0
    Z x y z
    A|B 1 2 3
    A|C 4 5 6
    D|E 7 8 9

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE
    sample b c
    Y 0 -
    Z y z
    A|B 0.133333 0.166667
    A|C 0.333333 0.333333
    D|E 0.533333 0.5

    >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
    sample a b c
    A|B 1 2 3
    A|C 4 5 6
    D|E 7 8 9

    >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
    sample a c
    Y 1
    Z x z
    A 0.416667 0.5
    A|B 0.0833333 0.166667
    A|C 0.333333 0.333333
    D|E 0.583333 0.5
    """

    # Metadata: the first row is the header of metadata field names; each
    # subsequent row maps a sample ID to its metadata values.
    astrMetadata = None
    hashMetadata = {}
    for astrLine in ( aastrMetadata or [] ):
        if astrMetadata:
            hashMetadata[astrLine[0]] = astrLine[1:]
        else:
            astrMetadata = astrLine[1:]

    # Data: the first row is the sample-ID header; numeric rows are loaded
    # into the taxonomy tree, non-numeric rows are passed through verbatim.
    astrHeaders = adSeqs = iCol = None
    pTree = CClade( )
    aastrRaw = []
    for astrLine in aastrData:
        if astrHeaders:
            # Skip summary rows and malformed (too-short) lines.
            if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \
                ( len( astrLine ) < 2 ):
                continue
            try:
                # Blank cells count as zero; anything non-numeric makes the
                # whole row "raw" (emitted untouched at the end).
                adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for
                    strCur in astrLine[iCol:]]
            except ValueError:
                aastrRaw.append( astrLine )
                continue
            # Accumulate per-column sums for normalization.
            for i in range( len( adCounts ) ):
                adSeqs[i] += adCounts[i]
            if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ):
                # Merge the ID and NAME columns into a single feature label.
                if astrLine[1].find( astrLine[0] ) >= 0:
                    astrLine[0] = astrLine[1]
                else:
                    astrLine[0] += " " + astrLine[1]
            pTree.get( astrLine[0].split( "|" ) ).set( adCounts )
        else:
            # Header row: an optional second "NAME" column shifts the data.
            iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1
            astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]]
            adSeqs = [0] * len( astrHeaders )

    if iTaxa:
        pTree.impute( )
    hashFeatures = {}
    pTree.freeze( hashFeatures, iTaxa, fLeaves )
    # Materialize as a real set: redundant parents are removed from it below
    # (Python 3 dict views have no remove()).
    setstrFeatures = set( hashFeatures.keys( ) )

    # Per-column quality control: drop column i when its maximum value is
    # below dMin times the same row's maximum over the other columns.
    afOmit = [False] * len( astrHeaders )
    if dMin > 0:
        aadData = list( hashFeatures.values( ) )
        for i in range( len( astrHeaders ) ):
            iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] )
            dMaxUs = aadData[iMax][i]
            # Maximum of the winning row over all *other* columns
            # (range objects cannot be concatenated in Python 3).
            dMaxThem = max( aadData[iMax][j] for j in range( len( astrHeaders ) ) if j != i )
            if dMaxUs < ( dMin * dMaxThem ):
                sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] )
                afOmit[i] = True

    # Explicitly requested sample exclusions.
    if astrExclude:
        setstrExclude = set(s.strip( ) for s in astrExclude)
        for i in range( len( astrHeaders ) ):
            if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ):
                afOmit[i] = True

    # Scale every non-zero value so each column sums to c_dTarget.
    adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs]
    for strFeature, adCounts in hashFeatures.items( ):
        for i in range( len( adCounts ) ):
            if adCounts[i]:
                adCounts[i] *= adMult[i]
                if c_fRound:
                    adCounts[i] = round( adCounts[i] )
    # Drop parent clades whose vector is identical to a child's; iterate a
    # snapshot since entries are deleted mid-loop (Python 3 forbids mutating
    # a dict while iterating its live view).
    for strFeature, adCounts in list( hashFeatures.items( ) ):
        astrFeature = strFeature.strip( ).split( "|" )
        while len( astrFeature ) > 1:
            astrFeature = astrFeature[:-1]
            strParent = "|".join( astrFeature )
            adParent = hashFeatures.get( strParent )
            if adParent == adCounts:
                del hashFeatures[strParent]
                setstrFeatures.remove( strParent )

    # Join metadata rows, matching sample IDs (optionally with any "_suffix"
    # stripped) against the data's column headers.
    if astrMetadata:
        for i in range( len( astrMetadata ) ):
            hashFeatures[astrMetadata[i]] = astrCur = []
            for strSubject in astrHeaders:
                astrSubject = hashMetadata.get( strSubject )
                if not astrSubject:
                    strSubject = re.sub( '_.*$', "", strSubject )
                    astrSubject = hashMetadata.get( strSubject, [] )
                astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" )

    astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures )
    # A list, not a one-shot filter() iterator: reused for every output row.
    aiHeaders = [i for i in range( len( astrHeaders ) ) if not afOmit[i]]
    # Honor the ostm parameter rather than hard-coding sys.stdout.
    csvw = csv.writer( ostm, csv.excel_tab )
    csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] )
    for iFeature in range( len( astrFeatures ) ):
        strFeature = astrFeatures[iFeature]
        adFeature = hashFeatures[strFeature]
        astrValues = [adFeature[i] for i in aiHeaders]
        for i in range( len( astrValues ) ):
            strValue = astrValues[i]
            if type( strValue ) in (int, float):
                # Compact numeric formatting, e.g. 0.0833333.
                astrValues[i] = "%g" % astrValues[i]
            elif ( not strValue ) or ( ( type( strValue ) == str ) and
                ( len( strValue ) == 0 ) ):
                astrValues[i] = strMissing
        csvw.writerow( [strFeature] + astrValues )

    # Non-numeric data rows are emitted verbatim after everything else.
    for astrRaw in aastrRaw:
        csvw.writerow( [astrRaw[i] for i in aiHeaders] )
422 | |
# Command-line interface definition; parsed by _main() below.
argp = argparse.ArgumentParser( prog = "merge_metadata.py",
    description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" +
    "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." )
argp.add_argument( "-n",        dest = "fNormalize",    action = "store_false",
    help = "Don't normalize data values by column sums" )
argp.add_argument( "-s",        dest = "strMissing",    metavar = "missing",
    type = str,     default = " ",
    help = "String representing missing metadata values" )
argp.add_argument( "-m",        dest = "dMin",          metavar = "min",
    type = float,   default = 0.01,
    help = "Per-column quality control, minimum fraction of maximum value" )
argp.add_argument( "-t",        dest = "iTaxa",         metavar = "taxa",
    type = int,     default = -1,
    help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" )
argp.add_argument( "-l",        dest = "fLeaves",       action = "store_true",
    help = "Output only leaves, not complete taxonomy" )
# argparse.FileType replaces the Python 2-only builtin "file", which no
# longer exists in Python 3; behavior (open for reading) is the same.
argp.add_argument( "-x",        dest = "istmExclude",   metavar = "exclude.txt",
    type = argparse.FileType( "r" ),
    help = "File from which sample IDs to exclude are read" )
argp.add_argument( "istmMetadata",  metavar = "metadata.txt",
    type = argparse.FileType( "r" ),    nargs = "?",
    help = "File from which metadata is read" )
# Prepend the CLI usage text to the module docstring; guard against __doc__
# being None (e.g. when docstrings are stripped with -OO).
__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + ( __doc__ or "" )
446 | |
def _main( ):
    """Parse command-line arguments and merge stdin data with the optional
    metadata file, writing the joined matrix to stdout."""
    pArgs = argp.parse_args( )
    # Metadata is optional: pass None through unless a file was given.
    istmMeta = csv.reader( pArgs.istmMetadata, csv.excel_tab ) if pArgs.istmMetadata else None
    merge_metadata(
        istmMeta,
        csv.reader( sys.stdin, csv.excel_tab ),
        sys.stdout,
        pArgs.fNormalize,
        pArgs.strMissing,
        pArgs.istmExclude,
        pArgs.dMin,
        pArgs.iTaxa,
        pArgs.fLeaves )

if __name__ == "__main__":
    _main( )