0
|
1 #!/usr/bin/env python
|
|
2 #####################################################################################
|
|
3 #Copyright (C) <2012>
|
|
4 #
|
|
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6 #this software and associated documentation files (the "Software"), to deal in the
|
|
7 #Software without restriction, including without limitation the rights to use, copy,
|
|
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
9 #and to permit persons to whom the Software is furnished to do so, subject to
|
|
10 #the following conditions:
|
|
11 #
|
|
12 #The above copyright notice and this permission notice shall be included in all copies
|
|
13 #or substantial portions of the Software.
|
|
14 #
|
|
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
21 #
|
|
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
|
|
23 # authored by the Huttenhower lab at the Harvard School of Public Health
|
|
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
|
|
25 #####################################################################################
|
|
26 """
|
|
27 Examples
|
|
28 ~~~~~~~~
|
|
29
|
|
30 ``metadata.txt``::
|
|
31
|
|
32 - Y Z
|
|
33 a 1 x
|
|
34 b 0 y
|
|
35 c z
|
|
36
|
|
37 ``data.pcl``::
|
|
38
|
|
39 - a b c
|
|
40 A|B 1 2 3
|
|
41 A|C 4 5 6
|
|
42 D|E 7 8 9
|
|
43
|
|
44 ``Examples``::
|
|
45
|
|
46 $ merge_metadata.py metadata.txt < data.pcl
|
|
47 sample a b c
|
|
48 Y 1 0
|
|
49 Z x y z
|
|
50 A 0.416667 0.466667 0.5
|
|
51 A|B 0.0833333 0.133333 0.166667
|
|
52 A|C 0.333333 0.333333 0.333333
|
|
53 D|E 0.583333 0.533333 0.5
|
|
54
|
|
55 $ merge_metadata.py metadata.txt -t 0 < data.pcl
|
|
56 sample a b c
|
|
57 Y 1 0
|
|
58 Z x y z
|
|
59 A|B 0.0833333 0.133333 0.166667
|
|
60 A|C 0.333333 0.333333 0.333333
|
|
61 D|E 0.583333 0.533333 0.5
|
|
62
|
|
63 $ merge_metadata.py metadata.txt -t 1 < data.pcl
|
|
64 sample a b c
|
|
65 Y 1 0
|
|
66 Z x y z
|
|
67 A 0.416667 0.466667 0.5
|
|
68 D 0.583333 0.533333 0.5
|
|
69
|
|
70 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl
|
|
71 sample a b c
|
|
72 Y 1 0
|
|
73 Z x y z
|
|
74 A|B 1 2 3
|
|
75 A|C 4 5 6
|
|
76 D|E 7 8 9
|
|
77
|
|
78 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl
|
|
79 sample b c
|
|
80 Y 0 -
|
|
81 Z y z
|
|
82 A|B 0.133333 0.166667
|
|
83 A|C 0.333333 0.333333
|
|
84 D|E 0.533333 0.5
|
|
85
|
|
86 $ merge_metadata.py -t 0 < data.pcl
|
|
87 sample a b c
|
|
88 A|B 1 2 3
|
|
89 A|C 4 5 6
|
|
90 D|E 7 8 9
|
|
91
|
|
92 .. testsetup::
|
|
93
|
|
94 from merge_metadata import *
|
|
95 """
|
|
96
|
|
97 import argparse
|
|
98 import blist
|
|
99 import csv
|
|
100 import re
|
|
101 import sys
|
|
102
|
|
103 c_dTarget = 1.0
|
|
104 c_fRound = False
|
|
105
|
|
106 class CClade:
|
|
107
|
|
108 def __init__( self ):
|
|
109
|
|
110 self.m_hashChildren = {}
|
|
111 self.m_adValues = None
|
|
112
|
|
113 def get( self, astrClade ):
|
|
114
|
|
115 return self.m_hashChildren.setdefault(
|
|
116 astrClade[0], CClade( ) ).get( astrClade[1:] ) if astrClade else self
|
|
117
|
|
118 def set( self, adValues ):
|
|
119
|
|
120 self.m_adValues = blist.blist( [0] ) * len( adValues )
|
|
121 for i, d in enumerate( adValues ):
|
|
122 if d:
|
|
123 self.m_adValues[i] = d
|
|
124
|
|
125 def impute( self ):
|
|
126
|
|
127 if not self.m_adValues:
|
|
128 for pChild in self.m_hashChildren.values( ):
|
|
129 adChild = pChild.impute( )
|
|
130 if self.m_adValues:
|
|
131 for i in range( len( adChild or [] ) ):
|
|
132 if adChild[i]:
|
|
133 self.m_adValues[i] += adChild[i]
|
|
134 elif adChild:
|
|
135 self.m_adValues = adChild[:]
|
|
136
|
|
137 return self.m_adValues
|
|
138
|
|
139 def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ):
|
|
140
|
|
141 fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) )
|
|
142 iDepth += 1
|
|
143 setiRet = set()
|
|
144 if self.m_hashChildren:
|
|
145 for strChild, pChild in self.m_hashChildren.items( ):
|
|
146 setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves )
|
|
147 setiRet = set( ( i + 1 ) for i in setiRet )
|
|
148 else:
|
|
149 setiRet.add( 0 )
|
|
150 if iTarget < 0:
|
|
151 if fLeaves:
|
|
152 fHit = -( iTarget + 1 ) in setiRet
|
|
153 else:
|
|
154 fHit = -( iTarget + 1 ) <= max( setiRet )
|
|
155 if astrClade and self.m_adValues and fHit:
|
|
156 hashValues["|".join( astrClade )] = self.m_adValues
|
|
157 return setiRet
|
|
158
|
|
159 def freeze( self, hashValues, iTarget, fLeaves ):
|
|
160
|
|
161 self._freeze( hashValues, iTarget, [], 0, fLeaves )
|
|
162
|
|
163 def _repr( self, strClade ):
|
|
164
|
|
165 strRet = "<"
|
|
166 if strClade:
|
|
167 strRet += "%s %s" % (strClade, self.m_adValues)
|
|
168 if self.m_hashChildren:
|
|
169 strRet += " "
|
|
170 if self.m_hashChildren:
|
|
171 strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) )
|
|
172
|
|
173 return ( strRet + ">" )
|
|
174
|
|
175 def __repr__( self ):
|
|
176
|
|
177 return self._repr( "" )
|
|
178
|
|
179 """
|
|
180 pTree = CClade( )
|
|
181 pTree.get( ("A", "B") ).set( [1, 2, 3] )
|
|
182 pTree.get( ("A", "C") ).set( [4, 5, 6] )
|
|
183 pTree.get( ("D", "E") ).set( [7, 8, 9] )
|
|
184 iTaxa = 0
|
|
185 if iTaxa:
|
|
186 pTree.impute( )
|
|
187 hashFeatures = {}
|
|
188 pTree.freeze( hashFeatures, iTaxa )
|
|
189 print( pTree )
|
|
190 print( hashFeatures )
|
|
191 sys.exit( 0 )
|
|
192 #"""
|
|
193
|
|
194 def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ):
|
|
195 """
|
|
196 Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it.
|
|
197 A pipe-delimited taxonomy hierarchy can also be dynamically added or removed.
|
|
198
|
|
199 :param aastrMetadata: Split lines from which metadata are read.
|
|
200 :type aastrMetadata: collection of string collections
|
|
201 :param aastrData: Split lines from which data are read.
|
|
202 :type aastrData: collection of string collections
|
|
203 :param ostm: Output stream to which joined rows are written.
|
|
204 :type ostm: output stream
|
|
205 :param fNormalize: If true, divide data values by column sums.
|
|
206 :type fNormalize: bool
|
|
207 :param strMissing: Representation for missing metadata values.
|
|
208 :type strMissing: str
|
|
209 :param astrExclude: Lines from which excluded IDs are read.
|
|
210 :type astrExclude: collection of strings
|
|
211 :param dMin: Minimum fraction of maximum value for per-column quality control.
|
|
212 :type dMin: bool
|
|
213 :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change
|
|
214 :type iTaxa: int
|
|
215 :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0
|
|
216 :type fLeaves: bool
|
|
217
|
|
218 Metadata are optional; if not provided, data will be optionally normalized or its taxonomy
|
|
219 modified as requested. Metadata are provided one row per sample, data one column per
|
|
220 sample, both files tab-delimited text with one header row and one header column.
|
|
221
|
|
222 Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding
|
|
223 metadata IDs are given missing values. Missing data values are always treated (and output)
|
|
224 as zero.
|
|
225
|
|
226 Per-column quality control is performed if the requested minimum fraction is greater than
|
|
227 zero. Specifically, for each column i, the row j containing the maximum value d is
|
|
228 identified. If d is less than the minimum fraction of row j's maximum value over all columns,
|
|
229 the entire column i is removed.
|
|
230
|
|
231 A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of
|
|
232 the form A|B|C. All parent clades are computed by default, e.g. A|B and A, save when
|
|
233 they would be identical to a more specific child clade. Negative values are counted from the
|
|
234 bottom (right) of the hierarchy rather than the top. The special value of 0 deactivates
|
|
235 hierarchy calculation.
|
|
236
|
|
237 >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")]
|
|
238 >>> aastrData = [s.split( ) for s in ( \
|
|
239 "- a b c", \
|
|
240 "A|B 1 2 3", \
|
|
241 "A|C 4 5 6", \
|
|
242 "D|E 7 8 9")]
|
|
243 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
|
|
244 sample a b c
|
|
245 Y 1 0
|
|
246 Z x y z
|
|
247 A 0.416667 0.466667 0.5
|
|
248 A|B 0.0833333 0.133333 0.166667
|
|
249 A|C 0.333333 0.333333 0.333333
|
|
250 D|E 0.583333 0.533333 0.5
|
|
251
|
|
252 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
253 sample a b c
|
|
254 Y 1 0
|
|
255 Z x y z
|
|
256 A|B 0.0833333 0.133333 0.166667
|
|
257 A|C 0.333333 0.333333 0.333333
|
|
258 D|E 0.583333 0.533333 0.5
|
|
259
|
|
260 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
261 sample a b c
|
|
262 Y 1 0
|
|
263 Z x y z
|
|
264 A|B 0.0833333 0.133333 0.166667
|
|
265 A|C 0.333333 0.333333 0.333333
|
|
266 D|E 0.583333 0.533333 0.5
|
|
267
|
|
268 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE
|
|
269 sample a b c
|
|
270 Y 1 0
|
|
271 Z x y z
|
|
272 A 0.416667 0.466667 0.5
|
|
273 D 0.583333 0.533333 0.5
|
|
274
|
|
275 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, -1, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
276 sample a b c
|
|
277 Y 1 0
|
|
278 Z x y z
|
|
279 A|B 0.0833333 0.133333 0.166667
|
|
280 A|C 0.333333 0.333333 0.333333
|
|
281 D|E 0.583333 0.533333 0.5
|
|
282
|
|
283 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
284 sample a b c
|
|
285 Y 1 0
|
|
286 Z x y z
|
|
287 A|B 1 2 3
|
|
288 A|C 4 5 6
|
|
289 D|E 7 8 9
|
|
290
|
|
291 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
292 sample b c
|
|
293 Y 0 -
|
|
294 Z y z
|
|
295 A|B 0.133333 0.166667
|
|
296 A|C 0.333333 0.333333
|
|
297 D|E 0.533333 0.5
|
|
298
|
|
299 >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
|
|
300 sample a b c
|
|
301 A|B 1 2 3
|
|
302 A|C 4 5 6
|
|
303 D|E 7 8 9
|
|
304
|
|
305 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
|
|
306 sample a c
|
|
307 Y 1
|
|
308 Z x z
|
|
309 A 0.416667 0.5
|
|
310 A|B 0.0833333 0.166667
|
|
311 A|C 0.333333 0.333333
|
|
312 D|E 0.583333 0.5
|
|
313 """
|
|
314
|
|
315 #Put metadata in a dictionary
|
|
316 #{"First line element",["line element 2","line element 3","line element 4"]}
|
|
317 #If there is no metadata then
|
|
318 astrMetadata = None
|
|
319 hashMetadata = {}
|
|
320 for astrLine in ( aastrMetadata or [] ):
|
|
321 if astrMetadata:
|
|
322 hashMetadata[astrLine[0]] = astrLine[1:]
|
|
323 else:
|
|
324 astrMetadata = astrLine[1:]
|
|
325
|
|
326 astrHeaders = adSeqs = iCol = None
|
|
327 pTree = CClade( )
|
|
328 aastrRaw = []
|
|
329 for astrLine in aastrData:
|
|
330 if astrHeaders:
|
|
331 if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \
|
|
332 ( len( astrLine ) < 2 ):
|
|
333 continue
|
|
334 try:
|
|
335 adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for
|
|
336 strCur in astrLine[iCol:]]
|
|
337 except ValueError:
|
|
338 aastrRaw.append( astrLine )
|
|
339 continue
|
|
340 for i in range( len( adCounts ) ):
|
|
341 adSeqs[i] += adCounts[i]
|
|
342 if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ):
|
|
343 if astrLine[1].find( astrLine[0] ) >= 0:
|
|
344 astrLine[0] = astrLine[1]
|
|
345 else:
|
|
346 astrLine[0] += " " + astrLine[1]
|
|
347 pTree.get( astrLine[0].split( "|" ) ).set( adCounts )
|
|
348 else:
|
|
349 iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1
|
|
350 astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]]
|
|
351 adSeqs = [0] * len( astrHeaders )
|
|
352
|
|
353 if iTaxa:
|
|
354 pTree.impute( )
|
|
355 hashFeatures = {}
|
|
356 pTree.freeze( hashFeatures, iTaxa, fLeaves )
|
|
357 setstrFeatures = hashFeatures.keys( )
|
|
358
|
|
359 afOmit = [False] * len( astrHeaders )
|
|
360 if dMin > 0:
|
|
361 aadData = list(hashFeatures.values( ))
|
|
362 for i in range( len( astrHeaders ) ):
|
|
363 iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] )
|
|
364 dMaxUs = aadData[iMax][i]
|
|
365 dMaxThem = max( aadData[iMax][j] for j in ( range( i ) + range( i + 1, len( astrHeaders ) ) ) )
|
|
366 if dMaxUs < ( dMin * dMaxThem ):
|
|
367 sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] )
|
|
368 afOmit[i] = True
|
|
369
|
|
370 if astrExclude:
|
|
371 setstrExclude = set(s.strip( ) for s in astrExclude)
|
|
372 for i in range( len( astrHeaders ) ):
|
|
373 if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ):
|
|
374 afOmit[i] = True
|
|
375
|
|
376 adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs]
|
|
377 for strFeature, adCounts in hashFeatures.items( ):
|
|
378 for i in range( len( adCounts ) ):
|
|
379 if adCounts[i]:
|
|
380 adCounts[i] *= adMult[i]
|
|
381 if c_fRound:
|
|
382 adCounts[i] = round( adCounts[i] )
|
|
383 for strFeature, adCounts in hashFeatures.items( ):
|
|
384 astrFeature = strFeature.strip( ).split( "|" )
|
|
385 while len( astrFeature ) > 1:
|
|
386 astrFeature = astrFeature[:-1]
|
|
387 strParent = "|".join( astrFeature )
|
|
388 adParent = hashFeatures.get( strParent )
|
|
389 if adParent == adCounts:
|
|
390 del hashFeatures[strParent]
|
|
391 setstrFeatures.remove( strParent )
|
|
392
|
|
393 if astrMetadata:
|
|
394 for i in range( len( astrMetadata ) ):
|
|
395 hashFeatures[astrMetadata[i]] = astrCur = []
|
|
396 for strSubject in astrHeaders:
|
|
397 astrSubject = hashMetadata.get( strSubject )
|
|
398 if not astrSubject:
|
|
399 strSubject = re.sub( '_.*$', "", strSubject )
|
|
400 astrSubject = hashMetadata.get( strSubject, [] )
|
|
401 astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" )
|
|
402
|
|
403 astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures )
|
|
404 aiHeaders = filter( lambda i: not afOmit[i], range( len( astrHeaders ) ) )
|
|
405 csvw = csv.writer( sys.stdout, csv.excel_tab )
|
|
406 csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] )
|
|
407 for iFeature in range( len( astrFeatures ) ):
|
|
408 strFeature = astrFeatures[iFeature]
|
|
409 adFeature = hashFeatures[strFeature]
|
|
410 astrValues = [adFeature[i] for i in aiHeaders]
|
|
411 for i in range( len( astrValues ) ):
|
|
412 strValue = astrValues[i]
|
|
413 if type( strValue ) in (int, float):
|
|
414 astrValues[i] = "%g" % astrValues[i]
|
|
415 elif ( not strValue ) or ( ( type( strValue ) == str ) and
|
|
416 ( len( strValue ) == 0 ) ):
|
|
417 astrValues[i] = strMissing
|
|
418 csvw.writerow( [strFeature] + astrValues )
|
|
419
|
|
420 for astrRaw in aastrRaw:
|
|
421 csvw.writerow( [astrRaw[i] for i in aiHeaders] )
|
|
422
|
|
423 argp = argparse.ArgumentParser( prog = "merge_metadata.py",
|
|
424 description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" +
|
|
425 "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." )
|
|
426 argp.add_argument( "-n", dest = "fNormalize", action = "store_false",
|
|
427 help = "Don't normalize data values by column sums" )
|
|
428 argp.add_argument( "-s", dest = "strMissing", metavar = "missing",
|
|
429 type = str, default = " ",
|
|
430 help = "String representing missing metadata values" )
|
|
431 argp.add_argument( "-m", dest = "dMin", metavar = "min",
|
|
432 type = float, default = 0.01,
|
|
433 help = "Per-column quality control, minimum fraction of maximum value" )
|
|
434 argp.add_argument( "-t", dest = "iTaxa", metavar = "taxa",
|
|
435 type = int, default = -1,
|
|
436 help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" )
|
|
437 argp.add_argument( "-l", dest = "fLeaves", action = "store_true",
|
|
438 help = "Output only leaves, not complete taxonomy" )
|
|
439 argp.add_argument( "-x", dest = "istmExclude", metavar = "exclude.txt",
|
|
440 type = file,
|
|
441 help = "File from which sample IDs to exclude are read" )
|
|
442 argp.add_argument( "istmMetadata", metavar = "metadata.txt",
|
|
443 type = file, nargs = "?",
|
|
444 help = "File from which metadata is read" )
|
|
445 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
|
|
446
|
|
447 def _main( ):
|
|
448 args = argp.parse_args( )
|
|
449 merge_metadata( args.istmMetadata and csv.reader( args.istmMetadata, csv.excel_tab ),
|
|
450 csv.reader( sys.stdin, csv.excel_tab ), sys.stdout, args.fNormalize, args.strMissing,
|
|
451 args.istmExclude, args.dMin, args.iTaxa, args.fLeaves )
|
|
452
|
|
453 if __name__ == "__main__":
|
|
454 _main( )
|