Mercurial > repos > george-weingart > maaslin
view src/merge_metadata.py @ 8:e9677425c6c3 default tip
Updated the structure of the libraries
author | george.weingart@gmail.com |
---|---|
date | Mon, 09 Feb 2015 12:17:40 -0500 |
parents | e0b5980139d9 |
children |
line wrap: on
line source
#!/usr/bin/env python ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), # authored by the Huttenhower lab at the Harvard School of Public Health # (contact Timothy Tickle, ttickle@hsph.harvard.edu). ##################################################################################### """ Examples ~~~~~~~~ ``metadata.txt``:: - Y Z a 1 x b 0 y c z ``data.pcl``:: - a b c A|B 1 2 3 A|C 4 5 6 D|E 7 8 9 ``Examples``:: $ merge_metadata.py metadata.txt < data.pcl sample a b c Y 1 0 Z x y z A 0.416667 0.466667 0.5 A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 $ merge_metadata.py metadata.txt -t 0 < data.pcl sample a b c Y 1 0 Z x y z A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 $ merge_metadata.py metadata.txt -t 1 < data.pcl sample a b c Y 1 0 Z x y z A 0.416667 0.466667 0.5 D 0.583333 0.533333 0.5 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl sample a b c Y 1 0 Z x y z A|B 1 2 3 A|C 4 5 6 D|E 7 8 9 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl sample b c Y 0 - Z y z A|B 0.133333 0.166667 A|C 0.333333 0.333333 D|E 0.533333 0.5 $ merge_metadata.py -t 0 < data.pcl sample a b c A|B 1 2 3 A|C 4 5 6 D|E 7 8 9 .. testsetup:: from merge_metadata import * """ import argparse import blist import csv import re import sys c_dTarget = 1.0 c_fRound = False class CClade: def __init__( self ): self.m_hashChildren = {} self.m_adValues = None def get( self, astrClade ): return self.m_hashChildren.setdefault( astrClade[0], CClade( ) ).get( astrClade[1:] ) if astrClade else self def set( self, adValues ): self.m_adValues = blist.blist( [0] ) * len( adValues ) for i, d in enumerate( adValues ): if d: self.m_adValues[i] = d def impute( self ): if not self.m_adValues: for pChild in self.m_hashChildren.values( ): adChild = pChild.impute( ) if self.m_adValues: for i in range( len( adChild or [] ) ): if adChild[i]: self.m_adValues[i] += adChild[i] elif adChild: self.m_adValues = adChild[:] return self.m_adValues def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ): fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) ) iDepth += 1 setiRet = set() if self.m_hashChildren: for strChild, pChild in self.m_hashChildren.items( ): setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves ) setiRet = set( ( i + 1 ) for i in setiRet ) else: setiRet.add( 0 ) if iTarget < 0: if fLeaves: fHit = -( iTarget + 1 ) in setiRet else: fHit = -( iTarget + 1 ) <= max( setiRet ) if astrClade and self.m_adValues and fHit: hashValues["|".join( astrClade )] = self.m_adValues return setiRet def freeze( self, hashValues, iTarget, fLeaves ): self._freeze( hashValues, iTarget, [], 0, fLeaves ) def _repr( self, strClade ): strRet = "<" if strClade: strRet += "%s %s" % (strClade, self.m_adValues) if self.m_hashChildren: strRet += " " if self.m_hashChildren: strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) ) return ( strRet + ">" ) def __repr__( self ): return self._repr( "" ) """ pTree = CClade( ) pTree.get( ("A", "B") ).set( [1, 2, 3] ) pTree.get( ("A", "C") ).set( [4, 5, 6] ) pTree.get( ("D", "E") ).set( [7, 8, 9] ) iTaxa = 0 if iTaxa: pTree.impute( ) hashFeatures = {} pTree.freeze( hashFeatures, iTaxa ) print( pTree ) print( hashFeatures ) sys.exit( 0 ) #""" def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ): """ Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it. A pipe-delimited taxonomy hierarchy can also be dynamically added or removed. :param aastrMetadata: Split lines from which metadata are read. :type aastrMetadata: collection of string collections :param aastrData: Split lines from which data are read. :type aastrData: collection of string collections :param ostm: Output stream to which joined rows are written. :type ostm: output stream :param fNormalize: If true, divide data values by column sums. :type fNormalize: bool :param strMissing: Representation for missing metadata values. :type strMissing: str :param astrExclude: Lines from which excluded IDs are read. :type astrExclude: collection of strings :param dMin: Minimum fraction of maximum value for per-column quality control. :type dMin: bool :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change :type iTaxa: int :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0 :type fLeaves: bool Metadata are optional; if not provided, data will be optionally normalized or its taxonomy modified as requested. Metadata are provided one row per sample, data one column per sample, both files tab-delimited text with one header row and one header column. Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding metadata IDs are given missing values. Missing data values are always treated (and output) as zero. Per-column quality control is performed if the requested minimum fraction is greater than zero. Specifically, for each column i, the row j containing the maximum value d is identified. If d is less than the minimum fraction of row j's maximum value over all columns, the entire column i is removed. A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of the form A|B|C. All parent clades are computed by default, e.g. A|B and A, save when they would be identical to a more specific child clade. Negative values are counted from the bottom (right) of the hierarchy rather than the top. The special value of 0 deactivates hierarchy calculation. >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")] >>> aastrData = [s.split( ) for s in ( \ "- a b c", \ "A|B 1 2 3", \ "A|C 4 5 6", \ "D|E 7 8 9")] >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A 0.416667 0.466667 0.5 A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A 0.416667 0.466667 0.5 D 0.583333 0.533333 0.5 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, -1, True ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A|B 0.0833333 0.133333 0.166667 A|C 0.333333 0.333333 0.333333 D|E 0.583333 0.533333 0.5 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE sample a b c Y 1 0 Z x y z A|B 1 2 3 A|C 4 5 6 D|E 7 8 9 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE sample b c Y 0 - Z y z A|B 0.133333 0.166667 A|C 0.333333 0.333333 D|E 0.533333 0.5 >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE sample a b c A|B 1 2 3 A|C 4 5 6 D|E 7 8 9 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE sample a c Y 1 Z x z A 0.416667 0.5 A|B 0.0833333 0.166667 A|C 0.333333 0.333333 D|E 0.583333 0.5 """ #Put metadata in a dictionary #{"First line element",["line element 2","line element 3","line element 4"]} #If there is no metadata then astrMetadata = None hashMetadata = {} for astrLine in ( aastrMetadata or [] ): if astrMetadata: hashMetadata[astrLine[0]] = astrLine[1:] else: astrMetadata = astrLine[1:] astrHeaders = adSeqs = iCol = None pTree = CClade( ) aastrRaw = [] for astrLine in aastrData: if astrHeaders: if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \ ( len( astrLine ) < 2 ): continue try: adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for strCur in astrLine[iCol:]] except ValueError: aastrRaw.append( astrLine ) continue for i in range( len( adCounts ) ): adSeqs[i] += adCounts[i] if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ): if astrLine[1].find( astrLine[0] ) >= 0: astrLine[0] = astrLine[1] else: astrLine[0] += " " + astrLine[1] pTree.get( astrLine[0].split( "|" ) ).set( adCounts ) else: iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1 astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]] adSeqs = [0] * len( astrHeaders ) if iTaxa: pTree.impute( ) hashFeatures = {} pTree.freeze( hashFeatures, iTaxa, fLeaves ) setstrFeatures = hashFeatures.keys( ) afOmit = [False] * len( astrHeaders ) if dMin > 0: aadData = list(hashFeatures.values( )) for i in range( len( astrHeaders ) ): iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] ) dMaxUs = aadData[iMax][i] dMaxThem = max( aadData[iMax][j] for j in ( range( i ) + range( i + 1, len( astrHeaders ) ) ) ) if dMaxUs < ( dMin * dMaxThem ): sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] ) afOmit[i] = True if astrExclude: setstrExclude = set(s.strip( ) for s in astrExclude) for i in range( len( astrHeaders ) ): if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ): afOmit[i] = True adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs] for strFeature, adCounts in hashFeatures.items( ): for i in range( len( adCounts ) ): if adCounts[i]: adCounts[i] *= adMult[i] if c_fRound: adCounts[i] = round( adCounts[i] ) for strFeature, adCounts in hashFeatures.items( ): astrFeature = strFeature.strip( ).split( "|" ) while len( astrFeature ) > 1: astrFeature = astrFeature[:-1] strParent = "|".join( astrFeature ) adParent = hashFeatures.get( strParent ) if adParent == adCounts: del hashFeatures[strParent] setstrFeatures.remove( strParent ) if astrMetadata: for i in range( len( astrMetadata ) ): hashFeatures[astrMetadata[i]] = astrCur = [] for strSubject in astrHeaders: astrSubject = hashMetadata.get( strSubject ) if not astrSubject: strSubject = re.sub( '_.*$', "", strSubject ) astrSubject = hashMetadata.get( strSubject, [] ) astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" ) astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures ) aiHeaders = filter( lambda i: not afOmit[i], range( len( astrHeaders ) ) ) csvw = csv.writer( sys.stdout, csv.excel_tab ) csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] ) for iFeature in range( len( astrFeatures ) ): strFeature = astrFeatures[iFeature] adFeature = hashFeatures[strFeature] astrValues = [adFeature[i] for i in aiHeaders] for i in range( len( astrValues ) ): strValue = astrValues[i] if type( strValue ) in (int, float): astrValues[i] = "%g" % astrValues[i] elif ( not strValue ) or ( ( type( strValue ) == str ) and ( len( strValue ) == 0 ) ): astrValues[i] = strMissing csvw.writerow( [strFeature] + astrValues ) for astrRaw in aastrRaw: csvw.writerow( [astrRaw[i] for i in aiHeaders] ) argp = argparse.ArgumentParser( prog = "merge_metadata.py", description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" + "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." ) argp.add_argument( "-n", dest = "fNormalize", action = "store_false", help = "Don't normalize data values by column sums" ) argp.add_argument( "-s", dest = "strMissing", metavar = "missing", type = str, default = " ", help = "String representing missing metadata values" ) argp.add_argument( "-m", dest = "dMin", metavar = "min", type = float, default = 0.01, help = "Per-column quality control, minimum fraction of maximum value" ) argp.add_argument( "-t", dest = "iTaxa", metavar = "taxa", type = int, default = -1, help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" ) argp.add_argument( "-l", dest = "fLeaves", action = "store_true", help = "Output only leaves, not complete taxonomy" ) argp.add_argument( "-x", dest = "istmExclude", metavar = "exclude.txt", type = file, help = "File from which sample IDs to exclude are read" ) argp.add_argument( "istmMetadata", metavar = "metadata.txt", type = file, nargs = "?", help = "File from which metadata is read" ) __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ def _main( ): args = argp.parse_args( ) merge_metadata( args.istmMetadata and csv.reader( args.istmMetadata, csv.excel_tab ), csv.reader( sys.stdin, csv.excel_tab ), sys.stdout, args.fNormalize, args.strMissing, args.istmExclude, args.dMin, args.iTaxa, args.fLeaves ) if __name__ == "__main__": _main( )