Mercurial > repos > george-weingart > maaslin
annotate maaslin-4450aa4ecc84/src/merge_metadata.py @ 1:a87d5a5f2776
Uploaded the version running on the prod server
author | george-weingart |
---|---|
date | Sun, 08 Feb 2015 23:08:38 -0500 |
parents | |
children |
rev | line source |
---|---|
1
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
1 #!/usr/bin/env python |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
2 ##################################################################################### |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
3 #Copyright (C) <2012> |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
4 # |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
6 #this software and associated documentation files (the "Software"), to deal in the |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
7 #Software without restriction, including without limitation the rights to use, copy, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
9 #and to permit persons to whom the Software is furnished to do so, subject to |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
10 #the following conditions: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
11 # |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
12 #The above copyright notice and this permission notice shall be included in all copies |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
13 #or substantial portions of the Software. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
14 # |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
21 # |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
23 # authored by the Huttenhower lab at the Harvard School of Public Health |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
25 ##################################################################################### |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
26 """ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
27 Examples |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
28 ~~~~~~~~ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
29 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
30 ``metadata.txt``:: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
31 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
32 - Y Z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
33 a 1 x |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
34 b 0 y |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
35 c z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
36 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
37 ``data.pcl``:: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
38 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
39 - a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
40 A|B 1 2 3 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
41 A|C 4 5 6 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
42 D|E 7 8 9 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
43 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
44 ``Examples``:: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
45 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
46 $ merge_metadata.py metadata.txt < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
47 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
48 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
49 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
50 A 0.416667 0.466667 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
51 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
52 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
53 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
54 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
55 $ merge_metadata.py metadata.txt -t 0 < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
56 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
57 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
58 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
59 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
60 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
61 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
62 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
63 $ merge_metadata.py metadata.txt -t 1 < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
64 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
65 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
66 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
67 A 0.416667 0.466667 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
68 D 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
69 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
70 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
71 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
72 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
73 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
74 A|B 1 2 3 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
75 A|C 4 5 6 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
76 D|E 7 8 9 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
77 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
78 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
79 sample b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
80 Y 0 - |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
81 Z y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
82 A|B 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
83 A|C 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
84 D|E 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
85 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
86 $ merge_metadata.py -t 0 < data.pcl |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
87 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
88 A|B 1 2 3 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
89 A|C 4 5 6 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
90 D|E 7 8 9 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
91 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
92 .. testsetup:: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
93 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
94 from merge_metadata import * |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
95 """ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
96 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
97 import argparse |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
98 import blist |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
99 import csv |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
100 import re |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
101 import sys |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
102 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
103 c_dTarget = 1.0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
104 c_fRound = False |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
105 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
106 class CClade: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
107 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
108 def __init__( self ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
109 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
110 self.m_hashChildren = {} |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
111 self.m_adValues = None |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
112 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
113 def get( self, astrClade ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
114 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
115 return self.m_hashChildren.setdefault( |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
116 astrClade[0], CClade( ) ).get( astrClade[1:] ) if astrClade else self |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
117 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
118 def set( self, adValues ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
119 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
120 self.m_adValues = blist.blist( [0] ) * len( adValues ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
121 for i, d in enumerate( adValues ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
122 if d: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
123 self.m_adValues[i] = d |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
124 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
125 def impute( self ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
126 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
127 if not self.m_adValues: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
128 for pChild in self.m_hashChildren.values( ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
129 adChild = pChild.impute( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
130 if self.m_adValues: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
131 for i in range( len( adChild or [] ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
132 if adChild[i]: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
133 self.m_adValues[i] += adChild[i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
134 elif adChild: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
135 self.m_adValues = adChild[:] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
136 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
137 return self.m_adValues |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
138 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
139 def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
140 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
141 fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
142 iDepth += 1 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
143 setiRet = set() |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
144 if self.m_hashChildren: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
145 for strChild, pChild in self.m_hashChildren.items( ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
146 setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
147 setiRet = set( ( i + 1 ) for i in setiRet ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
148 else: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
149 setiRet.add( 0 ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
150 if iTarget < 0: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
151 if fLeaves: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
152 fHit = -( iTarget + 1 ) in setiRet |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
153 else: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
154 fHit = -( iTarget + 1 ) <= max( setiRet ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
155 if astrClade and self.m_adValues and fHit: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
156 hashValues["|".join( astrClade )] = self.m_adValues |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
157 return setiRet |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
158 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
159 def freeze( self, hashValues, iTarget, fLeaves ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
160 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
161 self._freeze( hashValues, iTarget, [], 0, fLeaves ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
162 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
163 def _repr( self, strClade ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
164 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
165 strRet = "<" |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
166 if strClade: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
167 strRet += "%s %s" % (strClade, self.m_adValues) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
168 if self.m_hashChildren: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
169 strRet += " " |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
170 if self.m_hashChildren: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
171 strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
172 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
173 return ( strRet + ">" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
174 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
175 def __repr__( self ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
176 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
177 return self._repr( "" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
178 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
179 """ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
180 pTree = CClade( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
181 pTree.get( ("A", "B") ).set( [1, 2, 3] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
182 pTree.get( ("A", "C") ).set( [4, 5, 6] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
183 pTree.get( ("D", "E") ).set( [7, 8, 9] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
184 iTaxa = 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
185 if iTaxa: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
186 pTree.impute( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
187 hashFeatures = {} |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
188 pTree.freeze( hashFeatures, iTaxa ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
189 print( pTree ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
190 print( hashFeatures ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
191 sys.exit( 0 ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
192 #""" |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
193 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
194 def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
195 """ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
196 Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
197 A pipe-delimited taxonomy hierarchy can also be dynamically added or removed. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
198 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
199 :param aastrMetadata: Split lines from which metadata are read. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
200 :type aastrMetadata: collection of string collections |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
201 :param aastrData: Split lines from which data are read. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
202 :type aastrData: collection of string collections |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
203 :param ostm: Output stream to which joined rows are written. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
204 :type ostm: output stream |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
205 :param fNormalize: If true, divide data values by column sums. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
206 :type fNormalize: bool |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
207 :param strMissing: Representation for missing metadata values. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
208 :type strMissing: str |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
209 :param astrExclude: Lines from which excluded IDs are read. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
210 :type astrExclude: collection of strings |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
211 :param dMin: Minimum fraction of maximum value for per-column quality control. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
212 :type dMin: bool |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
213 :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
214 :type iTaxa: int |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
215 :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
216 :type fLeaves: bool |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
217 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
218 Metadata are optional; if not provided, data will be optionally normalized or its taxonomy |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
219 modified as requested. Metadata are provided one row per sample, data one column per |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
220 sample, both files tab-delimited text with one header row and one header column. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
221 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
222 Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
223 metadata IDs are given missing values. Missing data values are always treated (and output) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
224 as zero. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
225 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
226 Per-column quality control is performed if the requested minimum fraction is greater than |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
227 zero. Specifically, for each column i, the row j containing the maximum value d is |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
228 identified. If d is less than the minimum fraction of row j's maximum value over all columns, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
229 the entire column i is removed. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
230 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
231 A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
232 the form A|B|C. All parent clades are computed by default, e.g. A|B and A, save when |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
233 they would be identical to a more specific child clade. Negative values are counted from the |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
234 bottom (right) of the hierarchy rather than the top. The special value of 0 deactivates |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
235 hierarchy calculation. |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
236 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
237 >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
238 >>> aastrData = [s.split( ) for s in ( \ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
239 "- a b c", \ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
240 "A|B 1 2 3", \ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
241 "A|C 4 5 6", \ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
242 "D|E 7 8 9")] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
243 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
244 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
245 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
246 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
247 A 0.416667 0.466667 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
248 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
249 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
250 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
251 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
252 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
253 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
254 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
255 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
256 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
257 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
258 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
259 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
260 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
261 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
262 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
263 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
264 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
265 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
266 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
267 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
268 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
269 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
270 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
271 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
272 A 0.416667 0.466667 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
273 D 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
274 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
275 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, -1, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
276 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
277 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
278 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
279 A|B 0.0833333 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
280 A|C 0.333333 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
281 D|E 0.583333 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
282 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
283 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
284 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
285 Y 1 0 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
286 Z x y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
287 A|B 1 2 3 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
288 A|C 4 5 6 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
289 D|E 7 8 9 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
290 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
291 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
292 sample b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
293 Y 0 - |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
294 Z y z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
295 A|B 0.133333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
296 A|C 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
297 D|E 0.533333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
298 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
299 >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
300 sample a b c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
301 A|B 1 2 3 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
302 A|C 4 5 6 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
303 D|E 7 8 9 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
304 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
305 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
306 sample a c |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
307 Y 1 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
308 Z x z |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
309 A 0.416667 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
310 A|B 0.0833333 0.166667 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
311 A|C 0.333333 0.333333 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
312 D|E 0.583333 0.5 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
313 """ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
314 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
315 #Put metadata in a dictionary |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
316 #{"First line element",["line element 2","line element 3","line element 4"]} |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
317 #If there is no metadata then |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
318 astrMetadata = None |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
319 hashMetadata = {} |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
320 for astrLine in ( aastrMetadata or [] ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
321 if astrMetadata: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
322 hashMetadata[astrLine[0]] = astrLine[1:] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
323 else: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
324 astrMetadata = astrLine[1:] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
325 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
326 astrHeaders = adSeqs = iCol = None |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
327 pTree = CClade( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
328 aastrRaw = [] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
329 for astrLine in aastrData: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
330 if astrHeaders: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
331 if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
332 ( len( astrLine ) < 2 ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
333 continue |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
334 try: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
335 adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
336 strCur in astrLine[iCol:]] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
337 except ValueError: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
338 aastrRaw.append( astrLine ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
339 continue |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
340 for i in range( len( adCounts ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
341 adSeqs[i] += adCounts[i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
342 if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
343 if astrLine[1].find( astrLine[0] ) >= 0: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
344 astrLine[0] = astrLine[1] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
345 else: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
346 astrLine[0] += " " + astrLine[1] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
347 pTree.get( astrLine[0].split( "|" ) ).set( adCounts ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
348 else: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
349 iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
350 astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
351 adSeqs = [0] * len( astrHeaders ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
352 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
353 if iTaxa: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
354 pTree.impute( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
355 hashFeatures = {} |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
356 pTree.freeze( hashFeatures, iTaxa, fLeaves ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
357 setstrFeatures = hashFeatures.keys( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
358 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
359 afOmit = [False] * len( astrHeaders ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
360 if dMin > 0: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
361 aadData = list(hashFeatures.values( )) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
362 for i in range( len( astrHeaders ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
363 iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
364 dMaxUs = aadData[iMax][i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
365 dMaxThem = max( aadData[iMax][j] for j in ( range( i ) + range( i + 1, len( astrHeaders ) ) ) ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
366 if dMaxUs < ( dMin * dMaxThem ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
367 sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
368 afOmit[i] = True |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
369 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
370 if astrExclude: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
371 setstrExclude = set(s.strip( ) for s in astrExclude) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
372 for i in range( len( astrHeaders ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
373 if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
374 afOmit[i] = True |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
375 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
376 adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
377 for strFeature, adCounts in hashFeatures.items( ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
378 for i in range( len( adCounts ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
379 if adCounts[i]: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
380 adCounts[i] *= adMult[i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
381 if c_fRound: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
382 adCounts[i] = round( adCounts[i] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
383 for strFeature, adCounts in hashFeatures.items( ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
384 astrFeature = strFeature.strip( ).split( "|" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
385 while len( astrFeature ) > 1: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
386 astrFeature = astrFeature[:-1] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
387 strParent = "|".join( astrFeature ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
388 adParent = hashFeatures.get( strParent ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
389 if adParent == adCounts: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
390 del hashFeatures[strParent] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
391 setstrFeatures.remove( strParent ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
392 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
393 if astrMetadata: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
394 for i in range( len( astrMetadata ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
395 hashFeatures[astrMetadata[i]] = astrCur = [] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
396 for strSubject in astrHeaders: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
397 astrSubject = hashMetadata.get( strSubject ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
398 if not astrSubject: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
399 strSubject = re.sub( '_.*$', "", strSubject ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
400 astrSubject = hashMetadata.get( strSubject, [] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
401 astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
402 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
403 astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
404 aiHeaders = filter( lambda i: not afOmit[i], range( len( astrHeaders ) ) ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
405 csvw = csv.writer( sys.stdout, csv.excel_tab ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
406 csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
407 for iFeature in range( len( astrFeatures ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
408 strFeature = astrFeatures[iFeature] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
409 adFeature = hashFeatures[strFeature] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
410 astrValues = [adFeature[i] for i in aiHeaders] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
411 for i in range( len( astrValues ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
412 strValue = astrValues[i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
413 if type( strValue ) in (int, float): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
414 astrValues[i] = "%g" % astrValues[i] |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
415 elif ( not strValue ) or ( ( type( strValue ) == str ) and |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
416 ( len( strValue ) == 0 ) ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
417 astrValues[i] = strMissing |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
418 csvw.writerow( [strFeature] + astrValues ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
419 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
420 for astrRaw in aastrRaw: |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
421 csvw.writerow( [astrRaw[i] for i in aiHeaders] ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
422 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
423 argp = argparse.ArgumentParser( prog = "merge_metadata.py", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
424 description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" + |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
425 "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
426 argp.add_argument( "-n", dest = "fNormalize", action = "store_false", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
427 help = "Don't normalize data values by column sums" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
428 argp.add_argument( "-s", dest = "strMissing", metavar = "missing", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
429 type = str, default = " ", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
430 help = "String representing missing metadata values" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
431 argp.add_argument( "-m", dest = "dMin", metavar = "min", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
432 type = float, default = 0.01, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
433 help = "Per-column quality control, minimum fraction of maximum value" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
434 argp.add_argument( "-t", dest = "iTaxa", metavar = "taxa", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
435 type = int, default = -1, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
436 help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
437 argp.add_argument( "-l", dest = "fLeaves", action = "store_true", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
438 help = "Output only leaves, not complete taxonomy" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
439 argp.add_argument( "-x", dest = "istmExclude", metavar = "exclude.txt", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
440 type = file, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
441 help = "File from which sample IDs to exclude are read" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
442 argp.add_argument( "istmMetadata", metavar = "metadata.txt", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
443 type = file, nargs = "?", |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
444 help = "File from which metadata is read" ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
445 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
446 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
447 def _main( ): |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
448 args = argp.parse_args( ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
449 merge_metadata( args.istmMetadata and csv.reader( args.istmMetadata, csv.excel_tab ), |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
450 csv.reader( sys.stdin, csv.excel_tab ), sys.stdout, args.fNormalize, args.strMissing, |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
451 args.istmExclude, args.dMin, args.iTaxa, args.fLeaves ) |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
452 |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
453 if __name__ == "__main__": |
a87d5a5f2776
Uploaded the version running on the prod server
george-weingart
parents:
diff
changeset
|
454 _main( ) |