3
|
1 """
|
|
2 Author: Timothy Tickle
|
|
3 Description: Performs and plots Principle Components Analysis.
|
|
4 """
|
|
5
|
|
6 #####################################################################################
|
|
7 #Copyright (C) <2012>
|
|
8 #
|
|
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
10 #this software and associated documentation files (the "Software"), to deal in the
|
|
11 #Software without restriction, including without limitation the rights to use, copy,
|
|
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
13 #and to permit persons to whom the Software is furnished to do so, subject to
|
|
14 #the following conditions:
|
|
15 #
|
|
16 #The above copyright notice and this permission notice shall be included in all copies
|
|
17 #or substantial portions of the Software.
|
|
18 #
|
|
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
25 #####################################################################################
|
|
26
|
|
27 __author__ = "Timothy Tickle"
|
|
28 __copyright__ = "Copyright 2013"
|
|
29 __credits__ = ["Timothy Tickle"]
|
|
30 __license__ = "MIT"
|
|
31 __maintainer__ = "Timothy Tickle"
|
|
32 __email__ = "ttickle@sph.harvard.edu"
|
|
33 __status__ = "Development"
|
|
34
|
|
35 #External libraries
|
|
36 from AbundanceTable import AbundanceTable
|
|
37 from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs
|
|
38 from Ordination import Ordination
|
|
39 import matplotlib.cm as cm
|
|
40 from math import sqrt,asin
|
|
41 from matplotlib.mlab import PCA as mplPCA
|
|
42 from matplotlib import pyplot as plt
|
|
43 from numpy import *
|
|
44 from UtilityMath import UtilityMath
|
|
45 from ValidateData import ValidateData
|
|
46
|
|
47 class PCA(Ordination):
|
|
48 """
|
|
49 Class to Run Principle Components Analysis on an abundance table object
|
|
50 """
|
|
51
|
|
52 def __init__(self):
|
|
53 Ordination.__init__(self)
|
|
54 self.c_strComponents = "components"
|
|
55 self.c_strVariance = "percent_variance"
|
|
56
|
|
57 def run(self,fScale=True,fCenter=True,fASTransform=False):
|
|
58 if not self.dataMatrix is None:
|
|
59 mtrxPrepped = self.dataMatrix.T
|
|
60 if fASTransform:
|
|
61 mtrxPrepped = array([self.doAsinOnList(row) for row in sqrt(mtrxPrepped)])
|
|
62 if fCenter:
|
|
63 mtrxPrepped = mtrxPrepped-mean(mtrxPrepped,0)
|
|
64 if fScale:
|
|
65 # This is consistent to R's prcomp method.
|
|
66 vStd = std(a=mtrxPrepped,axis=0) if fCenter else [sqrt(sum(square(ldRow))/len(ldRow)) for ldRow in mtrxPrepped.T]
|
|
67 mtrxPrepped /= vStd
|
|
68 iRows, iCols = mtrxPrepped.shape
|
|
69 U,S,V = linalg.svd(a=mtrxPrepped,full_matrices=False)
|
|
70 ldVariance = square(S*(iCols-1))
|
|
71 ldVariance = ldVariance/sum(ldVariance)
|
|
72 # Here components are row-wise so each component is a row.
|
|
73 # Here percent variance is given and it is in the order of the components.
|
|
74 self.dataProcessed = {self.c_strComponents:V, self.c_strVariance:ldVariance}
|
|
75 return True
|
|
76 else:
|
|
77 print("PCA:run::Error Tried to run analysis on no data load data first.")
|
|
78 return False
|
|
79
|
|
80 def getVariance(self,iIndex=None):
|
|
81 if not self.dataProcessed is None:
|
|
82 if not iIndex is None:
|
|
83 return self.dataProcessed[self.c_strVariance][iIndex]
|
|
84 return self.dataProcessed[self.c_strVariance]
|
|
85 else:
|
|
86 print("PCA:getVariance::Error Tried to run analysis on no data load data first.")
|
|
87 return False
|
|
88
|
|
89 def getComponents(self,iIndex=None):
|
|
90 if not self.dataProcessed is None:
|
|
91 if not iIndex is None:
|
|
92 return self.dataProcessed[self.c_strComponents].T[iIndex]
|
|
93 return self.dataProcessed[self.c_strComponents].T
|
|
94 else:
|
|
95 print("PCA:getComponents::Error Tried to run analysis on no data load data first.")
|
|
96 return False
|
|
97
|
|
98 def doAsinOnList(self, lsValues):
|
|
99 return([asin(element) for element in lsValues])
|
|
100
|
|
101 def convertMetadataForPCA(self,abndTable):
|
|
102 """ This takes a metadata dictionary from an abundance table and formats the metadata for use in the PCA.
|
|
103 This formatting includes reducing discontinuous data to leveles and replacing NA values to the means of the value (continuous data only)
|
|
104 This returns a numpy array of the format needed for this PCA object.
|
|
105 """
|
|
106
|
|
107 # Replace missing values with the mean
|
|
108 # dummy the discrete data
|
|
109 dictMetadata = abndTable.funcGetMetadataCopy()
|
|
110 if(len(dictMetadata) < 2):
|
|
111 return None
|
|
112
|
|
113 ## Remove the metadata id
|
|
114 dictMetadata.pop(abndTable.funcGetIDMetadataName(),None)
|
|
115 lMetadata = []
|
|
116 for lxItem in dictMetadata.values():
|
|
117 ## If this is not numeric data then dummy
|
|
118 ## Treat NA as a seperate category
|
|
119 if not (sum([ ValidateData.funcIsValidStringFloat(xItem) for xItem in lxItem]) == len(lxItem)):
|
|
120 # Get levels
|
|
121 setsLevels = set(lxItem)
|
|
122 # Go through each level and dummy the metadata
|
|
123 for sLevel in setsLevels:
|
|
124 lMetadata.append([1.0 if xItem==sLevel else 0.0 for xItem in lxItem])
|
|
125 else:
|
|
126 # Change NA to Mean and store numeric data as float
|
|
127 # Also add to the metadata so that there are no negative numbers
|
|
128 ldNONA = [float(xItem) for xItem in lxItem if not xItem.strip().lower() in ["na",""]]
|
|
129 dMean = sum(ldNONA)/float(len(ldNONA))
|
|
130 lsMetadataValues = [dMean if xItem.strip().lower() in ["na",""] else float(xItem) for xItem in lxItem]
|
|
131 dMinValueAdj = abs(min(lsMetadataValues))
|
|
132 lMetadata.append([sValue + dMinValueAdj for sValue in lsMetadataValues])
|
|
133 return(array(lMetadata).T)
|