annotate galaxy_micropita/src/breadcrumbs/src/under_development/PCA.py @ 3:8fb4630ab314 draft default tip

Uploaded
author sagun98
date Thu, 03 Jun 2021 17:07:36 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
1 """
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
2 Author: Timothy Tickle
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
3 Description: Performs and plots Principal Components Analysis.
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
4 """
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
5
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
6 #####################################################################################
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
7 #Copyright (C) <2012>
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
8 #
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
10 #this software and associated documentation files (the "Software"), to deal in the
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
11 #Software without restriction, including without limitation the rights to use, copy,
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
13 #and to permit persons to whom the Software is furnished to do so, subject to
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
14 #the following conditions:
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
15 #
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
16 #The above copyright notice and this permission notice shall be included in all copies
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
17 #or substantial portions of the Software.
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
18 #
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
25 #####################################################################################
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
26
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
27 __author__ = "Timothy Tickle"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
28 __copyright__ = "Copyright 2013"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
29 __credits__ = ["Timothy Tickle"]
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
30 __license__ = "MIT"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
31 __maintainer__ = "Timothy Tickle"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
32 __email__ = "ttickle@sph.harvard.edu"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
33 __status__ = "Development"
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
34
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
35 #External libraries
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
36 from AbundanceTable import AbundanceTable
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
37 from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
38 from Ordination import Ordination
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
39 import matplotlib.cm as cm
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
40 from math import sqrt,asin
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
41 from matplotlib.mlab import PCA as mplPCA
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
42 from matplotlib import pyplot as plt
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
43 from numpy import *
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
44 from UtilityMath import UtilityMath
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
45 from ValidateData import ValidateData
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
46
8fb4630ab314 Uploaded
sagun98
parents:
diff changeset
class PCA(Ordination):
    """
    Class to run Principal Components Analysis on an abundance table object.

    run() stores its result in self.dataProcessed as a dictionary with two
    entries: the components (key self.c_strComponents) and the fraction of
    variance explained by each component (key self.c_strVariance).
    """

    def __init__(self):
        Ordination.__init__(self)
        # Keys of the result dictionary that run() stores in self.dataProcessed.
        self.c_strComponents = "components"
        self.c_strVariance = "percent_variance"

    def run(self, fScale=True, fCenter=True, fASTransform=False):
        """
        Run the PCA on the transpose of self.dataMatrix using an SVD.

        :param fScale: If True, divide every column of the prepared matrix by
            its standard deviation (when centered) or by its root mean square
            (when not centered).
        :param fCenter: If True, subtract the column means before the SVD.
        :param fASTransform: If True, apply an arcsine square-root transform
            to every value before centering and scaling.
        :return: True when the analysis ran and self.dataProcessed was set,
            False when no data matrix is loaded.

        self.dataMatrix itself is never modified. No special handling exists
        for a column without spread when scaling: its divisor is zero.
        """
        if self.dataMatrix is None:
            print("PCA:run::Error Tried to run analysis on no data load data first.")
            return False

        # NOTE(review): presumably the transpose puts samples in rows and
        # features in columns - confirm against how Ordination loads data.
        mtrxPrepped = self.dataMatrix.T
        if fASTransform:
            mtrxPrepped = array([self.doAsinOnList(row) for row in sqrt(mtrxPrepped)])
        if fCenter:
            mtrxPrepped = mtrxPrepped - mean(mtrxPrepped, 0)
        if fScale:
            # Column-wise scaling as in R's prcomp/scale, up to a constant
            # factor shared by all columns (which does not affect the
            # components or the fractions of variance).
            if fCenter:
                vStd = std(a=mtrxPrepped, axis=0)
            else:
                vStd = sqrt(mean(square(mtrxPrepped), 0))
            # Out-of-place division: an in-place "/=" would write through the
            # transposed view into self.dataMatrix when no earlier step made a
            # copy, and fails outright on integer matrices.
            mtrxPrepped = mtrxPrepped / vStd

        U, S, V = linalg.svd(a=mtrxPrepped, full_matrices=False)

        # The variance carried by each component is proportional to its squared
        # singular value; any constant factor cancels in the normalisation.
        ldVariance = square(S)
        ldVariance = ldVariance / sum(ldVariance)

        # Here components are row-wise so each component is a row.
        # Here percent variance is given and it is in the order of the components.
        self.dataProcessed = {self.c_strComponents: V, self.c_strVariance: ldVariance}
        return True

    def getVariance(self, iIndex=None):
        """
        Return the fraction of variance explained by the components.

        :param iIndex: Optional index of a single component.
        :return: The value for component iIndex, all values (in component
            order) when iIndex is None, or False when run() has not produced
            any result yet.
        """
        if self.dataProcessed is None:
            print("PCA:getVariance::Error Tried to run analysis on no data load data first.")
            return False

        ldVariance = self.dataProcessed[self.c_strVariance]
        if iIndex is not None:
            return ldVariance[iIndex]
        return ldVariance

    def getComponents(self, iIndex=None):
        """
        Return the components transposed, so that each component is a column.

        :param iIndex: Optional row index into the transposed matrix.
        :return: Row iIndex of the transposed component matrix, the whole
            transposed matrix when iIndex is None, or False when run() has not
            produced any result yet.

        NOTE(review): because the index is applied after the transpose,
        getComponents(i) yields the i-th entry of every component rather than
        the i-th component itself - confirm this is what callers expect.
        """
        if self.dataProcessed is None:
            print("PCA:getComponents::Error Tried to run analysis on no data load data first.")
            return False

        mtrxComponents = self.dataProcessed[self.c_strComponents].T
        if iIndex is not None:
            return mtrxComponents[iIndex]
        return mtrxComponents

    def doAsinOnList(self, lsValues):
        """
        Return a list holding the arcsine of every element of lsValues.
        """
        return [asin(element) for element in lsValues]

    @staticmethod
    def _funcIsMissing(xItem):
        """
        Return True when xItem is a string spelling a missing value ("NA" in
        any letter case, or an empty/whitespace-only string).
        """
        return hasattr(xItem, "strip") and xItem.strip().lower() in ("na", "")

    def convertMetadataForPCA(self, abndTable):
        """
        Format the metadata of an abundance table for use in the PCA.

        The ID metadata is dropped. Every remaining metadata is handled by type:
          - Numeric (all non-missing values parse as floats): missing values
            are replaced by the mean of the observed values and, if the
            smallest value is negative, the whole metadata is shifted up so
            that its minimum becomes 0.
          - Anything else (including a metadata with only missing values): one
            0.0/1.0 indicator entry per level, in sorted level order, with
            missing values treated as a level of their own.

        :param abndTable: Abundance table providing funcGetMetadataCopy() and
            funcGetIDMetadataName().
        :return: A numpy array with one column per converted metadata /
            indicator, or None when the table holds fewer than 2 metadata.
        """
        dictMetadata = abndTable.funcGetMetadataCopy()
        if len(dictMetadata) < 2:
            return None

        ## Remove the metadata id
        dictMetadata.pop(abndTable.funcGetIDMetadataName(), None)

        lMetadata = []
        for lxItem in dictMetadata.values():
            lfMissing = [PCA._funcIsMissing(xItem) for xItem in lxItem]
            lxObserved = [xItem for xItem, fMissing in zip(lxItem, lfMissing) if not fMissing]

            # Only the observed values decide whether the metadata is numeric;
            # otherwise a single NA would turn a continuous metadata into
            # levels and the mean imputation below could never apply.
            # Lists (not generators) are used because all/sum may be the numpy
            # versions pulled in by the file's star import.
            fNumeric = bool(lxObserved) and all(
                [ValidateData.funcIsValidStringFloat(xItem) for xItem in lxObserved])

            if not fNumeric:
                # Dummy the discrete data; sorting keeps the output column
                # order reproducible between runs.
                for sLevel in sorted(set(lxItem), key=str):
                    lMetadata.append([1.0 if xItem == sLevel else 0.0 for xItem in lxItem])
                continue

            # Change NA to the mean and store numeric data as float.
            ldObserved = [float(xItem) for xItem in lxObserved]
            dMean = sum(ldObserved) / float(len(ldObserved))
            ldValues = [dMean if fMissing else float(xItem)
                        for xItem, fMissing in zip(lxItem, lfMissing)]

            # Shift only when needed so that there are no negative numbers;
            # metadata that are already non-negative are left untouched.
            dMinimum = min(ldValues)
            dShift = -dMinimum if dMinimum < 0 else 0.0
            lMetadata.append([dValue + dShift for dValue in ldValues])

        return array(lMetadata).T