Mercurial > repos > sagun98 > micropita
comparison galaxy_micropita/src/breadcrumbs/src/under_development/PCA.py @ 3:8fb4630ab314 draft default tip
Uploaded
author | sagun98 |
---|---|
date | Thu, 03 Jun 2021 17:07:36 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:1c5736dc85ab | 3:8fb4630ab314 |
---|---|
1 """ | |
2 Author: Timothy Tickle | |
3 Description: Performs and plots Principle Components Analysis. | |
4 """ | |
5 | |
6 ##################################################################################### | |
7 #Copyright (C) <2012> | |
8 # | |
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
10 #this software and associated documentation files (the "Software"), to deal in the | |
11 #Software without restriction, including without limitation the rights to use, copy, | |
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
13 #and to permit persons to whom the Software is furnished to do so, subject to | |
14 #the following conditions: | |
15 # | |
16 #The above copyright notice and this permission notice shall be included in all copies | |
17 #or substantial portions of the Software. | |
18 # | |
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
25 ##################################################################################### | |
26 | |
27 __author__ = "Timothy Tickle" | |
28 __copyright__ = "Copyright 2013" | |
29 __credits__ = ["Timothy Tickle"] | |
30 __license__ = "MIT" | |
31 __maintainer__ = "Timothy Tickle" | |
32 __email__ = "ttickle@sph.harvard.edu" | |
33 __status__ = "Development" | |
34 | |
35 #External libraries | |
36 from AbundanceTable import AbundanceTable | |
37 from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs | |
38 from Ordination import Ordination | |
39 import matplotlib.cm as cm | |
40 from math import sqrt,asin | |
41 from matplotlib.mlab import PCA as mplPCA | |
42 from matplotlib import pyplot as plt | |
43 from numpy import * | |
44 from UtilityMath import UtilityMath | |
45 from ValidateData import ValidateData | |
46 | |
47 class PCA(Ordination): | |
48 """ | |
49 Class to Run Principle Components Analysis on an abundance table object | |
50 """ | |
51 | |
52 def __init__(self): | |
53 Ordination.__init__(self) | |
54 self.c_strComponents = "components" | |
55 self.c_strVariance = "percent_variance" | |
56 | |
57 def run(self,fScale=True,fCenter=True,fASTransform=False): | |
58 if not self.dataMatrix is None: | |
59 mtrxPrepped = self.dataMatrix.T | |
60 if fASTransform: | |
61 mtrxPrepped = array([self.doAsinOnList(row) for row in sqrt(mtrxPrepped)]) | |
62 if fCenter: | |
63 mtrxPrepped = mtrxPrepped-mean(mtrxPrepped,0) | |
64 if fScale: | |
65 # This is consistent to R's prcomp method. | |
66 vStd = std(a=mtrxPrepped,axis=0) if fCenter else [sqrt(sum(square(ldRow))/len(ldRow)) for ldRow in mtrxPrepped.T] | |
67 mtrxPrepped /= vStd | |
68 iRows, iCols = mtrxPrepped.shape | |
69 U,S,V = linalg.svd(a=mtrxPrepped,full_matrices=False) | |
70 ldVariance = square(S*(iCols-1)) | |
71 ldVariance = ldVariance/sum(ldVariance) | |
72 # Here components are row-wise so each component is a row. | |
73 # Here percent variance is given and it is in the order of the components. | |
74 self.dataProcessed = {self.c_strComponents:V, self.c_strVariance:ldVariance} | |
75 return True | |
76 else: | |
77 print("PCA:run::Error Tried to run analysis on no data load data first.") | |
78 return False | |
79 | |
80 def getVariance(self,iIndex=None): | |
81 if not self.dataProcessed is None: | |
82 if not iIndex is None: | |
83 return self.dataProcessed[self.c_strVariance][iIndex] | |
84 return self.dataProcessed[self.c_strVariance] | |
85 else: | |
86 print("PCA:getVariance::Error Tried to run analysis on no data load data first.") | |
87 return False | |
88 | |
89 def getComponents(self,iIndex=None): | |
90 if not self.dataProcessed is None: | |
91 if not iIndex is None: | |
92 return self.dataProcessed[self.c_strComponents].T[iIndex] | |
93 return self.dataProcessed[self.c_strComponents].T | |
94 else: | |
95 print("PCA:getComponents::Error Tried to run analysis on no data load data first.") | |
96 return False | |
97 | |
98 def doAsinOnList(self, lsValues): | |
99 return([asin(element) for element in lsValues]) | |
100 | |
101 def convertMetadataForPCA(self,abndTable): | |
102 """ This takes a metadata dictionary from an abundance table and formats the metadata for use in the PCA. | |
103 This formatting includes reducing discontinuous data to leveles and replacing NA values to the means of the value (continuous data only) | |
104 This returns a numpy array of the format needed for this PCA object. | |
105 """ | |
106 | |
107 # Replace missing values with the mean | |
108 # dummy the discrete data | |
109 dictMetadata = abndTable.funcGetMetadataCopy() | |
110 if(len(dictMetadata) < 2): | |
111 return None | |
112 | |
113 ## Remove the metadata id | |
114 dictMetadata.pop(abndTable.funcGetIDMetadataName(),None) | |
115 lMetadata = [] | |
116 for lxItem in dictMetadata.values(): | |
117 ## If this is not numeric data then dummy | |
118 ## Treat NA as a seperate category | |
119 if not (sum([ ValidateData.funcIsValidStringFloat(xItem) for xItem in lxItem]) == len(lxItem)): | |
120 # Get levels | |
121 setsLevels = set(lxItem) | |
122 # Go through each level and dummy the metadata | |
123 for sLevel in setsLevels: | |
124 lMetadata.append([1.0 if xItem==sLevel else 0.0 for xItem in lxItem]) | |
125 else: | |
126 # Change NA to Mean and store numeric data as float | |
127 # Also add to the metadata so that there are no negative numbers | |
128 ldNONA = [float(xItem) for xItem in lxItem if not xItem.strip().lower() in ["na",""]] | |
129 dMean = sum(ldNONA)/float(len(ldNONA)) | |
130 lsMetadataValues = [dMean if xItem.strip().lower() in ["na",""] else float(xItem) for xItem in lxItem] | |
131 dMinValueAdj = abs(min(lsMetadataValues)) | |
132 lMetadata.append([sValue + dMinValueAdj for sValue in lsMetadataValues]) | |
133 return(array(lMetadata).T) |