diff galaxy_micropita/src/breadcrumbs/src/under_development/PCA.py @ 3:8fb4630ab314 draft default tip

Uploaded
author sagun98
date Thu, 03 Jun 2021 17:07:36 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy_micropita/src/breadcrumbs/src/under_development/PCA.py	Thu Jun 03 17:07:36 2021 +0000
@@ -0,0 +1,133 @@
+"""
+Author: Timothy Tickle
+Description: Performs and plots Principle Components Analysis.
+"""
+
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#####################################################################################
+
+__author__ = "Timothy Tickle"
+__copyright__ = "Copyright 2013"
+__credits__ = ["Timothy Tickle"]
+__license__ = "MIT"
+__maintainer__ = "Timothy Tickle"
+__email__ = "ttickle@sph.harvard.edu"
+__status__ = "Development"
+
+#External libraries
+from AbundanceTable import AbundanceTable
+from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs
+from Ordination import Ordination
+import matplotlib.cm as cm
+from math import sqrt,asin
+from matplotlib.mlab import PCA as mplPCA
+from matplotlib import pyplot as plt
+from numpy import *
+from UtilityMath import UtilityMath
+from ValidateData import ValidateData
+
+class PCA(Ordination):
+  """
+  Class to Run Principle Components Analysis on an abundance table object
+  """
+
+  def __init__(self):
+    Ordination.__init__(self)
+    self.c_strComponents = "components"
+    self.c_strVariance = "percent_variance"
+
+  def run(self,fScale=True,fCenter=True,fASTransform=False):
+    if not self.dataMatrix is None:
+      mtrxPrepped = self.dataMatrix.T
+      if fASTransform:
+        mtrxPrepped = array([self.doAsinOnList(row) for row in sqrt(mtrxPrepped)])
+      if fCenter:
+        mtrxPrepped = mtrxPrepped-mean(mtrxPrepped,0)
+      if fScale:
+        # This is consistent to R's prcomp method.
+        vStd = std(a=mtrxPrepped,axis=0) if fCenter else [sqrt(sum(square(ldRow))/len(ldRow)) for ldRow in mtrxPrepped.T]
+        mtrxPrepped /= vStd
+      iRows, iCols = mtrxPrepped.shape
+      U,S,V = linalg.svd(a=mtrxPrepped,full_matrices=False)
+      ldVariance = square(S*(iCols-1))
+      ldVariance = ldVariance/sum(ldVariance)
+      # Here components are row-wise so each component is a row.
+      # Here percent variance is given and it is in the order of the components.
+      self.dataProcessed = {self.c_strComponents:V, self.c_strVariance:ldVariance}
+      return True
+    else:
+      print("PCA:run::Error Tried to run analysis on no data load data first.")
+    return False
+
+  def getVariance(self,iIndex=None):
+    if not self.dataProcessed is None:
+      if not iIndex is None:
+        return self.dataProcessed[self.c_strVariance][iIndex]
+      return self.dataProcessed[self.c_strVariance]
+    else:
+      print("PCA:getVariance::Error Tried to run analysis on no data load data first.")
+    return False
+
+  def getComponents(self,iIndex=None):
+    if not self.dataProcessed is None:
+      if not iIndex is None:
+        return self.dataProcessed[self.c_strComponents].T[iIndex]
+      return self.dataProcessed[self.c_strComponents].T
+    else:
+      print("PCA:getComponents::Error Tried to run analysis on no data load data first.")
+    return False
+
+  def doAsinOnList(self, lsValues):
+    return([asin(element) for element in lsValues])
+
+  def convertMetadataForPCA(self,abndTable):
+    """ This takes a metadata dictionary from an abundance table and formats the metadata for use in the PCA.
+        This formatting includes reducing discontinuous data to leveles and replacing NA values to the means of the value (continuous data only)
+        This returns a numpy array of the format needed for this PCA object.
+    """
+
+    # Replace missing values with the mean
+    # dummy the discrete data
+    dictMetadata = abndTable.funcGetMetadataCopy()
+    if(len(dictMetadata) < 2):
+      return None
+
+    ## Remove the metadata id
+    dictMetadata.pop(abndTable.funcGetIDMetadataName(),None)
+    lMetadata = []
+    for lxItem in dictMetadata.values():
+      ## If this is not numeric data then dummy
+      ## Treat NA as a seperate category
+      if not (sum([ ValidateData.funcIsValidStringFloat(xItem) for xItem in lxItem]) == len(lxItem)):
+        # Get levels
+        setsLevels = set(lxItem)
+        # Go through each level and dummy the metadata
+        for sLevel in setsLevels:
+          lMetadata.append([1.0 if xItem==sLevel else 0.0 for xItem in lxItem])
+      else:
+        # Change NA to Mean and store numeric data as float
+        # Also add to the metadata so that there are no negative numbers
+        ldNONA = [float(xItem) for xItem in lxItem if not xItem.strip().lower() in ["na",""]]
+        dMean = sum(ldNONA)/float(len(ldNONA))
+        lsMetadataValues = [dMean if xItem.strip().lower() in ["na",""] else float(xItem) for xItem in lxItem]
+        dMinValueAdj = abs(min(lsMetadataValues))
+        lMetadata.append([sValue + dMinValueAdj for sValue in lsMetadataValues])
+    return(array(lMetadata).T)