micropita: galaxy_micropita/src/breadcrumbs/src/SVM.py comparison

comparison galaxy_micropita/src/breadcrumbs/src/SVM.py @ 3:8fb4630ab314 draft default tip

Uploaded

author	sagun98
date	Thu, 03 Jun 2021 17:07:36 +0000
parents
children

comparison

equal deleted inserted replaced

-:1c5736dc85ab
+:8fb4630ab314
+"""
+Author: Timothy Tickle
+Description: Class to Allow Support Vector Machine analysis and to contain associated scripts
+"""
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#####################################################################################
+__author__ = "Timothy Tickle"
+__copyright__ = "Copyright 2012"
+__credits__ = ["Timothy Tickle"]
+__license__ = "MIT"
+__maintainer__ = "Timothy Tickle"
+__email__ = "ttickle@sph.harvard.edu"
+__status__ = "Development"
+#Libraries
+from AbundanceTable import AbundanceTable
+from ConstantsBreadCrumbs import ConstantsBreadCrumbs
+import csv
+import os
+from random import shuffle
+from ValidateData import ValidateData
class SVM:
    """
    Class which holds generic methods for SVM use.

    Every public helper is a static method except func10FoldCrossvalidation,
    which is declared as an instance method although it reads no instance state.
    """

    @staticmethod
    def _funcOpenStream(xFile, sMode):
        """
        Return a usable stream for a path or a file object.

        :param xFile: File path, or a file object (possibly already closed).
        :type: String or FileStream
        :param sMode: Mode used when the file has to be (re)opened.
        :type: String
        :return: An open stream. An open file object is returned unchanged,
                 a closed one is reopened through its ``name`` attribute.
        """
        if isinstance(xFile, str):
            return open(xFile, sMode)
        if xFile.closed:
            return open(xFile.name, sMode)
        return xFile

    @staticmethod
    def _funcFormatFeatures(lxValues):
        """
        Format one sample as a list of "<1-based position><colon><value>" strings.

        :param lxValues: Feature values of a single sample.
        :type: Iterable
        :return: List of strings, one per value, joined with ConstantsBreadCrumbs.c_strColon.
        """
        return [ConstantsBreadCrumbs.c_strColon.join([str(iPosition + 1), str(xValue)])
                for iPosition, xValue in enumerate(lxValues)]

    #1 Happy Path tested
    @staticmethod
    def funcConvertAbundanceTableToSVMFile(abndAbundanceTable, xOutputSVMFile, sMetadataLabel, lsOriginalLabels = None, lsSampleOrdering = None):
        """
        Converts abundance files to input SVM files.

        One row is written per entry of the sample ordering: the label followed by
        the formatted features. Samples of the ordering which are not in the
        abundance table get a placeholder row built from
        ConstantsBreadCrumbs.c_strSVMNoSample. The output stream is always closed
        before returning, including a stream handed in by the caller.

        :param abndAbundanceTable: AbundanceTable object to turn to input SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: File to save SVM data to when converted from the abundance table.
        :type: FileStream or string file path
        :param sMetadataLabel: The name of the metadata in the abundance table used to make
                               labels (only read when lsOriginalLabels is not given).
        :type: String
        :param lsOriginalLabels: The original labels. When given they are used as is and are
                                 indexed by position in the sample ordering.
        :type: List of strings
        :param lsSampleOrdering: Order of samples to output to output file. If none, the order in the abundance table is used.
        :type: List of strings
        :return: Set of unique labels.
        """
        #Create data matrix. Materialised as a list because it is indexed below.
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #Add labels
        lsLabels = lsOriginalLabels if lsOriginalLabels else SVM.funcMakeLabels(abndAbundanceTable.funcGetMetadata(sMetadataLabel))

        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

            #This allows the creation of partially known files for stratification purposes
            lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()
            lsOrderingSamples = lsSampleOrdering if lsSampleOrdering else lsCurrentSamples[:]

            #The placeholder row is the same for every unknown sample so build it once
            iSize = len(dataMatrix[0])
            lsBlankRow = [ConstantsBreadCrumbs.c_strSVMNoSample] + SVM._funcFormatFeatures([ConstantsBreadCrumbs.c_strSVMNoSample] * iSize)

            iLabelIndex = 0
            #Sample rows are read starting at position 1
            #NOTE(review): presumably position 0 of the transposed table holds the feature ids - confirm against AbundanceTable
            #NOTE(review): rows are consumed sequentially, so samples shared with the ordering are assumed to be in table order - confirm with callers
            iIndexSample = 1
            for sSample in lsOrderingSamples:
                if sSample in lsCurrentSamples:
                    f.writerow([lsLabels[iLabelIndex]] + SVM._funcFormatFeatures(dataMatrix[iIndexSample]))
                    iLabelIndex += 1
                    iIndexSample += 1
                #Make blank entry
                else:
                    f.writerow(lsBlankRow)
                    #Given labels cover every entry of the ordering so skip the label of the missing sample
                    if lsOriginalLabels:
                        iLabelIndex += 1
        finally:
            #Close even when a row could not be written
            ostm.close()
        return set(lsLabels)

    @staticmethod
    def funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable, xOutputSVMFile, lsOriginalLabels, lsSampleOrdering):
        """
        Takes a SVM input file and updates it with an abundance table.
        lsOriginalLabels and lsSampleOrdering should be consistent to the input file.
        Samples in the abundance table will be used to update the file if the sample name in the abundance table is also in the lsSampleOrdering.
        lsOriginalLabels and lsSampleOrdering should be in the same order.

        The file is read completely, closed, and then rewritten in place. Rows of
        samples which are not in the abundance table are written back unchanged.

        :param abndAbundanceTable: AbundanceTable object to turn to input SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: File to save SVM data to when converted from the abundance table.
        :type: FileStream or string file path
        :param lsOriginalLabels: The list of the original labels (as numerics 0,1,2,3,4...as should be in the file).
        :type: List of strings
        :param lsSampleOrdering: Order of samples in the output file.
        :type: List of strings
        :return: True when the file was rewritten, False when the number of rows in the
                 file does not match the length of lsSampleOrdering (the file is then left untouched).
        """
        #Read in old file
        ostm = SVM._funcOpenStream(xOutputSVMFile, "r")
        try:
            fin = csv.reader(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            llsOldContents = [lsRow for lsRow in fin]
        finally:
            ostm.close()

        #Check to make sure this ordering covers all positions in the old file
        if len(llsOldContents) != len(lsSampleOrdering):
            print("The length of the original file ("+str(len(llsOldContents))+") does not match the length of the ordering given ("+str(len(lsSampleOrdering))+").")
            return False

        #Create data matrix from new data. Materialised as a list because it is indexed below.
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #Write to file. A stream given by the caller is closed by now and is reopened by name.
        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

            #This allows to know what position to place the new lines
            lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()

            #Sample rows are read starting at position 1
            #NOTE(review): presumably position 0 of the transposed table holds the feature ids - confirm against AbundanceTable
            #NOTE(review): rows are consumed sequentially, so samples shared with the ordering are assumed to be in table order - confirm with callers
            iIndexSample = 1
            for iIndexOriginalOrder, sSample in enumerate(lsSampleOrdering):
                if sSample in lsCurrentSamples:
                    f.writerow([lsOriginalLabels[iIndexOriginalOrder]] + SVM._funcFormatFeatures(dataMatrix[iIndexSample]))
                    iIndexSample += 1
                #Keep the old entry
                else:
                    f.writerow(llsOldContents[iIndexOriginalOrder])
        finally:
            #Close even when a row could not be written
            ostm.close()
        return True

    #Tested 5
    @staticmethod
    def funcMakeLabels(lsMetadata):
        """
        Given a list of metadata, labels are assigned. This function represents a central location to make labels so all are consistent.

        Labels are the strings "0", "1", "2"... given in order of first appearance
        of each distinct metadata value. Values are compared by their string form,
        so non string metadata (for example integers) can be labelled as well.

        :param lsMetadata: List of metadata to turn into labels based on the metadata's values.
        :type: List
        :return: List of labels (strings), one per metadata entry and in the same order.
        """
        #Do not use a set to make elements unique. Need to preserve order.
        #First label should be 0
        dictLabels = dict()
        for xElement in lsMetadata:
            sKey = str(xElement)
            if sKey not in dictLabels:
                dictLabels[sKey] = str(len(dictLabels))
        #Look up with the same string form which was used to build the dictionary
        return [dictLabels[str(xElement)] for xElement in lsMetadata]

    #Tested
    @staticmethod
    def funcReadLabelsFromFile(xSVMFile, lsAllSampleNames, isPredictFile):
        """
        Reads in the labels from the input file or prediction output file of a LibSVM formatted file
        and associates them in order with the given sample names.

        Prediction file expected format: Labels declared in first line with labels keyword.
        Each following row a sample with the first entry the predicted label
        Prediction file example:
        labels 0 1
        0	0.3	0.4	0.6
        1	0.1	0.2	0.3
        1	0.2	0.2	0.2
        0	0.2	0.4	0.3

        Input file expected format:
        Each row a sample with the first entry the predicted label
        Input file example:
        0	0.3	0.4	0.6
        1	0.1	0.2	0.3
        1	0.2	0.2	0.2
        0	0.2	0.4	0.3

        Rows whose first entry is ConstantsBreadCrumbs.c_strSVMNoSample and empty
        rows are ignored. A file opened here from a path is closed again; a stream
        given by the caller is left open.

        :param xSVMFile: File to read the labels from.
        :type: String file path or FileStream
        :param lsAllSampleNames: List of sample ids in the order of the labels.
        :type: List of Strings
        :param isPredictFile: Indicates if the file is the input (False) or prediction (True) file
        :type: boolean
        :return: Dictionary {label:set(["sampleName1", "sampleName2"...]),...} or False on error
        """
        #Only close the stream if it was opened here
        fOpenedHere = isinstance(xSVMFile, str)
        istm = open(xSVMFile, 'r') if fOpenedHere else xSVMFile
        try:
            g = csv.reader(istm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            lsOriginalLabels = [lsLineElements[0] for lsLineElements in g
                                if lsLineElements and lsLineElements[0] != ConstantsBreadCrumbs.c_strSVMNoSample]
        finally:
            if fOpenedHere:
                istm.close()

        #The first line of a prediction file declares the labels and is not a sample
        if isPredictFile:
            lsOriginalLabels = lsOriginalLabels[1:]

        #Check sample name length
        if len(lsAllSampleNames) != len(lsOriginalLabels):
            print("SVM::funcReadLabelsFromFile. Error, the length of sample names did not match the original labels length. Samples ("+str(len(lsAllSampleNames))+"):"+str(lsAllSampleNames)+" Labels ("+str(len(lsOriginalLabels))+"):"+str(lsOriginalLabels))
            return False

        #Change to {label:set(["sampleName1", "sampleName2"...]),...}
        dictSampleLabelsRet = dict()
        for sSampleName, sLabel in zip(lsAllSampleNames, lsOriginalLabels):
            dictSampleLabelsRet.setdefault(sLabel, set()).add(sSampleName)
        return dictSampleLabelsRet

    #Tested
    @staticmethod
    def funcScaleFeature(npdData):
        """
        Scale a feature between 0 and 1. Using 01 and not 01,1 because it keeps the sparsity of the data and may save time.

        An empty feature and a feature holding one repeated value can not be
        scaled and are returned unchanged.

        :param npdData: Feature data to scale (one dimensional).
        :type: Numpy Array
        :return npaFloat: A numpy array of floats, or npdData itself when it can not be scaled.
        """
        if len(npdData) == 0:
            return npdData
        dMin = min(npdData)
        dRange = max(npdData) - dMin
        #A constant feature has no range to scale by (this includes all zero features)
        if dRange == 0:
            return npdData
        return (npdData - dMin) / float(dRange)

    #Tested
    @staticmethod
    def funcWeightLabels(lLabels):
        """
        Returns weights for labels based on how balanced the labels are. Weights try to balance unbalanced results.

        The weight of a label is the highest label count divided by the count of
        that label, so the most frequent label gets 1.0 and rarer labels get more.

        :param lLabels: List of labels to use for measure how balanced the comparison is.
        :type: List
        :return List: [dictWeights ({position of label in lUniqueLabels:weight}),lUniqueLabels (unique occurrences of original labels)].
                      An empty input gives [{}, []].
        """
        #Do not use set to make elements unique. Need to preserve order.
        #First label should be 0
        lUniqueLabels = []
        for xLabel in lLabels:
            if xLabel not in lUniqueLabels:
                lUniqueLabels.append(xLabel)

        #Nothing to weigh and max() can not work on an empty sequence
        if not lUniqueLabels:
            return [dict(), lUniqueLabels]

        #Get the occurrence of each label
        liOccurences = [lLabels.count(xLabel) for xLabel in lUniqueLabels]

        #Divide the highest occurrence by each occurrence
        iMaxOccurence = max(liOccurences)
        dictWeights = dict((iLabel, iMaxOccurence / float(iOccurence))
                           for iLabel, iOccurence in enumerate(liOccurences))
        return [dictWeights, lUniqueLabels]

    #Tested 3/4 cases could add in test 12 with randomize True
    def func10FoldCrossvalidation(self, iTotalSampleCount, fRandomise = False):
        """
        Generator.
        Generates the indexes for a 10 fold cross validation given a sample count.
        If there are less than 10 samples, it uses the sample count as the K-fold cross validation
        as a leave one out method.

        Each fold is given as two lists of booleans with one entry per sample. The
        validation list is the element wise negation of the training list.

        :param iTotalSampleCount: Total Sample Count
        :type: Integer
        :param fRandomise: Random sample indices
        :type: Boolean True indicates randomise (Default False)
        :return: Yields (lfTraining, lfValidation) once per fold.
        """
        #Make indices and shuffle if needed
        #A real list is needed so it can be shuffled in place
        liindices = list(range(iTotalSampleCount))
        if fRandomise:
            shuffle(liindices)

        #For 10 times, or once per sample when there are fewer than 10 samples
        iKFold = min(10, iTotalSampleCount)
        for iiteration in range(iKFold):
            lfTraining = [iindex % iKFold != iiteration for iindex in liindices]
            lfValidation = [not fTraining for fTraining in lfTraining]
            yield lfTraining, lfValidation

Mercurial > repos > sagun98 > micropita

comparison galaxy_micropita/src/breadcrumbs/src/SVM.py @ 3:8fb4630ab314 draft default tip