Mercurial > repos > sagun98 > micropita
diff galaxy_micropita/src/breadcrumbs/src/SVM.py @ 3:8fb4630ab314 draft default tip
Uploaded
author | sagun98 |
---|---|
date | Thu, 03 Jun 2021 17:07:36 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy_micropita/src/breadcrumbs/src/SVM.py Thu Jun 03 17:07:36 2021 +0000 @@ -0,0 +1,306 @@ +""" +Author: Timothy Tickle +Description: Class to Allow Support Vector Machine analysis and to contain associated scripts +""" + +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#####################################################################################

__author__ = "Timothy Tickle"
__copyright__ = "Copyright 2012"
__credits__ = ["Timothy Tickle"]
__license__ = "MIT"
__maintainer__ = "Timothy Tickle"
__email__ = "ttickle@sph.harvard.edu"
__status__ = "Development"

#Libraries
from AbundanceTable import AbundanceTable
from ConstantsBreadCrumbs import ConstantsBreadCrumbs
import csv
import os
from random import shuffle
from ValidateData import ValidateData

class SVM:
    """
    Class which holds generic methods for SVM use.

    All methods except func10FoldCrossvalidation are static. Files are read and written
    with the csv module using the excel_tab dialect and the delimiter
    ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace.
    """

    @staticmethod
    def _funcOpenStream(xFile, sMode):
        """
        Returns an open stream for a file path or a file object.

        :param xFile: File path, or a file object (which may already be closed).
        :type: String file path or FileStream
        :param sMode: Mode used when the file has to be (re)opened.
        :type: String
        :return: A string path is opened with sMode; a closed file object is reopened by its
                 name with sMode; an open file object is returned unchanged (its own mode is kept).
        """
        if isinstance(xFile, str):
            return open(xFile, sMode)
        if xFile.closed:
            return open(xFile.name, sMode)
        return xFile

    @staticmethod
    def _funcMakeSVMRow(xLabel, lxValues):
        """
        Builds one output row: the label followed by "index:value" entries (index starts at 1).

        :param xLabel: Label written as the first entry of the row.
        :type: String
        :param lxValues: Feature values of one sample.
        :type: Iterable
        :return: List holding the label and one "index:value" string per feature value.
        """
        return [xLabel] + [ConstantsBreadCrumbs.c_strColon.join([str(iIndex + 1), str(xValue)])
                           for iIndex, xValue in enumerate(lxValues)]

    #1 Happy Path tested
    @staticmethod
    def funcConvertAbundanceTableToSVMFile(abndAbundanceTable, xOutputSVMFile, sMetadataLabel, lsOriginalLabels = None, lsSampleOrdering = None):
        """
        Converts abundance files to input SVM files.

        Rows are written in the order of lsSampleOrdering. A sample that is in the abundance table
        receives the next unused data column of the table; a sample that is not in the table receives
        a blank row filled with ConstantsBreadCrumbs.c_strSVMNoSample.
        NOTE(review): data columns are consumed sequentially, so the samples of lsSampleOrdering that
        are in the table are presumably expected in the table's own order - confirm with callers.

        :param abndAbundanceTable: AbundanceTable object to turn to input SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: File to save SVM data to when converted from the abundance table.
                               The stream is always closed before returning.
        :type: FileStream or string file path
        :param sMetadataLabel: The name of the metadata in the abundance table used to make labels
                               (only used when lsOriginalLabels is not given).
        :type: String
        :param lsOriginalLabels: The original labels. When given they are indexed by position in the
                                 sample ordering (blank rows also consume a label).
        :type: List of strings
        :param lsSampleOrdering: Order of samples to output to output file. If none, the order in the abundance table is used.
        :type: List of strings
        :return: Set of the unique labels.
        """

        #Create data matrix (a list so it can be indexed in python 2 and 3)
        #Presumably column 0 holds the feature ids, sample data is read starting at index 1 - TODO confirm
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #Add labels
        lsLabels = lsOriginalLabels if lsOriginalLabels else SVM.funcMakeLabels(abndAbundanceTable.funcGetMetadata(sMetadataLabel))

        #This allows the creation of partially known files for stratification purposes
        lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()
        lsOrderingSamples = lsSampleOrdering if lsSampleOrdering else lsCurrentSamples[:]
        #Set built once so the membership test in the loop is not a list scan
        setCurrentSamples = set(lsCurrentSamples)

        #Values of a blank entry, one per feature
        lsBlankValues = [ConstantsBreadCrumbs.c_strSVMNoSample] * len(dataMatrix[0])

        #Everything which can fail is evaluated before the output is opened (and truncated)
        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            iLabelIndex = 0
            iIndexSample = 1
            for sSample in lsOrderingSamples:
                if sSample in setCurrentSamples:
                    f.writerow(SVM._funcMakeSVMRow(lsLabels[iLabelIndex], dataMatrix[iIndexSample]))
                    iLabelIndex += 1
                    iIndexSample += 1
                #Make blank entry
                else:
                    f.writerow(SVM._funcMakeSVMRow(ConstantsBreadCrumbs.c_strSVMNoSample, lsBlankValues))
                    #Given labels follow the ordering, so a blank row also uses up a label
                    if lsOriginalLabels:
                        iLabelIndex += 1
        finally:
            #Close even if a row could not be written
            ostm.close()
        return set(lsLabels)

    @staticmethod
    def funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable, xOutputSVMFile, lsOriginalLabels, lsSampleOrdering):
        """
        Takes a SVM input file and updates it with an abundance table.
        lsOriginalLabels and lsSampleOrdering should be consistent to the input file.
        Samples in the abundance table will be used to update the file if the sample name in the abundance table is also in the lsSampleOrdering.
        lsOriginalLabels and lsSampleOrdering should be in the same order.
        Rows of samples which are not in the abundance table are written back unchanged.

        :param abndAbundanceTable: AbundanceTable object used to update the SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: Existing SVM file which is read and then rewritten.
        :type: FileStream or string file path
        :param lsOriginalLabels: The list of the original labels (as numerics 0,1,2,3,4...as should be in the file).
        :type: List of strings
        :param lsSampleOrdering: Order of samples in the output file.
        :type: List of strings
        :return: True when the file was rewritten, False (after printing a message) when the number
                 of rows in the file does not match the length of lsSampleOrdering.
        """

        #Read in contents of old file
        istm = SVM._funcOpenStream(xOutputSVMFile, "r")
        try:
            llsOldContents = [lsRow for lsRow in csv.reader(istm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)]
        finally:
            istm.close()

        #Check to make sure this ordering covers all positions in the old file
        if not len(llsOldContents) == len(lsSampleOrdering):
            print("The length of the original file ("+str(len(llsOldContents))+") does not match the length of the ordering given ("+str(len(lsSampleOrdering))+").")
            return False

        #Create data matrix from new data (a list so it can be indexed in python 2 and 3)
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #This allows to know what position to place the new lines
        setCurrentSamples = set(abndAbundanceTable.funcGetSampleNames())

        #Write to file. A passed in stream was closed by the read above so it is reopened by name here.
        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            #Sample data is read starting at index 1 of the data matrix
            iIndexSample = 1
            for iIndexOriginalOrder, sSample in enumerate(lsSampleOrdering):
                if sSample in setCurrentSamples:
                    f.writerow(SVM._funcMakeSVMRow(lsOriginalLabels[iIndexOriginalOrder], dataMatrix[iIndexSample]))
                    iIndexSample += 1
                #Keep the old entry
                else:
                    f.writerow(llsOldContents[iIndexOriginalOrder])
        finally:
            #Close even if a row could not be written
            ostm.close()
        return True

    #Tested 5
    @staticmethod
    def funcMakeLabels(lsMetadata):
        """
        Given a list of metadata, labels are assigned. This function represents a central location to make labels so all are consistent.
        Labels are numbered in order of first occurrence starting at "0". Metadata values are compared
        by their string form, so values which are not strings are supported as well.

        :param lsMetadata: List of metadata to turn into labels based on the metadata's values.
        :type: List
        :return: List of labels (integers as strings), one per metadata entry and in the same order.
        """
        #Do not use a set to make elements unique. Need to preserve order.
        #First label should be 0
        dictLabels = dict()
        for xElement in lsMetadata:
            sKey = str(xElement)
            if sKey not in dictLabels:
                dictLabels[sKey] = str(len(dictLabels))

        #Look up with the same string form which was used as key
        return [dictLabels[str(xElement)] for xElement in lsMetadata]

    #Tested
    @staticmethod
    def funcReadLabelsFromFile(xSVMFile, lsAllSampleNames, isPredictFile):
        """
        Reads in the labels from the input file or prediction output file of a LibSVM formatted file
        and associates them in order with the given sample names.
        Rows whose first entry is ConstantsBreadCrumbs.c_strSVMNoSample and empty rows are ignored.

        Prediction file expected format: Labels declared in first line with labels keyword.
        Each following row a sample with the first entry the predicted label
        Prediction file example:
        labels 0 1
        0 0.3 0.4 0.6
        1 0.1 0.2 0.3
        1 0.2 0.2 0.2
        0 0.2 0.4 0.3

        Input file expected format:
        Each row a sample with the first entry the predicted label
        Input file example:
        0 0.3 0.4 0.6
        1 0.1 0.2 0.3
        1 0.2 0.2 0.2
        0 0.2 0.4 0.3

        :param xSVMFile: File to read the labels from. A file opened here from a path is closed again,
                         a stream which was passed in is left open.
        :type: String file path or FileStream
        :param lsAllSampleNames: List of sample ids in the order of the labels.
        :type: List of Strings
        :param isPredictFile: Indicates if the file is the input (False) or prediction (True) file
        :type: Boolean
        :return: Dictionary {label:set(["sampleName1", "sampleName2"...]),...} or False (after printing a message)
                 when the number of labels does not match the number of sample names.
        """
        #Open the file (if needed) and get the first entry of each row
        fOpenedHere = isinstance(xSVMFile, str)
        istm = open(xSVMFile, 'r') if fOpenedHere else xSVMFile
        try:
            g = csv.reader(istm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            #Blank lines are read as empty rows, skip them instead of failing on the index
            lsOriginalLabels = [lsLineElements[0] for lsLineElements in g
                                if lsLineElements and not lsLineElements[0] == ConstantsBreadCrumbs.c_strSVMNoSample]
        finally:
            if fOpenedHere:
                istm.close()

        #The first row of a prediction file declares the labels and is not a sample
        if isPredictFile:
            lsOriginalLabels = lsOriginalLabels[1:]

        #Check sample name length
        if not len(lsAllSampleNames) == len(lsOriginalLabels):
            print("SVM::funcReadLabelsFromFile. Error, the length of sample names did not match the original labels length. Samples ("+str(len(lsAllSampleNames))+"):"+str(lsAllSampleNames)+" Labels ("+str(len(lsOriginalLabels))+"):"+str(lsOriginalLabels))
            return False

        #Change to {label:set(["sampleName1", "sampleName2"...]),...}
        dictSampleLabelsRet = dict()
        for sSampleName, sLabel in zip(lsAllSampleNames, lsOriginalLabels):
            dictSampleLabelsRet.setdefault(sLabel, set()).add(sSampleName)
        return dictSampleLabelsRet

    #Tested
    @staticmethod
    def funcScaleFeature(npdData):
        """
        Scale a feature between 0 and 1. Using 01 and not 01,1 because it keeps the sparsity of the data and may save time.
        Empty data and data where all values are equal can not be scaled and are returned unchanged.

        :param npdData: Feature data to scale.
        :type: Numpy Array
        :return npaFloat: A numpy array of floats (or the unchanged input, see above).
        """
        if len(npdData) == 0:
            return npdData
        dMin = min(npdData)
        dRange = max(npdData) - dMin
        #Check the range, not the sum. Data with mixed signs can sum to 0 and still need scaling.
        if dRange == 0:
            return npdData
        return (npdData-dMin)/float(dRange)

    #Tested
    @staticmethod
    def funcWeightLabels(lLabels):
        """
        Returns weights for labels based on how balanced the labels are. Weights try to balance unbalanced results.
        The weight of a label is the highest label occurrence divided by the occurrence of the label.

        :param lLabels: List of labels to use for measure how balanced the comparison is.
        :type: List
        :return List: [dictWeights ({index:weight}, index being the position of the label in lUniqueLabels),
                      lUniqueLabels (unique occurrences of original labels in order of first occurrence)].
                      [{}, []] if no labels are given.
        """
        #Do not use set to make elements unique. Need to preserve order.
        #First label should be 0
        #Get the occurrence of each label in one pass
        lUniqueLabels = []
        dictOccurence = dict()
        for sElement in lLabels:
            if sElement not in dictOccurence:
                lUniqueLabels.append(sElement)
                dictOccurence[sElement] = 0
            dictOccurence[sElement] += 1

        #Nothing to weight (max would fail on no values)
        if not lUniqueLabels:
            return [dict(), lUniqueLabels]

        #Divide the highest occurrence by each occurrence
        iMaxOccurence = max(dictOccurence.values())
        dictWeights = dict((iLabelIndex, iMaxOccurence/float(dictOccurence[sLabel]))
                           for iLabelIndex, sLabel in enumerate(lUniqueLabels))

        return [dictWeights,lUniqueLabels]

    #Tested 3/4 cases could add in test 12 with randomize True
    def func10FoldCrossvalidation(self, iTotalSampleCount, fRandomise = False):
        """
        Generator.
        Generates the indexes for a 10 fold cross validation given a sample count.
        If there are less than 10 samples, it uses the sample count as the K-fold cross validation
        as a leave one out method.

        :param iTotalSampleCount: Total Sample Count
        :type: Integer
        :param fRandomise: Random sample indices
        :type: Boolean True indicates randomise (Default False)
        :return: Yields one (lfTraining, lfValidation) pair per fold. Both are lists of booleans with
                 one entry per sample, lfValidation is the negation of lfTraining.
        """
        #Make indices (a list so it can be shuffled in python 2 and 3) and shuffle if needed
        liindices = list(range(iTotalSampleCount))
        if fRandomise:
            shuffle(liindices)

        #For 10 times, or for each sample if there are less than 10
        iKFold = min(10, iTotalSampleCount)
        for iiteration in range(iKFold):
            #An index is validated in the fold equal to its remainder and trained on in all others
            lfTraining = [iindex % iKFold != iiteration for iindex in liindices]
            lfValidation = [not fTraining for fTraining in lfTraining]
            yield lfTraining, lfValidation