Mercurial > repos > rnateam > graphclust_postprocessing_no_align
diff evaluation.py @ 0:0a48b2db75e7 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/GraphClust/CollectResultsNoAlign commit 2a6fd70c1bcec36ffdf0bba2ec82489b39cfc84e
author | rnateam |
---|---|
date | Sat, 27 Oct 2018 13:49:00 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Score a clustering against the labels listed in ``data.names``.

Usage::

    evaluation.py FASTA_DIR RESULTS_DIR

Inputs
    * ``FASTA_DIR/data.names`` -- whitespace separated; on lines with at
      least 6 fields, field 4 is used as the sequence id and field 6 as the
      reference label (called "header" throughout this script).
    * ``RESULTS_DIR/*.cluster.all`` -- whitespace separated; field 3 is the
      cluster id and, on lines with at least 11 fields, field 9 is the
      sequence id and field 11 the reference label.

Outputs
    * ``RESULTS_DIR/fullTab.tabular`` -- one ``label<TAB>cluster`` row per
      sequence.
    * ``RESULTS_DIR/evaluation.txt`` -- five scikit-learn clustering scores,
      or ``NA`` for each when fewer than two sequences were collected.
    * ``RESULTS.zip`` in the current working directory -- an archive of
      ``RESULTS_DIR``.
"""
import fnmatch
import glob
import os
import re
import subprocess
import sys
from os import system
from shutil import make_archive

# Names of the sklearn.metrics functions reported, in output order.
SCORE_NAMES = (
    "completeness_score",
    "homogeneity_score",
    "adjusted_rand_score",
    "adjusted_mutual_info_score",
    "v_measure_score",
)

# Report written when there are too few sequences to score.
NA_REPORT = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA"


def sh(script):
    """Run *script* with ``bash -c``; the exit status is discarded.

    The script is handed to bash as a single argv element, so it may contain
    single quotes (the previous string-spliced form broke on them).
    """
    subprocess.call(["bash", "-c", script])


def read_known_ids(data_names_path):
    """Return the set of sequence ids (field 4) found in ``data.names``.

    Only lines with at least 6 whitespace-separated fields contribute.
    """
    known_ids = set()
    with open(data_names_path, "r") as handle:
        for line in handle:
            fields = line.split()
            if len(fields) >= 6:
                known_ids.add(fields[3])
    return known_ids


def read_cluster_assignments(cluster_paths, known_ids):
    """Collect label/cluster pairs from the given ``*.cluster.all`` files.

    Returns ``(headers, clusters, assigned_ids)`` where ``headers`` and
    ``clusters`` are parallel lists in file/line order and ``assigned_ids``
    is the set of sequence ids that also occur in *known_ids*.

    Blank lines are skipped.  A non-blank line with fewer than 11 fields is
    recorded with an empty label; one with fewer than 3 fields raises
    ``IndexError`` because no cluster id can be read from it.
    """
    headers = []
    clusters = []
    assigned_ids = set()
    for path in cluster_paths:
        with open(path, "r") as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue
                cluster_id = fields[2]
                header = ''
                seq_id = ''
                if len(fields) >= 11:
                    header = fields[10]
                    seq_id = fields[8]
                headers.append(header)
                clusters.append(cluster_id)
                if seq_id in known_ids:
                    assigned_ids.add(seq_id)
    return headers, clusters, assigned_ids


def collect_unassigned(data_names_path, assigned_ids, first_cluster,
                       ignore_blacklist=False):
    """Give every sequence not yet clustered its own singleton cluster.

    Walks ``data.names`` in order and, for each non-blank line whose id is
    not in *assigned_ids* (or for every non-blank line when
    *ignore_blacklist* is true), emits its label with a fresh cluster number
    starting at *first_cluster*.  Lines with fewer than 6 fields have no
    readable id or label and are emitted with an empty label.

    Returns ``(headers, clusters)`` as parallel lists; cluster numbers are
    strings, matching the ids read from the cluster files.
    """
    headers = []
    clusters = []
    next_cluster = first_cluster
    with open(data_names_path, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            header = ''
            seq_id = ''
            if len(fields) >= 6:
                seq_id = fields[3]
                header = fields[5]
            if ignore_blacklist or seq_id not in assigned_ids:
                headers.append(header)
                clusters.append(str(next_cluster))
                next_cluster += 1
    return headers, clusters


def format_table(headers, clusters):
    """Return the ``label<TAB>cluster`` table, one row per sequence."""
    return "".join("%s\t%s\n" % pair for pair in zip(headers, clusters))


def score_report(headers, clusters):
    """Return the text of ``evaluation.txt`` for the given labelling.

    *headers* are passed to scikit-learn as the true labels and *clusters*
    as the predicted ones.  With fewer than two sequences every score is
    reported as ``NA``.
    """
    if len(headers) <= 1:
        return NA_REPORT
    # Imported lazily: scikit-learn is only needed when there is something
    # to score, which keeps the NA path and the parsers dependency-free.
    from sklearn import metrics
    return "".join(
        "{} : {}\n".format(name, getattr(metrics, name)(headers, clusters))
        for name in SCORE_NAMES
    )


def main(argv=None):
    """Entry point; *argv* defaults to ``sys.argv``."""
    if argv is None:
        argv = sys.argv
    fasta_dir = argv[1]
    results_dir = argv[2]
    data_names = os.path.join(fasta_dir, "data.names")

    cluster_files = sorted(
        fnmatch.filter(os.listdir(results_dir), '*.cluster.all'))
    known_ids = read_known_ids(data_names)
    headers, clusters, assigned_ids = read_cluster_assignments(
        [os.path.join(results_dir, name) for name in cluster_files],
        known_ids)

    # Singleton clusters are numbered from (number of cluster files + 1).
    # NOTE(review): this presumes the cluster ids inside the files are
    # 1..N; otherwise a singleton number could coincide with a real
    # cluster id -- confirm against the producer of *.cluster.all.
    extra_headers, extra_clusters = collect_unassigned(
        data_names, assigned_ids, len(cluster_files) + 1)
    headers.extend(extra_headers)
    clusters.extend(extra_clusters)

    with open(os.path.join(results_dir, "fullTab.tabular"), "w") as full:
        full.write(format_table(headers, clusters))

    with open(os.path.join(results_dir, "evaluation.txt"), "w") as f_out:
        f_out.write(score_report(headers, clusters))

    make_archive('RESULTS', 'zip', root_dir=results_dir)


if __name__ == "__main__":
    main()