comparison evaluation.py @ 16:79df97a1bc0f draft

planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit b8f82a8101d9eb74c8dbac51b8a0c75585a888a2
author rnateam
date Fri, 23 Feb 2018 10:46:41 -0500
parents b5f49453af8c
children f93c868203cc
comparison
equal deleted inserted replaced
15:c7ca5d173482 16:79df97a1bc0f
2 import glob 2 import glob
3 from os import system 3 from os import system
4 import re 4 import re
5 from sklearn import metrics 5 from sklearn import metrics
6 from shutil import make_archive 6 from shutil import make_archive
7 import sys
8 import fnmatch, os
7 9
8 def sh(script): 10 def sh(script):
9 system("bash -c '%s'" % script) 11 system("bash -c '%s'" % script)
10 12
11 dataNames = "FASTA/data.names" 13 fasta_dir = sys.argv[1]
14 results_dir = sys.argv[2]
15 dataNames = fasta_dir+"/data.names"
12 16
13 listOfClusters = [] 17 listOfClusters = []
14 listOfHeaders = [] 18 listOfHeaders = []
15 headersNames = set() 19 headersNames = set()
16 cluster_seqs_stats_path = "RESULTS/*.cluster.all" 20 idsNames = set()
17 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
18 21
22
23 names = os.listdir(results_dir)
24 cluster_seqs_stats_files = fnmatch.filter(names, '*.cluster.all')
19 with open(dataNames, "r") as names: 25 with open(dataNames, "r") as names:
20 for line2 in names: 26 for line2 in names:
21 splits2 = line2.split() 27 splits2 = line2.split()
22 fullHeader = '' 28 fullHeader = ''
23 if len(splits2) >= 6: 29 if len(splits2) >= 6:
24 fullHeader = splits2[5] 30 fullHeader = splits2[5]
25 headersNames.add(fullHeader) 31 headersNames.add(fullHeader)
32 fullID = splits2[3]
33 idsNames.add(fullID)
26 34
27 blackList = [] 35 blackList = []
28 numberOfClusters = 0 36 numberOfClusters = 0
29 for singleFile in sorted(cluster_seqs_stats_files): 37 for singleFile in sorted(cluster_seqs_stats_files):
38 singleFile = os.path.join(results_dir,singleFile)
30 numberOfClusters += 1 39 numberOfClusters += 1
31 with open(singleFile, "r") as f: 40 with open(singleFile, "r") as f:
32 for line in f: 41 for line in f:
33 splits = line.split() 42 splits = line.split()
34 header = '' 43 header = ''
44 idd = ''
35 if len(splits) >= 11: 45 if len(splits) >= 11:
36 header = splits[10] 46 header = splits[10]
47 idd = splits[8]
37 clustNum = splits[2] 48 clustNum = splits[2]
38 listOfHeaders.append(header) 49 listOfHeaders.append(header)
39 listOfClusters.append(clustNum) 50 listOfClusters.append(clustNum)
40 if header in headersNames: 51 if idd in idsNames: #header in headersNames:
41 blackList.append(header) 52 blackList.append(idd)
42 53
43 numberOfClusters += 1 # 1 cluster for all unassigned seqs 54 numberOfClusters += 1 # 1 cluster for all unassigned seqs
55 ignoreBlackList = False
44 with open(dataNames, "r") as names: 56 with open(dataNames, "r") as names:
45 for line in names.readlines(): 57 for line in names.readlines():
46 splits = line.split() 58 splits = line.split()
47 fullUniqeId = splits[3] 59 fullUniqeId = splits[3]
48 fullHeader = '' 60 fullHeader = ''
61 fullID = ''
49 if len(splits) >= 6: 62 if len(splits) >= 6:
50 fullHeader = line.split()[5] 63 fullHeader = line.split()[5]
51 if fullHeader not in blackList or len(fullHeader) == 0: 64 fullID = line.split()[3]
65 if ignoreBlackList or ( fullID not in blackList #fullHeader not in blackList
66 or len(fullHeader) == 0):
52 listOfHeaders.append(fullHeader) 67 listOfHeaders.append(fullHeader)
53 listOfClusters.append(str(numberOfClusters)) 68 listOfClusters.append(str(numberOfClusters))
54 numberOfClusters += 1 # separate cluster for all unassigned seqs 69 numberOfClusters += 1 # separate cluster for all unassigned seqs
70 # else:
71 # print ("Skip header", fullHeader)
55 72
56 toWrite = "" 73 toWrite = ""
57 for i in range(len(listOfClusters)): 74 for i in range(len(listOfClusters)):
58 toWrite += listOfHeaders[i] + "\t" + listOfClusters[i] + '\n' 75 toWrite += listOfHeaders[i] + "\t" + listOfClusters[i] + '\n'
59 with open("RESULTS/fullTab.tabular", "w") as full: 76
77 with open(results_dir+"/fullTab.tabular", "w") as full:
60 full.write(toWrite) 78 full.write(toWrite)
61 79
62 80
63 pattern = re.compile("^RF.*$") 81 pattern = re.compile("^RF.*$")
64 82
70 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfHeaders, listOfClusters) 88 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfHeaders, listOfClusters)
71 v_measure_score = metrics.v_measure_score(listOfHeaders, listOfClusters) 89 v_measure_score = metrics.v_measure_score(listOfHeaders, listOfClusters)
72 90
73 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score) 91 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score)
74 92
93
75 else: 94 else:
76 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA" 95 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA"
77 96
78 with open("RESULTS/evaluation.txt", "w") as fOut: 97 with open(os.path.join(results_dir,"evaluation.txt"), "w") as fOut:
79 fOut.write(toWrite) 98 fOut.write(toWrite)
80 99
81 100
82 make_archive('RESULTS', 'zip', root_dir='RESULTS') 101 make_archive('RESULTS', 'zip', root_dir=results_dir)