comparison evaluation.py @ 0:0a48b2db75e7 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/GraphClust/CollectResultsNoAlign commit 2a6fd70c1bcec36ffdf0bba2ec82489b39cfc84e
author rnateam
date Sat, 27 Oct 2018 13:49:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0a48b2db75e7
1 #!/usr/bin/env python
2 import glob
3 from os import system
4 import re
5 from sklearn import metrics
6 from shutil import make_archive
7 import sys
8 import fnmatch, os
9
def sh(script):
    """Run *script* with ``bash -c`` and wait for it to finish.

    The script text is handed to bash as one argument instead of being
    spliced into a single-quoted shell string, so scripts that themselves
    contain single quotes are executed unchanged rather than being split
    into several shell words.

    Args:
        script: Shell source to execute.

    Returns:
        None. The exit status of the script is discarded.

    Raises:
        OSError: If the ``bash`` executable cannot be started.
    """
    # Local import keeps this helper self-contained; the file's import
    # block does not bring in subprocess.
    import subprocess

    subprocess.call(["bash", "-c", script])
12
# Report written when there are too few labelled sequences to score.
# Kept byte-for-byte (trailing spaces, no final newline) for downstream tools.
NA_REPORT = ("completeness_score : NA \nhomogeneity_score : NA \n"
             "adjusted_rand_score : NA \nadjusted_mutual_info_score : NA \n"
             "v_measure_score : NA")

# Names of the sklearn.metrics functions to report, in output order.
SCORE_NAMES = (
    "completeness_score",
    "homogeneity_score",
    "adjusted_rand_score",
    "adjusted_mutual_info_score",
    "v_measure_score",
)


def read_data_names(path):
    """Parse a ``data.names`` file into ``(seq_id, header)`` pairs.

    Whitespace-separated field 3 is taken as the sequence ID and field 5
    as the header; lines with fewer than six fields get an empty header.
    Blank lines are skipped.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    records = []
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            seq_id = fields[3]
            header = fields[5] if len(fields) >= 6 else ''
            records.append((seq_id, header))
    return records


def collect_labels(fasta_dir, results_dir):
    """Build the parallel header / cluster-label lists used for scoring.

    Every non-blank line of each ``*.cluster.all`` file in *results_dir*
    contributes one entry: the cluster label is field 2 and, when the line
    has at least eleven fields, the header is field 10 and the sequence ID
    is field 8 (otherwise both are empty).  Afterwards every sequence of
    ``<fasta_dir>/data.names`` that was not seen in a cluster file (or that
    has no header) is appended with its own fresh cluster number, counting
    on from ``number of cluster files + 1``.

    Returns:
        A ``(headers, clusters)`` tuple of equally long lists of strings.

    Raises:
        IndexError: If a non-blank cluster line has fewer than three
            fields, or a non-blank data.names line fewer than four.
    """
    cluster_files = sorted(
        fnmatch.filter(os.listdir(results_dir), '*.cluster.all'))
    records = read_data_names(os.path.join(fasta_dir, "data.names"))
    known_ids = set(seq_id for seq_id, _ in records)

    headers = []
    clusters = []
    assigned_ids = set()  # set, not list: probed once per sequence below
    for file_name in cluster_files:
        with open(os.path.join(results_dir, file_name), "r") as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / trailing empty lines
                header = ''
                seq_id = ''
                if len(fields) >= 11:
                    header = fields[10]
                    seq_id = fields[8]
                headers.append(header)
                clusters.append(fields[2])
                if seq_id in known_ids:
                    assigned_ids.add(seq_id)

    # Each unassigned sequence becomes a singleton cluster of its own.
    # NOTE(review): assumes labels inside the cluster files never exceed
    # the number of cluster files, otherwise numbers could collide - confirm.
    next_cluster = len(cluster_files) + 1
    for seq_id, header in records:
        if not header or seq_id not in assigned_ids:
            headers.append(header)
            clusters.append(str(next_cluster))
            next_cluster += 1
    return headers, clusters


def format_scores(headers, clusters):
    """Return the text of ``evaluation.txt`` for the given labelling.

    *headers* are passed to the sklearn metrics as the true labels and
    *clusters* as the predicted labels.  With fewer than two entries the
    fixed ``NA`` report is returned and sklearn is not touched.
    """
    if len(headers) <= 1:
        return NA_REPORT
    return "".join(
        "{} : {}\n".format(name, getattr(metrics, name)(headers, clusters))
        for name in SCORE_NAMES)


def main(argv=None):
    """Evaluate a clustering run and package the results directory.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  ``argv[1]`` is
            the directory holding ``data.names`` and ``argv[2]`` the
            results directory holding the ``*.cluster.all`` files.

    Writes ``fullTab.tabular`` (header<TAB>cluster per line) and
    ``evaluation.txt`` into the results directory, then zips that
    directory to ``RESULTS.zip`` in the current working directory.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit("usage: evaluation.py <fasta_dir> <results_dir>")
    fasta_dir, results_dir = argv[1], argv[2]

    headers, clusters = collect_labels(fasta_dir, results_dir)

    with open(os.path.join(results_dir, "fullTab.tabular"), "w") as full:
        full.write(
            "".join("%s\t%s\n" % pair for pair in zip(headers, clusters)))

    with open(os.path.join(results_dir, "evaluation.txt"), "w") as f_out:
        f_out.write(format_scores(headers, clusters))

    make_archive('RESULTS', 'zip', root_dir=results_dir)


if __name__ == "__main__":
    main()