comparison evaluation.py @ 12:b5f49453af8c draft

planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit 65d322f9ab2f24d65b307f3553589149a1d678d5
author rnateam
date Wed, 31 May 2017 14:53:30 -0400
parents 869a6e807d76
children 79df97a1bc0f
comparison
equal deleted inserted replaced
11:e080ebe95476 12:b5f49453af8c
1 #!/usr/bin/env python2
1 import glob 2 import glob
2 from os import system 3 from os import system
3 import re 4 import re
4 from sklearn import metrics 5 from sklearn import metrics
5 from shutil import make_archive 6 from shutil import make_archive
8 system("bash -c '%s'" % script) 9 system("bash -c '%s'" % script)
9 10
10 dataNames = "FASTA/data.names" 11 dataNames = "FASTA/data.names"
11 12
12 listOfClusters = [] 13 listOfClusters = []
13 listOfClasses = [] 14 listOfHeaders = []
15 headersNames = set()
14 cluster_seqs_stats_path = "RESULTS/*.cluster.all" 16 cluster_seqs_stats_path = "RESULTS/*.cluster.all"
15 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) 17 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
18
19 with open(dataNames, "r") as names:
20 for line2 in names:
21 splits2 = line2.split()
22 fullHeader = ''
23 if len(splits2) >= 6:
24 fullHeader = splits2[5]
25 headersNames.add(fullHeader)
16 26
17 blackList = [] 27 blackList = []
18 numberOfClusters = 0 28 numberOfClusters = 0
19 for singleFile in sorted(cluster_seqs_stats_files): 29 for singleFile in sorted(cluster_seqs_stats_files):
20 numberOfClusters += 1 30 numberOfClusters += 1
21 with open(singleFile, "r") as f: 31 with open(singleFile, "r") as f:
22 for line in f.readlines(): 32 for line in f:
23 uniqueId = line.split()[8] 33 splits = line.split()
24 clustNum = line.split()[2] 34 header = ''
25 rnaClass, sep, tail = uniqueId.partition("_") 35 if len(splits) >= 11:
26 listOfClasses.append(rnaClass) 36 header = splits[10]
37 clustNum = splits[2]
38 listOfHeaders.append(header)
27 listOfClusters.append(clustNum) 39 listOfClusters.append(clustNum)
28 with open(dataNames, "r") as names: 40 if header in headersNames:
29 for line in names.readlines(): 41 blackList.append(header)
30 fullUniqeId = line.split()[3]
31 rnaClass, sep, tail = fullUniqeId.partition("_")
32 if fullUniqeId == uniqueId:
33 blackList.append(uniqueId)
34 42
35 numberOfClusters += 1 # 1 cluster for all unassigned seqs 43 numberOfClusters += 1 # 1 cluster for all unassigned seqs
36 with open(dataNames, "r") as names: 44 with open(dataNames, "r") as names:
37 for line in names.readlines(): 45 for line in names.readlines():
38 fullUniqeId = line.split()[3] 46 splits = line.split()
39 rnaClass, sep, tail = fullUniqeId.partition("_") 47 fullUniqeId = splits[3]
40 rnaClass, sep, tail = fullUniqeId.partition("_") 48 fullHeader = ''
41 if fullUniqeId not in blackList: 49 if len(splits) >= 6:
42 listOfClasses.append(rnaClass) 50 fullHeader = line.split()[5]
51 if fullHeader not in blackList or len(fullHeader) == 0:
52 listOfHeaders.append(fullHeader)
43 listOfClusters.append(str(numberOfClusters)) 53 listOfClusters.append(str(numberOfClusters))
44 numberOfClusters += 1 # separate cluster for all unassigned seqs 54 numberOfClusters += 1 # separate cluster for all unassigned seqs
45 55
46 toWrite = "" 56 toWrite = ""
47 for i in range(len(listOfClusters)): 57 for i in range(len(listOfClusters)):
48 toWrite += listOfClasses[i] + "\t" + listOfClusters[i] + '\n' 58 toWrite += listOfHeaders[i] + "\t" + listOfClusters[i] + '\n'
49 with open("RESULTS/fullTab.tabular", "w") as full: 59 with open("RESULTS/fullTab.tabular", "w") as full:
50 full.write(toWrite) 60 full.write(toWrite)
51 61
52 62
53 pattern = re.compile("^RF.*$") 63 pattern = re.compile("^RF.*$")
54 64
65 if len(listOfHeaders) > 1: # and pattern.match(str(listOfHeaders[0])):
55 66
56 if len(listOfClasses) > 0 and pattern.match(str(listOfClasses[0])): 67 completeness_score = metrics.completeness_score(listOfHeaders, listOfClusters)
57 68 homogeneity_score = metrics.homogeneity_score(listOfHeaders, listOfClusters)
58 completeness_score = metrics.completeness_score(listOfClasses, listOfClusters) 69 adjusted_rand_score = metrics.adjusted_rand_score(listOfHeaders, listOfClusters)
59 homogeneity_score = metrics.homogeneity_score(listOfClasses, listOfClusters) 70 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfHeaders, listOfClusters)
60 adjusted_rand_score = metrics.adjusted_rand_score(listOfClasses, listOfClusters) 71 v_measure_score = metrics.v_measure_score(listOfHeaders, listOfClusters)
61 adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfClasses, listOfClusters)
62 v_measure_score = metrics.v_measure_score(listOfClasses, listOfClusters)
63 72
64 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score) 73 toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score)
65 74
66 else: 75 else:
67 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA" 76 toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA"