view evaluation.py @ 17:f93c868203cc draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/GraphClust/CollectResults commit 4406735e44aba20859c252be39f4e99df28c7a92
author rnateam
date Sat, 27 Oct 2018 13:23:06 -0400
parents 79df97a1bc0f
children
line wrap: on
line source

#!/usr/bin/env python
import glob
from os import system
import re
from sklearn import metrics
from shutil import make_archive
import sys
import fnmatch, os

def sh(script):
    """Run ``script`` with ``bash -c``.

    The script is passed to bash as one argument, so it may contain single
    quotes and other shell metacharacters without being split or altering
    the command line. The exit status of the script is ignored.

    :param script: shell code to execute.
    :returns: None.
    :raises OSError: if the ``bash`` executable cannot be started.
    """
    # Imported locally so the helper does not depend on a change to the
    # file-level import list.
    import subprocess

    # An argument list replaces the former "bash -c '%s'" string: that form
    # broke as soon as the script itself contained a single quote.
    subprocess.call(["bash", "-c", script])

# Command line: <dir containing data.names> <dir containing *.cluster.all>
fasta_dir, results_dir = sys.argv[1], sys.argv[2]
dataNames = os.path.join(fasta_dir, "data.names")

# Parallel lists: entry i of listOfHeaders belongs to entry i of
# listOfClusters. Both are filled further down in the script.
listOfClusters, listOfHeaders = [], []
# Headers (6th column) and IDs (4th column) seen in data.names.
headersNames, idsNames = set(), set()

# Per-cluster result files, selected by name from the directory listing.
cluster_seqs_stats_files = fnmatch.filter(os.listdir(results_dir), '*.cluster.all')

with open(dataNames, "r") as names_file:
    for fields in map(str.split, names_file):
        # Lines with fewer than six whitespace-separated columns carry no
        # header and are ignored in this pass.
        if len(fields) < 6:
            continue
        headersNames.add(fields[5])
        idsNames.add(fields[3])

# IDs that occur in some cluster file and are also listed in data.names.
# Kept as a set because the rest of the script only tests membership in it;
# a list made that lookup linear per sequence.
blackList = set()
# Counts the cluster files processed (one is added per file).
numberOfClusters = 0
for singleFile in sorted(cluster_seqs_stats_files):
    singleFile = os.path.join(results_dir, singleFile)
    numberOfClusters += 1
    with open(singleFile, "r") as f:
        for line in f:
            splits = line.split()
            # The cluster number is the 3rd column. Blank or truncated lines
            # have none; skip them instead of failing with IndexError.
            if len(splits) < 3:
                continue
            header = ''
            idd = ''
            # The ID (9th column) and header (11th column) are only read
            # when the line has at least 11 columns; shorter lines are
            # recorded with an empty header.
            if len(splits) >= 11:
                header = splits[10]
                idd = splits[8]
            clustNum = splits[2]
            listOfHeaders.append(header)
            listOfClusters.append(clustNum)
            if idd in idsNames:  # header in headersNames:
                blackList.add(idd)

# Every sequence of data.names that was not found in a cluster file gets a
# cluster of its own, numbered upwards from (number of cluster files + 1).
numberOfClusters += 1
# When True, sequences are added even if they already appear in a cluster.
ignoreBlackList = False
# Set copy of the already-clustered IDs: constant-time membership per line.
clusteredIds = set(blackList)
# Labels already taken by real clusters. A singleton label must not reuse
# one of them, otherwise the sequence would silently join that cluster.
usedClusterLabels = set(listOfClusters)
with open(dataNames, "r") as names:
    for line in names:
        splits = line.split()
        if not splits:
            # Blank line: nothing to record.
            continue
        fullHeader = ''
        fullID = ''
        # ID is the 4th column, header the 6th; shorter lines keep both
        # empty and are therefore always recorded (with an empty header).
        if len(splits) >= 6:
            fullHeader = splits[5]
            fullID = splits[3]
        if (ignoreBlackList or fullID not in clusteredIds  # fullHeader not in blackList
                or len(fullHeader) == 0):
            # Skip numbers that a cluster file already used as its label.
            while str(numberOfClusters) in usedClusterLabels:
                numberOfClusters += 1
            listOfHeaders.append(fullHeader)
            listOfClusters.append(str(numberOfClusters))
            numberOfClusters += 1  # separate cluster for each unassigned seq

# Write the header -> cluster assignment table: one "<header>\t<cluster>"
# line per recorded sequence. The two lists are filled in lockstep, so they
# are paired with zip, and the text is assembled in a single join instead of
# repeated string concatenation.
toWrite = "".join(
    "%s\t%s\n" % pair for pair in zip(listOfHeaders, listOfClusters)
)

with open(os.path.join(results_dir, "fullTab.tabular"), "w") as full:
    full.write(toWrite)


# Only referenced by the disabled header check noted below.
pattern = re.compile("^RF.*$")

if len(listOfHeaders) <= 1:  # (disabled: or not pattern.match(str(listOfHeaders[0])))
    # Fewer than two labelled sequences: report every score as NA.
    toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA"
else:
    # Report line template and the sklearn scorer that fills it, in output
    # order. Each scorer receives the headers as its first argument and the
    # cluster labels as its second.
    report_rows = (
        ("completeness_score : {}\n", metrics.completeness_score),
        ("homogeneity_score : {}\n", metrics.homogeneity_score),
        ("adjusted_rand_score : {}\n", metrics.adjusted_rand_score),
        ("adjusted_mutual_info_score : {}\n", metrics.adjusted_mutual_info_score),
        ("v_measure_score : {}\n", metrics.v_measure_score),
    )
    toWrite = "".join(
        template.format(scorer(listOfHeaders, listOfClusters))
        for template, scorer in report_rows
    )

evaluation_path = os.path.join(results_dir, "evaluation.txt")
with open(evaluation_path, "w") as fOut:
    fOut.write(toWrite)

# Pack the contents of results_dir into a zip archive with base name
# "RESULTS".
make_archive('RESULTS', 'zip', root_dir=results_dir)