Mercurial > repos > rnateam > graphclust_postprocessing
changeset 3:79b9117aef01 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit c03cf64554289eb098267c0923cf0cf7b245cc0c
author | rnateam |
---|---|
date | Wed, 04 Jan 2017 18:15:07 -0500 |
parents | b8e32e577597 |
children | 4a9754d476fe |
files | addCdhitseqs.py evaluation.py glob_report.xml |
diffstat | 3 files changed, 75 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
# addCdhitseqs.py -- new file in this changeset
# (diff header: --- /dev/null  +++ b/addCdhitseqs.py  @@ -0,0 +1,59 @@)
"""Append CD-HIT redundant sequences to the cluster result files.

Usage: python addCdhitseqs.py <cdhit_cluster_file>

The script reads a CD-HIT cluster listing, collects for every cluster the
ids of the sequences listed after the first member, and appends one record
per such sequence to every ``RESULTS/*.cluster.all`` file whose content
mentions the id of that cluster's first member.

NOTE(review): the first member line after a ``>Cluster`` header is treated
as the cluster representative, exactly as the original code did; confirm
that this holds for the CD-HIT options used upstream.
"""
import glob
import re
import sys

# Glob (relative to the working directory) selecting the files to extend.
CLUSTER_SEQS_STATS_PATH = "RESULTS/*.cluster.all"


def _extract_id(line):
    """Return the sequence id in the third whitespace field of *line*.

    The ``>`` marker and the ``...`` suffix are removed from the field.
    Returns ``None`` when the line has fewer than three fields (for example
    a blank line), so that callers can skip it instead of crashing.
    """
    fields = line.split()
    if len(fields) < 3:
        return None
    return fields[2].replace(">", "").replace("...", "")


def parse_cdhit_clusters(lines):
    """Map each cluster's first sequence id to the ids that follow it.

    *lines* is any iterable of text lines (an open file works).  A line
    starting with ``>Cluster`` opens a new cluster; the first usable line
    after it names the representative and every further usable line of the
    same cluster names a redundant sequence.  Clusters without redundant
    sequences do not appear in the result.  Lines before the first header
    and lines with fewer than three fields are ignored.
    """
    redundant_by_rep = {}
    in_cluster = False
    rep_id = None
    members = []
    for line in lines:
        # Anchor at the start of the line: a member line whose sequence id
        # merely begins with "Cluster" must not be taken for a header.
        if line.lstrip().startswith(">Cluster"):
            in_cluster = True
            rep_id = None
            continue
        if not in_cluster:
            continue
        seq_id = _extract_id(line)
        if seq_id is None:
            continue
        if rep_id is None:
            rep_id = seq_id
            members = []
        else:
            members.append(seq_id)
            redundant_by_rep[rep_id] = members
    return redundant_by_rep


def build_cdhit_records(file_content, redundant_by_rep):
    """Return the text to append to one cluster file ('' if nothing).

    For every key of *redundant_by_rep* that occurs in *file_content*, one
    line per redundant id is produced.  Fields 0, 1, 2 and 5 of the first
    line of *file_content* are copied into each record.

    Raises ValueError when a record has to be written but the first line of
    the file has fewer than six whitespace-separated fields.
    """
    # NOTE(review): this is a plain substring test, as in the original, so
    # an id that is a prefix of another id (e.g. "SEQ1" / "SEQ10") also
    # matches; confirm whether a whole-field comparison is wanted.
    redundant_ids = [
        seq_id
        for rep_id, seq_ids in redundant_by_rep.items()
        if rep_id in file_content
        for seq_id in seq_ids
    ]
    if not redundant_ids:
        return ""

    header = file_content.split("\n")[0].split()
    if len(header) < 6:
        raise ValueError(
            "first line of cluster file has fewer than 6 fields: %r"
            % file_content.split("\n")[0]
        )
    # Same spacing as the original concatenation (two blanks around "-").
    prefix = (header[0] + " " + header[1] + " " + header[2]
              + "  -  CD-Hit " + header[5] + " ORIGID ")
    return "".join(prefix + str(seq_id) + "\n" for seq_id in redundant_ids)


def append_cdhit_sequences(cluster_file, redundant_by_rep):
    """Append the CD-HIT records that belong to *cluster_file* to it.

    The file is read first and reopened for appending afterwards.  Reading
    from a handle opened with "a+" starts at end-of-file on Python 3 and
    therefore yields nothing.
    """
    with open(cluster_file, "r") as handle:
        content = handle.read()

    records = build_cdhit_records(content, redundant_by_rep)
    if not records:
        return
    # Do not glue the first new record onto an unterminated last line.
    if not content.endswith("\n"):
        records = "\n" + records
    with open(cluster_file, "a") as handle:
        handle.write(records)


def main(argv=None):
    """Run the tool; ``argv[1]`` is the path of the CD-HIT cluster file."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: addCdhitseqs.py <cdhit_cluster_file>")

    # Read-only access is enough; the cluster listing is never modified.
    with open(argv[1], "r") as cdhit_file:
        redundant_by_rep = parse_cdhit_clusters(cdhit_file)

    # Records are built per file, so nothing leaks from one cluster file
    # into the next one.
    for cluster_file in sorted(glob.glob(CLUSTER_SEQS_STATS_PATH)):
        append_cdhit_sequences(cluster_file, redundant_by_rep)


if __name__ == "__main__":
    main()
--- a/evaluation.py Sat Dec 24 18:08:36 2016 -0500 +++ b/evaluation.py Wed Jan 04 18:15:07 2017 -0500 @@ -49,16 +49,10 @@ full.write(toWrite) -listOfClasses = [] -listOfClusters = [] pattern = re.compile("^RF.*$") if len(listOfClasses) > 0 and pattern.match(str(listOfClasses[0])): - with open("RESULTS/fullTab.tabular", "r") as tabF: - for line in tabF.readlines(): - listOfClasses.append(line.split()[0]) - listOfClusters.append(line.split()[1]) completeness_score = metrics.completeness_score(listOfClasses, listOfClusters) homogeneity_score = metrics.homogeneity_score(listOfClasses, listOfClusters)
--- a/glob_report.xml Sat Dec 24 18:08:36 2016 -0500 +++ b/glob_report.xml Wed Jan 04 18:15:07 2017 -0500 @@ -9,7 +9,6 @@ </stdio> <command> <![CDATA[ - unzip $FASTA &> /dev/null && #set $inputFiles = "" @@ -25,17 +24,29 @@ #set $inputFilesTrees += str($mods)+',' #end for #set $inputFilesTrees = $inputFilesTrees[:-1] - - 'glob_res.pl' '$inputFiles' $merge_cluster_ol $merge_overlap $min_cluster_size $cm_min_bitscore $cm_max_eval $cm_bitscore_sig $partition_type '' $cut_type '$inputFilesTrees' + glob_res.pl + '$inputFiles' + $merge_cluster_ol + $merge_overlap + $min_cluster_size + $cm_min_bitscore + $cm_max_eval + $cm_bitscore_sig + $partition_type '' + $cut_type + '$inputFilesTrees' #if $iteration_num.iteration_num_selector: $iteration_num.CI - $final_partition_soft $final_partition_used_cmsearch #end if && python '$__tool_directory__/evaluation.py' + #if $cdhit: + && + python '$__tool_directory__/addCdhitseqs.py' '$cdhit' + #end if ]]> </command> <inputs> @@ -44,6 +55,7 @@ <param type="data" name="model_tree_files" format="txt" multiple="True"/> <param name="partition_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Hard partition"/> <param name="cut_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Use CM score for cutoff" help="otherwise use E-value"/> + <param type="data" name="cdhit" format="txt" optional="true"/> <conditional name="iteration_num"> <param name="iteration_num_selector" type="boolean" checked="no" label="Multiple iterations" help="for single iteration- NO, for multiple-YES"/> <when value="true">