Mercurial > repos > rnateam > graphclust_postprocessing

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/addCdhitseqs.py	Wed Jan 04 18:15:07 2017 -0500
@@ -0,0 +1,59 @@
+import re
+import glob
+import sys
+
+# Usage: python addCdhitseqs.py <cd-hit cluster file>
+# Appends the sequences that CD-HIT merged into a representative to every
+# RESULTS/*.cluster.all file whose text contains that representative's id.
+if len(sys.argv) < 2:
+    sys.exit("usage: addCdhitseqs.py <cdhit_cluster_file>")
+cdhitcluster = sys.argv[1]
+
+cluster_seqs_stats_path = "RESULTS/*.cluster.all"
+cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
+
+# Representative id -> ids of the other entries listed in the same cluster.
+# The first entry after a ">Cluster" header is taken as the representative.
+# NOTE(review): CD-HIT also marks it with a trailing "*"; confirm the input
+# always lists the representative first.
+repSeqRedSeqdict = {}
+rep_FullId = None
+in_cluster = False
+
+# Read-only access is enough here; the input may not be writable.
+with open(cdhitcluster, 'r') as f:
+    for line in f:
+        # Headers start the line; entry lines carry ">id" further right.
+        if line.startswith(">Cluster"):
+            in_cluster = True
+            rep_FullId = None
+            continue
+        fields = line.split()
+        # Skip blank/short lines and anything before the first header.
+        if not in_cluster or len(fields) < 3:
+            continue
+        # The third column looks like ">id..."; strip both decorations.
+        full_id = fields[2].replace(">", "").replace("...", "")
+        if rep_FullId is None:
+            rep_FullId = full_id
+        else:
+            repSeqRedSeqdict.setdefault(rep_FullId, []).append(full_id)
+
+# Columns 1-3 and 6 of each file's first line are copied into every new row.
+row_fmt = "%s  %s  %s   -    CD-Hit    %s  ORIGID  %s\n"
+for singleFile in sorted(cluster_seqs_stats_files):
+    # Read through a separate handle: a file opened with "a+" is positioned
+    # at EOF on Python 3, so reading from it would return nothing.
+    with open(singleFile, "r") as clFile:
+        file_content = clFile.read()
+    cols = file_content.split('\n')[0].split()
+    # Rebuilt for every file so rows never carry over into the next one.
+    toWrite = "".join(row_fmt % (cols[0], cols[1], cols[2], cols[5], seq)
+                      for key, val in repSeqRedSeqdict.items()
+                      if key in file_content
+                      for seq in val)
+    if toWrite:
+        if not file_content.endswith("\n"):
+            toWrite = "\n" + toWrite  # do not glue onto an unterminated row
+        with open(singleFile, "a") as clFile:
+            clFile.write(toWrite)
--- a/evaluation.py	Sat Dec 24 18:08:36 2016 -0500
+++ b/evaluation.py	Wed Jan 04 18:15:07 2017 -0500
@@ -49,16 +49,10 @@
     full.write(toWrite)


-listOfClasses = []
-listOfClusters = []
 pattern = re.compile("^RF.*$")


 if len(listOfClasses) > 0 and  pattern.match(str(listOfClasses[0])):
-    with open("RESULTS/fullTab.tabular", "r") as tabF:
-        for line in tabF.readlines():
-            listOfClasses.append(line.split()[0])
-            listOfClusters.append(line.split()[1])

     completeness_score = metrics.completeness_score(listOfClasses, listOfClusters)
     homogeneity_score = metrics.homogeneity_score(listOfClasses, listOfClusters)
--- a/glob_report.xml	Sat Dec 24 18:08:36 2016 -0500
+++ b/glob_report.xml	Wed Jan 04 18:15:07 2017 -0500
@@ -9,7 +9,6 @@
 	</stdio>
 	<command>
 		<![CDATA[
-
         unzip $FASTA  &> /dev/null &&

         #set $inputFiles = ""
@@ -25,17 +24,29 @@
             #set $inputFilesTrees += str($mods)+','
         #end for
         #set $inputFilesTrees = $inputFilesTrees[:-1]
-
-		    'glob_res.pl' '$inputFiles' $merge_cluster_ol $merge_overlap $min_cluster_size $cm_min_bitscore $cm_max_eval $cm_bitscore_sig $partition_type '' $cut_type '$inputFilesTrees'
+		    glob_res.pl
+                '$inputFiles'
+                $merge_cluster_ol
+                $merge_overlap
+                $min_cluster_size
+                $cm_min_bitscore
+                $cm_max_eval
+                $cm_bitscore_sig
+                $partition_type ''
+                $cut_type
+                '$inputFilesTrees'
         #if  $iteration_num.iteration_num_selector:
           $iteration_num.CI
-
           $final_partition_soft
           $final_partition_used_cmsearch
         #end if

         &&
         python '$__tool_directory__/evaluation.py'
+		#if $cdhit:
+    		&&
+	    	python '$__tool_directory__/addCdhitseqs.py' '$cdhit'
+	    #end if
 ]]>
 	</command>
 	<inputs>
@@ -44,6 +55,7 @@
 		<param type="data" name="model_tree_files" format="txt" multiple="True"/>
 		<param name="partition_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Hard partition"/>
 		<param name="cut_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Use CM score for cutoff" help="otherwise use E-value"/>
+		<param type="data" name="cdhit" format="txt" optional="true"/>
 		<conditional name="iteration_num">
 			<param name="iteration_num_selector" type="boolean"  checked="no" label="Multiple iterations"  help="for single iteration- NO, for multiple-YES"/>
 			<when value="true">