changeset 49:f5fe63533c58 draft

Uploaded
author davidvanzessen
date Thu, 11 May 2017 10:21:39 -0400
parents c5295dd10dfc
children 75ee66a691a0
files merge_and_filter.r shm_csr.xml wrapper.sh
diffstat 3 files changed, 33 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Mon May 08 09:27:27 2017 -0400
+++ b/merge_and_filter.r	Thu May 11 10:21:39 2017 -0400
@@ -15,8 +15,11 @@
 functionality=args[12]
 unique.type=args[13]
 filter.unique=args[14]
-class.filter=args[15]
-empty.region.filter=args[16]
+filter.unique.count=as.numeric(args[15])
+class.filter=args[16]
+empty.region.filter=args[17]
+
+print(paste("filter.unique.count:", filter.unique.count))
 
 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
@@ -96,7 +99,7 @@
 
 if(FALSE){ #to speed up debugging
     set.seed(1)
-    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),]
+    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),]
     print(paste("Number of sequences after sampling 5%:", nrow(summ)))
 
     filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
@@ -225,6 +228,12 @@
 	
 	result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it
 	
+	if(filter.unique == "remove"){
+        unique.defs = data.frame(table(result$unique.def))
+        unique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,]
+        result = result[result$unique.def %in% unique.defs$Var1,]
+	} 
+	
 	result = result[!duplicated(result$unique.def),]
 }
 
--- a/shm_csr.xml	Mon May 08 09:27:27 2017 -0400
+++ b/shm_csr.xml	Thu May 11 10:21:39 2017 -0400
@@ -1,7 +1,11 @@
 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">
 	<description></description>
 	<command interpreter="bash">
-		wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_uniques $class_filter_cond.class_filter $empty_region_filter $fast
+		#if str ( $filter_unique.filter_unique_select ) == "remove":
+			wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast
+		#else:
+			wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast
+		#end if
 	</command>
 	<inputs>
 		<param name="in_file" type="data" label="IMGT zip file to be analysed" />
@@ -16,11 +20,16 @@
 			<option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>
 			<option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>
 		</param>
-		<param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
-			<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>
-			<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
-			<option value="no">No</option>
-		</param>
+        <conditional name="filter_unique">
+			<param name="filter_unique_select" type="select" label="Filter unique sequences" help="See below for an example.">
+				<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>
+				<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
+				<option value="no">No</option>
+			</param>
+			<when value="remove">
+				<param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>
+			</when>
+		</conditional>
 		<param name="unique" type="select" label="Remove duplicates based on" help="" >
 			<option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>
 			<option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>
--- a/wrapper.sh	Mon May 08 09:27:27 2017 -0400
+++ b/wrapper.sh	Thu May 11 10:21:39 2017 -0400
@@ -17,9 +17,11 @@
 naive_output_ce=${13}
 naive_output_all=${14}
 filter_unique=${15}
-class_filter=${16}
-empty_region_filter=${17}
-fast=${18}
+filter_unique_count=${16}
+class_filter=${17}
+empty_region_filter=${18}
+fast=${19}
+
 mkdir $outdir
 
 tar -xzf $dir/style.tar.gz -C $outdir
@@ -65,7 +67,7 @@
 echo "---------------- merge_and_filter.r ----------------"
 echo "---------------- merge_and_filter.r ----------------<br />" >> $log
 
-Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1
+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1
 
 if [[ "$fast" == "no" ]] ; then