# HG changeset patch # User davidvanzessen # Date 1494512499 14400 # Node ID f5fe63533c5857588b9de187fa283ac06cae4d2d # Parent c5295dd10dfc1993d756f2f88ca58b07b1fd865d Uploaded diff -r c5295dd10dfc -r f5fe63533c58 merge_and_filter.r --- a/merge_and_filter.r Mon May 08 09:27:27 2017 -0400 +++ b/merge_and_filter.r Thu May 11 10:21:39 2017 -0400 @@ -15,8 +15,11 @@ functionality=args[12] unique.type=args[13] filter.unique=args[14] -class.filter=args[15] -empty.region.filter=args[16] +filter.unique.count=as.numeric(args[15]) +class.filter=args[16] +empty.region.filter=args[17] + +print(paste("filter.unique.count:", filter.unique.count)) summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") @@ -96,7 +99,7 @@ if(FALSE){ #to speed up debugging set.seed(1) - summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] + summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),] print(paste("Number of sequences after sampling 5%:", nrow(summ))) filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) @@ -225,6 +228,12 @@ result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it + if(filter.unique == "remove"){ + unique.defs = data.frame(table(result$unique.def)) + unique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,] + result = result[result$unique.def %in% unique.defs$Var1,] + } + result = result[!duplicated(result$unique.def),] } diff -r c5295dd10dfc -r f5fe63533c58 shm_csr.xml --- a/shm_csr.xml Mon May 08 09:27:27 2017 -0400 +++ b/shm_csr.xml Thu May 11 10:21:39 2017 -0400 @@ -1,7 +1,11 @@ - wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_uniques $class_filter_cond.class_filter $empty_region_filter $fast + #if str ( $filter_unique.filter_unique_select ) == "remove": + wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast + #else: + wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast + #end if @@ -16,11 +20,16 @@ - - - - - + + + + + + + + + + diff -r c5295dd10dfc -r f5fe63533c58 wrapper.sh --- a/wrapper.sh Mon May 08 09:27:27 2017 -0400 +++ b/wrapper.sh Thu May 11 10:21:39 2017 -0400 @@ -17,9 +17,11 @@ naive_output_ce=${13} naive_output_all=${14} filter_unique=${15} -class_filter=${16} -empty_region_filter=${17} -fast=${18} +filter_unique_count=${16} +class_filter=${17} +empty_region_filter=${18} +fast=${19} + mkdir $outdir tar -xzf $dir/style.tar.gz -C $outdir @@ -65,7 +67,7 @@ echo "---------------- merge_and_filter.r ----------------" echo "---------------- merge_and_filter.r ----------------
" >> $log -Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1 +Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1 if [[ "$fast" == "no" ]] ; then