# HG changeset patch # User davidvanzessen # Date 1500302680 14400 # Node ID ee807645b2241760ef454cbf81682280d4fa67b7 # Parent 6cd12c71c3d3b0cf490ebcc53e91e75e5a56307a Uploaded diff -r 6cd12c71c3d3 -r ee807645b224 check_unique_id.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/check_unique_id.r Mon Jul 17 10:44:40 2017 -0400 @@ -0,0 +1,25 @@ +args <- commandArgs(trailingOnly = TRUE) #first argument must be the summary file so it can grab the + +current_file = args[1] + +current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F) + +if(!("Sequence number" %in% names(current))){ + stop("First argument doesn't contain the 'Sequence number' column") +} + +tbl = table(current$Sequence.ID) +l_tbl = length(tbl) +check = any(tbl > 1) + +#if(l_tbl != nrow(current)){ # non unique IDs? +if(check){ + print("Sequence.ID is not unique for every sequence, adding sequence number to IDs") + for(i in 1:length(args)){ + current_file = args[i] + print(paste("Appending 'Sequence number' column to 'Sequence ID' column in", current_file)) + current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F) + current[,"Sequence ID"] = paste(current[,"Sequence ID"], current[,"Sequence number"], sep="_") + write.table(x = current, file = current_file, quote = F, sep = "\t", na = "", row.names = F, col.names = T) + } +} diff -r 6cd12c71c3d3 -r ee807645b224 merge_and_filter.r --- a/merge_and_filter.r Wed Jun 14 11:14:00 2017 -0400 +++ b/merge_and_filter.r Mon Jul 17 10:44:40 2017 -0400 @@ -41,6 +41,11 @@ return(df) } +fix_non_unique_ids = function(df){ + df$Sequence.ID = paste(df$Sequence.ID, 1:nrow(df)) + return(df) +} + summ = fix_column_names(summ) sequences = fix_column_names(sequences) mutationanalysis = fix_column_names(mutationanalysis) @@ -79,6 +84,8 @@ summ = merge(summ, gene_identification, by="Sequence.ID") +print(paste("Number of sequences after merging with gene identification:", nrow(summ))) + summ = summ[summ$Functionality != "No results",] print(paste("Number of sequences after 'No results' filter:", nrow(summ))) @@ -99,15 +106,21 @@ if(F){ #to speed up debugging set.seed(1) - summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] - print(paste("Number of sequences after sampling 5%:", nrow(summ))) + summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.03)),] + print(paste("Number of sequences after sampling 3%:", nrow(summ))) - filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) + filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 3%", nrow(summ))) } print("mutation analysis files columns") print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])])) +print(head(summ$Sequence.ID)) + +print("_-------------------------------------") + +print(head(mutationanalysis$Sequence.ID)) + result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) diff -r 6cd12c71c3d3 -r ee807645b224 shm_csr.xml --- a/shm_csr.xml Wed Jun 14 11:14:00 2017 -0400 +++ b/shm_csr.xml Mon Jul 17 10:44:40 2017 -0400 @@ -1,5 +1,12 @@ + + r-seqinr + r-ggplot2 + r-reshape2 + r-scales + r-data.table + #if str ( $filter_unique.filter_unique_select ) == "remove": wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast @@ -8,7 +15,7 @@ #end if - + @@ -29,6 +36,8 @@ + + @@ -51,12 +60,20 @@ + + + + + + + + @@ -86,10 +103,12 @@ class_filter_cond['class_filter'] == "101_101" - - 10.1093/nar/gks457 - 10.1093/bioinformatics/btv359 - + + + + + + + + 10.1093/nar/gks457 + 10.1093/bioinformatics/btv359 + diff -r 6cd12c71c3d3 -r ee807645b224 wrapper.sh --- a/wrapper.sh Wed Jun 14 11:14:00 2017 -0400 +++ b/wrapper.sh Mon Jul 17 10:44:40 2017 -0400 @@ -41,6 +41,10 @@ echo "tar -xJf $input -C $PWD/files/" mkdir -p "$PWD/files/$title" tar -xJf $input -C "$PWD/files/$title" +else + echo "Unrecognized format $type" + echo "Unrecognized format $type" > $log + exit 1 fi cat "`find $PWD/files/ -name "1_*"`" > $PWD/summary.txt @@ -52,6 +56,10 @@ cat "`find $PWD/files/ -name "8_*"`" > $PWD/mutationstats.txt cat "`find $PWD/files/ -name "10_*"`" > $PWD/hotspots.txt +echo "---------------- unique id check ----------------" + +Rscript $dir/check_unique_id.r $PWD/summary.txt $PWD/sequences.txt $PWD/gapped_aa.txt $PWD/aa.txt $PWD/junction.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt + if [[ ${#BLASTN_DIR} -ge 5 ]] ; then echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}" else @@ -69,7 +77,7 @@ Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1 -if [[ "$fast" == "no" ]] ; then +if [[ "${naive_output}" == "yes" ]] ; then echo "---------------- creating new IMGT zips ----------------" echo "---------------- creating new IMGT zips ----------------
" >> $log