# HG changeset patch # User davidvanzessen # Date 1497453240 14400 # Node ID 6cd12c71c3d3b0cf490ebcc53e91e75e5a56307a # Parent ba3220f921af7bc6a9c18c7ed7582d9cb1fdc78b Uploaded diff -r ba3220f921af -r 6cd12c71c3d3 change_o/select_first_in_clone.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/select_first_in_clone.r Wed Jun 14 11:14:00 2017 -0400 @@ -0,0 +1,16 @@ +args <- commandArgs(trailingOnly = TRUE) + +input.file = args[1] +output.file = args[2] + +print("select_in_first_clone.r") +print(input.file) +print(output.file) + +input = read.table(input.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") + +input = input[!duplicated(input$CLONE),] + +names(input)[1] = "Sequence.ID" + +write.table(input, output.file, quote=F, sep="\t", row.names=F, col.names=T, na="") diff -r ba3220f921af -r 6cd12c71c3d3 merge_and_filter.r --- a/merge_and_filter.r Tue May 30 07:40:15 2017 -0400 +++ b/merge_and_filter.r Wed Jun 14 11:14:00 2017 -0400 @@ -97,9 +97,9 @@ filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) -if(FALSE){ #to speed up debugging +if(F){ #to speed up debugging set.seed(1) - summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),] + summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] print(paste("Number of sequences after sampling 5%:", nrow(summ))) filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) diff -r ba3220f921af -r 6cd12c71c3d3 new_imgt.r --- a/new_imgt.r Tue May 30 07:40:15 2017 -0400 +++ b/new_imgt.r Wed Jun 14 11:14:00 2017 -0400 @@ -8,15 +8,15 @@ if(gene != "-"){ merged = merged[grepl(paste("^", gene, sep=""), merged$best_match),] -} else { +} + +if("best_match" %in% names(merged)){ merged = merged[!grepl("unmatched", merged$best_match),] } -merged = merged[!grepl("unmatched", merged$best_match),] - for(f in list.files(imgt.dir, pattern="*.txt$")){ #print(paste("filtering", f)) - path = paste(imgt.dir, f, sep="") + path = file.path(imgt.dir, f) dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE, comment.char="") dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,] diff -r ba3220f921af -r 6cd12c71c3d3 wrapper.sh --- a/wrapper.sh Tue May 30 07:40:15 2017 -0400 +++ b/wrapper.sh Wed Jun 14 11:14:00 2017 -0400 @@ -555,22 +555,58 @@ bash $dir/change_o/makedb.sh $outdir/new_IMGT.txz false false false $outdir/change_o/change-o-db.txt bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-defined_clones-summary.txt - + Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-db-defined_first_clones.txt 2>&1 + + mkdir $outdir/new_IMGT_changeo + cp $outdir/new_IMGT/* $outdir/new_IMGT_changeo + + Rscript $dir/new_imgt.r $outdir/new_IMGT_changeo $outdir/change_o/change-o-db-defined_first_clones.txt "-" 2>&1 + + cd $outdir/new_IMGT_changeo + tar -cJf ../new_IMGT_first_seq_of_clone.txz * + cd $outdir/change_o + + rm -rf $outdir/new_IMGT_changeo + Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/merged.txt "all" "Sequence.ID,best_match" "SEQUENCE_ID" "Sequence.ID" $outdir/change_o/change-o-db-defined_clones.txt 2>&1 - echo "Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/$outdir/merged.txt 'all' 'Sequence.ID,best_match' 'Sequence.ID' 'Sequence.ID' '\t' $outdir/change_o/change-o-db-defined_clones.txt 2>&1" - + if [[ $(wc -l < $outdir/new_IMGT_IGA/1_Summary.txt) -gt "1" ]]; then bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGA.txz false false false $outdir/change_o/change-o-db-IGA.txt bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGA.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGA.txt $outdir/change_o/change-o-defined_clones-summary-IGA.txt + Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGA.txt $outdir/change_o/change-o-db-defined_first_clones-IGA.txt 2>&1 + + mkdir $outdir/new_IMGT_IGA_changeo + cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA_changeo + + Rscript $dir/new_imgt.r $outdir/new_IMGT_IGA_changeo $outdir/change_o/change-o-db-defined_first_clones-IGA.txt "-" 2>&1 + + cd $outdir/new_IMGT_IGA_changeo + tar -cJf ../new_IMGT_IGA_first_seq_of_clone.txz * + + rm -rf $outdir/new_IMGT_IGA_changeo + + cd $outdir/change_o else echo "No IGA sequences" > "$outdir/change_o/change-o-db-defined_clones-IGA.txt" echo "No IGA sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGA.txt" fi - + if [[ $(wc -l < $outdir/new_IMGT_IGG/1_Summary.txt) -gt "1" ]]; then bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGG.txz false false false $outdir/change_o/change-o-db-IGG.txt bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGG.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGG.txt $outdir/change_o/change-o-defined_clones-summary-IGG.txt + Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGG.txt $outdir/change_o/change-o-db-defined_first_clones-IGG.txt 2>&1 + + mkdir $outdir/new_IMGT_IGG_changeo + cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG_changeo + + Rscript $dir/new_imgt.r $outdir/new_IMGT_IGG_changeo $outdir/change_o/change-o-db-defined_first_clones-IGG.txt "-" 2>&1 + + cd $outdir/new_IMGT_IGG_changeo + tar -cJf ../new_IMGT_IGG_first_seq_of_clone.txz * + rm -rf $outdir/new_IMGT_IGG_changeo + + cd $outdir/change_o else echo "No IGG sequences" > "$outdir/change_o/change-o-db-defined_clones-IGG.txt" echo "No IGG sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGG.txt" @@ -579,6 +615,19 @@ if [[ $(wc -l < $outdir/new_IMGT_IGM/1_Summary.txt) -gt "1" ]]; then bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGM.txz false false false $outdir/change_o/change-o-db-IGM.txt bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGM.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGM.txt $outdir/change_o/change-o-defined_clones-summary-IGM.txt + Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGM.txt $outdir/change_o/change-o-db-defined_first_clones-IGM.txt 2>&1 + + mkdir $outdir/new_IMGT_IGM_changeo + cp $outdir/new_IMGT/* $outdir/new_IMGT_IGM_changeo + + Rscript $dir/new_imgt.r $outdir/new_IMGT_IGM_changeo $outdir/change_o/change-o-db-defined_first_clones-IGM.txt "-" 2>&1 + + cd $outdir/new_IMGT_IGM_changeo + tar -cJf ../new_IMGT_IGM_first_seq_of_clone.txz * + + rm -rf $outdir/new_IMGT_IGM_changeo + + cd $outdir/change_o else echo "No IGM sequences" > "$outdir/change_o/change-o-db-defined_clones-IGM.txt" echo "No IGM sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGM.txt" @@ -587,12 +636,37 @@ if [[ $(wc -l < $outdir/new_IMGT_IGE/1_Summary.txt) -gt "1" ]]; then bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGE.txz false false false $outdir/change_o/change-o-db-IGE.txt bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGE.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGE.txt $outdir/change_o/change-o-defined_clones-summary-IGE.txt + Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGE.txt $outdir/change_o/change-o-db-defined_first_clones-IGE.txt 2>&1 + + mkdir $outdir/new_IMGT_IGE_changeo + cp $outdir/new_IMGT/* $outdir/new_IMGT_IGE_changeo + + Rscript $dir/new_imgt.r $outdir/new_IMGT_IGE_changeo $outdir/change_o/change-o-db-defined_first_clones-IGE.txt "-" 2>&1 + + cd $outdir/new_IMGT_IGE_changeo + tar -cJf ../new_IMGT_IGE_first_seq_of_clone.txz * + + rm -rf $outdir/new_IMGT_IGE_changeo + + cd $outdir/change_o else echo "No IGE sequences" > "$outdir/change_o/change-o-db-defined_clones-IGE.txt" echo "No IGE sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGE.txt" fi - PWD="$tmp" + cd "$tmp" + + rm -rf $outdir/new_IMGT + rm -rf $outdir/new_IMGT_IGA/ + rm -rf $outdir/new_IMGT_IGA1/ + rm -rf $outdir/new_IMGT_IGA2/ + rm -rf $outdir/new_IMGT_IGG/ + rm -rf $outdir/new_IMGT_IGG1/ + rm -rf $outdir/new_IMGT_IGG2/ + rm -rf $outdir/new_IMGT_IGG3/ + rm -rf $outdir/new_IMGT_IGG4/ + rm -rf $outdir/new_IMGT_IGM/ + rm -rf $outdir/new_IMGT_IGE/ echo "