changeset 55:6cd12c71c3d3 draft

Uploaded
author davidvanzessen
date Wed, 14 Jun 2017 11:14:00 -0400
parents ba3220f921af
children ee807645b224
files change_o/select_first_in_clone.r merge_and_filter.r new_imgt.r wrapper.sh
diffstat 4 files changed, 111 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/select_first_in_clone.r	Wed Jun 14 11:14:00 2017 -0400
@@ -0,0 +1,16 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input.file = args[1]
+output.file = args[2]
+
+print("select_first_in_clone.r")
+print(input.file)
+print(output.file)
+
+input = read.table(input.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
+
+input = input[!duplicated(input$CLONE),]
+
+names(input)[1] = "Sequence.ID"
+
+write.table(input, output.file, quote=F, sep="\t", row.names=F, col.names=T, na="")
--- a/merge_and_filter.r	Tue May 30 07:40:15 2017 -0400
+++ b/merge_and_filter.r	Wed Jun 14 11:14:00 2017 -0400
@@ -97,9 +97,9 @@
 
 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))
 
-if(FALSE){ #to speed up debugging
+if(F){ #to speed up debugging
     set.seed(1)
-    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),]
+    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),]
     print(paste("Number of sequences after sampling 5%:", nrow(summ)))
 
     filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
--- a/new_imgt.r	Tue May 30 07:40:15 2017 -0400
+++ b/new_imgt.r	Wed Jun 14 11:14:00 2017 -0400
@@ -8,15 +8,15 @@
 
 if(gene != "-"){
 	merged = merged[grepl(paste("^", gene, sep=""), merged$best_match),]
-} else {
+}
+
+if("best_match" %in% names(merged)){
 	merged = merged[!grepl("unmatched", merged$best_match),]
 }
 
-merged = merged[!grepl("unmatched", merged$best_match),]
-
 for(f in list.files(imgt.dir, pattern="*.txt$")){
 	#print(paste("filtering", f))
-	path = paste(imgt.dir, f, sep="")
+	path = file.path(imgt.dir, f)
 	dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE, comment.char="")
 	
 	dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,]
--- a/wrapper.sh	Tue May 30 07:40:15 2017 -0400
+++ b/wrapper.sh	Wed Jun 14 11:14:00 2017 -0400
@@ -555,22 +555,58 @@
 
 	bash $dir/change_o/makedb.sh $outdir/new_IMGT.txz false false false $outdir/change_o/change-o-db.txt
 	bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-defined_clones-summary.txt
-
+	Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-db-defined_first_clones.txt 2>&1
+	
+	mkdir $outdir/new_IMGT_changeo
+	cp $outdir/new_IMGT/* $outdir/new_IMGT_changeo
+	
+	Rscript $dir/new_imgt.r $outdir/new_IMGT_changeo $outdir/change_o/change-o-db-defined_first_clones.txt "-" 2>&1
+	
+	cd $outdir/new_IMGT_changeo
+	tar -cJf ../new_IMGT_first_seq_of_clone.txz *
+	cd $outdir/change_o
+	
+	rm -rf $outdir/new_IMGT_changeo
+	
 	Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/merged.txt "all" "Sequence.ID,best_match" "SEQUENCE_ID" "Sequence.ID" $outdir/change_o/change-o-db-defined_clones.txt 2>&1
-
 	echo "Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/$outdir/merged.txt 'all' 'Sequence.ID,best_match' 'Sequence.ID' 'Sequence.ID' '\t' $outdir/change_o/change-o-db-defined_clones.txt 2>&1"
-
+	
 	if [[ $(wc -l < $outdir/new_IMGT_IGA/1_Summary.txt) -gt "1" ]]; then
 		bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGA.txz false false false $outdir/change_o/change-o-db-IGA.txt
 		bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGA.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGA.txt $outdir/change_o/change-o-defined_clones-summary-IGA.txt
+		Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGA.txt $outdir/change_o/change-o-db-defined_first_clones-IGA.txt 2>&1
+		
+		mkdir $outdir/new_IMGT_IGA_changeo
+		cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA_changeo
+		
+		Rscript $dir/new_imgt.r $outdir/new_IMGT_IGA_changeo $outdir/change_o/change-o-db-defined_first_clones-IGA.txt "-" 2>&1
+		
+		cd $outdir/new_IMGT_IGA_changeo
+		tar -cJf ../new_IMGT_IGA_first_seq_of_clone.txz *
+		
+		rm -rf $outdir/new_IMGT_IGA_changeo
+		
+		cd $outdir/change_o
 	else
 		echo "No IGA sequences" > "$outdir/change_o/change-o-db-defined_clones-IGA.txt"
 		echo "No IGA sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGA.txt"
 	fi
-
+	
 	if [[ $(wc -l < $outdir/new_IMGT_IGG/1_Summary.txt) -gt "1" ]]; then
 		bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGG.txz false false false $outdir/change_o/change-o-db-IGG.txt
 		bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGG.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGG.txt $outdir/change_o/change-o-defined_clones-summary-IGG.txt
+		Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGG.txt $outdir/change_o/change-o-db-defined_first_clones-IGG.txt 2>&1
+		
+		mkdir $outdir/new_IMGT_IGG_changeo
+		cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG_changeo
+		
+		Rscript $dir/new_imgt.r $outdir/new_IMGT_IGG_changeo $outdir/change_o/change-o-db-defined_first_clones-IGG.txt "-" 2>&1
+		
+		cd $outdir/new_IMGT_IGG_changeo
+		tar -cJf ../new_IMGT_IGG_first_seq_of_clone.txz *
+		rm -rf $outdir/new_IMGT_IGG_changeo
+		
+		cd $outdir/change_o
 	else
 		echo "No IGG sequences" > "$outdir/change_o/change-o-db-defined_clones-IGG.txt"
 		echo "No IGG sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGG.txt"
@@ -579,6 +615,19 @@
 	if [[ $(wc -l < $outdir/new_IMGT_IGM/1_Summary.txt) -gt "1" ]]; then
 		bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGM.txz false false false $outdir/change_o/change-o-db-IGM.txt
 		bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGM.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGM.txt $outdir/change_o/change-o-defined_clones-summary-IGM.txt
+		Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGM.txt $outdir/change_o/change-o-db-defined_first_clones-IGM.txt 2>&1
+		
+		mkdir $outdir/new_IMGT_IGM_changeo
+		cp $outdir/new_IMGT/* $outdir/new_IMGT_IGM_changeo
+		
+		Rscript $dir/new_imgt.r $outdir/new_IMGT_IGM_changeo $outdir/change_o/change-o-db-defined_first_clones-IGM.txt "-" 2>&1
+		
+		cd $outdir/new_IMGT_IGM_changeo
+		tar -cJf ../new_IMGT_IGM_first_seq_of_clone.txz *
+		
+		rm -rf $outdir/new_IMGT_IGM_changeo
+		
+		cd $outdir/change_o
 	else
 		echo "No IGM sequences" > "$outdir/change_o/change-o-db-defined_clones-IGM.txt"
 		echo "No IGM sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGM.txt"
@@ -587,12 +636,37 @@
 	if [[ $(wc -l < $outdir/new_IMGT_IGE/1_Summary.txt) -gt "1" ]]; then
 		bash $dir/change_o/makedb.sh $outdir/new_IMGT_IGE.txz false false false $outdir/change_o/change-o-db-IGE.txt
 		bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-IGE.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-IGE.txt $outdir/change_o/change-o-defined_clones-summary-IGE.txt
+		Rscript $dir/change_o/select_first_in_clone.r $outdir/change_o/change-o-db-defined_clones-IGE.txt $outdir/change_o/change-o-db-defined_first_clones-IGE.txt 2>&1
+		
+		mkdir $outdir/new_IMGT_IGE_changeo
+		cp $outdir/new_IMGT/* $outdir/new_IMGT_IGE_changeo
+		
+		Rscript $dir/new_imgt.r $outdir/new_IMGT_IGE_changeo $outdir/change_o/change-o-db-defined_first_clones-IGE.txt "-" 2>&1
+		
+		cd $outdir/new_IMGT_IGE_changeo
+		tar -cJf ../new_IMGT_IGE_first_seq_of_clone.txz *
+		
+		rm -rf $outdir/new_IMGT_IGE_changeo
+		
+		cd $outdir/change_o
 	else
 		echo "No IGE sequences" > "$outdir/change_o/change-o-db-defined_clones-IGE.txt"
 		echo "No IGE sequences" > "$outdir/change_o/change-o-defined_clones-summary-IGE.txt"
 	fi
 
-	PWD="$tmp"
+	cd "$tmp"
+	
+	rm -rf $outdir/new_IMGT
+	rm -rf $outdir/new_IMGT_IGA/
+	rm -rf $outdir/new_IMGT_IGA1/
+	rm -rf $outdir/new_IMGT_IGA2/
+	rm -rf $outdir/new_IMGT_IGG/
+	rm -rf $outdir/new_IMGT_IGG1/
+	rm -rf $outdir/new_IMGT_IGG2/
+	rm -rf $outdir/new_IMGT_IGG3/
+	rm -rf $outdir/new_IMGT_IGG4/
+	rm -rf $outdir/new_IMGT_IGM/
+	rm -rf $outdir/new_IMGT_IGE/
 
 	echo "<div class='tabbertab' title='Clonal Relation' style='width: 7000px;'>" >> $output #clonality tab
 
@@ -712,18 +786,28 @@
 echo "<tr><td>The data for the IGA subclass distribution plot</td><td><a href='IGA_pie.txt' download='IGA_pie.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The data for the IGG subclass distribution plot</td><td><a href='IGG_pie.txt' download='IGG_pie.txt' >Download</a></td></tr>" >> $output
 
+
 echo "<tr><td colspan='2' style='background-color:#E0E0E0;'>Clonal Relation</td></tr>" >> $output
 echo "<tr><td>Sequence overlap between subclasses</td><td><a href='sequence_overview/index.html'>View</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB file with defined clones and subclass annotation</td><td><a href='change_o/change-o-db-defined_clones.txt' download='change_o/change-o-db-defined_clones.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB defined clones summary file</td><td><a href='change_o/change-o-defined_clones-summary.txt' download='change_o/change-o-defined_clones-summary.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>An IMGT archive with just the first sequence of a clone</td><td><a href='new_IMGT_first_seq_of_clone.txz' download='new_IMGT_first_seq_of_clone.txz' >Download</a></td></tr>" >> $output
+
 echo "<tr><td>The Change-O DB file with defined clones of IGA</td><td><a href='change_o/change-o-db-defined_clones-IGA.txt' download='change_o/change-o-db-defined_clones-IGA.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB defined clones summary file of IGA</td><td><a href='change_o/change-o-defined_clones-summary-IGA.txt' download='change_o/change-o-defined_clones-summary-IGA.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>An IMGT archive with just the first sequence of a clone (IGA)</td><td><a href='new_IMGT_IGA_first_seq_of_clone.txz' download='new_IMGT_IGA_first_seq_of_clone.txz' >Download</a></td></tr>" >> $output
+
 echo "<tr><td>The Change-O DB file with defined clones of IGG</td><td><a href='change_o/change-o-db-defined_clones-IGG.txt' download='change_o/change-o-db-defined_clones-IGG.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB defined clones summary file of IGG</td><td><a href='change_o/change-o-defined_clones-summary-IGG.txt' download='change_o/change-o-defined_clones-summary-IGG.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>An IMGT archive with just the first sequence of a clone (IGG)</td><td><a href='new_IMGT_IGG_first_seq_of_clone.txz' download='new_IMGT_IGG_first_seq_of_clone.txz' >Download</a></td></tr>" >> $output
+
 echo "<tr><td>The Change-O DB file with defined clones of IGM</td><td><a href='change_o/change-o-db-defined_clones-IGM.txt' download='change_o/change-o-db-defined_clones-IGM.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB defined clones summary file of IGM</td><td><a href='change_o/change-o-defined_clones-summary-IGM.txt' download='change_o/change-o-defined_clones-summary-IGM.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>An IMGT archive with just the first sequence of a clone (IGM)</td><td><a href='new_IMGT_IGM_first_seq_of_clone.txz' download='new_IMGT_IGM_first_seq_of_clone.txz' >Download</a></td></tr>" >> $output
+
 echo "<tr><td>The Change-O DB file with defined clones of IGE</td><td><a href='change_o/change-o-db-defined_clones-IGE.txt' download='change_o/change-o-db-defined_clones-IGE.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The Change-O DB defined clones summary file of IGE</td><td><a href='change_o/change-o-defined_clones-summary-IGE.txt' download='change_o/change-o-defined_clones-summary-IGE.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>An IMGT archive with just the first sequence of a clone (IGE)</td><td><a href='new_IMGT_IGE_first_seq_of_clone.txz' download='new_IMGT_IGE_first_seq_of_clone.txz' >Download</a></td></tr>" >> $output
 
 echo "<tr><td colspan='2' style='background-color:#E0E0E0;'>Filtered IMGT output files</td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered sequences</td><td><a href='new_IMGT.txz' download='new_IMGT.txz' >Download</a></td></tr>" >> $output