Mercurial > repos > davidvanzessen > shm_csr

--- a/merge_and_filter.r	Thu Dec 22 09:39:27 2016 -0500
+++ b/merge_and_filter.r	Thu Dec 29 07:05:45 2016 -0500
@@ -47,8 +47,8 @@
 filtering.steps[,2] = as.character(filtering.steps[,2])
 #filtering.steps[,3] = as.numeric(filtering.steps[,3])

-print("summary files columns")
-print(names(summ))
+#print("summary files columns")
+#print(names(summ))

 summ = merge(summ, gene_identification, by="Sequence.ID")

@@ -171,7 +171,6 @@

 if(filter.unique != "no"){
 	clmns = names(result)
-
 	if(empty.region.filter == "leader"){
 		result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
 	} else if(empty.region.filter == "FR1"){
@@ -185,6 +184,7 @@
 	if(filter.unique == "remove"){
 		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
 	}
+
 	result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it

 	result = result[!duplicated(result$unique.def),]
@@ -194,16 +194,21 @@

 filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))

+print(paste("Number of sequences in result after unique filtering:", nrow(result)))
+
 if(nrow(summ) == 0){
 	stop("No data remaining after filter")
 }

 result$best_match_class = gsub(",.*", "", result$best_match) #gsub so the unmatched don't have a class after it

-result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
+#result$past = ""
+#cls = unlist(strsplit(unique.type, ","))
+#for (i in 1:nrow(result)){
+#	result[i,"past"] = paste(result[i,cls], collapse=":")
+#}

-
-
+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))

 result.matched = result[!grepl("unmatched", result$best_match),]
 result.unmatched = result[grepl("unmatched", result$best_match),]
--- a/shm_csr.py	Thu Dec 22 09:39:27 2016 -0500
+++ b/shm_csr.py	Thu Dec 29 07:05:45 2016 -0500
@@ -38,7 +38,7 @@
 cdr1LengthDic = {}
 cdr2LengthDic = {}

-with open(infile, 'r') as i:
+with open(infile, 'ru') as i:
 	for line in i:
 		if first:
 			linesplt = line.split("\t")
@@ -80,6 +80,8 @@

 		IDlist += [ID]

+print mutationList, linecount
+
 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
 if AALength < 60:
 	AALength = 64
@@ -173,7 +175,7 @@
 aggctatIndex = 0
 atagcctIndex = 0
 first = True
-with open(infile, 'r') as i:
+with open(infile, 'ru') as i:
 	for line in i:
 		if first:
 			linesplt = line.split("\t")
--- a/wrapper.sh	Thu Dec 22 09:39:27 2016 -0500
+++ b/wrapper.sh	Thu Dec 29 07:05:45 2016 -0500
@@ -43,6 +43,7 @@

 cat "`find $PWD/files/ -name "1_*"`" > $PWD/summary.txt
 cat "`find $PWD/files/ -name "3_*"`" > $PWD/sequences.txt
+cat "`find $PWD/files/ -name "4_*"`" > $PWD/gapped_aa.txt
 cat "`find $PWD/files/ -name "5_*"`" > $PWD/aa.txt
 cat "`find $PWD/files/ -name "6_*"`" > $PWD/junction.txt
 cat "`find $PWD/files/ -name "7_*"`" > $PWD/mutationanalysis.txt
@@ -64,7 +65,7 @@
 echo "---------------- merge_and_filter.r ----------------"
 echo "---------------- merge_and_filter.r ----------------<br />" >> $log

-Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $PWD/aa.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1
+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1

 if [[ "$fast" == "no" ]] ; then