diff merge_and_filter.r @ 5:012a738edf5a draft

Uploaded
author davidvanzessen
date Tue, 01 Nov 2016 10:15:37 -0400
parents 275ab5175fd6
children 372ccdcf0b2d
line wrap: on
line diff
--- a/merge_and_filter.r	Mon Oct 31 05:05:26 2016 -0400
+++ b/merge_and_filter.r	Tue Nov 01 10:15:37 2016 -0400
@@ -141,44 +141,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
-
-if(filter.unique != "no"){
-	clmns = names(result)
-	
-	if(empty.region.filter == "leader"){
-		result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-	} else if(empty.region.filter == "FR1"){
-		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-	} else if(empty.region.filter == "CDR1"){
-		rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-	} else if(empty.region.filter == "FR2"){
-		result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-	}
-	
-	if(grepl("_c", filter.unique)){
-		result$unique.def = paste(result$unique.def, result$best_match)
-	}
-
-	#fltr = result$unique.def %in% result.filtered$unique.def
-
-	if(grepl("keep", filter.unique)){
-		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
-		result = result[!duplicated(result$unique.def),]
-	} else {
-		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
-		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
-		result = result[!duplicated(result$unique.def),]
-	}
-	
-	#result = result[,clmns]
-	
-	#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
-
-
 splt = strsplit(class.filter, "_")[[1]]
 chunk_hit_threshold = as.numeric(splt[1])
 nt_hit_threshold = as.numeric(splt[2])
@@ -198,10 +160,39 @@
 	result$best_match = "all"
 }
 
-if(any(higher_than, na.rm=T)){
-	#summ = summ[higher_than,]
+write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
+
+if(filter.unique != "no"){
+	clmns = names(result)
+	
+	if(empty.region.filter == "leader"){
+		result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+	} else if(empty.region.filter == "FR1"){
+		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+	} else if(empty.region.filter == "CDR1"){
+		result$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+	} else if(empty.region.filter == "FR2"){
+		result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+	}
+
+	if(grepl("_c", filter.unique)){
+		result$unique.def = paste(result$unique.def, result$best_match)
+	}
+
+	if(grepl("keep", filter.unique)){
+		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+		result = result[!duplicated(result$unique.def),]
+	} else {
+		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+		result = result[!duplicated(result$unique.def),]
+	}
 }
 
+write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T)
+
+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
+
 if(nrow(summ) == 0){
 	stop("No data remaining after filter")
 }