Repository 'shm_csr'
hg clone https://radegast.galaxyproject.org/repos/davidvanzessen/shm_csr

Changeset 12:6b66c1c57f22 (2016-11-10)
Previous changeset 11:c4ab5034c4d4 (2016-11-09) Next changeset 13:933fb21568ce (2016-11-11)
Commit message:
Uploaded
modified:
merge_and_filter.r
shm_csr.xml
b
diff -r c4ab5034c4d4 -r 6b66c1c57f22 merge_and_filter.r
--- a/merge_and_filter.r Wed Nov 09 10:42:25 2016 -0500
+++ b/merge_and_filter.r Thu Nov 10 08:36:18 2016 -0500
[
@@ -90,6 +90,22 @@
 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
 result$JGene = gsub("[*].*", "", result$JGene)
 
+splt = strsplit(class.filter, "_")[[1]]
+chunk_hit_threshold = as.numeric(splt[1])
+nt_hit_threshold = as.numeric(splt[2])
+
+higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)
+
+if(!all(higher_than, na.rm=T)){ #check for no unmatched
+ result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"])
+}
+
+if(class.filter == "101_101"){
+ result$best_match = "all"
+}
+
+write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
+
 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
@@ -138,20 +154,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-splt = strsplit(class.filter, "_")[[1]]
-chunk_hit_threshold = as.numeric(splt[1])
-nt_hit_threshold = as.numeric(splt[2])
-
-higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)
-
-if(!all(higher_than, na.rm=T)){ #check for no unmatched
- result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"])
-}
-
-if(class.filter == "101_101"){
- result$best_match = "all"
-}
-
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
 if(filter.unique != "no"){
@@ -166,13 +168,13 @@
  } else if(empty.region.filter == "FR2"){
  result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  }
-
+
  if(grepl("keep", filter.unique)){
- result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+ result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes
  result = result[!duplicated(result$unique.def),]
  } else {
  result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
- result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+ result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it
  result = result[!duplicated(result$unique.def),]
  }
 }