# HG changeset patch # User davidvanzessen # Date 1478784978 18000 # Node ID 6b66c1c57f229c966e871a24162675284fa09caf # Parent c4ab5034c4d45f2d899c80907369700dd74973a0 Uploaded diff -r c4ab5034c4d4 -r 6b66c1c57f22 merge_and_filter.r --- a/merge_and_filter.r Wed Nov 09 10:42:25 2016 -0500 +++ b/merge_and_filter.r Thu Nov 10 08:36:18 2016 -0500 @@ -90,6 +90,22 @@ result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) result$JGene = gsub("[*].*", "", result$JGene) +splt = strsplit(class.filter, "_")[[1]] +chunk_hit_threshold = as.numeric(splt[1]) +nt_hit_threshold = as.numeric(splt[2]) + +higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold) + +if(!all(higher_than, na.rm=T)){ #check for no unmatched + result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"]) +} + +if(class.filter == "101_101"){ + result$best_match = "all" +} + +write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) + print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) @@ -138,20 +154,6 @@ result[is.na(result[,col]),] = 0 } -splt = strsplit(class.filter, "_")[[1]] -chunk_hit_threshold = as.numeric(splt[1]) -nt_hit_threshold = as.numeric(splt[2]) - -higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold) - -if(!all(higher_than, na.rm=T)){ #check for no unmatched - result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"]) -} - -if(class.filter == "101_101"){ - result$best_match = "all" -} - write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) if(filter.unique != "no"){ @@ -166,13 +168,13 @@ } else if(empty.region.filter == "FR2"){ result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) } - + if(grepl("keep", filter.unique)){ - result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes result = result[!duplicated(result$unique.def),] } else { result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] - result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it result = result[!duplicated(result$unique.def),] } } diff -r c4ab5034c4d4 -r 6b66c1c57f22 shm_csr.xml