# HG changeset patch # User davidvanzessen # Date 1479386001 18000 # Node ID 61d0a6318711d3b52e248a6f4f787ee033628df0 # Parent 59765d2c8890bf8a5ae82fe78374052b80c5a36d Uploaded diff -r 59765d2c8890 -r 61d0a6318711 merge_and_filter.r --- a/merge_and_filter.r Fri Nov 11 07:31:48 2016 -0500 +++ b/merge_and_filter.r Thu Nov 17 07:33:21 2016 -0500 @@ -175,14 +175,12 @@ result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) } - if(grepl("keep", filter.unique)){ - result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes - result = result[!duplicated(result$unique.def),] - } else { + if(filter.unique == "remove"){ result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] - result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it - result = result[!duplicated(result$unique.def),] } + result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it + + result = result[!duplicated(result$unique.def),] } write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T) @@ -197,8 +195,21 @@ result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) + + + +result.matched = result[!grepl("unmatched", result$best_match),] +result.unmatched = result[grepl("unmatched", result$best_match),] + +result = rbind(result.matched, result.unmatched) + result = result[!(duplicated(result$past)), ] + + + + + result = result[,!(names(result) %in% c("past", "best_match_class"))] print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) diff -r 59765d2c8890 -r 61d0a6318711 shm_csr.xml --- a/shm_csr.xml Fri Nov 11 07:31:48 2016 -0500 +++ b/shm_csr.xml Thu Nov 17 07:33:21 2016 -0500 @@ -75,27 +75,115 @@ 10.1093/bioinformatics/btv359 - Takes an IMGT zip (http://www.imgt.org/HighV-QUEST/search.action) file and creates a summarization of the mutation analysis. - - +--------------------------+ - | unique filter | - +--------+--------+--------+ - | values | remove | keep | - +--------+--------+--------+ - | A | A | A | - +--------+--------+--------+ - | A | B | B | - +--------+--------+--------+ - | B | D | C | - +--------+--------+--------+ - | B | | D | - +--------+--------+--------+ - | C | | | - +--------+--------+--------+ - | D | | | - +--------+--------+--------+ - | D | | | - +--------+--------+--------+ - +