Mercurial > repos > davidvanzessen > shm_csr
diff merge_and_filter.r @ 5:012a738edf5a draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 01 Nov 2016 10:15:37 -0400 |
parents | 275ab5175fd6 |
children | 372ccdcf0b2d |
line wrap: on
line diff
```diff
--- a/merge_and_filter.r	Mon Oct 31 05:05:26 2016 -0400
+++ b/merge_and_filter.r	Tue Nov 01 10:15:37 2016 -0400
@@ -141,44 +141,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
-
-if(filter.unique != "no"){
-  clmns = names(result)
-
-  if(empty.region.filter == "leader"){
-    result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-  } else if(empty.region.filter == "FR1"){
-    result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-  } else if(empty.region.filter == "CDR1"){
-    rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-  } else if(empty.region.filter == "FR2"){
-    result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-  }
-
-  if(grepl("_c", filter.unique)){
-    result$unique.def = paste(result$unique.def, result$best_match)
-  }
-
-  #fltr = result$unique.def %in% result.filtered$unique.def
-
-  if(grepl("keep", filter.unique)){
-    result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
-    result = result[!duplicated(result$unique.def),]
-  } else {
-    result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
-    result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
-    result = result[!duplicated(result$unique.def),]
-  }
-
-  #result = result[,clmns]
-
-  #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
-
-
 splt = strsplit(class.filter, "_")[[1]]
 chunk_hit_threshold = as.numeric(splt[1])
 nt_hit_threshold = as.numeric(splt[2])
@@ -198,10 +160,39 @@
   result$best_match = "all"
 }
 
-if(any(higher_than, na.rm=T)){
-  #summ = summ[higher_than,]
+write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
+
+if(filter.unique != "no"){
+  clmns = names(result)
+
+  if(empty.region.filter == "leader"){
+    result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+  } else if(empty.region.filter == "FR1"){
+    result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+  } else if(empty.region.filter == "CDR1"){
+    result$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+  } else if(empty.region.filter == "FR2"){
+    result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+  }
+
+  if(grepl("_c", filter.unique)){
+    result$unique.def = paste(result$unique.def, result$best_match)
+  }
+
+  if(grepl("keep", filter.unique)){
+    result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+    result = result[!duplicated(result$unique.def),]
+  } else {
+    result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+    result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+    result = result[!duplicated(result$unique.def),]
+  }
 }
 
+write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T)
+
+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
+
 if(nrow(summ) == 0){
   stop("No data remaining after filter")
 }
```