Mercurial > repos > davidvanzessen > shm_csr
comparison merge_and_filter.r @ 49:f5fe63533c58 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 11 May 2017 10:21:39 -0400 |
parents | 64711f461c8e |
children | 8fa8836bd605 |
comparison
equal
deleted
inserted
replaced
48:c5295dd10dfc | 49:f5fe63533c58 |
---|---|
13 unmatchedfile = args[10] | 13 unmatchedfile = args[10] |
14 method=args[11] | 14 method=args[11] |
15 functionality=args[12] | 15 functionality=args[12] |
16 unique.type=args[13] | 16 unique.type=args[13] |
17 filter.unique=args[14] | 17 filter.unique=args[14] |
18 class.filter=args[15] | 18 filter.unique.count=as.numeric(args[15]) |
19 empty.region.filter=args[16] | 19 class.filter=args[16] |
| | 20 empty.region.filter=args[17] |
| | 21 |
| | 22 print(paste("filter.unique.count:", filter.unique.count)) |
20 | 23 |
21 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | 24 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") |
22 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | 25 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") |
23 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | 26 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") |
24 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | 27 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") |
94 | 97 |
95 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) | 98 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) |
96 | 99 |
97 if(FALSE){ #to speed up debugging | 100 if(FALSE){ #to speed up debugging |
98 set.seed(1) | 101 set.seed(1) |
99 summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] | 102 summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),] |
100 print(paste("Number of sequences after sampling 5%:", nrow(summ))) | 103 print(paste("Number of sequences after sampling 5%:", nrow(summ))) |
101 | 104 |
102 filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) | 105 filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) |
103 } | 106 } |
104 | 107 |
223 result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] | 226 result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] |
224 } | 227 } |
225 | 228 |
226 result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it | 229 result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it |
227 | 230 |
| | 231 if(filter.unique == "remove"){ |
| | 232 unique.defs = data.frame(table(result$unique.def)) |
| | 233 unique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,] |
| | 234 result = result[result$unique.def %in% unique.defs$Var1,] |
| | 235 } |
| | 236 |
228 result = result[!duplicated(result$unique.def),] | 237 result = result[!duplicated(result$unique.def),] |
229 } | 238 } |
230 | 239 |
231 write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T) | 240 write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T) |
232 | 241 |