comparison merge_and_filter.r @ 49:f5fe63533c58 draft

Uploaded
author davidvanzessen
date Thu, 11 May 2017 10:21:39 -0400
parents 64711f461c8e
children 8fa8836bd605
comparison
Legend: equal | deleted | inserted | replaced
48:c5295dd10dfc 49:f5fe63533c58
13 unmatchedfile = args[10] 13 unmatchedfile = args[10]
14 method=args[11] 14 method=args[11]
15 functionality=args[12] 15 functionality=args[12]
16 unique.type=args[13] 16 unique.type=args[13]
17 filter.unique=args[14] 17 filter.unique=args[14]
18 class.filter=args[15] 18 filter.unique.count=as.numeric(args[15])
19 empty.region.filter=args[16] 19 class.filter=args[16]
20 empty.region.filter=args[17]
21
22 print(paste("filter.unique.count:", filter.unique.count))
20 23
21 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") 24 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
22 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") 25 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
23 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") 26 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
24 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") 27 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
94 97
95 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) 98 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))
96 99
97 if(FALSE){ #to speed up debugging 100 if(FALSE){ #to speed up debugging
98 set.seed(1) 101 set.seed(1)
99 summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] 102 summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),]
100 print(paste("Number of sequences after sampling 5%:", nrow(summ))) 103 print(paste("Number of sequences after sampling 5%:", nrow(summ)))
101 104
102 filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) 105 filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
103 } 106 }
104 107
223 result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] 226 result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
224 } 227 }
225 228
226 result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it 229 result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it
227 230
231 if(filter.unique == "remove"){
232 unique.defs = data.frame(table(result$unique.def))
233 unique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,]
234 result = result[result$unique.def %in% unique.defs$Var1,]
235 }
236
228 result = result[!duplicated(result$unique.def),] 237 result = result[!duplicated(result$unique.def),]
229 } 238 }
230 239
231 write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T) 240 write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T)
232 241