# Parse the 17 positional command-line arguments that drive this pipeline.
args <- commandArgs(trailingOnly = TRUE)

summaryfile <- args[1]
sequencesfile <- args[2]
mutationanalysisfile <- args[3]
mutationstatsfile <- args[4]
hotspotsfile <- args[5]
aafile <- args[6]
gene_identification_file <- args[7]
output <- args[8]
before.unique.file <- args[9]
unmatchedfile <- args[10]
method <- args[11]
functionality <- args[12]
unique.type <- args[13]
filter.unique <- args[14]
filter.unique.count <- as.numeric(args[15])
class.filter <- args[16]
empty.region.filter <- args[17]

print(paste("filter.unique.count:", filter.unique.count))

# All inputs share the same tab-separated, quote-free format.
read_input_table <- function(path) {
  read.table(path, header = TRUE, sep = "\t", fill = TRUE,
             stringsAsFactors = FALSE, quote = "")
}

summ <- read_input_table(summaryfile)
sequences <- read_input_table(sequencesfile)
mutationanalysis <- read_input_table(mutationanalysisfile)
mutationstats <- read_input_table(mutationstatsfile)
hotspots <- read_input_table(hotspotsfile)
AAs <- read_input_table(aafile)
gene_identification <- read_input_table(gene_identification_file)
# Normalise column names across IMGT output versions: newer IMGT files
# prefix the functionality columns with "V.DOMAIN."; strip that prefix so
# the rest of the script can rely on "Functionality"/"Functionality.comment".
fix_column_names <- function(df) {
  cols <- names(df)
  if ("V.DOMAIN.Functionality" %in% cols) {
    cols[cols == "V.DOMAIN.Functionality"] <- "Functionality"
    print("found V.DOMAIN.Functionality, changed")
  }
  if ("V.DOMAIN.Functionality.comment" %in% cols) {
    cols[cols == "V.DOMAIN.Functionality.comment"] <- "Functionality.comment"
    print("found V.DOMAIN.Functionality.comment, changed")
  }
  names(df) <- cols
  df
}
# Make Sequence.ID values unique by appending the 1-based row index.
# Uses seq_len() instead of 1:nrow(df): on a zero-row data frame 1:nrow(df)
# evaluates to c(1, 0), which fails when assigned back into the column.
fix_non_unique_ids <- function(df) {
  df$Sequence.ID <- paste(df$Sequence.ID, seq_len(nrow(df)))
  df
}
# Harmonise the functionality column names across all input tables.
summ <- fix_column_names(summ)
sequences <- fix_column_names(sequences)
mutationanalysis <- fix_column_names(mutationanalysis)
mutationstats <- fix_column_names(mutationstats)
hotspots <- fix_column_names(hotspots)
AAs <- fix_column_names(AAs)

# Some IMGT outputs lack a Sequence.number column; synthesise one.
# seq_len() is safe for an empty summary table (1:nrow(summ) would
# produce c(1, 0) and fail on assignment).
if (!("Sequence.number" %in% names(summ))) {
  summ["Sequence.number"] <- seq_len(nrow(summ))
}
# When (sub)class typing was produced by blastn, reshape its tabular output
# (qseqid sseqid pident length mismatch gapopen qstart qend sstart send
#  evalue bitscore) into the columns the rest of this script expects.
if (method == "blastn") {
  # Keep only the first (best) hit per query sequence.
  gene_identification <- gene_identification[!duplicated(gene_identification$qseqid), ]

  # Lengths of the constant-region reference chunks used as blast subjects.
  ref_length <- data.frame(
    sseqid = c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"),
    ref.length = c(81, 81, 141, 141, 141, 141, 52)
  )
  gene_identification <- merge(gene_identification, ref_length,
                               by = "sseqid", all.x = TRUE)

  # Fraction of the reference chunk covered by the alignment, as a percentage.
  gene_identification$chunk_hit_percentage <-
    (gene_identification$length / gene_identification$ref.length) * 100

  gene_identification <- gene_identification[, c("qseqid", "chunk_hit_percentage",
                                                 "pident", "qstart", "sseqid")]
  colnames(gene_identification) <- c("Sequence.ID", "chunk_hit_percentage",
                                     "nt_hit_percentage", "start_locations",
                                     "best_match")
}
#print("Summary analysis files columns")
#print(names(summ))

# Track how many sequences survive each filtering step, for reporting.
input.sequence.count <- nrow(summ)
print(paste("Number of sequences in summary file:", input.sequence.count))

filtering.steps <- data.frame(character(0), numeric(0))
filtering.steps <- rbind(filtering.steps, c("Input", input.sequence.count))
filtering.steps[, 1] <- as.character(filtering.steps[, 1])
filtering.steps[, 2] <- as.character(filtering.steps[, 2])
#filtering.steps[,3] = as.numeric(filtering.steps[,3])

#print("summary files columns")
#print(names(summ))

# Attach the (sub)class identification to the summary records.
summ <- merge(summ, gene_identification, by = "Sequence.ID")
print(paste("Number of sequences after merging with gene identification:", nrow(summ)))

# Drop sequences IMGT could not analyse at all.
summ <- summ[summ$Functionality != "No results", ]
print(paste("Number of sequences after 'No results' filter:", nrow(summ)))
filtering.steps <- rbind(filtering.steps, c("After 'No results' filter", nrow(summ)))

# Optional filter on the IMGT functionality classification.
# NOTE(review): comparisons are kept as ==/!= on purpose — switching to
# %in% would silently change how NA functionality values are handled.
if (functionality == "productive") {
  summ <- summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive", ]
} else if (functionality == "unproductive") {
  summ <- summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive", ]
} else if (functionality == "remove_unknown") {
  summ <- summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown", ]
}

print(paste("Number of sequences after functionality filter:", nrow(summ)))
filtering.steps <- rbind(filtering.steps, c("After functionality filter", nrow(summ)))
if (FALSE) { # to speed up debugging
  set.seed(1)
  summ <- summ[sample(nrow(summ), floor(nrow(summ) * 0.03)), ]
  print(paste("Number of sequences after sampling 3%:", nrow(summ)))
  filtering.steps <- rbind(filtering.steps, c("Number of sequences after sampling 3%", nrow(summ)))
}

print("mutation analysis files columns")
print(names(mutationanalysis[, !(names(mutationanalysis) %in% names(summ)[-1])]))

# Merge each analysis table in turn, each time keeping only the columns not
# already present (names(...)[-1] excludes Sequence.ID, the join key).
result <- merge(summ, mutationanalysis[, !(names(mutationanalysis) %in% names(summ)[-1])], by = "Sequence.ID")
print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))

#print("mutation stats files columns")
#print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])]))

result <- merge(result, mutationstats[, !(names(mutationstats) %in% names(result)[-1])], by = "Sequence.ID")
print(paste("Number of sequences after merging with mutation stats file:", nrow(result)))

print("hotspots files columns")
print(names(hotspots[, !(names(hotspots) %in% names(result)[-1])]))

result <- merge(result, hotspots[, !(names(hotspots) %in% names(result)[-1])], by = "Sequence.ID")
print(paste("Number of sequences after merging with hotspots file:", nrow(result)))

print("sequences files columns")
print(c("FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT"))

# Attach the nucleotide sequence of each region, suffixed with ".seq".
sequences <- sequences[, c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
names(sequences) <- c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
result <- merge(result, sequences, by = "Sequence.ID", all.x = TRUE)

# Attach the CDR3 amino-acid sequence.
AAs <- AAs[, c("Sequence.ID", "CDR3.IMGT")]
names(AAs) <- c("Sequence.ID", "CDR3.IMGT.AA")
result <- merge(result, AAs, by = "Sequence.ID", all.x = TRUE)

print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
# Derive bare gene names: drop the "Homsap " species prefix and the
# "*NN" allele suffix from the V/D/J gene-and-allele columns.
for (segment in c("V", "D", "J")) {
  without_species <- gsub("^Homsap ", "", result[[paste0(segment, ".GENE.and.allele")]])
  result[[paste0(segment, "Gene")]] <- gsub("[*].*", "", without_species)
}

# class.filter is "<chunk>_<nt>": minimum chunk coverage percentage and
# minimum nucleotide identity percentage for a class assignment to count.
thresholds <- strsplit(class.filter, "_")[[1]]
chunk_hit_threshold <- as.numeric(thresholds[1])
nt_hit_threshold <- as.numeric(thresholds[2])

higher_than <- (result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)

if (!all(higher_than, na.rm = TRUE)) { # check for no unmatched
  # Prefix failing assignments so they are reported as unmatched downstream.
  result[!higher_than, "best_match"] <- paste("unmatched,", result[!higher_than, "best_match"])
}

# The sentinel "101_101" filter disables class splitting entirely.
if (class.filter == "101_101") {
  result$best_match <- "all"
}

write.table(x = result, file = gsub("merged.txt$", "before_filters.txt", output), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
# Flag records whose region sequence is empty or missing (NA can occur
# because the inputs are read with fill=TRUE).
missing.FR1 <- result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
missing.CDR1 <- result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
missing.FR2 <- result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
missing.CDR2 <- result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
missing.FR3 <- result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)

# BUG FIX: the labels were shifted by one region (e.g. "empty CDR1"
# reported the FR1 count); each label now matches the count it prints.
print(paste("Number of empty FR1 sequences:", sum(missing.FR1)))
print(paste("Number of empty CDR1 sequences:", sum(missing.CDR1)))
print(paste("Number of empty FR2 sequences:", sum(missing.FR2)))
print(paste("Number of empty CDR2 sequences:", sum(missing.CDR2)))
print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))

# Require every region from the chosen starting region onward to be present.
# Reusing the missing.* flags (instead of the previous bare != "" tests)
# also drops NA sequences, which previously turned into all-NA rows when
# the subset condition itself evaluated to NA.
if (empty.region.filter == "leader") {
  result <- result[!(missing.FR1 | missing.CDR1 | missing.FR2 | missing.CDR2 | missing.FR3), ]
} else if (empty.region.filter == "FR1") {
  result <- result[!(missing.CDR1 | missing.FR2 | missing.CDR2 | missing.FR3), ]
} else if (empty.region.filter == "CDR1") {
  result <- result[!(missing.FR2 | missing.CDR2 | missing.FR3), ]
} else if (empty.region.filter == "FR2") {
  result <- result[!(missing.CDR2 | missing.FR3), ]
}

print(paste("After removal sequences that are missing a gene region:", nrow(result)))
filtering.steps <- rbind(filtering.steps, c("After removal sequences that are missing a gene region", nrow(result)))
# Drop sequences containing ambiguous nucleotides (n/N) in any region that
# the chosen empty.region.filter includes.
if (empty.region.filter == "leader") {
  result <- result[!(grepl("n|N", result$FR1.IMGT.seq) | grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)), ]
} else if (empty.region.filter == "FR1") {
  result <- result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)), ]
} else if (empty.region.filter == "CDR1") {
  result <- result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)), ]
} else if (empty.region.filter == "FR2") {
  result <- result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)), ]
}

print(paste("Number of sequences in result after n filtering:", nrow(result)))
filtering.steps <- rbind(filtering.steps, c("After N filter", nrow(result)))

# The per-region mutation counts look like "12 (34%)"; strip the
# parenthesised part and coerce to numeric.
cleanup_columns <- c("FR1.IMGT.Nb.of.mutations",
                     "CDR1.IMGT.Nb.of.mutations",
                     "FR2.IMGT.Nb.of.mutations",
                     "CDR2.IMGT.Nb.of.mutations",
                     "FR3.IMGT.Nb.of.mutations")

for (col in cleanup_columns) {
  result[, col] <- gsub("\\(.*\\)", "", result[, col])
  result[, col] <- as.numeric(result[, col])
  # BUG FIX: was `result[is.na(result[,col]),] = 0`, which overwrote EVERY
  # column of the affected rows with 0; only the count column itself should
  # have its NAs replaced.
  result[is.na(result[, col]), col] <- 0
}

write.table(result, before.unique.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
# Optional duplicate-sequence filtering.
# For "remove_vjaa" the dedup key is V gene + J gene + CDR3 amino acids;
# otherwise it is the concatenation of the region sequences included by
# empty.region.filter.
# (Removed the unused `clmns = names(result)` assignment.)
if (filter.unique != "no") {
  if (filter.unique == "remove_vjaa") {
    result$unique.def <- paste(result$VGene, result$JGene, result$CDR3.IMGT.AA)
  } else if (empty.region.filter == "leader") {
    result$unique.def <- paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  } else if (empty.region.filter == "FR1") {
    result$unique.def <- paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  } else if (empty.region.filter == "CDR1") {
    result$unique.def <- paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  } else if (empty.region.filter == "FR2") {
    result$unique.def <- paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  }

  if (grepl("remove", filter.unique)) {
    # Keep only keys seen more than once, then enforce the minimum count.
    result <- result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast = TRUE), ]
    unique.defs <- data.frame(table(result$unique.def))
    unique.defs <- unique.defs[unique.defs$Freq >= filter.unique.count, ]
    result <- result[result$unique.def %in% unique.defs$Var1, ]
  }

  if (filter.unique != "remove_vjaa") {
    result$unique.def <- paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it
  }

  result <- result[!duplicated(result$unique.def), ]
}

write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

filtering.steps <- rbind(filtering.steps, c("After filter unique sequences", nrow(result)))

print(paste("Number of sequences in result after unique filtering:", nrow(result)))
# Abort when filtering removed everything. BUG FIX: this previously tested
# nrow(summ), but `result` has been the working table since the merge steps,
# so an empty `result` slipped through the guard.
if (nrow(result) == 0) {
  stop("No data remaining after filter")
}
result$best_match_class <- gsub(",.*", "", result$best_match) #gsub so the unmatched don't have a class after it

# Build the deduplication key from the comma-separated column list given in
# unique.type, joining the selected columns with ":".
result$past <- do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))

# Put matched records before unmatched ones so that, when keys collide,
# !duplicated() keeps the matched record.
result.matched <- result[!grepl("unmatched", result$best_match), ]
result.unmatched <- result[grepl("unmatched", result$best_match), ]
result <- rbind(result.matched, result.unmatched)
result <- result[!(duplicated(result$past)), ]

# The helper columns are no longer needed in the output.
result <- result[, !(names(result) %in% c("past", "best_match_class"))]

print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
filtering.steps <- rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))

# Split out the unmatched records for their own report.
unmatched <- result[grepl("^unmatched", result$best_match), c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

matched.sequences <- result[!grepl("^unmatched", result$best_match), ]
write.table(x = matched.sequences, file = gsub("merged.txt$", "filtered.txt", output), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

matched.sequences.count <- nrow(matched.sequences)
unmatched.sequences.count <- sum(grepl("^unmatched", result$best_match))
if (matched.sequences.count <= unmatched.sequences.count) {
  print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")
}

# Finalise the filtering report: counts plus percentage of the input.
filtering.steps <- rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
filtering.steps <- rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
filtering.steps[, 2] <- as.numeric(filtering.steps[, 2])
filtering.steps$perc <- round(filtering.steps[, 2] / input.sequence.count * 100, 2)

write.table(x = filtering.steps, file = gsub("unmatched", "filtering_steps", unmatchedfile), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

write.table(x = result, file = output, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.table(x = unmatched, file = unmatchedfile, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)