0
|
1 library(reshape2)
|
|
2
|
|
3 args <- commandArgs(trailingOnly = TRUE)
|
|
4
|
|
5 before.unique.file = args[1]
|
|
6 merged.file = args[2]
|
|
7 outputdir = args[3]
|
|
8 gene.classes = unlist(strsplit(args[4], ","))
|
|
9 hotspot.analysis.sum.file = args[5]
|
|
10 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
|
|
11 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
|
|
12 main.html = "index.html"
|
7
|
13 empty.region.filter = args[6]
|
|
14
|
0
|
15
|
|
16 setwd(outputdir)
|
|
17
|
|
18 before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
|
|
19 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
|
|
20 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
|
|
21
|
|
22 #before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]
|
|
23
|
7
|
24 if(empty.region.filter == "leader"){
|
31
|
25 before.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
7
|
26 } else if(empty.region.filter == "FR1"){
|
31
|
27 before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
7
|
28 } else if(empty.region.filter == "CDR1"){
|
31
|
29 before.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
7
|
30 } else if(empty.region.filter == "FR2"){
|
31
|
31 before.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
7
|
32 }
|
0
|
33
|
|
34 IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
|
|
35 IDs$best_match = as.character(IDs$best_match)
|
|
36
|
7
|
37 dat = data.frame(table(before.unique$seq_conc))
|
0
|
38
|
|
39 names(dat) = c("seq_conc", "Freq")
|
|
40
|
|
41 dat$seq_conc = factor(dat$seq_conc)
|
|
42
|
|
43 dat = dat[order(as.character(dat$seq_conc)),]
|
|
44
|
|
45 #writing html from R...
|
|
46 get.bg.color = function(val){
|
|
47 if(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color
|
|
48 return(ifelse(val,"#eafaf1","#f9ebea"))
|
|
49 } else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0
|
|
50 return(ifelse(val > 0,"#eaecee","white"))
|
|
51 } else {
|
|
52 return("white")
|
|
53 }
|
|
54 }
|
|
55 td = function(val) {
|
|
56 return(paste("<td bgcolor='", get.bg.color(val), "'>", val, "</td>", sep=""))
|
|
57 }
|
|
58 tr = function(val) {
|
|
59 return(paste(c("<tr>", sapply(val, td), "</tr>"), collapse=""))
|
|
60 }
|
|
61
|
|
62 make.link = function(id, clss, val) {
|
|
63 paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="")
|
|
64 }
|
|
65 tbl = function(df) {
|
|
66 res = "<table border='1'>"
|
|
67 for(i in 1:nrow(df)){
|
|
68 res = paste(res, tr(df[i,]), sep="")
|
|
69 }
|
|
70 res = paste(res, "</table>")
|
|
71 }
|
|
72
|
39
|
73 cat("<center><img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC'> Please note that this tab is based on all sequences before filter unique sequences and the remove duplicates based on filters are applied. In this table only sequences according more than once are included. </center>", file=main.html, append=F)
|
|
74 cat("<table border='1' class='pure-table pure-table-striped'>", file=main.html, append=T)
|
7
|
75
|
|
76 if(empty.region.filter == "leader"){
|
|
77 cat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
78 } else if(empty.region.filter == "FR1"){
|
|
79 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
80 } else if(empty.region.filter == "CDR1"){
|
|
81 cat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
82 } else if(empty.region.filter == "FR2"){
|
|
83 cat("<caption>CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
84 }
|
|
85
|
0
|
86 cat("<tr>", file=main.html, append=T)
|
32
|
87 cat("<th>Sequence</th><th>Functionality</th><th>IGA1</th><th>IGA2</th><th>IGG1</th><th>IGG2</th><th>IGG3</th><th>IGG4</th><th>IGM</th><th>IGE</th><th>UN</th>", file=main.html, append=T)
|
|
88 cat("<th>total IGA</th><th>total IGG</th><th>total IGE</th><th>total IGM</th><th>number of subclasses</th><th>present in both IGA and IGG</th><th>present in IGA, IGG and IGM</th><th>present in IGA, IGG and IGE</th><th>present in IGA, IGG, IGM and IGE</th><th>IGA1+IGA2</th>", file=main.html, append=T)
|
|
89 cat("<th>IGG1+IGG2</th><th>IGG1+IGG3</th><th>IGG1+IGG4</th><th>IGG2+IGG3</th><th>IGG2+IGG4</th><th>IGG3+IGG4</th>", file=main.html, append=T)
|
|
90 cat("<th>IGG1+IGG2+IGG3</th><th>IGG2+IGG3+IGG4</th><th>IGG1+IGG2+IGG4</th><th>IGG1+IGG3+IGG4</th><th>IGG1+IGG2+IGG3+IGG4</th>", file=main.html, append=T)
|
0
|
91 cat("</tr>", file=main.html, append=T)
|
|
92
|
|
93
|
|
94
|
|
95 single.sequences=0 #sequence only found once, skipped
|
|
96 in.multiple=0 #same sequence across multiple subclasses
|
|
97 multiple.in.one=0 #same sequence multiple times in one subclass
|
|
98 unmatched=0 #all of the sequences are unmatched
|
|
99 some.unmatched=0 #one or more sequences in a clone are unmatched
|
|
100 matched=0 #should be the same als matched sequences
|
|
101
|
|
102 sequence.id.page="by_id.html"
|
|
103
|
|
104 for(i in 1:nrow(dat)){
|
|
105
|
|
106 ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA1", IDs$best_match),]
|
|
107 ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA2", IDs$best_match),]
|
|
108
|
|
109 cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG1", IDs$best_match),]
|
|
110 cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG2", IDs$best_match),]
|
|
111 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG3", IDs$best_match),]
|
|
112 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG4", IDs$best_match),]
|
|
113
|
|
114 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGM", IDs$best_match),]
|
|
115
|
32
|
116 ce = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGE", IDs$best_match),]
|
|
117
|
0
|
118 un = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^unmatched", IDs$best_match),]
|
32
|
119
|
|
120 allc = rbind(ca1, ca2, cg1, cg2, cg3, cg4, cm, ce, un)
|
0
|
121
|
|
122 ca1.n = nrow(ca1)
|
|
123 ca2.n = nrow(ca2)
|
|
124
|
|
125 cg1.n = nrow(cg1)
|
|
126 cg2.n = nrow(cg2)
|
|
127 cg3.n = nrow(cg3)
|
|
128 cg4.n = nrow(cg4)
|
|
129
|
|
130 cm.n = nrow(cm)
|
|
131
|
32
|
132 ce.n = nrow(ce)
|
|
133
|
0
|
134 un.n = nrow(un)
|
|
135
|
32
|
136 classes = c(ca1.n, ca2.n, cg1.n, cg2.n, cg3.n, cg4.n, cm.n, ce.n, un.n)
|
0
|
137
|
|
138 classes.sum = sum(classes)
|
|
139
|
|
140 if(classes.sum == 1){
|
|
141 single.sequences = single.sequences + 1
|
|
142 next
|
|
143 }
|
|
144
|
|
145 if(un.n == classes.sum){
|
|
146 unmatched = unmatched + 1
|
|
147 next
|
|
148 }
|
|
149
|
|
150 in.classes = sum(classes > 0)
|
|
151
|
|
152 matched = matched + in.classes #count in how many subclasses the sequence occurs.
|
|
153
|
32
|
154 if(any(classes == classes.sum)){
|
0
|
155 multiple.in.one = multiple.in.one + 1
|
|
156 } else if (un.n > 0) {
|
|
157 some.unmatched = some.unmatched + 1
|
|
158 } else {
|
|
159 in.multiple = in.multiple + 1
|
|
160 }
|
|
161
|
|
162 id = as.numeric(dat[i,"seq_conc"])
|
|
163
|
|
164 functionality = paste(unique(allc[,"Functionality"]), collapse=",")
|
|
165
|
|
166 by.id.row = c()
|
|
167
|
|
168 if(ca1.n > 0){
|
|
169 cat(tbl(ca1), file=paste("IGA1_", id, ".html", sep=""))
|
|
170 }
|
|
171
|
|
172 if(ca2.n > 0){
|
|
173 cat(tbl(ca2), file=paste("IGA2_", id, ".html", sep=""))
|
|
174 }
|
|
175
|
|
176 if(cg1.n > 0){
|
|
177 cat(tbl(cg1), file=paste("IGG1_", id, ".html", sep=""))
|
|
178 }
|
|
179
|
|
180 if(cg2.n > 0){
|
|
181 cat(tbl(cg2), file=paste("IGG2_", id, ".html", sep=""))
|
|
182 }
|
|
183
|
|
184 if(cg3.n > 0){
|
|
185 cat(tbl(cg3), file=paste("IGG3_", id, ".html", sep=""))
|
|
186 }
|
|
187
|
|
188 if(cg4.n > 0){
|
|
189 cat(tbl(cg4), file=paste("IGG4_", id, ".html", sep=""))
|
|
190 }
|
|
191
|
|
192 if(cm.n > 0){
|
|
193 cat(tbl(cm), file=paste("IGM_", id, ".html", sep=""))
|
|
194 }
|
|
195
|
32
|
196 if(ce.n > 0){
|
|
197 cat(tbl(ce), file=paste("IGE_", id, ".html", sep=""))
|
|
198 }
|
|
199
|
0
|
200 if(un.n > 0){
|
|
201 cat(tbl(un), file=paste("un_", id, ".html", sep=""))
|
|
202 }
|
|
203
|
|
204 ca1.html = make.link(id, "IGA1", ca1.n)
|
|
205 ca2.html = make.link(id, "IGA2", ca2.n)
|
|
206
|
|
207 cg1.html = make.link(id, "IGG1", cg1.n)
|
|
208 cg2.html = make.link(id, "IGG2", cg2.n)
|
|
209 cg3.html = make.link(id, "IGG3", cg3.n)
|
|
210 cg4.html = make.link(id, "IGG4", cg4.n)
|
|
211
|
|
212 cm.html = make.link(id, "IGM", cm.n)
|
|
213
|
32
|
214 ce.html = make.link(id, "IGE", ce.n)
|
|
215
|
0
|
216 un.html = make.link(id, "un", un.n)
|
|
217
|
|
218 #extra columns
|
|
219 ca.n = ca1.n + ca2.n
|
|
220
|
|
221 cg.n = cg1.n + cg2.n + cg3.n + cg4.n
|
|
222
|
|
223 #in.classes
|
|
224
|
|
225 in.ca.cg = (ca.n > 0 & cg.n > 0)
|
|
226
|
32
|
227 in.ca.cg.cm = (ca.n > 0 & cg.n > 0 & cm.n > 0)
|
|
228
|
|
229 in.ca.cg.ce = (ca.n > 0 & cg.n > 0 & ce.n > 0)
|
|
230
|
|
231 in.ca.cg.cm.ce = (ca.n > 0 & cg.n > 0 & cm.n > 0 & ce.n > 0)
|
|
232
|
0
|
233 in.ca1.ca2 = (ca1.n > 0 & ca2.n > 0)
|
|
234
|
|
235 in.cg1.cg2 = (cg1.n > 0 & cg2.n > 0)
|
|
236 in.cg1.cg3 = (cg1.n > 0 & cg3.n > 0)
|
|
237 in.cg1.cg4 = (cg1.n > 0 & cg4.n > 0)
|
|
238 in.cg2.cg3 = (cg2.n > 0 & cg3.n > 0)
|
|
239 in.cg2.cg4 = (cg2.n > 0 & cg4.n > 0)
|
|
240 in.cg3.cg4 = (cg3.n > 0 & cg4.n > 0)
|
|
241
|
|
242 in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)
|
|
243 in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
244 in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)
|
|
245 in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
246
|
|
247 in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
248
|
|
249 #rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
|
32
|
250 rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, ce.html, un.html)
|
|
251 rw = c(rw, ca.n, cg.n, cm.n, ce.n, in.classes, in.ca.cg, in.ca.cg.cm, in.ca.cg.ce, in.ca.cg.cm.ce, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)
|
0
|
252
|
|
253 cat(tr(rw), file=main.html, append=T)
|
|
254
|
|
255
|
|
256 for(i in 1:nrow(allc)){ #generate html by id
|
|
257 html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])
|
|
258 cat(paste(html, "<br />"), file=sequence.id.page, append=T)
|
|
259 }
|
|
260 }
|
|
261
|
|
262 cat("</table>", file=main.html, append=T)
|
|
263
|
|
264 print(paste("Single sequences:", single.sequences))
|
|
265 print(paste("Sequences in multiple subclasses:", in.multiple))
|
|
266 print(paste("Multiple sequences in one subclass:", multiple.in.one))
|
|
267 print(paste("Matched with unmatched:", some.unmatched))
|
|
268 print(paste("Count that should match 'matched' sequences:", matched))
|
|
269
|
|
270 #ACGT overview
|
|
271
|
7
|
272 #NToverview = merged[!grepl("^unmatched", merged$best_match),]
|
|
273 NToverview = merged
|
0
|
274
|
7
|
275 if(empty.region.filter == "leader"){
|
|
276 NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
277 } else if(empty.region.filter == "FR1"){
|
|
278 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
279 } else if(empty.region.filter == "CDR1"){
|
|
280 NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
281 } else if(empty.region.filter == "FR2"){
|
|
282 NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
283 }
|
0
|
284
|
|
285 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
|
|
286 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
|
|
287 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
|
|
288 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
|
|
289
|
|
290 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
|
|
291
|
|
292 #NToverview = rbind(NToverview, NTsum)
|
|
293
|
|
294 NTresult = data.frame(nt=c("A", "C", "T", "G"))
|
|
295
|
|
296 for(clazz in gene.classes){
|
8
|
297 print(paste("class:", clazz))
|
0
|
298 NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
|
8
|
299 print(paste("nrow:", nrow(NToverview.sub)))
|
0
|
300 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
|
|
301 new.col.y = sum(new.col.x)
|
|
302 new.col.z = round(new.col.x / new.col.y * 100, 2)
|
|
303
|
|
304 tmp = names(NTresult)
|
|
305 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
|
|
306 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
|
|
307 }
|
|
308
|
39
|
309 NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]
|
|
310
|
|
311 names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")
|
|
312
|
|
313 write.table(NToverview.tmp, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
|
0
|
314
|
|
315 NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
|
|
316
|
|
317 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
|
|
318 new.col.y = sum(new.col.x)
|
|
319 new.col.z = round(new.col.x / new.col.y * 100, 2)
|
|
320
|
|
321 tmp = names(NTresult)
|
|
322 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
|
|
323 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
|
|
324
|
|
325 names(hotspot.analysis.sum) = names(NTresult)
|
|
326
|
|
327 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
|
|
328
|
|
329 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
|
|
330
|
|
331
|
|
332
|
|
333
|
|
334
|
|
335
|
|
336
|
|
337
|
|
338
|
|
339
|
|
340
|
|
341
|
|
342
|
|
343
|
|
344
|
|
345
|
|
346
|
|
347
|
|
348
|
|
349
|
|
350
|
|
351
|
|
352
|
|
353
|
|
354
|
|
355
|
|
356
|
|
357
|
|
358
|
|
359
|