81
|
1 library(reshape2)
|
|
2
|
|
3 args <- commandArgs(trailingOnly = TRUE)
|
|
4
|
|
5 before.unique.file = args[1]
|
|
6 merged.file = args[2]
|
|
7 outputdir = args[3]
|
|
8 gene.classes = unlist(strsplit(args[4], ","))
|
|
9 hotspot.analysis.sum.file = args[5]
|
|
10 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
|
|
11 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
|
|
12 main.html = "index.html"
|
|
13 empty.region.filter = args[6]
|
|
14
|
|
15
|
|
16 setwd(outputdir)
|
|
17
|
|
18 before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
|
|
19 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
|
|
20 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
|
|
21
|
|
22 #before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]
|
|
23
|
|
24 if(empty.region.filter == "leader"){
|
|
25 before.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
|
26 } else if(empty.region.filter == "FR1"){
|
|
27 before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
|
28 } else if(empty.region.filter == "CDR1"){
|
|
29 before.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
|
30 } else if(empty.region.filter == "FR2"){
|
|
31 before.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
|
|
32 }
|
|
33
|
|
34 IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
|
|
35 IDs$best_match = as.character(IDs$best_match)
|
|
36
|
|
37 dat = data.frame(table(before.unique$seq_conc))
|
|
38
|
|
39 names(dat) = c("seq_conc", "Freq")
|
|
40
|
|
41 dat$seq_conc = factor(dat$seq_conc)
|
|
42
|
|
43 dat = dat[order(as.character(dat$seq_conc)),]
|
|
44
|
|
45 #writing html from R...
|
|
46 get.bg.color = function(val){
|
|
47 if(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color
|
|
48 return(ifelse(val,"#eafaf1","#f9ebea"))
|
|
49 } else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0
|
|
50 return(ifelse(val > 0,"#eaecee","white"))
|
|
51 } else {
|
|
52 return("white")
|
|
53 }
|
|
54 }
|
|
55 td = function(val) {
|
|
56 return(paste("<td bgcolor='", get.bg.color(val), "'>", val, "</td>", sep=""))
|
|
57 }
|
|
58 tr = function(val) {
|
|
59 return(paste(c("<tr>", sapply(val, td), "</tr>"), collapse=""))
|
|
60 }
|
|
61
|
|
62 make.link = function(id, clss, val) {
|
|
63 paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="")
|
|
64 }
|
|
65 tbl = function(df) {
|
|
66 res = "<table border='1'>"
|
|
67 for(i in 1:nrow(df)){
|
|
68 res = paste(res, tr(df[i,]), sep="")
|
|
69 }
|
|
70 res = paste(res, "</table>")
|
|
71 }
|
|
72
|
|
73 cat("<center><img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC'> Please note that this tab is based on all sequences before filter unique sequences and the remove duplicates based on filters are applied. In this table only sequences occuring more than once are included. </center>", file=main.html, append=F)
|
|
74 cat("<table border='1' class='pure-table pure-table-striped'>", file=main.html, append=T)
|
|
75
|
|
76 if(empty.region.filter == "leader"){
|
|
77 cat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
78 } else if(empty.region.filter == "FR1"){
|
|
79 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
80 } else if(empty.region.filter == "CDR1"){
|
|
81 cat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
82 } else if(empty.region.filter == "FR2"){
|
|
83 cat("<caption>CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
|
|
84 }
|
|
85
|
|
86 cat("<tr>", file=main.html, append=T)
|
|
87 cat("<th>Sequence</th><th>Functionality</th><th>IGA1</th><th>IGA2</th><th>IGG1</th><th>IGG2</th><th>IGG3</th><th>IGG4</th><th>IGM</th><th>IGE</th><th>UN</th>", file=main.html, append=T)
|
|
88 cat("<th>total IGA</th><th>total IGG</th><th>total IGM</th><th>total IGE</th><th>number of subclasses</th><th>present in both IGA and IGG</th><th>present in IGA, IGG and IGM</th><th>present in IGA, IGG and IGE</th><th>present in IGA, IGG, IGM and IGE</th><th>IGA1+IGA2</th>", file=main.html, append=T)
|
|
89 cat("<th>IGG1+IGG2</th><th>IGG1+IGG3</th><th>IGG1+IGG4</th><th>IGG2+IGG3</th><th>IGG2+IGG4</th><th>IGG3+IGG4</th>", file=main.html, append=T)
|
|
90 cat("<th>IGG1+IGG2+IGG3</th><th>IGG2+IGG3+IGG4</th><th>IGG1+IGG2+IGG4</th><th>IGG1+IGG3+IGG4</th><th>IGG1+IGG2+IGG3+IGG4</th>", file=main.html, append=T)
|
|
91 cat("</tr>", file=main.html, append=T)
|
|
92
|
|
93
|
|
94
|
|
95 single.sequences=0 #sequence only found once, skipped
|
|
96 in.multiple=0 #same sequence across multiple subclasses
|
|
97 multiple.in.one=0 #same sequence multiple times in one subclass
|
|
98 unmatched=0 #all of the sequences are unmatched
|
|
99 some.unmatched=0 #one or more sequences in a clone are unmatched
|
|
100 matched=0 #should be the same als matched sequences
|
|
101
|
|
102 sequence.id.page="by_id.html"
|
|
103
|
|
104 for(i in 1:nrow(dat)){
|
|
105
|
|
106 ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA1", IDs$best_match),]
|
|
107 ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA2", IDs$best_match),]
|
|
108
|
|
109 cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG1", IDs$best_match),]
|
|
110 cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG2", IDs$best_match),]
|
|
111 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG3", IDs$best_match),]
|
|
112 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG4", IDs$best_match),]
|
|
113
|
|
114 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGM", IDs$best_match),]
|
|
115
|
|
116 ce = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGE", IDs$best_match),]
|
|
117
|
|
118 un = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^unmatched", IDs$best_match),]
|
|
119
|
|
120 allc = rbind(ca1, ca2, cg1, cg2, cg3, cg4, cm, ce, un)
|
|
121
|
|
122 ca1.n = nrow(ca1)
|
|
123 ca2.n = nrow(ca2)
|
|
124
|
|
125 cg1.n = nrow(cg1)
|
|
126 cg2.n = nrow(cg2)
|
|
127 cg3.n = nrow(cg3)
|
|
128 cg4.n = nrow(cg4)
|
|
129
|
|
130 cm.n = nrow(cm)
|
|
131
|
|
132 ce.n = nrow(ce)
|
|
133
|
|
134 un.n = nrow(un)
|
|
135
|
|
136 classes = c(ca1.n, ca2.n, cg1.n, cg2.n, cg3.n, cg4.n, cm.n, ce.n, un.n)
|
|
137
|
|
138 classes.sum = sum(classes)
|
|
139
|
|
140 if(classes.sum == 1){
|
|
141 single.sequences = single.sequences + 1
|
|
142 next
|
|
143 }
|
|
144
|
|
145 if(un.n == classes.sum){
|
|
146 unmatched = unmatched + 1
|
|
147 next
|
|
148 }
|
|
149
|
|
150 classes.no.un = classes[-length(classes)]
|
|
151
|
|
152 in.classes = sum(classes.no.un > 0)
|
|
153
|
|
154 matched = matched + in.classes #count in how many subclasses the sequence occurs.
|
|
155
|
|
156 if(any(classes == classes.sum)){
|
|
157 multiple.in.one = multiple.in.one + 1
|
|
158 } else if (un.n > 0) {
|
|
159 some.unmatched = some.unmatched + 1
|
|
160 } else {
|
|
161 in.multiple = in.multiple + 1
|
|
162 }
|
|
163
|
|
164 id = as.numeric(dat[i,"seq_conc"])
|
|
165
|
|
166 functionality = paste(unique(allc[,"Functionality"]), collapse=",")
|
|
167
|
|
168 by.id.row = c()
|
|
169
|
|
170 if(ca1.n > 0){
|
|
171 cat(tbl(ca1), file=paste("IGA1_", id, ".html", sep=""))
|
|
172 }
|
|
173
|
|
174 if(ca2.n > 0){
|
|
175 cat(tbl(ca2), file=paste("IGA2_", id, ".html", sep=""))
|
|
176 }
|
|
177
|
|
178 if(cg1.n > 0){
|
|
179 cat(tbl(cg1), file=paste("IGG1_", id, ".html", sep=""))
|
|
180 }
|
|
181
|
|
182 if(cg2.n > 0){
|
|
183 cat(tbl(cg2), file=paste("IGG2_", id, ".html", sep=""))
|
|
184 }
|
|
185
|
|
186 if(cg3.n > 0){
|
|
187 cat(tbl(cg3), file=paste("IGG3_", id, ".html", sep=""))
|
|
188 }
|
|
189
|
|
190 if(cg4.n > 0){
|
|
191 cat(tbl(cg4), file=paste("IGG4_", id, ".html", sep=""))
|
|
192 }
|
|
193
|
|
194 if(cm.n > 0){
|
|
195 cat(tbl(cm), file=paste("IGM_", id, ".html", sep=""))
|
|
196 }
|
|
197
|
|
198 if(ce.n > 0){
|
|
199 cat(tbl(ce), file=paste("IGE_", id, ".html", sep=""))
|
|
200 }
|
|
201
|
|
202 if(un.n > 0){
|
|
203 cat(tbl(un), file=paste("un_", id, ".html", sep=""))
|
|
204 }
|
|
205
|
|
206 ca1.html = make.link(id, "IGA1", ca1.n)
|
|
207 ca2.html = make.link(id, "IGA2", ca2.n)
|
|
208
|
|
209 cg1.html = make.link(id, "IGG1", cg1.n)
|
|
210 cg2.html = make.link(id, "IGG2", cg2.n)
|
|
211 cg3.html = make.link(id, "IGG3", cg3.n)
|
|
212 cg4.html = make.link(id, "IGG4", cg4.n)
|
|
213
|
|
214 cm.html = make.link(id, "IGM", cm.n)
|
|
215
|
|
216 ce.html = make.link(id, "IGE", ce.n)
|
|
217
|
|
218 un.html = make.link(id, "un", un.n)
|
|
219
|
|
220 #extra columns
|
|
221 ca.n = ca1.n + ca2.n
|
|
222
|
|
223 cg.n = cg1.n + cg2.n + cg3.n + cg4.n
|
|
224
|
|
225 #in.classes
|
|
226
|
|
227 in.ca.cg = (ca.n > 0 & cg.n > 0)
|
|
228
|
|
229 in.ca.cg.cm = (ca.n > 0 & cg.n > 0 & cm.n > 0)
|
|
230
|
|
231 in.ca.cg.ce = (ca.n > 0 & cg.n > 0 & ce.n > 0)
|
|
232
|
|
233 in.ca.cg.cm.ce = (ca.n > 0 & cg.n > 0 & cm.n > 0 & ce.n > 0)
|
|
234
|
|
235 in.ca1.ca2 = (ca1.n > 0 & ca2.n > 0)
|
|
236
|
|
237 in.cg1.cg2 = (cg1.n > 0 & cg2.n > 0)
|
|
238 in.cg1.cg3 = (cg1.n > 0 & cg3.n > 0)
|
|
239 in.cg1.cg4 = (cg1.n > 0 & cg4.n > 0)
|
|
240 in.cg2.cg3 = (cg2.n > 0 & cg3.n > 0)
|
|
241 in.cg2.cg4 = (cg2.n > 0 & cg4.n > 0)
|
|
242 in.cg3.cg4 = (cg3.n > 0 & cg4.n > 0)
|
|
243
|
|
244 in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)
|
|
245 in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
246 in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)
|
|
247 in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
248
|
|
249 in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
|
|
250
|
|
251 #rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
|
|
252 rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, ce.html, un.html)
|
|
253 rw = c(rw, ca.n, cg.n, cm.n, ce.n, in.classes, in.ca.cg, in.ca.cg.cm, in.ca.cg.ce, in.ca.cg.cm.ce, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)
|
|
254
|
|
255
|
|
256
|
|
257 cat(tr(rw), file=main.html, append=T)
|
|
258
|
|
259
|
|
260 for(i in 1:nrow(allc)){ #generate html by id
|
|
261 html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])
|
|
262 cat(paste(html, "<br />"), file=sequence.id.page, append=T)
|
|
263 }
|
|
264 }
|
|
265
|
|
266 cat("</table>", file=main.html, append=T)
|
|
267
|
|
268 print(paste("Single sequences:", single.sequences))
|
|
269 print(paste("Sequences in multiple subclasses:", in.multiple))
|
|
270 print(paste("Multiple sequences in one subclass:", multiple.in.one))
|
|
271 print(paste("Matched with unmatched:", some.unmatched))
|
|
272 print(paste("Count that should match 'matched' sequences:", matched))
|
|
273
|
|
274 #ACGT overview
|
|
275
|
|
276 #NToverview = merged[!grepl("^unmatched", merged$best_match),]
|
|
277 NToverview = merged
|
|
278
|
|
279 if(empty.region.filter == "leader"){
|
|
280 NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
281 } else if(empty.region.filter == "FR1"){
|
|
282 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
283 } else if(empty.region.filter == "CDR1"){
|
|
284 NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
285 } else if(empty.region.filter == "FR2"){
|
|
286 NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
|
|
287 }
|
|
288
|
|
289 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
|
|
290 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
|
|
291 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
|
|
292 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
|
|
293
|
|
294 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
|
|
295
|
|
296 #NToverview = rbind(NToverview, NTsum)
|
|
297
|
|
298 NTresult = data.frame(nt=c("A", "C", "T", "G"))
|
|
299
|
|
300 for(clazz in gene.classes){
|
|
301 print(paste("class:", clazz))
|
|
302 NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
|
|
303 print(paste("nrow:", nrow(NToverview.sub)))
|
|
304 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
|
|
305 new.col.y = sum(new.col.x)
|
|
306 new.col.z = round(new.col.x / new.col.y * 100, 2)
|
|
307
|
|
308 tmp = names(NTresult)
|
|
309 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
|
|
310 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
|
|
311 }
|
|
312
|
|
313 NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]
|
|
314
|
|
315 names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")
|
|
316
|
|
317 write.table(NToverview.tmp, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
|
|
318
|
|
319 NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
|
|
320
|
|
321 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
|
|
322 new.col.y = sum(new.col.x)
|
|
323 new.col.z = round(new.col.x / new.col.y * 100, 2)
|
|
324
|
|
325 tmp = names(NTresult)
|
|
326 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
|
|
327 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
|
|
328
|
|
329 names(hotspot.analysis.sum) = names(NTresult)
|
|
330
|
|
331 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
|
|
332
|
|
333 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
|
|
334
|
|
335
|
|
336
|
|
337
|
|
338
|
|
339
|
|
340
|
|
341
|
|
342
|
|
343
|
|
344
|
|
345
|
|
346
|
|
347
|
|
348
|
|
349
|
|
350
|
|
351
|
|
352
|
|
353
|
|
354
|
|
355
|
|
356
|
|
357
|
|
358
|
|
359
|
|
360
|
|
361
|
|
362
|
|
363
|