comparison sequence_overview.r @ 81:b6f9a640e098 draft

Uploaded
author davidvanzessen
date Fri, 19 Feb 2021 15:10:54 +0000
parents
children
comparison
equal deleted inserted replaced
80:a4617f1d1d89 81:b6f9a640e098
1 library(reshape2)
2
3 args <- commandArgs(trailingOnly = TRUE)
4
5 before.unique.file = args[1]
6 merged.file = args[2]
7 outputdir = args[3]
8 gene.classes = unlist(strsplit(args[4], ","))
9 hotspot.analysis.sum.file = args[5]
10 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
11 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
12 main.html = "index.html"
13 empty.region.filter = args[6]
14
15
16 setwd(outputdir)
17
18 before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
19 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
20 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
21
22 #before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]
23
24 if(empty.region.filter == "leader"){
25 before.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
26 } else if(empty.region.filter == "FR1"){
27 before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
28 } else if(empty.region.filter == "CDR1"){
29 before.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
30 } else if(empty.region.filter == "FR2"){
31 before.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
32 }
33
34 IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
35 IDs$best_match = as.character(IDs$best_match)
36
37 dat = data.frame(table(before.unique$seq_conc))
38
39 names(dat) = c("seq_conc", "Freq")
40
41 dat$seq_conc = factor(dat$seq_conc)
42
43 dat = dat[order(as.character(dat$seq_conc)),]
44
45 #writing html from R...
46 get.bg.color = function(val){
47 if(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color
48 return(ifelse(val,"#eafaf1","#f9ebea"))
49 } else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0
50 return(ifelse(val > 0,"#eaecee","white"))
51 } else {
52 return("white")
53 }
54 }
55 td = function(val) {
56 return(paste("<td bgcolor='", get.bg.color(val), "'>", val, "</td>", sep=""))
57 }
58 tr = function(val) {
59 return(paste(c("<tr>", sapply(val, td), "</tr>"), collapse=""))
60 }
61
62 make.link = function(id, clss, val) {
63 paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="")
64 }
65 tbl = function(df) {
66 res = "<table border='1'>"
67 for(i in 1:nrow(df)){
68 res = paste(res, tr(df[i,]), sep="")
69 }
70 res = paste(res, "</table>")
71 }
72
73 cat("<center><img src=''> Please note that this tab is based on all sequences before filter unique sequences and the remove duplicates based on filters are applied. In this table only sequences occuring more than once are included. </center>", file=main.html, append=F)
74 cat("<table border='1' class='pure-table pure-table-striped'>", file=main.html, append=T)
75
76 if(empty.region.filter == "leader"){
77 cat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
78 } else if(empty.region.filter == "FR1"){
79 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
80 } else if(empty.region.filter == "CDR1"){
81 cat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
82 } else if(empty.region.filter == "FR2"){
83 cat("<caption>CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
84 }
85
86 cat("<tr>", file=main.html, append=T)
87 cat("<th>Sequence</th><th>Functionality</th><th>IGA1</th><th>IGA2</th><th>IGG1</th><th>IGG2</th><th>IGG3</th><th>IGG4</th><th>IGM</th><th>IGE</th><th>UN</th>", file=main.html, append=T)
88 cat("<th>total IGA</th><th>total IGG</th><th>total IGM</th><th>total IGE</th><th>number of subclasses</th><th>present in both IGA and IGG</th><th>present in IGA, IGG and IGM</th><th>present in IGA, IGG and IGE</th><th>present in IGA, IGG, IGM and IGE</th><th>IGA1+IGA2</th>", file=main.html, append=T)
89 cat("<th>IGG1+IGG2</th><th>IGG1+IGG3</th><th>IGG1+IGG4</th><th>IGG2+IGG3</th><th>IGG2+IGG4</th><th>IGG3+IGG4</th>", file=main.html, append=T)
90 cat("<th>IGG1+IGG2+IGG3</th><th>IGG2+IGG3+IGG4</th><th>IGG1+IGG2+IGG4</th><th>IGG1+IGG3+IGG4</th><th>IGG1+IGG2+IGG3+IGG4</th>", file=main.html, append=T)
91 cat("</tr>", file=main.html, append=T)
92
93
94
95 single.sequences=0 #sequence only found once, skipped
96 in.multiple=0 #same sequence across multiple subclasses
97 multiple.in.one=0 #same sequence multiple times in one subclass
98 unmatched=0 #all of the sequences are unmatched
99 some.unmatched=0 #one or more sequences in a clone are unmatched
100 matched=0 #should be the same als matched sequences
101
102 sequence.id.page="by_id.html"
103
104 for(i in 1:nrow(dat)){
105
106 ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA1", IDs$best_match),]
107 ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGA2", IDs$best_match),]
108
109 cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG1", IDs$best_match),]
110 cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG2", IDs$best_match),]
111 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG3", IDs$best_match),]
112 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGG4", IDs$best_match),]
113
114 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGM", IDs$best_match),]
115
116 ce = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^IGE", IDs$best_match),]
117
118 un = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^unmatched", IDs$best_match),]
119
120 allc = rbind(ca1, ca2, cg1, cg2, cg3, cg4, cm, ce, un)
121
122 ca1.n = nrow(ca1)
123 ca2.n = nrow(ca2)
124
125 cg1.n = nrow(cg1)
126 cg2.n = nrow(cg2)
127 cg3.n = nrow(cg3)
128 cg4.n = nrow(cg4)
129
130 cm.n = nrow(cm)
131
132 ce.n = nrow(ce)
133
134 un.n = nrow(un)
135
136 classes = c(ca1.n, ca2.n, cg1.n, cg2.n, cg3.n, cg4.n, cm.n, ce.n, un.n)
137
138 classes.sum = sum(classes)
139
140 if(classes.sum == 1){
141 single.sequences = single.sequences + 1
142 next
143 }
144
145 if(un.n == classes.sum){
146 unmatched = unmatched + 1
147 next
148 }
149
150 classes.no.un = classes[-length(classes)]
151
152 in.classes = sum(classes.no.un > 0)
153
154 matched = matched + in.classes #count in how many subclasses the sequence occurs.
155
156 if(any(classes == classes.sum)){
157 multiple.in.one = multiple.in.one + 1
158 } else if (un.n > 0) {
159 some.unmatched = some.unmatched + 1
160 } else {
161 in.multiple = in.multiple + 1
162 }
163
164 id = as.numeric(dat[i,"seq_conc"])
165
166 functionality = paste(unique(allc[,"Functionality"]), collapse=",")
167
168 by.id.row = c()
169
170 if(ca1.n > 0){
171 cat(tbl(ca1), file=paste("IGA1_", id, ".html", sep=""))
172 }
173
174 if(ca2.n > 0){
175 cat(tbl(ca2), file=paste("IGA2_", id, ".html", sep=""))
176 }
177
178 if(cg1.n > 0){
179 cat(tbl(cg1), file=paste("IGG1_", id, ".html", sep=""))
180 }
181
182 if(cg2.n > 0){
183 cat(tbl(cg2), file=paste("IGG2_", id, ".html", sep=""))
184 }
185
186 if(cg3.n > 0){
187 cat(tbl(cg3), file=paste("IGG3_", id, ".html", sep=""))
188 }
189
190 if(cg4.n > 0){
191 cat(tbl(cg4), file=paste("IGG4_", id, ".html", sep=""))
192 }
193
194 if(cm.n > 0){
195 cat(tbl(cm), file=paste("IGM_", id, ".html", sep=""))
196 }
197
198 if(ce.n > 0){
199 cat(tbl(ce), file=paste("IGE_", id, ".html", sep=""))
200 }
201
202 if(un.n > 0){
203 cat(tbl(un), file=paste("un_", id, ".html", sep=""))
204 }
205
206 ca1.html = make.link(id, "IGA1", ca1.n)
207 ca2.html = make.link(id, "IGA2", ca2.n)
208
209 cg1.html = make.link(id, "IGG1", cg1.n)
210 cg2.html = make.link(id, "IGG2", cg2.n)
211 cg3.html = make.link(id, "IGG3", cg3.n)
212 cg4.html = make.link(id, "IGG4", cg4.n)
213
214 cm.html = make.link(id, "IGM", cm.n)
215
216 ce.html = make.link(id, "IGE", ce.n)
217
218 un.html = make.link(id, "un", un.n)
219
220 #extra columns
221 ca.n = ca1.n + ca2.n
222
223 cg.n = cg1.n + cg2.n + cg3.n + cg4.n
224
225 #in.classes
226
227 in.ca.cg = (ca.n > 0 & cg.n > 0)
228
229 in.ca.cg.cm = (ca.n > 0 & cg.n > 0 & cm.n > 0)
230
231 in.ca.cg.ce = (ca.n > 0 & cg.n > 0 & ce.n > 0)
232
233 in.ca.cg.cm.ce = (ca.n > 0 & cg.n > 0 & cm.n > 0 & ce.n > 0)
234
235 in.ca1.ca2 = (ca1.n > 0 & ca2.n > 0)
236
237 in.cg1.cg2 = (cg1.n > 0 & cg2.n > 0)
238 in.cg1.cg3 = (cg1.n > 0 & cg3.n > 0)
239 in.cg1.cg4 = (cg1.n > 0 & cg4.n > 0)
240 in.cg2.cg3 = (cg2.n > 0 & cg3.n > 0)
241 in.cg2.cg4 = (cg2.n > 0 & cg4.n > 0)
242 in.cg3.cg4 = (cg3.n > 0 & cg4.n > 0)
243
244 in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)
245 in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
246 in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)
247 in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)
248
249 in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
250
251 #rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
252 rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, ce.html, un.html)
253 rw = c(rw, ca.n, cg.n, cm.n, ce.n, in.classes, in.ca.cg, in.ca.cg.cm, in.ca.cg.ce, in.ca.cg.cm.ce, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)
254
255
256
257 cat(tr(rw), file=main.html, append=T)
258
259
260 for(i in 1:nrow(allc)){ #generate html by id
261 html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])
262 cat(paste(html, "<br />"), file=sequence.id.page, append=T)
263 }
264 }
265
266 cat("</table>", file=main.html, append=T)
267
268 print(paste("Single sequences:", single.sequences))
269 print(paste("Sequences in multiple subclasses:", in.multiple))
270 print(paste("Multiple sequences in one subclass:", multiple.in.one))
271 print(paste("Matched with unmatched:", some.unmatched))
272 print(paste("Count that should match 'matched' sequences:", matched))
273
274 #ACGT overview
275
276 #NToverview = merged[!grepl("^unmatched", merged$best_match),]
277 NToverview = merged
278
279 if(empty.region.filter == "leader"){
280 NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
281 } else if(empty.region.filter == "FR1"){
282 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
283 } else if(empty.region.filter == "CDR1"){
284 NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
285 } else if(empty.region.filter == "FR2"){
286 NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
287 }
288
289 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
290 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
291 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
292 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
293
294 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
295
296 #NToverview = rbind(NToverview, NTsum)
297
298 NTresult = data.frame(nt=c("A", "C", "T", "G"))
299
300 for(clazz in gene.classes){
301 print(paste("class:", clazz))
302 NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
303 print(paste("nrow:", nrow(NToverview.sub)))
304 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
305 new.col.y = sum(new.col.x)
306 new.col.z = round(new.col.x / new.col.y * 100, 2)
307
308 tmp = names(NTresult)
309 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
310 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
311 }
312
313 NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]
314
315 names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")
316
317 write.table(NToverview.tmp, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
318
319 NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
320
321 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
322 new.col.y = sum(new.col.x)
323 new.col.z = round(new.col.x / new.col.y * 100, 2)
324
325 tmp = names(NTresult)
326 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
327 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
328
329 names(hotspot.analysis.sum) = names(NTresult)
330
331 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
332
333 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363