comparison shm_csr.r @ 81:b6f9a640e098 draft

Uploaded
author davidvanzessen
date Fri, 19 Feb 2021 15:10:54 +0000
parents
children 8fcf31272f6e
comparison
equal deleted inserted replaced
80:a4617f1d1d89 81:b6f9a640e098
1 library(data.table)
2 library(ggplot2)
3 library(reshape2)
4
5 args <- commandArgs(trailingOnly = TRUE)
6
7 input = args[1]
8 genes = unlist(strsplit(args[2], ","))
9 outputdir = args[3]
10 empty.region.filter = args[4]
11 setwd(outputdir)
12
13 #dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
14
15 dat = data.frame(fread(input, sep="\t", header=T, stringsAsFactors=F)) #fread because read.table suddenly skips certain rows...
16
17 if(length(dat$Sequence.ID) == 0){
18 setwd(outputdir)
19 result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
20 row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)")
21 write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
22 transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
23 row.names(transitionTable) = c("A", "C", "G", "T")
24 transitionTable["A","A"] = NA
25 transitionTable["C","C"] = NA
26 transitionTable["G","G"] = NA
27 transitionTable["T","T"] = NA
28
29 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
30 cat("0", file="n.txt")
31 stop("No data")
32 }
33
34 cleanup_columns = c("FR1.IMGT.c.a",
35 "FR2.IMGT.g.t",
36 "CDR1.IMGT.Nb.of.nucleotides",
37 "CDR2.IMGT.t.a",
38 "FR1.IMGT.c.g",
39 "CDR1.IMGT.c.t",
40 "FR2.IMGT.a.c",
41 "FR2.IMGT.Nb.of.mutations",
42 "FR2.IMGT.g.c",
43 "FR2.IMGT.a.g",
44 "FR3.IMGT.t.a",
45 "FR3.IMGT.t.c",
46 "FR2.IMGT.g.a",
47 "FR3.IMGT.c.g",
48 "FR1.IMGT.Nb.of.mutations",
49 "CDR1.IMGT.g.a",
50 "CDR1.IMGT.t.g",
51 "CDR1.IMGT.g.c",
52 "CDR2.IMGT.Nb.of.nucleotides",
53 "FR2.IMGT.a.t",
54 "CDR1.IMGT.Nb.of.mutations",
55 "CDR3.IMGT.Nb.of.nucleotides",
56 "CDR1.IMGT.a.g",
57 "FR3.IMGT.a.c",
58 "FR1.IMGT.g.a",
59 "FR3.IMGT.a.g",
60 "FR1.IMGT.a.t",
61 "CDR2.IMGT.a.g",
62 "CDR2.IMGT.Nb.of.mutations",
63 "CDR2.IMGT.g.t",
64 "CDR2.IMGT.a.c",
65 "CDR1.IMGT.t.c",
66 "FR3.IMGT.g.c",
67 "FR1.IMGT.g.t",
68 "FR3.IMGT.g.t",
69 "CDR1.IMGT.a.t",
70 "FR1.IMGT.a.g",
71 "FR3.IMGT.a.t",
72 "FR3.IMGT.Nb.of.nucleotides",
73 "FR2.IMGT.t.c",
74 "CDR2.IMGT.g.a",
75 "FR2.IMGT.t.a",
76 "CDR1.IMGT.t.a",
77 "FR2.IMGT.t.g",
78 "FR3.IMGT.t.g",
79 "FR2.IMGT.Nb.of.nucleotides",
80 "FR1.IMGT.t.a",
81 "FR1.IMGT.t.g",
82 "FR3.IMGT.c.t",
83 "FR1.IMGT.t.c",
84 "CDR2.IMGT.a.t",
85 "FR2.IMGT.c.t",
86 "CDR1.IMGT.g.t",
87 "CDR2.IMGT.t.g",
88 "FR1.IMGT.Nb.of.nucleotides",
89 "CDR1.IMGT.c.g",
90 "CDR2.IMGT.t.c",
91 "FR3.IMGT.g.a",
92 "CDR1.IMGT.a.c",
93 "FR2.IMGT.c.a",
94 "FR3.IMGT.Nb.of.mutations",
95 "FR2.IMGT.c.g",
96 "CDR2.IMGT.g.c",
97 "FR1.IMGT.g.c",
98 "CDR2.IMGT.c.t",
99 "FR3.IMGT.c.a",
100 "CDR1.IMGT.c.a",
101 "CDR2.IMGT.c.g",
102 "CDR2.IMGT.c.a",
103 "FR1.IMGT.c.t",
104 "FR1.IMGT.Nb.of.silent.mutations",
105 "FR2.IMGT.Nb.of.silent.mutations",
106 "FR3.IMGT.Nb.of.silent.mutations",
107 "FR1.IMGT.Nb.of.nonsilent.mutations",
108 "FR2.IMGT.Nb.of.nonsilent.mutations",
109 "FR3.IMGT.Nb.of.nonsilent.mutations")
110
111 print("Cleaning up columns")
112
113 for(col in cleanup_columns){
114 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
115 #dat[dat[,col] == "",] = "0"
116 dat[,col] = as.numeric(dat[,col])
117 dat[is.na(dat[,col]),col] = 0
118 }
119
120 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")
121 if(empty.region.filter == "FR1") {
122 regions = c("CDR1", "FR2", "CDR2", "FR3")
123 } else if (empty.region.filter == "CDR1") {
124 regions = c("FR2", "CDR2", "FR3")
125 } else if (empty.region.filter == "FR2") {
126 regions = c("CDR2", "FR3")
127 }
128
129 pdfplots = list() #save() this later to create the pdf plots in another script (maybe avoids the "address (nil), cause memory not mapped")
130
131 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
132
133 print("aggregating data into new columns")
134
135 VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
136 dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
137
138 VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
139 dat$FR3.IMGT.Nb.of.nucleotides = nchar(dat$FR3.IMGT.seq)
140 dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns)
141
142 transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="")
143 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns)
144
145 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="")
146 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns)
147
148 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
149 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
150
151 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="")
152 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="")
153 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
154
155 transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="")
156 dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns)
157
158 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="")
159 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="")
160 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns)
161
162 FRRegions = regions[grepl("FR", regions)]
163 CDRRegions = regions[grepl("CDR", regions)]
164
165 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
166 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns)
167
168 CDR_silentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
169 dat$silentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_silentMutations_columns)
170
171 FR_nonSilentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
172 dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns)
173
174 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
175 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns)
176
177 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR")
178 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T)
179
180 setwd(outputdir)
181
182 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
183
184 base.order.x = data.frame(base=c("A", "C", "G", "T"), order.x=1:4)
185 base.order.y = data.frame(base=c("T", "G", "C", "A"), order.y=1:4)
186
187 calculate_result = function(i, gene, dat, matrx, f, fname, name){
188 tmp = dat[grepl(paste("^", gene, ".*", sep=""), dat$best_match),]
189
190 j = i - 1
191 x = (j * 3) + 1
192 y = (j * 3) + 2
193 z = (j * 3) + 3
194
195 if(nrow(tmp) > 0){
196 if(fname == "sum"){
197 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
198 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
199 matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1)
200 } else {
201 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
202 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
203 matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1)
204 }
205
206 matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1)
207 matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
208 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
209
210 matrx[3,x] = round(f(tmp$transversionMutations, na.rm=T), digits=1)
211 matrx[3,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
212 matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
213
214 matrx[4,x] = round(f(tmp$transitionMutationsAtGC, na.rm=T), digits=1)
215 matrx[4,y] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1)
216 matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
217
218 matrx[5,x] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1)
219 matrx[5,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
220 matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
221
222 matrx[6,x] = round(f(tmp$transitionMutationsAtAT, na.rm=T), digits=1)
223 matrx[6,y] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1)
224 matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1)
225
226 matrx[7,x] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1)
227 matrx[7,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
228 matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1)
229
230 matrx[8,x] = round(f(tmp$nonSilentMutationsFR, na.rm=T), digits=1)
231 matrx[8,y] = round(f(tmp$silentMutationsFR, na.rm=T), digits=1)
232 matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1)
233
234 matrx[9,x] = round(f(tmp$nonSilentMutationsCDR, na.rm=T), digits=1)
235 matrx[9,y] = round(f(tmp$silentMutationsCDR, na.rm=T), digits=1)
236 matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1)
237
238 if(fname == "sum"){
239
240 regions.fr = regions[grepl("FR", regions)]
241 regions.fr = paste(regions.fr, ".IMGT.Nb.of.nucleotides", sep="")
242 regions.cdr = regions[grepl("CDR", regions)]
243 regions.cdr = paste(regions.cdr, ".IMGT.Nb.of.nucleotides", sep="")
244
245 if(length(regions.fr) > 1){ #in case there is only on FR region (rowSums needs >1 column)
246 matrx[10,x] = round(f(rowSums(tmp[,regions.fr], na.rm=T)), digits=1)
247 } else {
248 matrx[10,x] = round(f(tmp[,regions.fr], na.rm=T), digits=1)
249 }
250 matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
251 matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1)
252
253 if(length(regions.cdr) > 1){ #in case there is only on CDR region
254 matrx[11,x] = round(f(rowSums(tmp[,regions.cdr], na.rm=T)), digits=1)
255 } else {
256 matrx[11,x] = round(f(tmp[,regions.cdr], na.rm=T), digits=1)
257 }
258 matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
259 matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1)
260 }
261 }
262
263 transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros)
264 row.names(transitionTable) = c("A", "C", "G", "T")
265 transitionTable["A","A"] = NA
266 transitionTable["C","C"] = NA
267 transitionTable["G","G"] = NA
268 transitionTable["T","T"] = NA
269
270 if(nrow(tmp) > 0){
271 for(nt1 in nts){
272 for(nt2 in nts){
273 if(nt1 == nt2){
274 next
275 }
276 NT1 = LETTERS[letters == nt1]
277 NT2 = LETTERS[letters == nt2]
278 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
279 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
280 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
281 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
282 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
283 if (empty.region.filter == "leader"){
284 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
285 } else if (empty.region.filter == "FR1") {
286 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
287 } else if (empty.region.filter == "CDR1") {
288 transitionTable[NT1,NT2] = sum(tmp[,c(FR2, CDR2, FR3)])
289 } else if (empty.region.filter == "FR2") {
290 transitionTable[NT1,NT2] = sum(tmp[,c(CDR2, FR3)])
291 }
292 }
293 }
294 transition = transitionTable
295 transition$id = names(transition)
296
297 transition2 = melt(transition, id.vars="id")
298
299 transition2 = merge(transition2, base.order.x, by.x="id", by.y="base")
300
301 transition2 = merge(transition2, base.order.y, by.x="variable", by.y="base")
302
303 transition2[is.na(transition2$value),]$value = 0
304
305 if(any(transition2$value != 0)){ #having a transition table filled with 0 is bad
306 print("Plotting heatmap and transition")
307 png(filename=paste("transitions_stacked_", name, ".png", sep=""))
308 p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar
309 p = p + xlab("From base") + ylab("") + ggtitle("Bargraph transition information") + guides(fill=guide_legend(title=NULL))
310 p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4"))
311 #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black"))
312 print(p)
313 dev.off()
314
315 pdfplots[[paste("transitions_stacked_", name, ".pdf", sep="")]] <<- p
316
317 png(filename=paste("transitions_heatmap_", name, ".png", sep=""))
318 p = ggplot(transition2, aes(factor(reorder(variable, -order.y)), factor(reorder(id, -order.x)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap
319 p = p + xlab("To base") + ylab("From Base") + ggtitle("Heatmap transition information") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
320 print(p)
321 dev.off()
322
323 pdfplots[[paste("transitions_heatmap_", name, ".pdf", sep="")]] <<- p
324 } else {
325 #print("No data to plot")
326 }
327 }
328
329 #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep=""))
330 write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
331 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
332 cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep=""))
333 cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep=""))
334 #print(paste(fname, name, nrow(tmp)))
335 matrx
336 }
337 nts = c("a", "c", "g", "t")
338 zeros=rep(0, 4)
339 funcs = c(median, sum, mean)
340 fnames = c("median", "sum", "mean")
341
342 print("Creating result tables")
343
344 for(i in 1:length(funcs)){
345 func = funcs[[i]]
346 fname = fnames[[i]]
347
348 print(paste("Creating table for", fname))
349
350 rows = 9
351 if(fname == "sum"){
352 rows = 11
353 }
354 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows)
355 for(i in 1:length(genes)){
356 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
357 }
358 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
359
360 result = data.frame(matrx)
361 if(fname == "sum"){
362 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR")
363 } else {
364 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
365 }
366 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
367 }
368
369 print("Adding median number of mutations to sum table")
370 sum.table = read.table("mutations_sum.txt", sep=",", header=F)
371 median.table = read.table("mutations_median.txt", sep=",", header=F)
372
373 new.table = sum.table[1,]
374 new.table[2,] = median.table[1,]
375 new.table[3:12,] = sum.table[2:11,]
376 new.table[,1] = as.character(new.table[,1])
377 new.table[2,1] = "Median of Number of Mutations (%)"
378
379 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]
380
381 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)
382
383 print("Plotting IGA piechart")
384
385 dat = dat[!grepl("^unmatched", dat$best_match),]
386
387 #blegh
388
389 genesForPlot = dat[grepl("IGA", dat$best_match),]$best_match
390
391 if(length(genesForPlot) > 0){
392 genesForPlot = data.frame(table(genesForPlot))
393 colnames(genesForPlot) = c("Gene","Freq")
394 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
395
396 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
397 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4"))
398 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
399 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"), axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank())
400 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclass distribution", "( n =", sum(genesForPlot$Freq), ")"))
401 write.table(genesForPlot, "IGA_pie.txt", sep="\t",quote=F,row.names=F,col.names=T)
402
403 png(filename="IGA.png")
404 print(pc)
405 dev.off()
406
407 pdfplots[["IGA.pdf"]] <- pc
408 }
409
410 print("Plotting IGG piechart")
411
412 genesForPlot = dat[grepl("IGG", dat$best_match),]$best_match
413
414 if(length(genesForPlot) > 0){
415 genesForPlot = data.frame(table(genesForPlot))
416 colnames(genesForPlot) = c("Gene","Freq")
417 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
418
419 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
420 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred"))
421 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
422 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"), axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank())
423 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclass distribution", "( n =", sum(genesForPlot$Freq), ")"))
424 write.table(genesForPlot, "IGG_pie.txt", sep="\t",quote=F,row.names=F,col.names=T)
425
426 png(filename="IGG.png")
427 print(pc)
428 dev.off()
429
430 pdfplots[["IGG.pdf"]] <- pc
431 }
432
433 print("Plotting scatterplot")
434
435 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
436 dat.clss = dat
437
438 dat.clss$best_match = substr(dat.clss$best_match, 0, 3)
439
440 dat.clss = rbind(dat, dat.clss)
441
442 p = ggplot(dat.clss, aes(best_match, percentage_mutations))
443 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
444 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
445 p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
446 p = p + scale_colour_manual(guide = guide_legend(title = "Subclass"), values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
447
448 png(filename="scatter.png")
449 print(p)
450 dev.off()
451
452 pdfplots[["scatter.pdf"]] <- p
453
454 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
455
456 print("Plotting frequency ranges plot")
457
458 dat$best_match_class = substr(dat$best_match, 0, 3)
459 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
460 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
461
462 frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])
463
464 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])
465
466 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")
467
468 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
469
470 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))
471 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
472 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(guide = guide_legend(title = "Class"), values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
473
474 png(filename="frequency_ranges.png")
475 print(p)
476 dev.off()
477
478 pdfplots[["frequency_ranges.pdf"]] <- p
479
480 save(pdfplots, file="pdfplots.RData")
481
482 frequency_bins_data_by_class = frequency_bins_data
483
484 frequency_bins_data_by_class = frequency_bins_data_by_class[order(frequency_bins_data_by_class$best_match_class, frequency_bins_data_by_class$frequency_bins),]
485
486 frequency_bins_data_by_class$frequency_bins = gsub("-", " to ", frequency_bins_data_by_class$frequency_bins)
487 frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "20", c("frequency_bins")] = "20 or higher"
488 frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "0", c("frequency_bins")] = "0 or lower"
489
490 write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T)
491
492 frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])
493
494 frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match")])
495
496 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match")
497
498 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
499
500 frequency_bins_data = frequency_bins_data[order(frequency_bins_data$best_match, frequency_bins_data$frequency_bins),]
501 frequency_bins_data$frequency_bins = gsub("-", " to ", frequency_bins_data$frequency_bins)
502 frequency_bins_data[frequency_bins_data$frequency_bins == "20", c("frequency_bins")] = "20 or higher"
503 frequency_bins_data[frequency_bins_data$frequency_bins == "0", c("frequency_bins")] = "0 or lower"
504
505 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T)
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561