comparison report_clonality/RScript.r @ 8:8cbc1a8d27ae draft

Uploaded
author davidvanzessen
date Mon, 19 Dec 2016 08:50:31 -0500
parents d001d0c05dbe
children efa1f5a17b6e
comparison
equal deleted inserted replaced
7:54f6756bacb1 8:8cbc1a8d27ae
133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence 133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence
134 PRODF$freq = 1 134 PRODF$freq = 1
135 } 135 }
136 } 136 }
137 137
138 #make a names list with sample -> color
139 naive.colors = c('blue4', 'darkred', 'olivedrab3', 'red', 'gray74', 'darkviolet', 'lightblue1', 'gold', 'chartreuse2', 'pink', 'Paleturquoise3', 'Chocolate1', 'Yellow', 'Deeppink3', 'Mediumorchid1', 'Darkgreen', 'Blue', 'Gray36', 'Hotpink', 'Yellow4')
140 unique.samples = unique(PRODF$Sample)
141
142 if(length(unique.samples) <= length(naive.colors)){
143 sample.colors = naive.colors[1:length(unique.samples)]
144 } else {
145 sample.colors = rainbow(length(unique.samples))
146 }
147
148 names(sample.colors) = unique.samples
149
150 print("Sample.colors")
151 print(sample.colors)
138 152
139 153
140 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive 154 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive
141 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) 155 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T)
142 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) 156 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T)
273 287
274 print("Report Clonality - V, D and J frequency plots") 288 print("Report Clonality - V, D and J frequency plots")
275 289
276 pV = ggplot(PRODFV) 290 pV = ggplot(PRODFV)
277 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 291 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
278 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") 292 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + scale_fill_manual(values=sample.colors)
293 pV = pV + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
279 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 294 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
280 295
281 png("VPlot.png",width = 1280, height = 720) 296 png("VPlot.png",width = 1280, height = 720)
282 pV 297 pV
283 dev.off(); 298 dev.off();
284 299
285 if(useD){ 300 if(useD){
286 pD = ggplot(PRODFD) 301 pD = ggplot(PRODFD)
287 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 302 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
288 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") 303 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + scale_fill_manual(values=sample.colors)
304 pD = pD + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
289 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 305 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
290 306
291 png("DPlot.png",width = 800, height = 600) 307 png("DPlot.png",width = 800, height = 600)
292 print(pD) 308 print(pD)
293 dev.off(); 309 dev.off();
294 } 310 }
295 311
296 pJ = ggplot(PRODFJ) 312 pJ = ggplot(PRODFJ)
297 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 313 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
298 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") 314 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + scale_fill_manual(values=sample.colors)
299 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 315 pJ = pJ + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
300
301 png("JPlot.png",width = 800, height = 600)
302 pJ
303 dev.off();
304
305 pJ = ggplot(PRODFJ)
306 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
307 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage")
308 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 316 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
309 317
310 png("JPlot.png",width = 800, height = 600) 318 png("JPlot.png",width = 800, height = 600)
311 pJ 319 pJ
312 dev.off(); 320 dev.off();
322 VGenes = merge(VGenes, TotalPerSample, by="Sample") 330 VGenes = merge(VGenes, TotalPerSample, by="Sample")
323 VGenes$Frequency = VGenes$Count * 100 / VGenes$total 331 VGenes$Frequency = VGenes$Count * 100 / VGenes$total
324 VPlot = ggplot(VGenes) 332 VPlot = ggplot(VGenes)
325 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 333 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
326 ggtitle("Distribution of V gene families") + 334 ggtitle("Distribution of V gene families") +
327 ylab("Percentage of sequences") 335 ylab("Percentage of sequences") +
336 scale_fill_manual(values=sample.colors) +
337 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
328 png("VFPlot.png") 338 png("VFPlot.png")
329 VPlot 339 VPlot
330 dev.off(); 340 dev.off();
331 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 341 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
332 342
338 DGenes = merge(DGenes, TotalPerSample, by="Sample") 348 DGenes = merge(DGenes, TotalPerSample, by="Sample")
339 DGenes$Frequency = DGenes$Count * 100 / DGenes$total 349 DGenes$Frequency = DGenes$Count * 100 / DGenes$total
340 DPlot = ggplot(DGenes) 350 DPlot = ggplot(DGenes)
341 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 351 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
342 ggtitle("Distribution of D gene families") + 352 ggtitle("Distribution of D gene families") +
343 ylab("Percentage of sequences") 353 ylab("Percentage of sequences") +
354 scale_fill_manual(values=sample.colors) +
355 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
344 png("DFPlot.png") 356 png("DFPlot.png")
345 print(DPlot) 357 print(DPlot)
346 dev.off(); 358 dev.off();
347 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) 359 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
348 } 360 }
357 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total 369 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total
358 CDR3LengthPlot = ggplot(CDR3Length) 370 CDR3LengthPlot = ggplot(CDR3Length)
359 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 371 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
360 ggtitle("Length distribution of CDR3") + 372 ggtitle("Length distribution of CDR3") +
361 xlab("CDR3 Length") + 373 xlab("CDR3 Length") +
362 ylab("Percentage of sequences") 374 ylab("Percentage of sequences") +
375 scale_fill_manual(values=sample.colors) +
376 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
363 png("CDR3LengthPlot.png",width = 1280, height = 720) 377 png("CDR3LengthPlot.png",width = 1280, height = 720)
364 CDR3LengthPlot 378 CDR3LengthPlot
365 dev.off() 379 dev.off()
366 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) 380 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T)
367 381
784 PRODF = bak 798 PRODF = bak
785 799
786 800
787 # ---------------------- D reading frame ---------------------- 801 # ---------------------- D reading frame ----------------------
788 802
789 D.REGION.reading.frame = PRODF$D.REGION.reading.frame 803 D.REGION.reading.frame = PRODF[,c("Sample", "D.REGION.reading.frame")]
790 804
791 D.REGION.reading.frame[is.na(D.REGION.reading.frame)] = "No D" 805 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame)
792 806 if(any(chck)){
793 D.REGION.reading.frame = data.frame(table(D.REGION.reading.frame)) 807 D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D"
808 }
809
810 D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")])
794 811
795 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) 812 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T)
796 813
797 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) 814 D.REGION.reading.frame = ggplot(D.REGION.reading.frame)
798 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") 815 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")
816 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors)
817 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
799 818
800 png("DReadingFrame.png") 819 png("DReadingFrame.png")
801 D.REGION.reading.frame 820 D.REGION.reading.frame
802 dev.off() 821 dev.off()
803 822
832 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) 851 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' )
833 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) 852 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2)
834 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) 853 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2)
835 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) 854 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2)
836 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) 855 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2)
837 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") 856 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + scale_fill_manual(values=sample.colors)
857 AAfreqplot = AAfreqplot + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
838 858
839 png("AAComposition.png",width = 1280, height = 720) 859 png("AAComposition.png",width = 1280, height = 720)
840 AAfreqplot 860 AAfreqplot
841 dev.off() 861 dev.off()
842 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) 862 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T)
843 863
844 864 # ---------------------- AA median CDR3 length ----------------------
865
866 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length.DNA))), by=c("Sample")])
867 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
868