Mercurial > repos > davidvanzessen > argalaxy_tools
comparison report_clonality/RScript.r @ 8:8cbc1a8d27ae draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 19 Dec 2016 08:50:31 -0500 |
parents | d001d0c05dbe |
children | efa1f5a17b6e |
comparison
equal
deleted
inserted
replaced
7:54f6756bacb1 | 8:8cbc1a8d27ae |
---|---|
133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence | 133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence |
134 PRODF$freq = 1 | 134 PRODF$freq = 1 |
135 } | 135 } |
136 } | 136 } |
137 | 137 |
138 #make a names list with sample -> color | |
139 naive.colors = c('blue4', 'darkred', 'olivedrab3', 'red', 'gray74', 'darkviolet', 'lightblue1', 'gold', 'chartreuse2', 'pink', 'Paleturquoise3', 'Chocolate1', 'Yellow', 'Deeppink3', 'Mediumorchid1', 'Darkgreen', 'Blue', 'Gray36', 'Hotpink', 'Yellow4') | |
140 unique.samples = unique(PRODF$Sample) | |
141 | |
142 if(length(unique.samples) <= length(naive.colors)){ | |
143 sample.colors = naive.colors[1:length(unique.samples)] | |
144 } else { | |
145 sample.colors = rainbow(length(unique.samples)) | |
146 } | |
147 | |
148 names(sample.colors) = unique.samples | |
149 | |
150 print("Sample.colors") | |
151 print(sample.colors) | |
138 | 152 |
139 | 153 |
140 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive | 154 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive |
141 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) | 155 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) |
142 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) | 156 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) |
273 | 287 |
274 print("Report Clonality - V, D and J frequency plots") | 288 print("Report Clonality - V, D and J frequency plots") |
275 | 289 |
276 pV = ggplot(PRODFV) | 290 pV = ggplot(PRODFV) |
277 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 291 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
278 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") | 292 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + scale_fill_manual(values=sample.colors) |
293 pV = pV + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
279 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 294 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
280 | 295 |
281 png("VPlot.png",width = 1280, height = 720) | 296 png("VPlot.png",width = 1280, height = 720) |
282 pV | 297 pV |
283 dev.off(); | 298 dev.off(); |
284 | 299 |
285 if(useD){ | 300 if(useD){ |
286 pD = ggplot(PRODFD) | 301 pD = ggplot(PRODFD) |
287 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 302 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
288 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") | 303 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + scale_fill_manual(values=sample.colors) |
304 pD = pD + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
289 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 305 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
290 | 306 |
291 png("DPlot.png",width = 800, height = 600) | 307 png("DPlot.png",width = 800, height = 600) |
292 print(pD) | 308 print(pD) |
293 dev.off(); | 309 dev.off(); |
294 } | 310 } |
295 | 311 |
296 pJ = ggplot(PRODFJ) | 312 pJ = ggplot(PRODFJ) |
297 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 313 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
298 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") | 314 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + scale_fill_manual(values=sample.colors) |
299 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 315 pJ = pJ + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) |
300 | |
301 png("JPlot.png",width = 800, height = 600) | |
302 pJ | |
303 dev.off(); | |
304 | |
305 pJ = ggplot(PRODFJ) | |
306 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
307 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") | |
308 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 316 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
309 | 317 |
310 png("JPlot.png",width = 800, height = 600) | 318 png("JPlot.png",width = 800, height = 600) |
311 pJ | 319 pJ |
312 dev.off(); | 320 dev.off(); |
322 VGenes = merge(VGenes, TotalPerSample, by="Sample") | 330 VGenes = merge(VGenes, TotalPerSample, by="Sample") |
323 VGenes$Frequency = VGenes$Count * 100 / VGenes$total | 331 VGenes$Frequency = VGenes$Count * 100 / VGenes$total |
324 VPlot = ggplot(VGenes) | 332 VPlot = ggplot(VGenes) |
325 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 333 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
326 ggtitle("Distribution of V gene families") + | 334 ggtitle("Distribution of V gene families") + |
327 ylab("Percentage of sequences") | 335 ylab("Percentage of sequences") + |
336 scale_fill_manual(values=sample.colors) + | |
337 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
328 png("VFPlot.png") | 338 png("VFPlot.png") |
329 VPlot | 339 VPlot |
330 dev.off(); | 340 dev.off(); |
331 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 341 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
332 | 342 |
338 DGenes = merge(DGenes, TotalPerSample, by="Sample") | 348 DGenes = merge(DGenes, TotalPerSample, by="Sample") |
339 DGenes$Frequency = DGenes$Count * 100 / DGenes$total | 349 DGenes$Frequency = DGenes$Count * 100 / DGenes$total |
340 DPlot = ggplot(DGenes) | 350 DPlot = ggplot(DGenes) |
341 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 351 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
342 ggtitle("Distribution of D gene families") + | 352 ggtitle("Distribution of D gene families") + |
343 ylab("Percentage of sequences") | 353 ylab("Percentage of sequences") + |
354 scale_fill_manual(values=sample.colors) + | |
355 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
344 png("DFPlot.png") | 356 png("DFPlot.png") |
345 print(DPlot) | 357 print(DPlot) |
346 dev.off(); | 358 dev.off(); |
347 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 359 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
348 } | 360 } |
357 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total | 369 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total |
358 CDR3LengthPlot = ggplot(CDR3Length) | 370 CDR3LengthPlot = ggplot(CDR3Length) |
359 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 371 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
360 ggtitle("Length distribution of CDR3") + | 372 ggtitle("Length distribution of CDR3") + |
361 xlab("CDR3 Length") + | 373 xlab("CDR3 Length") + |
362 ylab("Percentage of sequences") | 374 ylab("Percentage of sequences") + |
375 scale_fill_manual(values=sample.colors) + | |
376 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
363 png("CDR3LengthPlot.png",width = 1280, height = 720) | 377 png("CDR3LengthPlot.png",width = 1280, height = 720) |
364 CDR3LengthPlot | 378 CDR3LengthPlot |
365 dev.off() | 379 dev.off() |
366 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) | 380 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) |
367 | 381 |
784 PRODF = bak | 798 PRODF = bak |
785 | 799 |
786 | 800 |
787 # ---------------------- D reading frame ---------------------- | 801 # ---------------------- D reading frame ---------------------- |
788 | 802 |
789 D.REGION.reading.frame = PRODF$D.REGION.reading.frame | 803 D.REGION.reading.frame = PRODF[,c("Sample", "D.REGION.reading.frame")] |
790 | 804 |
791 D.REGION.reading.frame[is.na(D.REGION.reading.frame)] = "No D" | 805 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame) |
792 | 806 if(any(chck)){ |
793 D.REGION.reading.frame = data.frame(table(D.REGION.reading.frame)) | 807 D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D" |
808 } | |
809 | |
810 D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) | |
794 | 811 |
795 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) | 812 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) |
796 | 813 |
797 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) | 814 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) |
798 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") | 815 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") |
816 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors) | |
817 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
799 | 818 |
800 png("DReadingFrame.png") | 819 png("DReadingFrame.png") |
801 D.REGION.reading.frame | 820 D.REGION.reading.frame |
802 dev.off() | 821 dev.off() |
803 | 822 |
832 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) | 851 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) |
833 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 852 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) |
834 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 853 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) |
835 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 854 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) |
836 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 855 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) |
837 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") | 856 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + scale_fill_manual(values=sample.colors) |
857 AAfreqplot = AAfreqplot + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
838 | 858 |
839 png("AAComposition.png",width = 1280, height = 720) | 859 png("AAComposition.png",width = 1280, height = 720) |
840 AAfreqplot | 860 AAfreqplot |
841 dev.off() | 861 dev.off() |
842 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) | 862 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) |
843 | 863 |
844 | 864 # ---------------------- AA median CDR3 length ---------------------- |
865 | |
866 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length.DNA))), by=c("Sample")]) | |
867 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | |
868 |