comparison report_clonality/RScript.r @ 9:efa1f5a17b6e draft

Uploaded
author davidvanzessen
date Mon, 19 Dec 2016 09:23:01 -0500
parents 8cbc1a8d27ae
children d3ebaa2d2fe0
comparison
equal deleted inserted replaced
8:8cbc1a8d27ae 9:efa1f5a17b6e
361 361
362 # ---------------------- Plotting the cdr3 length ---------------------- 362 # ---------------------- Plotting the cdr3 length ----------------------
363 363
364 print("Report Clonality - CDR3 length plot") 364 print("Report Clonality - CDR3 length plot")
365 365
366 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")]) 366 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length")])
367 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) 367 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample])
368 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") 368 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample")
369 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total 369 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total
370 CDR3LengthPlot = ggplot(CDR3Length) 370 CDR3LengthPlot = ggplot(CDR3Length)
371 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 371 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
372 ggtitle("Length distribution of CDR3") + 372 ggtitle("Length distribution of CDR3") +
373 xlab("CDR3 Length") + 373 xlab("CDR3 Length") +
374 ylab("Percentage of sequences") + 374 ylab("Percentage of sequences") +
375 scale_fill_manual(values=sample.colors) + 375 scale_fill_manual(values=sample.colors) +
376 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) 376 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
398 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 398 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
399 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 399 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
400 scale_fill_gradient(low="gold", high="blue", na.value="white") + 400 scale_fill_gradient(low="gold", high="blue", na.value="white") +
401 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 401 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
402 xlab("D genes") + 402 xlab("D genes") +
403 ylab("V Genes") 403 ylab("V Genes") +
404 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major = element_line(colour = "gainsboro"))
404 405
405 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) 406 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
406 print(img) 407 print(img)
407 dev.off() 408 dev.off()
408 write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 409 write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
447 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 448 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
448 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 449 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
449 scale_fill_gradient(low="gold", high="blue", na.value="white") + 450 scale_fill_gradient(low="gold", high="blue", na.value="white") +
450 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 451 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
451 xlab("J genes") + 452 xlab("J genes") +
452 ylab("V Genes") 453 ylab("V Genes") +
454 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major = element_line(colour = "gainsboro"))
453 455
454 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) 456 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
455 print(img) 457 print(img)
456 dev.off() 458 dev.off()
457 write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 459 write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
495 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + 497 geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) +
496 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 498 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
497 scale_fill_gradient(low="gold", high="blue", na.value="white") + 499 scale_fill_gradient(low="gold", high="blue", na.value="white") +
498 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 500 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
499 xlab("J genes") + 501 xlab("J genes") +
500 ylab("D Genes") 502 ylab("D Genes") +
503 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major = element_line(colour = "gainsboro"))
501 504
502 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) 505 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
503 print(img) 506 print(img)
504 dev.off() 507 dev.off()
505 write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 508 write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
729 P4=mean(.SD$P5J.nt.nb, na.rm=T), 732 P4=mean(.SD$P5J.nt.nb, na.rm=T),
730 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 733 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
731 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), 734 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
732 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 735 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
733 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 736 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
734 Median.CDR3.l=median(.SD$CDR3.Length.DNA)), 737 Median.CDR3.l=median(.SD$CDR3.Length)),
735 by=c("Sample")]) 738 by=c("Sample")])
736 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 739 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
737 write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 740 write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
738 741
739 newData = data.frame(data.table(PRODF)[,list(unique=.N, 742 newData = data.frame(data.table(PRODF)[,list(unique=.N,
748 P4=num_median(.SD$P5J.nt.nb, na.rm=T), 751 P4=num_median(.SD$P5J.nt.nb, na.rm=T),
749 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 752 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
750 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), 753 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
751 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 754 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
752 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 755 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
753 Median.CDR3.l=median(.SD$CDR3.Length.DNA)), 756 Median.CDR3.l=median(.SD$CDR3.Length)),
754 by=c("Sample")]) 757 by=c("Sample")])
755 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 758 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
756 write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 759 write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
757 760
758 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 761 newData = data.frame(data.table(UNPROD)[,list(unique=.N,
767 P4=mean(.SD$P5J.nt.nb, na.rm=T), 770 P4=mean(.SD$P5J.nt.nb, na.rm=T),
768 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 771 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
769 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), 772 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
770 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 773 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
771 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 774 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
772 Median.CDR3.l=median(.SD$CDR3.Length.DNA)), 775 Median.CDR3.l=median(.SD$CDR3.Length)),
773 by=c("Sample")]) 776 by=c("Sample")])
774 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 777 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
775 write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 778 write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
776 779
777 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 780 newData = data.frame(data.table(UNPROD)[,list(unique=.N,
786 P4=num_median(.SD$P5J.nt.nb, na.rm=T), 789 P4=num_median(.SD$P5J.nt.nb, na.rm=T),
787 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 790 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
788 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), 791 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
789 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 792 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
790 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 793 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
791 Median.CDR3.l=median(.SD$CDR3.Length.DNA)), 794 Median.CDR3.l=median(.SD$CDR3.Length)),
792 by=c("Sample")]) 795 by=c("Sample")])
793 796
794 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 797 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
795 write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 798 write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
796 } 799 }
861 dev.off() 864 dev.off()
862 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) 865 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T)
863 866
864 # ---------------------- AA median CDR3 length ---------------------- 867 # ---------------------- AA median CDR3 length ----------------------
865 868
866 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length.DNA))), by=c("Sample")]) 869 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length))), by=c("Sample")])
867 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 870 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
868 871