comparison report_clonality/RScript.r @ 24:d5d203d38c8a draft

Uploaded
author davidvanzessen
date Wed, 01 Feb 2017 09:48:38 -0500
parents 9185c3dfc679
children 94765af0db1f
comparison
equal deleted inserted replaced
23:e2fbdfacec1d 24:d5d203d38c8a
47 47
48 print("Report Clonality - Data preperation") 48 print("Report Clonality - Data preperation")
49 49
50 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="", stringsAsFactors=F) 50 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="", stringsAsFactors=F)
51 51
52
52 print(paste("nrows: ", nrow(inputdata))) 53 print(paste("nrows: ", nrow(inputdata)))
53 54
54 setwd(outdir) 55 setwd(outdir)
55 56
56 # remove weird rows 57 # remove weird rows
119 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) 120 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":"))
120 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) 121 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":"))
121 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] 122 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]
122 } 123 }
123 124
124 print("SAMPLE TABLE:")
125 print(table(PRODF$Sample))
126
127 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) 125 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])
128 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) 126 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])
129 127
130 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")]) 128 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")])
131 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")]) 129 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")])
159 157
160 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive 158 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive
161 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) 159 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T)
162 #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) 160 #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T)
163 write.table(UNPROD, "allUnproductive.csv", sep=",",quote=F,row.names=F,col.names=T) 161 write.table(UNPROD, "allUnproductive.csv", sep=",",quote=F,row.names=F,col.names=T)
162
163 print("SAMPLE TABLE:")
164 print(table(PRODF$Sample))
164 165
165 #write the samples to a file 166 #write the samples to a file
166 sampleFile <- file("samples.txt") 167 sampleFile <- file("samples.txt")
167 un = unique(inputdata$Sample) 168 un = unique(inputdata$Sample)
168 un = paste(un, sep="\n") 169 un = paste(un, sep="\n")
381 scale_fill_manual(values=sample.colors) + 382 scale_fill_manual(values=sample.colors) +
382 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) 383 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
383 png("CDR3LengthPlot.png",width = 1280, height = 720) 384 png("CDR3LengthPlot.png",width = 1280, height = 720)
384 CDR3LengthPlot 385 CDR3LengthPlot
385 dev.off() 386 dev.off()
386 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) 387 write.table(x=CDR3Length, file="CDR3LengthPlot.txt", sep="\t",quote=F,row.names=F,col.names=T)
387 388
388 # ---------------------- Plot the heatmaps ---------------------- 389 # ---------------------- Plot the heatmaps ----------------------
389 390
390 #get the reverse order for the V and D genes 391 #get the reverse order for the V and D genes
391 revVchain = Vchain 392 revVchain = Vchain
410 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) 411 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro"))
411 412
412 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) 413 png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))
413 print(img) 414 print(img)
414 dev.off() 415 dev.off()
415 write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 416 write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA)
416 } 417 }
417 418
418 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) 419 VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])
419 420
420 VandDCount$l = log(VandDCount$Length) 421 VandDCount$l = log(VandDCount$Length)
460 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) 461 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro"))
461 462
462 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) 463 png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))
463 print(img) 464 print(img)
464 dev.off() 465 dev.off()
465 write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 466 write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA)
466 } 467 }
467 468
468 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) 469 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])
469 470
470 VandJCount$l = log(VandJCount$Length) 471 VandJCount$l = log(VandJCount$Length)
509 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) 510 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro"))
510 511
511 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) 512 png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))
512 print(img) 513 print(img)
513 dev.off() 514 dev.off()
514 write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 515 write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA)
515 } 516 }
516 517
517 518
518 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) 519 DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")])
519 520
699 700
700 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") 701 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb")
701 if(all(imgtcolumns %in% colnames(inputdata))) 702 if(all(imgtcolumns %in% colnames(inputdata)))
702 { 703 {
703 print("found IMGT columns, running junction analysis") 704 print("found IMGT columns, running junction analysis")
704 705
705 if(locus %in% c("IGK","IGL", "TRA", "TRG")){
706 print("VJ recombination, no filtering on absent D")
707 } else {
708 print("VDJ recombination, using N column for junction analysis")
709 fltr = nchar(PRODF$Top.D.Gene) < 4
710 print(paste("Removing", sum(fltr), "sequences without a identified D"))
711 PRODF = PRODF[!fltr,]
712 }
713
714
715 #ensure certain columns are in the data (files generated with older versions of IMGT Loader) 706 #ensure certain columns are in the data (files generated with older versions of IMGT Loader)
716 col.checks = c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb") 707 col.checks = c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb")
717 for(col.check in col.checks){ 708 for(col.check in col.checks){
718 if(!(col.check %in% names(PRODF))){ 709 if(!(col.check %in% names(PRODF))){
719 print(paste(col.check, "not found adding new column")) 710 print(paste(col.check, "not found adding new column"))
728 UNPROD = cbind(UNPROD, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0))) 719 UNPROD = cbind(UNPROD, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0)))
729 } 720 }
730 } 721 }
731 } 722 }
732 723
724 PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,]
725 PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,]
726
733 num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } 727 num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) }
734 728
735 newData = data.frame(data.table(PRODF)[,list(unique=.N, 729 newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N,
736 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 730 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
737 P1=mean(.SD$P3V.nt.nb, na.rm=T), 731 P1=mean(.SD$P3V.nt.nb, na.rm=T),
738 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), 732 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
739 P2=mean(.SD$P5D.nt.nb, na.rm=T), 733 P2=mean(.SD$P5D.nt.nb, na.rm=T),
740 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 734 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
747 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 741 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
748 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 742 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
749 Median.CDR3.l=as.double(median(.SD$CDR3.Length))), 743 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
750 by=c("Sample")]) 744 by=c("Sample")])
751 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 745 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
752 write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 746 write.table(newData, "junctionAnalysisProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
753 747
754 newData = data.frame(data.table(PRODF)[,list(unique=.N, 748 newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N,
755 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 749 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
756 P1=num_median(.SD$P3V.nt.nb, na.rm=T), 750 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
757 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), 751 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
758 P2=num_median(.SD$P5D.nt.nb, na.rm=T), 752 P2=num_median(.SD$P5D.nt.nb, na.rm=T),
759 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 753 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
766 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 760 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
767 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 761 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
768 Median.CDR3.l=as.double(median(.SD$CDR3.Length))), 762 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
769 by=c("Sample")]) 763 by=c("Sample")])
770 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 764 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
771 write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 765 write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
772 766
773 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 767 newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N,
774 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 768 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
775 P1=mean(.SD$P3V.nt.nb, na.rm=T), 769 P1=mean(.SD$P3V.nt.nb, na.rm=T),
776 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), 770 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
777 P2=mean(.SD$P5D.nt.nb, na.rm=T), 771 P2=mean(.SD$P5D.nt.nb, na.rm=T),
778 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 772 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
785 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 779 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
786 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 780 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
787 Median.CDR3.l=as.double(median(.SD$CDR3.Length))), 781 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
788 by=c("Sample")]) 782 by=c("Sample")])
789 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 783 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
790 write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 784 write.table(newData, "junctionAnalysisUnProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
791 785
792 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 786 newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N,
793 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 787 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
794 P1=num_median(.SD$P3V.nt.nb, na.rm=T), 788 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
795 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), 789 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
796 P2=num_median(.SD$P5D.nt.nb, na.rm=T), 790 P2=num_median(.SD$P5D.nt.nb, na.rm=T),
797 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 791 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
803 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), 797 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
804 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), 798 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
805 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), 799 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
806 Median.CDR3.l=as.double(median(.SD$CDR3.Length))), 800 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
807 by=c("Sample")]) 801 by=c("Sample")])
808
809 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 802 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
810 write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 803 write.table(newData, "junctionAnalysisUnProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
804
805 #---------------- again for no-D
806
807 newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N,
808 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
809 P1=mean(.SD$P3V.nt.nb, na.rm=T),
810 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
811 P2=mean(.SD$P5J.nt.nb, na.rm=T),
812 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
813 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
814 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
815 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
816 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
817 by=c("Sample")])
818 print(newData)
819 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
820 write.table(newData, "junctionAnalysisProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
821
822 newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N,
823 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
824 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
825 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
826 P2=num_median(.SD$P5J.nt.nb, na.rm=T),
827 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
828 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
829 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
830 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
831 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
832 by=c("Sample")])
833 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
834 write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
835
836 newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N,
837 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
838 P1=mean(.SD$P3V.nt.nb, na.rm=T),
839 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
840 P2=mean(.SD$P5J.nt.nb, na.rm=T),
841 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
842 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
843 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
844 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
845 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
846 by=c("Sample")])
847 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
848 write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
849
850 newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N,
851 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
852 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
853 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
854 P2=num_median(.SD$P5J.nt.nb, na.rm=T),
855 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
856 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
857 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
858 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
859 Median.CDR3.l=as.double(median(.SD$CDR3.Length))),
860 by=c("Sample")])
861 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
862 write.table(newData, "junctionAnalysisUnProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
811 } 863 }
812 864
813 PRODF = bak 865 PRODF = bak
814 866
815 867
820 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame) 872 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame)
821 if(any(chck)){ 873 if(any(chck)){
822 D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D" 874 D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D"
823 } 875 }
824 876
825 D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) 877 D.REGION.reading.frame.1 = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")])
826 878
827 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) 879 D.REGION.reading.frame.2 = data.frame(data.table(D.REGION.reading.frame)[, list(sample.sum=sum(as.numeric(.SD$D.REGION.reading.frame), na.rm=T)), by=c("Sample")])
880
881 D.REGION.reading.frame = merge(D.REGION.reading.frame.1, D.REGION.reading.frame.2, by="Sample")
882
883 D.REGION.reading.frame$percentage = round(D.REGION.reading.frame$Freq / D.REGION.reading.frame$sample.sum * 100, 1)
884
885 write.table(D.REGION.reading.frame, "DReadingFrame.txt" , sep="\t",quote=F,row.names=F,col.names=T)
828 886
829 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) 887 D.REGION.reading.frame = ggplot(D.REGION.reading.frame)
830 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") 888 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = percentage, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")
831 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors) 889 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors)
832 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) 890 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
833 891
834 png("DReadingFrame.png") 892 png("DReadingFrame.png")
835 D.REGION.reading.frame 893 D.REGION.reading.frame
876 dev.off() 934 dev.off()
877 write.table(AAfreq, "AAComposition.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) 935 write.table(AAfreq, "AAComposition.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T)
878 936
879 # ---------------------- AA median CDR3 length ---------------------- 937 # ---------------------- AA median CDR3 length ----------------------
880 938
881 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) 939 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T), na.rm=T))), by=c("Sample")])
882 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 940 write.table(median.aa.l, "AAMedianBySample.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
883 941