diff report_clonality/RScript.r @ 8:8cbc1a8d27ae draft

Uploaded
author davidvanzessen
date Mon, 19 Dec 2016 08:50:31 -0500
parents d001d0c05dbe
children efa1f5a17b6e
line wrap: on
line diff
--- a/report_clonality/RScript.r	Fri Dec 16 09:01:40 2016 -0500
+++ b/report_clonality/RScript.r	Mon Dec 19 08:50:31 2016 -0500
@@ -135,6 +135,20 @@
   }
 }
 
+#make a names list with sample -> color
+naive.colors = c('blue4', 'darkred', 'olivedrab3', 'red', 'gray74', 'darkviolet', 'lightblue1', 'gold', 'chartreuse2', 'pink', 'Paleturquoise3', 'Chocolate1', 'Yellow', 'Deeppink3', 'Mediumorchid1', 'Darkgreen', 'Blue', 'Gray36', 'Hotpink', 'Yellow4')
+unique.samples = unique(PRODF$Sample)
+
+if(length(unique.samples) <= length(naive.colors)){
+	sample.colors = naive.colors[1:length(unique.samples)]
+} else {
+	sample.colors = rainbow(length(unique.samples))
+}
+
+names(sample.colors) = unique.samples
+
+print("Sample.colors")
+print(sample.colors)
 
 
 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive
@@ -275,7 +289,8 @@
 
 pV = ggplot(PRODFV)
 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage")
+pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + scale_fill_manual(values=sample.colors)
+pV = pV + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
 
 png("VPlot.png",width = 1280, height = 720)
@@ -285,7 +300,8 @@
 if(useD){
   pD = ggplot(PRODFD)
   pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-  pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage")
+  pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + scale_fill_manual(values=sample.colors)
+  pD = pD + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
   write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
   
   png("DPlot.png",width = 800, height = 600)
@@ -295,16 +311,8 @@
 
 pJ = ggplot(PRODFJ)
 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage")
-write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
-
-png("JPlot.png",width = 800, height = 600)
-pJ
-dev.off();
-
-pJ = ggplot(PRODFJ)
-pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage")
+pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + scale_fill_manual(values=sample.colors)
+pJ = pJ + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T)
 
 png("JPlot.png",width = 800, height = 600)
@@ -324,7 +332,9 @@
 VPlot = ggplot(VGenes)
 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
   ggtitle("Distribution of V gene families") + 
-  ylab("Percentage of sequences")
+  ylab("Percentage of sequences") +
+  scale_fill_manual(values=sample.colors) +
+  theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 png("VFPlot.png")
 VPlot
 dev.off();
@@ -340,7 +350,9 @@
   DPlot = ggplot(DGenes)
   DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
     ggtitle("Distribution of D gene families") + 
-    ylab("Percentage of sequences")
+    ylab("Percentage of sequences") + 
+    scale_fill_manual(values=sample.colors) +
+    theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
   png("DFPlot.png")
   print(DPlot)
   dev.off();
@@ -359,7 +371,9 @@
 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
   ggtitle("Length distribution of CDR3") + 
   xlab("CDR3 Length") + 
-  ylab("Percentage of sequences")
+  ylab("Percentage of sequences") +
+  scale_fill_manual(values=sample.colors) +
+  theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 png("CDR3LengthPlot.png",width = 1280, height = 720)
 CDR3LengthPlot
 dev.off()
@@ -786,16 +800,21 @@
 
 # ---------------------- D reading frame ----------------------
 
-D.REGION.reading.frame = PRODF$D.REGION.reading.frame
+D.REGION.reading.frame = PRODF[,c("Sample", "D.REGION.reading.frame")]
 
-D.REGION.reading.frame[is.na(D.REGION.reading.frame)] = "No D"
+chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame)
+if(any(chck)){
+	D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D"
+}
 
-D.REGION.reading.frame = data.frame(table(D.REGION.reading.frame))
+D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")])
 
 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T)
 
 D.REGION.reading.frame = ggplot(D.REGION.reading.frame)
-D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")
+D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")
+D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors)
+D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 
 png("DReadingFrame.png")
 D.REGION.reading.frame
@@ -834,11 +853,16 @@
 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2)
 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2)
 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2)
-AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage")
+AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + scale_fill_manual(values=sample.colors)
+AAfreqplot = AAfreqplot + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
 
 png("AAComposition.png",width = 1280, height = 720)
 AAfreqplot
 dev.off()
 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T)
 
+# ---------------------- AA median CDR3 length ----------------------
 
+median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length.DNA))), by=c("Sample")])
+write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
+