comparison test-data/output_countsummary.Rnw @ 2:81bbbddcf285 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 49e456dda49db1f52fc876f406a10273a408b1a2
author iuc
date Wed, 04 Apr 2018 11:03:59 -0400
parents
children
comparison
equal deleted inserted replaced
1:5e2a28bee02d 2:81bbbddcf285
1 % This is a template file for Sweave used in MAGeCK
2 % Author: Wei Li, Shirley Liu lab
3 % Do not modify lines beginning with "#__".
4 \documentclass{article}
5
6 \usepackage{amsmath}
7 \usepackage{amscd}
8 \usepackage[tableposition=top]{caption}
9 \usepackage{ifthen}
10 \usepackage{fullpage}
11 \usepackage[utf8]{inputenc}
12 % \usepackage{longtable}
13
14 \begin{document}
15 \setkeys{Gin}{width=0.9\textwidth}
16
17 \title{MAGeCK Count Report}
18 \author{Wei Li}
19
20 \maketitle
21
22
23 \tableofcontents
24
25 \section{Summary}
26
27 %Function definition
28 <<label=funcdef,include=FALSE,echo=FALSE>>=
# Build the read-mapping summary table (printed as the first report table).
#   filelist    - fastq file names; accepted for a uniform signature, unused
#   labellist   - sample labels
#   reads       - total read count per sample
#   mappedreads - mapped read count per sample
# Returns a data.frame with columns Label, Reads, Mapped and Percentage,
# where Percentage holds the ratio mappedreads / reads.
genreporttable <- function(filelist, labellist, reads, mappedreads) {
  mapped_ratio <- mappedreads / reads
  summary_tab <- data.frame(
    Label = labellist,
    Reads = reads,
    Mapped = mappedreads,
    Percentage = mapped_ratio
  )
  summary_tab
}
# Build the sgRNA-level summary table (printed as the second report table).
#   filelist   - fastq file names; accepted for a uniform signature, unused
#   labellist  - sample labels
#   sgrnas     - number of sgRNAs per sample
#   zerocounts - number of sgRNAs with zero counts per sample
#   gini       - Gini index per sample
# Returns a data.frame with columns Label, TotalsgRNA, ZeroCounts, GiniIndex.
genreporttable2 <- function(filelist, labellist, sgrnas, zerocounts, gini) {
  sgrna_tab <- data.frame(
    Label = labellist,
    TotalsgRNA = sgrnas,
    ZeroCounts = zerocounts,
    GiniIndex = gini
  )
  sgrna_tab
}
# Build the file-to-label lookup table (printed as the third report table).
#   filelist  - fastq file names
#   labellist - sample labels, parallel to filelist
# Returns a data.frame with columns File and Label.
genreporttable3 <- function(filelist, labellist) {
  sample_tab <- data.frame(File = filelist, Label = labellist)
  sample_tab
}
44
45
# Fixed palette of 28 hex colours; the plotting helpers index into it to
# pick one colour per sample.
colors <- c(
  "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3",
  "#FF7F00", "#A65628", "#F781BF", "#999999",
  "#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3",
  "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3",
  "#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072",
  "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5",
  "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F"
)
50
51
52
# Draw a box plot of log2(count + 1) for every sample in a count table.
#   filename - path to a table with a header row, read via read.table();
#              the first two columns are skipped and the remaining ones
#              are used as per-sample counts
#   ...      - extra arguments forwarded to boxplot()
# Returns (invisibly) the value of boxplot().
genboxplot <- function(filename, ...) {
  count_tab <- read.table(filename, header = TRUE)
  log_counts <- log2(as.matrix(count_tab[, -(1:2)]) + 1)

  boxplot(
    log_counts,
    pch = ".",
    las = 2,
    ylab = "log2(read counts)",
    cex.axis = 0.8,
    ...
  )
}
61
62
# Plot, for every sample, the frequency curve of log2(count + 1).
#   filename - when isfile is TRUE, path to a table with a header row that
#              is read via read.table(); otherwise an already loaded table.
#              The first two columns are skipped, the rest are the samples.
#   isfile   - whether `filename` is a path (TRUE) or the table itself
#   ...      - extra arguments forwarded to the initial plot() call
# Each sample is binned with hist(breaks = 40); the first sample opens the
# plot and the remaining ones are added with lines(). Returns (invisibly)
# the value of legend().
genhistplot <- function(filename, isfile = TRUE, ...) {
  if (isfile) {
    slmed <- read.table(filename, header = TRUE)
  } else {
    slmed <- filename
  }
  tabsmat <- as.matrix(log2(slmed[, c(-1, -2)] + 1))
  colnames(tabsmat) <- colnames(slmed)[c(-1, -2)]
  nsample <- ncol(tabsmat)

  # Cycle through the palette with 1-based wrap-around. A plain
  # `index %% length(colors)` is 0 at every multiple of the palette size,
  # and R silently drops a 0 subscript: the colour vector ends up too short
  # and every later sample is coloured (or left undrawn) incorrectly.
  samplecol <- colors[((seq_len(nsample) - 1) %% length(colors)) + 1]

  if (nsample >= 1) {
    histlist <- lapply(seq_len(nsample), function(X) {
      hist(tabsmat[, X], plot = FALSE, breaks = 40)
    })
    # Axis limits must cover the curves of all samples, not just the first.
    xrange <- range(unlist(lapply(histlist, function(X) X$mids)))
    yrange <- range(unlist(lapply(histlist, function(X) X$counts)))
    hst1 <- histlist[[1]]
    plot(hst1$mids, hst1$counts,
      type = "b", pch = 20,
      xlim = c(0, xrange[2] * 1.2), ylim = c(0, yrange[2] * 1.2),
      xlab = "log2(counts)", ylab = "Frequency",
      main = "Distribution of read counts", col = samplecol[1], ...
    )
  }
  if (nsample >= 2) {
    for (i in 2:nsample) {
      hstn <- histlist[[i]]
      lines(hstn$mids, hstn$counts, type = "b", pch = 20, col = samplecol[i])
    }
  }
  legend("topright", colnames(tabsmat), pch = 20, lwd = 1, col = samplecol)
}
87
88
89
# Draw a heat map of the sample-by-sample correlation of log2(count + 1).
#   filename - path to a table with a header row, read via read.table();
#              the first two columns are skipped, the rest are the samples
#   ...      - extra arguments, forwarded only to the fallback heatmap()
# gplots::heatmap.2() is attempted first; if loading gplots or drawing with
# it raises an error, the plot is drawn with heatmap() instead.
# Returns (invisibly) the value of whichever heat map call succeeded.
genclustering <- function(filename, ...) {
  count_tab <- read.table(filename, header = TRUE)
  slmat_log <- log2(as.matrix(count_tab[, -(1:2)]) + 1)

  draw_fallback <- function(e) {
    heatmap(
      cor(slmat_log),
      scale = "none", cexRow = 0.8, cexCol = 0.8, cex.axis = 0.8,
      ...
    )
  }

  drawn <- tryCatch(
    {
      library(gplots)
      heatmap.2(
        cor(slmat_log),
        trace = "none", density.info = "none",
        cexRow = 0.8, cexCol = 0.8,
        offsetRow = -0.2, offsetCol = -0.2
      )
    },
    error = draw_fallback
  )
  invisible(drawn)
}
103
# Placeholder for the PCA fit: genpcaplot() overwrites it via `<<-` and
# genpcavar() reads it afterwards.
ctfit_tx <- 0
105
106
# Panel function handed to pairs(): draws the points of one panel and
# labels each of them.
#   x, y      - coordinates of the points in this panel
#   textnames - labels drawn at the points (defaults to names(x))
#   ...       - graphical parameters forwarded to both plot() and text()
# Axis limits are the data range scaled by 1.1.
panel.plot <- function(x, y, textnames = names(x), ...) {
  # Draw on top of the panel that is already set up instead of opening a
  # new page.
  par(new = TRUE)
  plot(
    cbind(x, y),
    pch = 20,
    xlim = range(x) * 1.1,
    ylim = range(y) * 1.1,
    ...
  )
  text(x, y, textnames, ...)
}
113
114
# Run a PCA over the samples of a count table and draw a pairs plot of the
# leading principal components.
#   filename - path to a table with a header row, read via read.table();
#              the first two columns are skipped, the rest are the samples
#   ...      - accepted for signature compatibility; not forwarded
# The prcomp() fit of the transposed log2(count + 1) matrix is stored in the
# global `ctfit_tx` (via `<<-`) because genpcavar() reads it afterwards.
# With more than two samples the first three PCs are plotted, with exactly
# two samples the first two, and with fewer nothing is drawn.
genpcaplot <- function(filename, ...) {
  slmed <- read.table(filename, header = TRUE)
  slmat <- as.matrix(slmed[, c(-1, -2)])
  slmat_log <- log2(slmat + 1)
  ctfit_tx <<- prcomp(t(slmat_log), center = TRUE)

  nsample <- ncol(slmat)
  # Cycle through the palette with 1-based wrap-around. A plain
  # `index %% length(colors)` is 0 at every multiple of the palette size,
  # and R silently drops a 0 subscript, leaving the colour vector one short
  # and out of step with the samples.
  samplecol <- colors[((seq_len(nsample) - 1) %% length(colors)) + 1]

  if (nsample > 2) {
    pairs(ctfit_tx$x[, 1:3],
      panel = panel.plot, textnames = rownames(ctfit_tx$x),
      main = "First 3 principle components", col = samplecol
    )
  } else if (nsample > 1) {
    pairs(ctfit_tx$x[, 1:2],
      panel = panel.plot, textnames = rownames(ctfit_tx$x),
      main = "First 2 principle components", col = samplecol
    )
  }
}
138
# Plot the percentage of variance explained by the principal components
# stored in the global `ctfit_tx` (at most the first ten are shown).
# Takes no arguments; reads `ctfit_tx$sdev`.
genpcavar <- function() {
  component_var <- ctfit_tx$sdev^2
  pct_explained <- component_var / sum(component_var) * 100
  pct_explained <- head(pct_explained, 10)
  plot(
    pct_explained,
    type = "b", lwd = 2, pch = 20,
    xlab = "PCs", ylab = "% Variance explained"
  )
}
148
149 @
150
151 %__FILE_SUMMARY__
152
153 The statistics of comparisons are listed in Table 1 and Table 2.
154 The corresponding fastq files in each row are listed in Table 3.
155
156 <<label=tab1,echo=FALSE,results=tex>>=
library(xtable)
# Per-sample inputs for the summary tables; later chunks reuse them.
filelist <- c("input_0.gz")
labellist <- c("test1_fastq_gz")
reads <- c(2500)
mappedreads <- c(1453)
totalsgrnas <- c(2550)
zerocounts <- c(1276)
giniindex <- c(0.5266899931488773)

cptable <- genreporttable(filelist, labellist, reads, mappedreads)
# table.placement and caption.placement are arguments of print.xtable(),
# not of xtable(); handed to xtable() they are swallowed by `...` and have
# no effect, so they are passed to print() here.
print(
  xtable(cptable,
    caption = "Summary of comparisons", label = "tab:one",
    digits = c(0, 0, 0, 0, 2),
    align = c("c", "c", "c", "c", "c")
  ),
  table.placement = "tbp",
  caption.placement = "top"
)
172 @
173
174 <<label=tab2,echo=FALSE,results=tex>>=
library(xtable)
cptable <- genreporttable2(filelist, labellist, totalsgrnas, zerocounts, giniindex)
# table.placement and caption.placement are arguments of print.xtable(),
# not of xtable(); handed to xtable() they are swallowed by `...` and have
# no effect, so they are passed to print() here.
print(
  xtable(cptable,
    caption = "Summary of comparisons", label = "tab:two",
    digits = c(0, 0, 0, 0, 2),
    align = c("c", "c", "c", "c", "c")
  ),
  table.placement = "tbp",
  caption.placement = "top"
)
182 @
183
184
185
186
187
188 <<label=tab3,echo=FALSE,results=tex>>=
library(xtable)
cptable <- genreporttable3(filelist, labellist)
# table.placement and caption.placement are arguments of print.xtable(),
# not of xtable(); handed to xtable() they are swallowed by `...` and have
# no effect, so they are passed to print() here.
print(
  xtable(cptable,
    caption = "Summary of samples", label = "tab:three",
    digits = c(0, 0, 0),
    align = c("c", "p{9cm}", "c")
  ),
  table.placement = "tbp",
  caption.placement = "top"
)
196 @
197
198
199
200
201 The meanings of the columns are as follows.
202
203 \begin{itemize}
204 \item \textbf{Row}: The row number in the table;
205 \item \textbf{File}: The filename of the fastq file;
206 \item \textbf{Label}: Assigned label;
207 \item \textbf{Reads}: The total read count in the fastq file;
208 \item \textbf{Mapped}: Reads that can be mapped to the gRNA library;
209 \item \textbf{Percentage}: The fraction of reads that are mapped (Mapped divided by Reads);
210 \item \textbf{TotalsgRNA}: The number of sgRNAs in the library;
211 \item \textbf{ZeroCounts}: The number of sgRNAs with zero read counts;
212 \item \textbf{GiniIndex}: The Gini index of the read count distribution. The Gini index measures the evenness of the read counts; a smaller value means a more even distribution of the read counts.
213 \end{itemize}
214
215
216
217 \newpage\section{Normalized read count distribution of all samples}
218 The following figure shows the distribution of median-normalized read counts in all samples.
219
220
221 <<fig=TRUE,echo=FALSE,width=4.5,height=4.5>>=
# Box plot of the normalized count table, one box per sample.
genboxplot("output.count_normalized.txt")
223 @
224
225 The following figure shows the histogram of median-normalized read counts in all samples.
226
227
228 <<fig=TRUE,echo=FALSE,width=4.5,height=4.5>>=
# Per-sample frequency curves of the normalized count table.
genhistplot("output.count_normalized.txt")
230 @
231
232 %__INDIVIDUAL_PAGE__
233
234
235
236 \end{document}
237