view test-data/output_countsummary.Rnw @ 2:81bbbddcf285 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 49e456dda49db1f52fc876f406a10273a408b1a2
author iuc
date Wed, 04 Apr 2018 11:03:59 -0400
parents
children
line wrap: on
line source

% This is a template file for Sweave used in MAGeCK
% Author: Wei Li, Shirley Liu lab
% Do not modify lines beginning with "#__".
\documentclass{article}

\usepackage{amsmath}
\usepackage{amscd}
\usepackage[tableposition=top]{caption}
\usepackage{ifthen}
\usepackage{fullpage}
\usepackage[utf8]{inputenc}
% \usepackage{longtable}

\begin{document}
\setkeys{Gin}{width=0.9\textwidth}

\title{MAGeCK Count Report}
\author{Wei Li}

\maketitle


\tableofcontents

\section{Summary}

%Function definition
<<label=funcdef,include=FALSE,echo=FALSE>>=
genreporttable<-function(filelist,labellist,reads,mappedreads){
  xtb=data.frame(Label=labellist,Reads=reads,MappedReads=mappedreads,MappedPercentage=mappedreads/reads);
  colnames(xtb)=c("Label","Reads","Mapped","Percentage");
  return (xtb);
}
genreporttable2<-function(filelist,labellist,sgrnas,zerocounts,gini){
  xtb=data.frame(Label=labellist,TotalsgRNAs=sgrnas,ZeroCounts=zerocounts,GiniIndex=gini);
  colnames(xtb)=c("Label","TotalsgRNA","ZeroCounts","GiniIndex");
  return (xtb);
}
genreporttable3<-function(filelist,labellist){
  xtb=data.frame(File=filelist,Label=labellist);
  colnames(xtb)=c("File","Label");
  return (xtb);
}


colors=c( "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",  "#A65628", "#F781BF",
          "#999999", "#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3", 
          "#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5",
          "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F");



genboxplot<-function(filename,...){
  #slmed=read.table(filename,header=T)
  slmed=read.table(filename,header=T)
  slmat=as.matrix(slmed[,c(-1,-2)])
  slmat_log=log2(slmat+1)

  boxplot(slmat_log,pch='.',las=2,ylab='log2(read counts)',cex.axis=0.8,...)
}


genhistplot<-function(filename,isfile=T,...){
  if(isfile){
    slmed=read.table(filename,header=T)
  }else{
    slmed=filename;
  }
  tabsmat=as.matrix(log2(slmed[,c(-1,-2)]+1))
  colnames(tabsmat)=colnames(slmed)[c(-1,-2)]
  samplecol=colors[((1:ncol(tabsmat)) %% length(colors)) ]
  if(ncol(tabsmat)>=1){
    histlist=lapply(1:ncol(tabsmat),function(X){ return (hist(tabsmat[,X],plot=F,breaks=40)) })
    xrange=range(unlist(lapply(histlist,function(X){X$mids})))
    yrange=range(unlist(lapply(histlist,function(X){X$counts})))
    hst1=histlist[[1]]
    plot(hst1$mids,hst1$counts,type='b',pch=20,xlim=c(0,xrange[2]*1.2),ylim=c(0,yrange[2]*1.2),xlab='log2(counts)',ylab='Frequency',main='Distribution of read counts',col = samplecol[1], ... )
  }
  if(ncol(tabsmat)>=2){ 
    for(i in 2:ncol(tabsmat)){
      hstn=histlist[[i]]
      lines(hstn$mids,hstn$counts,type='b',pch=20,col=samplecol[i])
    }
  }
  legend('topright',colnames(tabsmat),pch=20,lwd=1,col=samplecol)
}



genclustering<-function(filename,...){
  #slmed=read.table(filename,header=T)
  slmed=read.table(filename,header=T)
  slmat=as.matrix(slmed[,c(-1,-2)])
  slmat_log=log2(slmat+1)

  result=tryCatch({
    library(gplots);
    heatmap.2(cor(slmat_log),trace = 'none',density.info = 'none',cexRow = 0.8,cexCol = 0.8,offsetRow = -0.2,offsetCol = -0.2)
  }, error=function(e){
    heatmap(cor(slmat_log),scale='none',cexRow = 0.8,cexCol = 0.8,cex.axis=0.8,...)
  });
}

ctfit_tx=0;


panel.plot<-function(x,y,textnames=names(x),...){
  par(new=TRUE)
  m<-cbind(x,y)
  plot(m,pch=20,xlim = range(x)*1.1,ylim=range(y)*1.1,...)
  text(x,y,textnames,...)
}


genpcaplot<-function(filename,...){
  #slmed=read.table(filename,header=T)
  slmed=read.table(filename,header=T)
  slmat=as.matrix(slmed[,c(-1,-2)])
  slmat_log=log2(slmat+1)
  ctfit_tx<<-prcomp(t(slmat_log),center=TRUE)
  
  # par(mfrow=c(2,1));
  samplecol=colors[((1:ncol(slmat)) %% length(colors)) ]
  # first 2 PCA
  #plot(ctfit_tx$x[,1],ctfit_tx$x[,2],xlab='PC1',ylab='PC2',main='First 2 PCs',col=samplecol,xlim=1.1*range(ctfit_tx$x[,1]),ylim=1.1*range(ctfit_tx$x[,2]));
  #text(ctfit_tx$x[,1],ctfit_tx$x[,2],rownames(ctfit_tx$x),col=samplecol);
  # par(mfrow=c(1,1));
  if(length(samplecol)>2){
    pairs(ctfit_tx$x[,1:3],panel=panel.plot,textnames=rownames(ctfit_tx$x),main='First 3 principle components',col=samplecol)
  }else{
    if(length(samplecol)>1){
      pairs(ctfit_tx$x[,1:2],panel=panel.plot,textnames=rownames(ctfit_tx$x),main='First 2 principle components',col=samplecol)
   }
  }


}

genpcavar<-function(){
  # % variance 
  varpca=ctfit_tx$sdev^2
  varpca=varpca/sum(varpca)*100;
  if(length(varpca)>10){
    varpca=varpca[1:10];
  }
  plot(varpca,type='b',lwd=2,pch=20,xlab='PCs',ylab='% Variance explained');
}

@

%__FILE_SUMMARY__

The statistics of comparisons are listed in Table 1 and Table 2.
The corresponding fastq files in each row are listed in Table 3.

<<label=tab1,echo=FALSE,results=tex>>=
library(xtable)
filelist=c("input_0.gz");
labellist=c("test1_fastq_gz");
reads=c(2500);
mappedreads=c(1453);
totalsgrnas=c(2550);
zerocounts=c(1276);
giniindex=c(0.5266899931488773);

cptable=genreporttable(filelist,labellist,reads,mappedreads);
print(xtable(cptable, caption = "Summary of comparisons", label = "tab:one",
    digits = c(0, 0, 0, 0,2),
    align=c('c',  'c','c',  'c', 'c'),
    table.placement = "tbp",
    caption.placement = "top"))
@

<<label=tab2,echo=FALSE,results=tex>>=
library(xtable)
cptable=genreporttable2(filelist,labellist,totalsgrnas,zerocounts,giniindex);
print(xtable(cptable, caption = "Summary of comparisons", label = "tab:two",
    digits = c(0, 0,0, 0,2),
    align=c('c',  'c','c',  'c', 'c'),
    table.placement = "tbp",
    caption.placement = "top"))
@





<<label=tab3,echo=FALSE,results=tex>>=
library(xtable)
cptable=genreporttable3(filelist,labellist);
print(xtable(cptable, caption = "Summary of samples", label = "tab:three",
    digits = c(0,0, 0),
    align=c('c', 'p{9cm}', 'c'),
    table.placement = "tbp",
    caption.placement = "top"))
@




The meanings of the columns are as follows.

\begin{itemize}
\item \textbf{Row}: The row number in the table;
\item \textbf{File}: The filename of fastq file;
\item \textbf{Label}: Assigned label;
\item \textbf{Reads}: The total read count in the fastq file;
\item \textbf{Mapped}: Reads that can be mapped to gRNA library;
\item \textbf{Percentage}: The percentage of mapped reads;
\item \textbf{TotalsgRNAs}: The number of sgRNAs in the library; 
\item \textbf{ZeroCounts}: The number of sgRNA with 0 read counts;
\item \textbf{GiniIndex}: The Gini Index of the read count distribution. Gini index can be used to measure the evenness of the read counts, and a smaller value means a more even distribution of the read counts.
\end{itemize}



\newpage\section{Normalized read count distribution of all samples}
The following figure shows the distribution of median-normalized read counts in all samples.


<<fig=TRUE,echo=FALSE,width=4.5,height=4.5>>=
genboxplot("output.count_normalized.txt");
@

The following figure shows the histogram of median-normalized read counts in all samples.


<<fig=TRUE,echo=FALSE,width=4.5,height=4.5>>=
genhistplot("output.count_normalized.txt");
@

%__INDIVIDUAL_PAGE__



\end{document}