diff deseq/differential_expression_analysis_pipeline_for_rnaseq_data-a03838a6eb54/DiffExpAnal/DESeqTools/anadiffGenes2conds.R @ 10:6e573fd3c41b draft

Uploaded
author yufei-luo
date Mon, 13 May 2013 10:06:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deseq/differential_expression_analysis_pipeline_for_rnaseq_data-a03838a6eb54/DiffExpAnal/DESeqTools/anadiffGenes2conds.R	Mon May 13 10:06:30 2013 -0400
@@ -0,0 +1,196 @@
+# Marie-Anges Dillies, Yufei Luo
+# Analyse differentielle de donnees d expression par gene
+# avec DESeq
+# 2 conditions
+
+args <- commandArgs()
+#print(args[1])
+#print(args[2])
+#print(args[3])
+#print(args[4])
+#print(args[5])
+#print(args[6])
+#output file names
+#print(args[7]) # HTML file name
+#print(args[8]) # HTML file all images directory 
+#print(args[9]) # complete xls file name
+#print(args[10])# UP xls file name
+#print(args[11]) #Down xls file name
+#print(args[12]) #the executable scipt (for getting the path)
+
+library(R2HTML)
+library(R.utils)
+
+#run example: 
+projectName <- "DESeqAnalysis"
+analysisVersion <- "V1"    # fitType=local, sharingMode=fit-only, method=blind 
+rawDir <- "raw"
+targetFile <- args[4]
+header <- as.integer(args[5]) #si on a header ou pas, si on a, header=1, sinon header=0
+withOutReplicates <- as.integer(args[6])
+
+#get the directory to write the results
+tab <- splitByPattern(args[7], pattern="/")
+res_dir <- ""
+for (e in tab[1:length(tab)-1]) { res_dir <- paste(res_dir, e, sep="")}
+#get the html output file name
+OUT_HTMLname <- args[7]
+#get the images directory to write to
+OUT_imgDir <- args[8]
+#if the directory dosen't existe, we should create it first
+
+alpha <- 0.05
+adjMethod <- "BH"
+outfile <- T
+runningScriptTab <-  splitByPattern(args[12], pattern="/")
+RfuncDir <- ""
+for (r in runningScriptTab[1:length(runningScriptTab)-1]) { RfuncDir <- paste(RfuncDir, r, sep="")} #find the path of executable script  
+RfuncDir <- paste(RfuncDir, "DESeqTools/", sep="") #define the function files path
+# Dossier contenant les fonctions
+print(RfuncDir)
+source( paste(RfuncDir, "RNAseqFunctions.R", sep="/") )
+
+# Chargement des packages et des fonctions
+library(DESeq)
+RNAseqFunctions(RfuncDir)
+# Chargement du target file
+target <- loadTargetFile( targetFile, header )
+# Chargement des donnees, construction d'une table de comptages par gene
+#have changed
+rawCounts <- loadCountData( target, header )
+conds <- unique(target$group)
+cond1 <- as.character(conds[1])
+cond2 <- as.character(conds[!conds == conds[1]])
+rawCounts <- HTseqClean( rawCounts )
+
+# Transformation en matrice de comptages
+counts <- raw2counts( rawCounts )[[1]]
+
+# Nombre de reads par echantillon
+OUT_barplotTCName <- paste(OUT_imgDir, "barplotTC.png", sep="/")
+barplotTC( counts, target$group, OUT_barplotTCName, out=outfile )
+
+# Proportion comptages nuls
+OUT_barplotNulName <- paste(OUT_imgDir, "barplotNul.png", sep="/")
+barplotNul( counts, target$group, OUT_barplotNulName, out=outfile )
+
+# Suppression comptages nuls
+counts <- removeNul( counts )[[1]]
+
+# Density plot
+OUT_densityPlotName <- paste(OUT_imgDir, "densityPlot.png", sep="/")
+densityPlot( counts, target$group, OUT_densityPlotName, out=outfile )
+
+# Boxplot
+OUT_boxplotCountsName <- paste(OUT_imgDir, "boxplotCounts.png", sep="/")
+boxplotCounts( counts, target$group, type = c("raw", "norm"), OUT_boxplotCountsName, out=outfile )
+# Sequence majoritaire
+OUT_majSequenceName <- paste(OUT_imgDir, "majSequence.png", sep="/")
+majSequence( counts, target$group, OUT_majSequenceName, out=outfile )
+
+# ScatterPlot between two samples
+OUT_scatterPlot <- paste(OUT_imgDir, "scatterPlot.png", sep="/")
+pairwiseScatterPlots(counts, target, OUT_scatterPlot, out=outfile, pdffile=FALSE)
+
+# SERE coefficient calculation (Poisson hypothesis for replicates techiques), to know if the variability between the réplicates or the conditons is hight or not. 
+coef <- pairwiseSERE(counts)
+print(coef)
+coef
+# Creation structure de donnees cds, !! we use newCountDataset because that we have first column not numeric, and DESeq dosen't take non numeric values.
+cds <- newCountDataSet( counts, target$group )
+
+# Diagnostic for clustering of non-normalized samples
+OUT_clusterPlot_before <- paste(OUT_imgDir, "clusteringOfSamplesBefore.png", sep="/")
+clusterPlot(cds, OUT_clusterPlot_before, out=outfile)
+
+
+# Normalisation (calcul des lib size factors )
+cds <- estimateSizeFactors( cds )
+
+# Estimation de la dispersion
+# parametres: 
+	# method: how samples are pooled to estimate dispersion (we use "pooled"
+        # as default parameter. If no replicates, we use "blind".
+	# sharingMode: how variance estimate is computed with respect to the fitted line. 
+	#"Maximum" is the most conservative (max between fit andestimation),
+        #"fit-only" keeps the estimated value, we use "Maximum" as default parameter.
+	# fitType: refers to the model. "Local" is the published model, 
+        #"parametric" is glm-based (may not converge), now we use "parametric" as default value.
+
+if(withOutReplicates!=0){
+	cds <- estimateDispersions( cds, method="blind")#in this case, without replicates
+
+} else if(withOutReplicates==0){
+	cds <- estimateDispersions( cds, method="pooled", sharingMode="maximum", fitType="parametric") }
+
+# Analyse differentielle, ajustement BH par defaut
+res <- nbinomTest( cds, cond1, cond2)  
+
+# Diagnostic for clustering of normalized samples
+OUT_clusterPlot <- paste(OUT_imgDir, "clusteringOfSamples.png", sep="/")
+clusterPlot(cds, OUT_clusterPlot, out=outfile)
+
+# Control plot of dispersion estimates
+OUT_plotDispEstimatesName <- paste(OUT_imgDir, "disperssionEstimates.png", sep="/")
+plotDispEstimates( cds, OUT_plotDispEstimatesName, out=outfile )
+
+# Distribution of raw p-values
+OUT_histoRawpName <- paste(OUT_imgDir, "histoRawPvalue.png", sep="/")
+histoRawp( res, OUT_histoRawpName, out=outfile )
+
+# MAplot showing DE genes
+OUT_MAplotDEName <- paste(OUT_imgDir, "MAplotDE.png", sep="/")
+MAplotDE( res, alpha, OUT_MAplotDEName, out=outfile )
+
+# export complete data
+OUT_completeName <- args[9]
+complete <- exportComplete( counts, res, target, adjMethod, cond1, cond2, OUT_completeName, out=outfile )
+
+# export significant genes
+OUT_upName <- args[10]
+OUT_downName <- args[11]
+diff <- exportDiff( complete, alpha, adjMethod, OUT_upName, OUT_downName, out=outfile )
+
+# write all images results into an HTML file
+prefixHTMLname <- tab[length(tab)]
+#HTMLCSS(file.path(res_dir), filename=prefixHTMLname, CSSfile="R2HTML")
+HTMLInitFile(file.path(res_dir), filename=prefixHTMLname, BackGroundColor="white")
+HTML.title("<center>Differential Expression DESeq analysis.", HR=1) 
+HTML.title("<center>BarplotTC: number of RNA-seq reads per sample.", HR=2)
+    	HTMLInsertGraph("barplotTC.png")
+
+HTML.title("<center>BarplotNul: number of RNA-seq reads that the count is 0 (nul).", HR=2)
+	HTMLInsertGraph("barplotNul.png")
+
+HTML.title("<center>DensityPlot: density of each sample.", HR=2)
+	HTMLInsertGraph("densityPlot.png")
+
+HTML.title("<center>Boxplot: number of RNA-seq reads distribution per sample.", HR=2)
+	HTMLInsertGraph("boxplotCounts.png")
+
+HTML.title("<center>MajorSequence: the proportion of reads associated with the most expressed sequence.", HR=2)
+	HTMLInsertGraph("majSequence.png")
+
+HTML.title("<center>ScatterPlot: Scatter plot of samples.", HR=2)
+	HTMLInsertGraph("scatterPlot.png")
+
+HTML.title("<center>Clustering Of No-Normalized Samples: Representing the no-normalized samples in Diagnostic.", HR=2)	
+	HTMLInsertGraph("clusteringOfSamplesBefore.png")
+
+HTML.title("<center>Clustering Of Normalized Samples: Representing the normalized samples in Diagnostic.", HR=2)	
+	HTMLInsertGraph("clusteringOfSamples.png")
+
+HTML.title("<center>DispersionEstimates: representing dispersion estimates vs mean expression.", HR=2)
+	HTMLInsertGraph("disperssionEstimates.png")
+
+HTML.title("<center>HistoRawPValue: histogram of raw p-value.", HR=2)
+	HTMLInsertGraph("histoRawPvalue.png")
+
+HTML.title("<center>MAplotDE: the differentially expressed genes (red point).", HR=2)
+	HTMLInsertGraph("MAplotDE.png")
+HTMLEndFile()
+absoluPrefixHTMLname <- paste(res_dir, prefixHTMLname, sep="")
+outName <- paste(absoluPrefixHTMLname, ".html", sep="")
+# change name is to be adapted into Galaxy
+file.rename(outName, OUT_HTMLname)
+