Mercurial > repos > yufei-luo > s_mart

# Analyse differentielle de donnees d expression par gene
# avec DESeq
# 2 conditions

args <- commandArgs()
#print(args[1])
#print(args[2])
#print(args[3])
#print(args[4])
#print(args[5])
#print(args[6])
#output file names
#print(args[7]) # HTML file name
#print(args[8]) # HTML file all images directory
#print(args[9]) # complete xls file name
#print(args[10])# UP xls file name
#print(args[11]) #Down xls file name
#print(args[12]) #the executable scipt (for getting the path)

library(R2HTML)
library(R.utils)

#run example:
projectName <- "DESeqAnalysis"
analysisVersion <- "V1"    # fitType=local, sharingMode=fit-only, method=blind
rawDir <- "raw"
targetFile <- args[4]
header <- as.integer(args[5]) #si on a header ou pas, si on a, header=1, sinon header=0
withOutReplicates <- as.integer(args[6])

#get the directory to write the results
tab <- splitByPattern(args[7], pattern="/")
res_dir <- ""
for (e in tab[1:length(tab)-1]) { res_dir <- paste(res_dir, e, sep="")}
#get the html output file name
OUT_HTMLname <- args[7]
#get the images directory to write to
OUT_imgDir <- args[8]
#if the directory dosen't existe, we should create it first

alpha <- 0.05
adjMethod <- "BH"
outfile <- T
runningScriptTab <-  splitByPattern(args[12], pattern="/")
RfuncDir <- ""
for (r in runningScriptTab[1:length(runningScriptTab)-1]) { RfuncDir <- paste(RfuncDir, r, sep="")} #find the path of executable script
RfuncDir <- paste(RfuncDir, "DESeqTools/", sep="") #define the function files path
# Dossier contenant les fonctions
print(RfuncDir)
source( paste(RfuncDir, "RNAseqFunctions.R", sep="/") )

# Chargement des packages et des fonctions
library(DESeq)
RNAseqFunctions(RfuncDir)
# Chargement du target file
target <- loadTargetFile( targetFile, header )
# Chargement des donnees, construction d'une table de comptages par gene
#have changed
rawCounts <- loadCountData( target, header )
conds <- unique(target$group)
cond1 <- as.character(conds[1])
cond2 <- as.character(conds[!conds == conds[1]])
rawCounts <- HTseqClean( rawCounts )

# Transformation en matrice de comptages
counts <- raw2counts( rawCounts )[[1]]

# Nombre de reads par echantillon
OUT_barplotTCName <- paste(OUT_imgDir, "barplotTC.png", sep="/")
barplotTC( counts, target$group, OUT_barplotTCName, out=outfile )

# Proportion comptages nuls
OUT_barplotNulName <- paste(OUT_imgDir, "barplotNul.png", sep="/")
barplotNul( counts, target$group, OUT_barplotNulName, out=outfile )

# Suppression comptages nuls
counts <- removeNul( counts )[[1]]

# Density plot
OUT_densityPlotName <- paste(OUT_imgDir, "densityPlot.png", sep="/")
densityPlot( counts, target$group, OUT_densityPlotName, out=outfile )

# Boxplot
OUT_boxplotCountsName <- paste(OUT_imgDir, "boxplotCounts.png", sep="/")
boxplotCounts( counts, target$group, type = c("raw", "norm"), OUT_boxplotCountsName, out=outfile )
# Sequence majoritaire
OUT_majSequenceName <- paste(OUT_imgDir, "majSequence.png", sep="/")
majSequence( counts, target$group, OUT_majSequenceName, out=outfile )

# ScatterPlot between two samples
OUT_scatterPlot <- paste(OUT_imgDir, "scatterPlot.png", sep="/")
pairwiseScatterPlots(counts, target, OUT_scatterPlot, out=outfile, pdffile=FALSE)

# SERE coefficient calculation (Poisson hypothesis for replicates techiques), to know if the variability between the réplicates or the conditons is hight or not.
coef <- pairwiseSERE(counts)
print(coef)
coef
# Creation structure de donnees cds, !! we use newCountDataset because that we have first column not numeric, and DESeq dosen't take non numeric values.
cds <- newCountDataSet( counts, target$group )

# Diagnostic for clustering of non-normalized samples
OUT_clusterPlot_before <- paste(OUT_imgDir, "clusteringOfSamplesBefore.png", sep="/")
clusterPlot(cds, OUT_clusterPlot_before, out=outfile)


# Normalisation (calcul des lib size factors )
cds <- estimateSizeFactors( cds )

# Estimation de la dispersion
# parametres:
	# method: how samples are pooled to estimate dispersion. If no replicates use "blind"
	# sharingMode: how variance estimate is computed with respect to the fitted line.
	# 				"Maximum" is the most conservative (max between fit and estimation), "fit-only" keeps the estimated value
	# fitType: refers to the model. "Local" is the published model, "parametric" is glm-based (may not converge), now we use "parametric" as default value.
#in this case, without replicates
if(withOutReplicates!=0){
	cds <- estimateDispersions( cds, sharingMode="fit-only", method="blind")
} else if(withOutReplicates==0){
	#cds <- estimateDispersions( cds, sharingMode="fit-only", fitType="local")}
	cds <- estimateDispersions( cds)}
# Analyse differentielle, ajustement BH par defaut
res <- nbinomTest( cds, cond1, cond2)

# Diagnostic for clustering of normalized samples
OUT_clusterPlot <- paste(OUT_imgDir, "clusteringOfSamples.png", sep="/")
clusterPlot(cds, OUT_clusterPlot, out=outfile)

# Control plot of dispersion estimates
OUT_plotDispEstimatesName <- paste(OUT_imgDir, "disperssionEstimates.png", sep="/")
plotDispEstimates( cds, OUT_plotDispEstimatesName, out=outfile )

# Distribution of raw p-values
OUT_histoRawpName <- paste(OUT_imgDir, "histoRawPvalue.png", sep="/")
histoRawp( res, OUT_histoRawpName, out=outfile )

# MAplot showing DE genes
OUT_MAplotDEName <- paste(OUT_imgDir, "MAplotDE.png", sep="/")
MAplotDE( res, alpha, OUT_MAplotDEName, out=outfile )

# export complete data
OUT_completeName <- args[9]
complete <- exportComplete( counts, res, target, adjMethod, cond1, cond2, OUT_completeName, out=outfile )

# export significant genes
OUT_upName <- args[10]
OUT_downName <- args[11]
diff <- exportDiff( complete, alpha, adjMethod, OUT_upName, OUT_downName, out=outfile )

# write all images results into an HTML file
prefixHTMLname <- tab[length(tab)]
#HTMLCSS(file.path(res_dir), filename=prefixHTMLname, CSSfile="R2HTML")
HTMLInitFile(file.path(res_dir), filename=prefixHTMLname, BackGroundColor="white")
HTML.title("<center>Differential Expression DESeq analysis.", HR=1)
HTML.title("<center>BarplotTC: number of RNA-seq reads per sample.", HR=2)
    	HTMLInsertGraph("barplotTC.png")

HTML.title("<center>BarplotNul: number of RNA-seq reads that the count is 0 (nul).", HR=2)
	HTMLInsertGraph("barplotNul.png")

HTML.title("<center>DensityPlot: density of each sample.", HR=2)
	HTMLInsertGraph("densityPlot.png")

HTML.title("<center>Boxplot: number of RNA-seq reads distribution per sample.", HR=2)
	HTMLInsertGraph("boxplotCounts.png")

HTML.title("<center>MajorSequence: the proportion of reads associated with the most expressed sequence.", HR=2)
	HTMLInsertGraph("majSequence.png")

HTML.title("<center>ScatterPlot: Scatter plot of samples.", HR=2)
	HTMLInsertGraph("scatterPlot.png")

HTML.title("<center>Clustering Of No-Normalized Samples: Representing the no-normalized samples in Diagnostic.", HR=2)
	HTMLInsertGraph("clusteringOfSamplesBefore.png")

HTML.title("<center>Clustering Of Normalized Samples: Representing the normalized samples in Diagnostic.", HR=2)
	HTMLInsertGraph("clusteringOfSamples.png")

HTML.title("<center>DispersionEstimates: representing dispersion estimates vs mean expression.", HR=2)
	HTMLInsertGraph("disperssionEstimates.png")

HTML.title("<center>HistoRawPValue: histogram of raw p-value.", HR=2)
	HTMLInsertGraph("histoRawPvalue.png")

HTML.title("<center>MAplotDE: the differentially expressed genes (red point).", HR=2)
	HTMLInsertGraph("MAplotDE.png")
HTMLEndFile()
absoluPrefixHTMLname <- paste(res_dir, prefixHTMLname, sep="")
outName <- paste(absoluPrefixHTMLname, ".html", sep="")
# change name is to be adapted into Galaxy
file.rename(outName, OUT_HTMLname)
author	m-zytnicki
date	Tue, 30 Apr 2013 14:33:21 -0400
parents	94ab73e8a190
children