annotate region_motif_compare.r @ 0:5c044273554d draft

initial commit
author jeremyjliu
date Tue, 05 Aug 2014 13:56:22 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
1 # Name: region_motif_compare.r
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
2 # Description: Reads in two count files and determines enriched and depleted
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
3 # motifs (or any location based feature) based on poisson tests and gc
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
4 # corrections. All enrichment ratios relative to overall count / gc ratios.
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
5 # Author: Jeremy liu
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
6 # Email: jeremy.liu@yale.edu
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
7 # Date: 14/07/03
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
8 # Note: This script is meant to be invoked with the following command
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
9 # R --slave --vanilla -f ./region_motif_compare.r --args <workingdir> <db> <intab1> <intab2>
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
10 # <enriched_tab> <depleted_tab> <plots_png>
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
11 # <workingdir> is working directory of galaxy installation
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
12 # <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse, "c" combined
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
13 # Dependencies: plotting.r
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
14
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
15 # Auxiliary function to concatenate multiple strings
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
16 concat <- function(...) {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
17 input_list <- list(...)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
18 return(paste(input_list, sep="", collapse=""))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
19 }
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
20
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
21 # Supress all warning messages to prevent Galaxy treating warnings as errors
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
22 options(warn=-1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
23
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
24 # Set common and data directories
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
25 args <- commandArgs()
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
26 workingDir = args[7]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
27 commonDir = concat(workingDir, "/tools/my_tools")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
28 dbCode = args[8]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
29 # dbCode "c" implemented when pwmFile is loaded
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
30 if (dbCode == "t" | dbCode == "p") {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
31 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
32 } else if (dbCode == "j") {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
33 pwmFile = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
34 } else if (dbCode == "m") {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
35 pwmFile = concat(commonDir, "/region_motif_db/pwms/mm9.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
36 } else if (dbCode == "c") { # rest of dbCode "c" implemeted when pwmFile loaded
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
37 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
38 pwmFile2 = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
39 } else {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
40 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
41 }
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
42
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
43 # Set input and reference files
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
44 inTab1 = args[9]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
45 inTab2 = args[10]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
46 enrichTab = args[11]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
47 depleteTab = args[12]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
48 plotsPng = args[13]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
49
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
50 # Load dependencies
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
51 source(concat(commonDir, "/region_motif_lib/plotting.r"))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
52
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
53 # Auxiliary function to read in tab file and prepare the data
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
54 read_tsv <- function(file) {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
55 data = read.table(file, sep="\t", stringsAsFactors=FALSE)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
56 names(data)[names(data) == "V1"] = "motif"
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
57 names(data)[names(data) == "V2"] = "counts"
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
58 return(data)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
59 }
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
60
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
61 startTime = Sys.time()
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
62 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
63
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
64 # Loading motif position weight matrix (pwm) file and input tab file
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
65 #cat("Loading and reading input region motif count files...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
66 load(pwmFile) # pwms data structure
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
67 if (dbCode == "c") { # Remaining implementation of dbCode "c" combined
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
68 temp = pwms
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
69 load(pwmFile2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
70 pwms = append(temp, pwms)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
71 }
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
72 region1DF = read_tsv(inTab1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
73 region2DF = read_tsv(inTab2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
74 region1Counts = region1DF$counts
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
75 region2Counts = region2DF$counts
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
76 names(region1Counts) = region1DF$motif
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
77 names(region2Counts) = region2DF$motif
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
78
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
79 # Processing count vectors to account for missing 0 count motifs, then sorting
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
80 #cat("Performing 0 count correction and sorting...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
81 allNames = union(names(region1Counts), names(region2Counts))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
82 region1Diff = setdiff(allNames, names(region1Counts))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
83 region2Diff = setdiff(allNames, names(region2Counts))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
84 addCounts1 = rep(0, length(region1Diff))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
85 addCounts2 = rep(0, length(region2Diff))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
86 names(addCounts1) = region1Diff
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
87 names(addCounts2) = region2Diff
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
88 newCounts1 = append(region1Counts, addCounts1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
89 newCounts2 = append(region2Counts, addCounts2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
90 region1Counts = newCounts1[sort.int(names(newCounts1), index.return=TRUE)$ix]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
91 region2Counts = newCounts2[sort.int(names(newCounts2), index.return=TRUE)$ix]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
92
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
93 # Generate gc content matrix
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
94 gc = sapply(pwms, function(i) mean(i[2:3,3:18]))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
95
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
96 # Apply poisson test, calculate p and q values, and filter significant results
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
97 #cat("Applying poisson test...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
98 rValue = sum(region2Counts) / sum(region1Counts)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
99 pValue = sapply(seq(along=region1Counts), function(i) {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
100 poisson.test(c(region1Counts[i], region2Counts[i]), r=1/rValue)$p.value
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
101 })
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
102 qValue = p.adjust(pValue, "fdr")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
103 indices = which(qValue<0.1 & abs(log2(region1Counts/region2Counts/rValue))>log2(1.5))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
104
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
105 # Setting up output diagnostic plots, 4 in 1 png image
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
106 png(plotsPng, width=800, height=800)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
107 xlab = "region1_count"
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
108 ylab = "region2_count"
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
109 lim = c(0.5, 5000)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
110 layout(matrix(1:4, ncol=2))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
111 par(mar=c(5, 5, 5, 1))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
112
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
113 # Plot all motif counts along the linear correlation coefficient
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
114 plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
115 cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
116 abline(0, rValue, untf=T)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
117 abline(0, rValue*2, untf=T, lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
118 abline(0, rValue/2, untf=T, lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
119
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
120 # Plot enriched and depleted motifs in red, housed in second plot
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
121 plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
122 cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
123 points(region1Counts[indices]+0.5, region2Counts[indices]+0.5, col="red")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
124 abline(0, rValue, untf=T)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
125 abline(0, rValue*2, untf=T, lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
126 abline(0, rValue/2, untf=T, lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
127
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
128 # Apply and plot gc correction and loess curve
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
129 #cat("Applying gc correction, rerunning poisson test...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
130 ind = which(region1Counts>5)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
131 gc = gc[names(region2Counts)] # Reorder the indices of pwms to match input data
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
132 lo = plot.scatter(gc,log2(region2Counts/region1Counts),draw.loess=T,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
133 xlab="gc content of motif",ylab=paste("log2(",ylab,"/",xlab,")"),
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
134 cex.lab=2.2,cex.axis=1.8,ind=ind) # This function is in plotting.r
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
135 gcCorrection = 2^approx(lo$loess,xout=gc,rule=2)$y
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
136 save(gc, file="gc.RData")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
137
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
138 # Recalculate p and q values, and filter for significant entries
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
139 pValueGC = sapply(seq(along=region1Counts),function(i) {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
140 poisson.test(c(region1Counts[i],region2Counts[i]),r=1/gcCorrection[i])$p.value
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
141 })
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
142 qValueGC=p.adjust(pValueGC,"fdr")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
143 indicesGC = which(qValueGC<0.1 & abs(log2(region1Counts/region2Counts*gcCorrection))>log2(1.5))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
144
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
145 # Plot gc corrected motif counts
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
146 plot.scatter(region1Counts+0.5, (region2Counts+0.5)/gcCorrection, log="xy",
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
147 xlab=xlab, ylab=paste(ylab,"(normalized)"), cex.lab=2.2, cex.axis=1.8,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
148 xlim=lim, ylim=lim)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
149 points(region1Counts[indicesGC]+0.5,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
150 (region2Counts[indicesGC]+0.5)/gcCorrection[indicesGC], col="red")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
151 abline(0,1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
152 abline(0,1*2,untf=T,lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
153 abline(0,1/2,untf=T,lty=2)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
154
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
155 # Trim results, compile statistics and output to file
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
156 # Only does so if significant results are computed
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
157 if(length(indicesGC) > 0) {
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
158 # Calculate expected counts and enrichment ratios
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
159 #cat("Calculating statistics...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
160 nullExpect = region1Counts * gcCorrection
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
161 enrichment = region2Counts / nullExpect
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
162
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
163 # Reorder selected indices in ascending pvalue
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
164 #cat("Reordering by ascending pvalue...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
165 indicesReorder = indicesGC[order(pValueGC[indicesGC])]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
166
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
167 # Combine data into one data frame and output to two files
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
168 #cat("Splitting and outputting data...\n")
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
169 outDF = data.frame(motif=names(pValueGC), p=as.numeric(pValueGC), q=qValueGC,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
170 stringsAsFactors=F, region_1_count=region1Counts,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
171 null_expectation=round(nullExpect,2), region_2_count=region2Counts,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
172 enrichment=enrichment)[indicesReorder,]
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
173 names(outDF)[which(names(outDF)=="region_1_count")]=xlab
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
174 names(outDF)[which(names(outDF)=="region_2_count")]=ylab
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
175 indicesEnrich = which(outDF$enrichment>1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
176 indicesDeplete = which(outDF$enrichment<1)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
177 outDF$enrichment = ifelse(outDF$enrichment>1,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
178 round(outDF$enrichment,3),
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
179 paste("1/",round(1/outDF$enrichment,3)))
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
180 write.table(outDF[indicesEnrich,], file=enrichTab, quote=FALSE,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
181 sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
182 write.table(outDF[indicesDeplete,], file=depleteTab, quote=FALSE,
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
183 sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE)
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
184 }
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
185
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
186 # Catch display messages and output timing information
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
187 catchMessage = dev.off()
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
188 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."),
5c044273554d initial commit
jeremyjliu
parents:
diff changeset
189 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")