0
|
1 # ----------------------------------------------------------------------#
|
|
2 # Copyright (c) 2011, Richard Lupat & Jason Li.
|
|
3 #
|
|
4 # > Source License <
|
|
5 # This file is part of CONTRA.
|
|
6 #
|
|
7 # CONTRA is free software: you can redistribute it and/or modify
|
|
8 # it under the terms of the GNU General Public License as published by
|
|
9 # the Free Software Foundation, either version 3 of the License, or
|
|
10 # (at your option) any later version.
|
|
11 #
|
|
12 # CONTRA is distributed in the hope that it will be useful,
|
|
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 # GNU General Public License for more details.
|
|
16 #
|
|
17 # You should have received a copy of the GNU General Public License
|
|
18 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
|
|
19 #
|
|
20 #
|
|
21 #-----------------------------------------------------------------------#
|
|
22 # Last Updated : 30 Sept 2011 17:00PM
|
|
23
|
|
24
|
|
25 # Parameters Parsing (from Command Line)
|
|
26 options <- commandArgs(trailingOnly = T)
|
|
27 bins = as.numeric(options[1])
|
|
28 rd.cutoff = as.numeric(options[2])
|
|
29 min.bases = as.numeric(options[3])
|
|
30 outf = options[4]
|
|
31 sample.name = options[5]
|
|
32 plotoption = options[6]
|
|
33 actual.bin = as.numeric(options[7])
|
|
34 min_normal_rd_for_call = as.numeric(options[8])
|
|
35 min_tumour_rd_for_call = as.numeric(options[9])
|
|
36 min_avg_cov_for_call = as.numeric(options[10])
|
|
37
|
|
38 if (sample.name == "No-SampleName")
|
|
39 sample.name = ""
|
|
40
|
|
41 if (sample.name != "")
|
|
42 sample.name = paste(sample.name, ".", sep="")
|
|
43
|
|
44 # Setup output name
|
|
45 out.f = paste(outf, "/table/", sample.name, "CNATable.", rd.cutoff,"rd.", min.bases,"bases.", bins,"bins.txt", sep="")
|
|
46 pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", sep="")
|
|
47
|
|
48 # Open and read input files
|
|
49 # cnAverageFile = paste("bin", bins, ".txt", sep="")
|
|
50 cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="")
|
|
51 boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="")
|
|
52 print (cnAverageFile)
|
|
53 cn.average = read.delim(cnAverageFile, as.is=F, header=F)
|
|
54 cn.boundary= read.delim(boundariesFile,as.is=F, header=F)
|
|
55
|
|
56 # Apply thresholds and data grouping
|
|
57 cn.average.aboveTs = cn.average[cn.average$V3>min.bases,]
|
|
58 cn.average.list = as.matrix(cn.average.aboveTs$V4)
|
|
59
|
|
60 # Get the mean and sd for each bins
|
|
61 cn.average.mean = c()
|
|
62 cn.average.sd = c()
|
|
63 cn.average.log= c()
|
|
64
|
|
65 # Density Plots for each bins
|
|
66 if (plotoption == "True"){
|
|
67 pdf(pdf.out.f)
|
|
68 }
|
|
69 for (j in 1:actual.bin){
|
|
70 cn.average.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V4)
|
|
71 cn.coverage.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V11)
|
|
72 boundary.end = cn.boundary[cn.boundary$V1==j,]$V2
|
|
73 boundary.start = cn.boundary[cn.boundary$V1==(j-1),]$V2
|
|
74 boundary.mid = (boundary.end+boundary.start)/2
|
|
75 if (plotoption == "True") {
|
|
76 plot_title = paste("density: bin", bins, sep="")
|
|
77 #plot(density(cn.average.nth),xlim=c(-5,5), title=plot_title)
|
|
78 plot(density(cn.average.nth),xlim=c(-5,5))
|
|
79 }
|
|
80 cn.average.mean = c(cn.average.mean, mean(cn.average.nth))
|
|
81 # cn.average.sd = c(cn.average.sd, sd(cn.average.nth))
|
|
82 cn.average.sd = c(cn.average.sd, apply(cn.average.nth,2,sd))
|
|
83 #cn.average.log = c(cn.average.log, boundary.mid)
|
|
84 cn.average.log = c(cn.average.log, log(mean(cn.coverage.nth),2))
|
|
85 }
|
|
86 if (plotoption == "True"){
|
|
87 dev.off()
|
|
88 }
|
|
89
|
|
90 # Put the data's details into matrices
|
|
91 ids = as.matrix(cn.average.aboveTs$V1)
|
|
92 exons = as.matrix(cn.average.aboveTs$V6)
|
|
93 exons.pos = as.matrix(cn.average.aboveTs$V5)
|
|
94 gs = as.matrix(cn.average.aboveTs$V2)
|
|
95 number.bases = as.matrix(cn.average.aboveTs$V3)
|
|
96 mean = as.matrix(cn.average.aboveTs$V4)
|
|
97 sd = as.matrix(cn.average.aboveTs$V7)
|
|
98 tumour.rd = as.matrix(cn.average.aboveTs$V8)
|
|
99 tumour.rd.ori = as.matrix(cn.average.aboveTs$V10)
|
|
100 normal.rd = as.matrix(cn.average.aboveTs$V9)
|
|
101 normal.rd.ori = as.matrix(cn.average.aboveTs$V11)
|
|
102 median = as.matrix(cn.average.aboveTs$V12)
|
|
103 MinLogRatio = as.matrix(cn.average.aboveTs$V13)
|
|
104 MaxLogRatio = as.matrix(cn.average.aboveTs$V14)
|
|
105 Bin = as.matrix(cn.average.aboveTs$V15)
|
|
106 Chr = as.matrix(cn.average.aboveTs$V16)
|
|
107 OriStCoordinate = as.matrix(cn.average.aboveTs$V17)
|
|
108 OriEndCoordinate= as.matrix(cn.average.aboveTs$V18)
|
|
109
|
|
110 # Linear Fit
|
|
111 logratios.mean = mean
|
|
112 logcov.mean = log2((normal.rd + tumour.rd)/2)
|
|
113 fit.mean = lm(logratios.mean ~ logcov.mean)
|
|
114 fit.x = fit.mean$coefficient[1]
|
|
115 fit.y = fit.mean$coefficient[2]
|
|
116
|
|
117 adjusted.lr = rep(NA, length(logratios.mean))
|
|
118 for (j in 1:length(logratios.mean)){
|
|
119 fitted.mean = fit.x + fit.y * logcov.mean[j]
|
|
120 adjusted.lr[j] = logratios.mean[j] - fitted.mean
|
|
121 }
|
|
122
|
|
123 fit.mean2 = lm(adjusted.lr ~ logcov.mean)
|
|
124 fit.mean.a = fit.mean2$coefficient[1]
|
|
125 fit.mean.b = fit.mean2$coefficient[2]
|
|
126
|
|
127 fit.mean.fn <- function(x, fit.a, fit.b){
|
|
128 result = fit.a + fit.b * x
|
|
129 return (result)
|
|
130 }
|
|
131
|
|
132 # Adjust SD based on the new adjusted log ratios
|
|
133 logratios.sd = c()
|
|
134 logcov.bins.mean= c()
|
|
135 for (j in 1:actual.bin){
|
|
136 lr.bins.mean = as.matrix(adjusted.lr[cn.average.aboveTs$V15==j])
|
|
137 # logratios.sd = c(logratios.sd, sd(lr.bins.mean))
|
|
138 logratios.sd = c(logratios.sd, apply(lr.bins.mean,2,sd))
|
|
139
|
|
140 cn.coverage.tumour.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V8)
|
|
141 cn.coverage.normal.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V9)
|
|
142 cn.coverage.nth = (cn.coverage.tumour.nth + cn.coverage.normal.nth) /2
|
|
143 logcov.bins.mean= c(logcov.bins.mean, log2(mean(cn.coverage.nth)))
|
|
144
|
|
145 }
|
|
146
|
|
147 logratios.sd.ori = logratios.sd
|
|
148 if (length(logratios.sd) > 2) {
|
|
149 logratios.sd = logratios.sd[-length(logratios.sd)]
|
|
150 }
|
|
151
|
|
152 logcov.bins.mean.ori = logcov.bins.mean
|
|
153 if (length(logcov.bins.mean) > 2){
|
|
154 logcov.bins.mean= logcov.bins.mean[-length(logcov.bins.mean)]
|
|
155 }
|
|
156
|
|
157 fit.sd = lm(log2(logratios.sd) ~ logcov.bins.mean)
|
|
158 fit.sd.a = fit.sd$coefficient[1]
|
|
159 fit.sd.b = fit.sd$coefficient[2]
|
|
160
|
|
161 fit.sd.fn <- function(x, fit.a, fit.b){
|
|
162 result = 2 ^ (fit.mean.fn(x, fit.a, fit.b))
|
|
163 return (result)
|
|
164 }
|
|
165
|
|
166 # Get the P Values, called the gain/loss
|
|
167 # with average and sd from each bins
|
|
168 pVal.list = c()
|
|
169 gain.loss = c()
|
|
170
|
|
171 for (i in 1:nrow(cn.average.list)){
|
|
172 #print (i)
|
|
173 #logratio = cn.average.list[i]
|
|
174 #logcov = log(normal.rd.ori[i],2)
|
|
175 logratio = adjusted.lr[i]
|
|
176 logcov = logcov.mean[i]
|
|
177 exon.bin = Bin[i]
|
|
178
|
|
179 if (length(logratios.sd) > 1){
|
|
180 pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b))
|
|
181 } else {
|
|
182 pVal <- pnorm(logratio, 0, logratios.sd[exon.bin])
|
|
183 }
|
|
184
|
|
185 if (pVal > 0.5){
|
|
186 pVal = 1-pVal
|
|
187 gain.loss = c(gain.loss, "gain")
|
|
188 } else {
|
|
189 gain.loss = c(gain.loss, "loss")
|
|
190 }
|
|
191 pVal.list = c(pVal.list, pVal*2)
|
|
192 }
|
|
193
|
|
194 # Get the adjusted P Values
|
|
195 adjusted.pVal.list = p.adjust(pVal.list, method="BH")
|
|
196
|
|
197 # Write the output into a tab-delimited text files
|
|
198 outdf=data.frame(Targeted.Region.ID=ids,Exon.Number=exons,Gene.Sym=gs,Chr, OriStCoordinate, OriEndCoordinate, Mean.of.LogRatio=cn.average.list, Adjusted.Mean.of.LogRatio=adjusted.lr, SD.of.LogRatio=sd, Median.of.LogRatio=median, number.bases, P.Value=pVal.list ,Adjusted.P.Value=adjusted.pVal.list , gain.loss, tumour.rd, normal.rd, tumour.rd.ori, normal.rd.ori, MinLogRatio, MaxLogRatio, BinNumber = Bin)
|
|
199
|
|
200 #min_normal_rd_for_call=5
|
|
201 #min_tumour_rd_for_call=0
|
|
202 #min_avg_cov_for_call=20
|
|
203 outdf$tumour.rd.ori = outdf$tumour.rd.ori-0.5
|
|
204 outdf$normal.rd.ori = outdf$normal.rd.ori-0.5
|
|
205 wh.to.excl = outdf$normal.rd.ori < min_normal_rd_for_call
|
|
206 wh.to.excl = wh.to.excl | outdf$tumour.rd.ori < min_tumour_rd_for_call
|
|
207 wh.to.excl = wh.to.excl | (outdf$tumour.rd.ori+outdf$normal.rd.ori)/2 < min_avg_cov_for_call
|
|
208 outdf$P.Value[wh.to.excl]=NA
|
|
209 outdf$Adjusted.P.Value[wh.to.excl]=NA
|
|
210
|
|
211
|
|
212 write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T)
|
|
213
|
|
214 #Plotting SD
|
|
215 #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori))
|
|
216 #b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori))
|
|
217 #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b)
|
|
218 #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="")
|
|
219 #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn)
|
|
220 #write.table(sd.outdf, sd.out.f,sep="\t", quote=F, row.names=F, col.names=T)
|
|
221
|
|
222
|
|
223 #End of the script
|
|
224 print ("End of cn_analysis.R")
|
|
225 print (i)
|
|
226
|
|
227
|
|
228
|