annotate methylation_analysis_bismark/methylation_analysis/differential_methylation.R @ 9:5b208d4d89e5 draft

Uploaded
author fcaramia
date Tue, 04 Dec 2012 20:15:26 -0500
parents d15b4a2e3bdc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
1 #!/usr/bin/Rscript --vanilla
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
2 library(getopt)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
3 spec <- matrix(c("help", "h", 0, "logical", "view this help", "segfile1", "s", 1, "character", "seg file 1", "segfile2", "t", 1, "character", "seg file 2", "output", "o", 1, "character", "output file", "fdr", "f", 2, "character", paste("fdr method [", paste(p.adjust.methods, collapse="|"), "]", sep=""),
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
4 "reference", "r", 2, "character", "reference to use [b37|hg19|GRCh37|mm9|NCBIM37|mm10|GRCm38|dm3|BDGP5]", "annot", "a", 2, "character", "annotation to add [both|gene|cpg]", "processes", "p", 2, "integer", "number of cluster instances to open [1]"), ncol=5, byrow=T)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
5 opt <- getopt(spec)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
6
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
7 # set default options
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
8 if(is.null(opt$fdr)) opt$fdr <- "BH"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
9 if(is.null(opt$reference)) opt$reference <- "hg19"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
10 if(is.null(opt$annot)) opt$annot <- "both"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
11 if(is.null(opt$processes)) opt$processes <- 1
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
12
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
13 # check if any invalid options
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
14 if(! opt$annot %in% c("both", "gene", "cpg") || !opt$fdr %in% p.adjust.methods || !opt$reference %in% c("b37", "hg19", "GRCh37", "mm9", "NCBIM37", "mm10", "GRCm38", "dm3", "BDGP5")) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
15 opt$help <- 1
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
16 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
17
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
18 # print help file if any incorrect
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
19 if(!is.null(opt$help) || is.null(opt$segfile1) || is.null(opt$output) || is.null(opt$output) || is.na(opt$processes)) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
20 self = commandArgs()[1];
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
21 cat(getopt(spec, usage=T, command="differential_methylation.R"))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
22 q(status=1)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
23 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
24
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
25 library(snow)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
26 cl <- makeCluster(opt$p, type = "MPI")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
27
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
28 segfile1 <- read.table(opt$segfile1, sep="\t", as.is=T, head=T, quote = "")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
29 segfile2 <- read.table(opt$segfile2, sep="\t", as.is=T, head=T, quote = "")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
30
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
31 rownames(segfile1) <- paste(segfile1[,2], ":", segfile1[,3], "-", segfile1[,4], sep="")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
32 rownames(segfile2) <- paste(segfile2[,2], ":", segfile2[,3], "-", segfile2[,4], sep="")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
33
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
34 common_reg <- intersect(rownames(segfile1), rownames(segfile2))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
35 prop_test <- function(x, n, fdr) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
36 library(abind)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
37 ESTIMATE <- x/n
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
38 DELTA <- ESTIMATE[,1L] - ESTIMATE[,2L]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
39 YATES <- pmin(abs(DELTA)/rowSums(1/n), 0.5)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
40 p <- rowSums(x)/rowSums(n)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
41 df <- 1
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
42 x <- abind(x, n - x, along=3)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
43 E <- abind(n*p, n*(1-p), along=3)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
44 STATISTIC <- rowSums((abs(x - E) - YATES)^2/E)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
45 STATISTIC[is.na(STATISTIC)] <- 0
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
46 PVAL <- pchisq(STATISTIC, df, lower.tail = FALSE)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
47 return(data.frame(X_Squared = round(STATISTIC, 3), P.value= round(PVAL, 6), P.adjusted = round(p.adjust(PVAL, method=fdr), 6)))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
48 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
49 results <- prop_test(cbind(segfile1[common_reg, "Methylated"], segfile2[common_reg, "Methylated"]), cbind(segfile1[common_reg, "Total"], segfile2[common_reg, "Total"]), opt$fdr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
50
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
51 sample1 <- segfile1[1, 1]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
52 sample2 <- segfile2[1, 1]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
53 tab_out <- data.frame(ID = paste(sample1, "vs", sample2, sep="."), segfile1[common_reg, c(2:4)], segfile1[common_reg, c("Methylated", "Total", "FractionMethylated")],
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
54 segfile2[common_reg, c("Methylated", "Total", "FractionMethylated")], results, DiffProp = round(segfile1[common_reg, "FractionMethylated"] - segfile2[common_reg, "FractionMethylated"], 6), stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
55 colnames(tab_out)[c(5:10)] <- c(paste(sample1, "Methylated", sep="."), paste(sample1, "Total", sep="."), paste(sample1, "Proportion", sep="."), paste(sample2, "Methylated", sep="."),
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
56 paste(sample2, "Total", sep="."), paste(sample2, "Proportion", sep="."))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
57
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
58 rm(segfile1)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
59 rm(segfile2)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
60 gc()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
61
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
62 add_annot <- function(tab, annotation, genome) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
63 # find closest feature
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
64 if(genome == "hg19" || genome == "GRCh37" || genome == "b37") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
65 genome <- "hg19"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
66 dataset <- "hsapiens_gene_ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
67 biomart <- "ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
68 host <- "www.biomart.org"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
69 attributes <- c("ensembl_gene_id", "hgnc_symbol", "refseq_mrna", "start_position", "end_position")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
70 } else if(genome == "mm9" || genome == "NCBIM37") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
71 genome <- "mm9"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
72 dataset <- "mmusculus_gene_ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
73 biomart <- "ENSEMBL_MART_ENSEMBL"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
74 host <- "may2012.archive.ensembl.org"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
75 attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
76 } else if(genome == "mm10" || genome == "GRCm38") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
77 genome <- "mm10"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
78 dataset <- "mmusculus_gene_ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
79 biomart <- "ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
80 host <- "www.biomart.org"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
81 attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
82 } else if(genome == "dm3" || genome == "BDGP5") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
83 genome <- "dm3"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
84 dataset <- "dmelanogaster_gene_ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
85 biomart <- "ensembl"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
86 host <- "www.biomart.org"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
87 attributes <- c("ensembl_gene_id", "flybasecgid_gene", "refseq_mrna", "start_position", "end_position")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
88 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
89
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
90 e_to_U = c("GL000191.1" = "1_gl000191_random", "GL000192.1" = "1_gl000192_random", "GL000193.1" = "4_gl000193_random", "GL000194.1" = "4_gl000194_random",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
91 "GL000195.1" = "7_gl000195_random", "GL000196.1" = "8_gl000196_random", "GL000197.1" = "8_gl000197_random", "GL000198.1" = "9_gl000198_random",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
92 "GL000199.1" = "9_gl000199_random", "GL000200.1" = "9_gl000200_random", "GL000201.1" = "9_gl000201_random", "GL000202.1" = "11_gl000202_random",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
93 "GL000203.1" = "17_gl000203_random", "GL000204.1" = "17_gl000204_random", "GL000205.1" = "17_gl000205_random", "GL000206.1" = "17_gl000206_random",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
94 "GL000207.1" = "18_gl000207_random", "GL000208.1" = "19_gl000208_random", "GL000209.1" = "19_gl000209_random", "GL000210.1" = "21_gl000210_random",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
95 "GL000211.1" = "Un_gl000211", "GL000212.1" = "Un_gl000212", "GL000213.1" = "Un_gl000213", "GL000214.1" = "Un_gl000214",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
96 "GL000215.1" = "Un_gl000215", "GL000216.1" = "Un_gl000216", "GL000217.1" = "Un_gl000217", "GL000218.1" = "Un_gl000218",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
97 "GL000219.1" = "Un_gl000219", "GL000220.1" = "Un_gl000220", "GL000221.1" = "Un_gl000221", "GL000222.1" = "Un_gl000222",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
98 "GL000223.1" = "Un_gl000223", "GL000224.1" = "Un_gl000224", "GL000225.1" = "Un_gl000225", "GL000226.1" = "Un_gl000226",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
99 "GL000227.1" = "Un_gl000227", "GL000228.1" = "Un_gl000228", "GL000229.1" = "Un_gl000229", "GL000230.1" = "Un_gl000230",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
100 "GL000231.1" = "Un_gl000231", "GL000232.1" = "Un_gl000232", "GL000233.1" = "Un_gl000233", "GL000234.1" = "Un_gl000234",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
101 "GL000235.1" = "Un_gl000235", "GL000236.1" = "Un_gl000236", "GL000237.1" = "Un_gl000237", "GL000238.1" = "Un_gl000238",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
102 "GL000239.1" = "Un_gl000239", "GL000240.1" = "Un_gl000240", "GL000241.1" = "Un_gl000241", "GL000242.1" = "Un_gl000242",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
103 "GL000243.1" = "Un_gl000243", "GL000244.1" = "Un_gl000244", "GL000245.1" = "Un_gl000245", "GL000246.1" = "Un_gl000246",
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
104 "GL000247.1" = "Un_gl000247", "GL000248.1" = "Un_gl000248", "GL000249.1" = "Un_gl000249")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
105
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
106 # function to conver ucsc chroms to ensembl
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
107 ensembl <- function(chr, genome) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
108 # ensembl does not us chr and M is MT
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
109 chr <- sub("^chr", "", chr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
110 if(genome == "dm3" || genome == "BDGP5") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
111 chr <- sub("^M$", "dmel_mitochondrion_genome", chr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
112 } else {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
113 chr <- sub("^M$", "MT", chr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
114 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
115 chr[which(chr %in% e_to_U)] <- names(e_to_U)[match(chr[which(chr %in% e_to_U)], e_to_U)]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
116 return(chr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
117 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
118
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
119 # function to conver ensembl chroms to ucsc
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
120 ucsc <- function(chr) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
121 chr <- sub("^chr", "", chr)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
122 tmp <- which(chr %in% names(c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M")))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
123 chr[tmp] <- c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M")[chr[tmp]]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
124 paste("chr", chr, sep="")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
125 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
126
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
127 # function to get the gene annotation by chromosome
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
128 get_genes <- function(chr) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
129 library(biomaRt)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
130 mart <- useMart(biomart=biomart, dataset=dataset, host=host)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
131 tab <- getBM(attributes = attributes, filters = "chromosome_name", values = chr, mart = mart)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
132 if(any(is.na(tab))) for(i in 1:ncol(tab)) tab[is.na(tab[,i]), i] <- ""
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
133 mult_ids <- names(which(table(tab$ensembl_gene_id) > 1))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
134 rem <- NULL
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
135 for(i in mult_ids) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
136 index <- which(tab$ensembl_gene_id == i)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
137 refseq <- which(tab$ensembl_gene_id == i & tab$refseq_mrna != "")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
138 if(length(refseq) > 0) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
139 rem <- c(rem, setdiff(index, refseq[1]))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
140 } else {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
141 rem <- c(rem, index[-1])
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
142 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
143 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
144 if(length(rem) > 0) tab <- tab[-rem,]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
145 colnames(tab)[4:5] <- c("feature_start", "feature_end")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
146 return(tab)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
147 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
148
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
149 get_cpg <- function(chr) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
150 library(rtracklayer)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
151 options(stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
152 session <- browserSession()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
153 genome(session) <- genome
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
154 query <- ucscTableQuery(session, "CpG Islands", GRangesForUCSCGenome(genome, chr))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
155 cpg <- getTable(query)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
156 cpg <- cpg[, c("name", "perCpg", "perGc", "chromStart", "chromEnd")]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
157 colnames(cpg) <- c("cpg", "cpg_perCpg", "cpg_perGc", "cpg_start", "cpg_end")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
158 return(cpg)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
159 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
160 chr <- tab[1, 2]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
161
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
162 # get the gene info for this chrom
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
163 if(annotation == "gene") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
164 tab <- cbind(tab, ensembl_gene_id = "", id = "", refseq_mrna = "", feature_start = "", feature_end = "", Distance_To_Feature = "", stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
165 annotData <- get_genes(ensembl(chr, genome))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
166 colnames(tab)[16] <- attributes[2]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
167 } else {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
168 tab <- cbind(tab, cpg = "", cpg_perCpg = "", cpg_perGc = "", cpg_start = "", cpg_end = "", Distance_To_cpg = "", stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
169 annotData <- get_cpg(ucsc(chr))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
170 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
171 if(nrow(annotData) == 0) return(tab)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
172
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
173 starts <- t(matrix(cbind(-1, as.numeric(annotData[, 4])), ncol=2, byrow=F))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
174 ends <- t(matrix(cbind(-1, as.numeric(annotData[, 5])), ncol=2, byrow=F))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
175
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
176 # calculate the distances from the features to the regions
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
177 dist_start_start <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% starts
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
178 dist_start_end <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% ends
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
179 dist_end_start <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% starts
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
180 dist_end_end <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% ends
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
181
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
182 # determine which regions overlap at least 1 feature
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
183 sum_signs <- abs(sign(dist_start_start) + sign(dist_start_end) + sign(dist_end_start) + sign(dist_end_end))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
184 regions <- which(sum_signs != 4, arr.ind=TRUE)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
185 if(length(regions) > 0) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
186 overlap <- sort(unique(regions[,1]))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
187 non_overlap <- c(1:nrow(tab))[-overlap]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
188 } else {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
189 overlap <- NULL
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
190 non_overlap <- c(1:nrow(tab))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
191 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
192
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
193 # reduce to regions with no overlaping feqature
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
194 if(length(overlap) > 0) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
195 dist_start_start <- matrix(dist_start_start[non_overlap,], ncol=ncol(dist_start_start))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
196 dist_start_end <- matrix(dist_start_end[non_overlap,], ncol=ncol(dist_start_end))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
197 dist_end_start <- matrix(dist_end_start[non_overlap,], ncol=ncol(dist_end_start))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
198 dist_end_end <- matrix(dist_end_end[non_overlap,], ncol=ncol(dist_end_end))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
199 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
200
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
201 rm(sum_signs)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
202 gc()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
203
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
204 # extract the annot for the regions with overlaping features
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
205 if(length(overlap) > 0) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
206 annot <- sapply(overlap, function(x, y) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
207 x <- regions[which(regions[,1] == x), 2]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
208 sub("^//$", "", c(paste(annotData[x,1], collapse="//"), paste(annotData[x,2], collapse="//"), paste(annotData[x,3], collapse="//"), paste(annotData[x,4], collapse="//"), paste(annotData[x,5], collapse="//")))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
209 }, regions)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
210 annot <- as.data.frame(t(annot), stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
211 annot <- cbind(annot, "Overlap", stringsAsFactors=F)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
212 colnames(annot) <- c(colnames(annotData), tail(colnames(tab),1))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
213 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
214 rm(regions)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
215 gc()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
216
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
217 # for non the non-overlaps the distance of the closest features to each region
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
218 if(length(non_overlap) > 0) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
219 clst_pts <- matrix(0, ncol=4, nrow=length(non_overlap))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
220 clst_pts[,1] <- max.col(-abs(dist_start_start), "last")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
221 clst_pts[,2] <- max.col(-abs(dist_start_end), "last")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
222 clst_pts[,3] <- max.col(-abs(dist_end_start), "last")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
223 clst_pts[,4] <- max.col(-abs(dist_end_end), "last")
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
224 dist <- matrix(0, ncol=4, nrow=length(non_overlap))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
225 if(length(clst_pts[,1]) == 1) {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
226 dist[,1] <- dist_start_start[1, clst_pts[,1]]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
227 dist[,2] <- dist_start_end[1, clst_pts[,2]]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
228 dist[,3] <- dist_end_start[1, clst_pts[,3]]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
229 dist[,4] <- dist_end_end[1, clst_pts[,4]]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
230
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
231 # extract the annot for the regions with non-overlaping features
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
232 clst_all <- max.col(-abs(dist))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
233 dist_to_feat <- dist[1, clst_all]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
234 clst <- clst_pts[1, clst_all]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
235 } else {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
236 dist[,1] <- dist_start_start[cbind(seq(clst_pts[,1]), clst_pts[,1])]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
237 dist[,2] <- dist_start_end[cbind(seq(clst_pts[,2]), clst_pts[,2])]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
238 dist[,3] <- dist_end_start[cbind(seq(clst_pts[,3]), clst_pts[,3])]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
239 dist[,4] <- dist_end_end[cbind(seq(clst_pts[,4]), clst_pts[,4])]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
240
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
241 # extract the annot for the regions with non-overlaping features
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
242 clst_all <- max.col(-abs(dist))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
243 dist_to_feat <- dist[cbind(seq(clst_all), clst_all)]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
244 clst <- clst_pts[cbind(seq(clst_all), clst_all)]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
245 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
246 annot_non_overlap <- cbind(annotData[clst, ], dist_to_feat)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
247 colnames(annot_non_overlap) <- c(colnames(annotData), tail(colnames(tab),1))
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
248 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
249
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
250 rm(dist_start_start)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
251 rm(dist_start_end)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
252 rm(dist_end_start)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
253 rm(dist_end_end)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
254 gc()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
255
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
256 if(length(overlap) > 0) tab[overlap, colnames(annot)] <- annot
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
257 if(length(non_overlap) > 0) tab[non_overlap, colnames(annot_non_overlap)] <- annot_non_overlap
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
258 return(tab)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
259 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
260
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
261 tab_list <- list()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
262 for(i in unique(tab_out[,2])) tab_list[[i]] <- tab_out[which(tab_out$chrom == i),]
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
263 rm(tab_out)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
264 gc()
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
265 if(opt$annot == "both" || opt$annot == "gene") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
266 annotation <- "gene"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
267 tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
268 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
269
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
270 if(opt$annot == "both" || opt$annot == "cpg") {
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
271 annotation <- "cpg"
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
272 tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
273 }
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
274
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
275 tab_out <- NULL
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
276 for(i in 1:length(tab_list)) tab_out <- rbind(tab_out, tab_list[[i]])
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
277
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
278 cat("'", file=opt$output)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
279 write.table(tab_out[,c(1:13, 15:26, 14)], opt$output, row.names=F, col.names=T, quote=F, sep="\t", append=T)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
280 stopCluster(cl)
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
281 q(status=0)