annotate region_motif_intersect.r @ 4:53e45130e6f9 draft

Uploaded
author jeremyjliu
date Sat, 16 May 2015 22:38:25 -0400
parents cab2db9d058b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
1 # Name: region_motif_intersect.r
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
2 # Description: Takes a bed file of target regions and counts intersections
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
3 # of each motif (in separately installed tabix database) and target regions.
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
4 # Author: Jeremy Liu
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
5 # Email: jeremy.liu@yale.edu
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
6 # Date: 15/02/11
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
7 # Note: This script can be invoked with the following command
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
8 # R --slave --vanilla -f ./region_motif_intersect.r --args <db_bgz> <db_tbi> <inbed> <outtab>
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
9 # Dependencies: region_motif_data_manager, Rsamtools
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
10
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
11 # Auxiliary function to concatenate multiple strings
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
12 concat <- function(...) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
13 input_list <- list(...)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
14 return(paste(input_list, sep="", collapse=""))
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
15 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
16
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
17 # Retrive motif database path
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
18 args <- commandArgs()
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
19 motifDB_bgz = unlist(strsplit(args[7], ','))[1] # Handles duplicate entries in data table
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
20 motifDB_tbi = unlist(strsplit(args[8], ','))[1] # Just takes the first one
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
21
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
22 # Set input and reference files, comment to toggle commmand line arguments
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
23 inBed = args[9]
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
24 outTab = args[10]
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
25
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
26 # Auxiliary function to read in BED file
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
27 read_bed <- function(file) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
28 return(read.table(file, sep="\t", stringsAsFactors=FALSE))
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
29 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
30
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
31 startTime = Sys.time()
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
32 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
33
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
34 # Load dependencies
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
35 cat("Loading dependencies...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
36 suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE)) # NEED TO HANDLE INSTALLATION
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
37
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
38 # Initializing hash table (as env) with motif names and loading tabix file
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
39 cat("Loading motif database and initializing hash table...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
40 motifTable = new.env()
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
41 motifTbx <- TabixFile(motifDB_bgz)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
42
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
43 # Loading input bed file, convert integer columns to numeric, name columns
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
44 cat("Loading region file...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
45 regionsDF = read_bed(inBed)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
46 dfTemp = sapply(regionsDF, is.integer)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
47 regionsDF[dfTemp] = lapply(regionsDF[dfTemp], as.numeric)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
48 names(regionsDF)[names(regionsDF) == "V1"] = "chr"
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
49 names(regionsDF)[names(regionsDF) == "V2"] = "start"
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
50 names(regionsDF)[names(regionsDF) == "V3"] = "end"
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
51
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
52 # Filtering regions to exclude chromosomes not in motif database
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
53 cat("Determining intersection counts...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
54 motifTbxChrs = seqnamesTabix(motifTbx)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
55 regionsDFFilter = subset(regionsDF, chr %in% motifTbxChrs)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
56
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
57 # Loading regions into GRanges object and scanning motif tabix database
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
58 # Region end is incremented by 1 since scanTabix querying is inclusive for
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
59 # position start but exclusive for position end.
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
60 param = GRanges(regionsDFFilter$chr, IRanges(regionsDFFilter$start,
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
61 end=regionsDFFilter$end + 1))
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
62 regionsIntersects = scanTabix(motifTbx, param=param)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
63
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
64 # Parsing result list and updating motif count hash table
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
65 cat("Parsing result list...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
66 for(regionIntersects in regionsIntersects) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
67 for(regionIntersect in strsplit(regionIntersects, " ")) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
68 intersectMotif = strsplit(regionIntersect, "\t")[[1]][4]
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
69 if(is.null(motifTable[[intersectMotif]])) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
70 motifTable[[intersectMotif]] = 1
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
71 } else {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
72 motifTable[[intersectMotif]] = motifTable[[intersectMotif]] + 1
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
73 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
74 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
75 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
76
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
77 # Converting motif count hash table to an integer vector for output
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
78 counts = integer(length = length(ls(motifTable)))
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
79 names(counts) = ls(motifTable)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
80 for(motifName in ls(motifTable)) {
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
81 counts[motifName] = as.integer(motifTable[[motifName]])
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
82 }
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
83
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
84 # Outputting intersection counts to tab delineated file
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
85 cat("Outputting to file...\n")
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
86 write.table(counts, outTab, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
87 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."),
cab2db9d058b Uploaded
jeremyjliu
parents:
diff changeset
88 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")