Mercurial > repos > jeremyjliu > region_motif_enrichment
changeset 1:1de6f7f3d513 draft
Deleted selected files
author | jeremyjliu |
---|---|
date | Tue, 05 Aug 2014 13:57:32 -0400 |
parents | 5c044273554d |
children | 2538677b6004 |
files | region_motif_lib/regions.cpp region_motif_lib/regions.o region_motif_lib/regions.r region_motif_lib/regions.so |
diffstat | 4 files changed, 0 insertions(+), 436 deletions(-) [+] |
line wrap: on
line diff
--- a/region_motif_lib/regions.cpp Tue Aug 05 13:56:22 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#include <iostream> -#include <vector> -#include <algorithm> -using namespace std; - -extern "C" { - typedef pair <int,int> Se_t; - bool se_lt (const Se_t &l,const Se_t &r) { return l.first < r.first; } - - void merge_regions(int *regions, int*nregionsR,int *merge_sepR) { - int nregs=nregionsR[0]; - if(nregs==0) return; - int sep=merge_sepR[0]; - if(sep<1) sep=1; - vector<Se_t> reg(nregs); - for(int ireg=0;ireg<nregs;ireg++) { - reg[ireg]=make_pair(regions[ireg*2],regions[ireg*2+1]); - } - sort(reg.begin(),reg.end(),se_lt); - int *reg_index = new int[nregs]; - for(int ireg=1;ireg<nregs;ireg++) reg_index[ireg]=-1; - reg_index[0]=0; - int last_ireg=0; - int counter=1; - for(int ireg=1;ireg<nregs;ireg++) { - if(reg[ireg].first<=reg[last_ireg].second+sep) { - if(reg[ireg].second>reg[last_ireg].second) reg[last_ireg].second=reg[ireg].second; - } else { - last_ireg=ireg; - reg_index[counter]=ireg; - counter++; - } - } - for(int ireg=0;ireg<counter;ireg++) { - regions[ireg*2] = reg[reg_index[ireg]].first; - regions[ireg*2+1] = reg[reg_index[ireg]].second; - } - nregionsR[0]=counter; - delete [] reg_index; - } - void region_minus_region(int *regions, int*nregionsR,int *region2s, int*nregion2sR,int *updatedregions) { - int sep=1; - merge_regions(regions,nregionsR,&sep); - merge_regions(region2s,nregion2sR,&sep); - int nregs=nregionsR[0]; - int nreg2s=nregion2sR[0]; - for(int i=0;i<2*(nregs+nreg2s);i++) updatedregions[i]=-1; - if(nregs==0) return; - int ireg = 0; - int iregout = 0; - for(int ireg2=0; ireg2<nreg2s;ireg2++) { - if(ireg==nregs) break; - if(region2s[ireg2*2+1] < regions[2*ireg]) continue; - if(region2s[ireg2*2] > regions[2*ireg+1]) { - updatedregions[2*iregout] = regions[2*ireg]; - updatedregions[2*iregout+1] = regions[2*ireg+1]; - ireg++; - ireg2--; - iregout++; - continue; - } - int s = regions[ireg*2]; - int e = regions[ireg*2+1]; - int s2 = region2s[ireg2*2]; - int e2 = region2s[ireg2*2+1]; - if(s2<=s && e2>=e) { - ireg++; - ireg2--; - } - else if(s2<=s) { - regions[ireg*2] = e2+1; - continue; - } else if(e2>=e) { - updatedregions[2*iregout] = s; - updatedregions[2*iregout+1] = s2-1; - ireg2--; - iregout++; - ireg++; - } else { - updatedregions[2*iregout] = s; - updatedregions[2*iregout+1] = s2-1; - regions[ireg*2] = e2+1; - iregout++; - ireg2--; - } - } - while(ireg<nregs) { - updatedregions[2*iregout] = regions[2*ireg]; - updatedregions[2*iregout+1] = regions[2*ireg+1]; - ireg++; - iregout++; - } - } - void intersection_of_regions(int *regions, int*nregionsR,int *region2s, int*nregion2sR,int *updatedregions) { - int sep=1; - merge_regions(regions,nregionsR,&sep); - merge_regions(region2s,nregion2sR,&sep); - int nregs=nregionsR[0]; - int nreg2s=nregion2sR[0]; - for(int i=0;i<2*(nregs+nreg2s);i++) updatedregions[i]=-1; - if(nregs==0) return; - if(nreg2s==0) return; - int ireg2 = 0; - int iregout = 0; - for(int ireg=0; ireg<nregs;ireg++) { - if(ireg2==nreg2s) return; - if(regions[ireg*2+1] < region2s[2*ireg2]) continue; - if(regions[ireg*2] > region2s[2*ireg2+1]) {ireg2++; ireg--; continue;} - - int s = regions[ireg*2]; - if(s<region2s[ireg2*2]) s = region2s[ireg2*2]; - int e = regions[ireg*2+1]; - if(e>region2s[ireg2*2+1]) { - e = region2s[ireg2*2+1]; - ireg2++; - ireg--; - } - updatedregions[2*iregout] = s; - updatedregions[2*iregout+1] = e; - iregout++; - } - } -}
--- a/region_motif_lib/regions.r Tue Aug 05 13:56:22 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,313 +0,0 @@ - -# SHOULD ONLY OCCUR IN ONE FILE -#common.dir = "/Users/jeremyliu1/galaxy-dist/tools/my_tools" - -# commonDir from region_motif_intersect.r sourcing file -dyn.load(paste(commonDir, "/region_motif_lib/regions.so",sep="")) - -##reg = matrix(cbind(from,to)) from<to -##region[[chr]] = reg -##pos = unique(integer()) -##poslist = list(chr,pos, optional(strand=c(-1,0,+1))) - -# USED -merge.reg <- function(...,sep=1) { - ##This function returns union of regs. - reg = rbind(...) - x=.C("merge_regions",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(sep)) - reg=matrix(x[[1]][1:(x[[2]]*2)],ncol=2,byrow=TRUE) - reg = matrix(reg[which(reg[,2]>reg[,1]),],ncol=2) - reg[which(reg==0)]=1 - return(reg) -} - -merge.regions<-function(...,sep=1) { - ##This function returns union of regions. - regions=list(...) - chrs = unique(unlist(lapply(regions,names),use.names=F)) - region = list() - for(chr in chrs) { - region[[chr]] = do.call("merge.reg",c(lapply(regions,function(i) i[[chr]]),sep=sep)) - } - return(region) -} - -plot.reg<-function(reg,xlim=NULL,y=NULL,vertical=FALSE,...) { - ##This function does not stack if reg is overlapping. - ##new plot is made unless y is specified. - if(nrow(reg)==0) return() - if(is.null(xlim)) xlim=range(reg) - if(is.null(y)) { - plot(xlim,c(0,1),type="n",axes=FALSE,xlab=" ",ylab=" ") - y=0.5 - } - segments(reg[,1],y,reg[,2],...) - if(vertical) abline(v=reg) -} - -distance.to.closest.reg.of.reg <- function(reg,reg2) { - ##for each element of reg, what is the closest distance to any element of reg2? - reg2 = merge.reg(reg2) - reg2 = c(-Inf,t(reg2),Inf) - s=reg[,1] - e=reg[,2] - sbin = as.integer(cut(s,reg2)) - ebin = as.integer(cut(e,reg2)) - d = pmin(s-reg2[sbin], reg2[sbin+1]-s, e-reg2[ebin], reg2[ebin+1]-e) - d[which(sbin!=ebin | sbin%%2==0)] = 0 - return(d) -} - -# USED -distance.to.closest.reg.of.pos <- function(pos,reg) { - ##for each element of pos, what is the closest distance to any element of reg? - reg = merge.reg(reg) - reg = c(-Inf,t(reg),Inf) - pbin = as.integer(cut(pos,reg)) - d = pmin(pos-reg[pbin], reg[pbin+1]-pos) - d[which(pbin%%2==0)] = 0 - return(d) -} - -distance.to.closest.pos.of.reg <- function(reg,pos,pos.strand=NULL,index.return=FALSE) { - ##for each element of reg, what is the closest distance to any element of pos? - ##if strand is provided, distance is along strand - o = order(pos) - pos = c(-Inf,pos[o],Inf) - o = c(o[1],o,o[length(o)]) - - s=reg[,1] - e=reg[,2] - sbin = as.integer(cut(s,pos)) - ebin = as.integer(cut(e,pos)) - - d=integer(nrow(reg)) - s.is.closer = s-pos[sbin] < pos[sbin+1]-e - if(index.return) { - return(ifelse(s.is.closer,o[sbin],o[sbin+1])) - } - d = ifelse(s.is.closer, s-pos[sbin], e-pos[sbin+1]) - d[which(sbin!=ebin)] = 0 - if(!is.null(pos.strand)) { - reg.strand = ifelse(s.is.closer,pos.strand[o][sbin],pos.strand[o][sbin+1]) - d = d * reg.strand - } - return(d) -} - -if(F) { - pos = sample(seq(0,1000,200)) - pos2 = sample(seq(10,1010,100)) - pos.strand = sample(c(1,-1),6,replace = T) - pos2.strand = sample(c(1,-1),11,replace = T) -} - -distance.to.closest.pos.of.pos <- function(pos,pos2,pos.strand=NULL,pos2.strand=NULL, ignore.pos.strand=TRUE,index.return=FALSE) { - ##for each element of pos, what is the closest distance to any element of pos2? - ##if index.return==TRUE, index of pos2 closest to pos is returned - ##else if strand2 is provided, distance is along strand2 - ##if strand and strand2 are both provided and !ignore.pos.strand - ## then output is a list giving plus.up, plus.down, minus.up, minus.down - ## plus.up: distance to closest upstream on the same same strand etc. etc. - o = order(pos2) - pos2 = c(-Inf,pos2[o],Inf) - if(!is.null(pos2.strand)) pos2.strand = c(-Inf,pos2.strand[o],Inf) - - if(is.null(pos2.strand) | is.null(pos.strand) | ignore.pos.strand) { - pbin = as.integer(cut(pos,pos2)) - - pbin = ifelse(pos-pos2[pbin] < pos2[pbin+1]-pos,pbin,pbin+1) - d = pos-pos2[pbin] - if(!is.null(pos2.strand)) d = d * pos2.strand[pbin] - - if(index.return) return(o[pbin-1]) - return(d) - } - strands = list(plus=1,minus=-1) - relcoords = list(up=0,down=1) - ind = lapply(strands,function(strand) { - ind.p = c(1,which(pos2.strand==strand),length(pos2)) - pbin.p = cut(pos,pos2[ind.p],labels=FALSE) - as.data.frame(lapply(relcoords,function(i) ind.p[pbin.p+i])) - }) - ind.temp = ind - ind.minus = which(pos.strand==-1) - if(length(ind.minus)>0) { - ind[[1]][ind.minus,]=ind.temp[[2]][ind.minus,2:1] - ind[[2]][ind.minus,]=ind.temp[[1]][ind.minus,2:1] - } - ind = unlist(ind,recursive=FALSE) - if(index.return) { - return( lapply(ind,function(i) { - i[which(i==1)]=NA - i[which(i==length(pos2))]=NA - o[i-1] - }) ) - } - return(lapply(ind,function(i) pos.strand*(pos2[i]-pos))) -} - -distance.to.closest.region.of.region <- function(region,region2) { - ##for each element of region[[chr]], what is the closest distance to any element of region2[[chr]]? - ##returns d[[chr]] - chrs = names(region) - d=list() - for(chr in chrs) { - if(is.null(region2[[chr]])) { - d[[chr]] = rep(Inf,nrow(region[[chr]])) - } else { - d[[chr]] = distance.to.closest.reg.of.reg(region[[chr]],region2[[chr]]) - } - } - return(d) -} - -# USED -distance.to.closest.region.of.poslist <- function(poslist,region) { - ##for each element of poslist, what is the closest distance to any element of region? - chrs = names(table(poslist$chr)) - d=integer() - for(chr in chrs) { - ind = which(poslist$chr==chr) - pos=poslist$pos[ind] - if(is.null(region[[chr]])) { - d[ind] = Inf - } else { - d[ind] = distance.to.closest.reg.of.pos(pos,region[[chr]]) - } - } - return(d) -} -distance.to.closest.poslist.of.region <- function(region,poslist,index.return=FALSE) { - ##for each element of region, what is the closest distance to any element of poslist? - chrs = names(region) - d=list() - for(chr in chrs) { - ind = which(poslist$chr==chr) - pos=poslist$pos[ind] - pos.strand=poslist$strand[ind] - d[[chr]] = distance.to.closest.pos.of.reg(region[[chr]],pos,pos.strand,index.return=index.return) - if(index.return) d[[chr]] = ind[d[[chr]]] - } - return(d) -} - -distance.to.closest.poslist.of.poslist <- function(poslist,poslist2,ignore.poslist.strand=TRUE,index.return=FALSE) { - ##for each element of poslist, what is the closest distance to any element of poslist2? - ##if poslist2$strand is provided, distance is along strand2 - ##if strand and strand2 are provided and no ignore.poslist.strand - ## then output is a list giving plus.up, plus.down, minus.up, minus.down - ## plus.up: distance to closest upstream on the same same strand etc. etc. - ##if index.return==TRUE, index of pos2 closest to pos is returned - - chrs = names(table(poslist$chr)) - - d=integer() - stranded = !(is.null(poslist2$strand) | is.null(poslist$strand) | ignore.poslist.strand) - if(stranded) { - brs = c("plus.up","plus.down","minus.up","minus.down") - d=list() - for(br in brs) d[[br]]=integer() - } - - for(chr in chrs) { - ind = which(poslist$chr==chr) - ind2 = which(poslist2$chr==chr) - pos=poslist$pos[ind] - pos2=poslist2$pos[ind2] - pos.strand=poslist$strand[ind] - pos2.strand=poslist2$strand[ind2] - if(!stranded) { - d[ind] = distance.to.closest.pos.of.pos(pos,pos2,pos.strand,pos2.strand,ignore.poslist.strand,index.return=index.return) - if(index.return) d[ind] = ind2[d[ind]] - } else { - x = distance.to.closest.pos.of.pos(pos,pos2,pos.strand,pos2.strand,ignore.poslist.strand) - for(br in brs) { - d[[br]][ind] = x[[br]] - if(index.return) d[[br]][ind] = ind2[d[[br]][ind]] - } - } - } - return(d) -} - - -reg.minus.reg <- function(reg,reg2) { - x = .C("region_minus_region",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(t(reg2)),as.integer(nrow(reg2)),integer((nrow(reg)+nrow(reg2))*2))[[5]] - x=x[which(x>=0)] - return(matrix(x,ncol=2,byrow=TRUE)) -} - -intersection.of.regs <- function(reg,reg2) { - x = .C("intersection_of_regions",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(t(reg2)),as.integer(nrow(reg2)),integer((nrow(reg)+nrow(reg2))*2))[[5]] - x=x[which(x>=0)] - return(matrix(x,ncol=2,byrow=TRUE)) -} - -region.minus.region<-function(region,region2) { - chrs = names(region) - for(chr in chrs) { - if(is.null(region[[chr]])) next - if(!is.null(region2[[chr]])) { - region[[chr]] = reg.minus.reg(region[[chr]],region2[[chr]]) - } - } - return(region) -} - -intersection.of.regions<-function(region,region2) { - chrs = names(region) - for(chr in chrs) { - if(is.null(region2[[chr]])) { - region[[chr]]<-NULL - } else { - region[[chr]] = intersection.of.regs(region[[chr]],region2[[chr]]) - } - } - return(region) -} - -reg.around.pos <-function(pos,range=500,strand=NULL) { - if(length(range)==1) range=c(range,range) - if(is.null(strand)) strand = 1; - reg = cbind(pos-range[1]*strand,pos+range[2]*strand); - ind = which(reg[,2]<reg[,1]) - reg[ind,] = reg[ind,2:1] - ind = which(reg<=0) - reg[ind] = 1 - return(reg) -} - - -region.around.poslist <-function(poslist,range=500) { - chrs = names(table(poslist$chr)) - region=list() - for(chr in chrs) { - ind = which(poslist$chr==chr) - pos=poslist$pos[ind] - strand = 1 - if(!is.null(poslist$strand)) { - strand = poslist$strand[ind] - } - region[[chr]] = reg.around.pos(pos,range,strand) - } - return(region) -} - - -poslist.of.region.centers <-function(region) { - chrs = names(region) - n=sapply(region,nrow) - return(data.frame(chr=rep(chrs,n),pos=unlist(lapply(region,function(chr)(chr[,1]+chr[,2])/2),use.names = FALSE))) -} - -write.gff.region<-function(region,outfname) { - region = lapply(region,function(chr) list(s=chr[,1],e=chr[,2])) - out=unlist.chr(region) - out$chr=rep(names(region),sapply(region,function(i) length(i$s))) - empty=rep(".",length(out$chr)) - write.table(data.frame(out$chr,empty,empty,out$s,out$e,empty,empty,empty,empty),quote=FALSE,sep="\t",file=outfname,col.names=FALSE,row.names=FALSE) -} - -number.of.regions = function(region)sum(sapply(region,nrow)) -size.of.regions = function(region) sum(sapply(merge.regions(region),function(reg) sum(reg[,2]-reg[,1])))