diff polii.gene.call @ 0:1b5bd3b955ed draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_polII_gene_call commit 2e97fdaeafdafe2c63588de17777cd32a798adb1-dirty
author mvdbeek
date Fri, 20 Apr 2018 10:27:20 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/polii.gene.call	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,372 @@
+#!/usr/bin/env Rscript 
+
+# polii.gene.call.r
+# Copyright © 2014-15, Owen Marshall
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version. 
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details. 
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 
+# USA
+
+### FDR calcs ###
+# Method based on original perl scripts by Tony Southall (TDS) as published in
+# Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020
+#
+# Significant modifications to the original methodology include:
+# * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data
+# * using a linear regression for the final intercept value rather than using the average intercept value for all conditions
+# -- both of these should increase the accuracy of the final FDR value.
+
+version <- "1.0.2"
+cat(paste("polii.gene.call v",version,"\n", sep=""))
+
+library(tools)
+
+### Read CLI options
+input.args <- commandArgs(trailingOnly = TRUE)
+
+in.files <- vector()
+read.ops <- function (x) {
+  for (op in x) {
+	if (any(grepl("^--",op))) {
+		op <- gsub("^--","",op)
+		y <- unlist(strsplit(op,"="))
+		
+		if (y[1] == "help") {
+		  cat(paste("Usage: Rscript polii.gene.call.r --genes.file=[some_genes_file.gff] [list of .gatc.bedgraph or .gatc.gff ratio files to process]\n\n", sep=""))
+		  
+		  cat("Options:\n")
+		  for (n in names(op.args)) {
+			cat(paste("  ",n,"=",op.args[[n]],"\n",sep=""))
+		  }
+		  cat("\n")
+		  quit("no",1)
+		}
+		
+		if (!is.null(op.args[[ y[1] ]])) {
+		  op.args[[ y[1] ]] <<- y[2]
+		} else {
+		  cat("Error: Option",y[1],"not recognised ...\n")
+		  quit("no",1)
+		}
+	} else {
+		in.files <<- c(in.files,op)
+	}
+  }
+}
+
+write.ops <- function () {
+  out.df <- data.frame()
+  for (n in names(op.args)) {
+	v <<- as.character(op.args[[n]])
+	df.line <- data.frame(
+	  option=n,
+	  value=v
+	)
+	out.df <- rbind(out.df, df.line)
+  }
+  write.table(out.df,"input.args.single.txt",row.names=F)
+}
+
+op.args <- list(
+  "genes.file" = "/mnt/data/Genomes/dmel_release/DmR6/DmR6.genes.gff",
+  "iter" = 50000,
+  "fdr" = 0.01
+)
+
+read.ops(input.args)
+
+if (length(in.files) == 0) {
+	cat("Usage: Rscript polii.gene.call.r [list of .gatc.gff ratio files to process]\n\n")
+	quit("no",1)
+}
+
+write.ops()
+
+### save random seed for future reproducibility
+dump.random <- runif(1)
+my.seed  <- .Random.seed
+write.table(my.seed,".randomseed")
+
+### read genes file
+cat("Reading genes data file ...\n")
+genes.file=op.args[["genes.file"]]
+genes <- read.table(genes.file, comment.char="#", sep="\t", quote="", fill=T)
+names(genes) <-  c('chr','source','type','start','end','score','strand','c','details')
+
+# only subset if there is a type termed "gene"
+if (any(genes$type == 'gene')) {
+  genes <- subset(genes, type=='gene')
+}
+
+genes$name <- sapply(genes$details, FUN = function (x) {regmatches(x,gregexpr("(?<=Name=).*?(?=;)", x, perl=T))} )
+genes <- genes[,c('chr','start','end','strand','name')]
+genes$chr <- gsub("^chr","",genes$chr,perl=T)
+
+if (nrow(genes) == 0) {
+	cat("Error: unable to extract gene information from genes file\n\n")
+	quit("no",1)
+}
+
+### functions
+read.gff <- function (x,name="score") {
+  fn.ext <- file_ext(x)
+  
+  if (grepl("gff",ignore.case=T,fn.ext)) {
+	temp.data <- read.table(x,row.names=NULL)
+	if (ncol(temp.data) > 5) {
+	  # GFF
+	  trim.data <- temp.data[,c(1,4,5,6)]
+	} else {
+		cat("Error: file does not appear to be in GFF format\n\n")
+		quit("no",1)
+	}
+  } else if (grepl("bed",ignore.case=T,fn.ext)) {
+	temp.data <- read.table(x,row.names=NULL,skip=1)
+	if (ncol(temp.data) == 4) {
+		# bedgraph
+		trim.data <- temp.data
+	} else {
+		cat("Error: file does not appear to be in bedGraph format\n\n")
+		quit("no",1)
+	}
+  } else {
+	cat("Error: input file does not appear to be in bedGraph or GFF format ...\n\n")
+	quit("no",1)
+  }
+  
+  names(trim.data) <- c("chr","start","end",name)
+  
+  trim.data$chr <- gsub("^chr","",trim.data$chr,perl=T)
+  
+  return(trim.data)
+}
+
+gene.exp <- function (input.df, buffer=0, iter=50000, debug=F) {
+  
+  avg.exp <- data.frame(input.df[1,c(4:(length(names(input.df))))])
+  avg <- vector(length=(length(names(input.df)) - 4))
+  avg.exp <- avg.exp[0,]
+  
+  ### FDR calcs ###
+  # Method based off perl scripts by Tony Southall (TDS) as published in
+  # Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020
+  #
+  # Significant modifications to the original methodology include:
+  # * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data
+  # * using a linear regression for the final intercept value rather than using the average intercept value for all conditions
+  # -- both of these should increase the accuracy of the final FDR value.
+  
+  input.len <- length(input.df[,1])
+  frag.samp <- c(1,2,3,4,6,8,10,12,15)
+  thres.samp <- c(0.1,0.2,0.3,0.4,0.5,0.65,0.8,1.0,1.5,2.0)
+  
+  rand <- list()
+  
+  for (thres in thres.samp) {
+    
+    cat(paste("  Calculating FDR for threshold",thres,"\n",sep=" "))
+    
+    # init vars
+    for (f in frag.samp) {
+      # List names are, e.g. frag 1, thres 0.2: rand[[thres.1.0.2]]
+      rand[[paste("thres.",f,".",thres,sep="")]] <- 0;
+    }
+    
+    for (i in 1:iter) {
+      if (i %% 200 == 0) {cat(paste("  iter",i,"\r"))}
+      # get random sample for different fragment lengths
+      
+      rand.samp <- list()
+      
+      for (f in frag.samp) {
+        # Using the fourth column as we're only calculating FDR for one sample ...
+        rand.samp[[paste("rand.",f,sep="")]] <- mean(input.df[runif(f,1,input.len),4])
+      }
+      
+      # count number of times exp > thres
+      for (f in frag.samp) {
+        if (rand.samp[[paste("rand.",f,sep="")]] > thres) {rand[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]] + 1}
+      }
+    }
+  }
+  
+  rand.fdr <- list()
+  
+  for (thres in thres.samp) {
+    for (f in frag.samp) {
+      rand.fdr[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]]/iter
+    }
+  }
+  
+  cat("Fitting curves ...\n")
+  
+  # curve fit: fdr vs thresholds
+  var.thres <- list()
+  
+  for (thres in thres.samp) {
+    for (f in frag.samp) {
+      var.thres[[paste("frags.",f,sep="")]] <- append(var.thres[[paste("frags.",f,sep="")]], rand.fdr[[paste("thres.",f,".",thres,sep="")]])
+    }
+  }
+  
+  inf.log.lm <- function (v) {
+    non.inf <- log(v) != -Inf
+    ret <- lm(log(v)[non.inf] ~ thres.samp[non.inf])
+    return(ret)
+  }
+  
+  # The relationship is exponential, so we need log data for a linear regression
+  # (in R, linear regression is: y = lm$coefficients[[2]]x + lm$coefficients[[1]] ... )
+  var.lm <- list()
+  for (f in frag.samp) {
+    var.lm[[paste("frags.",f,sep="")]] <- inf.log.lm(var.thres[[paste("frags.",f,sep="")]])
+  }
+  
+  # ... and now we do a linear regression on the slopes and intercepts of our previous regressions
+  # (This is the clever bit, and it actually seems to work.  The correlation of slope to fragment size is linear ...
+  # By doing this on the slope and intercept, we can now predict the FDR for any number of fragments with any expression.)
+  slope <- vector()
+  for (f in frag.samp) {
+    slope <- append(slope, var.lm[[paste("frags.",f,sep="")]]$coefficients[[2]])
+  }
+  
+  # slope regression predicts the average slope
+  slope.lm <- lm(slope ~ frag.samp)
+  
+  # TDS used an average intercept value for the intercept, however ...
+  inter <- vector()
+  for (f in frag.samp) {
+    inter <- append(inter, var.lm[[paste("frags.",f,sep="")]]$coefficients[[1]])
+  }
+  
+  # ... there's actually quite a bit of variation of the intercept with real data,
+  # so we're going to do a lin regression on the intercept instead.
+  #
+  # (I'm not convinced it's a true linear relationship.  But it's close to linear,
+  # and will certainly perform better than taking the mean intercept ...)
+  #
+  # If you're interested, set the debug flag to TRUE and take a look at the plots generated below ...
+  inter.lm <- lm(inter ~ frag.samp)
+  
+  if (debug == T) {
+    # plots for debugging/checking
+    plot.debug <- function (y,x,l,name="Debug plot") {
+      plot(y ~ x)
+      abline(l)
+      
+      lsum <- summary(l)
+      r2 <- lsum$r.squared
+      
+      legend("topleft",legend=r2,bty='n')
+      title(name)
+      
+      dev.copy(png,paste(name,".png",sep=""));dev.off()
+    }
+    
+    plot.debug(slope,frag.samp,slope.lm,name="correlation of slope")
+    plot.debug(inter,frag.samp,inter.lm,name="correlation of intercepts")
+  }
+  
+  # ok, so putting that all together ...
+  fdr <- function (frags, expr) {
+    inter.test <- inter.lm$coefficients[[2]] * frags + inter.lm$coefficients[[1]]
+    slope.test <- slope.lm$coefficients[[2]] * frags + slope.lm$coefficients[[1]]
+    
+    fdr.out <- exp(slope.test * expr + inter.test)
+    return(fdr.out)
+  }
+  
+  ### Gene expression values ###  
+  cat("Calculating gene values ...\n")
+  
+  count <- 0
+  
+  # unroll chromosomes for speed:
+  for (chromo in unique(genes$chr)) {
+    input.chr <- subset(input.df, chr==chromo)
+    genes.chr <- subset(genes, chr==chromo)
+    for (i in 1:length(genes.chr$name)) {
+      # Roll through each gene
+      
+      # Note: the original script calculated expression values for a table of proteins,
+      # here we just limit ourselves to the FDR for the first column past "chr", "start" and "end"
+      # so if you're thinking of looking at chromatin, for example, put PolII as column 4 in your table!
+      
+      gene.start <- genes.chr[i,"start"] - buffer
+      gene.end <- genes.chr[i,"end"] + buffer
+      
+      gene.start <- ifelse(gene.start < 1, 1, gene.start)
+      
+      # Create data frames for all gatc fragments covering current gene
+      exp <- data.frame(input.chr[ (input.chr$start <= gene.end) 
+                                 & (input.chr$end >= gene.start) 
+                                ,] )
+      
+      gatc.num <- length(exp[,1])
+      
+      # skip if no gatc fragments cover gene :(
+      if (gatc.num == 0) {next}
+      
+      # trim to gene boundaries ...
+      exp$start[1] <- gene.start
+      exp$end[length(exp[,1])] <- gene.end
+      
+      # gene length covered by gatc fragments
+      len <- sum(exp$end-exp$start)
+      
+      # calculate weighted score for each column (representing different proteins)
+      for (j in 4:length(names(input.chr))) {
+        avg[j] <- (sum((exp$end-exp$start)*exp[j]))/len
+      }
+      
+      # make data.frame of averages (to be appended to avg.exp)
+      df <- cbind(avg[1])
+      for (k in 2:length(avg)) {
+        df <- cbind(df,avg[k])
+      }
+      df <- cbind(df,gatc.num)
+      
+      # only fdr for first column for now ...
+      gene.fdr <- fdr(gatc.num,avg[4])
+      df <- cbind(df, gene.fdr)
+      
+      # append current gene to list
+      avg.exp <- rbind(avg.exp,data.frame(name=as.character(genes.chr[i,"name"]), df))
+      count <- count+1
+      if (count %% 50 == 0) {cat(paste(count,"genes averaged ...\r"))}
+    }
+  }
+  
+  avg.exp <- avg.exp[,c(1,5:(length(names(avg.exp))))]
+  names(avg.exp) <- c("name",names(input.df)[c(4:(length(names(input.df))))],"gatc.num","FDR")
+  
+  return(avg.exp)
+}
+
+for (name in in.files) {
+	cat(paste("\nNow working on",name,"...\n"))
+	bname <- basename(name)
+	fname <- sub("\\..*","",bname,perl=T)
+	
+	polii <- read.gff(name,"polii")
+	
+	polii.exp <- gene.exp(polii, iter=as.numeric(op.args[["iter"]]))
+	
+	out <- subset(polii.exp,FDR < op.args[["fdr"]])
+	
+	write.table(polii.exp,paste(fname,"genes.details.csv",sep="."),row.names=F,col.names=T,quote=T,sep="\t")
+	write.table(out$name, paste(fname,"genes",sep="."),row.names=F,col.names=F,quote=F)
+}
+
+cat("\nAll done.\n\n")