changeset 0:1b5bd3b955ed draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_polII_gene_call commit 2e97fdaeafdafe2c63588de17777cd32a798adb1-dirty
author mvdbeek
date Fri, 20 Apr 2018 10:27:20 -0400 (2018-04-20)
parents
children 83ca801f21f9
files damid_polII_gene_call.xml polii.gene.call test-data/details.csv test-data/genes.csv test-data/genes.gff test-data/polII.bed
diffstat 6 files changed, 565 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/damid_polII_gene_call.xml	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,50 @@
+<tool id="damidseq_polII_gene_call" name="Calculate polII occupancy" version="1.0.2">
+    <description>from DamIDseq data</description>
+    <requirements>
+        <requirement type="package" version="3.4">R</requirement>
+    </requirements>
+    <version_command><![CDATA[Rscript '$__tool_directory__/polii.gene.call' | grep -o "v.*"]]></version_command>
+    <command detect_errors="aggressive"><![CDATA[
+ln -fs '$genes' genes.gff &&
+ln -fs '$input_file' input.$input_file.ext &&
+Rscript '$__tool_directory__/polii.gene.call' --genes.file=genes.gff input.$input_file.ext --iter=$iter --fdr=$fdr && 
+mv *.details.csv details.csv &&
+mv *.genes genes.csv
+    ]]></command>
+    <inputs>
+        <param name="input_file" type="data" format="gff,bed,bedgraph" label="Select dam-fusion/dam ratio files" help="You can use damidseq_core to produce this file."/>
+        <param name="genes" type="data" format="gff" label="Select a GFF file containing genes for which to calculate occupancy"/>
+        <param name="iter" type="integer" min="1" value="50000" label="Number of iterations for calculation of FDR"/>
+        <param name="fdr" type="float" min="0" max="1" value="0.01" label="FDR cutoff for gene list"/>
+    </inputs>
+    <outputs>
+        <data name="details" format="csv" from_work_dir="details.csv" label="${tool.name} gene details on ${on_string}"/>
+        <data name="sig_genes" format="csv" from_work_dir="genes.csv" label="${tool.name} genes on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="polII.bed" ftype="bedgraph"/>
+            <param name="genes" value="genes.gff" ftype="gff"/>
+            <param name="fdr" value="1"/>
+            <param name="iter" value="500"/>
+            <output name="details" value="details.csv" compare="sim_size"/>
+            <output name="sig_genes" value="genes.csv" compare="sim_size"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+What it does
+------------
+
+Rscript for calculating average PolII occupancy and FDR for RNA Pol II DamID datasets, based on original algorithms developed by Tony Southall.
+The script processes datafiles in gatc.bedgraph or gatc.gff format, such as those generated by damidseq_pipeline.
+
+Differences between RNA pol II DamID and RNAseq
+-----------------------------------------------
+Although both are methods for transcriptional profiling, please be aware that there may be differences between these two methods. In particular, transcript abundancy as assessed through RNAseq will depend on transcript stability, whereas RNA pol II occupancy may provide a better indication of transcription levels.
+
+
+        ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btv386</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/polii.gene.call	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,372 @@
+#!/usr/bin/env Rscript 
+
+# polii.gene.call.r
+# Copyright © 2014-15, Owen Marshall
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version. 
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details. 
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 
+# USA
+
+### FDR calcs ###
+# Method based on original perl scripts by Tony Southall (TDS) as published in
+# Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020
+#
+# Significant modifications to the original methodology include:
+# * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data
+# * using a linear regression for the final intercept value rather than using the average intercept value for all conditions
+# -- both of these should increase the accuracy of the final FDR value.
+
+version <- "1.0.2"
+cat(paste("polii.gene.call v",version,"\n", sep=""))
+
+library(tools)
+
+### Read CLI options
+input.args <- commandArgs(trailingOnly = TRUE)
+
+in.files <- vector()
+read.ops <- function (x) {
+  for (op in x) {
+	if (any(grepl("^--",op))) {
+		op <- gsub("^--","",op)
+		y <- unlist(strsplit(op,"="))
+		
+		if (y[1] == "help") {
+		  cat(paste("Usage: Rscript polii.gene.call.r --genes.file=[some_genes_file.gff] [list of .gatc.bedgraph or .gatc.gff ratio files to process]\n\n", sep=""))
+		  
+		  cat("Options:\n")
+		  for (n in names(op.args)) {
+			cat(paste("  ",n,"=",op.args[[n]],"\n",sep=""))
+		  }
+		  cat("\n")
+		  quit("no",1)
+		}
+		
+		if (!is.null(op.args[[ y[1] ]])) {
+		  op.args[[ y[1] ]] <<- y[2]
+		} else {
+		  cat("Error: Option",y[1],"not recognised ...\n")
+		  quit("no",1)
+		}
+	} else {
+		in.files <<- c(in.files,op)
+	}
+  }
+}
+
+write.ops <- function () {
+  out.df <- data.frame()
+  for (n in names(op.args)) {
+	v <<- as.character(op.args[[n]])
+	df.line <- data.frame(
+	  option=n,
+	  value=v
+	)
+	out.df <- rbind(out.df, df.line)
+  }
+  write.table(out.df,"input.args.single.txt",row.names=F)
+}
+
+op.args <- list(
+  "genes.file" = "/mnt/data/Genomes/dmel_release/DmR6/DmR6.genes.gff",
+  "iter" = 50000,
+  "fdr" = 0.01
+)
+
+read.ops(input.args)
+
+if (length(in.files) == 0) {
+	cat("Usage: Rscript polii.gene.call.r [list of .gatc.gff ratio files to process]\n\n")
+	quit("no",1)
+}
+
+write.ops()
+
+### save random seed for future reproducibility
+dump.random <- runif(1)
+my.seed  <- .Random.seed
+write.table(my.seed,".randomseed")
+
+### read genes file
+cat("Reading genes data file ...\n")
+genes.file=op.args[["genes.file"]]
+genes <- read.table(genes.file, comment.char="#", sep="\t", quote="", fill=T)
+names(genes) <-  c('chr','source','type','start','end','score','strand','c','details')
+
+# only subset if there is a type termed "gene"
+if (any(genes$type == 'gene')) {
+  genes <- subset(genes, type=='gene')
+}
+
+genes$name <- sapply(genes$details, FUN = function (x) {regmatches(x,gregexpr("(?<=Name=).*?(?=;)", x, perl=T))} )
+genes <- genes[,c('chr','start','end','strand','name')]
+genes$chr <- gsub("^chr","",genes$chr,perl=T)
+
+if (nrow(genes) == 0) {
+	cat("Error: unable to extract gene information from genes file\n\n")
+	quit("no",1)
+}
+
+### functions
+read.gff <- function (x,name="score") {
+  fn.ext <- file_ext(x)
+  
+  if (grepl("gff",ignore.case=T,fn.ext)) {
+	temp.data <- read.table(x,row.names=NULL)
+	if (ncol(temp.data) > 5) {
+	  # GFF
+	  trim.data <- temp.data[,c(1,4,5,6)]
+	} else {
+		cat("Error: file does not appear to be in GFF format\n\n")
+		quit("no",1)
+	}
+  } else if (grepl("bed",ignore.case=T,fn.ext)) {
+	temp.data <- read.table(x,row.names=NULL,skip=1)
+	if (ncol(temp.data) == 4) {
+		# bedgraph
+		trim.data <- temp.data
+	} else {
+		cat("Error: file does not appear to be in bedGraph format\n\n")
+		quit("no",1)
+	}
+  } else {
+	cat("Error: input file does not appear to be in bedGraph or GFF format ...\n\n")
+	quit("no",1)
+  }
+  
+  names(trim.data) <- c("chr","start","end",name)
+  
+  trim.data$chr <- gsub("^chr","",trim.data$chr,perl=T)
+  
+  return(trim.data)
+}
+
+gene.exp <- function (input.df, buffer=0, iter=50000, debug=F) {
+  
+  avg.exp <- data.frame(input.df[1,c(4:(length(names(input.df))))])
+  avg <- vector(length=(length(names(input.df)) - 4))
+  avg.exp <- avg.exp[0,]
+  
+  ### FDR calcs ###
+  # Method based off perl scripts by Tony Southall (TDS) as published in
+  # Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020
+  #
+  # Significant modifications to the original methodology include:
+  # * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data
+  # * using a linear regression for the final intercept value rather than using the average intercept value for all conditions
+  # -- both of these should increase the accuracy of the final FDR value.
+  
+  input.len <- length(input.df[,1])
+  frag.samp <- c(1,2,3,4,6,8,10,12,15)
+  thres.samp <- c(0.1,0.2,0.3,0.4,0.5,0.65,0.8,1.0,1.5,2.0)
+  
+  rand <- list()
+  
+  for (thres in thres.samp) {
+    
+    cat(paste("  Calculating FDR for threshold",thres,"\n",sep=" "))
+    
+    # init vars
+    for (f in frag.samp) {
+      # List names are, e.g. frag 1, thres 0.2: rand[[thres.1.0.2]]
+      rand[[paste("thres.",f,".",thres,sep="")]] <- 0;
+    }
+    
+    for (i in 1:iter) {
+      if (i %% 200 == 0) {cat(paste("  iter",i,"\r"))}
+      # get random sample for different fragment lengths
+      
+      rand.samp <- list()
+      
+      for (f in frag.samp) {
+        # Using the fourth column as we're only calculating FDR for one sample ...
+        rand.samp[[paste("rand.",f,sep="")]] <- mean(input.df[runif(f,1,input.len),4])
+      }
+      
+      # count number of times exp > thres
+      for (f in frag.samp) {
+        if (rand.samp[[paste("rand.",f,sep="")]] > thres) {rand[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]] + 1}
+      }
+    }
+  }
+  
+  rand.fdr <- list()
+  
+  for (thres in thres.samp) {
+    for (f in frag.samp) {
+      rand.fdr[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]]/iter
+    }
+  }
+  
+  cat("Fitting curves ...\n")
+  
+  # curve fit: fdr vs thresholds
+  var.thres <- list()
+  
+  for (thres in thres.samp) {
+    for (f in frag.samp) {
+      var.thres[[paste("frags.",f,sep="")]] <- append(var.thres[[paste("frags.",f,sep="")]], rand.fdr[[paste("thres.",f,".",thres,sep="")]])
+    }
+  }
+  
+  inf.log.lm <- function (v) {
+    non.inf <- log(v) != -Inf
+    ret <- lm(log(v)[non.inf] ~ thres.samp[non.inf])
+    return(ret)
+  }
+  
+  # The relationship is exponential, so we need log data for a linear regression
+  # (in R, linear regression is: y = lm$coefficients[[2]]x + lm$coefficients[[1]] ... )
+  var.lm <- list()
+  for (f in frag.samp) {
+    var.lm[[paste("frags.",f,sep="")]] <- inf.log.lm(var.thres[[paste("frags.",f,sep="")]])
+  }
+  
+  # ... and now we do a linear regression on the slopes and intercepts of our previous regressions
+  # (This is the clever bit, and it actually seems to work.  The correlation of slope to fragment size is linear ...
+  # By doing this on the slope and intercept, we can now predict the FDR for any number of fragments with any expression.)
+  slope <- vector()
+  for (f in frag.samp) {
+    slope <- append(slope, var.lm[[paste("frags.",f,sep="")]]$coefficients[[2]])
+  }
+  
+  # slope regression predicts the average slope
+  slope.lm <- lm(slope ~ frag.samp)
+  
+  # TDS used an average intercept value for the intercept, however ...
+  inter <- vector()
+  for (f in frag.samp) {
+    inter <- append(inter, var.lm[[paste("frags.",f,sep="")]]$coefficients[[1]])
+  }
+  
+  # ... there's actually quite a bit of variation of the intercept with real data,
+  # so we're going to do a lin regression on the intercept instead.
+  #
+  # (I'm not convinced it's a true linear relationship.  But it's close to linear,
+  # and will certainly perform better than taking the mean intercept ...)
+  #
+  # If you're interested, set the debug flag to TRUE and take a look at the plots generated below ...
+  inter.lm <- lm(inter ~ frag.samp)
+  
+  if (debug == T) {
+    # plots for debugging/checking
+    plot.debug <- function (y,x,l,name="Debug plot") {
+      plot(y ~ x)
+      abline(l)
+      
+      lsum <- summary(l)
+      r2 <- lsum$r.squared
+      
+      legend("topleft",legend=r2,bty='n')
+      title(name)
+      
+      dev.copy(png,paste(name,".png",sep=""));dev.off()
+    }
+    
+    plot.debug(slope,frag.samp,slope.lm,name="correlation of slope")
+    plot.debug(inter,frag.samp,inter.lm,name="correlation of intercepts")
+  }
+  
+  # ok, so putting that all together ...
+  fdr <- function (frags, expr) {
+    inter.test <- inter.lm$coefficients[[2]] * frags + inter.lm$coefficients[[1]]
+    slope.test <- slope.lm$coefficients[[2]] * frags + slope.lm$coefficients[[1]]
+    
+    fdr.out <- exp(slope.test * expr + inter.test)
+    return(fdr.out)
+  }
+  
+  ### Gene expression values ###  
+  cat("Calculating gene values ...\n")
+  
+  count <- 0
+  
+  # unroll chromosomes for speed:
+  for (chromo in unique(genes$chr)) {
+    input.chr <- subset(input.df, chr==chromo)
+    genes.chr <- subset(genes, chr==chromo)
+    for (i in 1:length(genes.chr$name)) {
+      # Roll through each gene
+      
+      # Note: the original script calculated expression values for a table of proteins,
+      # here we just limit ourselves to the FDR for the first column past "chr", "start" and "end"
+      # so if you're thinking of looking at chromatin, for example, put PolII as column 4 in your table!
+      
+      gene.start <- genes.chr[i,"start"] - buffer
+      gene.end <- genes.chr[i,"end"] + buffer
+      
+      gene.start <- ifelse(gene.start < 1, 1, gene.start)
+      
+      # Create data frames for all gatc fragments covering current gene
+      exp <- data.frame(input.chr[ (input.chr$start <= gene.end) 
+                                 & (input.chr$end >= gene.start) 
+                                ,] )
+      
+      gatc.num <- length(exp[,1])
+      
+      # skip if no gatc fragments cover gene :(
+      if (gatc.num == 0) {next}
+      
+      # trim to gene boundaries ...
+      exp$start[1] <- gene.start
+      exp$end[length(exp[,1])] <- gene.end
+      
+      # gene length covered by gatc fragments
+      len <- sum(exp$end-exp$start)
+      
+      # calculate weighted score for each column (representing different proteins)
+      for (j in 4:length(names(input.chr))) {
+        avg[j] <- (sum((exp$end-exp$start)*exp[j]))/len
+      }
+      
+      # make data.frame of averages (to be appended to avg.exp)
+      df <- cbind(avg[1])
+      for (k in 2:length(avg)) {
+        df <- cbind(df,avg[k])
+      }
+      df <- cbind(df,gatc.num)
+      
+      # only fdr for first column for now ...
+      gene.fdr <- fdr(gatc.num,avg[4])
+      df <- cbind(df, gene.fdr)
+      
+      # append current gene to list
+      avg.exp <- rbind(avg.exp,data.frame(name=as.character(genes.chr[i,"name"]), df))
+      count <- count+1
+      if (count %% 50 == 0) {cat(paste(count,"genes averaged ...\r"))}
+    }
+  }
+  
+  avg.exp <- avg.exp[,c(1,5:(length(names(avg.exp))))]
+  names(avg.exp) <- c("name",names(input.df)[c(4:(length(names(input.df))))],"gatc.num","FDR")
+  
+  return(avg.exp)
+}
+
+for (name in in.files) {
+	cat(paste("\nNow working on",name,"...\n"))
+	bname <- basename(name)
+	fname <- sub("\\..*","",bname,perl=T)
+	
+	polii <- read.gff(name,"polii")
+	
+	polii.exp <- gene.exp(polii, iter=as.numeric(op.args[["iter"]]))
+	
+	out <- subset(polii.exp,FDR < op.args[["fdr"]])
+	
+	write.table(polii.exp,paste(fname,"genes.details.csv",sep="."),row.names=F,col.names=T,quote=T,sep="\t")
+	write.table(out$name, paste(fname,"genes",sep="."),row.names=F,col.names=F,quote=F)
+}
+
+cat("\nAll done.\n\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/details.csv	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,11 @@
+"name"	"polii"	"gatc.num"	"FDR"
+"CR46254"	0.356507105200981	5	0.791940813158031
+"CR45339"	1.20628998889217	2	0.260660674330585
+"CR45340"	1.93867996741526	5	0.0511077462101616
+"dbr"	1.2951093442343	22	0.0152079814861417
+"CR44987"	2.49935540607138	24	1.97089127262964e-05
+"galectin"	2.38923790079731	21	9.84816956841954e-05
+"CG11374"	0.00926960982128576	5	1.4451260959191
+"net"	0.663390756192582	14	0.394156836924802
+"Zir"	-0.337727067030228	30	181.086928253756
+"CG11377"	0.263311791595758	9	1.16681003833418
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genes.csv	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,5 @@
+CR46254
+CR45339
+CR45340
+dbr
+net
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genes.gff	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,16 @@
+2L	FlyBase	gene	54817	55767	.	+	.	ID=FBgn0267987;Name=CR46254;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR46254;derived_computed_cyto=21B1-21B1
+2L	FlyBase	ncRNA	54817	55767	.	+	.	ID=FBtr0347585;Name=CR46254-RA;Parent=FBgn0267987;Dbxref=FlyBase_Annotation_IDs:CR46254-RA;score_text=Weakly Supported;score=0
+2L	FlyBase	gene	65999	66242	.	+	.	ID=FBgn0266878;Name=CR45339;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR45339,EntrezGene:19834983,GenomeRNAi:19834983;derived_computed_cyto=21B1-21B1
+2L	FlyBase	ncRNA	65999	66242	.	+	.	ID=FBtr0345732;Name=CR45339-RA;Parent=FBgn0266878;Dbxref=FlyBase_Annotation_IDs:CR45339-RA,REFSEQ:NR_123995;score_text=Weakly Supported;score=0
+2L	FlyBase	gene	66318	66524	.	+	.	ID=FBgn0266879;Name=CR45340;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR45340,EntrezGene:19835538,GenomeRNAi:19835538;derived_computed_cyto=21B1-21B1
+2L	FlyBase	ncRNA	66318	66524	.	+	.	ID=FBtr0345733;Name=CR45340-RA;Parent=FBgn0266879;Dbxref=FlyBase_Annotation_IDs:CR45340-RA,REFSEQ:NR_123996;score_text=Weakly Supported;score=0
+2L	FlyBase	gene	66482	71390	.	+	.	ID=FBgn0067779;Name=dbr;fullname=debra;Alias=FBgn0028472,FBgn0043386,CG11371,Dbr,EP456,BcDNA:LD26519,EP0456,Debra;Ontology_term=SO:0000010,SO:0000087,GO:0005771,GO:0007616,GO:0005634,GO:0008270;Dbxref=FlyBase:FBan0011371,FlyBase_Annotation_IDs:CG11371,GB_protein:AAF51565,GB:AA942362,GB:AF184228,GB_protein:AAD55739,GB:AI109558,GB:AI402147,GB:AQ026007,GB:AQ073408,GB:AW942847,GB:AY047576,GB_protein:AAK77308,GB:CL527762,GB:CZ468654,GB:CZ470443,GB:CZ470758,GB:CZ471260,GB:CZ474676,GB:CZ474677,GB:CZ477845,GB:CZ482562,GB:CZ487746,UniProt/TrEMBL:Q9V474,INTERPRO:IPR012934,OrthoDB7_Drosophila:EOG79H5RG,GB_protein:AFH03489,GB_protein:AFH03487,GB_protein:AFH03486,GB_protein:AFH03488,EntrezGene:33161,UniProt/TrEMBL:Q961T5,BIOGRID:59424,FlyAtlas:CG11371-RB,InterologFinder:33161,GenomeRNAi:33161,INTERACTIVEFLY:/genebrief/debra.htm;gbunit=AE014134;derived_computed_cyto=21B1-21B1
+2L	FlyBase	gene	71039	73836	.	-	.	ID=FBgn0266322;Name=CR44987;Ontology_term=SO:0000011,SO:0000087,SO:0000077;Dbxref=EntrezGene:19835103,FlyBase_Annotation_IDs:CR44987,GenomeRNAi:19835103;derived_computed_cyto=21B1-21B1
+2L	FlyBase	ncRNA	71039	73836	.	-	.	ID=FBtr0344052;Name=CR44987-RA;Parent=FBgn0266322;Dbxref=REFSEQ:NR_123998,FlyBase_Annotation_IDs:CR44987-RA;score_text=Weakly Supported;score=0
+2L	FlyBase	ncRNA	71039	73836	.	-	.	ID=FBtr0344053;Name=CR44987-RB;Parent=FBgn0266322;Dbxref=REFSEQ:NR_123997,FlyBase_Annotation_IDs:CR44987-RB;score_text=Weakly Supported;score=0
+2L	FlyBase	gene	71757	76211	.	+	.	ID=FBgn0031213;Name=galectin;fullname=galectin;Alias=CG11372,Dmgal;Ontology_term=SO:0000010,SO:0000087,GO:0016936,GO:0005829,GO:0015143,GO:0008039,GO:0030246;Dbxref=FlyBase:FBan0011372,FlyBase_Annotation_IDs:CG11372,GB_protein:AAF51564,GB_protein:ACZ94130,GB_protein:ACZ94129,GB:AF338142,GB_protein:AAL87743,GB:AY071294,GB_protein:AAL48916,GB:BI236456,GB:CZ467438,GB:CZ467439,GB:CZ468006,GB:CZ481733,GB:CZ481734,GB:CZ485874,GB:CZ485875,UniProt/TrEMBL:Q9VPI6,INTERPRO:IPR001079,INTERPRO:IPR013320,GB_protein:AGB92325,UniProt/TrEMBL:C0PUZ0,UniProt/TrEMBL:M9NCX9,OrthoDB7_Drosophila:EOG7XQ9R2,OrthoDB7_Diptera:EOG76TN7W,GB_protein:AFH03490,EntrezGene:33162,UniProt/TrEMBL:E1JHP9,UniProt/TrEMBL:E1JHQ0,BDGP_clone:RE32514,OrthoDB7_Insecta:EOG76QSHF,OrthoDB7_Arthropoda:EOG7WTGG5,OrthoDB7_Metazoa:EOG7S7SFG,InterologFinder:33162,BIOGRID:59425,FlyAtlas:CG11372-RA,GenomeRNAi:33162;gbunit=AE014134;derived_computed_cyto=21B1-21B1
+2L	FlyBase	gene	76348	77783	.	+	.	ID=FBgn0031214;Name=CG11374;Ontology_term=SO:0000010,SO:0000087,GO:0016936,GO:0015143,GO:0030246;Dbxref=FlyBase:FBan0011374,FlyBase_Annotation_IDs:CG11374,UniProt/TrEMBL:Q9VPI7,INTERPRO:IPR001079,INTERPRO:IPR013320,OrthoDB7_Drosophila:EOG735GGT,GB_protein:AAF51563,EntrezGene:33163,OrthoDB7_Insecta:EOG76QSHF,OrthoDB7_Arthropoda:EOG7WTGG5,OrthoDB7_Metazoa:EOG7S7SFG,InterologFinder:33163,FlyAtlas:CG11374-RA,GenomeRNAi:33163;gbunit=AE014134;derived_computed_cyto=21B1-21B1
+2L	FlyBase	gene	82421	87387	.	-	.	ID=FBgn0002931;Name=net;fullname=net;Alias=FBgn0031215,CG11450,Shout,Group IId,shout,bHLHc42;Ontology_term=SO:0000010,SO:0000087,GO:0005634,GO:0003700,GO:0008586,GO:0006355,GO:0007474,GO:0000122,GO:0046983;Dbxref=FlyBase:FBan0011450,FlyBase_Annotation_IDs:CG11450,GB_protein:AAF51562,GB:AF303350,GB_protein:AAK14073,GB:CZ478919,GB:CZ478940,UniProt/TrEMBL:Q9VPI8,INTERPRO:IPR011598,GB_protein:AGB92326,OrthoDB7_Drosophila:EOG774925,OrthoDB7_Diptera:EOG786TM8,EntrezGene:45339,BDGP_clone:PC00159,UniProt/TrEMBL:Q7KH24,OrthoDB7_Insecta:EOG73VJM7,OrthoDB7_Arthropoda:EOG7SRH34,OrthoDB7_Metazoa:EOG73NG5F,BIOGRID:69604,FlyAtlas:CG11450-RA,InterologFinder:45339,GenomeRNAi:45339,INTERACTIVEFLY:/gene/net1.htm;gbunit=AE014134;derived_computed_cyto=21B1-21B1
+2L	FlyBase	gene	94739	102086	.	+	.	ID=FBgn0031216;Name=Zir;fullname=Zizimin-related;Alias=FBgn0025528,anon-EST:Posey54,Dm zir,CG11376,FBgn 31216;Ontology_term=SO:0000010,SO:0000087,GO:0035010,GO:0006909,GO:0005089,GO:0035023,GO:0035011,GO:0005575;Dbxref=FlyBase:FBan0011376,FlyBase_Annotation_IDs:CG11376,GB_protein:AAF51561,GB:AA391952,GB:AF083301,GB:AW942162,GB:AZ877629,GB:BT015273,GB_protein:AAT94502,GB:CZ468078,GB:CZ468665,GB:CZ469343,GB:CZ470525,GB:CZ470937,GB:CZ480810,GB:CZ480811,GB:CZ483732,UniProt/TrEMBL:Q9VPI9,INTERPRO:IPR027007,INTERPRO:IPR027357,INTERPRO:IPR026791,OrthoDB7_Drosophila:EOG7DP1D5,OrthoDB7_Diptera:EOG78133C,EntrezGene:33165,INTERPRO:IPR021816,INTERPRO:IPR010703,BDGP_clone:LD10883,OrthoDB7_Insecta:EOG71CSK0,OrthoDB7_Arthropoda:EOG735G9K,OrthoDB7_Metazoa:EOG7P8P7G,BIOGRID:59427,InterologFinder:33165,FlyAtlas:CG11376-RA,Fly-FISH:CG11376,GenomeRNAi:33165;gbunit=AE014134;derived_computed_cyto=21B1-21B2
+2L	FlyBase	gene	102380	104142	.	+	.	ID=FBgn0031217;Name=CG11377;Alias=CT31762;Ontology_term=SO:0000010,SO:0000087,GO:0005509,SO:0000548;Dbxref=FlyBase:FBan0011377,FlyBase_Annotation_IDs:CG11377,GB_protein:AAF51560,GB:AY071548,GB_protein:AAL49170,GB:BI371522,UniProt/TrEMBL:Q9VPJ0,INTERPRO:IPR000742,INTERPRO:IPR002049,INTERPRO:IPR006212,INTERPRO:IPR009030,INTERPRO:IPR013032,INTERPRO:IPR018097,INTERPRO:IPR021852,INTERPRO:IPR001881,GB_protein:AGB92327,UniProt/TrEMBL:M9PDP6,OrthoDB7_Drosophila:EOG799F97,OrthoDB7_Diptera:EOG7QK9VM,EntrezGene:33166,OrthoDB7_Insecta:EOG7GR8VQ,OrthoDB7_Arthropoda:EOG7PSCW4,OrthoDB7_Metazoa:EOG7Z3F4B,InterologFinder:33166,BIOGRID:59428,FlyAtlas:CG11377-RA,Fly-FISH:CG11377,GenomeRNAi:33166;gbunit=AE014134;derived_computed_cyto=21B2-21B2
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/polII.bed	Fri Apr 20 10:27:20 2018 -0400
@@ -0,0 +1,111 @@
+2L	54816	55594	0.0552525047833769
+2L	54816	55594	0.0552525047833769
+2L	55594	55635	0.818636112867944
+2L	55594	55635	0.818636112867944
+2L	55635	55767	1.09961409723048
+2L	55635	55767	1.09961409723048
+2L	65998	66242	1.20628998889217
+2L	65998	66242	1.20628998889217
+2L	66317	66422	1.20628998889217
+2L	66317	66422	1.20628998889217
+2L	66422	66524	2.5583945646271
+2L	66422	66524	2.5583945646271
+2L	66481	67274	2.5583945646271
+2L	67274	67324	1.98680486782346
+2L	67324	67817	2.08837880305762
+2L	67817	67905	1.86112283695141
+2L	67905	68330	1.62486655160691
+2L	68330	68393	1.54900967010231
+2L	68393	68886	1.11119688964443
+2L	68886	69103	0.869617644974253
+2L	69103	69241	0.387235732197361
+2L	69241	69301	0.418133867661119
+2L	69301	69792	0.899479539841806
+2L	69792	69796	0.823796983227429
+2L	69796	69820	0.823796983227429
+2L	69820	69878	0.549942264108176
+2L	69878	69959	0.761892304255089
+2L	69959	70111	1.1531457850107
+2L	70111	71390	0.846518854258207
+2L	71038	71492	0.846518854258207
+2L	71038	71492	0.846518854258207
+2L	71038	71492	0.846518854258207
+2L	71492	71990	3.39665608645984
+2L	71492	71990	3.39665608645984
+2L	71492	71990	3.39665608645984
+2L	71756	71990	3.39665608645984
+2L	71990	72652	3.40816353628327
+2L	71990	72652	3.40816353628327
+2L	71990	72652	3.40816353628327
+2L	71990	72652	3.40816353628327
+2L	72652	72787	2.8478823465353
+2L	72652	72787	2.8478823465353
+2L	72652	72787	2.8478823465353
+2L	72652	72787	2.8478823465353
+2L	72787	73420	2.9137462183582
+2L	72787	73420	2.9137462183582
+2L	72787	73420	2.9137462183582
+2L	72787	73420	2.9137462183582
+2L	73420	73836	1.07916756002746
+2L	73420	73836	1.07916756002746
+2L	73420	73836	1.07916756002746
+2L	73420	75584	1.07916756002746
+2L	75584	76211	0.351389501360189
+2L	76347	76557	0.351389501360189
+2L	76557	76602	0.111868083546615
+2L	76602	76694	-0.103143189574513
+2L	76694	77646	-0.0832154089110144
+2L	77646	77783	0.171807761122692
+2L	82420	82686	0.950799825059922
+2L	82686	82987	0.860805501917355
+2L	82987	83358	0.9016755036993
+2L	83358	83622	0.4912523962063
+2L	83622	83828	0.574814522065537
+2L	83828	84509	0.950829410021384
+2L	84509	84651	1.13276692007025
+2L	84651	84867	1.40640395753805
+2L	84867	85682	0.810578385233812
+2L	85682	85687	0.675864965432377
+2L	85687	86075	0.661825886242909
+2L	86075	86215	0.349192348167705
+2L	86215	86316	0.0920730976226941
+2L	86316	87387	0.102318953834602
+2L	94738	95132	-0.106500647321927
+2L	95132	95199	-0.0638569168395719
+2L	95199	95277	0.217427011118734
+2L	95277	95514	0.267705566854591
+2L	95514	95662	0.0612754941064784
+2L	95662	95731	-0.429957911575681
+2L	95731	95761	-0.556019144100863
+2L	95761	96385	-0.725477533205267
+2L	96385	96412	-0.655603418929584
+2L	96412	96491	-0.409536719736943
+2L	96491	97369	-0.226244771136778
+2L	97369	97583	-0.308700411197029
+2L	97583	97642	0.0394679913725759
+2L	97642	97825	0.217457643667941
+2L	97825	98118	0.0793574230565764
+2L	98118	98244	-0.00404328552099076
+2L	98244	98316	-0.285051719568665
+2L	98316	98529	-0.52337489352199
+2L	98529	98813	-0.853641568435678
+2L	98813	99183	-1.20840278715919
+2L	99183	99266	-0.57988161546026
+2L	99266	99666	-0.0718062889470329
+2L	99666	99974	-0.313463689842528
+2L	99974	100074	-0.962054132059592
+2L	100074	100636	-1.19450144164682
+2L	100636	100841	-0.134973412111241
+2L	100841	101514	0.0638744109411819
+2L	101514	101553	-0.0533119771867642
+2L	101553	102022	-0.0462385325318383
+2L	102379	102687	-0.0746051708557362
+2L	102022	102086	-0.0746051708557362
+2L	102687	102730	-0.0466007641328778
+2L	102730	102832	0.129772291996865
+2L	102832	103057	0.328567918234615
+2L	103057	103345	0.287560490314009
+2L	103345	103349	0.278800162489859
+2L	103349	103506	-0.00904364838917334
+2L	103506	103683	0.126271941123851
+2L	103683	104142	0.646700520857224