changeset 9:5b208d4d89e5 draft

Uploaded
author fcaramia
date Tue, 04 Dec 2012 20:15:26 -0500
parents 4c5c3994bfcb
children 2432df265dad
files methylation_analysis_bismark/bismark_bed_files.loc.sample methylation_analysis_bismark/bismark_indices.loc.sample methylation_analysis_bismark/methylation_analysis/bismark.xml methylation_analysis_bismark/methylation_analysis/bismark_wrapper.pl methylation_analysis_bismark/methylation_analysis/differential_methylation.R methylation_analysis_bismark/methylation_analysis/differential_methylation.xml methylation_analysis_bismark/methylation_analysis/methylation_by_region_converter.pl methylation_analysis_bismark/methylation_analysis/methylation_by_region_converter.xml methylation_analysis_bismark/methylation_analysis/methylation_extractor.xml methylation_analysis_bismark/methylation_analysis/methylation_extractor_wrapper.pl methylation_analysis_bismark/tool_data_table_conf.xml.sample
diffstat 11 files changed, 1233 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/bismark_bed_files.loc.sample	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,4 @@
+# Bed file made permanent for experimental reasons, user can also input file in galaxy
+
+#bismark.bed.agilent.human	Agilent Human Methyl-Seq	/galaxy/bed_files/Agilent_Human_Methyl-Seq/S03770311_Covered.bed
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/bismark_indices.loc.sample	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,7 @@
+
+# Genomes have been coverted using bismark_genome_preparation script. Index points to the directory containing the genome and the methylated genome folder
+
+#bismark.hg19.GRCh37.62	hg19	hg19 ensembl GRCh37.62	/galaxy/indexes/human/ensembl_GRCh37.62/bowtie2/illumina/
+#bismark.hg19.b37	b37	hg19 1000 genomes b37	/galaxy/indexes/human/g1k_v37/bowtie2/illumina/
+#bismark.mm9.NCBIM37.62	 mm9	mm9 ensembl NCBIM37.62	/galaxy/indexes/mouse/ensembl_NCBIM37.62/bowtie2/illumina/
+#bismark.dm3.BDGP5.25.62	dm3	dm3 ensembl BDGP5.25.62	/galaxy/indexes/fruitfly/ensembl_BDGP5.25.62/bowtie2/illumina/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/bismark.xml	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,182 @@
+<tool id="bismark_tool" name="Bismark" version="0.7.6">
+  <description>: A bisulfite read mapper and methylation caller</description>
+  <requirements>
+    <requirement type="package" version="0.1.16">samtools</requirement>
+    <requirement type="package" version="0.12.7">bowtie2</requirement>
+    <requirement type="package" version="0.7.6">bismark</requirement>
+  </requirements>
+  <command interpreter="perl">
+    
+	bismark_wrapper.pl
+	
+	
+	"GENOME::${genome.fields.path}"       
+	
+	
+	
+	#if str($option_input.input_option) == "mates":
+		"MATES::$option_input.file_mate1::$option_input.file_mate2"
+	#else
+		"SINGLES::$option_input.file_single"
+	#end if
+	
+	#if str($format_option) == "fasta":
+		"FORMAT::--fasta"
+	#else
+		"FORMAT::--fastq"
+	#end if
+	
+	#if str($non_directional) == "ON":
+		"DIRECTIONAL::--non_directional"
+	#end if
+	
+	"OUTPUT::$output"
+	"SUMMARY::$summary"
+	
+  </command>
+	<inputs>
+
+		<param name="genome" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+			<options from_data_table="bismark_indexes">
+				<filter type="sort_by" column="2"/>
+				<validator type="no_options" message="No indexes are available for the selected input dataset"/>
+			</options>
+		</param>
+		
+		<param name="format_option" type="select" label="sample format">
+			<option value="fastq" selected="true">fastq</option>
+			<option value="fasta">fasta</option>
+		</param>
+		
+		
+		<conditional name="option_input">
+			<param name="input_option" type="select" label="Input files">
+				<option value="mates" selected="true">mates</option>
+				<option value="singles">singles</option>
+			</param>
+			<when value="mates">
+				<param format="fasta, fastq" name="file_mate1" type="data" label="Mate 1" help=""/>
+				<param format="fasta, fastq" name="file_mate2" type="data" label="Mate 2" help=""/>
+			</when>
+			<when value="singles">
+				<param format="fasta, fastq" name="file_single" type="data" label="Single" help=""/>
+			</when>
+		</conditional>	
+		
+		<param name="non_directional" type="select" label="non-directional" help="" optional="true">
+			<option value="ON" selected="true">ON</option>
+			<option value="OFF">OFF</option>
+		</param>
+		
+	</inputs>
+	<outputs>
+		<data name="summary" format="txt" label="Bismark Sumary" />
+		<data format="bam" name="output" label="${tool.name} on ${on_string}">
+			<actions>
+				<action type="metadata" name="dbkey">
+					<option type="from_data_table" name="bismark_indexes" column="1" offset="0">
+					<filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+					<filter type="param_value" ref="genome" column="0"/>
+					</option>
+				</action>
+			</actions>
+		</data>
+	</outputs>
+	<help>
+|
+
+
+**Reference**
+	
+  http://www.bioinformatics.babraham.ac.uk/projects/bismark/
+  
+-----
+
+**What it does**
+
+Bismark takes in FastA or FastQ files and aligns the reads to a specified bisulfite genome. 
+Sequence reads are transformed into a bisulfite converted forward strand version (C->T conversion)
+or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
+Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
+(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
+forward strand, by doing this alignments will produce the same positions). These 4 instances of 
+Bowtie (1 or 2) are run in parallel. The sequence file(s) are then read in again sequence by sequence 
+to pull out the original sequence from the genome and determine if there were any protected C's present or not.
+
+As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
+re-enabled by using --non_directional.
+
+The final output of Bismark is in SAM format by default. But for storage restrictions the output is compressed (BAM).
+
+
+-----
+ 
+**Required Parameters**
+
+::
+
+  -q/--fastq             The query input files (specified as mate1,mate2 or singles are FASTQ
+                         files (usually having extension .fg or .fastq). This is the default. See also
+                         --solexa-quals.
+
+  -f/--fasta             The query input files (specified as mate1,mate2 or singles are FASTA
+                         files (usually havin extension .fa, .mfa, .fna or similar). All quality values
+                         are assumed to be 40 on the Phred scale.
+
+  -1 mates1              List of files containing the #1 mates (filename usually includes
+                         "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+                         correspond file-for-file and read-for-read with those specified in mates2.
+                         Reads may be a mix of different lengths. Bismark will produce one mapping result
+                         and one report file per paired-end input file pair.
+
+  -2 mates2              List of files containing the #2 mates (filename usually includes
+                         "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+                         correspond file-for-file and read-for-read with those specified in mates1.
+                         Reads may be a mix of different lengths.
+
+   singles               List of files containing the reads to be aligned (e.g.
+                         lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
+                         produce one mapping result and one report file per input file.
+
+  --non_directional      The sequencing library was constructed in a non strand-specific manner, alignments to all four
+                         bisulfite strands will be reported. Default: ON.
+
+                         (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
+                         to the original strands are merely theoretical and should not exist in reality. Specifying directional
+                         alignments (which is the default) will only run 2 alignment threads to the original top (OT)
+                         or bottom (OB) strands in parallel and report these alignments. This is the recommended option
+                         for sprand-specific libraries).
+
+-----
+
+**Default Parameters**
+
+::
+
+  --bowtie2              Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
+                         alignments, i.e. searches for alignments involving all read characters (also called 
+                         untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
+                         and/or quality trimmed where appropriate. Default: on.
+
+
+
+
+  -p NTHREADS            Launch NTHREADS parallel search threads (default: 4). Threads will run on separate processors/cores
+                         and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
+                         parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
+                         E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
+                         by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
+                         library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
+                         automatically use the option '--reorder', which guarantees that output SAM records are printed in
+                         an order corresponding to the order of the reads in the original input file, even when -p is set
+                         greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
+                         setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then
+                         if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
+                         correspond to input order in that case.
+
+
+
+	</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/bismark_wrapper.pl	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,133 @@
+use strict;
+use warnings;
+use File::Basename; 
+use Cwd;
+use File::Path qw(make_path remove_tree);
+die qq(
+Bad numbr of inputs
+
+) if(!@ARGV);
+
+my @bam_list_entries;
+
+my $player_options = "--bowtie2 -p 4 ";
+my $bam_output;
+my $summary;
+my $genome;
+my $singles = "";
+my $mates1 = "";
+my $mates2 = "";
+my $directional="";
+my $format="";
+my $log_file="";
+
+
+foreach my $input (@ARGV) 
+{
+	my @tmp = split "::", $input;
+	if($tmp[0] eq "GENOME") 
+	{
+		$genome=$tmp[1];	
+	} 
+	elsif($tmp[0] eq "MATES") 
+	{
+		$mates1 = $tmp[1];
+		$mates2 = $tmp[2];
+	}
+	elsif($tmp[0] eq "SINGLES") 
+	{
+		$singles = $tmp[1];
+	} 
+	elsif($tmp[0] eq "FORMAT") 
+	{
+		$format = $tmp[1];
+	} 
+	elsif($tmp[0] eq "DIRECTIONAL") 
+	{
+		$directional = $tmp[1];
+	} 
+	elsif($tmp[0] eq "SUMMARY") 
+	{
+		$summary = $tmp[1];
+	} 
+        elsif($tmp[0] eq "OUTPUT") 
+	{
+		$bam_output = $tmp[1];
+	} 
+
+	else 
+	{
+		die("Unknown Input: $input\n");
+	}
+}
+
+
+my $working_dir = cwd()."/BISMARK_OUTPUT";
+make_path($working_dir);
+
+
+#run bismark 
+
+if ($singles eq "")
+{
+	system ("bismark $player_options $directional $format $genome -o $working_dir -1 $mates1 -2 $mates2 > $summary 2>&1 ");
+	
+}
+else
+{
+	system ("bismark $player_options $directional $format $genome -o $working_dir $singles > $summary 2>&1");
+	
+}
+
+
+move_files($working_dir);
+
+
+sub move_files
+{
+	my $name;
+	my $suffix;
+	my $path;
+	my $local_dir = $_[0];
+	opendir(DIR, $local_dir);
+	#print ("Openning: $local_dir\n");
+	my @FILES= readdir(DIR); 
+	closedir(DIR);
+	foreach my $file (@FILES) 
+	{
+		if ($file eq "." || $file eq "..")
+		{
+			#print ("./ or ../ skipped\n");
+		}
+		elsif (-d "$local_dir/$file")
+		{
+			#print ("moving to: $local_dir/$file\n");
+			move_files("$local_dir/$file");
+		}
+		elsif (-f "$local_dir/$file")
+		{
+			($name,$path,$suffix) = fileparse($file,qr"\.[^.]*$");
+			if ($suffix eq ".sam")
+			{
+				#converto to BAM and Delete SAM
+				system ("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$local_dir/$file O=$bam_output VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true > /dev/null 2>&1");
+				system ("rm -rf $local_dir/$file");
+				
+			}
+			elsif($suffix eq ".txt")
+			{
+				system ("mv $local_dir/$file $summary");
+			}
+		}
+		else
+		{
+			die("Unrecognized file generated: $file\n");
+		}
+		
+	}
+}
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/differential_methylation.R	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,281 @@
+#!/usr/bin/Rscript --vanilla
+library(getopt)
+spec <- matrix(c("help", "h", 0, "logical", "view this help", "segfile1", "s", 1, "character", "seg file 1", "segfile2", "t", 1, "character", "seg file 2", "output", "o", 1, "character", "output file", "fdr", "f", 2, "character", paste("fdr method [", paste(p.adjust.methods, collapse="|"), "]", sep=""),
+        "reference", "r", 2, "character", "reference to use [b37|hg19|GRCh37|mm9|NCBIM37|mm10|GRCm38|dm3|BDGP5]", "annot", "a", 2, "character", "annotation to add [both|gene|cpg]", "processes", "p", 2, "integer", "number of cluster instances to open [1]"), ncol=5, byrow=T)
+opt <- getopt(spec)
+
+# set default options
+if(is.null(opt$fdr)) opt$fdr  <- "BH"
+if(is.null(opt$reference)) opt$reference  <- "hg19"
+if(is.null(opt$annot)) opt$annot  <- "both"
+if(is.null(opt$processes)) opt$processes  <- 1
+
+# check if any invalid options
+if(! opt$annot %in% c("both", "gene", "cpg") || !opt$fdr %in% p.adjust.methods || !opt$reference %in% c("b37", "hg19", "GRCh37", "mm9", "NCBIM37", "mm10", "GRCm38", "dm3", "BDGP5")) {
+        opt$help <- 1
+}
+
+# print help file if any incorrect
+if(!is.null(opt$help) || is.null(opt$segfile1) || is.null(opt$output) || is.null(opt$output) || is.na(opt$processes)) {
+        self = commandArgs()[1];
+        cat(getopt(spec, usage=T, command="differential_methylation.R"))
+        q(status=1)
+}
+
+library(snow)
+cl <- makeCluster(opt$p, type = "MPI")
+
+segfile1 <- read.table(opt$segfile1, sep="\t", as.is=T, head=T, quote = "")
+segfile2 <- read.table(opt$segfile2, sep="\t", as.is=T, head=T, quote = "")
+	
+rownames(segfile1) <- paste(segfile1[,2], ":", segfile1[,3], "-", segfile1[,4], sep="")
+rownames(segfile2) <- paste(segfile2[,2], ":", segfile2[,3], "-", segfile2[,4], sep="")
+	
+common_reg <- intersect(rownames(segfile1), rownames(segfile2))	
+prop_test <- function(x, n, fdr) {
+	library(abind)
+	ESTIMATE <- x/n
+	DELTA <- ESTIMATE[,1L] - ESTIMATE[,2L]
+	YATES <- pmin(abs(DELTA)/rowSums(1/n), 0.5)
+	p <- rowSums(x)/rowSums(n)
+	df <- 1
+	x <- abind(x, n - x, along=3)
+	E <- abind(n*p, n*(1-p), along=3)
+	STATISTIC <- rowSums((abs(x - E) - YATES)^2/E)
+	STATISTIC[is.na(STATISTIC)] <- 0
+	PVAL <- pchisq(STATISTIC, df, lower.tail = FALSE)
+	return(data.frame(X_Squared = round(STATISTIC, 3), P.value= round(PVAL, 6), P.adjusted = round(p.adjust(PVAL, method=fdr), 6)))
+}
+results <- prop_test(cbind(segfile1[common_reg, "Methylated"], segfile2[common_reg, "Methylated"]), cbind(segfile1[common_reg, "Total"], segfile2[common_reg, "Total"]), opt$fdr)
+	
+sample1 <- segfile1[1, 1]
+sample2 <- segfile2[1, 1]
+tab_out <- data.frame(ID = paste(sample1, "vs", sample2, sep="."), segfile1[common_reg, c(2:4)], segfile1[common_reg, c("Methylated", "Total", "FractionMethylated")],
+		segfile2[common_reg, c("Methylated", "Total", "FractionMethylated")], results, DiffProp = round(segfile1[common_reg, "FractionMethylated"] - segfile2[common_reg, "FractionMethylated"], 6), stringsAsFactors=F)
+colnames(tab_out)[c(5:10)] <- c(paste(sample1, "Methylated", sep="."), paste(sample1, "Total", sep="."), paste(sample1, "Proportion", sep="."), paste(sample2, "Methylated", sep="."),
+		paste(sample2, "Total", sep="."), paste(sample2, "Proportion", sep="."))
+
+rm(segfile1)
+rm(segfile2)
+gc()
+
+add_annot <- function(tab, annotation, genome) {
+        # find closest feature
+	if(genome == "hg19" || genome == "GRCh37" || genome == "b37") {
+		genome <- "hg19"
+		dataset <- "hsapiens_gene_ensembl"
+		biomart <- "ensembl"
+		host <- "www.biomart.org"
+		attributes <- c("ensembl_gene_id", "hgnc_symbol", "refseq_mrna", "start_position", "end_position")
+	} else if(genome == "mm9" || genome == "NCBIM37") {
+		genome <- "mm9"
+		dataset <- "mmusculus_gene_ensembl"
+		biomart <- "ENSEMBL_MART_ENSEMBL"
+		host <- "may2012.archive.ensembl.org"
+		attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position")
+	} else if(genome == "mm10" || genome == "GRCm38") {
+		genome <- "mm10"
+		dataset <- "mmusculus_gene_ensembl"
+		biomart <- "ensembl"
+		host <- "www.biomart.org"
+		attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position")
+	} else if(genome == "dm3" || genome == "BDGP5") {
+		genome <- "dm3"
+		dataset <- "dmelanogaster_gene_ensembl"
+		biomart <- "ensembl"
+		host <- "www.biomart.org"
+		attributes <- c("ensembl_gene_id", "flybasecgid_gene", "refseq_mrna", "start_position", "end_position")
+	}
+
+        e_to_U = c("GL000191.1" = "1_gl000191_random", "GL000192.1" = "1_gl000192_random", "GL000193.1" = "4_gl000193_random", "GL000194.1" = "4_gl000194_random",
+		"GL000195.1" = "7_gl000195_random", "GL000196.1" = "8_gl000196_random", "GL000197.1" = "8_gl000197_random", "GL000198.1" = "9_gl000198_random",
+		"GL000199.1" = "9_gl000199_random", "GL000200.1" = "9_gl000200_random", "GL000201.1" = "9_gl000201_random", "GL000202.1" = "11_gl000202_random",
+		"GL000203.1" = "17_gl000203_random", "GL000204.1" = "17_gl000204_random", "GL000205.1" = "17_gl000205_random", "GL000206.1" = "17_gl000206_random",
+		"GL000207.1" = "18_gl000207_random", "GL000208.1" = "19_gl000208_random", "GL000209.1" = "19_gl000209_random", "GL000210.1" = "21_gl000210_random",
+		"GL000211.1" = "Un_gl000211", "GL000212.1" = "Un_gl000212", "GL000213.1" = "Un_gl000213", "GL000214.1" = "Un_gl000214",
+		"GL000215.1" = "Un_gl000215", "GL000216.1" = "Un_gl000216", "GL000217.1" = "Un_gl000217", "GL000218.1" = "Un_gl000218",
+		"GL000219.1" = "Un_gl000219", "GL000220.1" = "Un_gl000220", "GL000221.1" = "Un_gl000221", "GL000222.1" = "Un_gl000222",
+		"GL000223.1" = "Un_gl000223", "GL000224.1" = "Un_gl000224", "GL000225.1" = "Un_gl000225", "GL000226.1" = "Un_gl000226",
+		"GL000227.1" = "Un_gl000227", "GL000228.1" = "Un_gl000228", "GL000229.1" = "Un_gl000229", "GL000230.1" = "Un_gl000230",
+		"GL000231.1" = "Un_gl000231", "GL000232.1" = "Un_gl000232", "GL000233.1" = "Un_gl000233", "GL000234.1" = "Un_gl000234",
+		"GL000235.1" = "Un_gl000235", "GL000236.1" = "Un_gl000236", "GL000237.1" = "Un_gl000237", "GL000238.1" = "Un_gl000238",
+		"GL000239.1" = "Un_gl000239", "GL000240.1" = "Un_gl000240", "GL000241.1" = "Un_gl000241", "GL000242.1" = "Un_gl000242",
+		"GL000243.1" = "Un_gl000243", "GL000244.1" = "Un_gl000244", "GL000245.1" = "Un_gl000245", "GL000246.1" = "Un_gl000246",
+		"GL000247.1" = "Un_gl000247", "GL000248.1" = "Un_gl000248", "GL000249.1" = "Un_gl000249")
+
+	# function to conver ucsc chroms to ensembl
+	ensembl <- function(chr, genome) {
+		# ensembl does not us chr and M is MT
+		chr <- sub("^chr", "", chr)
+		if(genome == "dm3" || genome == "BDGP5") {
+			chr <- sub("^M$", "dmel_mitochondrion_genome", chr)
+		} else {
+			chr <- sub("^M$", "MT", chr)
+		}
+		chr[which(chr %in% e_to_U)] <- names(e_to_U)[match(chr[which(chr %in% e_to_U)], e_to_U)]
+		return(chr)
+	}
+
+	# function to conver ensembl chroms to ucsc
+	ucsc <- function(chr) {
+		chr <- sub("^chr", "", chr)
+		tmp <- which(chr %in% names(c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M")))
+		chr[tmp] <- c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M")[chr[tmp]]
+		paste("chr", chr, sep="")
+	}
+
+	# function to get the gene annotation by chromosome
+        get_genes <- function(chr) {
+		library(biomaRt)
+		mart <- useMart(biomart=biomart, dataset=dataset, host=host)
+		tab <- getBM(attributes = attributes, filters = "chromosome_name", values = chr, mart = mart)
+		if(any(is.na(tab))) for(i in 1:ncol(tab)) tab[is.na(tab[,i]), i] <- ""
+		mult_ids <- names(which(table(tab$ensembl_gene_id) > 1))
+		rem <- NULL
+		for(i in mult_ids) {
+			index <- which(tab$ensembl_gene_id == i)
+			refseq <- which(tab$ensembl_gene_id == i & tab$refseq_mrna != "")
+			if(length(refseq) > 0) {
+				rem <- c(rem, setdiff(index, refseq[1]))
+			} else {
+				rem <- c(rem, index[-1])
+			}
+		}
+		if(length(rem) > 0) tab <- tab[-rem,]
+		colnames(tab)[4:5] <- c("feature_start", "feature_end")
+		return(tab)
+	}
+	
+	get_cpg <- function(chr) {
+		library(rtracklayer)
+		options(stringsAsFactors=F)
+		session <- browserSession()
+		genome(session) <- genome
+		query <- ucscTableQuery(session, "CpG Islands", GRangesForUCSCGenome(genome, chr))
+		cpg <- getTable(query)
+		cpg <- cpg[, c("name", "perCpg", "perGc", "chromStart", "chromEnd")]
+		colnames(cpg) <- c("cpg", "cpg_perCpg", "cpg_perGc", "cpg_start", "cpg_end")
+		return(cpg)
+	}
+	chr <- tab[1, 2]
+
+	# get the gene info for this chrom
+	if(annotation == "gene") {
+		tab <- cbind(tab, ensembl_gene_id = "", id = "", refseq_mrna = "", feature_start = "", feature_end = "", Distance_To_Feature = "", stringsAsFactors=F)
+		annotData <- get_genes(ensembl(chr, genome))
+		colnames(tab)[16] <- attributes[2]
+	} else {
+		tab <- cbind(tab, cpg = "", cpg_perCpg = "", cpg_perGc = "", cpg_start = "", cpg_end = "", Distance_To_cpg = "", stringsAsFactors=F)
+		annotData <- get_cpg(ucsc(chr))
+	}
+	if(nrow(annotData) == 0) return(tab)
+	
+	starts <- t(matrix(cbind(-1, as.numeric(annotData[, 4])), ncol=2, byrow=F))
+	ends <- t(matrix(cbind(-1, as.numeric(annotData[, 5])), ncol=2, byrow=F))
+
+	# calculate the distances from the features to the regions
+	dist_start_start <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% starts
+	dist_start_end <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% ends
+	dist_end_start <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% starts
+	dist_end_end <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% ends
+		
+	# determine which regions overlap at least 1 feature
+	sum_signs <- abs(sign(dist_start_start) + sign(dist_start_end) + sign(dist_end_start) + sign(dist_end_end))
+	regions <- which(sum_signs != 4, arr.ind=TRUE)
+	if(length(regions) > 0) {
+		overlap <- sort(unique(regions[,1]))
+		non_overlap <- c(1:nrow(tab))[-overlap]
+	} else {
+		overlap <- NULL
+		non_overlap <- c(1:nrow(tab))
+	}
+		
+	# reduce to regions with no overlaping feqature
+	if(length(overlap) > 0) {
+		dist_start_start <- matrix(dist_start_start[non_overlap,], ncol=ncol(dist_start_start))
+		dist_start_end <- matrix(dist_start_end[non_overlap,], ncol=ncol(dist_start_end))
+		dist_end_start <- matrix(dist_end_start[non_overlap,], ncol=ncol(dist_end_start))
+		dist_end_end <- matrix(dist_end_end[non_overlap,], ncol=ncol(dist_end_end))
+	}
+		
+	rm(sum_signs)
+	gc()
+		
+	# extract the annot for the regions with overlaping features
+	if(length(overlap) > 0) {
+		annot <- sapply(overlap, function(x, y) {
+					x <- regions[which(regions[,1] == x), 2]
+					sub("^//$", "", c(paste(annotData[x,1], collapse="//"), paste(annotData[x,2], collapse="//"), paste(annotData[x,3], collapse="//"), paste(annotData[x,4], collapse="//"), paste(annotData[x,5], collapse="//")))
+				}, regions)
+		annot <- as.data.frame(t(annot), stringsAsFactors=F)
+		annot <- cbind(annot, "Overlap", stringsAsFactors=F)
+		colnames(annot) <- c(colnames(annotData), tail(colnames(tab),1))
+	}
+	rm(regions)
+	gc()
+		
+	# for non the non-overlaps the distance of the closest features to each region
+	if(length(non_overlap) > 0) {
+		clst_pts <- matrix(0, ncol=4, nrow=length(non_overlap))
+		clst_pts[,1] <- max.col(-abs(dist_start_start), "last")
+		clst_pts[,2] <- max.col(-abs(dist_start_end), "last")
+		clst_pts[,3] <- max.col(-abs(dist_end_start), "last")
+		clst_pts[,4] <- max.col(-abs(dist_end_end), "last")
+		dist <- matrix(0, ncol=4, nrow=length(non_overlap))
+		if(length(clst_pts[,1]) == 1) {
+			dist[,1] <- dist_start_start[1, clst_pts[,1]]
+			dist[,2] <- dist_start_end[1, clst_pts[,2]]
+			dist[,3] <- dist_end_start[1, clst_pts[,3]]
+			dist[,4] <- dist_end_end[1, clst_pts[,4]]
+
+			# extract the annot for the regions with non-overlaping features
+			clst_all <- max.col(-abs(dist))
+			dist_to_feat <- dist[1, clst_all]	
+			clst <- clst_pts[1, clst_all]
+		} else {
+			dist[,1] <- dist_start_start[cbind(seq(clst_pts[,1]), clst_pts[,1])]
+			dist[,2] <- dist_start_end[cbind(seq(clst_pts[,2]), clst_pts[,2])]
+			dist[,3] <- dist_end_start[cbind(seq(clst_pts[,3]), clst_pts[,3])]
+			dist[,4] <- dist_end_end[cbind(seq(clst_pts[,4]), clst_pts[,4])]
+
+			# extract the annot for the regions with non-overlaping features
+			clst_all <- max.col(-abs(dist))
+			dist_to_feat <- dist[cbind(seq(clst_all), clst_all)]	
+			clst <- clst_pts[cbind(seq(clst_all), clst_all)]
+		}
+		annot_non_overlap <- cbind(annotData[clst, ], dist_to_feat)
+		colnames(annot_non_overlap) <- c(colnames(annotData), tail(colnames(tab),1))
+	}
+		
+	rm(dist_start_start)
+	rm(dist_start_end)
+	rm(dist_end_start)
+	rm(dist_end_end)
+	gc()
+		
+	if(length(overlap) > 0) tab[overlap, colnames(annot)] <- annot
+	if(length(non_overlap) > 0) tab[non_overlap, colnames(annot_non_overlap)] <- annot_non_overlap
+	return(tab)
+}
+
+tab_list <- list()
+for(i in unique(tab_out[,2])) tab_list[[i]] <- tab_out[which(tab_out$chrom == i),]
+rm(tab_out)
+gc()
+if(opt$annot == "both" || opt$annot == "gene") {
+	annotation <- "gene"
+	tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference)
+}
+
+if(opt$annot == "both" || opt$annot == "cpg") {
+	annotation <- "cpg"
+	tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference)
+}
+
+tab_out <- NULL
+for(i in 1:length(tab_list)) tab_out <- rbind(tab_out, tab_list[[i]])
+
+cat("'", file=opt$output)
+write.table(tab_out[,c(1:13, 15:26, 14)], opt$output, row.names=F, col.names=T, quote=F, sep="\t", append=T)
+stopCluster(cl)
+q(status=0)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/differential_methylation.xml	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,113 @@
+<tool id="differential_methylation_tool" name="Differential Methylation" version="1.0.0">
+  <description>methylation status of two samples by region</description>
+  <requirements>
+    <requirement type="package" version="0.1.16">samtools</requirement>
+    <requirement type="package" version="0.12.7">bowtie2</requirement>
+    <requirement type="package" >getopt</requirement>
+    <requirement type="package" >snow</requirement>
+    <requirement type="package" >abind</requirement>
+    <requirement type="package" >rtracklayer</requirement>
+    <requirement type="package" >biomaRt</requirement>
+
+  </requirements>
+  <command >
+    
+	differential_methylation.R -s $seg1 -t $seg2 -r $genome.fields.dbkey -p 1 -f $fdr -a $annot -o $output 2>1
+
+  </command>
+	<inputs>
+	
+		<param name="seg1" type="data" format="tabular" label="Seg file 1" />
+		<param name="seg2" type="data" format="tabular" label="Seg file 2" />
+		
+		<param name="genome" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+			<options from_data_table="bismark_indexes">
+				<filter type="sort_by" column="2"/>
+				<validator type="no_options" message="No indexes are available for the selected input dataset"/>
+			</options>
+		</param>
+		<param name="annot" type="select" label="Annotation">
+			<option value="both" selected="true">both</option>
+			<option value="gene">gene</option>
+			<option value="cpg">cpg</option>
+		</param>
+		<param name="fdr" type="select" label="fdr method">
+			<option value="none" selected="true">none</option>
+			<option value="holm">holm</option>
+			<option value="hochberg">hochberg</option>
+			<option value="hommel">hommel</option>
+			<option value="bonferroni">bonferroni</option>
+			<option value="BH">BH</option>
+			<option value="BY">BY</option>
+			<option value="fdr">fdr</option>															
+		</param>
+		
+		
+			
+	</inputs>
+	<outputs>
+		<data format="tabular" name="output" label="${tool.name} on ${on_string}">
+			<actions>
+				<action type="metadata" name="dbkey">
+					<option type="from_data_table" name="bismark_indexes" column="1" offset="0">
+					<filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+					<filter type="param_value" ref="genome" column="0"/>
+					</option>
+				</action>
+			</actions>
+		</data>
+		
+	</outputs>
+	<help>
+|
+
+**What it does**
+
+Compares the methylation status of two samples by region. It calculates a p.value for differential methylation using a test of proportions, that is, it tests the null hypothesis that the 2 methylation proportions are equal. The script will also calculate the difference in methylation and annotate the regions with the closest ensembl gene and cpg island. This script takes as input two seg files created by the tool "methylation_by_region_converter" and returns a new seg file containing:
+
+::
+
+  Comparison the name of the comparison
+  Chrom the chromosome of the region
+  loc.start the start of the region
+  loc.end the end of the region
+  Sample1.Methylated the number of methylated reads in the region for sample 1
+  Sample1.Total the total number of reads in the region for sample 1
+  Sample1.Proportion the proportion of methylated reads for sample 1
+  Sample2.Methylated the number of methylated reads in the region for sample 2
+  Sample2.Total the total number of reads in the region for sample 2
+  Sample2.Proportion the proportion of methylated reads for sample 2
+  X_Squared the test statistic for the test of proportions
+  P.value the raw p-value
+  P.adjusted the fdr adjusted p-value
+  ensembl_gene_id the closest or overlapping Ensembl gene id
+  hgnc_symbol the HUGO Gene Nomenclature Committee id
+  refseq_mrna the refseq id
+  feature_start the start of the gene
+  feature_end the end of the gene
+  Distance_To_Feature the distance to the feature
+  Cpg the closest or overlapping CpG island
+  cpg_perCpg the number of CpGs in island
+  cpg_perGc the percentage of island that is CpG
+  cpg_start the start location of the CpG island
+  cpg_end the end location of the CpG island
+  Distance_To_cpg the distance to the CpG island
+  DiffProp the difference in the methylation proportions
+
+  Usage: differential_methylation.R [[option][value]]
+  Options:
+  -h|--help         view this help
+  -s|--segfile1     seg file 1
+  -t|--segfile2     seg file 2
+  -o|--output       output file
+  -f|--fdr          fdr method [holm|hochberg|hommel|bonferroni|BH|BY|fdr|none]
+  -r|--reference    reference to use [hg19|GRCh37|mm9|NCBIM37|mm10|GRCm38|dm3|BDGP5]
+  -a|--annot        annotation to add [both|gene|cpg]
+  -p|--processes    number of cluster instances to open [1]
+
+
+
+	</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/methylation_by_region_converter.pl	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+# script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region
+
+# Created by Jason Ellul, Oct 2012
+
+
+use strict;
+use warnings;
+use Getopt::Std;
+use File::Basename;
+use Data::Dumper;
+$| = 1;
+
+# Grab and set all options
+my %OPTIONS = (s => "MethylationData");
+getopts('l:L:o:s:', \%OPTIONS);
+
+die qq(
+Usage:   methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...]
+
+OPTIONS:		-o	STR	the name of the output file
+			-l	STR	filename of the log file [null]
+			-L	STR	append to an existing log file [null]
+			-s	STR	Sample Name [$OPTIONS{s}]
+) if(@ARGV < 2);
+
+my $version = 0.1;
+my $bed = shift @ARGV;
+my @graphs = @ARGV;
+my $Script = "methylation_by_region_converter.pl";
+my $now;
+
+# if log file specified
+if(defined $OPTIONS{l}) {
+	open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n";
+	close (FH);
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n";
+	my $now = localtime time;
+	print "Log File Created $now\n";
+} elsif(defined $OPTIONS{L}) {
+	#append to a log file
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n";
+	my $now = localtime time;
+	print "Appending To Log File $now\n\n";
+}
+
+# print version of this script.
+print STDERR "Using $Script version $version\n\n";
+print STDERR "Using region file: $bed\n";
+print STDERR "And bedgraphs: @graphs\n\n";
+my %Regions;
+my @chr_order;
+
+# read in regions file
+print STDERR "Reading Regions Bed File $bed ... ";
+open(BED, "$bed") || die "$bed: $!";
+while(my $line = <BED>) {
+	chomp $line;
+	my @line_sp = split("\t", $line);
+	$line_sp[2]--;
+	
+	push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]};		
+	push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1];
+	push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2];
+	push @{ $Regions{$line_sp[0]}{Methylated} }, 0;
+	push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0;
+}
+close(BED);
+print STDERR "Done.\n";
+
+# read in bedgraphs
+foreach my $bedGraph (@graphs) {
+	print STDERR "Loading bedGraph File $bedGraph ... ";
+	open(GRAPH, $bedGraph) || die "$bedGraph: $!";
+	my @lines = <GRAPH>;
+	close(GRAPH);
+	print STDERR "Done.\n\n";
+	
+	print STDERR "Processing bedGraph File ... ";
+	foreach my $line (@lines) {
+		chomp $line;
+		my @line_sp = split("\t", $line);
+		$line_sp[1]++;
+		
+		# if the chromosome is in the regions file
+		if(defined $Regions{$line_sp[0]}) {
+			my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} });
+			
+			# if the position is within a region
+			if($pos >= 0) {
+				${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4];
+				${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5];
+			}
+		}
+	}
+	print STDERR "Done.\n\n";
+}
+
+if(defined $OPTIONS{o}) {
+	open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!";
+}
+
+# calculate percent methylated and print
+print STDERR "Printing output ... ";
+print "#type=DNA_METHYLATION\n";
+print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n";
+foreach my $chr (@chr_order) {
+	for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) {
+		my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region];
+		my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total;
+		print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total;
+		print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total;
+	}
+}
+close(STDERR) if defined $OPTIONS{o};
+print STDERR "Done.\n";
+
+sub binsearchregion {
+        my ($target, $cmp, $start, $end) = @_;
+
+        my $posmin = 0;
+        my $posmax = $#{ $start };
+
+        return -1 if &$cmp (0, $start, $target) > 0;
+        return -1 if &$cmp ($#{ $end }, $end, $target) < 0;
+
+        while (1) {
+                my $mid = int (($posmin + $posmax) / 2);
+                my $result = &$cmp ($mid, $start, $target);
+
+                if ($result < 0) {
+                        $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin;
+                        if($mid == $posmin) {
+                                return -1 if &$cmp ($mid, $end, $target) < 0;
+                                return $mid;
+                        }
+                        $posmin = $mid;
+                } elsif ($result > 0) {
+                        $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin;
+                        if($mid == $posmax) {
+                                $mid--;
+                                return -1 if &$cmp ($mid, $end, $target) < 0;
+                                return $mid;
+                        }
+                        $posmax = $mid;
+                } else {
+                        return $mid;
+                }
+        }
+}
+
+sub cmpFunc {
+	my ($index, $arrayRef, $target) = @_;
+	my $item = $$arrayRef[$index];
+
+	return $item <=> $target;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/methylation_by_region_converter.xml	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="methyation_by_region_tool" name="Methylation by Region Converter" version="0.1.1">
+  <description>: create a bedgrapgh with methylation percentages aggregated by region</description>
+  <requirements>
+    <requirement type="package" version="0.1.16">samtools</requirement>
+    <requirement type="package" version="0.12.7">bowtie2</requirement>
+  </requirements>
+  <command interpreter="perl">
+    
+	methylation_by_region_converter.pl
+	
+	-o $output
+	-l $summary
+	## Handle reference file.
+        #if $refBedSource.bedSource == "history":
+            $refBedSource.ownFile
+        #else:
+            "${refBedSource.index.fields.path}"
+        #end if
+	$bedgraph
+
+  </command>
+	<inputs>
+		<conditional name="refBedSource">
+		  <param name="bedSource" type="select" label="Will you select a bed file from your history or use a built-in one?" help="Built-ins were created using default options">
+		    <option value="indexed">Use a built-in index</option>
+		    <option value="history">Use one from the history</option>
+		  </param>
+		  <when value="indexed">
+		    <param name="index" type="select" label="Select a bed file" help="If bed file is not listed, contact the Galaxy team">
+		      <options from_data_table="bismark_bed_files">
+		        <filter type="sort_by" column="1"/>
+		        <validator type="no_options" message="No files are available for the selected input dataset"/>
+		      </options>
+		    </param>
+		  </when>
+		  <when value="history">
+		    <param name="ownFile" type="data" format="bed" metadata_name="dbkey" label="Select the bed file" />
+		  </when>  <!-- history -->
+		</conditional>  <!-- refGenomeSource -->
+		<param name="bedgraph" type="data" format="bedgraph" label="BedGraph file" />
+		<param name="sample" type="text" label="Sample Name" />
+	
+	</inputs>
+	<outputs>
+		<data name="output" format="tabular" label="${tool.name} on ${on_string}" />
+		<data name="summary" format="txt" label="${tool.name} summary" />
+	</outputs>
+	<help>
+|
+
+
+**What it does**
+
+
+Script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region
+
+
+-----
+ 
+
+::
+
+  Usage:   methylation_by_region_converter.pl [OPTIONS] bed_file bedGraph
+
+
+	</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/methylation_extractor.xml	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,165 @@
+<tool id="methyation_extractor_tool" name="Methylation Extractor" version="0.7.6">
+  <description>: extracts the methylation information for individual cytosine</description>
+  <requirements>
+    <requirement type="package" version="0.1.16">samtools</requirement>
+    <requirement type="package" version="0.12.7">bowtie2</requirement>
+    <requirement type="package" version="0.7.6">bismark</requirement>
+  </requirements>
+  <command interpreter="perl">
+    
+	methylation_extractor_wrapper.pl
+	
+	
+	"GENOME::${genome.fields.path}"      
+	
+	
+	#if str($no_overlap) == "ON":
+		"OPTION::--no_overlap"
+	#end if
+	
+	#if str($ending) == "single":
+		"ENDING::-s"
+	#else
+		"ENDING::-p"
+	#end if
+	
+	#if str($report) == "ON":
+		"OPTION::--report"
+	#end if
+	
+	"OPTION::--bedGraph"
+	
+	"OPTION::--counts"
+	
+	
+	"OUTPUT::$output"
+	"SUMMARY::$summary"
+
+	"BAMFILE::$bamfile"
+
+
+  </command>
+	<inputs>
+
+		<param name="genome" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+			<options from_data_table="bismark_indexes">
+				<filter type="sort_by" column="2"/>
+				<validator type="no_options" message="No indexes are available for the selected input dataset"/>
+			</options>
+		</param>
+		
+		<param name="bamfile" type="data" format="bam" label="Bam file: bismark output" />
+		
+		<param name="ending" type="select" label="ending" help="" optional="true">
+			<option value="single" >single-end</option>
+			<option value="paired" selected="true">paired-end</option>
+		</param>
+	
+		<param name="no_overlap" type="select" label="no-overlap" help="" optional="true">
+			<option value="ON" selected="true">ON</option>
+			<option value="OFF">OFF</option>
+		</param>
+	
+		<param name="report" type="select" label="Report" help="" optional="true">
+			<option value="ON" selected="true">ON</option>
+			<option value="OFF">OFF</option>
+		</param>
+	
+	</inputs>
+	<outputs>
+		<data format="bedgraph" name="output" label="${tool.name} on ${on_string}">
+			<actions>
+				<action type="metadata" name="dbkey">
+					<option type="from_data_table" name="bismark_indexes" column="1" offset="0">
+					<filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+					<filter type="param_value" ref="genome" column="0"/>
+					</option>
+				</action>
+			</actions>
+		</data>
+		<data name="summary" format="txt" label="${tool.name} summary" />
+	</outputs>
+	<help>
+|
+
+**Reference**
+	
+  http://www.bioinformatics.babraham.ac.uk/projects/bismark/
+  
+-----
+
+**What it does**
+
+
+
+The script reads in a bisulfite read alignment results file 
+produced by the Bismark bisulfite mapper (BAM file) and extracts the methylation 
+informationfor individual cytosines. This information is found in the methylation 
+call field which can contain the following characters:
+
+::
+       
+ ~~~   X   for methylated C in CHG context (was protected)     ~~~
+ 
+ ~~~   x   for not methylated C CHG (was converted)            ~~~
+ 
+ ~~~   H   for methylated C in CHH context (was protected)     ~~~
+ 
+ ~~~   h   for not methylated C in CHH context (was converted) ~~~
+ 
+ ~~~   Z   for methylated C in CpG context (was protected)     ~~~
+ 
+ ~~~   z   for not methylated C in CpG context (was converted) ~~~
+ 
+ ~~~   .   for any bases not involving cytosines               ~~~
+       
+
+
+-----
+ 
+**Required Parameters**
+
+::
+
+  -s/--single-end        Input file(s) are Bismark result file(s) generated from single-end
+                         read data. Specifying either --single-end or --paired-end is
+                         mandatory.
+
+  -p/--paired-end        Input file(s) are Bismark result file(s) generated from paired-end
+                         read data. Specifying either --paired-end or --single-end is
+                         mandatory.
+
+  --no_overlap           For paired-end reads it is theoretically possible that read_1 and
+                         read_2 overlap. This option avoids scoring overlapping methylation
+                         calls twice (only methylation calls of read 1 are used for in the process
+                         since read 1 has historically higher quality basecalls than read 2).
+                         Whilst this option removes a bias towards more methylation calls
+                         in the center of sequenced fragments it may de facto remove a sizable
+                         proportion of the data. This option is highly recommended for paired-end
+                         data.
+
+  --report               Prints out a short methylation summary as well as the paramaters used to run
+                         this script.
+
+
+-----
+
+**Default Parameters**
+
+::
+
+  --bedGraph             After finishing the methylation extraction, the methylation output is written into a
+                         sorted bedGraph file that reports the position of a given cytosine and its methylation 
+                         state (in %, seem details below). The methylation extractor output is temporarily split up into
+                         temporary files, one per chromosome (written into the current directory or folder
+                         specified with -o/--output); these temp files are then used for sorting and deleted
+                         afterwards. By default, only cytosines in CpG context will be sorted. The option
+                         '--CX_context' may be used to report all cyosines irrespective of sequence context
+                         (this will take MUCH longer!).
+
+
+
+	</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/methylation_extractor_wrapper.pl	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,106 @@
+use strict;
+use warnings;
+use File::Basename; 
+use Cwd;
+use File::Path qw(make_path remove_tree);
+die qq(
+Bad numbr of inputs
+
+) if(!@ARGV);
+
+my @bam_list_entries;
+
+my $player_options = "";
+my $bam_file;
+my $ending;
+my $genome;
+my $summary ;
+my $output;
+
+foreach my $input (@ARGV) 
+{
+	my @tmp = split "::", $input;
+	if($tmp[0] eq "GENOME") 
+	{
+		$genome=$tmp[1];	
+	} 
+	elsif($tmp[0] eq "OPTION") 
+	{
+		$player_options = "$player_options ${tmp[1]}";
+	} 
+	elsif($tmp[0] eq "ENDING") 
+	{
+		$ending = $tmp[1];
+	} 
+        elsif($tmp[0] eq "BAMFILE") 
+	{
+		$bam_file = $tmp[1];
+	}
+	elsif($tmp[0] eq "SUMMARY") 
+	{
+		$summary = $tmp[1];
+	} 
+	elsif($tmp[0] eq "OUTPUT") 
+	{
+		$output = $tmp[1];
+	}
+        else 
+	{
+		die("Unknown Input: $input\n");
+	}
+}
+
+
+my $working_dir = cwd();
+
+#convert bam to sam
+my $sam_file = "$working_dir/coverted.sam";
+system("samtools view -h $bam_file > $sam_file");
+
+#run bismark 
+
+system ("bismark_methylation_extractor $ending $player_options --genome_folder $genome $sam_file > $summary 2>&1");
+
+move_files($working_dir);
+
+sub move_files
+{
+	my $name;
+	my $suffix;
+	my $path;
+	my $local_dir = $_[0];
+	opendir(DIR, $local_dir);
+	#print ("Openning: $local_dir\n");
+	my @FILES= readdir(DIR); 
+	closedir(DIR);
+	foreach my $file (@FILES) 
+	{
+		if ($file eq "." || $file eq "..")
+		{
+			#print ("./ or ../ skipped\n");
+		}
+		elsif (-d "$local_dir/$file")
+		{
+			#print ("moving to: $local_dir/$file\n");
+			move_files("$local_dir/$file");
+		}
+		elsif (-f "$local_dir/$file")
+		{
+			($name,$path,$suffix) = fileparse($file,qr"\.[^.]*$");
+			if ($suffix eq ".bedGraph")
+			{
+				system ("mv $local_dir/$file $output");
+			}
+		}
+		else
+		{
+			die("Unrecognized file generated: $file\n");
+		}
+		
+	}
+}
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/tool_data_table_conf.xml.sample	Tue Dec 04 20:15:26 2012 -0500
@@ -0,0 +1,12 @@
+<!-- Locations of indexes in the bisulphite conversion -->
+<tables>
+	<table name="bismark_indexes" comment_char="#">
+		<columns>value, dbkey, name, path</columns>
+		<file path="tool-data/bismark_indices.loc" />
+	</table>
+	<!-- Locations of bed files for bisulphite analysis -->
+	<table name="bismark_bed_files" comment_char="#">
+		<columns>value, name, path</columns>
+		<file path="tool-data/bismark_bed_files.loc" />
+	</table>
+</tables>