changeset 0:b03e31cab4a6 draft

Uploaded
author charles-bernard
date Mon, 14 Nov 2016 04:55:30 -0500
parents
children 03eefc41396e
files cytosine_report_to_bedgraph/.shed.yml cytosine_report_to_bedgraph/bismark2bedgraph.awk cytosine_report_to_bedgraph/bismark2bedgraph.sh cytosine_report_to_bedgraph/cytosine_report_to_bedgraph.xml cytosine_report_to_bedgraph/cytosine_report_to_bedgraph_wrapper.py cytosine_report_to_bedgraph/tool-data/igv_genomes_chr_sizes.loc.sample cytosine_report_to_bedgraph/tool_data_table_conf.xml.sample cytosine_report_to_bedgraph/tool_dependencies.xml
diffstat 8 files changed, 352 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/.shed.yml	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,14 @@
+categories:
+- Convert Formats
+description: Converts genome-wide cytosine methylation report to Bedgraphs
+long_description: |
+    This tool takes as input a genome-wide cytosine methylation report (generated by the tool Bismark Meth. Extractor) and converts it into a bedGraph for each cytosine context (CpG, CHG and CHH).
+    These bedGraphs display, for a given context, the ratio of methylation of each covered cytosine in the genome.
+
+    It also produces a bedGraph displaying the coverage count of each cytosine in the genome (non-covered cytosine are ignored).
+
+    The tool outputs offer the possibility to vizualise the methylation signal of covered cytosines thanks to softwares like IGV (Integrative Genomics Viewer).
+    In this respect, the tool can optionally generate a tdf binary file (Tiled Data Format) from each converted bedGraph. Tdf format is indeed better handled by IGV than bedGraph.
+name: bismark
+owner: charles-bernard
+type: unrestricted
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/bismark2bedgraph.awk	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,29 @@
+#!/usr/bin/awk
+
+BEGIN {
+	FS = "\t";
+}
+
+{
+	if ( $6 ~ context && ($4 > 0 || $5 > 0) ) { 
+
+		chr_name = $1;
+		chr_pos = $2;
+		strand = $3;
+		c_meth_count = $4;
+		c_unmeth_count = $5;
+
+		if ( coverage == "true" ) {
+			nb_reads = c_meth_count + c_unmeth_count
+			printf("%s\t%s\t%s\t%s\n", chr_name, chr_pos, chr_pos, nb_reads)
+		} else {
+			if ( strand == "-") { 
+				s = "-"; 
+			} else { 
+				s = "";
+			}
+			meth_ratio = c_meth_count / (c_meth_count + c_unmeth_count); 
+			printf("%s\t%s\t%s\t%s%s\n", chr_name, chr_pos, chr_pos, s, meth_ratio)	
+		}
+	}
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/bismark2bedgraph.sh	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# read the options
+TEMP=`getopt -o e::i:co: --long epi::,infile_cov:,context,output_dir:,tool_dir:,tdf,igv_genome: -n 'report' -- "$@"`
+eval set -- "$TEMP"
+
+# extract options and their arguments into variables.
+while true ; do
+	case "$1" in
+		-e | --epi )
+			case "$2" in
+				"" ) epi="current_job"; shift 2 ;;
+				*) epi=$2; shift 2 ;;
+			esac ;;
+		-i | --infile_cov )
+			case "$2" in
+				*) infile_cov=$2; shift 2 ;;
+			esac ;;
+		-c | --context ) 
+			context=true; shift ;;
+		--tdf )
+			tdf=true; shift ;;
+		--igv_genome )
+			case "$2" in
+				*) igv_genome=$2; shift 2 ;;
+			esac ;;
+		-o | --output_dir )
+			case "$2" in
+				*) output_dir=$2; shift 2 ;;
+			esac ;;
+		--tool_dir )
+			case "$2" in
+				*) tool_dir=$2; shift 2 ;;
+			esac ;;
+		-- ) shift; break ;;
+		* ) echo "Internal error!"; exit 1 ;;
+	esac
+done
+
+# do something with the variables -- in this case the lamest possible one :-)
+echo "epi = $epi"
+echo "infile_cov = $infile_cov"
+echo "output_dir = $output_dir"
+echo "context = $context"
+echo "tool_dir = $tool_dir"
+echo "tdf = $tdf"
+echo "igv_genome = $igv_genome"
+
+#IGV_path
+IGV_path="/users/biocomp/chbernar/galaxy_testing/database/dependencies/igvtools/2.3.32/geert-vandeweyer/package_igvtools_2_3_32/3c087cee3b8f/bin"
+
+# define outputs according to options
+if [[ "$context" = true ]]; then
+	context_list=("CG" "CHG" "CHH")
+	output_types=("CG" "CHG" "CHH" "coverage")
+	bedgraph_list=("$output_dir""/""$epi""_CpG.bedgraph" "$output_dir""/""$epi""_CHG.bedgraph" "$output_dir""/""$epi""_CHH.bedgraph" "$output_dir""/""$epi""_coverage.bedgraph")
+	tdf_list=("$output_dir""/""$epi""_CpG.tdf" "$output_dir""/""$epi""_CHG.tdf" "$output_dir""/""$epi""_CHH.tdf" "$output_dir""/""$epi""_coverage.tdf")
+	n="4"
+else
+	context_list=(".*")
+	output_types=("CXX" "coverage")
+	bedgraph_list=("$output_dir""/""$epi""_CXX.bedgraph" "$output_dir""/""$epi""_coverage.bedgraph")
+	tdf_list=("$output_dir""/""$epi""_CXX.tdf" "$output_dir""/""$epi""_coverage.tdf")
+	n="2"
+fi
+
+# process
+for (( i=0; i<$n; i++)); do
+	printf "________________________________________________________________________\n"
+	printf "Processing %s\n" ${output_types[$i]}
+	printf "... Converting Cytosine Report to Bedgraph\n" 
+	if (( i < n - 1 )); then
+		#if not coverage:
+		#printf "track type=bedGraph name=%s Coverage description=%s Coverage\n" "$epi""_""${context_list[$i]}" "$epi""_""${context_list[$i]}" > "${bedgraph_list[$i]}"
+		printf "#<Chr>\t<Start>\t<End>\t<Strand;Meth_ratio>\n" > "${bedgraph_list[$i]}"
+		awk -v context="${context_list[$i]}" -v coverage="false" -f "$tool_dir"/bismark2bedgraph.awk $infile_cov >> "${bedgraph_list[$i]}"
+	else
+		#printf "track type=bedGraph name=%s Coverage description=%s Coverage\n" "$epi""_""${context_list[$i]}" "$epi""_""${context_list[$i]}" > "${bedgraph_list[$i]}"
+		printf "#<Chr>\t<Start>\t<End>\t<Coverage>\n" > "${bedgraph_list[$i]}"
+		awk -v context="${context_list[$i]}" -v coverage="true" -f "$tool_dir"/bismark2bedgraph.awk $infile_cov >> "${bedgraph_list[$i]}"
+	fi
+	if [[ "$tdf" = true ]]; then
+		printf "... Converting Bedgraph to Tdf\n"
+		"$IGV_path""/"igvtools toTDF "${bedgraph_list[$i]}" "${tdf_list[$i]}" "$igv_genome" > stdout_file  
+	fi
+done
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/cytosine_report_to_bedgraph.xml	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,112 @@
+<tool id="cytosine_report_to_bedgraph" name="Cytosine Report To Bedgraphs" version="0.0.0">
+
+  <description>Converts genome-wide cytosine methylation report to Bedgraphs</description>
+
+  <requirements>
+  	<requirement type="package" version="2.3.32">igvtools</requirement>
+  </requirements>
+
+  <command interpreter="python">
+<![CDATA[
+        cytosine_report_to_bedgraph_wrapper.py
+
+        --tool_dir "$__tool_directory__"
+
+        --cytosine_report "$cytosine_report"
+
+        --coverage_bedgraph "$coverage_bedgraph"
+        #if $tdf['tdf_selector']:
+          --tdf
+          --igv_genome "${tdf.built_in_igv_genomes_chr_sizes.fields.path}"
+          --coverage_tdf "$coverage_tdf"
+        #end if
+
+        #if $context:
+          --context
+          --CpG_bedgraph "$CpG_bedgraph"
+          --CHG_bedgraph "$CHG_bedgraph"
+          --CHH_bedgraph "$CHH_bedgraph"
+          #if $tdf['tdf_selector']:
+            --CpG_tdf "$CpG_tdf"
+            --CHG_tdf "$CHG_tdf"
+            --CHH_tdf "$CHH_tdf"
+          #end if
+        #else
+          --CXX "$CXX_bedgraph"
+          #if $tdf['tdf_selector']:
+            --CXX "$CXX_tdf"
+          #end if
+        #end if  
+
+        --log "$log_report"
+]]>
+  </command>
+  <inputs>
+    <param name="cytosine_report" type="data" format="txt" label="Submit a genome-wide cytosine methylation report" optional="False" />
+    <param name="context" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Create a bedGraph for each cytosine methylation context?" />
+    <conditional name="tdf">
+      <param name="tdf_selector" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a tdf file from each resulting bedGraph?" />
+      <when value="true">
+        <param name="built_in_igv_genomes_chr_sizes" type="select" label="Select Reference Genome (igv genomes chr.sizes)" help="If your genome of interest is not listed, contact your Galaxy admin">
+          <options from_data_table="igv_genomes_chr_sizes">
+            <filter type="sort_by" column="name"/>
+            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+          </options>
+        </param>
+      </when>
+      <when value="false"></when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="log_report" format="txt" label="${tool.name} on ${on_string}: Log Report" />
+    <data name="CpG_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CpG context">
+      <filter>context is True</filter>
+    </data>
+    <data name="CHG_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CHG context">
+      <filter>context is True</filter>
+    </data>
+    <data name="CHH_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CHH context">
+      <filter>context is True</filter>
+    </data>
+    <data name="CXX_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CXX">
+      <filter>context is False</filter>
+    </data>
+    <data name="coverage_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: Coverage" />
+    <data name="CpG_tdf" format="tdf" label="${tool.name} on ${on_string}: CpG context (tdf)">
+      <filter>tdf['tdf_selector'] is True and context is True</filter>
+    </data>
+    <data name="CHG_tdf" format="tdf" label="${tool.name} on ${on_string}: CHG context (tdf)">
+          <filter>tdf['tdf_selector'] is True and context is True</filter>
+    </data>
+    <data name="CHH_tdf" format="tdf" label="${tool.name} on ${on_string}: CHH context (tdf)">
+          <filter>tdf['tdf_selector'] is True and context is True</filter>
+    </data>
+    <data name="CXX_tdf" format="tdf" label="${tool.name} on ${on_string}: CXX (tdf)">
+          <filter>context is False and tdf['tdf_selector'] is True</filter>
+    </data>
+    <data name="coverage_tdf" format="tdf" label="${tool.name} on ${on_string}: Coverage (tdf)" >
+      <filter>tdf['tdf_selector'] is True</filter>
+    </data>
+  </outputs>
+
+  <tests></tests>
+
+  <help>
+<![CDATA[
+
+**What it does**
+
+  | This tool takes as input a genome-wide cytosine methylation report (generated by the tool *Bismark Meth. Extractor*) and converts it into a bedGraph for each cytosine context (CpG, CHG and CHH). 
+  | These bedGraphs display, for a given context, the ratio of methylation of each covered cytosine in the genome.
+  |
+  | It also produces a bedGraph displaying the coverage count of each cytosine in the genome (non-covered cytosine are ignored).
+  | 
+
+  .. class:: infomark
+
+  | The tool outputs offer the possibility to vizualise the methylation signal of covered cytosines thanks to softwares like IGV (*Integrative Genomics Viewer*).
+  | In this respect, the tool can optionally generate a tdf binary file (*Tiled Data Format*) from each converted bedGraph. Tdf format is indeed better handled by IGV than bedGraph.
+]]>
+  </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/cytosine_report_to_bedgraph_wrapper.py	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from glob import glob
+from optparse import OptionParser
+
+def __main__():
+    parser = OptionParser()
+    parser.add_option("--tool_dir", dest = 'tool_dir', action = 'store', nargs = 1, metavar = 'tool_dir', type = "str")
+    parser.add_option("--cytosine_report", dest='cytosine_report', action='store', nargs=1, metavar='cytosine_report', type="str")
+    parser.add_option("--context", action="store_true")
+    parser.add_option("--CpG_bedgraph", dest='CpG_bedgraph', action="store", nargs = 1, metavar = 'CpG_bedgraph', type='str')
+    parser.add_option("--CHH_bedgraph", dest='CHH_bedgraph', action="store", nargs=1, metavar='CHH_bedgraph', type='str')
+    parser.add_option("--CHG_bedgraph", dest='CHG_bedgraph', action="store", nargs=1, metavar='CHG_bedgraph', type='str')
+    parser.add_option("--CXX_bedgraph", dest='CXX_bedgraph', action="store", nargs=1, metavar='CXX_bedgraph', type='str')
+    parser.add_option("--coverage_bedgraph", dest='coverage_bedgraph', action="store", nargs=1, metavar='coverage_bedgraph', type='str')
+    parser.add_option("--tdf", action="store_true")
+    parser.add_option("--igv_genome", dest='igv_genome', action='store', nargs=1, metavar='igv_genome', type='str')
+    parser.add_option("--CpG_tdf", dest='CpG_tdf', action="store", nargs=1, metavar='CpG_tdf', type='str')
+    parser.add_option("--CHH_tdf", dest='CHH_tdf', action="store", nargs=1, metavar='CHH_tdf', type='str')
+    parser.add_option("--CHG_tdf", dest='CHG_tdf', action="store", nargs=1, metavar='CHG_tdf', type='str')
+    parser.add_option("--CXX_tdf", dest='CXX_tdf', action="store", nargs=1, metavar='CXX_tdf', type='str')
+    parser.add_option("--coverage_tdf", dest='coverage_tdf', action="store", nargs=1, metavar='coverage_tdf', type='str')
+    parser.add_option("--log", dest='log_report', action="store", nargs=1, metavar='log_report')
+
+    options, args = parser.parse_args()
+
+    if options.log_report:
+    	logging.basicConfig(level=logging.INFO, filename=options.log_report, filemode="a+", format='%(message)s')
+    else:
+    	logging.basicConfig(level=logging.INFO, filename='galaxy_log_report.log', filemode="a+", format='%(message)s')
+    logging.info('_______________________________________________________________\n')
+    logging.info('List of options: %s\n' % options)
+
+    script_path=os.path.join(options.tool_dir, 'bismark2bedgraph.sh')
+    tmp_dir = tempfile.mkdtemp(prefix='tmp')
+
+    if options.context:
+        context_option = '-c'
+    else:
+        context_option = ''
+
+    if options.tdf:
+        tdf_option = '--tdf'
+        genome_option = '--igv_genome'
+        genome = options.igv_genome
+    else:
+        tdf_option = ''
+        genome_option = ''
+        genome = ''
+
+    logging.info('_______________________________________________________________\n')
+    logging.info('COMMAND:\n')
+    logging.info("bash %s -i %s %s %s %s %s -e current_job -o %s\n" % (script_path, options.cytosine_report, context_option, tdf_option, genome_option, genome, tmp_dir))
+
+    proc = subprocess.Popen(['bash', script_path, '--tool_dir', options.tool_dir, '-i', options.cytosine_report,\
+                             context_option, tdf_option, genome_option, genome, '-e', 'current_job', '-o', tmp_dir], \
+                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    proc_out, proc_err = proc.communicate()
+
+    if proc_out:
+    	logging.info('_______________________________________________________________\n')
+    	logging.info('Tool STDOUT:\n')
+    	logging.info(proc_out)
+    if proc_err:
+    	logging.info('_______________________________________________________________\n')
+    	logging.info('ERROR:\n')
+    	msg = 'The tool crashed with the following error:\n%s\n' % proc_err
+    	logging.critical(msg)
+    	sys.exit(msg)
+
+    if options.context:
+        shutil.move( glob(os.path.join(tmp_dir, '*_CpG.bedgraph'))[0], options.CpG_bedgraph )
+        shutil.move( glob(os.path.join(tmp_dir, '*_CHG.bedgraph'))[0], options.CHG_bedgraph )
+        shutil.move( glob(os.path.join(tmp_dir, '*_CHH.bedgraph'))[0], options.CHH_bedgraph )
+        if options.tdf:
+            shutil.move(glob(os.path.join(tmp_dir, '*_CpG.tdf'))[0], options.CpG_tdf)
+            shutil.move(glob(os.path.join(tmp_dir, '*_CHG.tdf'))[0], options.CHG_tdf)
+            shutil.move(glob(os.path.join(tmp_dir, '*_CHH.tdf'))[0], options.CHH_tdf)
+    else:
+        shutil.move(glob(os.path.join(tmp_dir, '*_CXX.bedgraph'))[0], options.CXX_bedgraph)
+        if options.tdf:
+            shutil.move(glob(os.path.join(tmp_dir, '*_CXX.tdf'))[0], options.CXX_tdf)
+    shutil.move(glob(os.path.join(tmp_dir, '*_coverage.bedgraph'))[0], options.coverage_bedgraph)
+    if options.tdf:
+        shutil.move(glob(os.path.join(tmp_dir, '*_coverage.tdf'))[0], options.coverage_tdf)
+
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+
+if __name__=="__main__": __main__()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/tool-data/igv_genomes_chr_sizes.loc.sample	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,2 @@
+#<value>	<dbkey>	<name>	<path>
+Arabidopsis_thaliana_TAIR10	Arabidopsis_thaliana_TAIR10	Arabidopsis thaliana: TAIR10	/export/home1/users/biocomp/chbernar/galaxy_testing/database/dependencies/igvtools/2.3.32/geert-vandeweyer/package_igvtools_2_3_32/3c087cee3b8f/bin/genomes/tair10.chrom.sizes
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/tool_data_table_conf.xml.sample	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of all igv genomes chr sizes -->
+     <table name="igv_genomes_chr_sizes" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/igv_genomes_chr_sizes.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cytosine_report_to_bedgraph/tool_dependencies.xml	Mon Nov 14 04:55:30 2016 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="igv_tools" version="2.3.32">
+        <repository changeset_revision="3c087cee3b8f" name="package_igvtools_2_3_32" owner="geert-vandeweyer" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>