Mercurial > repos > charles-bernard > cytosine_report_to_bedgraph
changeset 0:b03e31cab4a6 draft
Uploaded
author | charles-bernard |
---|---|
date | Mon, 14 Nov 2016 04:55:30 -0500 |
parents | |
children | 03eefc41396e |
files | cytosine_report_to_bedgraph/.shed.yml cytosine_report_to_bedgraph/bismark2bedgraph.awk cytosine_report_to_bedgraph/bismark2bedgraph.sh cytosine_report_to_bedgraph/cytosine_report_to_bedgraph.xml cytosine_report_to_bedgraph/cytosine_report_to_bedgraph_wrapper.py cytosine_report_to_bedgraph/tool-data/igv_genomes_chr_sizes.loc.sample cytosine_report_to_bedgraph/tool_data_table_conf.xml.sample cytosine_report_to_bedgraph/tool_dependencies.xml |
diffstat | 8 files changed, 352 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/.shed.yml Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,14 @@ +categories: +- Convert Formats +description: Converts genome-wide cytosine methylation report to Bedgraphs +long_description: | + This tool takes as input a genome-wide cytosine methylation report (generated by the tool Bismark Meth. Extractor) and converts it into a bedGraph for each cytosine context (CpG, CHG and CHH). + These bedGraphs display, for a given context, the ratio of methylation of each covered cytosine in the genome. + + It also produces a bedGraph displaying the coverage count of each cytosine in the genome (non-covered cytosine are ignored). + + The tool outputs offer the possibility to vizualise the methylation signal of covered cytosines thanks to softwares like IGV (Integrative Genomics Viewer). + In this respect, the tool can optionally generate a tdf binary file (Tiled Data Format) from each converted bedGraph. Tdf format is indeed better handled by IGV than bedGraph. +name: bismark +owner: charles-bernard +type: unrestricted \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/bismark2bedgraph.awk Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,29 @@ +#!/usr/bin/awk + +BEGIN { + FS = "\t"; +} + +{ + if ( $6 ~ context && ($4 > 0 || $5 > 0) ) { + + chr_name = $1; + chr_pos = $2; + strand = $3; + c_meth_count = $4; + c_unmeth_count = $5; + + if ( coverage == "true" ) { + nb_reads = c_meth_count + c_unmeth_count + printf("%s\t%s\t%s\t%s\n", chr_name, chr_pos, chr_pos, nb_reads) + } else { + if ( strand == "-") { + s = "-"; + } else { + s = ""; + } + meth_ratio = c_meth_count / (c_meth_count + c_unmeth_count); + printf("%s\t%s\t%s\t%s%s\n", chr_name, chr_pos, chr_pos, s, meth_ratio) + } + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/bismark2bedgraph.sh Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,86 @@ +#!/bin/bash + +# read the options +TEMP=`getopt -o e::i:co: --long epi::,infile_cov:,context,output_dir:,tool_dir:,tdf,igv_genome: -n 'report' -- "$@"` +eval set -- "$TEMP" + +# extract options and their arguments into variables. +while true ; do + case "$1" in + -e | --epi ) + case "$2" in + "" ) epi="current_job"; shift 2 ;; + *) epi=$2; shift 2 ;; + esac ;; + -i | --infile_cov ) + case "$2" in + *) infile_cov=$2; shift 2 ;; + esac ;; + -c | --context ) + context=true; shift ;; + --tdf ) + tdf=true; shift ;; + --igv_genome ) + case "$2" in + *) igv_genome=$2; shift 2 ;; + esac ;; + -o | --output_dir ) + case "$2" in + *) output_dir=$2; shift 2 ;; + esac ;; + --tool_dir ) + case "$2" in + *) tool_dir=$2; shift 2 ;; + esac ;; + -- ) shift; break ;; + * ) echo "Internal error!"; exit 1 ;; + esac +done + +# do something with the variables -- in this case the lamest possible one :-) +echo "epi = $epi" +echo "infile_cov = $infile_cov" +echo "output_dir = $output_dir" +echo "context = $context" +echo "tool_dir = $tool_dir" +echo "tdf = $tdf" +echo "igv_genome = $igv_genome" + +#IGV_path +IGV_path="/users/biocomp/chbernar/galaxy_testing/database/dependencies/igvtools/2.3.32/geert-vandeweyer/package_igvtools_2_3_32/3c087cee3b8f/bin" + +# define outputs according to options +if [[ "$context" = true ]]; then + context_list=("CG" "CHG" "CHH") + output_types=("CG" "CHG" "CHH" "coverage") + bedgraph_list=("$output_dir""/""$epi""_CpG.bedgraph" "$output_dir""/""$epi""_CHG.bedgraph" "$output_dir""/""$epi""_CHH.bedgraph" "$output_dir""/""$epi""_coverage.bedgraph") + tdf_list=("$output_dir""/""$epi""_CpG.tdf" "$output_dir""/""$epi""_CHG.tdf" "$output_dir""/""$epi""_CHH.tdf" "$output_dir""/""$epi""_coverage.tdf") + n="4" +else + context_list=(".*") + output_types=("CXX" "coverage") + bedgraph_list=("$output_dir""/""$epi""_CXX.bedgraph" "$output_dir""/""$epi""_coverage.bedgraph") + tdf_list=("$output_dir""/""$epi""_CXX.tdf" "$output_dir""/""$epi""_coverage.tdf") + n="2" +fi + +# process +for (( i=0; i<$n; i++)); do + printf "________________________________________________________________________\n" + printf "Processing %s\n" ${output_types[$i]} + printf "... Converting Cytosine Report to Bedgraph\n" + if (( i < n - 1 )); then + #if not coverage: + #printf "track type=bedGraph name=%s Coverage description=%s Coverage\n" "$epi""_""${context_list[$i]}" "$epi""_""${context_list[$i]}" > "${bedgraph_list[$i]}" + printf "#<Chr>\t<Start>\t<End>\t<Strand;Meth_ratio>\n" > "${bedgraph_list[$i]}" + awk -v context="${context_list[$i]}" -v coverage="false" -f "$tool_dir"/bismark2bedgraph.awk $infile_cov >> "${bedgraph_list[$i]}" + else + #printf "track type=bedGraph name=%s Coverage description=%s Coverage\n" "$epi""_""${context_list[$i]}" "$epi""_""${context_list[$i]}" > "${bedgraph_list[$i]}" + printf "#<Chr>\t<Start>\t<End>\t<Coverage>\n" > "${bedgraph_list[$i]}" + awk -v context="${context_list[$i]}" -v coverage="true" -f "$tool_dir"/bismark2bedgraph.awk $infile_cov >> "${bedgraph_list[$i]}" + fi + if [[ "$tdf" = true ]]; then + printf "... Converting Bedgraph to Tdf\n" + "$IGV_path""/"igvtools toTDF "${bedgraph_list[$i]}" "${tdf_list[$i]}" "$igv_genome" > stdout_file + fi +done \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/cytosine_report_to_bedgraph.xml Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,112 @@ +<tool id="cytosine_report_to_bedgraph" name="Cytosine Report To Bedgraphs" version="0.0.0"> + + <description>Converts genome-wide cytosine methylation report to Bedgraphs</description> + + <requirements> + <requirement type="package" version="2.3.32">igvtools</requirement> + </requirements> + + <command interpreter="python"> +<![CDATA[ + cytosine_report_to_bedgraph_wrapper.py + + --tool_dir "$__tool_directory__" + + --cytosine_report "$cytosine_report" + + --coverage_bedgraph "$coverage_bedgraph" + #if $tdf['tdf_selector']: + --tdf + --igv_genome "${tdf.built_in_igv_genomes_chr_sizes.fields.path}" + --coverage_tdf "$coverage_tdf" + #end if + + #if $context: + --context + --CpG_bedgraph "$CpG_bedgraph" + --CHG_bedgraph "$CHG_bedgraph" + --CHH_bedgraph "$CHH_bedgraph" + #if $tdf['tdf_selector']: + --CpG_tdf "$CpG_tdf" + --CHG_tdf "$CHG_tdf" + --CHH_tdf "$CHH_tdf" + #end if + #else + --CXX "$CXX_bedgraph" + #if $tdf['tdf_selector']: + --CXX "$CXX_tdf" + #end if + #end if + + --log "$log_report" +]]> + </command> + <inputs> + <param name="cytosine_report" type="data" format="txt" label="Submit a genome-wide cytosine methylation report" optional="False" /> + <param name="context" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Create a bedGraph for each cytosine methylation context?" /> + <conditional name="tdf"> + <param name="tdf_selector" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a tdf file from each resulting bedGraph?" /> + <when value="true"> + <param name="built_in_igv_genomes_chr_sizes" type="select" label="Select Reference Genome (igv genomes chr.sizes)" help="If your genome of interest is not listed, contact your Galaxy admin"> + <options from_data_table="igv_genomes_chr_sizes"> + <filter type="sort_by" column="name"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="false"></when> + </conditional> + </inputs> + + <outputs> + <data name="log_report" format="txt" label="${tool.name} on ${on_string}: Log Report" /> + <data name="CpG_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CpG context"> + <filter>context is True</filter> + </data> + <data name="CHG_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CHG context"> + <filter>context is True</filter> + </data> + <data name="CHH_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CHH context"> + <filter>context is True</filter> + </data> + <data name="CXX_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: CXX"> + <filter>context is False</filter> + </data> + <data name="coverage_bedgraph" format="bedgraph" label="${tool.name} on ${on_string}: Coverage" /> + <data name="CpG_tdf" format="tdf" label="${tool.name} on ${on_string}: CpG context (tdf)"> + <filter>tdf['tdf_selector'] is True and context is True</filter> + </data> + <data name="CHG_tdf" format="tdf" label="${tool.name} on ${on_string}: CHG context (tdf)"> + <filter>tdf['tdf_selector'] is True and context is True</filter> + </data> + <data name="CHH_tdf" format="tdf" label="${tool.name} on ${on_string}: CHH context (tdf)"> + <filter>tdf['tdf_selector'] is True and context is True</filter> + </data> + <data name="CXX_tdf" format="tdf" label="${tool.name} on ${on_string}: CXX (tdf)"> + <filter>context is False and tdf['tdf_selector'] is True</filter> + </data> + <data name="coverage_tdf" format="tdf" label="${tool.name} on ${on_string}: Coverage (tdf)" > + <filter>tdf['tdf_selector'] is True</filter> + </data> + </outputs> + + <tests></tests> + + <help> +<![CDATA[ + +**What it does** + + | This tool takes as input a genome-wide cytosine methylation report (generated by the tool *Bismark Meth. Extractor*) and converts it into a bedGraph for each cytosine context (CpG, CHG and CHH). + | These bedGraphs display, for a given context, the ratio of methylation of each covered cytosine in the genome. + | + | It also produces a bedGraph displaying the coverage count of each cytosine in the genome (non-covered cytosine are ignored). + | + + .. class:: infomark + + | The tool outputs offer the possibility to vizualise the methylation signal of covered cytosines thanks to softwares like IGV (*Integrative Genomics Viewer*). + | In this respect, the tool can optionally generate a tdf binary file (*Tiled Data Format*) from each converted bedGraph. Tdf format is indeed better handled by IGV than bedGraph. +]]> + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/cytosine_report_to_bedgraph_wrapper.py Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import logging +import os +import shutil +import subprocess +import sys +import tempfile +from glob import glob +from optparse import OptionParser + +def __main__(): + parser = OptionParser() + parser.add_option("--tool_dir", dest = 'tool_dir', action = 'store', nargs = 1, metavar = 'tool_dir', type = "str") + parser.add_option("--cytosine_report", dest='cytosine_report', action='store', nargs=1, metavar='cytosine_report', type="str") + parser.add_option("--context", action="store_true") + parser.add_option("--CpG_bedgraph", dest='CpG_bedgraph', action="store", nargs = 1, metavar = 'CpG_bedgraph', type='str') + parser.add_option("--CHH_bedgraph", dest='CHH_bedgraph', action="store", nargs=1, metavar='CHH_bedgraph', type='str') + parser.add_option("--CHG_bedgraph", dest='CHG_bedgraph', action="store", nargs=1, metavar='CHG_bedgraph', type='str') + parser.add_option("--CXX_bedgraph", dest='CXX_bedgraph', action="store", nargs=1, metavar='CXX_bedgraph', type='str') + parser.add_option("--coverage_bedgraph", dest='coverage_bedgraph', action="store", nargs=1, metavar='coverage_bedgraph', type='str') + parser.add_option("--tdf", action="store_true") + parser.add_option("--igv_genome", dest='igv_genome', action='store', nargs=1, metavar='igv_genome', type='str') + parser.add_option("--CpG_tdf", dest='CpG_tdf', action="store", nargs=1, metavar='CpG_tdf', type='str') + parser.add_option("--CHH_tdf", dest='CHH_tdf', action="store", nargs=1, metavar='CHH_tdf', type='str') + parser.add_option("--CHG_tdf", dest='CHG_tdf', action="store", nargs=1, metavar='CHG_tdf', type='str') + parser.add_option("--CXX_tdf", dest='CXX_tdf', action="store", nargs=1, metavar='CXX_tdf', type='str') + parser.add_option("--coverage_tdf", dest='coverage_tdf', action="store", nargs=1, metavar='coverage_tdf', type='str') + parser.add_option("--log", dest='log_report', action="store", nargs=1, metavar='log_report') + + options, args = parser.parse_args() + + if options.log_report: + logging.basicConfig(level=logging.INFO, filename=options.log_report, filemode="a+", format='%(message)s') + else: + logging.basicConfig(level=logging.INFO, filename='galaxy_log_report.log', filemode="a+", format='%(message)s') + logging.info('_______________________________________________________________\n') + logging.info('List of options: %s\n' % options) + + script_path=os.path.join(options.tool_dir, 'bismark2bedgraph.sh') + tmp_dir = tempfile.mkdtemp(prefix='tmp') + + if options.context: + context_option = '-c' + else: + context_option = '' + + if options.tdf: + tdf_option = '--tdf' + genome_option = '--igv_genome' + genome = options.igv_genome + else: + tdf_option = '' + genome_option = '' + genome = '' + + logging.info('_______________________________________________________________\n') + logging.info('COMMAND:\n') + logging.info("bash %s -i %s %s %s %s %s -e current_job -o %s\n" % (script_path, options.cytosine_report, context_option, tdf_option, genome_option, genome, tmp_dir)) + + proc = subprocess.Popen(['bash', script_path, '--tool_dir', options.tool_dir, '-i', options.cytosine_report,\ + context_option, tdf_option, genome_option, genome, '-e', 'current_job', '-o', tmp_dir], \ + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc_out, proc_err = proc.communicate() + + if proc_out: + logging.info('_______________________________________________________________\n') + logging.info('Tool STDOUT:\n') + logging.info(proc_out) + if proc_err: + logging.info('_______________________________________________________________\n') + logging.info('ERROR:\n') + msg = 'The tool crashed with the following error:\n%s\n' % proc_err + logging.critical(msg) + sys.exit(msg) + + if options.context: + shutil.move( glob(os.path.join(tmp_dir, '*_CpG.bedgraph'))[0], options.CpG_bedgraph ) + shutil.move( glob(os.path.join(tmp_dir, '*_CHG.bedgraph'))[0], options.CHG_bedgraph ) + shutil.move( glob(os.path.join(tmp_dir, '*_CHH.bedgraph'))[0], options.CHH_bedgraph ) + if options.tdf: + shutil.move(glob(os.path.join(tmp_dir, '*_CpG.tdf'))[0], options.CpG_tdf) + shutil.move(glob(os.path.join(tmp_dir, '*_CHG.tdf'))[0], options.CHG_tdf) + shutil.move(glob(os.path.join(tmp_dir, '*_CHH.tdf'))[0], options.CHH_tdf) + else: + shutil.move(glob(os.path.join(tmp_dir, '*_CXX.bedgraph'))[0], options.CXX_bedgraph) + if options.tdf: + shutil.move(glob(os.path.join(tmp_dir, '*_CXX.tdf'))[0], options.CXX_tdf) + shutil.move(glob(os.path.join(tmp_dir, '*_coverage.bedgraph'))[0], options.coverage_bedgraph) + if options.tdf: + shutil.move(glob(os.path.join(tmp_dir, '*_coverage.tdf'))[0], options.coverage_tdf) + + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +if __name__=="__main__": __main__() \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/tool-data/igv_genomes_chr_sizes.loc.sample Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,2 @@ +#<value> <dbkey> <name> <path> +Arabidopsis_thaliana_TAIR10 Arabidopsis_thaliana_TAIR10 Arabidopsis thaliana: TAIR10 /export/home1/users/biocomp/chbernar/galaxy_testing/database/dependencies/igvtools/2.3.32/geert-vandeweyer/package_igvtools_2_3_32/3c087cee3b8f/bin/genomes/tair10.chrom.sizes \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/tool_data_table_conf.xml.sample Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of all igv genomes chr sizes --> + <table name="igv_genomes_chr_sizes" comment_char="#" allow_duplicate_entries="False"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/igv_genomes_chr_sizes.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cytosine_report_to_bedgraph/tool_dependencies.xml Mon Nov 14 04:55:30 2016 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="igv_tools" version="2.3.32"> + <repository changeset_revision="3c087cee3b8f" name="package_igvtools_2_3_32" owner="geert-vandeweyer" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>