changeset 2:5b99943c4627 draft

Span version https://github.com/JetBrains-Research/galaxy-applications/commit/cbbba255d66a4775cc35caf5cb85665396fdcd2a
author jetbrains
date Sun, 18 Nov 2018 08:20:27 -0500
parents dfb1e66235c5
children 4130e95bd6c8
files README.md span.xml span_wrapper.py
diffstat 3 files changed, 220 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/README.md	Thu Nov 15 11:30:01 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-Release version
-===============
-Release is just a `span` snapshot from [https://github.com/JetBrains-Research/galaxy-applications](https://github.com/JetBrains-Research/galaxy-applications)
\ No newline at end of file
--- a/span.xml	Thu Nov 15 11:30:01 2018 -0500
+++ b/span.xml	Sun Nov 18 08:20:27 2018 -0500
@@ -1,8 +1,7 @@
 <tool id="span" name="SPAN" version="0.7.1.4272">
-    <description>ChIP-Seq analysis</description>
+    <description>Semi-supervised Peak Analyzer for ChIP-Seq data</description>
     <requirements>
         <requirement type="package" version="0.7.1.4272">package_span_jar</requirement>
-        <!--<container type="docker">biolabs/span</container>-->
     </requirements>
     <stdio>
         <!-- Wrapper ensures anything other than zero is an error -->
@@ -10,31 +9,58 @@
         <exit_code range=":-1"/>
     </stdio>
     <command interpreter="python">
+#import re
+#set treatment_identifier = re.sub('[^\w\-\.]', '_', str($treatment_file.element_identifier))
+#set genome_identifier = re.sub('[^\w\-\.]', '_', str($genome_file.element_identifier))
+
+#if $control.control_selector
+    #set control_identifier = re.sub('[^\w\-\.]', '_', str($control.control_file.element_identifier))
+#end if
+
 #if str($action.action_selector) == "model"
     #if $control.control_selector
-        span_wrapper.py model with_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${control.control_file}"
+        span_wrapper.py model with_control
+            "${genome_identifier}" "${genome_file}"
+            "${treatment_identifier}" "${treatment_file}"
+            "${bin}" "${action.model_file}"
+            "${control_identifier}" "${control.control_file}"
     #else
-        span_wrapper.py model without_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}"
+        span_wrapper.py model without_control
+            "${genome_identifier}" "${genome_file}"
+            "${treatment_identifier}" "${treatment_file}"
+            "${bin}" "${action.model_file}"
     #end if
 #else
     #if $control.control_selector
-        span_wrapper.py peaks with_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${control.control_file}" "${fdr}" "${gap}" "${action.peaks_file}"
+        span_wrapper.py peaks with_control
+            "${genome_identifier}" "${genome_file}"
+            "${treatment_identifier}" "${treatment_file}"
+            "${bin}" "${action.model_file}"
+            "${control_identifier}" "${control.control_file}"
+            "${action.fdr}" "${action.gap}" "${action.peaks_file}"
     #else
-        span_wrapper.py peaks without_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${fdr}" "${gap}" "${action.peaks_file}"
+        span_wrapper.py peaks without_control
+            "${genome_identifier}" "${genome_file}"
+            "${treatment_identifier}" "${treatment_file}"
+            "${bin}" "${action.model_file}"
+            "${action.fdr}" "${action.gap}" "${action.peaks_file}"
     #end if
 #end if
      </command>
     <inputs>
         <param name="treatment_file" type="data" format="bam" label="Treatment BAM"
-               description="Treatment BAM reads to process"/>
-        <param name="genome" type="data" format="chrom.sizes" label="Genome chrom.sizes"
-               description="Genome build chrom.sizes file"/>
+               description="Treatment BAM reads to process" argument="--treatment"
+               help="Treatment BAM reads to process"/>
+        <param name="genome_file" type="data" format="chrom.sizes" label="Genome chrom.sizes"
+               description="Genome build chrom.sizes file" argument="--chrom.sizes"
+               help="Genome build chrom.sizes file"/>
 
         <conditional name="control">
             <param name="control_selector" type="boolean" label="Control available" value="false"/>
             <when value="true">
                 <param name="control_file" type="data" format="bam" label="Control BAM"
-                       description="Control BAM reads to process"/>
+                       description="Control BAM reads to process" argument="--control"
+                       help="Control BAM reads to process"/>
             </when>
         </conditional>
 
@@ -44,26 +70,107 @@
                 <option value="peaks">Compute SPAN model and produce peaks file</option>
             </param>
             <when value="model">
-                <param name="model_file" type="text" value="model.span" label="Model name"/>
+                <param name="model_file" type="text" value="model.span" label="Model name"
+                       help="Trained model file in binary format, which can be visualized directly in JBR Genome Browser
+                       and used in integrated peak calling pipeline"/>
             </when>
             <when value="peaks">
-                <param name="model_file" type="text" value="model.span" label="Model file name"/>
-                <param name="fdr" size="5" type="float" value="0.0001" label="FDR"/>
-                <param name="gap" size="5" type="integer" value="5" label="GAP"/>
-                <param name="peaks_file" type="text" value="result.peak" label="Peaks file name"/>
+                <param name="model_file" type="text" value="model.span" label="Model file name"
+                       help="Trained model file in binary format, which can be visualized directly in JBR Genome Browser
+                       and used in integrated peak calling pipeline"/>
+                <param name="fdr" size="5" type="float" value="0.0001" label="FDR" argument="--fdr"
+                       help="Minimum FDR cutoff to call significant regions, default value is 1.0E-6.
+                       SPAN reports p- and q- values for the null hypothesis that a given bin is not enriched with a histone modification.
+                       Peaks are formed from a list of truly (in the FDR sense) enriched bins for the analyzed biological condition by thresholding the
+                       Q-value with a cutoff FDR and merging spatially close peaks using GAP option to broad ones. This is equivalent to controlling FDR.
+                       q-values are are calculated from p-values using Benjamini-Hochberg procedure."/>
+                <param name="gap" size="5" type="integer" value="5" label="GAP" argument="--gap"
+                       help="Gap size to merge spatially close peaks. Useful for wide histone modifications.
+                       Default value is 5, i.e. peaks separated by 5*BIN distance or less are merged."/>
+                <param name="peaks_file" type="text" value="result.peak" label="Peaks file name" argument="--peaks"/>
             </when>
         </conditional>
 
-        <param name="bin" size="5" type="integer" value="200" label="Bin size"/>
+        <param name="bin" size="5" type="integer" value="200" label="Bin size" argument="--bin"
+               help="Peak analysis is performed on read coverage tiled into consequent bins, with size being configurable.
+               Default value is 200bp, approximately the length of one nucleosome."/>
     </inputs>
     <outputs>
-        <data name="${action.model_file}" format="span" label="SPAN model file"/>
-        <data name="${action.peaks_file}" format="bed" label="SPAN peaks file">
+        <data name="SPAN model file" format="span" from_work_dir="*.span" label="SPAN model file ${action.model_file} on ${on_string}"/>
+        <data name="SPAN peaks file" format="bed" from_work_dir="*.peak" label="SPAN peaks file ${action.peaks_file} on ${on_string}">
             <filter>action['action_selector'] == "peaks"</filter>
         </data>
+        <data name="SPAN log file" format="txt" from_work_dir="*.log" label="SPAN log file on ${on_string}"/>
     </outputs>
-    <help>
-        SPAN Semi-supervised Peak Analyzer is a tool for analyzing ChIP-seq data.
-        Details: http://artyomovlab.wustl.edu/aging/span.html
-    </help>
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+SPAN Semi-supervised Peak Analyzer is a tool for analyzing ChIP-seq data.
+
+-----
+
+**Inputs**
+
+*-t, --treatment <Path>*        **Required.** ChIP-seq treatment file. bam, bed or .bed.gz file; If multiple files are given, treated as replicates.
+
+*--chrom.sizes, --cs <Path>*    **Required.** Chromosome sizes path, can be downloaded at http://hgdownload.cse.ucsc.edu/goldenPath/<build>/bigZips/<build>.chrom.sizes
+
+*-c, --control <Path>*          Control file. bam, bed or bed.gz file; Single control file or separate file per each treatment file required.
+
+*--fragment <Integer>*          Fragment size, read length if not given
+
+*-b, --bin <Integer>*           Bin size
+
+*-f, --fdr <Double>*            Fdr value
+
+*-g, --gap <Integer>*           Gap size to merge peaks
+
+*-p, --peaks <Path>*            Path to result peaks file in ENCODE broadPeak (BED 6+3) format
+
+
+-----
+
+**Outputs**
+
+This tool produces a SPAN binary model file and/or peaks in ENCODE broadPeak (BED 6+3) format.
+
+Peak file columns contain the following data:
+
+* **1st**: chromosome name
+* **2nd**: start position of peak
+* **3rd**: end position of peak
+* **4th**: name of peak
+* **5th**: integer score for display in genome browser (e.g. UCSC)
+* **6th**: strand, either "." (=no strand) or "+" or "-"
+* **7th**: fold-change
+* **8th**: -log10pvalue
+* **9th**: -log10qvalue
+
+-----
+
+**SPAN workflow**
+
+* Convert raw reads to tags using *FRAGMENT* parameter.
+* Compute coverage for all genome tiled into bins of *BIN* base pairs.
+* Fit 3-state hidden Markov model that classifies bins as ZERO states with no coverage, LOW states of non-specific binding, and HIGH states of the specific binding.
+* Compute posterior HIGH state probability of each bin.
+* Trained model is saved into *.span* binary format.
+* Peaks are computed using trained model and *FDR* and *GAP* parameters.
+
+------
+
+**Citation**
+
+If you use this tool in Galaxy, please cite XXX, et al. *In preparation.*
+
+-----
+
+**More Information**
+
+* Project home page: https://research.jetbrains.org/groups/biolabs/tools/span-peak-analyzer
+* Study cases: https://artyomovlab.wustl.edu/aging
+
+]]></help>
 </tool>
--- a/span_wrapper.py	Thu Nov 15 11:30:01 2018 -0500
+++ b/span_wrapper.py	Sun Nov 18 08:20:27 2018 -0500
@@ -1,67 +1,135 @@
 #!/usr/bin/env python
 
 import os
+import shutil
+import subprocess
 import sys
-import subprocess
+
 argv = sys.argv[1:]
 print 'Arguments {0}'.format(argv)
 
 SPAN_JAR = os.environ.get("SPAN_JAR")
-# span.jar from Docker container
-# SPAN_JAR = "/root/span.jar"
 print 'Using SPAN Peak Analyzer distributive file {0}'.format(SPAN_JAR)
 
-# #if $action.action_selector
-#     #if str($control.control_selector) == "with_control"
-#         span_wrapper.py model with_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${control.control_file}"
+# #if str($action.action_selector) == "model"
+#     #if $control.control_selector
+#         span_wrapper.py model with_control
+#             "${genome_identifier}" "${genome_file}"
+#             "${treatment_identifier}" "${treatment_file}"
+#             "${bin}" "${action.model_file}"
+#             "${control_identifier}" "${control.control_file}"
 #     #else
-#         span_wrapper.py model without_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}"
+#         span_wrapper.py model without_control
+#             "${genome_identifier}" "${genome_file}"
+#             "${treatment_identifier}" "${treatment_file}"
+#             "${bin}" "${action.model_file}"
 #     #end if
 # #else
 #     #if $control.control_selector
-#         span_wrapper.py peaks with_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${control.control_file}" "${fdr}" "${gap}" "${action.peaks_file}"
+#         span_wrapper.py peaks with_control
+#             "${genome_identifier}" "${genome_file}"
+#             "${treatment_identifier}" "${treatment_file}"
+#             "${bin}" "${action.model_file}"
+#             "${control_identifier}" "${control.control_file}"
+#             "${fdr}" "${gap}" "${action.peaks_file}"
 #     #else
-#         span_wrapper.py peaks without_control "${genome}" "${treatment_file}" "${bin}" "${action.model_file}" "${fdr}" "${gap}" "${action.peaks_file}"
+#         span_wrapper.py peaks with_control
+#             "${genome_identifier}" "${genome_file}"
+#             "${treatment_identifier}" "${treatment_file}"
+#             "${bin}" "${action.model_file}"
+#             "${fdr}" "${gap}" "${action.peaks_file}"
 #     #end if
 # #end if
 
-# See http://artyomovlab.wustl.edu/aging/span.html for command line options
+# See https://research.jetbrains.org/groups/biolabs/tools/span-peak-analyzer for command line options
 action = argv[0]
 control = argv[1]
+
+working_dir = os.path.abspath('.')
+print 'WORKING DIRECTORY: {}'.format(working_dir)
+
+
+def link(name, f):
+    """ SPAN uses file extension to detect input type, so original names are necessary, instead of Galaxy .dat files"""
+    result = os.path.join(working_dir, name)
+    os.symlink(f, result)
+    return result
+
+
 if action == 'model':
     if control == 'with_control':
-        (chrom_sizes, treatment_file, bin, model_file, control_file) = argv[2:]
+        (chrom_sizes, chrom_sizes_file,
+         treatment, treatment_file,
+         bin, model_file,
+         control, control_file) = argv[2:]
         cmd = 'java -jar {} analyze --chrom.sizes {} --treatment {} --control {} --bin {}'.format(
-            SPAN_JAR, chrom_sizes, treatment_file, control_file, bin
+            SPAN_JAR,
+            link(chrom_sizes, chrom_sizes_file),
+            link(treatment, treatment_file),
+            link(control, control_file),
+            bin
         )
-        print "MODEL FILE" + model_file
     elif control == 'without_control':
-        (chrom_sizes, treatment_file, bin, model_file) = argv[2:]
+        (chrom_sizes, chrom_sizes_file,
+         treatment, treatment_file,
+         bin, model_file) = argv[2:]
         cmd = 'java -jar {} analyze --chrom.sizes {} --treatment {} --bin {}'.format(
-            SPAN_JAR, argv[2], argv[3], argv[4]
+            SPAN_JAR,
+            link(chrom_sizes, chrom_sizes_file),
+            link(treatment, treatment_file),
+            bin
         )
-        print "MODEL FILE" + model_file
     else:
         raise Exception("Unknown control option {}".format(control))
 
 elif action == "peaks":
     if control == 'with_control':
-        (chrom_sizes, treatment_file, bin, model_file, control_file, fdr, gap, peaks_file) = argv[2:]
+        (chrom_sizes, chrom_sizes_file,
+         treatment, treatment_file,
+         bin, model_file,
+         control, control_file,
+         fdr, gap, peaks_file) = argv[2:]
         cmd = 'java -jar {} analyze --chrom.sizes {} --treatment {} --control {} --bin {} --fdr {} --gap {} --peaks {}'.format(
-            SPAN_JAR, chrom_sizes, treatment_file, control_file, bin, fdr, gap, peaks_file
+            SPAN_JAR,
+            link(chrom_sizes, chrom_sizes_file),
+            link(treatment, treatment_file),
+            link(control, control_file),
+            bin, fdr, gap,
+            os.path.join(working_dir, peaks_file)
         )
-        print "MODEL FILE" + model_file
     elif control == 'without_control':
-        (chrom_sizes, treatment_file, bin, model_file, fdr, gap, peaks_file) = argv[2:]
+        (chrom_sizes, chrom_sizes_file,
+         treatment, treatment_file,
+         bin, model_file,
+         fdr, gap, peaks_file) = argv[2:]
         cmd = 'java -jar {} analyze --chrom.sizes {} --treatment {} --bin {} --fdr {} --gap {} --peaks {}'.format(
-            SPAN_JAR, chrom_sizes, treatment_file, bin, fdr, gap, peaks_file
+            SPAN_JAR,
+            link(chrom_sizes, chrom_sizes_file),
+            link(treatment, treatment_file),
+            bin, fdr, gap,
+            os.path.join(working_dir, peaks_file)
         )
-        print "MODEL FILE" + model_file
     else:
         raise Exception("Unknown control option {}".format(control))
 else:
     raise Exception("Unknown action command {}".format(action))
 
 
-print 'Launching SPAN: {0}'.format(cmd)
+print 'Launching SPAN: {}'.format(cmd)
+print 'Model file: {}'.format(model_file)
+try:
+    print 'Peaks file: {}'.format(peaks_file)
+except NameError:
+    pass
+
 subprocess.check_call(cmd, cwd=None, shell=True)
+
+# Move model to the the working dir with given name
+fit_dir = os.path.join(working_dir, 'fit')
+model_original = os.path.join(fit_dir, os.listdir(fit_dir)[0])
+shutil.move(model_original, os.path.join(working_dir, model_file))
+
+# Move log file
+logs_dir = os.path.join(working_dir, 'logs')
+log_original = os.path.join(logs_dir, os.listdir(logs_dir)[0])
+shutil.move(log_original, os.path.join(working_dir, "span.log"))