Mercurial > repos > rmarenco > multi_fasta_glimmer_hmm

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_hmm.loc.sample	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,16 @@
+#This file lists the locations of all the trained_dir files
+#under the "trained_dir" directory (a directory that contains a directory
+#for each organism used by glimmer_hmm).
+#This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_id>	<display_name>	<file_path>
+#
+#glimmer_hmm.loc could look something like this:
+#
+#human	Human	/path/to/trained_dir/human
+#celegans	C. elegans	/path/to/trained_dir/Celegans
+#arabidopsis	Arabidopsis	/path/to/trained_dir/arabidopsis
+#rice	Rice	/path/to/trained_dir/rice
+#zebrafish	Zebrafish	/path/to/trained_dir/zebrafish
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerhmm.xml	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,54 @@
+<tool id="multi_fasta_glimmerhmm" name="Multi Fasta GlimmerHMM" version="4.0">
+    <description>Predict ORFs in eukaryotic genomes for Multi Fasta file</description>
+    <!-- Runs the multi_glimmer.py wrapper located next to this tool file.
+         Every substituted path is single-quoted so that locations containing
+         spaces or shell metacharacters reach the script as one argument each. -->
+    <command detect_errors="aggressive"><![CDATA[
+        python '$__tool_directory__/multi_glimmer.py'
+        --multi_fasta '$input'
+        --trained_dir '$trained_specie.fields.path'
+        --output '$output'
+    ]]></command>
+    <inputs>
+        <!-- Multi-FASTA file; the wrapper script runs glimmerhmm once per contig -->
+        <param name="input" type="data" format="fasta" label="Genome Sequence"/>
+        <!-- Trained organisms are read from the glimmer_hmm_trained_dir data table,
+             whose columns are declared as: value, name, path -->
+        <param name="trained_specie" type="select" label="Select a species">
+            <options from_data_table="glimmer_hmm_trained_dir">
+                <!-- Column indices are 0-based: 1 is the display name shown in the
+                     select box (2 would sort on the filesystem path instead) -->
+                <filter type="sort_by" column="1"/>
+                <validator type="no_options" message="No trained directories are available"/>
+            </options>
+        </param>
+    </inputs>
+    <outputs>
+        <!-- Concatenated GFF3 predictions for all contigs -->
+        <data format="gff3" name="output" />
+    </outputs>
+    <help>
+        **What it does**
+
+        GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM).
+        Although the gene finder conforms to the overall mathematical framework of a GHMM,
+        additionally it incorporates splice site models adapted from the GeneSplicer program and a
+        decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the
+        coding and noncoding models. Currently, GlimmerHMM's GHMM structure includes introns of each phase,
+        intergenic regions, and four types of exons (initial, internal, final, and single).
+        A basic user manual is available on the GlimmerHMM website: https://ccb.jhu.edu/software/glimmerhmm/
+
+        **Example**
+
+        Suppose you have the following DNA formatted sequences::
+
+            >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+            cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+            ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+            cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+            cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+            ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+        Running this tool will produce this::
+
+            ##gff-version 3
+            ##sequence-region ConsensusfromCH236920mapping 1 4148552
+            ConsensusfromCH236920mapping  GlimmerHMM  mRNA  1       122     .   +   .   ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1
+            ConsensusfromCH236920mapping  GlimmerHMM  CDS   1       122     .   +   0   ID=ConsensusfromCH236920mapping.cds1.1;
+            ConsensusfromCH236920mapping  GlimmerHMM  mRNA  14066   15205   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2
+            ConsensusfromCH236920mapping  GlimmerHMM  CDS   14066   15034   .   -   0   ID=ConsensusfromCH236920mapping.cds2.1;
+            ConsensusfromCH236920mapping  GlimmerHMM  CDS   15137   15205   .   -   0   ID=ConsensusfromCH236920mapping.cds2.2;
+            ConsensusfromCH236920mapping  GlimmerHMM  mRNA  19910   24210   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene3;Name=ConsensusfromCH236920mapping.path1.gene3
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multi_glimmer.py	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+import argparse
+import os
+import subprocess
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Get a multi-fasta, the trained_dir and the output file as inputs, '
+                                                 'to generate GlimmerHMM gene prediction over all contigs')
+
+    parser.add_argument('--multi_fasta', help='Multi fasta file to run GlimmerHMM on', required=True)
+
+    parser.add_argument('--trained_dir', help='Path to the GlimmerHMM trained_dir', required=True)
+
+    parser.add_argument('--output', help='file to output the result into', required=True)
+
+    args = parser.parse_args()
+
+    multi_fasta = args.multi_fasta
+    trained_dir = args.trained_dir
+    # TODO: Temporary fix for the issue with config.file in human/. Next: GC Content to select the appropriate folder
+    if trained_dir.split('/')[-1] == "human":
+        trained_dir = os.path.join(trained_dir, "Train0-43")
+
+    output_file = args.output
+    temp_contig = "temp_contig"
+
+    def exec_glimmer(contig_file, first_time=False):
+        p = subprocess.Popen(["glimmerhmm", contig_file, trained_dir, "-g"],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        output, errors = p.communicate()
+
+        p.wait()
+        # Process the error if != "Done"
+        if not errors or (errors.split()[0] != "Done"):
+            raise Exception("Error in glimmer: {0}".format(errors))
+        else:
+            sys.stdout.write(errors)
+        # If  not first time, we need to remove the first comments
+        if not first_time:
+            output = "\n".join(output.split("\n")[1:])
+
+        return output
+
+    with open(output_file, 'w+') as o:
+        with open(multi_fasta, 'r') as mf:
+            is_first_time = True
+            for i, line in enumerate(mf):
+                if line[0] == '>':
+                    # If it is the first time we finish to read a contig, we let glimmer add the full comments
+                    # and write into the output the result
+                    if is_first_time is True and i != 0:
+                        o.write(exec_glimmer(temp_contig, first_time=is_first_time))
+                        is_first_time = False
+                    # Else we call glimmer and say this is not the first time (so remove the first comment)
+                    # and dump into the output file the result
+                    elif i > 0:
+                        o.write(exec_glimmer(temp_contig))
+
+                    # Because we are on an indication of a beginning of a sequence, we need to create an empty file
+                    # to dump the line into
+                    with open(temp_contig, 'w+') as tc:
+                        tc.write(line)
+                else:
+                    # We are in the sequence of a contig, so we append the line in the file
+                    with open(temp_contig, 'a+') as tc:
+                        tc.write(line)
+        # The file is terminate, we did read another contig so we need to save this last one
+        o.write(exec_glimmer(temp_contig, first_time=is_first_time))
+
+# Script entry point: run main() only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multi_glimmer.sh	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,50 @@
+#!/bin/sh
+set -e
+
+reference_fasta=$1
+trained_dir=$2
+output=$3
+temp="temp_contig_file"
+
+# Write the glimmerhmm, with the comments
+glimmerHMM_first () {
+    glimmerhmm $1 ${trained_dir} -o ${output} -g
+}
+
+# Write the glimmerhmm output without the comments
+glimmerHMM_without_comments () {
+    glimmerhmm $1 ${trained_dir} -g | tail -n +2 >> ${output}
+}
+
+count=1
+# Loop through the contigs to run glimmer on each
+while read line
+do
+    # Get the content of actual contig
+    #samtools_faidx_show_contig ${reference_fasta} ${contig} > contig_content
+    first_char=$(echo ${line} | cut -c1-1)
+
+    if [ ${first_char} = '>' ]
+    then
+        # If true, it means we have finished reading at least the first contig
+        if [ -f ${temp} ]
+        then
+            if [ ${count} -eq 1 ]
+            then
+                glimmerHMM_first ${temp};
+                count=$((count+1))
+            else
+                glimmerHMM_without_comments ${temp};
+            fi
+        fi
+        echo ${line} > ${temp}
+    else
+        echo ${line} >> ${temp}
+    fi
+done < "${reference_fasta}"
+
+# Still last contig to process
+glimmerHMM_without_comments ${temp};
+
+# Delete the temp_contig_file
+rm ${temp}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.md	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,83 @@
+Galaxy wrapper for GlimmerHMM
+=====================================
+
+This wrapper was rewritten by Rémi Marenco in 2016 to fix issues, improve it and add multi-FASTA handling. It was originally written by Björn Gruening.
+
+This is a wrapper for the command line tool of GlimmerHMM.
+https://ccb.jhu.edu/software/glimmerhmm/
+
+GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
+additionally it incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
+Interpolated Markov Models for the coding and noncoding models.
+Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).
+
+Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879.
+Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
+The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815.
+Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
+Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31.
+
+
+Installation
+============
+
+To install GlimmerHMM, please download it from
+
+ftp://ccb.jhu.edu/pub/software/glimmerhmm
+
+and follow the installation instructions.
+To extract the GlimmerHMM predicted genes, the GFF Parser from Brad Chapman (http://github.com/chapmanb/bcbb/tree/master/gff) was used and is included.
+
+To install the wrapper copy the glimmerHMM folder in the galaxy tools
+folder and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy.
+For example:
+
+```xml
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+```
+
+You also need to provide at least one trained organism by adding it as reference data in Galaxy:
+
+1. Add the *glimmer_hmm_trained_dir* data table to `tool_data_table_conf.xml` in `$GALAXY_ROOT/config/`:
+
+    ```xml
+    <!-- glimmer_hmm trained_dir -->
+    <table name="glimmer_hmm_trained_dir" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/glimmer_hmm.loc" />
+    </table>
+    ```
+
+2. Add the `glimmer_hmm.loc` file referencing your trained organisms to `tool-data`.
+    A sample file, `glimmer_hmm.loc.sample`, is available in the repository to help you configure it properly.
+3. Add your data to the folder chosen in step 2. You can get it from the GlimmerHMM tar, `$GLIMMERHMM/trained_dir`
+
+History
+=======
+
+- v3.0 - Add the Multi Fasta support
+- v2.0 - Update by Rémi Marenco to make it work without having to modify the wrapper + add ability to select the species
+- v0.1 - Initial public release
+
+
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Aug 18 18:50:00 2016 -0400
@@ -0,0 +1,98 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+    <!-- Locations of indexes in the BFAST mapper format -->
+    <table name="bfast_indexes" comment_char="#">
+        <columns>value, dbkey, formats, name, path</columns>
+        <file path="tool-data/bfast_indexes.loc" />
+    </table>
+    <!-- Locations of protein (mega)blast databases -->
+    <table name="blastdb_p" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb_p.loc" />
+    </table>
+    <!-- Locations of indexes in the BWA mapper format -->
+    <table name="bwa_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bwa_index.loc" />
+    </table>
+    <!-- Locations of indexes in the BWA color-space mapper format -->
+    <table name="bwa_indexes_color" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bwa_index_color.loc" />
+    </table>
+    <!-- Locations of MAF files that have been indexed with bx-python -->
+    <table name="indexed_maf_files">
+        <columns>name, value, dbkey, species</columns>
+        <file path="tool-data/maf_index.loc" />
+    </table>
+    <!-- Locations of fasta files appropriate for NGS simulation -->
+    <table name="ngs_sim_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/ngs_sim_fasta.loc" />
+    </table>
+    <!-- Locations of PerM base index files -->
+    <table name="perm_base_indexes" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/perm_base_index.loc" />
+    </table>
+    <!-- Locations of PerM color-space index files -->
+    <table name="perm_color_indexes" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/perm_color_index.loc" />
+    </table>
+    <!-- Location of Picard dict file and other files -->
+    <table name="picard_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/picard_index.loc" />
+    </table>
+    <!-- Location of SRMA dict file and other files -->
+    <table name="srma_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/picard_index.loc" />
+    </table>
+    <!-- Location of Mosaik files -->
+    <table name="mosaik_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/mosaik_index.loc" />
+    </table>
+    <!-- Locations of indexes in the 2bit format -->
+    <table name="twobit" comment_char="#">
+        <columns>value, path</columns>
+        <file path="tool-data/twobit.loc" />
+    </table>
+    <!-- Available IGV builds, loaded from URL -->
+    <table name="igv_broad_genomes" comment_char="#">
+        <columns>name, url, value</columns>
+        <file url="http://igv.broadinstitute.org/genomes/genomes.txt" />
+    </table>
+    <!-- Available liftOver chain file -->
+    <table name="liftOver" comment_char="#">
+        <columns>dbkey, name, value</columns>
+        <file path="tool-data/liftOver.loc" />
+    </table>
+    <!-- iobio bam servers -->
+    <table name="bam_iobio" comment_char="#">
+        <columns>value, name, url</columns>
+        <file path="tool-data/bam_iobio.loc" />
+    </table>
+    <!-- iobio vcf servers -->
+    <table name="vcf_iobio" comment_char="#">
+        <columns>value, name, url</columns>
+        <file path="tool-data/vcf_iobio.loc" />
+    </table>
+    <!-- simple biom servers -->
+    <table name="biom_simple_display" comment_char="#">
+        <columns>value, name, url</columns>
+        <file path="tool-data/biom_simple_display.loc" />
+    </table>
+    <!-- glimmer_hmm trained_dir -->
+    <table name="glimmer_hmm_trained_dir" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/glimmer_hmm.loc" />
+    </table>
+</tables>