Mercurial > repos > rmarenco > multi_fasta_glimmer_hmm
changeset 0:0ddb5ee32ff6 draft default tip
planemo upload for repository https://github.com/remimarenco/multi_fasta_glimmerhmm.git commit 28bd73b26b50165eded1d9ba995979acdf005ad1-dirty
author | rmarenco |
---|---|
date | Thu, 18 Aug 2016 18:50:00 -0400 |
parents | |
children | |
files | glimmer_hmm.loc.sample glimmerhmm.xml multi_glimmer.py multi_glimmer.sh readme.md tool_data_table_conf.xml.sample |
diffstat | 6 files changed, 376 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_hmm.loc.sample Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,16 @@ +#This file lists the locations of all the trained_dir files +#under the "trained_dir" directory (a directory that contains a directory +#for each organism used by glimmer_hmm). +#This file has the format (white space characters are +#TAB characters): +# +#<unique_id> <display_name> <file_path> +# +#glimmer_hmm.loc could look something like this: +# +#human Human /path/to/trained_dir/human +#celegans Celegan /path/to/trained_dir/Celegans +#arabidopsis Arabidopsis /path/to/trained_dir/arabidopsis +#rice Rice /path/to/trained_dir/rice +#zebrafish Zebrafish /path/to/trained_dir/zebrafish +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmerhmm.xml Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,54 @@ +<tool id="multi_fasta_glimmerhmm" name="Multi Fasta GlimmerHMM" version="4.0"> + <description>Predict ORFs in eukaryotic genomes for Multi Fasta file</description> + <command detect_errors="aggressive"><![CDATA[ + python $__tool_directory__/multi_glimmer.py + --multi_fasta $input + --trained_dir $trained_specie.fields.path + --output $output + ]]></command> + <inputs> + <param name="input" type="data" format="fasta" label="Genome Sequence"/> + <param name="trained_specie" type="select" label="Select a specie"> + <options from_data_table="glimmer_hmm_trained_dir"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available"/> + </options> + </param> + </inputs> + <outputs> + <data format="gff3" name="output" /> + </outputs> + <help> + **What it does** + + GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM). + Although the gene finder conforms to the overall mathematical framework of a GHMM, + additionally it incorporates splice site models adapted from the GeneSplicer program and a + decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the + coding and noncoding models . Currently, GlimmerHMM's GHMM structure includes introns of each phase, + intergenic regions, and four types of exons (initial, internal, final, and single). + A basic user manual can be consulted here. 
+ + **Example** + + Suppose you have the following DNA formatted sequences:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg + ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag + cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc + cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc + ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg + + Running this tool will produce this:: + + ##gff-version 3 + ##sequence-region ConsensusfromCH236920mapping 1 4148552 + ConsensusfromCH236920mapping GlimmerHMM mRNA 1 122 . + . ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1 + ConsensusfromCH236920mapping GlimmerHMM CDS 1 122 . + 0 ID=ConsensusfromCH236920mapping.cds1.1; + ConsensusfromCH236920mapping GlimmerHMM mRNA 14066 15205 . - . ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2 + ConsensusfromCH236920mapping GlimmerHMM CDS 14066 15034 . - 0 ID=ConsensusfromCH236920mapping.cds2.1; + ConsensusfromCH236920mapping GlimmerHMM CDS 15137 15205 . - 0 ID=ConsensusfromCH236920mapping.cds2.2; + ConsensusfromCH236920mapping GlimmerHMM mRNA 19910 24210 . - . ID=ConsensusfromCH236920mapping.path1.gene3;Name=ConsensusfromCH236920mapping.path1.gene3 + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_glimmer.py Thu Aug 18 18:50:00 2016 -0400
#!/usr/bin/python
# -*- coding: utf8 -*-
"""Run GlimmerHMM on every contig of a multi-FASTA file.

The input file is split on FASTA header lines (lines starting with ``>``).
Each contig is written to a temporary file, ``glimmerhmm`` is run on it with
the ``-g`` flag, and the results are concatenated into a single output file.
Only the result of the first contig keeps its first line; that line is
removed from the result of every following contig.
"""

import argparse
import os
import subprocess
import sys
import tempfile


def _resolve_trained_dir(trained_dir):
    """Return the directory that is actually handed to ``glimmerhmm``.

    TODO: Temporary fix for the issue with config.file in human/.
    Next: GC Content to select the appropriate folder.

    :param trained_dir: path given on the command line
    :return: ``<trained_dir>/Train0-43`` when the last path component is
        ``human`` (a trailing slash is tolerated), else ``trained_dir``
    """
    if os.path.basename(os.path.normpath(trained_dir)) == "human":
        return os.path.join(trained_dir, "Train0-43")
    return trained_dir


def _run_glimmer(contig_file, trained_dir, first_time=False):
    """Run ``glimmerhmm`` on one contig file and return what it printed.

    :param contig_file: path of a FASTA file holding a single contig
    :param trained_dir: trained directory passed to ``glimmerhmm``
    :param first_time: when False, the first line of the result is dropped
    :return: the standard output of ``glimmerhmm`` as text
    :raises Exception: if the first word written on stderr is not ``Done``
    """
    # universal_newlines=True makes the pipes return text on Python 2 and 3;
    # without it Python 3 yields bytes and the "Done" comparison never matches.
    process = subprocess.Popen(["glimmerhmm", contig_file, trained_dir, "-g"],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               universal_newlines=True)
    # communicate() already waits for the process to terminate.
    output, errors = process.communicate()

    # Success is detected from stderr: its first word has to be "Done".
    # Splitting once also covers an empty or whitespace-only stderr.
    status = errors.split()
    if not status or status[0] != "Done":
        raise Exception("Error in glimmer: {0}".format(errors))
    sys.stdout.write(errors)

    # If not first time, we need to remove the first comments
    if not first_time:
        output = "\n".join(output.split("\n")[1:])
    return output


def main():
    """Parse the command line and run GlimmerHMM over all contigs.

    Command line options (all required): ``--multi_fasta``, ``--trained_dir``
    and ``--output``.

    An input without any contig produces an empty output file.

    :raises ValueError: if sequence data appears before the first header
    :raises Exception: if ``glimmerhmm`` does not report ``Done``
    """
    parser = argparse.ArgumentParser(description='Get a multi-fasta, the trained_dir and the output file as inputs, '
                                                 'to generate GlimmerHMM gene prediction over all contigs')
    parser.add_argument('--multi_fasta', help='Multi fasta file to run GlimmerHMM on', required=True)
    parser.add_argument('--trained_dir', help='Path to the GlimmerHMM trained_dir', required=True)
    parser.add_argument('--output', help='file to output the result into', required=True)
    args = parser.parse_args()

    trained_dir = _resolve_trained_dir(args.trained_dir)

    # A uniquely named scratch file in the working directory: concurrent runs
    # cannot clash and a leftover file from an earlier run is never picked up.
    descriptor, temp_contig = tempfile.mkstemp(prefix="temp_contig_", dir=os.getcwd())
    os.close(descriptor)
    contig = None  # open handle on the contig being collected, if any
    try:
        with open(args.output, 'w') as out, open(args.multi_fasta, 'r') as multi_fasta:
            is_first_time = True
            for line in multi_fasta:
                if line.startswith('>'):
                    # A new header closes the previous contig: predict on it
                    # before the scratch file is overwritten.
                    if contig is not None:
                        contig.close()
                        out.write(_run_glimmer(temp_contig, trained_dir, first_time=is_first_time))
                        is_first_time = False
                    contig = open(temp_contig, 'w')
                    contig.write(line)
                elif contig is not None:
                    # We are in the sequence of a contig, keep collecting it
                    contig.write(line)
                elif line.strip():
                    raise ValueError("Sequence data found before the first FASTA header "
                                     "in {0}".format(args.multi_fasta))
            # The file is finished; the last contig still has to be processed
            if contig is not None:
                contig.close()
                out.write(_run_glimmer(temp_contig, trained_dir, first_time=is_first_time))
    finally:
        if contig is not None:
            contig.close()
        os.remove(temp_contig)


if __name__ == "__main__":
    main()
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_glimmer.sh Thu Aug 18 18:50:00 2016 -0400
#!/bin/sh
#
# Run glimmerhmm on every contig of a multi-FASTA file and merge the results.
#
# Usage: multi_glimmer.sh <reference_fasta> <trained_dir> <output>
#
# The input is split on header lines (lines starting with '>'). The result
# of the first contig is written in full; the first line of the result of
# every following contig is dropped before it is appended to <output>.
set -e

if [ "$#" -ne 3 ]
then
    echo "Usage: $0 <reference_fasta> <trained_dir> <output>" >&2
    exit 1
fi

reference_fasta=$1
trained_dir=$2
output=$3

# Uniquely named scratch files: a leftover from an earlier run can no longer
# be mistaken for a contig, and both are removed however the script exits.
temp=$(mktemp ./temp_contig_file.XXXXXX)
raw=$(mktemp ./temp_glimmer_out.XXXXXX)
trap 'rm -f "${temp}" "${raw}"' EXIT

# Start from an empty output so nothing is appended to a previous result
: > "${output}"

# Write the glimmerhmm, with the comments
glimmerHMM_first () {
    glimmerhmm "$1" "${trained_dir}" -o "${output}" -g
}

# Write the glimmerhmm output without the comments
glimmerHMM_without_comments () {
    # Go through a file instead of a pipe: in a pipeline only the status of
    # tail is seen, so a failing glimmerhmm would not stop the script.
    glimmerhmm "$1" "${trained_dir}" -g > "${raw}"
    tail -n +2 "${raw}" >> "${output}"
}

# Run glimmerhmm on the contig currently held in ${temp}
process_contig () {
    if [ "${count}" -eq 1 ]
    then
        glimmerHMM_first "${temp}"
    else
        glimmerHMM_without_comments "${temp}"
    fi
    count=$((count+1))
}

count=1
have_contig=0
# Loop through the contigs to run glimmer on each.
# "IFS= read -r" keeps every line untouched; the "|| [ -n ... ]" part keeps a
# last line that has no trailing newline.
while IFS= read -r line || [ -n "${line}" ]
do
    case "${line}" in
        '>'*)
            # A new header means the previous contig, if any, is complete
            if [ "${have_contig}" -eq 1 ]
            then
                process_contig
            fi
            printf '%s\n' "${line}" > "${temp}"
            have_contig=1
            ;;
        *)
            if [ "${have_contig}" -eq 1 ]
            then
                printf '%s\n' "${line}" >> "${temp}"
            elif [ -n "${line}" ]
            then
                echo "Sequence data found before the first FASTA header in ${reference_fasta}" >&2
                exit 1
            fi
            ;;
    esac
done < "${reference_fasta}"

# Still last contig to process (it is also the first one when the input
# holds a single contig, in which case its comments are kept)
if [ "${have_contig}" -eq 1 ]
then
    process_contig
fi
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.md Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,83 @@ +Galaxy wrapper for GlimmerHMM +===================================== + +This wrapper has been rewritten by Rémi Marenco in 2016 to fix issues, improve it and add the multi_fasta handling. It was originally written by Björn Gruening. + +This is a wrapper for the command line tool of GlimmerHMM. +https://ccb.jhu.edu/software/glimmerhmm/ + +GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM, +additionally it incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes +Interpolated Markov Models for the coding and noncoding models. +Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single). + +Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879. +Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48. +The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815. +Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5. +Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31. + + +Installation +============ + +To install GlimmerHMM, please download it from + +ftp://ccb.jhu.edu/pub/software/glimmerhmm + +and follow the installation instructions. +To extract the glimmerHMM predicted genes, the GFF Parser from Brad Chapman (http://github.com/chapmanb/bcbb/tree/master/gff) was used and is included. 
+ +To install the wrapper copy the glimmerHMM folder in the galaxy tools +folder and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy. +For example: + +```xml +<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" /> +<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" /> +``` + +You also need to use a trained organism by adding them as reference data in Galaxy: + +1. Add the *glimmer_hmm_trained_dir* data table to `tool_data_table_conf.xml` in `$GALAXY_ROOT/config/`: + + ```xml + <!-- glimmer_hmm trained_dir --> + <table name="glimmer_hmm_trained_dir" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/glimmer_hmm.loc" /> + </table> + ``` + +2. Add the `glimmer_hmm.loc` file referencing your trained organism, in `tool-data`. + You have a sample [`glimmer_hmm.loc.sample`] available in the repository to help you configuring it properly +3. Add your data in the chosen folder at step 2. You can get them from the GlimmerHMM tar, `$GLIMMERHMM/trained_dir` + +History +======= + +- v3.0 - Add the Multi Fasta support +- v2.0 - Update by Rémi Marenco to make it work without having to modify the wrapper + add ability to select the species +- v0.1 - Initial public release + + +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. 
+ +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,98 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/all_fasta.loc" /> + </table> + <!-- Locations of indexes in the BFAST mapper format --> + <table name="bfast_indexes" comment_char="#"> + <columns>value, dbkey, formats, name, path</columns> + <file path="tool-data/bfast_indexes.loc" /> + </table> + <!-- Locations of protein (mega)blast databases --> + <table name="blastdb_p" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/blastdb_p.loc" /> + </table> + <!-- Locations of indexes in the BWA mapper format --> + <table name="bwa_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bwa_index.loc" /> + </table> + <!-- Locations of indexes in the BWA color-space mapper format --> + <table name="bwa_indexes_color" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bwa_index_color.loc" /> + </table> + <!-- Locations of MAF files that have been indexed with bx-python --> + <table name="indexed_maf_files"> + <columns>name, value, dbkey, species</columns> + <file path="tool-data/maf_index.loc" /> + </table> + <!-- Locations of fasta files appropriate for NGS simulation --> + <table name="ngs_sim_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/ngs_sim_fasta.loc" /> + </table> + <!-- Locations of PerM base index files --> + <table name="perm_base_indexes" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/perm_base_index.loc" /> + </table> + <!-- Locations of PerM color-space index files 
--> + <table name="perm_color_indexes" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/perm_color_index.loc" /> + </table> + <!-- Location of Picard dict file and other files --> + <table name="picard_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/picard_index.loc" /> + </table> + <!-- Location of SRMA dict file and other files --> + <table name="srma_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/picard_index.loc" /> + </table> + <!-- Location of Mosaik files --> + <table name="mosaik_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/mosaik_index.loc" /> + </table> + <!-- Locations of indexes in the 2bit format --> + <table name="twobit" comment_char="#"> + <columns>value, path</columns> + <file path="tool-data/twobit.loc" /> + </table> + <!-- Available IGV builds, loaded from URL --> + <table name="igv_broad_genomes" comment_char="#"> + <columns>name, url, value</columns> + <file url="http://igv.broadinstitute.org/genomes/genomes.txt" /> + </table> + <!-- Available liftOver chain file --> + <table name="liftOver" comment_char="#"> + <columns>dbkey, name, value</columns> + <file path="tool-data/liftOver.loc" /> + </table> + <!-- iobio bam servers --> + <table name="bam_iobio" comment_char="#"> + <columns>value, name, url</columns> + <file path="tool-data/bam_iobio.loc" /> + </table> + <!-- iobio vcf servers --> + <table name="vcf_iobio" comment_char="#"> + <columns>value, name, url</columns> + <file path="tool-data/vcf_iobio.loc" /> + </table> + <!-- simple biom servers --> + <table name="biom_simple_display" comment_char="#"> + <columns>value, name, url</columns> + <file path="tool-data/biom_simple_display.loc" /> + </table> + <!-- glimmer_hmm trained_dir --> + <table name="glimmer_hmm_trained_dir" comment_char="#"> + <columns>value, name, path</columns> + <file 
path="tool-data/glimmer_hmm.loc" /> + </table> +</tables>