Mercurial > repos > rnateam > segemehl
changeset 0:e97db054a88d draft
Uploaded
author | rnateam |
---|---|
date | Sat, 22 Feb 2014 06:01:16 -0500 |
parents | |
children | df7c7d732d31 |
files | segemehl.xml tool-data/segemehl_indices.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 4 files changed, 203 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/segemehl.xml Sat Feb 22 06:01:16 2014 -0500
@@ -0,0 +1,160 @@
+<tool id="segemehl" name="segemehl" version="0.1.6.0">
+    <!-- Galaxy wrapper around segemehl.x: selects or builds an index, then maps single- or paired-end reads and writes SAM. -->
+    <description>based short read aligner</description>
+    <requirements>
+        <requirement type="package" version="0.1.6">segemehl</requirement>
+    </requirements>
+    <command>
+        ## Reference from history: build a temporary index. Built-in reference: take db and index paths from the data table.
+        #if $refGenomeSource.genomeSource == "history":
+            #set $db_path = $refGenomeSource.own_reference_genome
+            temp_index=`mktemp` &amp;&amp;
+            segemehl.x -x "\$temp_index" -d "$db_path" &amp;&amp;
+        #else:
+            #set $db_path = $refGenomeSource.index.fields.db_path
+            temp_index="${refGenomeSource.index.fields.index_path}" &amp;&amp;
+        #end if
+
+        ## execute segemehl
+        segemehl.x
+
+        ## number of threads
+        -t "\${GALAXY_SLOTS:-12}"
+
+        ## reference (db) file and its index; temp_index is a shell variable, hence the escaped dollar sign
+        -d "$db_path"
+        -i "\$temp_index"
+
+        ## check for single/pair-end
+        #if str( $library.type ) == "single":
+            #set $query_list = list()
+            ## prepare inputs
+            #for $fastq in $library.reads:
+                #silent $query_list.append( str($fastq.input_query) )
+            #end for
+            -q "#echo ' '.join( $query_list )#"
+        #else:
+            ## prepare inputs: first and second mates are collected in matching order
+            #set $mate1 = list()
+            #set $mate2 = list()
+            #for $mate_pair in $library.mate_list:
+                #silent $mate1.append( str($mate_pair.first_strand_query) )
+                #silent $mate2.append( str($mate_pair.second_strand_query) )
+            #end for
+
+            -q "#echo ','.join($mate1)#"
+            -p "#echo ','.join($mate2)#"
+
+            -I $library.maxinsertsize
+        #end if
+        -m $minsize
+        -A $accuracy
+        -H $hitstrategy
+        #if str( $prime5 ).strip():
+            -P "$prime5"
+        #end if
+        #if str( $prime3 ).strip():
+            -Q "$prime3"
+        #end if
+        $polyA
+        $autoclip
+        $hardclip
+        $order
+        -s
+        -o "$segemehl_out"
+    </command>
+    <stdio>
+        <regex match="Exit forced"
+               source="both"
+               level="fatal"
+               description="Execution halted." />
+    </stdio>
+    <inputs>
+        <conditional name="refGenomeSource">
+            <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+                <option value="indexed">Use a built-in index</option>
+                <option value="history">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="segemehl_indexes">
+                        <column name="value" index="0"/>
+                        <column name="dbkey" index="1"/>
+                        <column name="name" index="2"/>
+                        <column name="db_path" index="3"/>
+                        <column name="index_path" index="4"/>
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+            </when> <!-- built-in -->
+            <when value="history">
+                <param name="own_reference_genome" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+            </when> <!-- history -->
+        </conditional> <!-- refGenomeSource -->
+
+        <conditional name="library">
+            <param name="type" type="select" label="Is this library paired-end?">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+            </param>
+            <when value="single">
+                <repeat name="reads" title="FASTQ/FASTA files" min="1">
+                    <param name="input_query" type="data" format="fastqsanger,fastqillumina,fastq,fasta" label="Reads fasta/fastq file" />
+                </repeat>
+            </when>
+            <when value="paired">
+                <repeat name="mate_list" title="Paired End Pairs" min="1">
+                    <param name="first_strand_query" type="data" format="fastqsanger,fastqillumina,fastq,fasta" label="Reads from first strand" />
+                    <param name="second_strand_query" type="data" format="fastqsanger,fastqillumina,fastq,fasta" label="Reads from second strand" />
+                </repeat>
+                <param name="maxinsertsize" type="integer" value="5000" label="Maximum size of the inserts (paired end)" help="default: 5000 (-I)" />
+            </when>
+        </conditional>
+
+        <param name="minsize" type="integer" value="12" size="5" label="Minimum size of queries" help="default: 12 (-m)">
+            <validator type="in_range" min="1"/>
+        </param>
+        <param name="accuracy" type="integer" value="85" size="5" label="Min percentage of matches per read in semi-global alignment" help="default: 85 (-A)" >
+            <validator type="in_range" min="1" max="100"/>
+        </param>
+        <param name="hitstrategy" type="select" label="Hits to report?" help="(-H)">
+            <option value="1">report only best scoring hits</option>
+            <option value="0">report all scoring hits</option>
+        </param>
+        <param name="prime5" type="text" size="80" label="add 5' adapter" help="default: none (-P)" />
+        <param name="prime3" type="text" size="80" label="add 3' adapter" help="default: none (-Q)"/>
+        <param name="polyA" type="boolean" truevalue="--polyA" falsevalue="" checked="false" label="Clip polyA tail" help="(-T)"/>
+        <param name="autoclip" type="boolean" truevalue="--autoclip" falsevalue="" checked="false" label="Autoclip unknown 3prime adapter" help="(-Y)"/>
+        <param name="hardclip" type="boolean" truevalue="--hardclip" falsevalue="" checked="false" label="Enable hard clipping" help="-C"/>
+        <param name="order" type="boolean" truevalue="--order" falsevalue="" checked="false" label="Sorts the output by chromosome and position" help="(-O)"/>
+    </inputs>
+
+    <outputs>
+        <data format="sam" name="segemehl_out" label="Read alignments on ${on_string}"/>
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What it does**
+
+Segemehl_ is a short read mapper with gaps.
+
+Segemehl_ is a software to map short sequencer reads to reference genomes.
+Unlike other methods, segemehl is able to detect not only mismatches but also insertions and deletions.
+Furthermore, segemehl is not limited to a specific read length and is able to map primer- or polyadenylation contaminated reads correctly.
+segemehl implements a matching strategy based on enhanced suffix arrays (ESA). Segemehl_ allows bisulfite sequencing mapping and split read mapping.
+
+.. _Segemehl: http://www.bioinf.uni-leipzig.de/Software/segemehl/
+
+**References**
+
+Hoffmann S, Otto C, Kurtz S, Sharma CM, Khaitovich P, Vogel J, Stadler PF, Hackermueller J: "Fast mapping of short sequences with mismatches, insertions and deletions using index structures", PLoS Comput Biol (2009) vol. 5 (9) pp. e1000502
+
+**Notes on version 0.1.6**
+
+New: faster multiple split read mapping. Bug fixes: increased sensitivity for strand switches. Changes: the default accuracy of segemehl is now 90%; older segemehl indices are still usable. Known issues: untraceable errors with the gcc-4.5 compiler; zlib linker problems with some Ubuntu versions. The download, the manual and the contact address are available on the Segemehl_ homepage.
+
+    </help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/segemehl_indices.loc.sample Sat Feb 22 06:01:16 2014 -0500
@@ -0,0 +1,29 @@
+#This is a sample file that enables the segemehl mapper to find genome indices
+#and database files. You will need to create these data files and then create
+#a segemehl_indices.loc file similar to this one (store it in this directory)
+#that points to the reference (database) file and to the index file.
+#The segemehl_indices.loc file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<db_path>	<index_path>
+#
+#So, for example, if you had hg18 indexed and stored in
+#/data/0/galaxy/segemehl/hg18/,
+#then the segemehl_indices.loc entry would look like this:
+#
+#hg18	hg18	hg18	/data/0/galaxy/segemehl/hg18/chromosomes.fa	/data/0/galaxy/segemehl/hg18/chromosomes.idx
+#
+#and your /data/0/galaxy/segemehl/hg18/ directory
+#would contain the reference FASTA file and its segemehl index:
+#
+#-rw-r--r-- 1 bag bag 539833 2013-10-13 10:12 chromosomes.idx
+#-rw-r--r-- 1 bag bag 342562 2013-10-13 10:12 chromosomes.fa
+#...etc...
+#
+#Your segemehl_indices.loc file should include an entry per line for each
+#index set you have stored. Both paths are full file names, not prefixes:
+#the FASTA file first, then the index built from it. For example:
+#
+#hg18canon	hg18	hg18 Canonical	/data/0/galaxy/segemehl/hg18/chromosomes_can.fa	/data/0/galaxy/segemehl/hg18/chromosomes_can.idx
+#hg18full	hg18	hg18 Full	/data/0/galaxy/segemehl/hg18/chromosomes_full.fa	/data/0/galaxy/segemehl/hg18/chromosomes_full.idx
+#...etc...
+#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sat Feb 22 06:01:16 2014 -0500 @@ -0,0 +1,8 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of the segemehl reference files (db_path) and their indexes (index_path) --> + <table name="segemehl_indexes" comment_char="#"> + <columns>value, dbkey, name, db_path, index_path</columns> + <file path="tool-data/segemehl_indices.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Feb 22 06:01:16 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency> <!-- declares the segemehl 0.1.6 package and the repository revision that provides it -->
+    <package version="0.1.6" name="segemehl">
+        <repository toolshed="http://toolshed.g2.bx.psu.edu" owner="rnateam" name="package_segemehl_0_1_6" changeset_revision="7078b80ffc12" />
+    </package>
+</tool_dependency>