Mercurial > repos > iuc > data_manager_hisat2_index_builder

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/hisat2_index_builder.py	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# Based heavily on the Bowtie 2 data manager wrapper script by Dan Blankenberg
+
+import shlex
+import sys
+import os
+import argparse
+import subprocess
+
+from json import loads, dumps
+
+
+DEFAULT_DATA_TABLE_NAME = "hisat2_indexes"
+
+
+def get_id_name( params, dbkey, fasta_description=None):
+    """Return the (sequence_id, sequence_name) pair for the new index.
+
+    Both values are read from params['param_dict']. A blank sequence_id
+    falls back to dbkey; a blank sequence_name falls back to
+    fasta_description and, if that is blank too, to dbkey.
+    """
+    # TODO: ensure sequence_id is unique and does not already appear in location file
+    submitted = params['param_dict']
+    sequence_id = submitted['sequence_id'] or dbkey
+    sequence_name = submitted['sequence_name'] or fasta_description or dbkey
+    return sequence_id, sequence_name
+
+
+def build_hisat_index( data_manager_dict, options, params, sequence_id, sequence_name ):
+    """Run hisat2-build inside the output's extra_files_path and add the new index
+    to data_manager_dict; exits with hisat2-build's return code if it fails."""
+    data_table_name = options.data_table_name or DEFAULT_DATA_TABLE_NAME
+    target_directory = params[ 'output_data' ][0]['extra_files_path']
+    if not os.path.exists( target_directory ):
+        os.makedirs( target_directory )
+    # link the FASTA into the working directory under its own base name
+    sym_linked_fasta_filename = os.path.join( target_directory, os.path.basename( options.fasta_filename ) )
+    os.symlink( options.fasta_filename, sym_linked_fasta_filename )
+    # never hand None to shlex.split(): it reads stdin (or raises on newer Pythons)
+    args = [ 'hisat2-build' ] + shlex.split( options.indexer_options or '' )
+    args.extend( [ sym_linked_fasta_filename, sequence_id ] )
+    return_code = subprocess.call( args, shell=False, cwd=target_directory )
+    if return_code:
+        sys.stderr.write( "Error building index.\n" )
+        sys.exit( return_code )
+    _add_data_table_entry( data_manager_dict, data_table_name, dict( value=sequence_id, dbkey=options.fasta_dbkey, name=sequence_name, path=sequence_id ) )
+
+
+def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
+    """Append data_table_entry to data_manager_dict['data_tables'][data_table_name], creating missing levels."""
+    table_rows = data_manager_dict.setdefault( 'data_tables', {} ).setdefault( data_table_name, [] )
+    table_rows.append( data_table_entry )
+    return data_manager_dict
+
+
+def main():
+    """Parse the command line, build the HISAT2 index, and write the data manager JSON to --output."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument( '--output', dest='output', action='store', type=str, default=None )
+    parser.add_argument( '--fasta_filename', dest='fasta_filename', action='store', type=str, default=None )
+    parser.add_argument( '--fasta_dbkey', dest='fasta_dbkey', action='store', type=str, default=None )
+    parser.add_argument( '--fasta_description', dest='fasta_description', action='store', type=str, default=None )
+    parser.add_argument( '--data_table_name', dest='data_table_name', action='store', type=str, default='hisat2_indexes' )
+    # optional: the tool wrapper only passes --indexer_options when advanced parameters are selected
+    parser.add_argument( '--indexer_options', dest='indexer_options', action='store', type=str, default='' )
+    options = parser.parse_args()
+
+    filename = options.output
+    with open( filename ) as handle:
+        params = loads( handle.read() )
+    data_manager_dict = {}
+
+    if options.fasta_dbkey in [ None, '', '?' ]:
+        raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( options.fasta_dbkey ) )
+
+    sequence_id, sequence_name = get_id_name( params, dbkey=options.fasta_dbkey, fasta_description=options.fasta_description )
+    build_hisat_index( data_manager_dict, options, params, sequence_id, sequence_name )
+
+    # save info to json file; text mode because dumps() returns str
+    with open( filename, 'w' ) as handle:
+        handle.write( dumps( data_manager_dict ) )
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/hisat2_index_builder.xml	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,105 @@
+<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="1.0.0">
+    <description>builder</description>
+    <requirements>
+        <requirement type="package" version="2.0">hisat</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range=":-1" />
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+        ## snps and gtf_input exist only in the "yes" branch of the advanced conditional, so test the selector first
+        #if $advanced.adv_param_select == 'yes' and $advanced.gtf_input:
+            ln -s "${advanced.gtf_input}" gtf_file.gtf &&
+            python \$HISAT2_ROOT_DIR/bin/extract_splice_sites.py gtf_file.gtf > splice_sites.txt &&
+            python \$HISAT2_ROOT_DIR/bin/extract_exons.py gtf_file.gtf > exon.txt &&
+            ls -lh &&
+        #end if
+        #if $advanced.adv_param_select == 'yes' and $advanced.snps:
+            ln -s "${all_fasta_source.fields.path}" genome.fa &&
+            ln -s "${advanced.snps}" snps.tabular &&
+            python \$HISAT2_ROOT_DIR/bin/extract_snps.py --genome_file genome.fa --snp_file snps.tabular > snps.txt &&
+        #end if
+        python $__tool_directory__/hisat2_index_builder.py --output "${out_file}"
+            --fasta_filename "${all_fasta_source.fields.path}"
+            --fasta_dbkey "${all_fasta_source.fields.dbkey}"
+            --fasta_description "${all_fasta_source.fields.name}"
+            --data_table_name "hisat2_indexes"
+            #if $advanced.adv_param_select == 'yes':
+                --indexer_options "
+                --noauto
+                -p \${GALAXY_SLOTS:-1}
+                #if $advanced.snps:
+                    --snps `pwd`/snps.txt
+                #end if
+                #if $advanced.gtf_input:
+                    --ss `pwd`/splice_sites.txt
+                    --exon `pwd`/exon.txt
+                #end if
+                --bmax $advanced.bmax
+                --bmaxdivn $advanced.bmaxdivn
+                --dcv $advanced.dcv
+                --offrate $advanced.offrate
+                "
+            #end if
+    ]]></command>
+    <inputs>
+        <param label="Source FASTA Sequence" name="all_fasta_source" type="select">
+            <options from_data_table="all_fasta" />
+        </param>
+        <conditional name="advanced" label="Advanced parameters">
+            <param name="adv_param_select" type="select" label="Advanced parameters">
+                <option value="no">Use defaults</option>
+                <option value="yes">Fine-tune indexing parameters</option>
+            </param>
+            <when value="yes">
+                <param type="integer" name="bmax" label="Maximum number of suffixes allowed in a block." help="--bmax" value="4" />
+                <param type="integer" name="bmaxdivn" label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference." help="--bmaxdivn" value="4" />
+                <param type="integer" name="dcv" label="Period for the difference-cover sample." help="--dcv: A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. " value="1024" min="2" max="4096" />
+                <param type="integer" name="offrate" label="Mark rows in the Burrows-Wheeler transform" help="--offrate: To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)." value="4" />
+                <param type="data" format="tabular" name="snps" label="Provide a list of SNPs in the UCSC dbSNP format" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
+                <param type="data" format="gtf" name="gtf_input" label="Provide a GTF file for HISAT2 to extract splice sites from" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
+            </when>
+            <when value="no" />
+        </conditional>
+        <param label="Name of sequence" name="sequence_name" type="text" value="" />
+        <param label="ID for sequence" name="sequence_id" type="text" value="" />
+    </inputs>
+    <outputs>
+        <data format="data_manager_json" name="out_file" />
+    </outputs>
+    <help>
+<![CDATA[
+.. class:: infomark
+
+**Notice:** If you leave the name or ID blank, it will be generated automatically.
+
+What is HISAT2?
+---------------
+
+`HISAT <http://ccb.jhu.edu/software/hisat>`__ is a fast and sensitive alignment
+program for mapping next-generation sequencing reads (both DNA and RNA) against
+the general human population (as well as against a single reference genome).
+Based on an extension of BWT for graphs (`BWT <http://dl.acm.org/citation.cfm?id=2674828>`__)
+we designed and implemented a graph FM index (GFM), an original approach and
+its first implementation to the best of our knowledge. In addition to using one
+global GFM index that represents the general population, HISAT2 uses a large set
+of small GFM indexes that collectively cover the whole genome (each index
+representing a genomic region of 56 Kbp, with 55,000 indexes needed to cover
+the human population). These small indexes (called local indexes), combined
+with several alignment strategies, enable rapid and accurate alignment of
+sequencing reads. This new indexing scheme is called a Hierarchical Graph
+FM index (HGFM).  In addition to spliced alignment, HISAT handles reads
+involving indels and supports a paired-end alignment mode. Multiple processors
+can be used simultaneously to achieve greater alignment speed. HISAT outputs
+alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, enabling
+interoperation with a large number of other tools (e.g. `SAMtools <http://samtools.sourceforge.net>`__,
+`GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__)
+that use SAM. HISAT is distributed under the `GPLv3 license <http://www.gnu.org/licenses/gpl-3.0.html>`__,
+and it runs on the command line under Linux, Mac OS X and Windows.
+]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1038/nmeth.3317</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/hisat2_index_builder.xml" id="hisat2_index_builder" version="0.0.1">
+        <data_table name="hisat2_indexes">
+            <output>
+                <column name="value" />
+                <column name="dbkey" />
+                <column name="name" />
+                <column name="path" output_ref="out_file" >
+                    <move type="directory" relativize_symlinks="True">
+                        <!-- <source>${path}</source>--> <!-- out_file.extra_files_path is used as base by default --> <!-- if no source, eg for type=directory, then refers to base -->
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/hisat2_index/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/hisat2_index/${value}/${path}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3		/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19		Human (Homo sapiens): hg19 Canonical		/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19		Human (Homo sapiens): hg19 Full			/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/hisat2_indexes.loc.sample	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,39 @@
+# hisat2_indexes.loc.sample
+# This is a *.loc.sample file distributed with Galaxy that enables tools
+# to use a directory of indexed data files. This one is for HISAT2.
+# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup
+# First create these data files and save them in your own data directory structure.
+# Then, create a hisat2_indexes.loc file to use those indexes with tools.
+# Copy this file, save it with the same name (minus the .sample),
+# follow the format examples, and store the result in this directory.
+# The file should include a one-line entry for each index set.
+# The path points to the "basename" for the set, not a specific file.
+# It has four text columns separated by TABS.
+#
+# <unique_build_id>	<dbkey>	<display_name>	<file_base_path>
+#
+# So, for example, if you had sacCer3 indexes stored in:
+#
+#    /depot/data2/galaxy/hisat2_indexes/
+#
+# containing sacCer3 genome and sacCer3.*.ht2 files, such as:
+#
+#   -rw-rw-r-- 1 dave dave  12M Sep 23 13:57 sacCer3.1.ht2
+#   -rw-rw-r-- 1 dave dave 2.9M Sep 23 13:57 sacCer3.2.ht2
+#   -rw-rw-r-- 1 dave dave  161 Sep 23 13:57 sacCer3.3.ht2
+#   -rw-rw-r-- 1 dave dave 2.9M Sep 23 13:57 sacCer3.4.ht2
+#   -rw-rw-r-- 1 dave dave 7.3M Sep 23 13:57 sacCer3.5.ht2
+#   -rw-rw-r-- 1 dave dave 3.0M Sep 23 13:57 sacCer3.6.ht2
+#   -rw-rw-r-- 1 dave dave 128K Sep 23 13:57 sacCer3.7.ht2
+#   -rw-rw-r-- 1 dave dave  32K Sep 23 13:57 sacCer3.8.ht2
+#
+# then the hisat2_indexes.loc entry could look like this:
+#
+#sacCer3	sacCer3	S. cerevisiae Apr. 2011 (SacCer_Apr2011/sacCer3) (sacCer3)	/depot/data2/galaxy/hisat2_indexes/sacCer3
+#
+#More examples:
+#
+#mm10	mm10	Mouse (mm10)	/depot/data2/galaxy/hisat2_indexes/mm10
+#dm3	dm3	D. melanogaster (dm3)	/depot/data2/galaxy/hisat2_indexes/dm3
+#
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,13 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+    <!-- Locations of indexes in the hisat mapper format -->
+    <table name="hisat2_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/hisat2_indexes.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sat Oct 10 14:53:43 2015 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="hisat" version="2.0">
+        <repository changeset_revision="c65f00072e57" name="package_hisat_2_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>