Mercurial > repos > simonl > agile_wrapper

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Users/oconnorlab/Desktop/agile/agile_wrapper.py	Tue Jun 07 16:22:51 2011 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )  # require Python 2.4+; compare against a tuple, not the float 2.4
+
+def stop_err( msg ):
+    """Write msg (plus a newline) to stderr and terminate the script.
+
+    Exits with status 1 so that the failure is visible through the exit
+    code as well as through the stderr text.
+    """
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit( 1 )
+
+def check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    """Return the location listed for dbkey in alignseq.loc, or '' if absent.
+
+    The file is read from "<GALAXY_DATA_INDEX_DIR>/alignseq.loc".  Blank
+    lines, lines starting with '#', lines with fewer than three
+    tab-separated fields and lines whose first field is not 'seq' are
+    ignored.  For the remaining lines the second field is the build key and
+    the third field is the value returned; when a key is listed more than
+    once the last entry wins.
+    """
+    nib_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
+    nibs = {}
+    handle = open( nib_file )
+    try:
+        for line in handle:
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( "#" ):
+                continue
+            fields = line.split( '\t' )
+            if len( fields ) >= 3 and fields[0] == 'seq':
+                nibs[fields[1]] = fields[2]
+    finally:
+        # try/finally rather than "with" keeps the stated Python 2.4 floor.
+        handle.close()
+    return nibs.get( dbkey, '' )
+
+def check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    """Return the location listed for dbkey in twobit.loc, or '' if absent.
+
+    The file is read from "<GALAXY_DATA_INDEX_DIR>/twobit.loc".  Blank
+    lines, lines starting with '#' and lines with fewer than two
+    tab-separated fields are ignored.  The first field is the build key and
+    the second field is the value returned; when a key is listed more than
+    once the last entry wins.
+    """
+    twobit_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR
+    twobits = {}
+    handle = open( twobit_file )
+    try:
+        for line in handle:
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( "#" ):
+                continue
+            fields = line.split( '\t' )
+            if len( fields ) >= 2:
+                twobits[fields[0]] = fields[1]
+    finally:
+        # try/finally rather than "with" keeps the stated Python 2.4 floor.
+        handle.close()
+    return twobits.get( dbkey, '' )
+
+def __main__():
+    """Run agile for every target sequence file and append results to the output.
+
+    Positional command-line arguments (sys.argv[1..10]):
+      1  source_format  '0' = target is a genome build key, anything else =
+                        target is a sequence file path
+      2  target_file    build key or path, depending on source_format
+      3  query_file     query sequences
+      4  output_file    file that receives the concatenated agile output
+      5  max_sims       value for -maxSIMs
+      6  tile_size      value for -tileSize
+      7  max_freq       value for -maxFreq
+      8  out_type       value for -out
+      9  all_match      "true" adds the -all flag
+      10 GALAXY_DATA_INDEX_DIR  directory holding alignseq.loc / twobit.loc
+
+    When a build key is given, its nib directory (every entry in it) or its
+    twobit file is used as the target; stop_err() is called if neither is
+    available.
+    """
+    # Imported here so the block is self-contained; the module header only
+    # brings in os, sys and tempfile.
+    import shutil
+    import subprocess
+
+    # I/O
+    source_format = sys.argv[1]        # 0: dbkey; 1: upload file
+    target_file = sys.argv[2]
+    query_file = sys.argv[3]
+    output_file = sys.argv[4]
+    max_sims = sys.argv[5]
+    tile_size = sys.argv[6]
+    max_freq = sys.argv[7]
+    out_type = sys.argv[8]
+    all_match = sys.argv[9]
+
+    GALAXY_DATA_INDEX_DIR = sys.argv[10]
+
+    if source_format == '0':
+        # check target genome
+        dbkey = target_file
+        nib_path = check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        twobit_path = check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        if not os.path.exists( nib_path ) and not os.path.exists( twobit_path ):
+            stop_err("No sequences are available for %s, request them by reporting this error." % dbkey)
+
+        # prefer the nib directory; fall back to the single twobit file
+        if nib_path and os.path.isdir( nib_path ):
+            compress_files = os.listdir( nib_path )
+            target_path = nib_path
+        elif twobit_path:
+            compress_files = [twobit_path]
+            target_path = ""
+        else:
+            stop_err("Requested genome build has no available sequence.")
+
+        # os.path.join leaves the twobit path untouched when target_path is
+        # empty instead of prefixing it with a stray "/".
+        all_files = [ os.path.normpath( os.path.join( target_path, name ) ) for name in compress_files ]
+    else:
+        all_files = [target_file]
+
+    for detail_file_path in all_files:
+        # mkstemp reserves the file on disk, so the name cannot be taken by
+        # another process before agile writes to it.
+        # NOTE(review): assumes agile overwrites an existing (empty) output file - confirm.
+        fd, output_tempfile = tempfile.mkstemp()
+        os.close( fd )
+        try:
+            # Argument list instead of a shell string: paths containing
+            # spaces or shell metacharacters are passed through verbatim.
+            command = [ "agile", detail_file_path, query_file,
+                        "-maxSIMs=%s" % max_sims,
+                        "-tileSize=%s" % tile_size,
+                        "-maxFreq=%s" % max_freq,
+                        "-out=%s" % out_type ]
+            if all_match == "true":
+                command.append( "-all" )
+            command.append( output_tempfile )
+
+            try:
+                # stderr is folded into stdout, as the former "2>&1" did.
+                # The exit status is ignored, as it was with os.system.
+                subprocess.call( command, stderr=subprocess.STDOUT )
+            except OSError:
+                stop_err( "Unable to run agile: %s" % sys.exc_info()[1] )
+
+            # Append this run's result to the combined output file.
+            source = open( output_tempfile, 'rb' )
+            try:
+                destination = open( output_file, 'ab' )
+                try:
+                    shutil.copyfileobj( source, destination )
+                finally:
+                    destination.close()
+            finally:
+                source.close()
+        finally:
+            # Always clean up, even if agile or the copy failed.
+            os.remove( output_tempfile )
+
+# Run the wrapper only when executed as a script, not when imported.
+if __name__ == '__main__':
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Users/oconnorlab/Desktop/agile/agile_wrapper.xml	Tue Jun 07 16:22:51 2011 -0400
@@ -0,0 +1,78 @@
+<tool id="agile_wrapper" name="AGILE" version="1.0.0">
+  <description> Quickly match reads to a reference genome or sequence file</description>
+  <command interpreter="python">
+    #if $source.source_select=="database" #agile_wrapper.py 0 $source.dbkey $input_query $output1 $max_sims $tile_size $max_freq $out_type $all
+    #else                                 #agile_wrapper.py 1 $source.input_target $input_query $output1 $max_sims $tile_size $max_freq $out_type $all
+    #end if# ${GALAXY_DATA_INDEX_DIR}
+  </command>
+	<inputs>
+	<conditional name="source">
+		<param name="source_select" type="select" label="Target source">
+				<option value="database">Genome Build</option>
+				<option value="input_ref">Your Upload File</option>
+		</param>
+		<when value="database">
+			<param name="dbkey" type="genomebuild" label="Genome" />
+		</when>
+		<when value="input_ref">
+			<param name="input_target" type="data" label="Reference sequence" />
+ 		</when>
+	</conditional>
+		<param name="input_query" type="data" format="fasta" label="Query sequences file"/>
+		<param name="max_sims" type="integer" size="15" value="7" label="Maximum number of Single Imperfect Matches (SIMs) allowed as percentage of read length (-maxSIMs, default 7)" />
+		<param name="tile_size" type="integer" size="15" value="20" label="Sets the length of tuples for creating hash table (-tileSize, default 20)" help="Usually between 11 and 20."/>
+		<param name="max_freq" type="integer" size="15" value="8" label="Maximum number of occurrences of a pattern (k-tuple) allowed (-maxFreq, default 8)"/>
+		<param name="all" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Output all matches? (-all, default true for MHC)"/>
+		<param name="out_type" type="select" label="Output Format (-out, default pslx for MHC)">
+				<option value="pslx">pslx</option>
+				<option value="psl">psl</option>
+				<option value="axt">axt</option>
+				<option value="maf">maf</option>
+				<option value="sim4">sim4</option>
+				<option value="wublast">wublast</option>
+				<option value="blast">blast</option>
+				<option value="blast8">blast8</option>
+				<option value="blast9">blast9</option>
+		</param>
+	</inputs>
+	<outputs>
+		<data name="output1" format="tabular"/>
+	</outputs>
+	<requirements>
+	    <requirement type="binary">agile</requirement>
+	</requirements>
+	<help>
+
+.. class:: warningmark
+
+The default parameter values can be altered in the agile tool xml file
+
+-----
+
+**What it does**
+
+This tool uses the **AGILE** alignment program, a faster replacement for the **BLAT** algorithm. Your reads file is searched against a genome build or another uploaded file.
+
+-----
+
+**Parameters**
+
+- *Maximum Single Imperfect Matches* (**-maxSIMs**) : The number of allowable mismatches as a percentage of read length.
+
+- *Tuple Length* (**-tileSize**) : The length of tuples for creating a hash table.
+
+- *Maximum Frequency* (**-maxFreq**) : The maximum number of pattern occurrences allowed.
+
+- *All Matches* (**-all**) : Output all matches satisfying the match criteria (true/false).
+
+- *Output Format* (**-out**) : Define the output format for the match file.
+
+-----
+
+**Reference**
+
+ **AGILE**: Sanchit Misra, Ankit Agrawal, Wei-keng Liao, Alok Choudhary. Anatomy of a Hash-based Long Read Sequence Mapping Algorithm for Next Generation DNA Sequencing. Bioinformatics 2010; doi: 10.1093/bioinformatics/btq648.
+
+
+	</help>
+</tool>