changeset 0:cdcc400dcafc draft

Migrated separate tools fastq_dump, sam_dump, and sra_fetch to this repository for further development.
author matt-shirley <mdshw5@gmail.com>
date Tue, 27 Nov 2012 13:31:09 -0500
parents
children 75d914fa5711
files datatypes_conf.xml fastq_dump.xml sam_dump.xml sra.py sra_fetch.py sra_fetch.xml
diffstat 6 files changed, 168 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+    <datatype_file name="sra.py"/>
+  </datatype_files>
+  <registration>
+    <datatype extension="sra" type="galaxy.datatypes.binary:Sra" display_in_upload="true"/>
+  </registration>
+  <sniffers>
+    <sniffer type="galaxy.datatypes.binary:Sra"/>
+  </sniffers>
+</datatypes>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_dump.xml	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,31 @@
+<tool id="fastq_dump" name="Extract fastq" version="1.0.0">
+  <description> format reads from NCBI SRA.</description>
+  <command>fastq-dump --log-level fatal --report never --accession '${input.name}' --stdout $split $aligned '$input' > $output </command>
+  <version_string>fastq-dump --version</version_string>
+  <inputs>
+    <param format="sra" name="input" type="data" label="sra archive"/>
+    <param format="text" name="split" type="select" value="">
+      <label>Split read pairs</label>
+      <option value="">No</option>
+      <option value="--split-spot">Yes</option>
+    </param>
+    <param format="text" name="aligned" type="select" value="">
+      <label>Specify alignment</label>
+      <option value="">All</option>
+      <option value="--aligned">Only aligned</option>
+      <option value="--unaligned">Only unaligned</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fastqsanger" name="output"/>
+  </outputs>
+  <stdio>
+    <exit_code range="127" level="fatal" description="Cannot find fastq-dump binary"/>
+  </stdio>
+  <requirements>
+    <requirement type="binary">fastq-dump</requirement>
+  </requirements>
+  <help>
+    This tool extracts fastqsanger reads from SRA archives using fastq-dump. The fastq-dump program is developed at NCBI, and is available at: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sam_dump.xml	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,33 @@
+<tool id="sam_dump" name="Extract SAM" version="1.0.0">
+  <description> format reads from NCBI SRA.</description>
+  <command>sam-dump $header $aligned $primary '$input' > $output</command>
+  <version_string>sam-dump --version</version_string>
+  <inputs>
+    <param format="sra" name="input" type="data" label="sra archive"/>
+    <param format="text" name="header" type="select" value="">
+      <label>Output SAM header</label>
+      <option value="--header">Yes</option>
+      <option value="--no-header">No</option>
+    </param>
+    <param format="text" name="aligned" type="select" value="">
+      <label>Output unaligned reads</label>
+      <option value="">No</option>
+      <option value="--unaligned">Yes</option>
+    </param>
+    <param format="text" name="primary" type="select" value="">
+      <label>Output only primary alignments</label>
+      <option value="">No</option>
+      <option value="--primary">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="sam" name="output"/>
+  </outputs>
+  <requirements>
+    <requirement type="binary">sam-dump</requirement>
+  </requirements>
+  <help>
+    This tool extracts SAM format reads from SRA archives using sam-dump. The sam-dump program is developed at NCBI, and is available at: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.
+Contact Matt Shirley at mdshw5@gmail.com for support and bug reports.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sra.py	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,46 @@
+"""
+Sra class
+"""
+
+import galaxy.datatypes.binary
+from galaxy.datatypes.binary import Binary
+import data, logging, binascii
+from galaxy.datatypes.metadata import MetadataElement
+from galaxy.datatypes import metadata
+from galaxy.datatypes.sniff import *
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import os, subprocess, tempfile
+import struct
+
+class Sra( Binary ):
+    """ Sequence Read Archive (SRA) """
+    file_ext = "sra"
+
+    def __init__( self, **kwd ):
+        Binary.__init__( self, **kwd )
+    def sniff( self, filename ):
+        # The first 8 bytes of any NCBI sra file is 'NCIB.sra', and the file is binary. EBI and DDBJ files may differ. For details
+        # about the format, see http://www.ncbi.nlm.nih.gov/books/n/helpsra/SRA_Overview_BK/#SRA_Overview_BK.4_SRA_Data_Structure
+        try:
+            header = open( filename ).read(8)
+            if binascii.b2a_hex( header ) == binascii.hexlify( 'NCBI.sra' ):
+                return True
+            return False
+        except:
+            return False
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek  = "Binary sra file" 
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Binary sra file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+Binary.register_sniffable_binary_format("sra", "sra", Sra)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sra_fetch.py	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,30 @@
+from ftplib import FTP
+import sys
+
+# Get accession number from argument
+accession = sys.argv[1]
+outfile = sys.argv[2]
+prefix = accession[0:3]
+middle = accession[3:6]
+suffix = accession[6:9]
+
+# NCBI SRA FTP site
+ftp = FTP('ftp-trace.ncbi.nih.gov')
+
+# Open file and transfer requested SRA as a file
+# Try to change the working directory until it works
+sra = open(outfile, 'wb')
+ftp.login('ftp')
+connected = False
+while not connected:
+    try:
+        ftp.cwd('/sra/sra-instant/reads/ByRun/sra/' + 
+                prefix + '/' +
+                prefix + middle + '/' +
+                prefix + middle + suffix + '/')
+        connected = True
+    except:
+        pass
+        
+ftp.retrbinary('RETR ' + prefix + middle + suffix + '.sra', sra.write)
+ftp.quit()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sra_fetch.xml	Tue Nov 27 13:31:09 2012 -0500
@@ -0,0 +1,16 @@
+<tool id="sra_fetch" name="Fetch SRA" version="1.0.0">
+  <description> by accession from NCBI SRA.</description>
+  <command interpreter="python">sra_fetch.py '$accession' '$output'</command>
+  <inputs>
+    <param name="accession" size="13" type="text" value="SRR000001" label="SRA run accession"/>
+  </inputs>
+  <outputs>
+    <data format="sra" name="output" label="Fetch ${accession.value}"/>
+  </outputs>
+  <requirements>
+    <requirement type="python">sra_fetch.py</requirement>
+  </requirements>
+  <help>
+    This tool fetches SRA archives from NCBI over FTP using the python ftplib.
+  </help>
+</tool>