Mercurial > repos > matt-shirley > sra_tools
changeset 0:cdcc400dcafc draft
Migrated separate tools fastq_dump, sam_dump, and sra_fetch to this repository for further development.
author | matt-shirley <mdshw5@gmail.com> |
---|---|
date | Tue, 27 Nov 2012 13:31:09 -0500 |
parents | |
children | 75d914fa5711 |
files | datatypes_conf.xml fastq_dump.xml sam_dump.xml sra.py sra_fetch.py sra_fetch.xml |
diffstat | 6 files changed, 168 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>
<!--
    datatypes_conf.xml: new file in this changeset
    (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/datatypes_conf.xml,
    Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,12 @@; no newline at end of file).

    Declares the "sra" datatype for Galaxy. The implementing class, Sra, is
    shipped alongside this file in sra.py (listed under datatype_files).
-->
<datatypes>
    <!-- Python modules bundled with this repository that define datatypes. -->
    <datatype_files>
        <datatype_file name="sra.py"/>
    </datatype_files>
    <!-- Map the "sra" file extension to the Sra class and offer it in the upload form. -->
    <registration>
        <datatype extension="sra" type="galaxy.datatypes.binary:Sra" display_in_upload="true"/>
    </registration>
    <!-- Let Galaxy auto-detect SRA files through Sra.sniff(). -->
    <sniffers>
        <sniffer type="galaxy.datatypes.binary:Sra"/>
    </sniffers>
</datatypes>
<!--
    fastq_dump.xml: new file in this changeset
    (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/fastq_dump.xml,
    Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,31 @@).

    Galaxy tool wrapper that converts an SRA archive to fastqsanger with the
    NCBI fastq-dump program.
-->
<tool id="fastq_dump" name="Extract fastq" version="1.0.0">
    <description> format reads from NCBI SRA.</description>
    <!--
        The binary is looked up on PATH, matching version_string, the
        requirement entry and the exit code 127 message below. A path relative
        to the job working directory would never find it.
        $split and $aligned expand to an optional fastq-dump flag or to nothing.
    -->
    <command>fastq-dump --log-level fatal --report never --accession '${input.name}' --stdout $split $aligned '$input' > $output </command>
    <version_string>fastq-dump --version</version_string>
    <inputs>
        <param format="sra" name="input" type="data" label="sra archive"/>
        <!-- Optional flag: write each read of a spot as its own record. -->
        <param format="text" name="split" type="select" value="">
            <label>Split read pairs</label>
            <option value="">No</option>
            <option value="--split-spot">Yes</option>
        </param>
        <!-- Optional flag: restrict the dump by alignment status. -->
        <param format="text" name="aligned" type="select" value="">
            <label>Specify alignment</label>
            <option value="">All</option>
            <option value="--aligned">Only aligned</option>
            <option value="--unaligned">Only unaligned</option>
        </param>
    </inputs>
    <outputs>
        <data format="fastqsanger" name="output"/>
    </outputs>
    <stdio>
        <!-- 127 is the shell's "command not found" status. -->
        <exit_code range="127" level="fatal" description="Cannot find fastq-dump binary"/>
    </stdio>
    <requirements>
        <requirement type="binary">fastq-dump</requirement>
    </requirements>
    <help>
 This tool extracts fastqsanger reads from SRA archives using fastq-dump. The fastq-dump program is developed at NCBI, and is available at: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.
    </help>
</tool>
<!--
    sam_dump.xml: new file in this changeset
    (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/sam_dump.xml,
    Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,33 @@).

    Galaxy tool wrapper that converts an SRA archive to SAM with the NCBI
    sam-dump program.
-->
<tool id="sam_dump" name="Extract SAM" version="1.0.0">
    <description> format reads from NCBI SRA.</description>
    <!-- $header, $aligned and $primary each expand to a sam-dump flag or to nothing. -->
    <command>sam-dump $header $aligned $primary '$input' > $output</command>
    <version_string>sam-dump --version</version_string>
    <inputs>
        <param format="sra" name="input" type="data" label="sra archive"/>
        <!-- Whether the SAM header section is written. -->
        <param format="text" name="header" type="select" value="">
            <label>Output SAM header</label>
            <option value="--header">Yes</option>
            <option value="--no-header">No</option>
        </param>
        <!-- Optionally include reads that have no alignment. -->
        <param format="text" name="aligned" type="select" value="">
            <label>Output unaligned reads</label>
            <option value="">No</option>
            <option value="--unaligned">Yes</option>
        </param>
        <!-- Optionally drop secondary alignments. -->
        <param format="text" name="primary" type="select" value="">
            <label>Output only primary alignments</label>
            <option value="">No</option>
            <option value="--primary">Yes</option>
        </param>
    </inputs>
    <outputs>
        <data format="sam" name="output"/>
    </outputs>
    <requirements>
        <requirement type="binary">sam-dump</requirement>
    </requirements>
    <help>
 This tool extracts SAM format reads from SRA archives using sam-dump. The sam-dump program is developed at NCBI, and is available at: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.
Contact Matt Shirley at mdshw5@gmail.com for support and bug reports.
    </help>
</tool>
# sra.py -- new file in this changeset
# (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/sra.py,
#  Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,46 @@)
"""
Sra class

Galaxy datatype for NCBI Sequence Read Archive (SRA) files. The class is
registered as a sniffable binary format when this module is imported.
"""

import galaxy.datatypes.binary
from galaxy.datatypes.binary import Binary
import data, logging, binascii
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata
from galaxy.datatypes.sniff import *
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import os, subprocess, tempfile
import struct

# Magic bytes found at the very start of an NCBI-produced SRA file.
SRA_MAGIC = b'NCBI.sra'


class Sra( Binary ):
    """ Sequence Read Archive (SRA) """
    file_ext = "sra"

    def __init__( self, **kwd ):
        Binary.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Return True when `filename` starts with the NCBI SRA magic bytes.

        The first 8 bytes of any NCBI sra file are 'NCBI.sra', and the file is
        binary. EBI and DDBJ files may differ. For details about the format, see
        http://www.ncbi.nlm.nih.gov/books/n/helpsra/SRA_Overview_BK/#SRA_Overview_BK.4_SRA_Data_Structure

        An unreadable or missing file is reported as "not SRA" (False) rather
        than raising, so that sniffing can move on to the next datatype.
        """
        try:
            # Binary mode: the header must be compared byte-for-byte, and the
            # context manager guarantees the handle is closed.
            with open( filename, 'rb' ) as handle:
                header = handle.read( len( SRA_MAGIC ) )
        except ( IOError, OSError ):
            return False
        return header == SRA_MAGIC

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the dataset's peek and blurb: a fixed description plus the
        human-readable size, or placeholder text when the file was purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Binary sra file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek, falling back to a freshly built description
        if the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            return "Binary sra file (%s)" % ( data.nice_size( dataset.get_size() ) )


Binary.register_sniffable_binary_format("sra", "sra", Sra)
# sra_fetch.py -- new file in this changeset
# (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/sra_fetch.py,
#  Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,30 @@)
"""
Download one SRA run archive from the NCBI FTP site.

Usage: sra_fetch.py <accession> <outfile>

The archive for run <accession> is written to <outfile>. On failure a message
is written to stderr and the script exits with a non-zero status.
"""

import ftplib
import sys
import time
from ftplib import FTP

# NCBI SRA FTP site
SRA_FTP_HOST = 'ftp-trace.ncbi.nih.gov'
# Root of the per-run directory tree on that server.
SRA_RUN_ROOT = '/sra/sra-instant/reads/ByRun/sra/'
# Bounded retry policy for transient failures while changing directory.
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 2


def sra_run_directory(accession):
    """
    Return the FTP directory that holds the archive for `accession`.

    The layout is <root>/<first 3 chars>/<first 6 chars>/<full accession>/.
    The full accession is used for the last component so that run IDs longer
    than nine characters resolve to their own directory instead of being
    truncated.
    """
    prefix = accession[0:3]
    middle = accession[3:6]
    return SRA_RUN_ROOT + prefix + '/' + prefix + middle + '/' + accession + '/'


def fetch_sra(accession, outfile, max_attempts=MAX_ATTEMPTS,
              retry_delay=RETRY_DELAY_SECONDS):
    """
    Retrieve `<accession>.sra` over anonymous FTP and write it to `outfile`.

    Changing into the run directory is retried up to `max_attempts` times for
    transient errors. A permanent FTP error (for example, the directory does
    not exist) is raised immediately because retrying cannot succeed.

    Raises one of `ftplib.all_errors` when the transfer cannot be completed.
    """
    run_dir = sra_run_directory(accession)
    ftp = FTP(SRA_FTP_HOST)
    try:
        ftp.login('ftp')
        attempt = 0
        while True:
            attempt += 1
            try:
                ftp.cwd(run_dir)
                break
            except ftplib.error_perm:
                # Permanent server reply: do not loop on it.
                raise
            except ftplib.all_errors:
                if attempt >= max_attempts:
                    raise
                time.sleep(retry_delay)
        # Open the output only once the run directory is known to exist, and
        # make sure it is closed whatever happens during the transfer.
        sra = open(outfile, 'wb')
        try:
            ftp.retrbinary('RETR ' + accession + '.sra', sra.write)
        finally:
            sra.close()
        ftp.quit()
    finally:
        # Releases the socket on error paths.
        ftp.close()


def main(argv=None):
    """
    Command-line entry point; returns the process exit status.

    0 on success, 1 when the download fails, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3 or not argv[1].strip():
        sys.stderr.write('Usage: sra_fetch.py <accession> <outfile>\n')
        return 2
    # Get accession number from argument
    accession = argv[1].strip()
    outfile = argv[2]
    try:
        fetch_sra(accession, outfile)
    except ftplib.all_errors as err:
        sys.stderr.write('Failed to fetch %s from %s: %s\n'
                         % (accession, SRA_FTP_HOST, err))
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())
<!--
    sra_fetch.xml: new file in this changeset
    (diff of /dev/null, Thu Jan 01 00:00:00 1970 +0000, against b/sra_fetch.xml,
    Tue Nov 27 13:31:09 2012 -0500; hunk @@ -0,0 +1,16 @@).

    Galaxy tool wrapper around sra_fetch.py, which downloads an SRA run
    archive from NCBI by accession.
-->
<tool id="sra_fetch" name="Fetch SRA" version="1.0.0">
    <description> by accession from NCBI SRA.</description>
    <!-- Arguments: the run accession, then the path of the output dataset. -->
    <command interpreter="python">sra_fetch.py '$accession' '$output'</command>
    <inputs>
        <param name="accession" size="13" type="text" value="SRR000001" label="SRA run accession"/>
    </inputs>
    <outputs>
        <!-- The history item is labelled with the accession that was requested. -->
        <data format="sra" name="output" label="Fetch ${accession.value}"/>
    </outputs>
    <requirements>
        <requirement type="python">sra_fetch.py</requirement>
    </requirements>
    <help>
 This tool fetches SRA archives from NCBI over FTP using the python ftplib.
    </help>
</tool>