# HG changeset patch
# User edward-kirton
# Date 1307482916 14400
# Node ID f9e4e6fe0e73fc05b8653ca9469acc39e205175b
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository

diff -r 000000000000 -r f9e4e6fe0e73 phrap/jgi_assembly.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phrap/jgi_assembly.py	Tue Jun 07 17:41:56 2011 -0400
@@ -0,0 +1,126 @@
+"""
+Assembly classes
+"""
+
+import data
+import logging
+import re
+import string
+from cgi import escape
+from galaxy.datatypes.metadata import MetadataElement
+from galaxy.datatypes import metadata
+import galaxy.model
+from galaxy import util
+from sniff import *
+
+log = logging.getLogger(__name__)
+
+class Assembly( data.Text ):
+    """Class describing an assembly"""
+
+    # Add metadata elements
+    MetadataElement( name="contigs", default=0, desc="Number of contigs", readonly=True, visible=False, optional=True, no_value=0 )
+    MetadataElement( name="reads", default=0, desc="Number of reads", readonly=True, visible=False, optional=True, no_value=0 )
+
+
+class Ace(Assembly):
+    """Class describing an assembly Ace file"""
+
+    file_ext = "ace"
+
+#    def init_meta( self, dataset, copy_from=None ):
+#        Assembly.init_meta( self, dataset, copy_from=copy_from )
+
+    def set_meta( self, dataset, overwrite=True, **kwd ):
+        """
+        Set the number of assembled contigs and read sequences in dataset.
+
+        Contigs are counted from 'CO ' header lines and reads from 'RD '
+        header lines; lines starting with '#' are skipped.
+        """
+        contigs = 0
+        reads = 0
+        fh = open( dataset.file_name )
+        try:
+            for line in fh:
+                line = line.strip()
+                if line.startswith( '#' ):
+                    # Don't count comment lines
+                    continue
+                # Match the trailing space so tag text such as 'COMMENT{'
+                # is not mistaken for a contig header
+                if line.startswith( 'CO ' ):
+                    contigs += 1
+                elif line.startswith( 'RD ' ):
+                    reads += 1
+        finally:
+            # Always release the file handle, even if reading fails
+            fh.close()
+        dataset.metadata.contigs = contigs
+        dataset.metadata.reads = reads
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek text and a blurb giving the contig count (or file size)."""
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            if dataset.metadata.contigs:
+                dataset.blurb = "%s contigs" % util.commaify( str( dataset.metadata.contigs ) )
+            else:
+                dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff( self, filename ):
+        r"""
+        Determines whether the file is in ace format
+
+        An ace file contains these sections
+        AS \d+ \d+
+
+        CO \S+ \d+ \d+ \d+ \w
+        [atcgATCGN\*]+
+
+        BQ
+        [\d\s]+
+
+        AF \S+ [CU] \-?\d+
+
+        BS \d+ \d+ \S+
+
+        RD \S+ \d+ \d+ \d+
+        [ATCGN\*]+
+
+        QA \d+ \d+ \d+ \d+
+        DS .*
+
+        Currently we only check if file begins with AS
+
+        >>> fname = get_test_fname( 'genome.ace' )
+        >>> Ace().sniff( fname )
+        True
+        >>> fname = get_test_fname( 'genome.fasta' )
+        >>> Ace().sniff( fname )
+        False
+        """
+
+        try:
+            fh = open( filename )
+            try:
+                line = fh.readline().strip()
+            finally:
+                fh.close()
+        except ( IOError, OSError ):
+            # An unreadable file is simply not recognised as ace
+            return False
+        return line.startswith( 'AS ' )
+
+class Velveth(Assembly):
+    """Composite datatype registering 'Roadmap' and 'Sequences' composite files"""
+    composite_type='basic'
+    file_ext = "txt"
+
+    def __init__(self,**kwd):
+        Assembly.__init__(self,**kwd)
+        self.add_composite_file('Roadmap')
+        self.add_composite_file('Sequences')
diff -r 000000000000 -r f9e4e6fe0e73 phrap/phrap.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phrap/phrap.xml	Tue Jun 07 17:41:56 2011 -0400
@@ -0,0 +1,41 @@
+
+Assemble long reads/short contigs
+phrap_wrapper.sh $label $fasta_infile $qual_infile $fasta_outfile $qual_outfile $singlets_outfile $fasta_outfile.extra_files_path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+phrap ("phragment assembly program", or "phil's revised assembly
+program"; a homonym of "frappe" = French for "swat") -- a
+program for assembling shotgun DNA sequence data. Key features:
+allows use of entire read (not just trimmed high quality part); uses a
+combination of user-supplied and internally computed data quality
+information to improve accuracy of assembly in the presence of
+repeats; constructs contig sequence as a mosaic of the highest quality
+parts of reads (rather than a consensus); provides extensive information
+about assembly (including quality values for contig sequence) to
+assist trouble-shooting; able to handle very large datasets.
+
+**Notes**
+
+phrap is great for assembling Sanger shotgun reads but should not be used for next-generation data (e.g. Illumina, SOLiD, 454, etc.).
+However, phrap is useful for combining short-read assemblies from Velvet or ABySS (i.e. results from using multiple k-mer parameters).
+When used for this purpose, you should filter out short sequences first (e.g. less than 200-1000bp, depending on number of contigs).
+
+**Reference**
+
+http://www.phrap.org/phredphrap/phrap.html
+
+
diff -r 000000000000 -r f9e4e6fe0e73 phrap/phrap_wrapper.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phrap/phrap_wrapper.sh	Tue Jun 07 17:41:56 2011 -0400
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+if [ $# -ne 7 ]
+then
+    echo "ERROR: Expected exactly 7 arguments; got: $*" 1>&2
+    exit 1
+fi
+
+# GET ARGS
+LABEL=$1; shift
+FASTA_INFILE=$1; shift
+QUAL_INFILE=$1; shift
+FASTA_OUTFILE=$1; shift
+QUAL_OUTFILE=$1; shift
+SINGLETS_OUTFILE=$1; shift
+DIR=$1; shift
+
+if [ ! -e "$FASTA_INFILE" ]
+then
+    echo "Fasta infile not found: $FASTA_INFILE" 1>&2
+    exit 1
+fi
+
+# ALL FILES GO IN THIS extra_files_path
+# Abort if the working directory cannot be created or entered, so that
+# nothing is written into the wrong place
+mkdir -p "$DIR" || exit 1
+cd "$DIR" || exit 1
+ln -s "$FASTA_INFILE" "./$LABEL"
+if [ "$QUAL_INFILE" != 'None' ]
+then
+    ln -s "$QUAL_INFILE" "./$LABEL.qual"
+fi
+
+# RUN COMMAND
+phrap "$LABEL" 1> phrap.stdout 2> phrap.stderr
+# Save the status now: $? is overwritten by every later command
+STATUS=$?
+if [ $STATUS -ne 0 ]
+then
+    echo "COMMAND FAILURE; LOG:" 1>&2
+    cat phrap.stderr 1>&2
+    exit $STATUS
+fi
+
+# SYMLINK OUTFILES
+link () { # args: src, dest
+    # if dest defined and src exist, then create symlink
+    if [[ $2 != 'None' && -e "$1" ]]
+    then
+        if [ -e "$2" ]
+        then
+            rm "$2"
+        fi
+        ln -s "$1" "$2"
+    fi
+}
+link "$DIR/$LABEL.contigs" "$FASTA_OUTFILE"
+link "$DIR/$LABEL.contigs.qual" "$QUAL_OUTFILE"
+link "$DIR/$LABEL.singlets" "$SINGLETS_OUTFILE"