# HG changeset patch # User lparsons # Date 1353969446 18000 # Node ID 2d6671b10919c8b35f0897b55bb5ba8e5b91b641 # Parent 1dda185ea2d072baa0366ae3a923fed68d63f1ba Updated to support cutadapt version 1.1 (also include automatic dependency installation) diff -r 1dda185ea2d0 -r 2d6671b10919 .hgtags --- a/.hgtags Thu Dec 22 11:46:33 2011 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -8b064ea1672296c6c224cb4af5e36cb685d88993 0.1 -1dada50cca8ae5ee163254f3981b0c1ec4becb64 v0.9.5.a diff -r 1dda185ea2d0 -r 2d6671b10919 README --- a/README Thu Dec 22 11:46:33 2011 -0500 +++ b/README Mon Nov 26 17:37:26 2012 -0500 @@ -1,7 +1,18 @@ Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) -Installation ------------- + +Installation - Tool Shed +--------------------------- + +The recommended way to install cutadapt as a tool in Galaxy is to use the +Galaxy Tool Shed (http://wiki.galaxyproject.org/Tool%20Shed). + +This will allow cutadapt to be installed automatically and keep track of older +versions of cutadapt and the tool wrapper. + + +Installation - Manual +--------------------- 1 - Install the cutadapt package and make sure it is in path for Galaxy 2 - Copy cutadapt.xml to $GALAXY_HOME/tools/cutadapt @@ -16,9 +27,16 @@ See the Galaxy Wiki for more information: http://wiki.g2.bx.psu.edu/ +Configuration of Adapters +------------------------- + +A list of predefined adapters may be specified in the fastx_clipper_sequences.txt +file which resides in the tool-data directory underneath the Galaxy root. A sample +file is provided. 
+ + Limitations ----------- Colorspace data support is not implemented -Prefix and Suffix to read names not implemented -Length-tag addition to read name not implemented +Name adapters support is not implemented diff -r 1dda185ea2d0 -r 2d6671b10919 cutadapt.xml --- a/cutadapt.xml Thu Dec 22 11:46:33 2011 -0500 +++ b/cutadapt.xml Mon Nov 26 17:37:26 2012 -0500 @@ -1,12 +1,19 @@ - + Remove adapter sequences from Fastq/Fasta - cutadapt + cutadapt + cutadapt --version cutadapt #if $input.extension.startswith( "fastq"): - --format=fastq + --format=fastq + #if $input.extension == "fastqillumina": + --quality-base=64 + #end if + #if $input.extension == "fastqsolexa": + --quality-base=64 + #end if #else --format=$input.extension #end if @@ -16,38 +23,64 @@ #for $aa in $anywhere_adapters --anywhere='${aa.anywhere_adapter_source.anywhere_adapter}' #end for + #for $fa in $front_adapters + --front='${fa.front_adapter_source.front_adapter}' + #end for --error-rate=$error_rate --times=$count --overlap=$overlap - #if str($min) != '0': - --minimum-length=$min - #end if - #if str($max) != '0': - --maximum-length=$max - #end if - #if str($quality_cutoff) != '0': - --quality-cutoff=$quality_cutoff - #end if - $discard + $match_read_wildcards + $no_match_adapters_wildcards + + #if str( $output_filtering_options.output_filtering) == "filter": + $output_filtering_options.discard + #if str($output_filtering_options.min) != '0': + --minimum-length=$output_filtering_options.min + #end if + #if str($output_filtering_options.max) != '0': + --maximum-length=$output_filtering_options.max + #end if + #end if + --output='$output' #if str( $output_params.output_type ) == "additional": #if $output_params.rest_file: - --rest-file=$rest_output + --rest-file=$rest_output + #end if + #if $output_params.wildcard_file: + --wildcard-file=$wild_output #end if #if $output_params.too_short_file: - --too-short-output=$too_short_output + --too-short-output=$too_short_output #end if #if 
$output_params.untrimmed_file: - --untrimmed-output=$untrimmed_output + --untrimmed-output=$untrimmed_output #end if #end if + + #if str( $read_modification_params.read_modification) == "modify": + #if str($read_modification_params.quality_cutoff) != '0': + --quality-cutoff=$read_modification_params.quality_cutoff + #end if + #if $read_modification_params.prefix != '': + --prefix="$read_modification_params.prefix" + #end if + #if $read_modification_params.suffix != '': + --suffix="$read_modification_params.suffix" + #end if + #if $read_modification_params.length_tag != '': + --length-tag="$read_modification_params.length_tag" + #end if + $read_modification_params.zero_cap + #end if + '$input' > $report - + - + @@ -55,11 +88,11 @@ - + - + @@ -77,10 +110,31 @@ - + - + + + + + + + + + + + + + + + + + + + + + + @@ -92,12 +146,25 @@ - - - - - - + + + + + + + + + + + + + + + + + + + @@ -105,11 +172,28 @@ + + + + + + + + + + + + + + + + + @@ -117,6 +201,10 @@ (output_params['output_type'] == "additional") (output_params['rest_file'] is True) + + (output_params['output_type'] == "additional") + (output_params['wild_file'] is True) + (output_params['output_type'] == "additional") (output_params['too_short_file'] is True) @@ -134,16 +222,25 @@ + + + + + +Summary +------- This tool removes adapter sequences from DNA high-throughput sequencing data. This is usually necessary when the read length of the machine is longer than the molecule that is sequenced, such as in @@ -170,60 +274,90 @@ ----- -**Algorithm** +Algorithm +--------- cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. -For speed, the algorithm is implemented as a Python extension module in calignmodule.c. +For speed, the algorithm is implemented as a Python extension module in ``calignmodule.c``. -**Partial adapter matches** +Partial adapter matches +----------------------- Cutadapt correctly deals with partial adapter matches. 
As an example, suppose -your adapter sequence is "ADAPTER" (specified via 3' Adapters parameter). -If you have these input sequences: - -:: +your adapter sequence is ``ADAPTER`` (specified via 3' Adapters parameter). +If you have these input sequences:: MYSEQUENCEADAPTER MYSEQUENCEADAP MYSEQUENCEADAPTERSOMETHINGELSE -All of them will be trimmed to "MYSEQUENCE". If the sequence starts with an -adapter, like this: - -:: +All of them will be trimmed to ``MYSEQUENCE``. If the sequence starts with an +adapter, like this:: ADAPTERSOMETHING It will be empty after trimming. When the allowed error rate is sufficiently high, errors in -the adapter sequence are allowed. For example, ADABTER (1 mismatch), ADAPTR (1 deletion), -and ADAPPTER (1 insertion) will all be recognized if the error rate is set to 0.15. +the adapter sequence are allowed. For example, ``ADABTER`` (1 mismatch), ``ADAPTR`` (1 deletion), +and ``ADAPPTER`` (1 insertion) will all be recognized if the error rate is set to 0.15. -**Allowing adapters anywhere** +Anchoring 5' adapters +--------------------- + +If you specify a 5' (Front) adapter, the adapter may overlap the beginning of the read or +occur anywhere within it. If it appears within the read, the sequence that precedes it +will also be trimmed in addition to the adapter. For example when the adapter sequence is +``ADAPTER``:: + + HELLOADAPTERTHERE + APTERTHERE -Cutadapt assumes that any adapter specified via the *3` Adapters* parameter -was ligated to the 3' end of the sequence. This is the correct assumption for +will both be trimmed to ``THERE``. To avoid this, you can prefix the adapter with the character +``^``. This will restrict the search, forcing the adapter to be a prefix of the read. 
With +the adapter sequence set to ``^ADAPTER``, only reads like this will be trimmed:: + + ADAPTERHELLO + + +Allowing adapters anywhere +-------------------------- + +Cutadapt assumes that any adapter specified via the 3' Adapter parameter +was ligated to the 3\' end of the sequence. This is the correct assumption for at least the SOLiD and Illumina small RNA protocols and probably others. +The assumption is enforced by the alignment algorithm, which only finds the adapter +when its starting position is within the read. In other words, the 5' base of +the adapter must appear within the read. The adapter and all bases following +it are removed. If, on the other hand, your adapter can also be ligated to the 5' end (on -purpose or by accident), you should tell cutadapt so by using the *5' or 3' (Anywhere) -Adapters* parameter. It will then use a different alignment algorithm and -correctly trim adapters that appear in the beginning of a read. An adapter -specified this way will also be found if it appears only partially in the -beginning of a read. For example, these sequences +purpose or by accident), you should tell cutadapt so by using the Anywhere Adapter +parameter. It will then use a slightly different alignment algorithm +(so-called semiglobal alignment), which allows any type of overlap between the +adapter and the sequence. In particular, the adapter may appear only partially +in the beginning of the read, like this:: -:: + PTERMYSEQUENCE - ADAPTERMYSEQUENCE - PTERMYSEQUENCE +The decision which part of the read to remove is made as follows: If there is at +least one base before the found adapter, then the adapter is considered to be +a 3' adapter and the adapter itself and everything following it is removed. +Otherwise, the adapter is considered to be a 5' adapter and it is removed from +the read. -will be trimmed to "MYSEQUENCE". Note that the regular algorithm would trim -the first read to an empty sequence. 
+Here are some examples, which may make this clearer (left: read, right: trimmed +read):: -This parameter currently does not work with color space data. + MYSEQUENCEADAPTER -> MYSEQUENCE (3' adapter) + MADAPTER -> M (3' adapter) + ADAPTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + PTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + +The regular algorithm (3' Adapter) would trim the first two examples in the same way, +but trim the third to an empty sequence and trim the fourth not at all. .. _cutadapt: http://code.google.com/p/cutadapt/ diff -r 1dda185ea2d0 -r 2d6671b10919 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Nov 26 17:37:26 2012 -0500 @@ -0,0 +1,17 @@ + + + + + + http://pypi.python.org/packages/source/c/cutadapt/cutadapt-1.1.tar.gz + python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin + + $INSTALL_DIR/lib/python + $INSTALL_DIR/bin + + + + + + +