annotate tools/make_nr/make_nr.xml @ 0:c84f12187af9 draft

v0.0.1
author peterjc
date Fri, 09 Nov 2018 11:00:03 -0500
parents
children 84e483325b04
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
peterjc
parents:
diff changeset
1 <tool id="make_nr" name="Make FASTA non-redundant" version="0.0.1">
peterjc
parents:
diff changeset
2 <description>by combining duplicated sequences</description>
peterjc
parents:
diff changeset
3 <requirements>
peterjc
parents:
diff changeset
4 <requirement type="package" version="1.67">biopython</requirement>
peterjc
parents:
diff changeset
5 </requirements>
peterjc
parents:
diff changeset
6 <version_command>
peterjc
parents:
diff changeset
7 python $__tool_directory__/make_nr.py --version
peterjc
parents:
diff changeset
8 </version_command>
peterjc
parents:
diff changeset
9 <command detect_errors="aggressive">
peterjc
parents:
diff changeset
10 python $__tool_directory__/make_nr.py $alphasort -s '$separator' -o '$output'
peterjc
parents:
diff changeset
11 #for $f in $input
peterjc
parents:
diff changeset
12 '$f'
peterjc
parents:
diff changeset
13 #end for
peterjc
parents:
diff changeset
14 </command>
peterjc
parents:
diff changeset
15 <inputs>
peterjc
parents:
diff changeset
16 <param name="input" type="data" format="fasta,fasta.gz" multiple="True"
peterjc
parents:
diff changeset
17 label="Input FASTA sequence file(s)"/>
peterjc
parents:
diff changeset
18 <param argument="separator" type="text" size="10" area="False" value=";"
peterjc
parents:
diff changeset
19 label="Separator string to use when combining the identifiers of duplicate sequences"
peterjc
parents:
diff changeset
20 help="A single character is recommended, e.g. the semi-colon, or comma">
peterjc
parents:
diff changeset
21 <sanitizer>
peterjc
parents:
diff changeset
22 <valid initial="default">
peterjc
parents:
diff changeset
23 <add value=";"/>
peterjc
parents:
diff changeset
24 <add value="|"/>
peterjc
parents:
diff changeset
25 </valid>
peterjc
parents:
diff changeset
26 </sanitizer>
peterjc
parents:
diff changeset
27 </param>
peterjc
parents:
diff changeset
28 <param argument="alphasort" type="select" label="Treatment of identifiers when combining duplicates with the separator">
peterjc
parents:
diff changeset
29 <option value="">Use the order they appear in the input file(s)</option>
peterjc
parents:
diff changeset
30 <option value="-a">Sort alphabetically before combining them</option>
peterjc
parents:
diff changeset
31 </param>
peterjc
parents:
diff changeset
32 </inputs>
peterjc
parents:
diff changeset
33 <outputs>
peterjc
parents:
diff changeset
34 <data name="output" format="fasta" label="$on_string (NR)" />
peterjc
parents:
diff changeset
35 </outputs>
peterjc
parents:
diff changeset
36 <tests>
peterjc
parents:
diff changeset
37 <test>
peterjc
parents:
diff changeset
38 <param name="input" value="duplicates.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
39 <output name="output" file="duplicates.nr.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
40 </test>
peterjc
parents:
diff changeset
41 <test>
peterjc
parents:
diff changeset
42 <param name="input" value="duplicates.fasta.gz" ftype="fasta.gz"/>
peterjc
parents:
diff changeset
43 <output name="output" file="duplicates.nr.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
44 </test>
peterjc
parents:
diff changeset
45 <test>
peterjc
parents:
diff changeset
46 <param name="input" value="more_duplicates.fasta,duplicates.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
47 <output name="output" file="deduplicate.nosortids.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
48 </test>
peterjc
parents:
diff changeset
49 <test>
peterjc
parents:
diff changeset
50 <param name="input" value="more_duplicates.fasta,duplicates.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
51 <param name="alphasort" value="-a"/>
peterjc
parents:
diff changeset
52 <output name="output" file="deduplicate.sortids.fasta" ftype="fasta"/>
peterjc
parents:
diff changeset
53 </test>
peterjc
parents:
diff changeset
54 </tests>
peterjc
parents:
diff changeset
55 <help>
peterjc
parents:
diff changeset
56 **What it does**
peterjc
parents:
diff changeset
57
peterjc
parents:
diff changeset
58 Takes one or more input FASTA files, checks them to find any duplicate sequences
peterjc
parents:
diff changeset
59 (ignoring the case), and writes an output FASTA file where any duplicates appear
peterjc
parents:
diff changeset
60 once with a combined identifier.
peterjc
parents:
diff changeset
61
peterjc
parents:
diff changeset
62 For example, using the default separator of a semi-colon::
peterjc
parents:
diff changeset
63
peterjc
parents:
diff changeset
64 >1 first entry
peterjc
parents:
diff changeset
65 act
peterjc
parents:
diff changeset
66 >2 The A-Team
peterjc
parents:
diff changeset
67 AAaa
peterjc
parents:
diff changeset
68 >3 not unique...
peterjc
parents:
diff changeset
69 ACgt
peterjc
parents:
diff changeset
70 >4
peterjc
parents:
diff changeset
71 CCCC
peterjc
parents:
diff changeset
72 >5 a duplicate
peterjc
parents:
diff changeset
73 acgt
peterjc
parents:
diff changeset
74 >6 last!
peterjc
parents:
diff changeset
75 GGGG
peterjc
parents:
diff changeset
76
peterjc
parents:
diff changeset
77 In this simple example ``ACGT`` appears twice (ignoring case) as entries ``3``
peterjc
parents:
diff changeset
78 and ``5``. Entry ``3`` is renamed as ``3;5`` and entry ``5`` is omitted::
peterjc
parents:
diff changeset
79
peterjc
parents:
diff changeset
80 >1 first entry
peterjc
parents:
diff changeset
81 act
peterjc
parents:
diff changeset
82 >2 The A-Team
peterjc
parents:
diff changeset
83 AAaa
peterjc
parents:
diff changeset
84 >3;5 representing 2 records
peterjc
parents:
diff changeset
85 ACgt
peterjc
parents:
diff changeset
86 >4
peterjc
parents:
diff changeset
87 CCCC
peterjc
parents:
diff changeset
88 >6 last!
peterjc
parents:
diff changeset
89 GGGG
peterjc
parents:
diff changeset
90
peterjc
parents:
diff changeset
91 This means that the representative records take the position and sequence case
peterjc
parents:
diff changeset
92 from the first entry with that sequence.
peterjc
parents:
diff changeset
93
peterjc
parents:
diff changeset
94 In this case the combined entry is labelled as ``3;5``, so the sort option
peterjc
parents:
diff changeset
95 has no effect. However, if the records appear in the file with ``5`` before
peterjc
parents:
diff changeset
96 ``3`` you can choose to get ``5;3`` (order from file, default) or ``3;5``
peterjc
parents:
diff changeset
97 (ordered alphabetically).
peterjc
parents:
diff changeset
98
peterjc
parents:
diff changeset
99 Notice the unique sequences are preserved as they were with any description
peterjc
parents:
diff changeset
100 or mixed case.
peterjc
parents:
diff changeset
101
peterjc
parents:
diff changeset
102
peterjc
parents:
diff changeset
103 **References**
peterjc
parents:
diff changeset
104
peterjc
parents:
diff changeset
105 If you cannot cite this tool directly via the GitHub URL
peterjc
parents:
diff changeset
106 https://github.com/peterjc/galaxy_blast/tree/master/tools/make_nr
peterjc
parents:
diff changeset
107 and need a traditional paper, then please cite:
peterjc
parents:
diff changeset
108
peterjc
parents:
diff changeset
109 P.J.A. Cock, J.M. Chilton, B. Gruening, J.E. Johnson, N. Soranzo (2015).
peterjc
parents:
diff changeset
110 NCBI BLAST+ integrated into Galaxy.
peterjc
parents:
diff changeset
111 *GigaScience* 4:39
peterjc
parents:
diff changeset
112 https://doi.org/10.1186/s13742-015-0080-7
peterjc
parents:
diff changeset
113
peterjc
parents:
diff changeset
114 This wrapper is available to install into other Galaxy Instances via the Galaxy
peterjc
parents:
diff changeset
115 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/make_nr
peterjc
parents:
diff changeset
116 </help>
peterjc
parents:
diff changeset
117 <citations>
peterjc
parents:
diff changeset
118 <citation type="doi">10.1186/1471-2105-10-421</citation>
peterjc
parents:
diff changeset
119 </citations>
peterjc
parents:
diff changeset
120 </tool>