annotate cd_hit_protein.xml @ 3:43724ea1c85f

Add cd-hit for protein fastas
author Jim Johnson <jj@umn.edu>
date Thu, 27 Jun 2013 21:37:08 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
1 <tool id="cd_hit_protein" name="CD-HIT PROTEIN" version="1.2">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
2 <description>Cluster a protein dataset into representative sequences</description>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
3 <requirements>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
4 <requirement type="package" version="4.6.1">cd-hit</requirement>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
5 </requirements>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
6 <macros>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
7 <import>cdhit_macros.xml</import>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
8 </macros>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
9 <command>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
10 cd-hit -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
11 #include source=$common_cdhit_options#
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
12 #include source=$runtime_tuning#
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
13 </command>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
14 <inputs>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
15 <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
16 <param name="similarity" type="float" value="0.9" label="similarity threshold: .4 - 1.0 (default .9)">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
17 <validator type="in_range" message="sequence similarity threshold should be .4 - 1.0" min=".4" max="1.0"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
18 </param>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
19 <param name="wordsize" type="integer" value="5" label="word size (default 5)">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
20 <help> Suggested word size:
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
21 5 for thresholds 0.7 ~ 1.0;
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
22 4 for thresholds 0.6 ~ 0.7;
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
23 3 for thresholds 0.5 ~ 0.6;
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
24 2 for thresholds 0.4 ~ 0.5;
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
25 </help>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
26 <validator type="in_range" message="word size should be between 2 and 5" min="2" max="5"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
27 </param>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
28 <expand macro="common_cdhit_options" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
29 <expand macro="runtime_tuning" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
30 </inputs>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
31 <outputs>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
32 <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
33 <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
34 </outputs>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
35 <tests>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
36 <test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
37 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
38 <param name="similarity" value="0.9"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
39 <param name="wordsize" value="5"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
40 <!-- conditionals in macros -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
41 <param name="settings" value="no"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
42 <param name="tuning" value="default"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
43 <output name="clusters_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
44 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
45 <has_text text="Cluster 0" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
46 <!--
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
47 <has_text_matching expression=">sp.P00338-2.LDHA_HU" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
48 -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
49 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
50 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
51 <output name="fasta_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
52 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
53 <has_text_matching expression=">sp.P19858.LDHA_BOVIN" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
54 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
55 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
56 </test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
57 <test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
58 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
59 <param name="similarity" value="0.8" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
60 <param name="wordsize" value="5" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
61 <!-- conditionals in macros -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
62 <param name="settings" value="no"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
63 <param name="tuning" value="default"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
64 <output name="clusters_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
65 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
66 <has_text text="Cluster 0" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
67 <not_has_text text="Cluster 4" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
68 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
69 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
70 <output name="fasta_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
71 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
72 <has_text_matching expression=">sp.P00340.LDHA_CHICK" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
73 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
74 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
75 </test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
76 </tests>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
77
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
78 <help>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
79 **CD-HIT**
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
80
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
81 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta-format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition, cd-hit outputs a cluster file documenting the sequence 'groupies' (cluster members) for each nr representative sequence. The idea is to reduce the overall size of the database without losing sequence information, by removing only 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
82
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
83 .. _CD-HIT: http://www.bioinformatics.org/cd-hit/
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
84
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
85 ------
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
86
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
87 **Inputs**
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
88
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
89 cd-hit requires a protein fasta dataset as input.
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
90
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
91 ------
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
92
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
93 **Outputs**
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
94
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
95 A fasta dataset containing representative sequences.
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
96
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
97 A text file listing the mapping of sequences to the representative sequences::
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
98
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
99 >Cluster 0
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
100 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
101 >Cluster 1
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
102 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
103 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
104 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
105 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
106 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
107 >Cluster 2
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
108 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
109 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
110 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
111 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
112
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
113
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
114 </help>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
115 </tool>