<?xml version="1.0" encoding="UTF-8"?>
<!--
  Galaxy tool wrapper around microsatellite.py (STR detection).

  Layout:
    command : Cheetah template that assembles the microsatellite.py command
              line from the parameters declared under inputs and redirects
              the script's standard output into the "stdout" dataset.
    inputs  : the input dataset, its format, and the detection settings.
    outputs : a single tabular dataset named "stdout".
    tests   : one functional test on a fastq sample.
    help    : reStructuredText shown to the user. Keep it flush-left,
              because indentation is significant in reStructuredText.
-->
<tool id="microsatellite" name="STR detection" version="1.0.0">
    <description>for short read, reference, and mapped data</description>
    <!-- No XML comments inside the command element: keep its content pure template text. -->
    <command interpreter="python2.7"> microsatellite.py
        "${filePath}"
        #if $inputFileSource.inputFileType == "fasta"
            --fasta
        #elif $inputFileSource.inputFileType == "fastq"
            --fastq
        #elif $inputFileSource.inputFileType == "fastq_noquals"
            --fastq:noquals
        #elif $inputFileSource.inputFileType == "sam"
            --sam
        #end if

        #if $inputFileSource.inputFileType == "sam"
            #if $inputFileSource.referenceFileSource.requireReference
                --r --ref="${inputFileSource.referenceFileSource.referencePath}"
            #end if
        #end if

        --period="${period}"

        #if $partialmotifs == "true"
            --partialmotifs
        #end if

        --minlength="${minlength}"

        --prefix="${prefix}"
        --suffix="${surfix}"

        --hamming="${hammingThreshold}"

        #if $multipleruns
            --multipleruns
        #end if

        #if $flankSetting.noflankdisplay
            --noflankdisplay
        #else
            --flankdisplay=${flankSetting.flankdisplay}
        #end if
        > $stdout
    </command>

    <inputs>
        <param name="filePath" label="Select input file" type="data"/>
        <!-- The selected type decides which format flag the command template passes. -->
        <conditional name="inputFileSource">
            <param name="inputFileType" type="select" label="Select input file type">
                <option value="fasta">Fasta File</option>
                <option value="fastq">Fastq File</option>
                <option value="fastq_noquals">Fastq File without Quality Information</option>
                <option value="sam">SAM File</option>
            </param>
            <!-- Only SAM input has extra settings; the other cases are declared explicitly and are empty. -->
            <when value="fasta"/>
            <when value="fastq"/>
            <when value="fastq_noquals"/>
            <when value="sam">
                <conditional name="referenceFileSource">
                    <param name="requireReference" label="Do you want to extract corresponding microsatellites in reference for comparison?" type="boolean"/>
                    <when value="true">
                        <param name="referencePath" label="Select reference file" type="data"/>
                    </when>
                    <when value="false"/>
                </conditional>
            </when>
        </conditional>

        <param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/>
        <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="true"/>
        <param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/>

        <!-- "surfix" is the historical parameter name; it is kept so saved workflows keep working. It feeds the suffix option. -->
        <param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>
        <param name="surfix" label="Do not report candidate repeat intervals that have right flanking region less than (bp):" type="integer" size="4" value="20"/>

        <param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0, interrupted microsatellites will also be reported" type="integer" size="2" value="0"/>
        <param name="multipleruns" label="Consider all candidate intervals in a sequence. If not checked, only the longest one will be considered" type="boolean" checked="true"/>
        <!-- When the box is unchecked, the user supplies the flank length limit instead. -->
        <conditional name="flankSetting">
            <param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="true"/>
            <when value="true"/>
            <when value="false">
                <param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <data name="stdout" format="tabular"/>
    </outputs>

    <tests>
        <!-- Test data with valid values -->
        <test>
            <param name="filePath" value="C_sample_fastq"/>
            <param name="period" value="1"/>
            <param name="inputFileType" value="fastq"/>
            <param name="partialmotifs" value="true"/>
            <param name="minlength" value="3"/>
            <param name="prefix" value="5"/>
            <param name="surfix" value="5"/>
            <param name="hammingThreshold" value="0"/>
            <param name="multipleruns" value="true"/>
            <!-- The output name must match the data element declared under outputs. -->
            <output name="stdout" file="C_sample_snoope"/>
        </test>
    </tests>

    <help>

.. class:: infomark

**What it does**

This tool identifies simple as well as interrupted STRs. Choosing a hamming distance of zero will return simple STRs.
Choosing a hamming distance of greater than zero will return both simple and interrupted STRs.
The algorithms used to identify simple and interrupted STRs are described in the manuscript cited below (see TABLE XXXX).

**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**
This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)

**Input**

- The input files can be fastq, fasta, fastq without quality score, and SAM format.

**Output**

For fastq, the output will contain the following columns:

- Column 1 = length of STR (bp)
- Column 2 = length of left flanking region (bp)
- Column 3 = length of right flanking region (bp)
- Column 4 = repeat motif (bp)
- Column 5 = hamming distance
- Column 6 = read name
- Column 7 = read sequence with soft masking of STR
- Column 8 = read quality (the same Phred score scale as input)

For fasta, fastq without quality score and sam format, column 8 will be replaced with a dot (.).

If the users have a mapped file (SAM) and would like to profile STRs from premapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding STRs in the reference for comparison. The output will be as follows:

- Column 1 = length of STR (bp)
- Column 2 = length of left flanking region (bp)
- Column 3 = length of right flanking region (bp)
- Column 4 = repeat motif (bp)
- Column 5 = hamming distance
- Column 6 = read name
- Column 7 = read sequence with soft masking of STR
- Column 8 = read quality (the same Phred score scale as input)
- Column 9 = read name (The same as column 6)
- Column 10 = chromosome
- Column 11 = left flanking region start
- Column 12 = left flanking region stop
- Column 13 = STR start as inferred from paired-end
- Column 14 = STR stop as inferred from paired-end
- Column 15 = right flanking region start
- Column 16 = right flanking region stop
- Column 17 = STR length in reference
- Column 18 = STR sequence in reference

    </help>
</tool>