annotate microsatellite.xml @ 0:07588b899c13 draft

Uploaded
author arkarachai-fungtammasan
date Wed, 01 Apr 2015 17:05:51 -0400
parents
children d5ed5c2e25c3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
1 <tool id="microsatellite" name="Microsatellite detection" version="1.0.0">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
2 <description>for short read, reference, and mapped data</description>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
3 <command interpreter="python2.7"> microsatellite.py
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
4 "${filePath}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
5 #if $inputFileSource.inputFileType == "fasta"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
6 --fasta
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
7 #elif $inputFileSource.inputFileType == "fastq"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
8 --fastq
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
9 #elif $inputFileSource.inputFileType == "fastq_noquals"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
10 --fastq:noquals
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
11 #elif $inputFileSource.inputFileType == "sam"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
12 --sam
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
13 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
14
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
15 #if $inputFileSource.inputFileType == "sam"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
16 #if $inputFileSource.referenceFileSource.requireReference
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
17 --r --ref="${inputFileSource.referenceFileSource.referencePath}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
18 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
19 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
20
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
21 --period="${period}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
22
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
23 #if $partialmotifs
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
24 --partialmotifs
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
25 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
26
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
27 --minlength="${minlength}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
28
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
29
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
30 --prefix="${prefix}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
31 --suffix="${surfix}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
32
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
33 --hamming="${hammingThreshold}"
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
34
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
35 #if $multipleruns
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
36 --multipleruns
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
37 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
38
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
39 #if $flankSetting.noflankdisplay
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
40 --noflankdisplay
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
41 #else
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
42 --flankdisplay=${flankSetting.flankdisplay}
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
43 #end if
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
44 &gt; $stdout
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
45 </command>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
46
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
47 <inputs>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
48 <param name="filePath" label="Select input file" type="data"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
49 <conditional name="inputFileSource">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
50 <param name="inputFileType" type="select" label="Select input file type">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
51 <option value="fasta">Fasta File</option>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
52 <option value="fastq">Fastq File</option>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
53 <option value="fastq_noquals">Fastq File without Quality Information</option>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
54 <option value="sam">SAM File</option>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
55 </param>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
56 <when value="sam">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
57 <conditional name="referenceFileSource">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
58 <param name="requireReference" label="Do you want to extract correspond microsatellites in reference for comparison?" type="boolean">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
59 </param>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
60 <when value="true">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
61 <param name="referencePath" label="Select reference file" type="data"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
62 </when>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
63 </conditional>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
64 </when>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
65 </conditional>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
66
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
67 <param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
68 <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="True"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
69 <param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
70
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
71
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
72 <param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
73 <param name="surfix" label="Do not report candidate repeat intervals that have right flanking region less than (bp):" type="integer" size="4" value="20"/> <!-- the name keeps the spelling "surfix" because the command template passes ${surfix} as the suffix option -->
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
74
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
75
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
76 <param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0, interrupted microsatellites will also be reported" type="integer" size="2" value="0"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
77 <param name="multipleruns" label="Consider all candidate intervals in a sequence. If not check, only the longest one will be considered" type="boolean" checked="True"> </param>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
78 <conditional name="flankSetting">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
79 <param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="True"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
80 <when value="false">
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
81 <param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
82 </when>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
83 </conditional>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
84
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
85 </inputs>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
86 <outputs>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
87 <data name="stdout" format="tabular"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
88 </outputs>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
89 <tests>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
90 <!-- Test data with valid values -->
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
91 <test>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
92 <param name="filePath" value="C_sample_fastq"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
93 <param name="period" value="1"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
94 <param name="partialmotifs" value="true" />
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
95 <param name="minlength" value="3" />
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
96 <param name="prefix" value="5"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
97 <param name="surfix" value="5"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
98 <param name="hammingThreshold" value="0"/>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
99 <param name="multipleruns" value="true"> </param>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
100 <output name="stdout" file="C_sample_snoope"/> <!-- name must match the data output named stdout declared in the outputs section -->
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
101 </test>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
102
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
103 </tests>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
104 <help>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
105
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
106
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
107 .. class:: infomark
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
108
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
109 **What it does**
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
110
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
111 We use different algorithms to detect microsatellites depending on the Hamming distance parameter.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
112 If the Hamming distance is set to zero, the program only considers uninterrupted microsatellites. The process works as follows:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
113
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
114 1) Scanning reads using sliding windows. For a given repeat period ‘k’ (e.g. k=2 for dinucleotide TRs), we compare consecutive k-mer-sized windows of the sequence, with a step size of k. If a base at a given position matches the base k positions earlier, it is marked with a plus; if the corresponding sites have different bases, it is marked with a minus. The first k positions are blank.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
115
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
116 2) Since we do not allow mutations in reported TR, consecutive “+” signal sequence means that a k-mer TR is present in this sample.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
117
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
118 3) Report k-mer TRs if the length is larger than a threshold provided by the user.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
119
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
120 If the Hamming distance is set to an integer greater than zero, the program considers both uninterrupted and interrupted microsatellites. The process works as follows:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
121
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
122 (1) Identify intervals that are highly correlated with the interval shifted by ‘k’ (the repeat period). These intervals are called "runs" or "candidates". The allowed level of correlation is 6/7. Depending on whether we want to look for more than one microsat, we either find the longest such run (simple algorithm) or many runs (more complicated algorithm). The following steps are then performed on each run.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
123
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
124 (2) Find the most likely repeat motif in the run. This is done by counting all kmers (of length P) and choosing the most frequent. If that kmer is itself covered by a sub-repeat we discard this run. The idea is that we can ignore a 6-mer like ACGACG because we will find it when we are looking for 3-mers.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
125
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
126 (3) Once we identify the most likely repeat motif, we then modify the interval, adjusting start and end to find the interval that has the fewest mismatches vs. a sequence of the motif repeated (hamming distance).
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
127
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
128 (4) At this point we have a valid microsat interval (in the eyes of the program). It is subjected to some filtering stages (hamming distance or too close to an end), and if it satisfies those conditions, it's reported to the user.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
129
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
130 For more options, the script behind this tool can be downloaded and run with Python independently of Galaxy; the script mode offers additional options. A help page is built into the script.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
131
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
132 **Citation**
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
133
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
134 When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
135 This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
136
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
137 **Input**
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
138
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
139 - The input files can be fastq, fasta, fastq without quality score, and SAM format.
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
140
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
141 **Output**
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
142
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
143 For fastq, the output will contain the following columns:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
144
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
145 - Column 1 = length of microsatellites (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
146 - Column 2 = length of left flanking regions (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
147 - Column 3 = length of right flanking regions (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
148 - Column 4 = repeat motif (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
149 - Column 5 = hamming distance
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
150 - Column 6 = read name
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
151 - Column 7 = read sequence with soft masking of microsatellites
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
152 - Column 8 = read quality (the same Phred score scale as input)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
153
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
154 For fasta, fastq without quality scores, and SAM format, column 8 will be replaced with a dot (.).
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
155
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
156 If users have a mapped file (SAM) and would like to profile microsatellites from pre-mapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding microsatellites in the reference for comparison. The output will be as follows:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
157
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
158 - Column 1 = length of microsatellites (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
159 - Column 2 = length of left flanking regions (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
160 - Column 3 = length of right flanking regions (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
161 - Column 4 = repeat motif (bp)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
162 - Column 5 = hamming distance
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
163 - Column 6 = read name
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
164 - Column 7 = read sequence with soft masking of microsatellites
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
165 - Column 8 = read quality (the same Phred score scale as input)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
166 - Column 9 = read name (The same as column 6)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
167 - Column 10 = chromosome
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
168 - Column 11 = left flanking region start
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
169 - Column 12 = left flanking region stop
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
170 - Column 13 = microsatellite start as inferred from paired-end data
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
171 - Column 14 = microsatellite stop as inferred from paired-end data
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
172 - Column 15 = right flanking region start
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
173 - Column 16 = right flanking region stop
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
174 - Column 17 = microsatellite length in reference
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
175 - Column 18 = microsatellite sequence in reference
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
176
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
177 </help>
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
178 </tool>