0
|
1 <?xml version="1.0" encoding="utf-8"?>
|
|
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
3 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
4 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
|
|
5 <head>
|
|
6 <!-- 2016-10-21 Pá 11:06 -->
|
|
7 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
|
8 <meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
9 <title>TAREAN output description</title>
|
|
10 <meta name="generator" content="Org-mode" />
|
|
11 <meta name="author" content="petr" />
|
|
12 <style type="text/css">
|
|
13 <!--/*--><![CDATA[/*><!--*/
|
|
14 .title { text-align: center;
|
|
15 margin-bottom: .2em; }
|
|
16 .subtitle { text-align: center;
|
|
17 font-size: medium;
|
|
18 font-weight: bold;
|
|
19 margin-top:0; }
|
|
20 .todo { font-family: monospace; color: red; }
|
|
21 .done { font-family: monospace; color: green; }
|
|
22 .priority { font-family: monospace; color: orange; }
|
|
23 .tag { background-color: #eee; font-family: monospace;
|
|
24 padding: 2px; font-size: 80%; font-weight: normal; }
|
|
25 .timestamp { color: #bebebe; }
|
|
26 .timestamp-kwd { color: #5f9ea0; }
|
|
27 .org-right { margin-left: auto; margin-right: 0px; text-align: right; }
|
|
28 .org-left { margin-left: 0px; margin-right: auto; text-align: left; }
|
|
29 .org-center { margin-left: auto; margin-right: auto; text-align: center; }
|
|
30 .underline { text-decoration: underline; }
|
|
31 #postamble p, #preamble p { font-size: 90%; margin: .2em; }
|
|
32 p.verse { margin-left: 3%; }
|
|
33 pre {
|
|
34 border: 1px solid #ccc;
|
|
35 box-shadow: 3px 3px 3px #eee;
|
|
36 padding: 8pt;
|
|
37 font-family: monospace;
|
|
38 overflow: auto;
|
|
39 margin: 1.2em;
|
|
40 }
|
|
41 pre.src {
|
|
42 position: relative;
|
|
43 overflow: visible;
|
|
44 padding-top: 1.2em;
|
|
45 }
|
|
46 pre.src:before {
|
|
47 display: none;
|
|
48 position: absolute;
|
|
49 background-color: white;
|
|
50 top: -10px;
|
|
51 right: 10px;
|
|
52 padding: 3px;
|
|
53 border: 1px solid black;
|
|
54 }
|
|
55 pre.src:hover:before { display: inline;}
|
|
56 pre.src-sh:before { content: 'sh'; }
|
|
57 pre.src-bash:before { content: 'sh'; }
|
|
58 pre.src-emacs-lisp:before { content: 'Emacs Lisp'; }
|
|
59 pre.src-R:before { content: 'R'; }
|
|
60 pre.src-perl:before { content: 'Perl'; }
|
|
61 pre.src-java:before { content: 'Java'; }
|
|
62 pre.src-sql:before { content: 'SQL'; }
|
|
63
|
|
64 table { border-collapse:collapse; }
|
|
65 caption.t-above { caption-side: top; }
|
|
66 caption.t-bottom { caption-side: bottom; }
|
|
67 td, th { vertical-align:top; }
|
|
68 th.org-right { text-align: center; }
|
|
69 th.org-left { text-align: center; }
|
|
70 th.org-center { text-align: center; }
|
|
71 td.org-right { text-align: right; }
|
|
72 td.org-left { text-align: left; }
|
|
73 td.org-center { text-align: center; }
|
|
74 dt { font-weight: bold; }
|
|
75 .footpara { display: inline; }
|
|
76 .footdef { margin-bottom: 1em; }
|
|
77 .figure { padding: 1em; }
|
|
78 .figure p { text-align: center; }
|
|
79 .inlinetask {
|
|
80 padding: 10px;
|
|
81 border: 2px solid gray;
|
|
82 margin: 10px;
|
|
83 background: #ffffcc;
|
|
84 }
|
|
85 #org-div-home-and-up
|
|
86 { text-align: right; font-size: 70%; white-space: nowrap; }
|
|
87 textarea { overflow-x: auto; }
|
|
88 .linenr { font-size: smaller }
|
|
89 .code-highlighted { background-color: #ffff00; }
|
|
90 .org-info-js_info-navigation { border-style: none; }
|
|
91 #org-info-js_console-label
|
|
92 { font-size: 10px; font-weight: bold; white-space: nowrap; }
|
|
93 .org-info-js_search-highlight
|
|
94 { background-color: #ffff00; color: #000000; font-weight: bold; }
|
|
95 /*]]>*/-->
|
|
96 </style>
|
|
97 <link rel="stylesheet" type="text/css" href="style1.css" />
|
|
98 <script type="text/javascript">
|
|
99 /*
|
|
100 @licstart The following is the entire license notice for the
|
|
101 JavaScript code in this tag.
|
|
102
|
|
103 Copyright (C) 2012-2013 Free Software Foundation, Inc.
|
|
104
|
|
105 The JavaScript code in this tag is free software: you can
|
|
106 redistribute it and/or modify it under the terms of the GNU
|
|
107 General Public License (GNU GPL) as published by the Free Software
|
|
108 Foundation, either version 3 of the License, or (at your option)
|
|
109 any later version. The code is distributed WITHOUT ANY WARRANTY;
|
|
110 without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
111 FOR A PARTICULAR PURPOSE. See the GNU GPL for more details.
|
|
112
|
|
113 As additional permission under GNU GPL version 3 section 7, you
|
|
114 may distribute non-source (e.g., minimized or compacted) forms of
|
|
115 that code without the copy of the GNU GPL normally required by
|
|
116 section 4, provided you include this license notice and a URL
|
|
117 through which recipients can access the Corresponding Source.
|
|
118
|
|
119
|
|
120 @licend The above is the entire license notice
|
|
121 for the JavaScript code in this tag.
|
|
122 */
|
|
123 <!--/*--><![CDATA[/*><!--*/
|
|
/**
 * Highlight a code reference together with the element it points to.
 *
 * The class of both `elem` and the element whose id is `id` is replaced by
 * "code-highlighted". The previous class names are stored on `elem` as
 * `cacheClassElem` and `cacheClassTarget` so that CodeHighlightOff can
 * restore them. Nothing happens when no element with the given id exists.
 *
 * @param elem  DOM element that triggered the highlight.
 * @param id    id of the element to highlight along with `elem`.
 */
function CodeHighlightOn(elem, id)
{
  var target = document.getElementById(id);
  if(null != target) {
    // Cache only a class that is not already the highlight class. Without
    // this guard a second call before CodeHighlightOff would overwrite the
    // saved original with "code-highlighted" and the highlight would stick.
    if(elem.className !== "code-highlighted")
      elem.cacheClassElem = elem.className;
    if(target.className !== "code-highlighted")
      elem.cacheClassTarget = target.className;
    target.className = "code-highlighted";
    elem.className = "code-highlighted";
  }
}
|
|
/**
 * Undo the highlight applied by CodeHighlightOn.
 *
 * Restores the class names cached on `elem` (`cacheClassElem` for `elem`
 * itself, `cacheClassTarget` for the element whose id is `id`). Nothing is
 * restored for a value that was never cached.
 *
 * @param elem  DOM element that triggered the highlight.
 * @param id    id of the element that was highlighted along with `elem`.
 */
function CodeHighlightOff(elem, id)
{
  var target = document.getElementById(id);
  // Compare against null/undefined rather than testing truthiness: an
  // element that originally had no class caches "", which is falsy but
  // still has to be restored to remove the highlight.
  if(null != elem.cacheClassElem)
    elem.className = elem.cacheClassElem;
  // The target may not exist (unknown or removed id); skip it instead of
  // throwing on a null dereference.
  if(null != target && null != elem.cacheClassTarget)
    target.className = elem.cacheClassTarget;
}
|
|
142 /*]]>*///-->
|
|
143 </script>
|
|
144 </head>
|
|
145 <body>
|
|
146 <div id="content">
|
|
147 <h1 class="title">TAREAN output description</h1>
|
|
148 <div id="table-of-contents">
|
|
149 <h2>Table of Contents</h2>
|
|
150 <div id="text-table-of-contents">
|
|
151 <ul>
|
|
152 <li><a href="#orgheadline1">1. Introduction</a></li>
|
|
153 <li><a href="#orgheadline3">2. Main HTML report</a>
|
|
154 <ul>
|
|
155 <li><a href="#orgheadline2">2.1. Table legend</a></li>
|
|
156 </ul>
|
|
157 </li>
|
|
158 <li><a href="#orgheadline5">3. Detailed cluster report</a>
|
|
159 <ul>
|
|
160 <li><a href="#orgheadline4">3.1. Table legend</a></li>
|
|
161 </ul>
|
|
162 </li>
|
|
163 <li><a href="#orgheadline7">4. Structure of the output archive</a>
|
|
164 <ul>
|
|
165 <li><a href="#orgheadline6">4.1. structure of cluster directories</a></li>
|
|
166 </ul>
|
|
167 </li>
|
|
168 </ul>
|
|
169 </div>
|
|
170 </div>
|
|
171
|
|
172 <div id="outline-container-orgheadline1" class="outline-2">
|
|
173 <h2 id="orgheadline1"><span class="section-number-2">1</span> Introduction</h2>
|
|
174 <div class="outline-text-2" id="text-1">
|
|
175 <p>
|
|
176 TAREAN output includes an <b>HTML report</b> with a list of all analyzed clusters; the clusters are classified into five categories:
|
|
177 </p>
|
|
178 <ul class="org-ul">
|
|
179 <li>high confidence satellites</li>
|
|
180 <li>low confidence satellites</li>
|
|
181 <li>potential LTR elements</li>
|
|
182 <li>rDNA</li>
|
|
183 <li>other clusters</li>
|
|
184 </ul>
|
|
185 <p>
|
|
186 Each cluster for which consensus sequences were reconstructed also has its own detailed report, linked to the main report.
|
|
187 </p>
|
|
188 </div>
|
|
189 </div>
|
|
190
|
|
191 <div id="outline-container-orgheadline3" class="outline-2">
|
|
192 <h2 id="orgheadline3"><span class="section-number-2">2</span> Main HTML report</h2>
|
|
193 <div class="outline-text-2" id="text-2">
|
|
194 <p>
|
|
195 This report contains basic information about all clusters larger than the specified threshold (default value is 0.01% of analyzed reads).
|
|
196 </p>
|
|
197 </div>
|
|
198 <div id="outline-container-orgheadline2" class="outline-3">
|
|
199 <h3 id="orgheadline2"><span class="section-number-3">2.1</span> Table legend</h3>
|
|
200 <div class="outline-text-3" id="text-2-1">
|
|
201 <dl class="org-dl">
|
|
202 <dt>Cluster</dt><dd>Cluster identifier</dd>
|
|
203 <dt>Genome Proportion<code>[%]</code></dt><dd><i>(Number of sequences in cluster/Number of sequences in clustering) x 100%</i></dd>
|
|
204 <dt>Size</dt><dd>Number of reads in the cluster</dd>
|
|
205 <dt>Satellite probability</dt><dd>Empirical probability estimate that cluster sequences
|
|
206 are derived from satellite repeat. This estimate is based on analysis of more
|
|
207 than xxx clusters including yyy manually annotated and zzz experimentally
|
|
208 validated satellite repeats</dd>
|
|
209 <dt>Consensus</dt><dd>Consensus sequence is outcome of kmer-based
|
|
210 analysis and represents the most probable satellite monomer
|
|
211 sequence</dd>
|
|
212 <dt>Kmer analysis</dt><dd>link to analysis report for individual clusters</dd>
|
|
213 <dt>Graph layout</dt><dd>Graph-based visualization of similarities among sequence
|
|
214 reads</dd>
|
|
215 <dt>Connected component index</dt><dd>Proportion of nodes of the graph which are part
|
|
216 of the largest strongly connected component</dd>
|
|
217 <dt>Pair completeness index</dt><dd>Proportion of reads with available
|
|
218 mate-pair within the same cluster</dd>
|
|
219 <dt>Kmer coverage</dt><dd>Sum of relative frequencies of all kmers used for consensus
|
|
220 sequence reconstruction</dd>
|
|
221 <dt>|V|</dt><dd>Number of vertices of the graph</dd>
|
|
222 <dt>|E|</dt><dd>Number of edges of the graph</dd>
|
|
223 <dt>PBS score</dt><dd>Primer binding site detection score</dd>
|
|
224 <dt>The longest ORF length</dt><dd>Length of the longest open reading frame found in
|
|
225 any of the possible six reading frames. Search was done on dimer of
|
|
226 consensus so ORFs can be longer than 'monomer' length</dd>
|
|
227 <dt>Similarity-based annotation</dt><dd>Annotation based on
|
|
228 similarity search using blastn/blastx against database of known
|
|
229 repeats.</dd>
|
|
230 </dl>
|
|
231 </div>
|
|
232 </div>
|
|
233 </div>
|
|
234 <div id="outline-container-orgheadline5" class="outline-2">
|
|
235 <h2 id="orgheadline5"><span class="section-number-2">3</span> Detailed cluster report</h2>
|
|
236 <div class="outline-text-2" id="text-3">
|
|
237 <p>
|
|
238 Cluster report includes a list of major monomer sequence variants reconstructed from the most frequent k-mers. The reconstructed consensus sequences are sorted based on their significance (that is, what proportion of k-mers they represent).
|
|
239 </p>
|
|
240 </div>
|
|
241 <div id="outline-container-orgheadline4" class="outline-3">
|
|
242 <h3 id="orgheadline4"><span class="section-number-3">3.1</span> Table legend</h3>
|
|
243 <div class="outline-text-3" id="text-3-1">
|
|
244 <dl class="org-dl">
|
|
245 <dt>kmer</dt><dd>length of kmer used for consensus reconstruction.</dd>
|
|
246 <dt>variant</dt><dd>identifier of consensus variant.</dd>
|
|
247 <dt>total score</dt><dd>measure of significance of consensus variant. Score is calculated as a sum of weights of all k-mers used for consensus reconstruction.</dd>
|
|
248 <dt>monomer length</dt><dd>length of the consensus</dd>
|
|
249 <dt>consensus</dt><dd>consensus sequence without ambiguous bases.</dd>
|
|
250 <dt>graph image</dt><dd>part of de-Bruijn graph based on the abundant k-mers. Size of
|
|
251 vertices corresponds to k-mer frequencies. Paths in the graph which were used
|
|
252 for reconstruction of consensus sequences are colored gray.</dd>
|
|
253 <dt>logo image</dt><dd>consensus sequences shown as DNA logo. Height of letters corresponds to kmer frequencies. Logo images are linked to corresponding position probability matrices.</dd>
|
|
254 </dl>
|
|
255 </div>
|
|
256 </div>
|
|
257 </div>
|
|
258
|
|
259 <div id="outline-container-orgheadline7" class="outline-2">
|
|
260 <h2 id="orgheadline7"><span class="section-number-2">4</span> Structure of the output archive</h2>
|
|
261 <div class="outline-text-2" id="text-4">
|
|
262 <p>
|
|
263 Complete results from TAREAN analysis can be downloaded as a zip archive which contains the following
|
|
264 files and directories:
|
|
265 </p>
|
|
266
|
|
267 <div class="org-src-container">
|
|
268
|
|
269 <pre class="src src-files">.
|
|
270 .
|
|
271 ├── clusters_info.csv <------------ list of clusters in tab delimited format
|
|
272 ├── index.html <------------ main html report
|
|
273 ├── seqclust
|
|
274 │ ├── assembly # not implemented yet
|
|
275 │ ├── blastn <------------ results of read comparison with DNA database
|
|
276 │ ├── blastx <------------ results of read comparison with protein database
|
|
277 │ ├── clustering
|
|
278 │ │ ├── clusters
|
|
279 │ │ │ ├── dir_CL0001 <----┐- detailed information about clusters
|
|
280 │ │ │ ├── dir_CL0002 <----│
|
|
281 │ │ │ ├── dir_CL0003 <----│
|
|
282 │ │ │ .... <----┘
|
|
283 │ │ │
|
|
284 │ │ └── hitsort.cls <--------- list of reads in individual clusters
|
|
285 │ ├── mgblast
|
|
286 │ ├── prerun
|
|
287 │ └── sequences <--------- input reads
|
|
288 ├── summary # not implemented yet
|
|
289 ├── TR_consensus_rank_1_.fasta <-- reconstructed monomer sequences for HIGH confidence satellites
|
|
290 ├── TR_consensus_rank_2_.fasta <-- reconstructed monomer sequences for LOW confidence satellites
|
|
291 ├── TR_consensus_rank_3_.fasta <-- reconstructed sequences of potential LTR elements
|
|
292 └── TR_consensus_rank_4_.fasta <-- reconstructed consensus for rDNA
|
|
293 </pre>
|
|
294 </div>
|
|
295
|
|
296 <p>
|
|
297 List of all clusters which is available in HTML file <code>index.html</code> is also
|
|
298 available in tab delimited format in the file <code>clusters_info.csv</code> which can be
|
|
299 easily viewed and edited in spreadsheet editing programs. List of all clusters
|
|
300 and the corresponding reads is in the file <code>hitsort.cls</code> which has the following
|
|
301 format:
|
|
302 </p>
|
|
303
|
|
304 <pre class="example">
|
|
305 >CL1 11
|
|
306 134234r 55494f 85525f 136746r 96742f 91926f 239729r 105445f 222518r 136402r 9013
|
|
307 >CL2 10
|
|
308 76205r 120735r 69527r 12235r 176778f 189307f 131952f 163507f 100038r 178475r
|
|
309 >CL3 6
|
|
310 99835r 222598f 29715r 102023f 99524r 30116f
|
|
311 >CL4 6
|
|
312 51723r 69073r 218774r 146425f 136314r 41744f
|
|
313 >CL5 5
|
|
314 70686f 65565f 234078r 50430r 68247r
|
|
315 </pre>
|
|
316
|
|
317 <p>
|
|
318 where <code>CL1 11</code> is the cluster ID followed by number of reads in the cluster;
|
|
319 next line contains list of all read names belonging to the cluster.
|
|
320 </p>
|
|
321 </div>
|
|
322 <div id="outline-container-orgheadline6" class="outline-3">
|
|
323 <h3 id="orgheadline6"><span class="section-number-3">4.1</span> structure of cluster directories</h3>
|
|
324 <div class="outline-text-3" id="text-4-1">
|
|
325 <p>
|
|
326 Detailed information for each cluster is stored in subdirectories:
|
|
327 </p>
|
|
328
|
|
329 <div class="org-src-container">
|
|
330
|
|
331 <pre class="src src-folder">dir_CL0011
|
|
332 ├── blast.csv <------------tab delimited file, all-to-all comparison of reads within cluster
|
|
333 ├── CL11_directed_graph.RData <----directed graph representation of cluster saved as R igraph object
|
|
334 ├── CL11.GL <-----------------undirected graph representation of cluster saved as R igraph object
|
|
335 ├── CL11.png <-----------┐- images with graph visualization
|
|
336 ├── CL11_tmb.png <-----------┘
|
|
337 ├── dna_database_annotation.csv <-- annotation of cluster reads based on the DNA database of repeats
|
|
338 ├── reads_all.fas <---------------- all reads included in the cluster in fasta format
|
|
339 ├── reads.fas <---------------- subset of reads used for monomer reconstruction
|
|
340 ├── reads_oriented.fas <------------ subset of reads all in the same orientation
|
|
341 └── tarean
|
|
342 ├── consensus.fasta <----------- fasta file with tandem repeat consensus variants
|
|
343 ├── ggmin.RData
|
|
344 ├── img
|
|
345 │ ├── graph_11mer_1.png <-----┐
|
|
346 │ ├── graph_11mer_2.png <-----│
|
|
347 │ ├── graph_15mer_2.png <-----│
|
|
348 │ ├── graph_15mer_3.png <-----│
|
|
349 │ ├── graph_15mer_4.png <-----│ images of kmer-based graphs used for reconstruction of
|
|
350 │ ├── graph_19mer_2.png <-----│ monomer variants
|
|
351 │ ├── graph_19mer_4.png <-----│
|
|
352 │ ├── graph_19mer_5.png <-----│
|
|
353 │ ├── graph_23mer_2.png <-----│
|
|
354 │ ├── graph_27mer_3.png <-----┘
|
|
355 │ │
|
|
356 │ ├── logo_11mer_1.png <-----┐
|
|
357 │ ├── logo_11mer_2.png <-----│
|
|
358 │ ├── logo_15mer_2.png <-----│
|
|
359 │ ├── logo_15mer_3.png <-----│
|
|
360 │ ├── logo_15mer_4.png <-----│ images with DNA logos representing consensus sequences
|
|
361 │ ├── logo_19mer_2.png <-----│ of monomer variants
|
|
362 │ ├── logo_19mer_4.png <-----│
|
|
363 │ ├── logo_19mer_5.png <-----│
|
|
364 │ ├── logo_23mer_2.png <-----│
|
|
365 │ └── logo_27mer_3.png <-----┘
|
|
366 │
|
|
367 ├── ppm_11mer_1.csv <-----┐
|
|
368 ├── ppm_11mer_2.csv <-----│
|
|
369 ├── ppm_15mer_2.csv <-----│
|
|
370 ├── ppm_15mer_3.csv <-----│
|
|
371 ├── ppm_15mer_4.csv <-----│ position probability matrices for individual monomer
|
|
372 ├── ppm_19mer_2.csv <-----│ variants derived from k-mer frequencies
|
|
373 ├── ppm_19mer_4.csv <-----│
|
|
374 ├── ppm_19mer_5.csv <-----│
|
|
375 ├── ppm_23mer_2.csv <-----│
|
|
376 ├── ppm_27mer_3.csv <-----┘
|
|
377 │
|
|
378 ├── reads_oriented.fas_11.kmers <-----┐
|
|
379 ├── reads_oriented.fas_15.kmers <-----│
|
|
380 ├── reads_oriented.fas_19.kmers <-----│ k-mer frequencies calculated on oriented reads
|
|
381 ├── reads_oriented.fas_23.kmers <-----│ for k-mer lengths 11 - 27
|
|
382 ├── reads_oriented.fas_27.kmers <-----┘
|
|
383 ├── reads_oriented.fasblast_out.cvs <---------┐results of blastn search against database of tRNA
|
|
384 ├── reads_oriented.fasblast_out.cvs_L.csv <----│for purposes of LTR detection
|
|
385 ├── reads_oriented.fasblast_out.cvs_R.csv <----┘
|
|
386 └── report.html <--- cluster analysis HTML summary
|
|
387 </pre>
|
|
388 </div>
|
|
389 </div>
|
|
390 </div>
|
|
391 </div>
|
|
392 </div>
|
|
393 <div id="postamble" class="status">
|
|
394 <p class="author">Author: petr</p>
|
|
395 <p class="date">Created: 2016-10-21 Pá 11:06</p>
|
|
396 <p class="validation"><a href="http://validator.w3.org/check?uri=referer">Validate</a></p>
|
|
397 </div>
|
|
398 </body>
|
|
399 </html>
|