comparison map_damage.xml @ 0:3db999d4e0c1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/map_damage commit 3b5fcf457cea50698cc333239669bf25f6031ab5
author iuc
date Tue, 15 Jul 2025 19:00:59 +0000
parents
children 761b6fdcac6e
comparison
equal deleted inserted replaced
-1:000000000000 0:3db999d4e0c1
1 <tool id="map_damage" name="mapDamage" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.2">
2
3 <description>Tracks and quantifies damage patterns in ancient DNA sequences</description>
4
5 <macros>
6 <token name="@TOOL_VERSION@">2.2.2</token>
7 <token name="@VERSION_SUFFIX@">0</token>
8 </macros>
9
10 <xrefs>
11 <xref type="bio.tools">mapdamage</xref>
12 </xrefs>
13
14 <requirements>
15 <requirement type="package" version="@TOOL_VERSION@">mapdamage2</requirement>
16 </requirements>
17
18 <version_command><![CDATA[mapDamage --version]]></version_command>
19
20 <command detect_errors="aggressive"><![CDATA[
21 ## Symlink the inputs to fixed names (the outputs section collects alignment.fasta and alignment.rescaled.bam)
22 ln -s '$sbam_file' alignment.$sbam_file.ext &&
23
24 #if $reference.ref_source == 'history':
25 ln -s '$reference.history_reference' reference.fa &&
26 #end if
27
28 ## MAIN COMMAND LINE
29
30 mapDamage
31
32 ## INPUT / OUTPUT PARAMETERS
33
34 --input alignment.$sbam_file.ext
35
36 #if $reference.ref_source == 'builtin':
37 --reference '$reference.builtin_reference.fields.path'
38 #elif $reference.ref_source == 'history':
39 --reference reference.fa
40 #end if
41
42 --folder 'mapDamage_results'
43
44 ## GENERAL OPTIONS
45
46 $merge_reference_sequences
47 ## Optional integers are compared as strings: an explicit 0 is a valid value and must still be passed on
48 #if $downsampling.downsampling.downsampling_type != 'no_downsampling':
49 --downsample $downsampling.downsampling.downsample
50 #if str($downsampling.downsampling.downsample_seed) != '':
51 --downsample-seed $downsampling.downsampling.downsample_seed
52 #end if
53 #end if
54
55 --length $window_size.length
56 --around $window_size.around
57 --min-basequal $min_basequal
58 $fasta
59
60 ## GRAPHICS OPTIONS
61
62 --ymax $graphics.ymax
63 --readplot $graphics.readplot
64 --refplot $graphics.refplot
65
66 #if str($graphics.title):
67 --title '$graphics.title'
68 #end if
69
70 $graphics.theme_bw
71
72 ## STATISTICS OPTIONS
73
74 #if $statistics.stats.no_stats:
75 $statistics.stats.no_stats
76 #else:
77 --rand $statistics.stats.rand
78 --burn $statistics.stats.burn
79 --adjust $statistics.stats.adjust
80 --iter $statistics.stats.iter
81 $statistics.stats.ends.ends_to_process
82
83 #if not $statistics.stats.ends.ends_to_process:
84 $statistics.stats.ends.diff_hangs
85 #end if
86
87 --seq-length $statistics.stats.seq_length
88 $statistics.stats.var_disp
89 $statistics.stats.mutation_model
90 $statistics.stats.nick_vector
91 #end if
92
93 ## RESCALING OPTIONS
94 ## Same string test as for the seed: a length of 0 is a deliberate value, not an unset one
95 #if $rescale.rescaling.rescale:
96 --rescale
97 #if str($rescale.rescaling.rescale_length_5p) != '':
98 --rescale-length-5p $rescale.rescaling.rescale_length_5p
99 #end if
100 #if str($rescale.rescaling.rescale_length_3p) != '':
101 --rescale-length-3p $rescale.rescaling.rescale_length_3p
102 #end if
103 #end if
104
105 ]]></command>
106
107 <inputs>
108
109 <!--INPUT FILES-->
110
111 <param name="sbam_file" argument="--input" type="data" format="sam,bam" label="SAM / BAM input file of aligned reads to be analyzed." help="Must contain a valid header."/>
112
113 <conditional name="reference">
114 <param name="ref_source" type="select" label="Use a built-in FASTA reference or retrieve one from your history?">
115 <option value="builtin" selected="true">Built-in reference FASTA file</option>
116 <option value="history">From your history</option>
117 </param>
118 <when value="builtin">
119 <param name="builtin_reference" type="select" label="FASTA genome to use as a reference in order to assess DNA damage" help="Contact your Galaxy team in order to add a reference">
120 <options from_data_table="all_fasta"></options>
121 <validator type="no_options" message="No FASTA references are available, please contact your Galaxy administrators."/>
122 </param>
123 </when>
124 <when value="history"> <!-- reference FASTA taken from the history; the command symlinks it as reference.fa -->
125 <param name="history_reference" type="data" format="fasta" label="FASTA genome to use as a reference in order to assess DNA damage"/>
126 </when>
127 </conditional>
128
129 <!--GENERAL OPTIONS-->
130
131 <param argument="--merge-reference-sequences" type="boolean" truevalue="--merge-reference-sequences" falsevalue="" label="Merge reference sequences in result files?" help="Useful for memory usage."/>
132
133 <param argument="--min-basequal" type="integer" min="0" max="93" value="0" label="Minimum PHRED score for base to be considered" help="(assumes PHRED+33 score)"/>
134
135 <param argument="--fasta" type="boolean" checked="false" truevalue="--fasta" falsevalue="" label="Output alignments in FASTA format?"/>
136
137 <section name="downsampling" title="Downsampling"> <!-- optional read subsampling, either by fraction or by read count -->
138 <conditional name="downsampling"> <!-- "no_downsampling" adds nothing to the command line; "p" and "n" both fill the same downsample option -->
139 <param name="downsampling_type" type="select" label="Type of downsampling" help="Downsample using a percentage or a number of reads?">
140 <option value="no_downsampling" selected="true">No downsampling</option>
141 <option value="p">Percentage of reads</option>
142 <option value="n">Number of reads</option>
143 </param>
144 <when value="no_downsampling"/>
145 <when value="p">
146 <param argument="--downsample" type="float" min="0" max="1" label="Percentage of reads to sample" help="Must be between 0 and 1 (0 and 1 not included)">
147 <validator type="in_range" min="0" exclude_min="true" max="1" exclude_max="true" message="Percentage of reads to downsample must be > 0 and &lt; 1"/>
148 </param>
149 <param argument="--downsample-seed" type="integer" optional="true" label="Downsampling seed" help="Seed (integer) used to randomly select reads for downsampling. Useful for reproducibility."/>
150 </when>
151 <when value="n">
152 <param argument="--downsample" type="integer" min="1" label="Number of reads to sample" help="Must be superior or equal to 1"/>
153 <param argument="--downsample-seed" type="integer" optional="true" label="Downsampling seed" help="Seed (integer) used to randomly select reads for downsampling. Useful for reproducibility."/>
154 </when>
155 </conditional>
156 <!-- The seed is optional in both modes; the command template only adds it when downsampling is active -->
157 </section>
158
159 <section name="window_size" title="Analysis Window Size">
160 <param argument="--length" type="integer" min="1" value="70" label="Number of nucleotides to process, starting from 5p and 3p end of read" help="(Bases which are located further than this from one of the two read ends will not be analyzed)"/>
161 <param argument="--around" type="integer" min="0" value="10" label="Number of nucleotides to retrieve before and after read" help="(This is used in order to look at purine enrichment before strand breaks)"/>
162 </section>
163
164 <!--GRAPHICS OPTIONS-->
165
166 <section name="graphics" title="Graphics Options">
167 <param argument="--ymax" type="float" min="0" max="1" value="0.3" label="Graphical y-axis limit for nucleotide misincorporation frequency" help="(Bottom plot in the Fragmisincorporation_plot.pdf file)">
168 <validator type="in_range" min="0" exclude_min="true" max="1" exclude_max="false" message="--ymax must be > 0 and &lt;= 1"/>
169 </param>
170 <param argument="--readplot" type="integer" min="1" value="25" label="Number of bases to plot from 5p and 3p ends (x-axis)" help="(Bottom plot in the Fragmisincorporation_plot.pdf file) // Must be inferior or equal to --length !">
171 <validator type="in_range" min="0" exclude_min="true" message="--readplot must be > 0"/> <!-- kept for its explicit message; the min attribute above now enforces the same lower bound -->
172 </param>
173 <param argument="--refplot" type="integer" min="0" value="10" label="Number of upstream and downstream nucleotides to plot for the composition plots" help="(Top four plots in the Fragmisincorporation_plot.pdf file) // Must be inferior or equal to --around !"/>
174 <param argument="--title" type="text" optional="true" label="Plot title" help="Title to give to the plots in the pdf files"/>
175 <param argument="--theme-bw" type="select" label="Graphical theme to use for the posterior prediction plot">
176 <option value="" selected="true">gray (default)</option>
177 <option value="--theme-bw">bw (--theme-bw)</option>
178 </param>
179 </section>
180
181 <!--STATISTICS OPTIONS-->
182
183 <section name="statistics" title="Statistical Estimation Options"> <!-- settings for the statistical estimation; the command template skips all of them when statistics are disabled -->
184 <conditional name="stats">
185 <param argument="--no-stats" type="select" label="Enable statistical estimations?">
186 <option value="" selected="true">Yes</option>
187 <option value="--no-stats">No</option>
188 </param>
189 <when value=""> <!-- statistics enabled: rand, burn, adjust, iter and seq_length are always written to the command line -->
190 <param argument="--rand" type="integer" min="0" value="30" label="Number of random starting points for the likelihood optimization"/>
191 <param argument="--burn" type="integer" min="0" value="10000" label="Number of burn-in iterations"/>
192 <param argument="--adjust" type="integer" min="0" value="10" label="Number of adjust proposal variance parameter iterations"/>
193 <param argument="--iter" type="integer" min="0" value="50000" label="Number of final MCMC iterations"/>
194 <conditional name="ends">
195 <param name="ends_to_process" type="select" label="Read ends to analyze for statistical estimation" help="(relates to --forward and --reverse options, default is to use both ends)">
196 <option value="" selected="true">5p and 3p ends</option>
197 <option value="--forward">5p ends only</option>
198 <option value="--reverse">3p ends only</option>
199 </param>
200 <when value=""> <!-- the diff_hangs flag is only offered, and only emitted, when both ends are analysed -->
201 <param argument="--diff-hangs" type="boolean" checked="false" truevalue="--diff-hangs" falsevalue="" label="Overhangs are different in 5p and 3p?"/>
202 </when>
203 <when value="--forward"/>
204 <when value="--reverse"/>
205 </conditional>
206 <param argument="--seq-length" type="integer" min="1" value="12" label="Number of nucleotides (on both ends if both are used) to use for statistical estimations." help="Must be inferior or equal to the --length option !"/>
207 <param argument="--var-disp" type="boolean" checked="false" truevalue="--var-disp" falsevalue="" label="Variable dispersion in the overhangs?"/>
208 <param name="mutation_model" type="select" label="Substitution model to use in statistical estimation">
209 <option value="" selected="true">HKY85 model (Hasegawa, Kishino and Yano, default option)</option>
210 <option value="--jukes-cantor">Jukes-Cantor model (--jukes-cantor)</option>
211 </param>
212 <param name="nick_vector" type="select" label="Calculation of the nick vector" help="Default option is to estimate nick frequencies along the sequence using a GAM (Generalized Additive Model), followed by smoothing. GAM estimation will only be done if there are enough substitutions.">
213 <option value="" selected="true">Estimate using a GAM + smoothing</option>
214 <option value="--use-raw-nick-freq">Estimate using a GAM, no smoothing (--use-raw-nick-freq)</option>
215 <option value="--fix-nicks">Make it constant (--fix-nicks)</option>
216 <option value="--single-stranded">Single-stranded library protocol (makes the vector constant by filling with ones) (--single-stranded)</option>
217 </param>
218 </when>
219 <when value="--no-stats"/> <!-- only the flag itself is emitted; the statistical_estimation output collection is filtered out -->
220 </conditional>
221 </section>
222
223 <!--RESCALING OPTIONS-->
224
225 <section name="rescale" title="Rescaling options"> <!-- NOTE(review): the help text says rescaling needs the statistical estimation, yet nothing here prevents combining it with disabled statistics; confirm how mapDamage reacts -->
226 <conditional name="rescaling">
227 <param argument="--rescale" type="select" label="Perform rescaling of base quality according to DNA damage level?" help="This is helpful for downstream analyses, as damaged bases create less noise.">
228 <option value="" selected="true">No</option>
229 <option value="--rescale">Yes</option>
230 </param>
231 <when value="--rescale"> <!-- both lengths are optional; when left empty the corresponding option is not passed to mapDamage -->
232 <param argument="--rescale-length-5p" type="integer" min="0" optional="true" label="Number of bases to rescale at the 5p end" help="defaults to --seq-length if left empty. // Must be inferior or equal to --seq-length !"/>
233 <param argument="--rescale-length-3p" type="integer" min="0" optional="true" label="Number of bases to rescale at the 3p end" help="defaults to --seq-length if left empty. // Must be inferior or equal to --seq-length !"/>
234 </when>
235 <when value=""/>
236 </conditional>
237 </section>
238
239 </inputs>
240
241 <outputs>
242 <!-- Every file is collected from the mapDamage_results folder that the command template passes as the output folder -->
243 <!--RUNTIME LOG FILE-->
244
245 <data name="runtime_log" format="txt" from_work_dir="mapDamage_results/Runtime_log.txt" label="${tool.name} on ${on_string}: Runtime_log.txt"/>
246
247 <!--RESULT FILES THAT DESCRIBE DNA DAMAGE OBSERVED ON READS-->
248 <!-- No filter: this collection is declared for every run -->
249 <collection name="damage_visualisation" type="list" label="${tool.name} on ${on_string}: Data description files and plots">
250 <data name="dnacomp" format="txt" from_work_dir="mapDamage_results/dnacomp.txt" label="${tool.name} on ${on_string}: dnacomp.txt"/>
251 <data name="misincorporation" format="txt" from_work_dir="mapDamage_results/misincorporation.txt" label="${tool.name} on ${on_string}: misincorporation.txt"/>
252 <data name="5pCtoT_freq" format="txt" from_work_dir="mapDamage_results/5pCtoT_freq.txt" label="${tool.name} on ${on_string}: 5pCtoT_freq.txt"/>
253 <data name="3pGtoA_freq" format="txt" from_work_dir="mapDamage_results/3pGtoA_freq.txt" label="${tool.name} on ${on_string}: 3pGtoA_freq.txt"/>
254 <data name="Fragmisincorporation_plot" format="pdf" from_work_dir="mapDamage_results/Fragmisincorporation_plot.pdf" label="${tool.name} on ${on_string}: Fragmisincorporation_plot.pdf"/>
255 <data name="lgdistribution" format="txt" from_work_dir="mapDamage_results/lgdistribution.txt" label="${tool.name} on ${on_string}: lgdistribution.txt"/>
256 <data name="Length_plot" format="pdf" from_work_dir="mapDamage_results/Length_plot.pdf" label="${tool.name} on ${on_string}: Length_plot.pdf"/>
257 </collection>
258
259 <!--RESULT FILES THAT DESCRIBE STATISTICAL ESTIMATIONS OF DAMAGE PARAMETERS-->
260 <!-- Only declared when statistics are enabled, i.e. the no_stats select is left at its empty default -->
261 <collection name="statistical_estimation" type="list" label="${tool.name} on ${on_string}: Statistical estimation files and plots">
262 <data name="dnacomp_genome" format="csv" from_work_dir="mapDamage_results/dnacomp_genome.csv" label="${tool.name} on ${on_string}: dnacomp_genome.csv"/>
263 <data name="Stats_out_MCMC_iter_summ_stat" format="csv" from_work_dir="mapDamage_results/Stats_out_MCMC_iter_summ_stat.csv" label="${tool.name} on ${on_string}: Stats_out_MCMC_iter_summ_stat.csv"/>
264 <data name="Stats_out_MCMC_hist" format="pdf" from_work_dir="mapDamage_results/Stats_out_MCMC_hist.pdf" label="${tool.name} on ${on_string}: Stats_out_MCMC_hist.pdf"/>
265 <data name="Stats_out_MCMC_iter" format="csv" from_work_dir="mapDamage_results/Stats_out_MCMC_iter.csv" label="${tool.name} on ${on_string}: Stats_out_MCMC_iter.csv"/>
266 <data name="Stats_out_MCMC_trace" format="pdf" from_work_dir="mapDamage_results/Stats_out_MCMC_trace.pdf" label="${tool.name} on ${on_string}: Stats_out_MCMC_trace.pdf"/>
267 <data name="Stats_out_MCMC_correct_prob" format="csv" from_work_dir="mapDamage_results/Stats_out_MCMC_correct_prob.csv" label="${tool.name} on ${on_string}: Stats_out_MCMC_correct_prob.csv"/>
268 <data name="Stats_out_MCMC_post_pred" format="pdf" from_work_dir="mapDamage_results/Stats_out_MCMC_post_pred.pdf" label="${tool.name} on ${on_string}: Stats_out_MCMC_post_pred.pdf"/>
269 <filter>statistics['stats']['no_stats'] == ''</filter>
270 </collection>
271
272 <!--OPTIONAL ALIGNMENT OUTPUT FILES-->
273 <!-- Each dataset below is only declared when the matching option (rescale, fasta) is switched on -->
274 <data name="rescaled_bam" format="bam" from_work_dir="mapDamage_results/alignment.rescaled.bam" label="${tool.name} on ${on_string}: Rescaled .bam file">
275 <filter>rescale['rescaling']['rescale'] == '--rescale'</filter>
276 </data>
277
278 <data name="fasta_alignment" format="fasta" from_work_dir="mapDamage_results/alignment.fasta" label="${tool.name} on ${on_string}: FASTA file of alignments">
279 <filter>fasta</filter>
280 </data>
281
282 </outputs>
283
284 <tests>
285
286 <!--GENERAL TESTS-->
287 <!--NB: only damage_visualisation files are tested, as the statistical estimation process is stochastic and files will always be different-->
288
289 <!--SAM input-->
290 <test expect_num_outputs="17" expect_exit_code="0">
291 <!-- 17 presumably = 1 log + 2 collections + 7 elements in each collection; confirm against how Galaxy counts collection elements -->
292 <param name="sbam_file" value="test_align.sam" ftype="sam"/>
293 <conditional name="reference">
294 <param name="ref_source" value="history"/>
295 <param name="history_reference" value="ref.fa" ftype="fasta"/>
296 </conditional>
297
298 <output_collection name="damage_visualisation" type="list">
299 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
300 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
301 </output_collection>
302
303 </test>
304
305 <!--BAM input-->
306 <test expect_num_outputs="17" expect_exit_code="0">
307 <!-- Same parameters and expected files as the SAM test; only the alignment input format differs -->
308 <param name="sbam_file" value="test_align.bam" ftype="bam"/>
309 <conditional name="reference">
310 <param name="ref_source" value="history"/>
311 <param name="history_reference" value="ref.fa" ftype="fasta"/>
312 </conditional>
313
314 <output_collection name="damage_visualisation" type="list">
315 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
316 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
317 </output_collection>
318
319
320 </test>
321
322 <!--TEST TO VERIFY BUILT-IN REFERENCE GENOMES WORK AS INTENDED-->
323 <test expect_num_outputs="9" expect_exit_code="0">
324 <!-- The value "test" has to match an entry of the all_fasta data table (presumably supplied by the test data table configuration); statistics are disabled, so the statistical_estimation collection is absent -->
325 <param name="sbam_file" value="test_align.bam" ftype="bam"/>
326 <conditional name="reference">
327 <param name="ref_source" value="builtin"/>
328 <param name="builtin_reference" value="test"/>
329 </conditional>
330 <section name="statistics">
331 <conditional name="stats">
332 <param name="no_stats" value="--no-stats"/>
333 </conditional>
334 </section>
335
336 <output_collection name="damage_visualisation" type="list">
337 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
338 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
339 </output_collection>
340
341 </test>
342
343 <!--TEST TO VERIFY no_stats OPTION WORKS AS INTENDED-->
344 <test expect_num_outputs="9" expect_exit_code="0">
345 <!-- With statistics disabled the statistical_estimation collection is filtered out, hence 9 outputs instead of 17 -->
346 <param name="sbam_file" value="test_align.sam" ftype="sam"/>
347 <conditional name="reference">
348 <param name="ref_source" value="history"/>
349 <param name="history_reference" value="ref.fa" ftype="fasta"/>
350 </conditional>
351 <section name="statistics">
352 <conditional name="stats">
353 <param name="no_stats" value="--no-stats"/>
354 </conditional>
355 </section>
356
357 <output_collection name="damage_visualisation" type="list">
358 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
359 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
360 </output_collection>
361
362 </test>
363
364 <!--TEST TO VERIFY fasta OPTION WORKS AS INTENDED-->
365
366 <test expect_num_outputs="10" expect_exit_code="0">
367 <!-- Statistics disabled (9 outputs) plus the FASTA alignment dataset enabled by the fasta option = 10 -->
368 <param name="sbam_file" value="test_align.sam" ftype="sam"/>
369 <conditional name="reference">
370 <param name="ref_source" value="history"/>
371 <param name="history_reference" value="ref.fa" ftype="fasta"/>
372 </conditional>
373 <section name="statistics">
374 <conditional name="stats">
375 <param name="no_stats" value="--no-stats"/>
376 </conditional>
377 </section>
378 <param name="fasta" value="true"/>
379
380 <output_collection name="damage_visualisation" type="list">
381 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
382 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
383 </output_collection>
384
385 <output name="fasta_alignment" file="alignment.fa" compare="diff" count="1"/>
386
387 </test>
388
389 <!--TEST TO VERIFY rescale OPTION WORKS AS INTENDED-->
390 <!--BAM file is not compared, as it depends on the stochastic statistical estimations, and may therefore differ (though perhaps not with this dataset)-->
391 <test expect_num_outputs="18" expect_exit_code="0">
392 <!-- Default run with statistics (17 outputs) plus the rescaled BAM dataset = 18 -->
393 <param name="sbam_file" value="test_align.sam" ftype="sam"/>
394 <conditional name="reference">
395 <param name="ref_source" value="history"/>
396 <param name="history_reference" value="ref.fa" ftype="fasta"/>
397 </conditional>
398 <section name="rescale">
399 <conditional name="rescaling">
400 <param name="rescale" value="--rescale"/>
401 </conditional>
402 </section>
403
404 <output_collection name="damage_visualisation" type="list">
405 <element name="misincorporation" file="reference_misincorporation.txt" compare="diff" lines_diff="2"/>
406 <element name="lgdistribution" file="reference_lgdistribution.txt" compare="diff" lines_diff="2"/>
407 </output_collection>
408
409 </test>
410
411 </tests>
412
413 <help><![CDATA[
414 **Overview**:
415
416 mapDamage is a computational framework which allows for authentication of ancient DNA NGS reads by tracking and quantifying damage patterns which are specific to ancient DNA molecules.
417
418 The following damage patterns can be studied thanks to mapDamage's plots and Bayesian statistical estimations:
419
420 * Fragment length
421 * Purine enrichment before strand breaks
422 * Cytosine deaminations and resulting misincorporations (C → T and G → A) (in single-stranded and double-stranded context)
423 * Nicking
424 * Overhang lengths
425
426 This help is divided into subsections: Inputs, Outputs, Options, and General Usage. For a quick read, jump down to the 'General Usage' section!
427
428 -----
429
430 **Inputs**:
431
432 mapDamage only needs two input files:
433
434 * An alignment map file, either in BAM or SAM format.
435
436 * A reference sequence, in FASTA format. Can either be uploaded to Galaxy or chosen from a list of built-in references (no uploading required).
437
438 -----
439
440 **Outputs**:
441
442 The most commonly used outputs are listed in the 'General Usage' section below.
443
444 mapDamage has plenty of output files, which have been grouped into Data Collections in order to improve clarity.
445
446 * **Runtime_log.txt**: Contains general information about the command line which was executed by Galaxy, as well as the time taken by each task.
447
448 * Outputs in the **'Damage Visualisation' collection** (labelled 'Data description files and plots' in the history): These outputs (text files and associated PDF plots) describe the damage patterns observed in your input BAM / SAM. No statistical estimations were involved in their generation.
449
450 - **dnacomp.txt**: Contains the counts of each base (ATGC) across all reads, for the first X bases from the end in question (X is given by --length option) as well as the Y bases surrounding the end in question (Y is given by the --around option). This is broken down by end (3p/5p) and by strand (+/-), and by chromosome/contig.
451
452 - **misincorporation.txt**: Contains the counts of each type of mutation (substitutions, deletions, insertions, soft-clipping) across all reads, for the first X bases from the end in question (X is given by the --length option). This is broken down by end (3p/5p), by strand (+/-), and by chromosome/contig.
453
454 - **3pGtoA_freq.txt**: Contains the frequencies of G → A substitutions across all reads, for the first N bases from the 3p end (N is given by --readplot).
455
456 - **5pCtoT_freq.txt**: Contains the frequencies of C → T substitutions across all reads, for the first N bases from the 5p end (N is given by --readplot).
457
458 - **Fragmisincorporation_plot.pdf**: Contains plots of the above data: One frequency plot per nucleotide, as well as a plot describing the distribution of C → T and G → A substitutions
459
460 - **lgdistribution.txt**: Contains the distribution of reads according to strand on which they map (+/-) and length.
461
462 - **Length_plot.pdf**: Contains plots of the above data.
463
464 * Outputs in the **'Statistical Estimations' collection** (labelled 'Statistical estimation files and plots' in the history): These outputs describe the Bayesian statistical estimation process as well as its results.
465
466 - **dnacomp_genome.csv**: Contains whole-genome base frequencies, as calculated by seqtk.
467
468 - **Stats_out_MCMC_iter_summ_stat.csv**: Contains mean value, standard deviation, acceptance ratio and posterior distribution for the following parameters:
469
470 * θ : difference rate between reference and sample that is not due to DNA damage.
471 * ρ : transversion / transition bias.
472 * δd : rate of Cytosine deamination in a double-stranded context.
473 * δs : rate of Cytosine deamination in a single-stranded context.
474 * λ : probability of terminating an overhang.
475 * The model's log-likelihood.
476
477 - **Stats_out_MCMC_hist.pdf**: Contains plots of the posterior distribution of the 6 parameters described above.
478
479 - **Stats_out_MCMC_iter.csv**: Contains the mean values for each of the 6 parameters described above at every MCMC iteration.
480
481 - **Stats_out_MCMC_trace.pdf**: Contains trace plots derived from the previous file.
482
483 - **Stats_out_MCMC_correct_prob.csv**: Contains posterior probabilities that C → T and G → A substitutions are due to DNA damage, for the first N bases from both 3p and 5p ends (N is given by --seq-length)
484
485 - **Stats_out_MCMC_post_pred.pdf**: Contains plots derived from the previous file, as well as predictive intervals for these probabilities.
486
487
488
489 * **alignment.fasta** (if --fasta option was set): FASTA file containing alignments as specified by the BAM / SAM file. FASTA file also contains the nucleotides surrounding the reads as defined by the --around parameter.
490
491 * **alignment.rescaled.bam** (if --rescale option was set): BAM file containing a rescaled version of the alignment: the --rescale parameter leads to a re-evaluation of base quality based on misincorporation probability, which can decrease the noise created by these misincorporations in downstream analyses.
492
493 -----
494
495 **Options**:
496
497 mapDamage has many parameters, which have been grouped into sections for convenience.
498
499 Here, the objective is to provide a guide on the most important tool features, and only a few key parameters are therefore detailed.
500
501 More details concerning each option can be found:
502
503 - on the mapDamage GitHub.
504 - in the descriptions given below the options.
505
506 Some options may lack a detailed explanation (statistics related options in particular). In this case, please refer to the citations below and / or to the source code, available on the mapDamage GitHub.
507
508 * **--no-stats**: Do not perform statistical estimation of damage parameters. This can be very useful if you only plan on looking at the *Fragmisincorporation_plot.pdf* plot in order to see if the damage patterns are present, as it tends to vastly improve the run-time.
509 * **--rescale**: Produces a new BAM alignment file where base qualities have been re-evaluated according to damage probability. Please note that this can only be done if statistical estimations are performed. Useful for reducing noise in downstream analyses: aDNA related misincorporations (C → T, G → A) are no longer falsely flagged as SNPs.
510 * **--merge-reference-sequences**: Merges results for the different sequences (chromosomes or contigs) in your reference FASTA file. Instead of one table per reference sequence in the dnacomp.txt and misincorporation.txt files, there is one table describing every sequence. This greatly reduces the size of these files (size is more or less divided by the number of reference sequences)
511
512 * Options regarding **specific library preparation protocols**: By default, mapDamage assumes that you are working with merged paired-end reads, from a double-stranded library protocol. If that is not the case, the tool authors have made recommendations, which you can check out in the supplementary data of Jónsson et al. (2013).
513
514 - For **single-stranded libraries**:
515
516 * Calculation of the nick vector: single-stranded library protocol (**--single-stranded**). The authors recommend this option for treating single-stranded library data, and remark that the user should see elevated C → T substitution frequencies at both ends in the Stats_out_MCMC_post_pred.pdf plot.
517
518 - For **single-end libraries**:
519
520 * Read ends to analyze for statistical estimation: 5p ends only (**--forward**). This forces mapDamage to only analyze the 5p ends of reads.
521 * Overhangs are different in 5p and 3p?: Yes (**--diff-hangs**). This option allows different mean lengths for the two overhangs, which can be another way to account for single-end data.
522
523 -----
524
525 **General usage**:
526
527 (This description is also based on the supplementary data of Jónsson et al. (2013))
528
529 **Launching mapDamage**
530
531 For a simple usage of mapDamage, the user only needs to provide the two input files (alignment file and reference file), and press the 'Run tool' button.
532
533 If downstream SNP analyses are in order, then one may want to use the rescaling feature, which will reduce the noise induced by cytosine deamination in these analyses.
534
535 Statistical estimation may be disabled to improve run-time, if the user doesn't intend to make use of it.
536
537 **Analyzing the outputs**
538
539 The first outputs one should have a look at are the *Fragmisincorporation_plot.pdf* and the *Length_plot.pdf* files. One can check if the expected damage is present, and if there are any problems with the dataset.
540
541 If statistical estimations were run, the user should check if the proposed model is a good fit: do the empirical substitution frequencies (*Fragmisincorporation_plot.pdf* plot) fall within the prediction intervals (*Stats_out_MCMC_post_pred.pdf* plot)?
542
543 The fit can also be assessed by looking at the *Stats_out_MCMC_trace.pdf* plot, to check if an equilibrium was reached over the course of the iterations. Finally, one can have a look at the acceptance ratio values in the *Stats_out_MCMC_iter_summ_stat.csv* file (3rd row), which should be between 0.1 and 0.3.
544
545 If the fit is not correct, it may be improved by increasing the values of the --rand, --burn, --adjust and --iter options.
546
547 If the fit looks correct, the rescaled BAM file can be confidently used, and the values of the damage parameters described in the *Stats_out_MCMC_iter_summ_stat.csv* may be used for comparison across samples.
548 ]]></help>
549
550 <citations>
551 <citation type="doi">10.1093/bioinformatics/btt193</citation>
552 <citation type="doi">10.1093/bioinformatics/btr347</citation>
553 </citations>
554
555 </tool>