comparison rgPicardMarkDups.xml @ 4:f4d018471628 draft default tip

Uploaded
author jpruab
date Tue, 13 Aug 2013 12:09:14 -0400
parents
children
comparison
equal deleted inserted replaced
3:08b477977410 4:f4d018471628
1 <tool name="Mark Duplicate reads" id="rgPicardMarkDups" version="1.56.0">
2 <!-- Runs picard_wrapper.py under the python interpreter, passing each form input plus the BAM and HTML output paths; the jar path is built from JAVA_JAR_PATH (the backslash presumably defers expansion to the shell: confirm). --> <command interpreter="python">
3 picard_wrapper.py -i "${input_file}" -n "${out_prefix}" --tmpdir "${__new_file_path__}" -o "${out_file}"
4 --remdups "${remDups}" --assumesorted "${assumeSorted}" --readregex "${readRegex}" --optdupdist "${optDupeDist}"
5 -j "\$JAVA_JAR_PATH/MarkDuplicates.jar" -d "${html_file.files_path}" -t "${html_file}" -e "${input_file.ext}"
6 </command>
7 <!-- Declares the picard package, version 1.56.0, as a dependency of this tool. --> <requirements><requirement type="package" version="1.56.0">picard</requirement></requirements>
8 <inputs> <!-- Form fields; each name below is referenced as ${name} in the command template. -->
9 <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to mark duplicates in"
10 help="If empty, upload or import a SAM/BAM dataset."/>
11 <param name="out_prefix" value="Dupes Marked" type="text"
12 label="Title for the output file" help="Use this to remind you what the job was for" size="80" />
13 <param name="remDups" value="false" type="boolean" label="Remove duplicates from output file"
14 truevalue="true" falsevalue="false" checked="yes"
15 help="If true do not write duplicates to the output file instead of writing them with appropriate flags set." /> <!-- NOTE(review): value="false" and checked="yes" disagree about the initial state; confirm which default is intended. -->
16 <param name="assumeSorted" value="true" type="boolean" label="Assume reads are already ordered"
17 truevalue="true" falsevalue="false" checked="yes"
18 help="If true assume input data are already sorted (most Galaxy SAM/BAM should be)." />
19 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="80"
20 label="Regular expression that can be used to parse read names in the incoming SAM file"
21 help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
22 <sanitizer> <!-- Accepts printable characters except the single quote, which is rewritten to __sq__. -->
23 <valid initial="string.printable">
24 <remove value="&apos;"/>
25 </valid>
26 <mapping initial="none">
27 <add source="&apos;" target="__sq__"/>
28 </mapping>
29 </sanitizer>
30 </param>
31 <param name="optDupeDist" value="100" type="integer"
32 label="The maximum offset between two duplicate clusters in order to consider them optical duplicates." size="5"
33 help="e.g. 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100." >
34 <validator type="in_range" message="Optical dupe distance must be zero or greater" min="0" /> <!-- min="0" accepts zero, so the message must not say "positive". -->
35 </param>
36
37 </inputs>
38 <outputs> <!-- Two datasets, both labelled from out_prefix: the BAM passed to the wrapper via -o and the HTML report passed via -t. -->
39 <data format="bam" name="out_file" label="MarkDups_${out_prefix}.bam"/>
40 <data format="html" name="html_file" label="MarkDups_${out_prefix}.html"/>
41 </outputs>
42 <tests> <!-- Two cases: a BAM input with remDups false and a SAM input with remDups true. Both compare the HTML report against the same expected file with lines_diff="75". -->
43 <test>
44 <param name="input_file" value="picard_input_tiny_coord.bam" ftype="bam" />
45 <param name="out_prefix" value="Dupes Marked" />
46 <param name="remDups" value="false" />
47 <param name="assumeSorted" value="true" />
48 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
49 <param name="optDupeDist" value="100" />
50 <output name="out_file" file="picard_output_markdups_sortedpairsam.bam" ftype="bam" compare="diff" />
51 <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
52 </test>
53 <test>
54 <param name="input_file" value="picard_input_tiny_coord.sam" ftype="sam" />
55 <param name="out_prefix" value="Dupes Marked" />
56 <param name="remDups" value="true" />
57 <param name="assumeSorted" value="true" />
58 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
59 <param name="optDupeDist" value="100" />
60 <output name="out_file" file="picard_output_markdups_remdupes.bam" ftype="bam" compare="diff" />
61 <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
62 </test>
63 </tests>
64
65 <help>
66
67 .. class:: infomark
68
69 **Purpose**
70
71 Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them.
72
73 **Picard documentation**
74
75 This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_.
76
77 .. _Picard-tools: http://www.google.com/search?q=picard+samtools
78
79 -----
80
81 .. class:: infomark
82
83 **Inputs, outputs, and parameters**
84
85 Picard documentation says (reformatted for Galaxy):
86
87 .. csv-table:: Mark Duplicates docs
88 :header-rows: 1
89
90 Option,Description
91 "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required."
92 "OUTPUT=File","The output file to write marked records to. Required."
93 "METRICS_FILE=File","File to write duplication metrics to. Required."
94 "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false."
95 "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false."
96 "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000."
97 "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk."
98 "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. "
99 "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicate clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100"
100
101 .. class:: warningmark
102
103 **Warning on SAM/BAM quality**
104
105 Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
106 flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
107 to be the only way to deal with SAM/BAM that cannot be parsed.

108 .. class:: infomark
109
110 **Note on the Regular Expression**
111
112 (from the Picard docs)
113 This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: ``[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*``
114
115 Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged unless the remove duplicates option is selected. In some cases you may want to do this, but please only do this if you really understand what you are doing.
116
117 </help>
118 </tool>
119
120
121
122
123
124
125
126
127
128
129
130