Mercurial > repos > iuc > nugen_nudup

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nugen_nudup.xml	Fri Dec 02 18:03:47 2016 -0500
@@ -0,0 +1,115 @@
+<tool id="nugen_nudup" name="NuDUP" version="2.2_post2016104">
+    <description>mark/remove PCR duplicates based on molecular tags</description>
+    <requirements>
+        <requirement type="package" version="2.2_post2016104">nudup</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <version_command>nudup.py --version</version_command>
+    <command><![CDATA[
+         ln -f -s '$input' 'input.bam' &&
+         ln -f -s '$input.metadata.bam_index' 'input.bai' &&
+        nudup.py $paired_end
+        -f '$umi_fastq'
+        --start $start
+        --length $length
+        'input.bam'
+        ]]>
+    </command>
+    <inputs>
+        <param type="data" name="input" label="Input SAM/BAM file"
+            format="sam,bam" help="Input SAM/BAM containing only unique
+            alignments" />
+        <param type="data" name="umi_fastq" label="Fastq file containing
+            molecular tag sequence" format="fastq,fastqsanger" help="FASTQ
+            file containing the molecular tag sequence for each read name in
+            the corresponding SAM/BAM file" />
+        <param type="boolean" argument="--paired-end"
+            label="Paired-end deduping" name="paired_end"
+            truevalue="--paired-end" falsevalue=""
+            checked="false"
+            help="use paired end deduping with template. SAM/BAM alignment
+            must contain paired end reads. Degenerate read pairs
+            (alignments for one read of pair) will be discarded." />
+        <param type="integer" argument="--start" label="Tag sequence start
+            position from 3' end" value="6" help="position in index read where
+            molecular tag sequence begins. This should be a 1-based value that
+            counts in from the 3' END of the read." />
+        <param type="integer" argument="--length" label="Tag sequence length"
+            value="6" help="length of molecular tag sequence" />
+    </inputs>
+    <outputs>
+        <data format="bam" name="markdup" from_work_dir="prefix.sorted.markdup.bam" />
+        <data format="bam" name="dedup" from_work_dir="prefix.sorted.dedup.bam" />
+        <data format="txt" name="log" from_work_dir="prefix_dup_log.txt" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="nudup_test_1.bam" ftype="bam" />
+            <param name="umi_fastq" value="nudup_umis.fastq" ftype="fastqsanger" />
+            <param name="start" value="8" />
+            <param name="length" value="8" />
+            <output name="markdup" file="nudup_markdup_1.bam" ftype="bam" />
+            <output name="dedup" file="nudup_dedup_1.bam" ftype="bam" />
+            <output name="log" file="nudup_log_1.txt" ftype="txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+Marks/removes PCR introduced duplicate molecules based on the molecular tagging
+technology used in NuGEN products.
+
+For SINGLE END reads, duplicates are marked if they fulfill the following
+criteria: a) start at the same genomic coordinate b) have the same strand
+orientation c) have the same molecular tag sequence. The read with the
+highest mapping quality is kept as the non-duplicate read.
+
+For PAIRED END reads, duplicates are marked if they fulfill the following
+criteria: a) start at the same genomic coordinate b) have the same template
+length c) have the same molecular tag sequence. The read pair with the highest
+mapping quality is kept as the non-duplicate read.
+
+Author: Anand Patel
+
+Contact: NuGEN Technologies Inc., techserv@nugen.com
+
+::
+
+    Input:
+      IN.sam|IN.bam         input sorted/unsorted SAM/BAM containing only unique
+                            alignments (sorted required for case 2 detailed above)
+
+    Options:
+      -2, --paired-end      use paired end deduping with template. SAM/BAM
+                            alignment must contain paired end reads. Degenerate
+                            read pairs (alignments for one read of pair) will be
+                            discarded.
+      -f INDEX.fq|READ.fq   FASTQ file containing the molecular tag sequence for
+                            each read name in the corresponding SAM/BAM file
+                            (required only for CASE 1 detailed above)
+      -o OUT_PREFIX, --out OUT_PREFIX
+                            prefix of output file paths for sorted BAMs (default
+                            will create prefix.sorted.markdup.bam,
+                            prefix.sorted.dedup.bam, prefix_dup_log.txt)
+      -s START, --start START
+                            position in index read where molecular tag sequence
+                            begins. This should be a 1-based value that counts in
+                            from the 3' END of the read. (default = 6)
+      -l LENGTH, --length LENGTH
+                            length of molecular tag sequence (default = 6)
+      -v, --version         show program's version number and exit
+      -h, --help            show this help message and exit
+        ]]></help>
+    <citations>
+        <citation type="bibtex">@misc{Patel2016,
+  author = {Patel, Anand},
+  title = {NuDUP},
+  year = {2016},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/nugentechnologies/nudup}},
+  commit = {740d9fe439dd8917605a56483a8796b377eb24c6}
+}
+        </citation>
+    </citations>
+</tool>
Binary file test-data/nudup_dedup_1.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nudup_log_1.txt	Fri Dec 02 18:03:47 2016 -0500
@@ -0,0 +1,2 @@
+aligned_count	unaligned_count	position_dup_count	frac_position_dup	moltag_dup_count	frac_moltag_dup
+4	0	3	0.7500	1	0.2500
Binary file test-data/nudup_markdup_1.bam has changed
Binary file test-data/nudup_test_1.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nudup_umis.fastq	Fri Dec 02 18:03:47 2016 -0500
@@ -0,0 +1,16 @@
+@HWI-M04292:9:000000000-G129D:1:1102:2975:16166 3:N:0:
+CTAATACT
++
+BFFFGGFE
+@HWI-M04292:9:000000000-G129D:1:1103:25506:5857 3:N:0:
+GCCATCGC
++
+FFCFGGGG
+@HWI-M04292:9:000000000-G129D:1:1104:4648:15369 3:N:0:
+GACCTGGT
++
+FFFFGGGG
+@HWI-M04292:9:000000000-G129D:1:2101:15393:25938 3:N:0:
+CTAATACT
++
+FFFFGGGG