# HG changeset patch
# User iuc
# Date 1631544656 0
# Node ID cf25b50eff0a34ff450b75f941d26e7a1d22a173
# Parent bc082a79d655786bba18ae6b165a0edff00f693b
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
diff -r bc082a79d655 -r cf25b50eff0a macros.xml
--- a/macros.xml Wed Feb 10 19:31:44 2021 +0000
+++ b/macros.xml Mon Sep 13 14:50:56 2021 +0000
@@ -1,5 +1,43 @@
+
+
+
+ 1.1.2
+ 0
+ 21.01
+
+
+ umi_tools
+
+
+
+
+
+ 10.1101/gr.209601.116
+
+ @misc{githubUMI-tools,
+ title = {UMI-tools},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ url = {https://github.com/CGATOxford/UMI-tools},
+ }
+
+
+
+
+
+
+
+
+
+
+
+
@@ -23,90 +61,510 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz,fastqsolexa,fastqsolexa.gz
+
+
+ umi-tools
+
+
-
-
+
+
-
+
+
-
-
-
+
+
+
+
+
-
-
+
+
+
+
-
-
- 10.1101/gr.209601.116
-
- @misc{githubUMI-tools,
- title = {UMI-tools},
- publisher = {GitHub},
- journal = {GitHub repository},
- url = {https://github.com/CGATOxford/UMI-tools},
- }
-
-
-
-
-
- umi_tools
-
-
-
- 0.5.5
+
+
+
+ 'input.bam' &&
+ samtools index -b 'input.bam' &&
+ #set $input_file = 'input.bam'
+ #else:
+ ln -sf '${input}' 'input.bam' &&
+ ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
+ #set $input_file = 'input.bam'
+ #end if
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .{8,12})(?PGAGTGATTGCTTGTGACGCCTT)(?P.{8})(?P.{6})T{3}.*
+
+ Where only reads with a 3' T-tail and `GAGTGATTGCTTGTGACGCCTT` in
+ the correct position to yield two cell barcodes of 8-12 and 8bp
+ respectively, and a 6bp UMI will be retained.
+
+ You can also specify fuzzy matching to allow errors. For example if
+ the discard group above was specified as below this would enable
+ matches with up to 2 errors in the discard_1 group.
+
+ ::
+
+ (?PGAGTGATTGCTTGTGACGCCTT){s<=2}
+
+ Note that all UMIs must be the same length for downstream
+ processing with dedup, group or count commands]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ``,
+replacing with e.g ":".
+
+Alternatively, if your UMIs are encoded in a tag, you can specify this
+by setting the option --extract-umi-method=tag and set the tag name
+with the --umi-tag option. For example, if your UMIs are encoded in
+the 'UM' tag, provide the following options:
+``--extract-umi-method=tag`` ``--umi-tag=UM``
+
+Finally, if you have used umis to extract the UMI +/- cell barcode,
+you can specify ``--extract-umi-method=umis``
+
+The start position of a read is considered to be the start of its alignment
+minus any soft clipped bases. A read aligned at position 500 with
+cigar 2S98M will be assumed to start at position 498.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ = (2* umi B counts) - 1. Each
+ network is a read group.
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ log
+
+
+
+
diff -r bc082a79d655 -r cf25b50eff0a test-data/chr19_gene_tags.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chr19_gene_tags.sam Mon Sep 13 14:50:56 2021 +0000
@@ -0,0 +1,1492 @@
+@HD VN:1.4 SO:queryname
+@SQ SN:chr1 LN:248956422
+@SQ SN:chr2 LN:242193529
+@SQ SN:chr3 LN:198295559
+@SQ SN:chr4 LN:190214555
+@SQ SN:chr5 LN:181538259
+@SQ SN:chr6 LN:170805979
+@SQ SN:chr7 LN:159345973
+@SQ SN:chr8 LN:145138636
+@SQ SN:chr9 LN:138394717
+@SQ SN:chr10 LN:133797422
+@SQ SN:chr11 LN:135086622
+@SQ SN:chr12 LN:133275309
+@SQ SN:chr13 LN:114364328
+@SQ SN:chr14 LN:107043718
+@SQ SN:chr15 LN:101991189
+@SQ SN:chr16 LN:90338345
+@SQ SN:chr17 LN:83257441
+@SQ SN:chr18 LN:80373285
+@SQ SN:chr19 LN:58617616
+@SQ SN:chr20 LN:64444167
+@SQ SN:chr21 LN:46709983
+@SQ SN:chr22 LN:50818468
+@SQ SN:chrX LN:156040895
+@SQ SN:chrY LN:57227415
+@SQ SN:chrM LN:16569
+@SQ SN:GL000008.2 LN:209709
+@SQ SN:GL000009.2 LN:201709
+@SQ SN:GL000194.1 LN:191469
+@SQ SN:GL000195.1 LN:182896
+@SQ SN:GL000205.2 LN:185591
+@SQ SN:GL000208.1 LN:92689
+@SQ SN:GL000213.1 LN:164239
+@SQ SN:GL000214.1 LN:137718
+@SQ SN:GL000216.2 LN:176608
+@SQ SN:GL000218.1 LN:161147
+@SQ SN:GL000219.1 LN:179198
+@SQ SN:GL000220.1 LN:161802
+@SQ SN:GL000221.1 LN:155397
+@SQ SN:GL000224.1 LN:179693
+@SQ SN:GL000225.1 LN:211173
+@SQ SN:GL000226.1 LN:15008
+@SQ SN:KI270302.1 LN:2274
+@SQ SN:KI270303.1 LN:1942
+@SQ SN:KI270304.1 LN:2165
+@SQ SN:KI270305.1 LN:1472
+@SQ SN:KI270310.1 LN:1201
+@SQ SN:KI270311.1 LN:12399
+@SQ SN:KI270312.1 LN:998
+@SQ SN:KI270315.1 LN:2276
+@SQ SN:KI270316.1 LN:1444
+@SQ SN:KI270317.1 LN:37690
+@SQ SN:KI270320.1 LN:4416
+@SQ SN:KI270322.1 LN:21476
+@SQ SN:KI270329.1 LN:1040
+@SQ SN:KI270330.1 LN:1652
+@SQ SN:KI270333.1 LN:2699
+@SQ SN:KI270334.1 LN:1368
+@SQ SN:KI270335.1 LN:1048
+@SQ SN:KI270336.1 LN:1026
+@SQ SN:KI270337.1 LN:1121
+@SQ SN:KI270338.1 LN:1428
+@SQ SN:KI270340.1 LN:1428
+@SQ SN:KI270362.1 LN:3530
+@SQ SN:KI270363.1 LN:1803
+@SQ SN:KI270364.1 LN:2855
+@SQ SN:KI270366.1 LN:8320
+@SQ SN:KI270371.1 LN:2805
+@SQ SN:KI270372.1 LN:1650
+@SQ SN:KI270373.1 LN:1451
+@SQ SN:KI270374.1 LN:2656
+@SQ SN:KI270375.1 LN:2378
+@SQ SN:KI270376.1 LN:1136
+@SQ SN:KI270378.1 LN:1048
+@SQ SN:KI270379.1 LN:1045
+@SQ SN:KI270381.1 LN:1930
+@SQ SN:KI270382.1 LN:4215
+@SQ SN:KI270383.1 LN:1750
+@SQ SN:KI270384.1 LN:1658
+@SQ SN:KI270385.1 LN:990
+@SQ SN:KI270386.1 LN:1788
+@SQ SN:KI270387.1 LN:1537
+@SQ SN:KI270388.1 LN:1216
+@SQ SN:KI270389.1 LN:1298
+@SQ SN:KI270390.1 LN:2387
+@SQ SN:KI270391.1 LN:1484
+@SQ SN:KI270392.1 LN:971
+@SQ SN:KI270393.1 LN:1308
+@SQ SN:KI270394.1 LN:970
+@SQ SN:KI270395.1 LN:1143
+@SQ SN:KI270396.1 LN:1880
+@SQ SN:KI270411.1 LN:2646
+@SQ SN:KI270412.1 LN:1179
+@SQ SN:KI270414.1 LN:2489
+@SQ SN:KI270417.1 LN:2043
+@SQ SN:KI270418.1 LN:2145
+@SQ SN:KI270419.1 LN:1029
+@SQ SN:KI270420.1 LN:2321
+@SQ SN:KI270422.1 LN:1445
+@SQ SN:KI270423.1 LN:981
+@SQ SN:KI270424.1 LN:2140
+@SQ SN:KI270425.1 LN:1884
+@SQ SN:KI270429.1 LN:1361
+@SQ SN:KI270435.1 LN:92983
+@SQ SN:KI270438.1 LN:112505
+@SQ SN:KI270442.1 LN:392061
+@SQ SN:KI270448.1 LN:7992
+@SQ SN:KI270465.1 LN:1774
+@SQ SN:KI270466.1 LN:1233
+@SQ SN:KI270467.1 LN:3920
+@SQ SN:KI270468.1 LN:4055
+@SQ SN:KI270507.1 LN:5353
+@SQ SN:KI270508.1 LN:1951
+@SQ SN:KI270509.1 LN:2318
+@SQ SN:KI270510.1 LN:2415
+@SQ SN:KI270511.1 LN:8127
+@SQ SN:KI270512.1 LN:22689
+@SQ SN:KI270515.1 LN:6361
+@SQ SN:KI270516.1 LN:1300
+@SQ SN:KI270517.1 LN:3253
+@SQ SN:KI270518.1 LN:2186
+@SQ SN:KI270519.1 LN:138126
+@SQ SN:KI270521.1 LN:7642
+@SQ SN:KI270522.1 LN:5674
+@SQ SN:KI270528.1 LN:2983
+@SQ SN:KI270529.1 LN:1899
+@SQ SN:KI270530.1 LN:2168
+@SQ SN:KI270538.1 LN:91309
+@SQ SN:KI270539.1 LN:993
+@SQ SN:KI270544.1 LN:1202
+@SQ SN:KI270548.1 LN:1599
+@SQ SN:KI270579.1 LN:31033
+@SQ SN:KI270580.1 LN:1553
+@SQ SN:KI270581.1 LN:7046
+@SQ SN:KI270582.1 LN:6504
+@SQ SN:KI270583.1 LN:1400
+@SQ SN:KI270584.1 LN:4513
+@SQ SN:KI270587.1 LN:2969
+@SQ SN:KI270588.1 LN:6158
+@SQ SN:KI270589.1 LN:44474
+@SQ SN:KI270590.1 LN:4685
+@SQ SN:KI270591.1 LN:5796
+@SQ SN:KI270593.1 LN:3041
+@SQ SN:KI270706.1 LN:175055
+@SQ SN:KI270707.1 LN:32032
+@SQ SN:KI270708.1 LN:127682
+@SQ SN:KI270709.1 LN:66860
+@SQ SN:KI270710.1 LN:40176
+@SQ SN:KI270711.1 LN:42210
+@SQ SN:KI270712.1 LN:176043
+@SQ SN:KI270713.1 LN:40745
+@SQ SN:KI270714.1 LN:41717
+@SQ SN:KI270715.1 LN:161471
+@SQ SN:KI270716.1 LN:153799
+@SQ SN:KI270717.1 LN:40062
+@SQ SN:KI270718.1 LN:38054
+@SQ SN:KI270719.1 LN:176845
+@SQ SN:KI270720.1 LN:39050
+@SQ SN:KI270721.1 LN:100316
+@SQ SN:KI270722.1 LN:194050
+@SQ SN:KI270723.1 LN:38115
+@SQ SN:KI270724.1 LN:39555
+@SQ SN:KI270725.1 LN:172810
+@SQ SN:KI270726.1 LN:43739
+@SQ SN:KI270727.1 LN:448248
+@SQ SN:KI270728.1 LN:1872759
+@SQ SN:KI270729.1 LN:280839
+@SQ SN:KI270730.1 LN:112551
+@SQ SN:KI270731.1 LN:150754
+@SQ SN:KI270732.1 LN:41543
+@SQ SN:KI270733.1 LN:179772
+@SQ SN:KI270734.1 LN:165050
+@SQ SN:KI270735.1 LN:42811
+@SQ SN:KI270736.1 LN:181920
+@SQ SN:KI270737.1 LN:103838
+@SQ SN:KI270738.1 LN:99375
+@SQ SN:KI270739.1 LN:73985
+@SQ SN:KI270740.1 LN:37240
+@SQ SN:KI270741.1 LN:157432
+@SQ SN:KI270742.1 LN:186739
+@SQ SN:KI270743.1 LN:210658
+@SQ SN:KI270744.1 LN:168472
+@SQ SN:KI270745.1 LN:41891
+@SQ SN:KI270746.1 LN:66486
+@SQ SN:KI270747.1 LN:198735
+@SQ SN:KI270748.1 LN:93321
+@SQ SN:KI270749.1 LN:158759
+@SQ SN:KI270750.1 LN:148850
+@SQ SN:KI270751.1 LN:150742
+@SQ SN:KI270752.1 LN:27745
+@SQ SN:KI270753.1 LN:62944
+@SQ SN:KI270754.1 LN:40191
+@SQ SN:KI270755.1 LN:36723
+@SQ SN:KI270756.1 LN:79590
+@SQ SN:KI270757.1 LN:71251
+@SQ SN:ERCC-00002 LN:1061
+@SQ SN:ERCC-00003 LN:1023
+@SQ SN:ERCC-00004 LN:523
+@SQ SN:ERCC-00009 LN:984
+@SQ SN:ERCC-00012 LN:994
+@SQ SN:ERCC-00013 LN:808
+@SQ SN:ERCC-00014 LN:1957
+@SQ SN:ERCC-00016 LN:844
+@SQ SN:ERCC-00017 LN:1136
+@SQ SN:ERCC-00019 LN:644
+@SQ SN:ERCC-00022 LN:751
+@SQ SN:ERCC-00024 LN:536
+@SQ SN:ERCC-00025 LN:1994
+@SQ SN:ERCC-00028 LN:1130
+@SQ SN:ERCC-00031 LN:1138
+@SQ SN:ERCC-00033 LN:2022
+@SQ SN:ERCC-00034 LN:1019
+@SQ SN:ERCC-00035 LN:1130
+@SQ SN:ERCC-00039 LN:740
+@SQ SN:ERCC-00040 LN:744
+@SQ SN:ERCC-00041 LN:1122
+@SQ SN:ERCC-00042 LN:1023
+@SQ SN:ERCC-00043 LN:1023
+@SQ SN:ERCC-00044 LN:1156
+@SQ SN:ERCC-00046 LN:522
+@SQ SN:ERCC-00048 LN:992
+@SQ SN:ERCC-00051 LN:274
+@SQ SN:ERCC-00053 LN:1023
+@SQ SN:ERCC-00054 LN:274
+@SQ SN:ERCC-00057 LN:1021
+@SQ SN:ERCC-00058 LN:1136
+@SQ SN:ERCC-00059 LN:525
+@SQ SN:ERCC-00060 LN:523
+@SQ SN:ERCC-00061 LN:1136
+@SQ SN:ERCC-00062 LN:1023
+@SQ SN:ERCC-00067 LN:644
+@SQ SN:ERCC-00069 LN:1137
+@SQ SN:ERCC-00071 LN:642
+@SQ SN:ERCC-00073 LN:603
+@SQ SN:ERCC-00074 LN:522
+@SQ SN:ERCC-00075 LN:1023
+@SQ SN:ERCC-00076 LN:642
+@SQ SN:ERCC-00077 LN:273
+@SQ SN:ERCC-00078 LN:993
+@SQ SN:ERCC-00079 LN:644
+@SQ SN:ERCC-00081 LN:534
+@SQ SN:ERCC-00083 LN:1022
+@SQ SN:ERCC-00084 LN:994
+@SQ SN:ERCC-00085 LN:844
+@SQ SN:ERCC-00086 LN:1020
+@SQ SN:ERCC-00092 LN:1124
+@SQ SN:ERCC-00095 LN:521
+@SQ SN:ERCC-00096 LN:1107
+@SQ SN:ERCC-00097 LN:523
+@SQ SN:ERCC-00098 LN:1143
+@SQ SN:ERCC-00099 LN:1350
+@SQ SN:ERCC-00104 LN:2022
+@SQ SN:ERCC-00108 LN:1022
+@SQ SN:ERCC-00109 LN:536
+@SQ SN:ERCC-00111 LN:994
+@SQ SN:ERCC-00112 LN:1136
+@SQ SN:ERCC-00113 LN:840
+@SQ SN:ERCC-00116 LN:1991
+@SQ SN:ERCC-00117 LN:1136
+@SQ SN:ERCC-00120 LN:536
+@SQ SN:ERCC-00123 LN:1022
+@SQ SN:ERCC-00126 LN:1118
+@SQ SN:ERCC-00130 LN:1059
+@SQ SN:ERCC-00131 LN:771
+@SQ SN:ERCC-00134 LN:274
+@SQ SN:ERCC-00136 LN:1033
+@SQ SN:ERCC-00137 LN:537
+@SQ SN:ERCC-00138 LN:1024
+@SQ SN:ERCC-00142 LN:493
+@SQ SN:ERCC-00143 LN:784
+@SQ SN:ERCC-00144 LN:538
+@SQ SN:ERCC-00145 LN:1042
+@SQ SN:ERCC-00147 LN:1023
+@SQ SN:ERCC-00148 LN:494
+@SQ SN:ERCC-00150 LN:743
+@SQ SN:ERCC-00154 LN:537
+@SQ SN:ERCC-00156 LN:494
+@SQ SN:ERCC-00157 LN:1019
+@SQ SN:ERCC-00158 LN:1027
+@SQ SN:ERCC-00160 LN:743
+@SQ SN:ERCC-00162 LN:523
+@SQ SN:ERCC-00163 LN:543
+@SQ SN:ERCC-00164 LN:1022
+@SQ SN:ERCC-00165 LN:872
+@SQ SN:ERCC-00168 LN:1024
+@SQ SN:ERCC-00170 LN:1023
+@SQ SN:ERCC-00171 LN:505
+@PG ID:STAR PN:STAR VN:STAR_2.5.2b CL:STAR --runThreadN 8 --genomeDir /data/home/mvanloenhout/Gencode_v25/Star_overhang69/ --readFilesIn /data/home/mvanloenhout/WTF2/scRNA_Analysis/Processed_data/HSC2-I02_S5_R2_001.fastq.gz --readFilesCommand gunzip -c --outFileNamePrefix /data/home/mvanloenhout/WTF2/scRNA_Analysis/Aligned_files/HSC2-I02_S5_R2_001 --outSAMtype BAM SortedByCoordinate --outSAMmultNmax 1 --outFilterType BySJout --outFilterMultimapNmax 20
+@CO user command line: STAR --runThreadN 8 --genomeDir /data/home/mvanloenhout/Gencode_v25/Star_overhang69/ --outSAMtype BAM SortedByCoordinate --outSAMmultNmax 1 --outFilterMultimapNmax 20 --outFilterType BySJout --outFileNamePrefix /data/home/mvanloenhout/WTF2/scRNA_Analysis/Aligned_files/HSC2-I02_S5_R2_001 --readFilesCommand gunzip -c --readFilesIn /data/home/mvanloenhout/WTF2/scRNA_Analysis/Processed_data/HSC2-I02_S5_R2_001.fastq.gz
+NS500668:144:H5FCJBGXY:1:11102:10920:18759:CELL_TTCACG:UMI_TTGGGA:SAMPLE_CGATGT:UID_CGATGTTTCACGTTGGGA 0 chr19 812244 255 51M9S * 0 0 CGCTGTGGACTCTGTAGAGGCAGGTTGGCCAGTCTGTACCTGGACTTCGAANNNNNNNNN AAAA/A//EE/AA/EEEA//EE////' mode='w' encoding='UTF-8'>
-# stdin : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_5.dat' mode='r' encoding='UTF-8'>
-# stdlog : <_io.TextIOWrapper name='/tmp/tmpibtvD6/files/000/dataset_8.dat' mode='a' encoding='UTF-8'>
-# stdout : <_io.TextIOWrapper name='' mode='w' encoding='UTF-8'>
+# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>
+# stdin : <_io.TextIOWrapper name='input_read1.gz' encoding='ascii'>
+# stdlog : <_io.TextIOWrapper name='/tmp/tmpcx2d26we/files/0/0/8/dataset_008b1843-bfa2-44fb-9d3c-52695bd9ce74.dat' mode='a' encoding='UTF-8'>
+# stdout : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>
# subset_reads : 0
# timeit_file : None
# timeit_header : None
# timeit_name : all
+# tmpdir : None
# whitelist_tsv : None
-2018-02-25 10:50:16,016 INFO Starting barcode extraction
-2018-02-25 10:50:16,017 INFO Parsed 0 reads
-2018-02-25 10:50:16,019 INFO Starting - whitelist determination
-2018-02-25 10:50:17,208 INFO Finished - whitelist determination
-2018-02-25 10:50:17,208 INFO Starting - finding putative error cell barcodes
-2018-02-25 10:50:17,208 INFO Finished - finding putative error cell barcodes
-2018-02-25 10:50:17,208 INFO Writing out whitelist
-2018-02-25 10:50:17,208 INFO Parsed 100 reads
-2018-02-25 10:50:17,208 INFO 100 reads matched the barcode pattern
-2018-02-25 10:50:17,208 INFO Found 23 unique cell barcodes
-# job finished in 1 seconds at Sun Feb 25 10:50:17 2018 -- 2.35 0.08 0.00 0.00 -- e78e4e5b-e99e-426a-8a92-c8b3beeadf18
+2021-07-13 15:21:12,587 INFO Starting barcode extraction
+2021-07-13 15:21:12,588 INFO Parsed 0 reads
+2021-07-13 15:21:12,590 INFO Starting - whitelist determination
+2021-07-13 15:21:14,249 INFO Finished - whitelist determination
+2021-07-13 15:21:14,249 INFO Starting - finding putative error cell barcodes
+2021-07-13 15:21:14,249 INFO building bktree
+2021-07-13 15:21:14,249 INFO done building bktree
+2021-07-13 15:21:14,249 INFO Finished - finding putative error cell barcodes
+2021-07-13 15:21:14,249 INFO Top 1 cell barcodes passed the selected threshold
+2021-07-13 15:21:14,249 INFO Writing out whitelist
+2021-07-13 15:21:14,249 INFO Parsed 100 reads
+2021-07-13 15:21:14,249 INFO 100 reads matched the barcode pattern
+2021-07-13 15:21:14,249 INFO Found 23 unique cell barcodes
+2021-07-13 15:21:14,249 INFO Found 15 total reads matching the selected cell barcodes
+2021-07-13 15:21:14,249 INFO Found 85 total reads which can be error corrected to the selected cell barcodes
+# job finished in 1 seconds at Tue Jul 13 15:21:14 2021 -- 7.19 0.62 0.08 0.02 -- ba3841c0-b2d5-4188-88ca-4ee241163293
diff -r bc082a79d655 -r cf25b50eff0a umi-tools_group.xml
--- a/umi-tools_group.xml Wed Feb 10 19:31:44 2021 +0000
+++ b/umi-tools_group.xml Mon Sep 13 14:50:56 2021 +0000
@@ -1,115 +1,126 @@
-
+Extract UMI from fastq files
+ macros.xml
- samtools
+ samtools 0:
- --gene-tag '$gene_tag'
- #end if
#if $group_output:
--group-out '$group_out'
#end if
- #if $input.is_of_type("sam"):
- --in-sam
- #end if
--output-bam
- -I '$input_file' -S grouped.bam &&
- samtools sort grouped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
+ @GROUPDEDUP_OPTIONS@
+ @BARCODE_OPTIONS@
+ @UMI_GROUPING_OPTIONS@
+ @SAMBAM_OPTIONS@
+ @FULLSC_OPTIONS@
+ -I '$input_file' -S grouped.bam
+ @ADVANCED_OPTIONS@
+ @LOG@
+ ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy
+ ## compares the generated file with the one in test-data
+ ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files`
+ ## may be dropped in the future
+ --no-sort-output
+ && samtools sort --no-PG grouped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
- group_out
+ group_output
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
+
+
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
",
-replacing with e.g ":".
-
-Alternatively, if your UMIs are encoded in a tag, you can specify this
-by setting the option --extract-umi-method=tag and set the tag name
-with the --umi-tag option. For example, if your UMIs are encoded in
-the 'UM' tag, provide the following options:
-"--extract-umi-method=tag --umi-tag=UM"
-
-By default, reads are considered identical if they have the same start
-coordinate, are on the same strand, and have the same UMI. Optionally,
-splicing status can be considered (see below).
-
-The start postion of a read is considered to be the start of its alignment
-minus any soft clipped bases. A read aligned at position 500 with
-cigar 2S98M will be assumed to start at postion 498.
-
-Methods
--------
-
-group can be run with multiple methods to identify group of reads with
-the same (or similar) UMI(s). All methods start by identifying the
-reads with the same mapping position.
-
-The simpliest method, "unique", groups reads with the exact same
-UMI. The network-based methods, "cluster", "adjacency" and
-"directional", build networks where nodes are UMIs and edges connect
-UMIs with an edit distance <= threshold (usually 1). The groups of
-reads are then defined from the network in a method-specific manner.
-
-Note that the "percentile" method used with the dedup command is not
-available with group. This is because this method does not group
-similar UMIs as per the network methods. Instead it applies a
-threshold for inclusion of the UMI in the output and excluded UMIs are
-not assigned to a "true" UMI.
-
- "unique"
- Reads group share the exact same UMI
-
- "cluster"
- Identify clusters of connected UMIs (based on hamming distance
- threshold). Each network is a read group
-
- "directional"
- Identify clusters of connected UMIs (based on hamming distance
- threshold) and umi A counts >= (2* umi B counts) - 1. Each
- network is a read group.
+their genomic coordinate and UMI.
The group command can be used to create two types of outfile: a tagged
BAM or a flatfile describing the read groups
@@ -227,138 +182,9 @@
- unique_id
The unique id for the group
-
-Options
--------
-
---extract-umi-method (choice)
- How are the UMIs encoded in the read?
-
- Options are:
-
- - "read_id" (default)
- UMIs contained at the end of the read separated as
- specified with --umi-separator option
-
- - "tag"
- UMIs contained in a tag, see --umi-tag option
-
---umi-separator (string)
- Separator between read id and UMI. See --extract-umi-method above
-
---umi-tag (string)
- Tag which contains UMI. See --extract-umi-method above
-
---method (choice, string)
- Method used to identify PCR duplicates within reads. All methods
- start by identifying the reads with the same mapping position
-
- Options are:
-
- - "unique"
- Reads group share the exact same UMI
-
- - "cluster"
- Identify clusters of connected UMIs (based on edit distance
- threshold). Each network is a read group
-
- - "directional"
- Identify clusters of connected UMIs (based on edit distance
- threshold) and umi A counts >= (2* umi B counts) - 1. Each
- network is a read group.
-
---edit-distance-threshold (int)
- For the adjacency and cluster methods the threshold for the
- edit distance to connect two UMIs in the network can be
- increased. The default value of 1 works best unless the UMI is
- very long (>14bp)
-
---paired
- BAM is paired end - output both read pairs. This will also
- force the use of the template length to determine reads with
- the same mapping coordinates.
-
---spliced-is-unique
- Causes two reads that start in the same position on the same
- strand and having the same UMI to be considered unique if one is
- spliced and the other is not. (Uses the 'N' cigar operation to test
- for splicing)
-
---soft-clip-threshold (int)
- Mappers that soft clip, will sometimes do so rather than mapping a
- spliced read if there is only a small overhang over the exon
- junction. By setting this option, you can treat reads with at least
- this many bases soft-clipped at the 3' end as spliced.
+@BARCODE_HELP@
---multimapping-detection-method (string, choice)
- If the sam/bam contains tags to identify multimapping reads, you can
- specify for use when selecting the best read at a given loci.
- Supported tags are "NH", "X0" and "XT". If not specified, the read
- with the highest mapping quality will be selected
-
---read-length
- Use the read length as as a criteria when deduping, for e.g sRNA-Seq
-
---whole-contig
- Consider all alignments to a single contig together. This is useful if
- you have aligned to a transcriptome multi-fasta
-
---subset (float, [0-1])
- Only consider a fraction of the reads, chosen at random. This is useful
- for doing saturation analyses.
-
---chrom
- Only consider a single chromosome. This is useful for debugging purposes
-
---per-contig (string)
- Deduplicate per contig (field 3 in BAM; RNAME).
- All reads with the same contig will be
- considered to have the same alignment position. This is useful
- if your library prep generates PCR duplicates with non identical
- alignment positions such as CEL-Seq. In this case, you would
- align to a reference transcriptome with one transcript per gene
-
---per-gene (string)
- Deduplicate per gene. As above except with this option you can
- align to a reference transcriptome with more than one transcript
- per gene. You need to also provide --gene-transcript-map option.
- This will also add a metacontig ('MC') tag to the reads if used
- in conjunction with --output-bam
-
---gene-transcript-map (string)
- File mapping genes to transripts (tab separated), e.g:
-
- gene1 transcript1
- gene1 transcript2
- gene2 transcript3
-
---gene-tag (string)
- Deduplicate per gene. As per --per-gene except here the gene
- information is encoded in the bam read tag specified so you do
- not need to supply --gene-transcript-map
-
---group-out (string, filename)
- Output a flatfile describing the read groups
-
---output-bam (string, filename)
- Output a tagged bam file to stdout or -S
-
--i, --in-sam/-o, --out-sam
- By default, inputs are assumed to be in BAM format and output are output
- in BAM format. Use these options to specify the use of SAM format for
- inputs or outputs.
-
--I (string, filename) input file name
- The input file must be sorted and indexed.
-
--S (string, filename) output file name
-
--L (string, filename) log file name
-
-Usage
------
- umi_tools group -I infile.bam --output-bam -S grouped.bam -L group.log --
-
+@UMI_GROUPING_HELP@
]]>