# HG changeset patch
# User devteam
# Date 1380121722 14400
# Node ID e942fd3a76a51ebd04974f84cfb36febad86f86e
Uploaded tool tarball.
diff -r 000000000000 -r e942fd3a76a5 fastx_collapser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_collapser.xml Wed Sep 25 11:08:42 2013 -0400
@@ -0,0 +1,90 @@
+
+ sequences
+
+ fastx_toolkit
+
+ zcat -f '$input' | fastx_collapser -v -o '$output'
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool collapses each group of identical sequences in a FASTA file into a single sequence, and records how many times that sequence occurred.
+
+--------
+
+**Example**
+
+Example Input File (Sequence "ATAT" appears multiple times)::
+
+ >CSHL_2_FC0042AGLLOO_1_1_605_414
+ TGCG
+ >CSHL_2_FC0042AGLLOO_1_1_537_759
+ ATAT
+ >CSHL_2_FC0042AGLLOO_1_1_774_520
+ TGGC
+ >CSHL_2_FC0042AGLLOO_1_1_742_502
+ ATAT
+ >CSHL_2_FC0042AGLLOO_1_1_781_514
+ TGAG
+ >CSHL_2_FC0042AGLLOO_1_1_757_487
+ TTCA
+ >CSHL_2_FC0042AGLLOO_1_1_903_769
+ ATAT
+ >CSHL_2_FC0042AGLLOO_1_1_724_499
+ ATAT
+
+Example Output file::
+
+ >1-1
+ TGCG
+ >2-4
+ ATAT
+ >3-1
+ TGGC
+ >4-1
+ TGAG
+ >5-1
+ TTCA
+
+.. class:: infomark
+
+Original Sequence Names / Lane descriptions (e.g. "CSHL_2_FC0042AGLLOO_1_1_742_502") are discarded.
+
+Each output sequence name is composed of two numbers separated by a hyphen: the first is the sequence's number in the output file, and the second is its multiplicity (the number of times the sequence appeared in the input).
+
+The following output::
+
+ >2-4
+ ATAT
+
+means that the sequence "ATAT" is the second sequence in the output file, and that it appeared 4 times in the input FASTA file.
+
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+
+
+
diff -r 000000000000 -r e942fd3a76a5 test-data/fasta_collapser1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_collapser1.fasta Wed Sep 25 11:08:42 2013 -0400
@@ -0,0 +1,84 @@
+>1
+TGTATTTACAATGACTAGAAA
+>2
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>3
+AGTACAAGGACATGC
+>4
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>5
+AGTACAAGGACATGC
+>6
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>7
+AGTACAAGGACATGC
+>8
+AGTACAAGGACATGC
+>9
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>10
+AGTACAAGGACATGC
+>11
+AGTACAAGGACATGC
+>12
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>13
+CGATTGCCGAAGTCTACCA
+>14
+AGTACAAGGACATGC
+>15
+CCTTGTAGTGGATTCTGATGA
+>16
+AGTACAAGGACATGC
+>17
+AGTACAAGGACATGC
+>18
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>19
+AGTACAAGGACATGC
+>20
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>21
+AGTACAAGGACATGC
+>22
+AGTACAAGGACATGC
+>23
+CTGCTGCGATCGGTGTGC
+>24
+AGTACAAGGACATGC
+>25
+ACCATTCGAGCATAC
+>26
+AGTACAAGGACATGC
+>27
+TCAAATTCTAGATTTTTACGG
+>28
+AGTACAAGGACATGC
+>29
+TGATTTCCAGAGCCAAT
+>30
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>31
+TTACCTCACGATATTGTAATA
+>32
+ATGACTTCATCGTCCACCCTTTAGAACT
+>33
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>34
+TTCAACGCCGCCGTGAAC
+>35
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>36
+CTGCTGCGATCGGTGTGC
+>37
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>38
+TTCAACGCCGCCGTGAAC
+>39
+TTCAACGCCGCCGTGAAC
+>40
+CTGCTGCGATCGGTGTGC
+>41
+TTCAACGCCGCCGTGAAC
+>42
+TTCAACGCCGCCGTGAAC
diff -r 000000000000 -r e942fd3a76a5 test-data/fasta_collapser1.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_collapser1.out Wed Sep 25 11:08:42 2013 -0400
@@ -0,0 +1,24 @@
+>1-15
+AGTACAAGGACATGC
+>2-11
+ATTGCTGCTCGGATGGTCCGGCTGTGCACAC
+>3-5
+TTCAACGCCGCCGTGAAC
+>4-3
+CTGCTGCGATCGGTGTGC
+>5-1
+TCAAATTCTAGATTTTTACGG
+>6-1
+ACCATTCGAGCATAC
+>7-1
+TGATTTCCAGAGCCAAT
+>8-1
+TTACCTCACGATATTGTAATA
+>9-1
+TGTATTTACAATGACTAGAAA
+>10-1
+CCTTGTAGTGGATTCTGATGA
+>11-1
+CGATTGCCGAAGTCTACCA
+>12-1
+ATGACTTCATCGTCCACCCTTTAGAACT
\ No newline at end of file
diff -r 000000000000 -r e942fd3a76a5 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Sep 25 11:08:42 2013 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+