Mercurial > repos > lparsons > ea_utils
changeset 0:e6f1c31279db draft
Initial version with fastq-join
author | Lance Parsons <lparsons@princeton.edu> |
---|---|
date | Thu, 20 Sep 2012 18:59:38 -0400 |
parents | |
children | cf4b5125a835 |
files | fastq-join.xml test-data/test_read1.fastq test-data/test_read3.fastq test-data/testout.join.fastq test-data/testout.un1.fastq test-data/testout.un2.fastq tool_dependencies.xml |
diffstat | 7 files changed, 207 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq-join.xml Thu Sep 20 18:59:38 2012 -0400
@@ -0,0 +1,65 @@
+<tool id="fastq_join" name="fastq-join" version="0.1">
+    <description> - Joins two paired-end reads on the overlapping ends</description>
+    <requirements>
+        <requirement type="package" version="1.1.2-469">ea-utils</requirement>
+    </requirements>
+    <command>
+        fastq-join
+        -v '$splitChar'
+        -p $pctMaxDiff
+        -m $minOverlap
+        #if $stitchLengthReport:
+            -r $outputStitchLengthReport
+        #end if
+        $read1
+        $read2
+        -o $outputUnmatched1 -o $outputUnmatched2 -o $outputJoined
+    </command>
+    <inputs>
+        <param format="fastq,fastqillumina,fastqsanger,fastqsolexa" name="read1" type="data" label="Read 1 Fastq" />
+        <param format="fastq,fastqillumina,fastqsanger,fastqsolexa" name="read2" type="data" label="Read 2 Fastq" />
+        <param name="splitChar" type="text" value=" " label="Split read ids on this character" help="Default is space ' ' for Illumina reads" />
+        <param name="pctMaxDiff" type="float" value="8" min="0" max="100" label="Maximum percentage difference between matching segments" />
+        <param name="minOverlap" type="integer" value="6" min="1" label="Minimum length of matching segments" />
+        <param name="stitchLengthReport" type="boolean" checked="false" label="Output verbose stitch length report" />
+    </inputs>
+    <!-- Joined and unmatched1 outputs take their datatype from read1, unmatched2 from read2; the stitch length report is only produced when stitchLengthReport is set. -->
+    <outputs>
+        <data format="input" format_source="read1" name="outputJoined" label="${tool.name} on ${on_string} (joined)"/>
+        <data format="input" format_source="read1" name="outputUnmatched1" label="${tool.name} on ${on_string} (unmatched1)"/>
+        <data format="input" format_source="read2" name="outputUnmatched2" label="${tool.name} on ${on_string} (unmatched2)"/>
+        <data format="tabular" name="outputStitchLengthReport" label="${tool.name} on ${on_string} (stitch length report)">
+            <filter>stitchLengthReport</filter>
+        </data>
+    </outputs>
+    <!-- Any exit code of 1 or greater from the command is reported as a fatal error. -->
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Unknown error occurred" />
+    </stdio>
+    <!-- Functional test on the bundled paired reads; parameters not set here keep their declared defaults. -->
+    <tests>
+        <test>
+            <param name="read1" value="test_read1.fastq" />
+            <param name="read2" value="test_read3.fastq" />
+            <output name="outputJoined" file="testout.join.fastq" />
+            <output name="outputUnmatched1" file="testout.un1.fastq" />
+            <output name="outputUnmatched2" file="testout.un2.fastq" />
+        </test>
+    </tests>
+    <!-- User-facing help text; each paragraph describes one of the parameters declared in the inputs section. -->
+    <help>
+Overview
+--------
+fastq-join joins two paired-end reads on the overlapping ends.
+
+Split read ids character: Verifies that the read ids in the two files match up to this character. Use ' ' for Illumina reads.
+
+Maximum difference is the maximum allowed percentage of bases that differ in the matching region.
+
+Minimum overlap is the minimum number of bases that must overlap (with no more than the maximum difference) for reads to be joined.
+
+Verbose stitch length report is a report for each joined pair of reads showing how large the overlapping section was.
+
+This tool uses sqr(distance)/len as its anchored alignment quality algorithm. It's a good measure of anchored alignment quality, akin to squared-deviation for means. This tool uses the fastq-join program that is part of the ea-utils suite. See http://code.google.com/p/ea-utils/wiki/FastqJoin for details.
+    </help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_read1.fastq Thu Sep 20 18:59:38 2012 -0400 @@ -0,0 +1,40 @@ +@JLK5VL1:222:D1888ACXX:1:1101:1656:2143 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +?+:BDDDAD8BDD?@4+<?EECD?BBFIDFD<E:*?DI@?DDEDDCEECEIECDCCAD>5;6?;;@A;AABA>A@>AAAD;A>A>5=5>>>BBAA###### +@JLK5VL1:222:D1888ACXX:1:1101:1613:2167 1:N:0: +GTGATAGAGATACTGAGCACAGAGCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@?BDFFFFHBHHHJJJGFEHHEIHEGHJJJJDI@DHIIHGGGGGEHIHGIIIIJJJGIB?AACBEDDFCDCCCCCEDC>@CD>CC@BA>CDEECD=BDDB> +@JLK5VL1:222:D1888ACXX:1:1101:1927:2121 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@B@DFFFFHHHHHJJIHJIJJJJJJJJJJJJIJHJJIJIJIHHGIJIIJGIIJJJIIII?EEEDFFFFEEEEEDECDDDEDDDDCCBACDEDECC@BB9<A +@JLK5VL1:222:D1888ACXX:1:1101:2000:2166 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACGTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +B?@DFFFFHHHHHGIIJIIJJJJJJIJJJJJIIHIJJIJJIIGGJJGHGHIGGIIJHIGAEEEDFFEEEEFEEDDDDDCC>CCCCCACDDEEDCA9>BBDC +@JLK5VL1:222:D1888ACXX:1:1101:1763:2172 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAAGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@?@DDDBDHDFDHBFB3CCGFFHDHHIIIIIIG@<EFHA?DDFFDHHHHCHIIIIGGGH-;@5=CHHEADDEBEECCCCDCA@CCCB@CC@BC(;';BBBC +@JLK5VL1:222:D1888ACXX:1:1101:1832:2198 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGCATATACCGCCAAT ++ +B=BDFFFFHGHHHJJIIJHIJIIJJJJJJJJIJIJJJIIJJHIIIJJJJIIIIJJIGJJ??CBBFFFFEDEEEEECDDCDDDCDCC@BDCDDEEC><>BD? 
+@JLK5VL1:222:D1888ACXX:1:1101:2169:2161 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTCGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@?BFFFFFHHHHHJJJGIGIJIIJJJIGJJJJJJIIIIHGHIIGIJGIIIJJIJIIJJJEHDBDCDEEEEDDDDD@CCCDAA@CA@A:@CADD>C9<@B9C +@JLK5VL1:222:D1888ACXX:1:1101:2236:2214 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@=@DFDFFHAHDHGFEE@FFHGGEDEGHGIJ@GFGHAGFIIGBDGHIIJIGJJJJJIIIACEBDEFDFDEEEEDEDDCCDCD;CDCB@CF@DD@C?B<@DC +@JLK5VL1:222:D1888ACXX:1:1101:2090:2240 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@:?DDDDDF<FHHIIEEGBHGGDGGIIIGHIIIGGGHGEHFIHGIIIII<HHHIEGGIIHE=?BDECD@CDDCCCCCAAC@ACACCBCCDCDDDC###### +@JLK5VL1:222:D1888ACXX:1:1101:2952:2161 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@@@DFFFFHFHHGIJIJJJJJIJJJJIIIJJIIJJIJIEHIHIEHHIGJEHIIJIJDCGCE?BFFFDFEDDEEECACCCDDD@C>ABCCDCFFCA:9BBBC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_read3.fastq Thu Sep 20 18:59:38 2012 -0400 @@ -0,0 +1,40 @@ +@JLK5VL1:222:D1888ACXX:1:1101:1656:2143 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +BB8=4ADDHHHHACG<FHIGGIIIIICAEEC;CDF0@DEG3?BBACCGGICFD@@;@7;AED<<=CC@>(;(;@C@C:<:>>@>>;B>B(44?>C@B9<<8 +@JLK5VL1:222:D1888ACXX:1:1101:1613:2167 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +@?=DDDFFDFHHHIGGHHIGGFEGIJJFAFEGGGH@AFFHGIIIGEFHBF>@FCGAE@DD=D'9@DCC@;C@C@CDDCD>CCD@>>:@@C@CDDACA<28? +@JLK5VL1:222:D1888ACXX:1:1101:1927:2121 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +CCCFFFFFHHHHHJIIJJJJJJIJJJJIJJJJJJJIJJJJJJJJJJJJJJIHIJJIJEEHHF>BBDEFEDDDDDDDCDDDDCDDDEEEEDDDDDDDBDBDD +@JLK5VL1:222:D1888ACXX:1:1101:2000:2166 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +B@@DDFFFGHHHDIFHIJIIIJJJJIJJJJJIJJIIJIJIGIIJGIJJJJDFGGGII:DHFF<BDECDEDD@CCCDDCDDDCEDDDDC@C@CCDCD?@8<< +@JLK5VL1:222:D1888ACXX:1:1101:1763:2172 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +@;?DFDFFFDHHDIGACFHGGGGJJEH3@GDGCGHIEIIG?FDHCEGHEI9@C@C@@@9@=<',5;5;;;AC>3>C>CCCC(5(;>CDECCAC>@AB08AB +@JLK5VL1:222:D1888ACXX:1:1101:1832:2198 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATGCTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +C@@FFFFFHHHHHJJJJJIJIJJJJIIJJIJIIFIGIJGHIDGIIIGIJJIHGHIIJGIIHF<<@DDDDEEDDCCCDDDCCCDDDEDDEDDCCDDCBB?AB +@JLK5VL1:222:D1888ACXX:1:1101:2169:2161 3:N:0: +TTTGCCCTATTTATAGGACCCTAGAAAGAAAAAACCTCAAATAAAATAAAAGTGATTGGCGGGATATACTTGTGAACAATGTGTATTTACTTACCACCACC ++ +##################################################################################################### +@JLK5VL1:222:D1888ACXX:1:1101:2236:2214 3:N:0: 
+TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +@@@DDFFFHHHHFEEGIEHIJJGGFGIGIHCHJJJJJJIJIJGIGGIJIJGCDGEGG>EGHF=ABC@BDDCDDA@CCCCDCDE@CCD>CC@CDDDDBD?<< +@JLK5VL1:222:D1888ACXX:1:1101:2090:2240 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGATAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +CC@FFFFFGFHDDHH>@FHHCHIIGIIEGGHIJIIBEEHIGGGGJIJJJJGBFGIGGCAHEA,9?B>B@@C>@@ACACCCC@C@@CDCDC@>CCCDD@(8? +@JLK5VL1:222:D1888ACXX:1:1101:2952:2161 3:N:0: +TGGATCCTATTAATAGGACCCTAGAAAGAAAAAACGCCAATCACAATAAAGTTGATTGGCGGTATATACTTGTGAACAATGTGTATTCACTAACAACGTCA ++ +CCCFFFFFHHHGHJJJJJIJJJJJIJJJHIJJIJJIIGIIIJEIIJIIJIG@FHIJJJIJHF<A?CBCC>CCDCCDDDDDDCDCCDDEACA@CDDDDDABB
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testout.join.fastq Thu Sep 20 18:59:38 2012 -0400 @@ -0,0 +1,36 @@ +@JLK5VL1:222:D1888ACXX:1:1101:1656:2143 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +?+:BDDDAD8BDD?@4+<?EECD?BBFIDFD<E:*?DI@?DDEDDCEECEIECDCCADB@C>?;;@B>BABA@A@>AACDCA>A>5>@CC>BBDEA;7@;@@DFCIGGCCABB?3GED@0FDC;CEEACIIIIIGGIHF<GCAHHHHDDA4=8BB +@JLK5VL1:222:D1888ACXX:1:1101:1613:2167 +GTGATAGAGATACTGAGCACAGAGCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@?BDFFFFHBHHHJJJGFEHHEIHEGHJJJJDI@DHIIHGGGGGEHIHGIIIIJJJGIBCADDCEDDFCDCCDCCEDCDDCDCCC@BCCDDEEDDDDDEBGCF@>FBHFEGIIIGHFFA@HGGGEFAFJJIGEFGGIHHGGIHHHFDFFDDD=?@ +@JLK5VL1:222:D1888ACXX:1:1101:1927:2121 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@B@DFFFFHHHHHJJIHJIJJJJJJJJJJJJIJHJJIJIJIHHGIJIIJGIIJJJIIIIDEEEDFFFFEEEEEDEDDDDEDDDDDDEFEDEDEFHHEEJIJJIHIJJJJJJJJJJJJJJIJJJJJJJIJJJJIJJJJJJIIJHHHHHFFFFFCCC +@JLK5VL1:222:D1888ACXX:1:1101:2000:2166 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACGTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +B?@DFFFFHHHHHGIIJIIJJJJJJIJJJJJIIHIJJIJJIIGGJJGHGHIGGIIJHIGDEEEDFFEEEEFEEDDDDDDDCCCCDDEDDEEEDFFHDBIIGGGFDJJJJIGJIIGIJIJIIJJIJJJJJIJJJJIIIJIHFIDHHHGFFFDD@@B +@JLK5VL1:222:D1888ACXX:1:1101:1763:2172 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAAGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@?@DDDBDHDFDHBFB3CCGFFHDHHIIIIIIG@<EFHA?DDFFDHHHHCHIIIIGGGHA@@CACHHECDDEBEECCCCDCA@CCCB@CC@BC<=@;BBBC@C@9IEHGECHDF?GIIEIHGCGDG@3HEJJGGGGHFCAGIDHHDFFFDFD?;@ +@JLK5VL1:222:D1888ACXX:1:1101:1832:2198 
+GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGCATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +B=BDFFFFHGHHHJJIIJHIJIIJJJJJJJJIJIJJJIIJJHIIIJJJJIIIIJJIGJJCDDCCFFFFEEEEEEECDDDDDDDDEEDDDDDDEFHIIGJIIHGHIJJIGIIIGDIHGJIGIFIIJIJJIIJJJJIJIJJJJJHHHHHFFFFF@@C +@JLK5VL1:222:D1888ACXX:1:1101:2236:2214 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@=@DFDFFHAHDHGFEE@FFHGGEDEGHGIJ@GFGHAGFIIGBDGHIIJIGJJJJJIIIDDEDDEFDFDEEEEDEDDCCDCDDDDDDBCFBDDFHGE>GGEGDCGJIJIGGIGJIJIJJJJJJHCHIGIGFGGJJIHEIGEEFHHHHFFFDD@@@ +@JLK5VL1:222:D1888ACXX:1:1101:2090:2240 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTATCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@:?DDDDDF<FHHIIEEGBHGGDGGIIIGHIIIGGGHGEHFIHGIIIII<HHHIEGGIIHECCBDEDDDCDDCCCCCCACAACACCBCCDCDDDEHACGGIGFBGJJJJIJGGGGIHEEBIIJIHGGEIIGIIHCHHF@>HHDDHFGFFFFF@CC +@JLK5VL1:222:D1888ACXX:1:1101:2952:2161 +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTTGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAATCAACTTTATTGTGATTGGCGTTTTTTCTTTCTAGGGTCCTATTAATAGGATCCA ++ +@@@DFFFFHFHHGIJIJJJJJIJJJJIIIJJIIJJIJIEHIHIEHHIGJEHIIJIJDDGDEDCFFFDFEDDEEEDDDDDDDDDCCACCCDCFFFHJIJJJIHF@GIJIIJIIEJIIIGIIJJIJJIHJJJIJJJJJIJJJJJHGHHHFFFFFCCC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testout.un1.fastq Thu Sep 20 18:59:38 2012 -0400 @@ -0,0 +1,4 @@ +@JLK5VL1:222:D1888ACXX:1:1101:2169:2161 1:N:0: +GTGATAGAGATACTGAGCACAGACCCTTATTAAGCCGAGGGTCACCTAGCCAACTGACGTCGTTAGTGAATACACATTGTTCACAAGTATATACCGCCAAT ++ +@?BFFFFFHHHHHJJJGIGIJIIJJJIGJJJJJJIIIIHGHIIGIJGIIIJJIJIIJJJEHDBDCDEEEEDDDDD@CCCDAA@CA@A:@CADD>C9<@B9C
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testout.un2.fastq Thu Sep 20 18:59:38 2012 -0400 @@ -0,0 +1,4 @@ +@JLK5VL1:222:D1888ACXX:1:1101:2169:2161 3:N:0: +TTTGCCCTATTTATAGGACCCTAGAAAGAAAAAACCTCAAATAAAATAAAAGTGATTGGCGGGATATACTTGTGAACAATGTGTATTTACTTACCACCACC ++ +#####################################################################################################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Thu Sep 20 18:59:38 2012 -0400
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="ea-utils" version="1.1.2-469">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://ea-utils.googlecode.com/files/ea-utils.1.1.2-469.tar.gz</action>
+                <!-- sam-stats and varcall need an extra Bamtools library and are not needed here, so strip them from the Makefile before building -->
+                <action type="shell_command">sed -i.bak 's/sam-stats varcall$//' Makefile</action>
+                <action type="shell_command">PREFIX=$INSTALL_DIR make install</action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+        </readme>
+    </package>
+</tool_dependency>