# HG changeset patch # User bjoern-gruening # Date 1359707377 18000 # Node ID f8f1a3878edd1a0672bb01176009e3f487d2cb28 # Parent 13df908a02b0b4b33a90b98ed8b265eb77e7c277 0.1.1 version, prevent a crash if no repeat is found. Thanks to Simon Guest diff -r 13df908a02b0 -r f8f1a3878edd RepeatMasker.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepeatMasker.xml Fri Feb 01 03:29:37 2013 -0500 @@ -0,0 +1,314 @@ + + Masks different kind of repeats + +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces + +## create temp directory +#import tempfile, os +#set $dirname = os.path.abspath(tempfile.mkdtemp()) +#set $input_filename = os.path.split(str($query))[-1] +#set $output_basename = os.path.join($dirname, $input_filename) + + +RepeatMasker +-parallel 8 + +$nolow +$noint +$norna + +#if str($species)!="all": + $species +#end if + + +-dir $dirname + +#if $adv_opts.adv_opts_selector=="advanced": + + #if str($adv_opts.gc)!="0": + -gc $adv_opts.gc + #end if + + $adv_opts.gccalc + + #set $output_files_list = str($adv_opts.output_files).split(',') + #if "gff" in $output_files_list: + -gff + #end if + #if "html" in $output_files_list: + -html + #end if + + $adv_opts.slow_search + $adv_opts.quick_search + $adv_opts.rush_search + $adv_opts.only_alus + $adv_opts.is_only + +#else: + ## Set defaults + -gff + +## End of advanced options: +#end if + +$query + + +> /dev/null 2> /dev/null; +## Copy the output files to galaxy +## AgR: if there are no repeats, the output files may not exist. +## This causes the job to fail, so touch files to ensure they exist. 
+#if $adv_opts.adv_opts_selector=="advanced": + + #if "summary" in $output_files_list: + ## Write out the summary file (default) + #set $summary_file = $output_basename + '.tbl' + touch $summary_file + cp $summary_file $output_summary; + #end if + + #if "gff" in $output_files_list: + ## Write out the gff file (default) + #set $gff_file = $output_basename + '.out.gff' + touch $gff_file + cp $gff_file $output_gff; + #end if + + #if "html" in $output_files_list: + ## Write out the html file + #set $html_file = $output_basename + '.out.html' + touch $html_file + cp $html_file $output_html; + #end if + +#else: + + ## Write out the summary file (default) + #set $summary_file = $output_basename + '.tbl' + touch $summary_file + cp $summary_file $output_summary; + + ## Write out the gff file (default) + #set $gff_file = $output_basename + '.out.gff' + touch $gff_file + cp $gff_file $output_gff; + + +## End of advanced options: +#end if + +## Write out mask sequence file +#set $mask_sequence_file = $output_basename + '.masked' +touch $mask_sequence_file +cp $mask_sequence_file $output_mask; + +## Write out standard file (default) +## The default '.out' file from RepeatMasker has a 3-line header and spaces rather +## than tabs. 
Remove the header and replace the whitespaces with tab +#set $standard_file = $output_basename + '.out' +tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std; + +## Delete all temporary files +rm $dirname -r; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files']) + + + + ( + (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) + or + (adv_opts['adv_opts_selector'] == 'basic') + ) + + + + (adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files']) + + + + (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) + + + + + RepeatMasker + + + +.. class:: warningmark + +**What it does** + +RepeatMasker is a program that screens DNA sequences for *interspersed repeats* +and *low complexity* DNA sequences. The output of the program is a detailed +annotation of the repeats that are present in the query sequence as well as a +modified version of the query sequence in which all the annotated repeats have +been masked (default: replaced by Ns). + +----- + +**How to read the results** + + + +The annotation file contains the cross_match output lines. It lists all best matches +(above a set minimum score) between the query sequence and any of the sequences in +the repeat database or with low complexity DNA. The term "best matches" reflects +that a match is not shown if its domain is over 80% contained within the domain +of a higher scoring match, where the "domain" of a match is the region in +the query sequence that is defined by the alignment start and stop. These domains +have been masked in the returned masked sequence file. In the output, matches are +ordered by query name, and for each query by position of the start of the alignment. 
+ +Example: + +======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == +SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID +======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == + 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1 + 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2 + 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3 + 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4 + 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5 + 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6 + 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7 + 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8 +======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == + +This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy. +Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the +poly A of the Alu element. The first line is interpreted like this: + +:Table description: + +1. **1306** = Smith-Waterman score of the match, usually complexity adjusted + The SW scores are not always directly comparable. Sometimes + the complexity adjustment has been turned off, and a variety of + scoring-matrices are used. + +#. 
**15.6** = % substitutions in matching region compared to the consensus +#. **6.2** = % of bases opposite a gap in the query sequence (deleted bp) +#. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp) +#. **HSU08988** = name of query sequence +#. **6563** = starting position of match in query sequence +#. **6781** = ending position of match in query sequence +#. **(22462)** = no. of bases in query sequence past the ending position of match +#. **C** = match is with the Complement of the consensus sequence in the database +#. **MER7A** = name of the matching interspersed repeat +#. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references) +#. **336** = starting position of match in database sequence (using top-strand numbering) +#. **103** = ending position of match in database sequence +#. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence) +#. **1** = Identifier + +An asterisk (\*) in the final column (no example shown) indicates that there is +a higher-scoring match whose domain partly (<80%) includes the domain of this match. + +Note that the SW score and divergence numbers for the three Tigger1 lines are identical. +This is because the information is derived from a single alignment (the Alus were deleted +from the query before the alignment with the Tigger element was performed). +The program makes educated guesses about many fragments if they are derived from +the same element (e.g. it knows that the MER7A fragments represent one insert). +In a next version I can identify each element with a unique ID, if interest exists +(this could help to represent repeats cleaner in graphic displays). + + +------- + +**References** + +Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0. 
+ +http://www.repeatmasker.org/ + + + diff -r 13df908a02b0 -r f8f1a3878edd readme.txt --- a/readme.txt Wed Jan 11 04:50:59 2012 -0500 +++ b/readme.txt Fri Feb 01 03:29:37 2013 -0500 @@ -1,7 +1,7 @@ Galaxy wrapper for RepeatMasker ===================================== -This wrapper is copyright 2012 by Björn Grüning. +This wrapper is copyright 2013 by Björn Grüning. This is a wrapper for the command line tool of RepeatMasker from the Institute for Systems Biology. http://www.repeatmasker.org/ @@ -34,6 +34,7 @@ ======= v0.1 - Initial public release +v0.1.1 - patch from Simon Guest, to create empty files if no repeat is found Wrapper Licence (MIT/BSD style) diff -r 13df908a02b0 -r f8f1a3878edd tools/RepeatMasker.xml --- a/tools/RepeatMasker.xml Wed Jan 11 04:50:59 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,309 +0,0 @@ - - Masks different kind of repeats - -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces - -## create temp directory -#import tempfile, os -#set $dirname = os.path.abspath(tempfile.mkdtemp()) -#set $input_filename = os.path.split(str($query))[-1] -#set $output_basename = os.path.join($dirname, $input_filename) - - -RepeatMasker --parallel 8 - -$nolow -$noint -$norna - -#if str($species)!="all": - $species -#end if - - --dir $dirname - -#if $adv_opts.adv_opts_selector=="advanced": - - #if str($adv_opts.gc)!="0": - -gc $adv_opts.gc - #end if - - $adv_opts.gccalc - - #set $output_files_list = str($adv_opts.output_files).split(',') - #if "gff" in $output_files_list: - -gff - #end if - #if "html" in $output_files_list: - -html - #end if - - $adv_opts.slow_search - $adv_opts.quick_search - $adv_opts.rush_search - $adv_opts.only_alus - $adv_opts.is_only - -#else: - ## Set defaults - -gff - -## End of advanced options: -#end if - -$query - - -> /dev/null 2> /dev/null; -## Copy the output files to galaxy -#if $adv_opts.adv_opts_selector=="advanced": - - #if "summary" in $output_files_list: - ## Write out the summary file (default) - #set $summary_file = $output_basename + '.tbl' - cp $summary_file $output_summary; - #end if - - #if "gff" in $output_files_list: - ## Write out the gff file (default) - #set $gff_file = $output_basename + '.out.gff' - cp $gff_file $output_gff; - #end if - - #if "html" in $output_files_list: - ## Write out the html file - #set $html_file = $output_basename + '.out.html' - cp $html_file $output_html; - #end if - -#else: - - ## Write out the summary file (default) - #set $summary_file = $output_basename + '.tbl' - cp $summary_file $output_summary; - - ## Write out the gff file (default) - #set $gff_file = $output_basename + '.out.gff' - cp $gff_file $output_gff; - - -## End of advanced options: -#end if - -## Write out mask sequence file -#set $mask_sequence_file = $output_basename + '.masked' -cp $mask_sequence_file $output_mask; - -## Write out standard file (default) -## The default '.out' file 
from RepeatMasker has a 3-line header and spaces rather -## than tabs. Remove the header and replace the whitespaces with tab -#set $standard_file = $output_basename + '.out' -tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std; - -## Delete all temporary files -rm $dirname -r; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files']) - - - - ( - (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) - or - (adv_opts['adv_opts_selector'] == 'basic') - ) - - - - (adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files']) - - - - (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) - - - - - RepeatMasker - - - -.. class:: warningmark - - ------ - -**What it does** - -RepeatMasker is a program that screens DNA sequences for *interspersed repeats* -and *low complexity* DNA sequences. The output of the program is a detailed -annotation of the repeats that are present in the query sequence as well as a -modified version of the query sequence in which all the annotated repeats have -been masked (default: replaced by Ns). - ------ - -**How to read the results** - - - -The annotation file contains the cross_match output lines. It lists all best matches -(above a set minimum score) between the query sequence and any of the sequences in -the repeat database or with low complexity DNA. The term "best matches" reflects -that a match is not shown if its domain is over 80% contained within the domain -of a higher scoring match, where the "domain" of a match is the region in -the query sequence that is defined by the alignment start and stop. These domains -have been masked in the returned masked sequence file. 
In the output, matches are -ordered by query name, and for each query by position of the start of the alignment. - -Example: - -======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == -SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID -======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == - 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1 - 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2 - 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3 - 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4 - 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5 - 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6 - 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7 - 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8 -======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == - -This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy. -Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the -poly A of the Alu element. The first line is interpreted like this: - -:Table description: - -1. **1306** = Smith-Waterman score of the match, usually complexity adjusted - The SW scores are not always directly comparable. 
Sometimes - the complexity adjustment has been turned off, and a variety of - scoring-matrices are used. - -#. **15.6** = % substitutions in matching region compared to the consensus -#. **6.2** = % of bases opposite a gap in the query sequence (deleted bp) -#. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp) -#. **HSU08988** = name of query sequence -#. **6563** = starting position of match in query sequence -#. **7714** = ending position of match in query sequence -#. **(22462)** = no. of bases in query sequence past the ending position of match -#. **C** = match is with the Complement of the consensus sequence in the database -#. **MER7A** = name of the matching interspersed repeat -#. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references) -#. **2418** = starting position of match in database sequence (using top-strand numbering) -#. **1465** = ending position of match in database sequence -#. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence) -#. **1** = Identifier - -An asterisk (\*) in the final column (no example shown) indicates that there is -a higher-scoring match whose domain partly (<80%) includes the domain of this match. - -Note that the SW score and divergence numbers for the three Tigger1 lines are identical. -This is because the information is derived from a single alignment (the Alus were deleted -from the query before the alignment with the Tigger element was performed). -The program makes educated guesses about many fragments if they are derived from -the same element (e.g. it knows that the MER7A fragments represent one insert). -In a next version I can identify each element with a unique ID, if interest exists -(this could help to represent repeats cleaner in graphic displays). 
- - -------- - -**References** - -Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0. - -http://www.repeatmasker.org/ - - -