# HG changeset patch
# User urgi-team
# Date 1404999150 14400
# Node ID a2e1d1f25e35446c24b78bf552e0f49bbc452976
# Uploaded
# diff -r 000000000000 -r a2e1d1f25e35 tandem_repeats_finder_wrapper.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/tandem_repeats_finder_wrapper.py Thu Jul 10 09:32:30 2014 -0400
# @@ -0,0 +1,117 @@
#!/usr/bin/env python
"""Galaxy wrapper around the ``trf`` (Tandem Repeats Finder) executable.

The script parses the wrapper options, runs ``trf`` in the current working
directory, and then moves the files ``trf`` produced there (HTML pages,
``*.dat`` data file, ``*.mask`` masked sequence) to the destinations that
were requested on the command line.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""

import subprocess
import tempfile
import sys
import os
import glob
import shutil
import time
from optparse import OptionParser


class tandemRepeatsFinderWrapper(object):
    """Build a ``trf`` command line from parsed options, run it, collect outputs."""

    def __init__(self):
        # Parsed option values (an ``optparse.Values``-like object); populated
        # by setOptions() / setAttributesFromCmdLine().
        self._options = None

    def getSystemCommand(self, prg, lArgs):
        """Return ``prg`` followed by every item of ``lArgs``, space separated.

        The result is meant for display only; ``run`` passes an argument list
        to ``subprocess`` so that no shell quoting is ever needed.
        """
        return " ".join([prg] + list(lArgs))

    def setOptions(self, options):
        """Store the option object whose attributes ``run`` reads."""
        self._options = options

    def stop_err(self, msg):
        """Write ``msg`` to stderr and terminate with a failing exit status.

        Raises:
            SystemExit: always, with exit code 1 (the original exited with 0,
                which reported failures as success).
        """
        sys.stderr.write("%s\n" % msg)
        sys.exit(1)

    def setAttributesFromCmdLine(self):
        """Parse ``sys.argv`` and store the resulting options on the instance.

        Exits through ``OptionParser.error`` when ``--file`` is missing.
        """
        parser = OptionParser(description="Tandem Repeats Finder wrapper", version="4.0")
        parser.add_option("--html", dest="html", help="html summary file for Galaxy")
        parser.add_option("--dirhtml", dest="dir", help="html files directory for Galaxy")
        parser.add_option("--txt", dest="txt", default="", help="txt summary file for Galaxy")
        parser.add_option("--file", dest="inputFile", help="Input Fasta File name")
        parser.add_option("--match", dest="match", type="int", default=2, help="matching weight")
        parser.add_option("--mismatch", dest="mismatch", type="int", default=7, help="mismatching penalty")
        parser.add_option("--delta", dest="delta", type="int", default=7, help="indel penalty")
        parser.add_option("--pm", dest="pm", type="int", default=80, help="matching probability")
        parser.add_option("--pi", dest="pi", type="int", default=10, help="indel probability")
        parser.add_option("--minscore", dest="minscore", type="int", default=30, help="minimum alignment score to report")
        parser.add_option("--maxperiod", dest="maxperiod", type="int", default=500, help="maximum period size to report")
        parser.add_option("--flanking", dest="flanking", action="store_true", help="pass -f to trf (flanking sequence output)")
        parser.add_option("--mask", dest="mask", help="destination for the *.mask file (passes -m to trf)")
        options, args = parser.parse_args()
        if not options.inputFile:
            # Without this check the literal string "None" was handed to trf.
            parser.error("--file is required")
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        self.setOptions(options)

    def _buildArguments(self):
        """Return the ``trf`` argument list (without the program name)."""
        opts = self._options
        args = ["%s" % opts.inputFile]
        numeric = (opts.match, opts.mismatch, opts.delta, opts.pm,
                   opts.pi, opts.minscore, opts.maxperiod)
        args.extend("%d" % value for value in numeric)
        if not opts.html and not opts.dir:
            # No HTML destination requested.
            # NOTE(review): -h is presumably trf's "suppress HTML output" flag - confirm.
            args.append("-h")
        if opts.flanking:
            args.append("-f")
        if opts.mask:
            args.append("-m")
        # Always passed; the *.dat file collected in run() presumably depends on it.
        args.append("-d")
        return args

    def _captureStderr(self, argv):
        """Run ``argv`` (a list, no shell) in the cwd and return its stderr as text.

        stderr goes through an anonymous temporary file rather than a pipe so
        a very large error stream cannot block the child process.
        """
        with tempfile.TemporaryFile() as errFile:
            proc = subprocess.Popen(args=argv, cwd=".", stderr=errFile)
            # NOTE(review): the exit status is ignored, exactly as in the
            # original; trf reportedly exits non-zero on success - confirm.
            proc.wait()
            errFile.seek(0)
            captured = errFile.read()
        if not isinstance(captured, str):
            # Python 3 hands back bytes; Python 2 already returns str.
            captured = captured.decode("utf-8", "replace")
        return captured

    def _firstMatch(self, patterns, description):
        """Return the first (sorted) file matching the earliest matching pattern.

        Calls ``stop_err`` - and therefore exits - when no pattern matches,
        instead of failing with a bare IndexError.
        """
        for pattern in patterns:
            matches = sorted(glob.glob(pattern))
            if matches:
                return matches[0]
        self.stop_err("Error in Tandem Repeats Finder:\nno %s found (looked for %s)"
                      % (description, ", ".join(patterns)))

    def run(self):
        """Run ``trf`` and move its output files to the requested destinations.

        Any text on trf's stderr, or any failure to start it, is reported
        through ``stop_err``.
        """
        prg = "trf"
        args = self._buildArguments()

        try:
            stderr = self._captureStderr([prg] + args)
        except Exception as e:  # e.g. OSError when trf is not on PATH
            self.stop_err('Error in Tandem Repeats Finder:\n' + str(e))
        if stderr:
            self.stop_err('Error in Tandem Repeats Finder:\n' + stderr)

        opts = self._options
        if opts.html:
            # "*.summary.html" is tried first, then "*1.html" as the fallback.
            summary = self._firstMatch(["*.summary.html", "*1.html"], "HTML summary")
            shutil.move(summary, "%s" % opts.html)

            if opts.dir:
                if not os.path.isdir(opts.dir):
                    os.makedirs(opts.dir)
                # Remaining pages go next to the summary's files directory.
                for page in glob.glob("*.html"):
                    shutil.move(page, os.path.join(opts.dir, os.path.basename(page)))

        if opts.txt:
            # Previously an empty --txt still attempted a move to "" and crashed.
            shutil.move(self._firstMatch(["*.dat"], "data file"), "%s" % opts.txt)

        if opts.mask:
            shutil.move(self._firstMatch(["*.mask"], "masked sequence file"), "%s" % opts.mask)


if __name__ == "__main__":
    iWrapper = tandemRepeatsFinderWrapper()
    iWrapper.setAttributesFromCmdLine()
    iWrapper.run()

# --- remainder of the changeset text that shared these lines, kept verbatim ---
# diff -r 000000000000 -r a2e1d1f25e35 tandem_repeats_finder_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tandem_repeats_finder_wrapper.xml Thu Jul 10 09:32:30 2014 -0400 @@ -0,0 +1,137 @@ + + locates and displays tandem repeats in DNA sequences + + tandem_repeats_finder + + trf | grep Version + tandem_repeats_finder_wrapper.py --file $file --match $match --mismatch $mismatch --delta $delta --pm $pm --pi $pi --minscore $minscore --maxperiod $maxperiod +#if $nohtml + --txt "$output_txt" +#else + --html "$output_html" --dirhtml "$output_html.files_path" --txt "$output_txt" +#end if +#if $flanking + --flanking +#end if +#if $mask + --mask "$output_mask" +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (nohtml == False) + + + (mask == True) + + + + + + + + + + + + + +**What it does** + +A tandem repeat in DNA is two or more adjacent, approximate copies of a pattern of nucleotides. Tandem Repeats Finder is a program to locate and display tandem repeats in DNA sequences. In order to use the program, the user submits a sequence in FASTA format. There is no need to specify the pattern, the size of the pattern or any other parameter. The output consists of two files: a repeat table file and an alignment file. The repeat table contains information about each repeat, including its location, size, number of copies and nucleotide content. Clicking on the location indices for one of the table entries opens a second web browser that shows an alignment of the copies against a consensus pattern.
The program is very fast, analyzing sequences on the order of 0.5 Mb in just a few seconds. Submitted sequences may be of arbitrary length. Repeats with pattern size in the range from 1 to 2000 bases are detected. + +------- + +**Input format** + +The FASTA format is a plain text format which looks something like this: + +>myseq +AGTCGTCGCT AGCTAGCTAG CATCGAGTCT TTTCGATCGA GGACTAGACT TCTAGCTAGC TAGCATAGCA TACGAGCATA TCGGTCATGA GACTGATTGG GCTTTAGCTA GCTAGCATAG CATACGAGCA TATCGGTAGA CTGATTGGGT TTAGGTTACC + +The first line starts with a greater-than sign ">" and contains a name or other identifier for the sequence. This is the sequence header and must be on a single line. The remaining lines contain the sequence data. The sequence can be in upper or lower case letters. Anything other than letters (numbers for example) is ignored. Multiple sequences can be present in the same file as long as each sequence has its own header. + +------- + +**Output format** + +Table Explanation: + +The summary table includes the following information:: + + 1 Indices of the repeat relative to the start of the sequence. + 2 Period size of the repeat. + 3 Number of copies aligned with the consensus pattern. + 4 Size of consensus pattern (may differ slightly from the period size). + 5 Percent of matches between adjacent copies overall. + 6 Percent of indels between adjacent copies overall. + 7 Alignment score. + 8 Percent composition for each of the four nucleotides. + 9 Entropy measure based on percent composition. + +If the output contains more than 120 repeats, multiple linked tables are produced. The links to the other tables appear at the top and bottom of each table. + +Note: If you save multiple linked summary table files, use the default names supplied by your browser to preserve the automatic linking. 
+ +Alignment Explanation: + +The alignment is presented as follows:: + + 1 In each pair of lines, the actual sequence is on the top and a consensus sequence for all the copies is on the bottom. + 2 Each pair of lines is one period except for very small patterns. + 3 The 10 sequence characters before and after a repeat are shown. + 4 Symbol * indicates a mismatch. + 5 Symbol - indicates an insertion or deletion. + 6 Statistics refers to the matches, mismatches and indels overall between adjacent copies in the sequence, not between the sequence and the consensus pattern. + 7 Distances between matching characters at corresponding positions are listed as distance, number at that distance, percentage of all matches. + 8 ACGTcount is percentage of each nucleotide in the repeat sequence. + 9 Consensus sequence is shown by itself. + 10 If chosen as an option, 500 characters of flanking sequence on each side of the repeat are shown. + +Note: If you save the alignment file, use the default name supplied by your browser to preserve the automatic cross-referencing with the summary table. + +The data file is a text file which contains the same information, in the same order, as the repeat table file, plus consensus and repeat sequences. This file contains no labeling and is suitable for additional processing, for example with a perl script, outside of the program. + + +------- + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following paper: + +G. Benson, +"Tandem repeats finder: a program to analyze DNA sequences" +Nucleic Acids Research (1999) +Vol. 27, No. 2, pp. 573-580. + + + diff -r 000000000000 -r a2e1d1f25e35 test-data/TRF_summary_2_7_80_10_50_500.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TRF_summary_2_7_80_10_50_500.html Thu Jul 10 09:32:30 2014 -0400 @@ -0,0 +1,29 @@ +Output Summary
+Tandem Repeats Finder Program written by:
+Gary Benson +Department of Biomathematical Sciences +Mount Sinai School of Medicine +Version 4.00
+ +Please cite: +G. Benson, +"Tandem repeats finder: a program to analyze DNA sequences" +Nucleic Acid Research(1999) +Vol. 27, No. 2, pp. 573-580. + + +Multiple Sequence Summary + +Only sequences containing repeats are shown! + +Click on sequence description to view repeat table. + + + + +
Sequence +Index
Sequence +Description
Number of +Repeats
1
I
46
2
II
85
3
III
53
4
IV
166
5
IX
109
6
MT
549
7
V
61
8
VI
45
9
VII
112
10
VIII
85
11
X
73
12
XI
112
13
XII
132
14
XIII
122
15
XIV
98
16
XV
119
17
XVI
107
+ + diff -r 000000000000 -r a2e1d1f25e35 test-data/TRF_summary_2_7_80_10_50_500.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TRF_summary_2_7_80_10_50_500.txt Thu Jul 10 09:32:30 2014 -0400 @@ -0,0 +1,16 @@ +Tandem Repeats Finder Program writen by: + +Gary Benson +Department of Biomathematical Sciences +Mount Sinai School of Medicine +Version 4.00 + + +Sequence: myseq + + + +Parameters: 2 7 7 80 10 50 500 + + +53 154 52 2.1 49 90 9 163 27 17 27 27 1.98 TAGCTAGCTAGCATAGCATACGAGCATATCGGATGAGACTGATTGGGTT TAGCTAGCTAGCATAGCATACGAGCATATCGGTCATGAGACTGATTGGGCTTTAGCTAGCTAGCATAGCATACGAGCATATCGGTAGACTGATTGGGTTTAG diff -r 000000000000 -r a2e1d1f25e35 test-data/sequence_trf_test.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence_trf_test.fasta Thu Jul 10 09:32:30 2014 -0400 @@ -0,0 +1,4 @@ +>myseq +AGTCGTCGCTAGCTAGCTAGCATCGAGTCTTTTCGATCGAGGACTAGACTTCTAGCTAGC +TAGCATAGCATACGAGCATATCGGTCATGAGACTGATTGGGCTTTAGCTAGCTAGCATAG +CATACGAGCATATCGGTAGACTGATTGGGTTTAGGTTACC diff -r 000000000000 -r a2e1d1f25e35 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Jul 10 09:32:30 2014 -0400 @@ -0,0 +1,32 @@ + + + + + + + http://tandem.bu.edu/trf/downloads/trf400.linuxAMD64.exe + + . + $INSTALL_DIR + + + $INSTALL_DIR/trf + + + + http://tandem.bu.edu/trf/downloads/trf400.linux.exe + + . + $INSTALL_DIR + + + $INSTALL_DIR/trf + + + + $INSTALL_DIR + + + + +