changeset 0:a2e1d1f25e35 draft default tip

Uploaded
author urgi-team
date Thu, 10 Jul 2014 09:32:30 -0400
parents
children
files tandem_repeats_finder_wrapper.py tandem_repeats_finder_wrapper.xml test-data/TRF_summary_2_7_80_10_50_500.html test-data/TRF_summary_2_7_80_10_50_500.txt test-data/sequence_trf_test.fasta tool_dependencies.xml
diffstat 6 files changed, 335 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tandem_repeats_finder_wrapper.py	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+
+import subprocess, tempfile, sys, os, glob, shutil, time
+from optparse import OptionParser
+
+
+class tandemRepeatsFinderWrapper(object):
+	"""Galaxy wrapper around the Tandem Repeats Finder (``trf``) executable.
+
+	Parses the wrapper's command line, runs ``trf`` in the current working
+	directory and moves the files it leaves there to the output paths that
+	were given on the command line.
+	"""
+
+	def __init__(self):
+		# Parsed command-line options; filled in by setOptions().
+		self._options = None
+
+	def getSystemCommand(self, prg, lArgs):
+		"""Return ``prg`` followed by every item of ``lArgs``, space-separated.
+
+		Arguments are joined verbatim, without any shell quoting.
+		"""
+		return " ".join([prg] + list(lArgs))
+
+	def setOptions(self, options):
+		"""Store the parsed options object that run() reads."""
+		self._options = options
+
+	def stop_err(self, msg ):
+		"""Write ``msg`` to stderr and terminate with a non-zero exit status."""
+		sys.stderr.write( "%s\n" % msg )
+		# Exit status 1 so that callers relying on the exit code see the failure.
+		sys.exit(1)
+
+	def setAttributesFromCmdLine(self):
+		"""Parse ``sys.argv`` and store the resulting options on the instance."""
+		parser = OptionParser(description = "Tandem Repeats Finder wrapper", version = "4.0")
+		parser.add_option( "--html", dest = "html", help = "html summary file for Galaxy")
+		parser.add_option( "--dirhtml", dest = "dir", help = "html files directory for Galaxy")
+		parser.add_option( "--txt", dest = "txt", default="", help = "txt summary file for Galaxy")
+		parser.add_option( "--file", dest = "inputFile", help = "Input Fasta File name")
+		parser.add_option( "--match", dest = "match", type="int", default=2, help = "matching weight")
+		parser.add_option( "--mismatch", dest = "mismatch", type="int", default=7, help = "mismatching penalty")
+		parser.add_option( "--delta", dest = "delta", type="int", default = 7, help = "indel penalty")
+		parser.add_option( "--pm", dest = "pm", type="int", default=80, help="matching probability")
+		parser.add_option( "--pi", dest="pi", type="int", default=10, help="indel probability")
+		parser.add_option( "--minscore", dest="minscore", type="int", default=30, help="minimum alignment score to report")
+		parser.add_option( "--maxperiod", dest="maxperiod", type="int", default=500, help="maximum period size to report")
+		parser.add_option( "--flanking", dest="flanking", action="store_true", help="")
+		parser.add_option( "--mask", dest="mask", help="" )
+		options, args = parser.parse_args()
+		self._setAttributesFromOptions(options)
+
+	def _setAttributesFromOptions(self, options):
+		"""Store ``options`` (thin indirection over setOptions())."""
+		self.setOptions(options)
+
+	def run(self):
+		"""Run ``trf`` with the stored options and collect its output files.
+
+		Terminates the process through stop_err() when the options are
+		unusable, when ``trf`` writes anything to stderr, or when an expected
+		output file is missing.
+		"""
+		if not self._options.inputFile:
+			self.stop_err( 'Error in Tandem Repeats Finder:\nno input file given (--file)' )
+		if self._options.html and not self._options.dir:
+			self.stop_err( 'Error in Tandem Repeats Finder:\n--dirhtml is required together with --html' )
+		self._runTrf("trf", self._buildArgs())
+		self._collectOutputs()
+
+	def _buildArgs(self):
+		"""Return the list of ``trf`` arguments derived from the stored options."""
+		opts = self._options
+		args = ["%s" % opts.inputFile]
+		# The seven numeric parameters are positional and must keep this order.
+		for value in (opts.match, opts.mismatch, opts.delta, opts.pm, opts.pi, opts.minscore, opts.maxperiod):
+			args.append("%d" % value)
+		if not opts.html and not opts.dir:
+			# No HTML output was requested: -h asks trf to suppress it.
+			args.append("-h")
+		if opts.flanking:
+			args.append("-f")
+		if opts.mask:
+			args.append("-m")
+		# Always ask for the .dat data file.
+		args.append("-d")
+		return args
+
+	def _runTrf(self, prg, args):
+		"""Run ``prg`` with ``args`` in the current directory.
+
+		Anything the program writes to stderr is treated as a failure and
+		reported through stop_err(). The exit status is deliberately not
+		inspected, exactly as before this rewrite.
+		"""
+		try:
+			# Anonymous temporary file: removed automatically once closed.
+			tmp_stderr = tempfile.TemporaryFile()
+			try:
+				# Argument list without a shell, so paths containing spaces
+				# or shell metacharacters are passed through unchanged.
+				proc = subprocess.Popen( args=[prg] + args, cwd=".", stderr=tmp_stderr )
+				proc.wait()
+				tmp_stderr.seek(0)
+				# get stderr, allowing for case where it's very large
+				buffsize = 1048576
+				chunks = []
+				try:
+					while True:
+						chunk = tmp_stderr.read( buffsize )
+						if not chunk:
+							break
+						chunks.append( chunk )
+				except OverflowError:
+					# best effort: keep whatever was read so far
+					pass
+			finally:
+				tmp_stderr.close()
+			stderr = b"".join( chunks )
+			if not isinstance( stderr, str ):
+				# Python 3 reads bytes from the binary temp file.
+				stderr = stderr.decode( "utf-8", "replace" )
+			if stderr:
+				raise Exception( stderr )
+		except Exception as e:
+			self.stop_err( 'Error in Tandem Repeats Finder:\n' + str( e ) )
+
+	def _firstMatch(self, patterns):
+		"""Return the first file (sorted by name) matching the first productive pattern.
+
+		``patterns`` is tried in order; stop_err() is called when none of
+		them matches a file in the current directory.
+		"""
+		for pattern in patterns:
+			found = sorted(glob.glob(pattern))
+			if found:
+				return found[0]
+		self.stop_err( 'Error in Tandem Repeats Finder:\nno output file matching %s was produced' % " or ".join(patterns) )
+
+	def _collectOutputs(self):
+		"""Move the files left by ``trf`` in the current directory to their destinations.
+
+		The .dat file is moved only when --txt was given; previously the
+		no-HTML case attempted the move even with an empty destination.
+		"""
+		opts = self._options
+		if opts.html:
+			# Multi-sequence runs produce a *.summary.html; otherwise fall
+			# back to the first table page.
+			shutil.move(self._firstMatch(["*.summary.html", "*1.html"]), opts.html)
+
+			if not os.path.isdir(opts.dir):
+				os.makedirs(opts.dir)
+			# Every remaining HTML page goes next to the summary's extra files.
+			for results in glob.glob("*.html"):
+				shutil.move(results, os.path.join(opts.dir, os.path.basename(results)))
+
+		if opts.txt:
+			shutil.move(self._firstMatch(["*.dat"]), opts.txt)
+
+		if opts.mask:
+			shutil.move(self._firstMatch(["*.mask"]), opts.mask)
+
+
+if __name__ == "__main__":  # script entry point: parse the command line, then run trf
+	wrapper = tandemRepeatsFinderWrapper()
+	wrapper.setAttributesFromCmdLine()
+	wrapper.run()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tandem_repeats_finder_wrapper.xml	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,137 @@
+<tool id="tandem_repeats_finder" name="Tandem Repeats Finder" version="1.0.0">
+	<description>locates and displays tandem repeats in DNA sequences</description>
+	<requirements>
+		<requirement type="package" version="4.0">tandem_repeats_finder</requirement>
+	</requirements>
+	<version_command>trf | grep Version</version_command>
+	<command interpreter="python">tandem_repeats_finder_wrapper.py --file "$file" --match $match --mismatch $mismatch --delta $delta --pm $pm --pi $pi --minscore $minscore --maxperiod $maxperiod 
+#if $nohtml
+	--txt "$output_txt"
+#else
+	--html "$output_html" --dirhtml "$output_html.files_path" --txt "$output_txt"
+#end if
+#if $flanking
+	--flanking
+#end if
+#if $mask
+	--mask "$output_mask"
+#end if
+	
+	</command>
+	<inputs>
+        	<param name="file" type="data" format="fasta" label="DNA sequences in Fasta format"/>
+		<param name="match" type="integer" value="2" label="Matching weight" help="default value 2">
+			<validator type="in_range" min="1" />
+		</param>
+		<param name="mismatch" type="integer" value="7" label="Mismatching penalty" help="default value 7">
+			<validator type="in_range" min="0" />
+		</param>
+		<param name="delta" type="integer" value="7" label="Indel penalty" help="default value 7">
+			<validator type="in_range" min="0" />
+		</param>
+		<param name="pm" type="integer" value="80" label="Matching probability" help="default value 80">
+			<validator type="in_range" min="1" />
+		</param>
+		<param name="pi" type="integer" value="10" label="Indel probability" help="default value 10">
+			<validator type="in_range" min="1" />
+		</param>
+		<param name="minscore" type="integer" value="50" label="Minimum alignment score to report" help="">
+			<validator type="in_range" min="30" />
+		</param>
+		<param name="maxperiod" type="integer" value="500" label="Maximum period size to report" help="">
+			<validator type="in_range" min="1" />
+		</param>
+		<param name="nohtml" type="boolean" checked="false" label="No html output" help="Export dat file only" />
+		<param name="flanking" type="boolean" checked="false" label="Flanking sequence" help="Flanking sequence consists of the 500 nucleotides on each side of a repeat. Flanking sequence is recorded in the alignment file. This may be useful for PCR primer determination." />
+		<param name="mask" type="boolean" checked="false" label="Masked sequence file" help="The masked sequence file is a FASTA format file containing a copy of the sequence with every character that occurred in a tandem repeat changed to the letter 'N'. The word 'masked' is added to the sequence description line just after the '>' character." />
+	</inputs>
+	<outputs>
+ 		<data format="html" name="output_html" label="TRF_summary_${match}_${mismatch}_${delta}_${pm}_${pi}_${minscore}_${maxperiod}.html">
+			<filter>(nohtml == False)</filter>
+		</data>
+		<data format="txt" name="output_mask" label="TRF_summary_${match}_${mismatch}_${delta}_${pm}_${pi}_${minscore}_${maxperiod}.mask">
+			<filter>(mask == True)</filter>
+		</data>
+		<data format="txt" name="output_txt" label="TRF_summary_${match}_${mismatch}_${delta}_${pm}_${pi}_${minscore}_${maxperiod}.txt"/>
+	</outputs>
+	<tests>
+		<test>
+			<param name="file" value="sequence_trf_test.fasta" />
+			<param name="nohtml" value="True" />
+			<output name="output_txt" file="TRF_summary_2_7_80_10_50_500.txt" ftype="txt" />
+		</test>
+	</tests>
+	<help>
+
+
+**What it does**
+
+A tandem repeat in DNA is two or more adjacent, approximate copies of a pattern of nucleotides. Tandem Repeats Finder is a program to locate and display tandem repeats in DNA sequences. In order to use the program, the user submits a sequence in FASTA format. There is no need to specify the pattern, the size of the pattern or any other parameter. The output consists of two files: a repeat table file and an alignment file. The repeat table contains information about each repeat, including its location, size, number of copies and nucleotide content. Clicking on the location indices for one of the table entries opens a second web browser that shows an alignment of the copies against a consensus pattern. The program is very fast, analyzing sequences on the order of .5Mb in just a few seconds. Submitted sequences may be of arbitrary length. Repeats with pattern size in the range from 1 to 2000 bases are detected.
+
+-------
+
+**Input format**
+
+The FASTA format is a plain text format which looks something like this:
+
+>myseq
+AGTCGTCGCT AGCTAGCTAG CATCGAGTCT TTTCGATCGA GGACTAGACT TCTAGCTAGC TAGCATAGCA TACGAGCATA TCGGTCATGA GACTGATTGG GCTTTAGCTA GCTAGCATAG CATACGAGCA TATCGGTAGA CTGATTGGGT TTAGGTTACC
+
+The first line starts with a greater than sign ">" and contains a name or other identifier for the sequence. This is the sequence header and must be in a single line. The remaining lines contain the sequence data. The sequence can be in upper or lower case letters. Anything other than letters (numbers for example) is ignored. Multiple sequences can be present in the same file as long as each sequence has its own header.
+
+-------
+
+**Output format**
+
+Table Explanation:
+
+The summary table includes the following information::
+
+ 1 Indices of the repeat relative to the start of the sequence.
+ 2 Period size of the repeat.
+ 3 Number of copies aligned with the consensus pattern.
+ 4 Size of consensus pattern (may differ slightly from the period size).
+ 5 Percent of matches between adjacent copies overall.
+ 6 Percent of indels between adjacent copies overall.
+ 7 Alignment score.
+ 8 Percent composition for each of the four nucleotides.
+ 9 Entropy measure based on percent composition.
+
+If the output contains more than 120 repeats, multiple linked tables are produced. The links to the other tables appear at the top and bottom of each table.
+
+Note: If you save multiple linked summary table files, use the default names supplied by your browser to preserve the automatic linking.
+
+Alignment Explanation:
+
+The alignment is presented as follows::
+
+ 1 In each pair of lines, the actual sequence is on the top and a consensus sequence for all the copies is on the bottom.
+ 2 Each pair of lines is one period except for very small patterns.
+ 3 The 10 sequence characters before and after a repeat are shown.
+ 4 Symbol * indicates a mismatch.
+ 5 Symbol - indicates an insertion or deletion.
+ 6 Statistics refers to the matches, mismatches and indels overall between adjacent copies in the sequence, not between the sequence and the consensus pattern.
+ 7 Distances between matching characters at corresponding positions are listed as distance, number at that distance, percentage of all matches.
+ 8 ACGTcount is percentage of each nucleotide in the repeat sequence.
+ 9 Consensus sequence is shown by itself.
+ 10 If chosen as an option, 500 characters of flanking sequence on each side of the repeat are shown.
+
+Note: If you save the alignment file, use the default name supplied by your browser to preserve the automatic cross-referencing with the summary table.
+
+The data file is a text file which contains the same information, in the same order, as the repeat table file, plus consensus and repeat sequences. This file contains no labeling and is suitable for additional processing, for example with a perl script, outside of the program.
+
+
+-------
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication, please
+cite the following paper:
+
+G. Benson,
+"Tandem repeats finder: a program to analyze DNA sequences"
+Nucleic Acids Research (1999)
+Vol. 27, No. 2, pp. 573-580.
+	</help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/TRF_summary_2_7_80_10_50_500.html	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,29 @@
+<HTML><HEAD><TITLE>Output Summary</TITLE></HEAD><BODY bgcolor="#FBF8BC"><PRE>
+Tandem Repeats Finder Program written by:<CENTER>
+Gary Benson
+Department of Biomathematical Sciences
+Mount Sinai School of Medicine
+Version 4.00</CENTER>
+
+Please cite:
+G. Benson,
+"Tandem repeats finder: a program to analyze DNA sequences"
+Nucleic Acid Research(1999)
+Vol. 27, No. 2, pp. 573-580.
+
+
+<B>Multiple Sequence Summary</B>
+
+Only sequences containing repeats are shown!
+
+Click on sequence description to view repeat table.
+
+<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=0>
+<TR><TD WIDTH=80><CENTER>Sequence
+Index</CENTER></TD><TD WIDTH=400><CENTER>Sequence
+Description</CENTER></TD><TD WIDTH=80><CENTER>Number of
+Repeats</CENTER></TD></TR>
+<TR><TD><CENTER>1</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s1.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s1.2.7.7.80.10.50.500.1.html">I</A></CENTER></TD><TD><CENTER>46</CENTER></TD></TR><TR><TD><CENTER>2</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s2.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s2.2.7.7.80.10.50.500.1.html">II</A></CENTER></TD><TD><CENTER>85</CENTER></TD></TR><TR><TD><CENTER>3</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s3.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s3.2.7.7.80.10.50.500.1.html">III</A></CENTER></TD><TD><CENTER>53</CENTER></TD></TR><TR><TD><CENTER>4</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s4.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s4.2.7.7.80.10.50.500.1.html">IV</A></CENTER></TD><TD><CENTER>166</CENTER></TD></TR><TR><TD><CENTER>5</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s5.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s5.2.7.7.80.10.50.500.1.html">IX</A></CENTER></TD><TD><CENTER>109</CENTER></TD></TR><TR><TD><CENTER>6</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s6.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s6.2.7.7.80.10.50.500.1.html">MT</A></CENTER></TD><TD><CENTER>549</CENTER></TD></TR><TR><TD><CENTER>7</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s7.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s7.2.7.7.80.10.50.500.1.html">V</A></CENTER></TD><TD><CENTER>61</CENTER></TD></TR><TR><TD><CENTER>8</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s8.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s8.2.7.7.80.10.50.500.1.html">VI</A></CENTER></TD><TD><CENTER>45</CENTER></TD></TR><TR><TD><CENTER>9</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s9.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s9.2.7.7.80.10.50.500.1.html">VII</A></CENTER></TD><TD><CENTER>112</CENTER></TD></TR><TR><TD><CENTER>10</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s10.2.7.7.80.10.50.500.1.html" 
HREF="dataset_1.dat.s10.2.7.7.80.10.50.500.1.html">VIII</A></CENTER></TD><TD><CENTER>85</CENTER></TD></TR><TR><TD><CENTER>11</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s11.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s11.2.7.7.80.10.50.500.1.html">X</A></CENTER></TD><TD><CENTER>73</CENTER></TD></TR><TR><TD><CENTER>12</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s12.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s12.2.7.7.80.10.50.500.1.html">XI</A></CENTER></TD><TD><CENTER>112</CENTER></TD></TR><TR><TD><CENTER>13</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s13.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s13.2.7.7.80.10.50.500.1.html">XII</A></CENTER></TD><TD><CENTER>132</CENTER></TD></TR><TR><TD><CENTER>14</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s14.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s14.2.7.7.80.10.50.500.1.html">XIII</A></CENTER></TD><TD><CENTER>122</CENTER></TD></TR><TR><TD><CENTER>15</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s15.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s15.2.7.7.80.10.50.500.1.html">XIV</A></CENTER></TD><TD><CENTER>98</CENTER></TD></TR><TR><TD><CENTER>16</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s16.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s16.2.7.7.80.10.50.500.1.html">XV</A></CENTER></TD><TD><CENTER>119</CENTER></TD></TR><TR><TD><CENTER>17</CENTER></TD><TD><CENTER><A TARGET="dataset_1.dat.s17.2.7.7.80.10.50.500.1.html" HREF="dataset_1.dat.s17.2.7.7.80.10.50.500.1.html">XVI</A></CENTER></TD><TD><CENTER>107</CENTER></TD></TR>
+</TABLE>
+
+</BODY></HTML>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/TRF_summary_2_7_80_10_50_500.txt	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,16 @@
+Tandem Repeats Finder Program writen by:
+
+Gary Benson
+Department of Biomathematical Sciences
+Mount Sinai School of Medicine
+Version 4.00
+
+
+Sequence: myseq
+
+
+
+Parameters: 2 7 7 80 10 50 500
+
+
+53 154 52 2.1 49 90 9 163 27 17 27 27 1.98 TAGCTAGCTAGCATAGCATACGAGCATATCGGATGAGACTGATTGGGTT TAGCTAGCTAGCATAGCATACGAGCATATCGGTCATGAGACTGATTGGGCTTTAGCTAGCTAGCATAGCATACGAGCATATCGGTAGACTGATTGGGTTTAG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence_trf_test.fasta	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,4 @@
+>myseq
+AGTCGTCGCTAGCTAGCTAGCATCGAGTCTTTTCGATCGAGGACTAGACTTCTAGCTAGC
+TAGCATAGCATACGAGCATATCGGTCATGAGACTGATTGGGCTTTAGCTAGCTAGCATAG
+CATACGAGCATATCGGTAGACTGATTGGGTTTAGGTTACC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jul 10 09:32:30 2014 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool_dependency>
+	<package name="tandem_repeats_finder" version="4.0">
+		<install version="1.0">
+			<actions_group>
+				<actions os="linux" architecture="x86_64">
+					<action type="download_by_url" target_filename="trf">http://tandem.bu.edu/trf/downloads/trf400.linuxAMD64.exe</action>
+					<action type="move_directory_files">
+						<source_directory>.</source_directory>
+						<destination_directory>$INSTALL_DIR</destination_directory>
+					</action>
+					<action type="chmod">
+						<file mode="750">$INSTALL_DIR/trf</file>
+					</action>
+				</actions>
+				<actions os="linux" architecture="i386">
+					<action type="download_by_url" target_filename="trf">http://tandem.bu.edu/trf/downloads/trf400.linux.exe</action>
+					<action type="move_directory_files">
+						<source_directory>.</source_directory>
+						<destination_directory>$INSTALL_DIR</destination_directory>
+					</action>
+					<action type="chmod">
+						<file mode="750">$INSTALL_DIR/trf</file>
+					</action>
+				</actions>
+				<action type="set_environment">
+					<environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+				 </action>
+			</actions_group>
+		</install>
+	</package>
+</tool_dependency>