changeset 4:6acaa2c93f47 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author iuc
date Wed, 12 Jul 2017 10:11:53 -0400
parents f7608d0363bf
children 2365720de36d
files repmatch_gff3.py repmatch_gff3.xml repmatch_gff3_util.py test-data/detail_out1.tabular test-data/matched_peaks_out1.gff
diffstat 5 files changed, 68 insertions(+), 65 deletions(-) [+]
line wrap: on
line diff
--- a/repmatch_gff3.py	Fri Jan 13 10:52:02 2017 -0500
+++ b/repmatch_gff3.py	Wed Jul 12 10:11:53 2017 -0400
@@ -8,6 +8,7 @@
 # Files: statistics_table.tabular (file to replicate ID), matched_paired_peaks.tabular, detail.tabular, unmatched_peaks.tabular
 
 import argparse
+
 import repmatch_gff3_util
 
 if __name__ == '__main__':
--- a/repmatch_gff3.xml	Fri Jan 13 10:52:02 2017 -0500
+++ b/repmatch_gff3.xml	Wed Jul 12 10:11:53 2017 -0400
@@ -1,42 +1,34 @@
 <?xml version="1.0"?>
-<tool id="repmatch_gff3" name="RepMatch" version="@WRAPPER_VERSION@.1">
+<tool id="repmatch_gff3" name="RepMatch" version="@WRAPPER_VERSION@.2">
     <description>Match paired peaks from two or more replicates</description>
     <macros>
         <import>repmatch_gff3_macros.xml</import>
     </macros>
     <expand macro="requirements" />
-    <stdio>
-        <!-- Anything other than zero is an error -->
-        <exit_code range=":-1" />
-        <exit_code range="1:" />
-        <!-- Check stderr in case the return code has not been set -->
-        <regex match="Error:" />
-        <regex match="Exception:" />
-    </stdio>
-    <command>
-        python $__tool_directory__/repmatch_gff3.py
-        #for $i in $input:
-             --input "${i}" "${i.hid}"
-        #end for
-        --method $method
-        --distance $distance
-        --replicates $replicates
-        --output_files $output_files_cond.output_files
-        --output_matched_peaks "$output_matched_peaks"
-        #if str($output_files_cond.output_files) in ["all", "matched_peaks_unmatched_peaks"]:
-            --output_unmatched_peaks "$output_unmatched_peaks"
-        #end if
-        #if str($output_files_cond.output_files) =="all":
-            --output_detail "$output_detail"
-            --output_statistics_table "$output_statistics_table"
-            --output_statistics_histogram "$output_statistics_histogram"
-        #end if
-        #if str($advanced_options_cond.advanced_options) == "on":
-            --step $advanced_options_cond.step
-            --low_limit $advanced_options_cond.low_limit
-            --up_limit $advanced_options_cond.up_limit
-        #end if
-    </command>
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/repmatch_gff3.py'
+#for $i in $input:
+     --input '${i}' '${i.hid}'
+#end for
+--method $method
+--distance $distance
+--replicates $replicates
+--output_files $output_files_cond.output_files
+--output_matched_peaks '$output_matched_peaks'
+#if str($output_files_cond.output_files) in ['all', 'matched_peaks_unmatched_peaks']:
+    --output_unmatched_peaks '$output_unmatched_peaks'
+#end if
+#if str($output_files_cond.output_files) == 'all':
+    --output_detail '$output_detail'
+    --output_statistics_table '$output_statistics_table'
+    --output_statistics_histogram '$output_statistics_histogram'
+#end if
+#if str($advanced_options_cond.advanced_options) == 'on':
+    --step $advanced_options_cond.step
+    --low_limit $advanced_options_cond.low_limit
+    --up_limit $advanced_options_cond.up_limit
+#end if
+    ]]></command>
     <inputs>
         <param  name="input" type="data" format="gff" multiple="True" min="2" label="Match paired peaks on" />
         <param name="method" type="select" label="Method of finding match">
--- a/repmatch_gff3_util.py	Fri Jan 13 10:52:02 2017 -0500
+++ b/repmatch_gff3_util.py	Wed Jul 12 10:11:53 2017 -0400
@@ -23,6 +23,7 @@
 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
 
 COLORS = 'krb'
+ISPY2 = sys.version_info[0] == 2
 
 
 class Replicate(object):
@@ -30,7 +31,11 @@
     def __init__(self, id, dataset_path):
         self.id = id
         self.dataset_path = dataset_path
-        self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
+        if ISPY2:
+            fh = open(dataset_path, 'rb')
+        else:
+            fh = open(dataset_path, 'r', newline='')
+        self.parse(csv.reader(fh, delimiter='\t'))
 
     def parse(self, reader):
         self.chromosomes = {}
@@ -39,11 +44,10 @@
                 continue
             cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
             attrs = parse_gff_attrs(attrs)
-            distance = attrs['cw_distance']
+            distance = int(attrs['cw_distance'])
             mid = int(mid)
             midplus = int(midplus)
             value = float(value)
-            distance = int(distance)
             if cname not in self.chromosomes:
                 self.chromosomes[cname] = Chromosome(cname)
             chrom = self.chromosomes[cname]
@@ -107,11 +111,11 @@
 
     @property
     def chrom(self):
-        return self.peaks.values()[0].chrom
+        return list(self.peaks.values())[0].chrom
 
     @property
     def midpoint(self):
-        return median([peak.midpoint for peak in self.peaks.values()])
+        return int(median([peak.midpoint for peak in self.peaks.values()]))
 
     @property
     def num_replicates(self):
@@ -119,7 +123,7 @@
 
     @property
     def median_distance(self):
-        return median([peak.distance for peak in self.peaks.values()])
+        return int(median([peak.distance for peak in self.peaks.values()]))
 
     @property
     def value_sum(self):
@@ -133,7 +137,7 @@
 
     @property
     def peakpeak_distance(self):
-        keys = self.peaks.keys()
+        keys = list(self.peaks.keys())
         return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
 
 
@@ -187,8 +191,8 @@
     Returns a window of all peaks from a replicate within a certain distance of
     a peak from another replicate.
     """
-    lower = target_peaks[0].midpoint
-    upper = target_peaks[0].midpoint
+    lower = list(target_peaks)[0].midpoint
+    upper = list(target_peaks)[0].midpoint
     for peak in target_peaks:
         lower = min(lower, peak.midpoint - distance)
         upper = max(upper, peak.midpoint + distance)
@@ -234,10 +238,10 @@
 METHODS = {'closest': match_closest, 'largest': match_largest}
 
 
-def gff_attrs(d):
-    if not d:
+def gff_attrs(l):
+    if len(l) == 0:
         return '.'
-    return ';'.join('%s=%s' % item for item in d.items())
+    return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l)
 
 
 def parse_gff_attrs(s):
@@ -250,8 +254,8 @@
     return d
 
 
-def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
-    return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
+def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None):
+    return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or []))
 
 
 def get_temporary_plot_path():
@@ -321,7 +325,12 @@
 
     def td_writer(file_path):
         # Returns a tab-delimited writer for a certain output
-        return csv.writer(open(file_path, 'wt'), delimiter='\t')
+        if ISPY2:
+            fh = open(file_path, 'wb')
+            return csv.writer(fh, delimiter='\t')
+        else:
+            fh = open(file_path, 'w', newline='')
+            return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
 
     labels = ('chrom',
               'median midpoint',
@@ -363,7 +372,7 @@
             # Iterate over each replicate as "main"
             main = reps[0]
             reps.remove(main)
-            for chromosome in main.chromosomes.values():
+            for chromosome in list(main.chromosomes.values()):
                 peaks_by_value = chromosome.peaks[:]
                 # Sort main replicate by value
                 peaks_by_value.sort(key=lambda peak: -peak.value)
@@ -379,9 +388,7 @@
                                 continue
                             try:
                                 # Lines changed to remove a major bug by Rohit Reja.
-                                window, chrum = get_window(replicate.chromosomes[chromosome.name],
-                                                           group.peaks.values(),
-                                                           distance)
+                                window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance)
                                 match = METHODS[method](window, peak, chrum)
                             except KeyError:
                                 continue
@@ -392,9 +399,9 @@
                             break
                 # Attempt to enlarge existing peak groups
                 for group in peak_groups:
-                    old_peaks = group.peaks.values()[:]
+                    old_peaks = list(group.peaks.values())
                     search_for_matches(group)
-                    for peak in group.peaks.values():
+                    for peak in list(group.peaks.values()):
                         if peak not in old_peaks:
                             peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
                 # Attempt to find new peaks groups.  For each peak in the
@@ -405,7 +412,7 @@
                     search_for_matches(matches)
                     # Were enough replicates matched?
                     if matches.num_replicates >= num_required:
-                        for peak in matches.peaks.values():
+                        for peak in list(matches.peaks.values()):
                             peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
                         peak_groups.append(matches)
     # Zero or less = no stepping
@@ -432,11 +439,14 @@
         matched_peaks_output.writerow(gff_row(cname=group.chrom,
                                               start=group.midpoint,
                                               end=group.midpoint + 1,
+                                              score=group.normalized_value(med),
                                               source='repmatch',
-                                              score=group.normalized_value(med),
-                                              attrs={'median_distance': group.median_distance,
-                                                     'replicates': group.num_replicates,
-                                                     'value_sum': group.value_sum}))
+                                              stype='.',
+                                              strand='.',
+                                              phase='.',
+                                              attrs=[('median_distance', group.median_distance),
+                                                     ('value_sum', group.value_sum),
+                                                     ('replicates', group.num_replicates)]))
         if output_detail_file:
             matched_peaks = (group.chrom,
                              group.midpoint,
--- a/test-data/detail_out1.tabular	Fri Jan 13 10:52:02 2017 -0500
+++ b/test-data/detail_out1.tabular	Wed Jul 12 10:11:53 2017 -0400
@@ -24,7 +24,7 @@
 chr1	87029	87030	1009.0689873417721	2	1	2025.0	chr1	87044	87045	1191.0	30	1	chr1	87015	87016	834.0	-28	2
 chr1	25305	25306	1178.1332278481013	2	99	2366.0	chr1	25305	25306	1183.0	99	1	chr1	25305	25306	1183.0	99	2
 chr1	139230	139231	1174.1496835443038	2	15	2358.0	chr1	139230	139231	1179.0	15	1	chr1	139230	139231	1179.0	15	2
-chr1	335	336	1173.125	2	-5	2356.0	chr1	325	326	1171.0	16	1	chr1	345	346	1185.0	-25	2
+chr1	335	336	1173.125	2	-4	2356.0	chr1	325	326	1171.0	16	1	chr1	345	346	1185.0	-25	2
 chr1	55548	55549	952.067088607595	2	86	1912.0	chr1	55548	55549	956.0	86	1	chr1	55548	55549	956.0	86	2
 chr1	360	361	888.3591772151899	2	45	1784.0	chr1	370	371	899.0	25	1	chr1	350	351	885.0	66	2
 chr1	72795	72796	961.6268987341772	2	9	1932.0	chr1	72805	72806	869.0	29	1	chr1	72786	72787	1063.0	-10	2
@@ -58,8 +58,8 @@
 chr1	668	669	317.6876582278481	2	-48	638.0	chr1	668	669	319.0	-48	1	chr1	668	669	319.0	-48	2
 chr1	98551	98552	121.49810126582278	2	27	244.0	chr1	98551	98552	122.0	27	1	chr1	98551	98552	122.0	27	2
 chr1	180331	180332	96.60094936708862	2	82	194.0	chr1	180331	180332	97.0	82	1	chr1	180331	180332	97.0	82	2
-chr1	230172	230173	42.87658227848101	2	-2	86.0	chr1	230178	230179	56.0	10	1	chr1	230166	230167	30.0	-13	2
-chr1	230133	230134	26.95886075949367	2	-8	54.0	chr1	230125	230126	44.0	10	1	chr1	230142	230143	10.0	-25	2
+chr1	230172	230173	42.87658227848101	2	-1	86.0	chr1	230178	230179	56.0	10	1	chr1	230166	230167	30.0	-13	2
+chr1	230133	230134	26.95886075949367	2	-7	54.0	chr1	230125	230126	44.0	10	1	chr1	230142	230143	10.0	-25	2
 chr1	230154	230155	44.69145569620253	2	34	90.0	chr1	230157	230158	15.0	5	1	chr1	230151	230152	75.0	63	2
 chr1	96919	96920	11.950632911392404	2	3	24.0	chr1	96919	96920	12.0	3	1	chr1	96919	96920	12.0	3	2
 chr1	197535	197536	4.9794303797468356	2	73	10.0	chr1	197535	197536	5.0	73	1	chr1	197535	197536	5.0	73	2
--- a/test-data/matched_peaks_out1.gff	Fri Jan 13 10:52:02 2017 -0500
+++ b/test-data/matched_peaks_out1.gff	Wed Jul 12 10:11:53 2017 -0400
@@ -23,7 +23,7 @@
 chr1	repmatch	.	87029	87030	1009.0689873417721	.	.	median_distance=1;value_sum=2025.0;replicates=2
 chr1	repmatch	.	25305	25306	1178.1332278481013	.	.	median_distance=99;value_sum=2366.0;replicates=2
 chr1	repmatch	.	139230	139231	1174.1496835443038	.	.	median_distance=15;value_sum=2358.0;replicates=2
-chr1	repmatch	.	335	336	1173.125	.	.	median_distance=-5;value_sum=2356.0;replicates=2
+chr1	repmatch	.	335	336	1173.125	.	.	median_distance=-4;value_sum=2356.0;replicates=2
 chr1	repmatch	.	55548	55549	952.067088607595	.	.	median_distance=86;value_sum=1912.0;replicates=2
 chr1	repmatch	.	360	361	888.3591772151899	.	.	median_distance=45;value_sum=1784.0;replicates=2
 chr1	repmatch	.	72795	72796	961.6268987341772	.	.	median_distance=9;value_sum=1932.0;replicates=2
@@ -57,8 +57,8 @@
 chr1	repmatch	.	668	669	317.6876582278481	.	.	median_distance=-48;value_sum=638.0;replicates=2
 chr1	repmatch	.	98551	98552	121.49810126582278	.	.	median_distance=27;value_sum=244.0;replicates=2
 chr1	repmatch	.	180331	180332	96.60094936708862	.	.	median_distance=82;value_sum=194.0;replicates=2
-chr1	repmatch	.	230172	230173	42.87658227848101	.	.	median_distance=-2;value_sum=86.0;replicates=2
-chr1	repmatch	.	230133	230134	26.95886075949367	.	.	median_distance=-8;value_sum=54.0;replicates=2
+chr1	repmatch	.	230172	230173	42.87658227848101	.	.	median_distance=-1;value_sum=86.0;replicates=2
+chr1	repmatch	.	230133	230134	26.95886075949367	.	.	median_distance=-7;value_sum=54.0;replicates=2
 chr1	repmatch	.	230154	230155	44.69145569620253	.	.	median_distance=34;value_sum=90.0;replicates=2
 chr1	repmatch	.	96919	96920	11.950632911392404	.	.	median_distance=3;value_sum=24.0;replicates=2
 chr1	repmatch	.	197535	197536	4.9794303797468356	.	.	median_distance=73;value_sum=10.0;replicates=2