diff repmatch_gff3_util.py @ 4:6acaa2c93f47 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author iuc
date Wed, 12 Jul 2017 10:11:53 -0400
parents e5c7fffdc078
children 2365720de36d
line wrap: on
line diff
--- a/repmatch_gff3_util.py	Fri Jan 13 10:52:02 2017 -0500
+++ b/repmatch_gff3_util.py	Wed Jul 12 10:11:53 2017 -0400
@@ -23,6 +23,7 @@
 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
 
 COLORS = 'krb'
+ISPY2 = sys.version_info[0] == 2
 
 
 class Replicate(object):
@@ -30,7 +31,11 @@
     def __init__(self, id, dataset_path):
         self.id = id
         self.dataset_path = dataset_path
-        self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
+        if ISPY2:
+            fh = open(dataset_path, 'rb')
+        else:
+            fh = open(dataset_path, 'r', newline='')
+        self.parse(csv.reader(fh, delimiter='\t'))
 
     def parse(self, reader):
         self.chromosomes = {}
@@ -39,11 +44,10 @@
                 continue
             cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
             attrs = parse_gff_attrs(attrs)
-            distance = attrs['cw_distance']
+            distance = int(attrs['cw_distance'])
             mid = int(mid)
             midplus = int(midplus)
             value = float(value)
-            distance = int(distance)
             if cname not in self.chromosomes:
                 self.chromosomes[cname] = Chromosome(cname)
             chrom = self.chromosomes[cname]
@@ -107,11 +111,11 @@
 
     @property
     def chrom(self):
-        return self.peaks.values()[0].chrom
+        return list(self.peaks.values())[0].chrom
 
     @property
     def midpoint(self):
-        return median([peak.midpoint for peak in self.peaks.values()])
+        return int(median([peak.midpoint for peak in self.peaks.values()]))
 
     @property
     def num_replicates(self):
@@ -119,7 +123,7 @@
 
     @property
     def median_distance(self):
-        return median([peak.distance for peak in self.peaks.values()])
+        return int(median([peak.distance for peak in self.peaks.values()]))
 
     @property
     def value_sum(self):
@@ -133,7 +137,7 @@
 
     @property
     def peakpeak_distance(self):
-        keys = self.peaks.keys()
+        keys = list(self.peaks.keys())
         return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
 
 
@@ -187,8 +191,8 @@
     Returns a window of all peaks from a replicate within a certain distance of
     a peak from another replicate.
     """
-    lower = target_peaks[0].midpoint
-    upper = target_peaks[0].midpoint
+    lower = list(target_peaks)[0].midpoint
+    upper = list(target_peaks)[0].midpoint
     for peak in target_peaks:
         lower = min(lower, peak.midpoint - distance)
         upper = max(upper, peak.midpoint + distance)
@@ -234,10 +238,10 @@
 METHODS = {'closest': match_closest, 'largest': match_largest}
 
 
-def gff_attrs(d):
-    if not d:
+def gff_attrs(l):
+    if len(l) == 0:
         return '.'
-    return ';'.join('%s=%s' % item for item in d.items())
+    return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l)
 
 
 def parse_gff_attrs(s):
@@ -250,8 +254,8 @@
     return d
 
 
-def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
-    return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
+def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None):
+    return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or []))
 
 
 def get_temporary_plot_path():
@@ -321,7 +325,12 @@
 
     def td_writer(file_path):
         # Returns a tab-delimited writer for a certain output
-        return csv.writer(open(file_path, 'wt'), delimiter='\t')
+        if ISPY2:
+            fh = open(file_path, 'wb')
+            return csv.writer(fh, delimiter='\t')
+        else:
+            fh = open(file_path, 'w', newline='')
+            return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
 
     labels = ('chrom',
               'median midpoint',
@@ -363,7 +372,7 @@
             # Iterate over each replicate as "main"
             main = reps[0]
             reps.remove(main)
-            for chromosome in main.chromosomes.values():
+            for chromosome in list(main.chromosomes.values()):
                 peaks_by_value = chromosome.peaks[:]
                 # Sort main replicate by value
                 peaks_by_value.sort(key=lambda peak: -peak.value)
@@ -379,9 +388,7 @@
                                 continue
                             try:
                                 # Lines changed to remove a major bug by Rohit Reja.
-                                window, chrum = get_window(replicate.chromosomes[chromosome.name],
-                                                           group.peaks.values(),
-                                                           distance)
+                                window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance)
                                 match = METHODS[method](window, peak, chrum)
                             except KeyError:
                                 continue
@@ -392,9 +399,9 @@
                             break
                 # Attempt to enlarge existing peak groups
                 for group in peak_groups:
-                    old_peaks = group.peaks.values()[:]
+                    old_peaks = list(group.peaks.values())
                     search_for_matches(group)
-                    for peak in group.peaks.values():
+                    for peak in list(group.peaks.values()):
                         if peak not in old_peaks:
                             peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
                 # Attempt to find new peaks groups.  For each peak in the
@@ -405,7 +412,7 @@
                     search_for_matches(matches)
                     # Were enough replicates matched?
                     if matches.num_replicates >= num_required:
-                        for peak in matches.peaks.values():
+                        for peak in list(matches.peaks.values()):
                             peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
                         peak_groups.append(matches)
     # Zero or less = no stepping
@@ -432,11 +439,14 @@
         matched_peaks_output.writerow(gff_row(cname=group.chrom,
                                               start=group.midpoint,
                                               end=group.midpoint + 1,
+                                              score=group.normalized_value(med),
                                               source='repmatch',
-                                              score=group.normalized_value(med),
-                                              attrs={'median_distance': group.median_distance,
-                                                     'replicates': group.num_replicates,
-                                                     'value_sum': group.value_sum}))
+                                              stype='.',
+                                              strand='.',
+                                              phase='.',
+                                              attrs=[('median_distance', group.median_distance),
+                                                     ('value_sum', group.value_sum),
+                                                     ('replicates', group.num_replicates)]))
         if output_detail_file:
             matched_peaks = (group.chrom,
                              group.midpoint,