Mercurial > repos > iuc > repmatch_gff3
changeset 4:6acaa2c93f47 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author | iuc |
---|---|
date | Wed, 12 Jul 2017 10:11:53 -0400 |
parents | f7608d0363bf |
children | 2365720de36d |
files | repmatch_gff3.py repmatch_gff3.xml repmatch_gff3_util.py test-data/detail_out1.tabular test-data/matched_peaks_out1.gff |
diffstat | 5 files changed, 68 insertions(+), 65 deletions(-) [+] |
line wrap: on
line diff
--- a/repmatch_gff3.py Fri Jan 13 10:52:02 2017 -0500 +++ b/repmatch_gff3.py Wed Jul 12 10:11:53 2017 -0400 @@ -8,6 +8,7 @@ # Files: statistics_table.tabular (file to replicate ID), matched_paired_peaks.tabular, detail.tabular, unmatched_peaks.tabular import argparse + import repmatch_gff3_util if __name__ == '__main__':
--- a/repmatch_gff3.xml Fri Jan 13 10:52:02 2017 -0500 +++ b/repmatch_gff3.xml Wed Jul 12 10:11:53 2017 -0400 @@ -1,42 +1,34 @@ <?xml version="1.0"?> -<tool id="repmatch_gff3" name="RepMatch" version="@WRAPPER_VERSION@.1"> +<tool id="repmatch_gff3" name="RepMatch" version="@WRAPPER_VERSION@.2"> <description>Match paired peaks from two or more replicates</description> <macros> <import>repmatch_gff3_macros.xml</import> </macros> <expand macro="requirements" /> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range=":-1" /> - <exit_code range="1:" /> - <!-- Check stderr in case the return code has not been set --> - <regex match="Error:" /> - <regex match="Exception:" /> - </stdio> - <command> - python $__tool_directory__/repmatch_gff3.py - #for $i in $input: - --input "${i}" "${i.hid}" - #end for - --method $method - --distance $distance - --replicates $replicates - --output_files $output_files_cond.output_files - --output_matched_peaks "$output_matched_peaks" - #if str($output_files_cond.output_files) in ["all", "matched_peaks_unmatched_peaks"]: - --output_unmatched_peaks "$output_unmatched_peaks" - #end if - #if str($output_files_cond.output_files) =="all": - --output_detail "$output_detail" - --output_statistics_table "$output_statistics_table" - --output_statistics_histogram "$output_statistics_histogram" - #end if - #if str($advanced_options_cond.advanced_options) == "on": - --step $advanced_options_cond.step - --low_limit $advanced_options_cond.low_limit - --up_limit $advanced_options_cond.up_limit - #end if - </command> + <command detect_errors="aggressive"><![CDATA[ +python '$__tool_directory__/repmatch_gff3.py' +#for $i in $input: + --input '${i}' '${i.hid}' +#end for +--method $method +--distance $distance +--replicates $replicates +--output_files $output_files_cond.output_files +--output_matched_peaks '$output_matched_peaks' +#if str($output_files_cond.output_files) in ['all', 'matched_peaks_unmatched_peaks']: + --output_unmatched_peaks '$output_unmatched_peaks' +#end if +#if str($output_files_cond.output_files) == 'all': + --output_detail '$output_detail' + --output_statistics_table '$output_statistics_table' + --output_statistics_histogram '$output_statistics_histogram' +#end if +#if str($advanced_options_cond.advanced_options) == 'on': + --step $advanced_options_cond.step + --low_limit $advanced_options_cond.low_limit + --up_limit $advanced_options_cond.up_limit +#end if + ]]></command> <inputs> <param name="input" type="data" format="gff" multiple="True" min="2" label="Match paired peaks on" /> <param name="method" type="select" label="Method of finding match">
--- a/repmatch_gff3_util.py Fri Jan 13 10:52:02 2017 -0500 +++ b/repmatch_gff3_util.py Wed Jul 12 10:11:53 2017 -0400 @@ -23,6 +23,7 @@ pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) COLORS = 'krb' +ISPY2 = sys.version_info[0] == 2 class Replicate(object): @@ -30,7 +31,11 @@ def __init__(self, id, dataset_path): self.id = id self.dataset_path = dataset_path - self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) + if ISPY2: + fh = open(dataset_path, 'rb') + else: + fh = open(dataset_path, 'r', newline='') + self.parse(csv.reader(fh, delimiter='\t')) def parse(self, reader): self.chromosomes = {} @@ -39,11 +44,10 @@ continue cname, junk, junk, mid, midplus, value, strand, junk, attrs = line attrs = parse_gff_attrs(attrs) - distance = attrs['cw_distance'] + distance = int(attrs['cw_distance']) mid = int(mid) midplus = int(midplus) value = float(value) - distance = int(distance) if cname not in self.chromosomes: self.chromosomes[cname] = Chromosome(cname) chrom = self.chromosomes[cname] @@ -107,11 +111,11 @@ @property def chrom(self): - return self.peaks.values()[0].chrom + return list(self.peaks.values())[0].chrom @property def midpoint(self): - return median([peak.midpoint for peak in self.peaks.values()]) + return int(median([peak.midpoint for peak in self.peaks.values()])) @property def num_replicates(self): @@ -119,7 +123,7 @@ @property def median_distance(self): - return median([peak.distance for peak in self.peaks.values()]) + return int(median([peak.distance for peak in self.peaks.values()])) @property def value_sum(self): @@ -133,7 +137,7 @@ @property def peakpeak_distance(self): - keys = self.peaks.keys() + keys = list(self.peaks.keys()) return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) @@ -187,8 +191,8 @@ Returns a window of all peaks from a replicate within a certain distance of a peak from another replicate. """ - lower = target_peaks[0].midpoint - upper = target_peaks[0].midpoint + lower = list(target_peaks)[0].midpoint + upper = list(target_peaks)[0].midpoint for peak in target_peaks: lower = min(lower, peak.midpoint - distance) upper = max(upper, peak.midpoint + distance) @@ -234,10 +238,10 @@ METHODS = {'closest': match_closest, 'largest': match_largest} -def gff_attrs(d): - if not d: +def gff_attrs(l): + if len(l) == 0: return '.' - return ';'.join('%s=%s' % item for item in d.items()) + return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l) def parse_gff_attrs(s): @@ -250,8 +254,8 @@ return d -def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): - return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) +def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None): + return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or [])) def get_temporary_plot_path(): @@ -321,7 +325,12 @@ def td_writer(file_path): # Returns a tab-delimited writer for a certain output - return csv.writer(open(file_path, 'wt'), delimiter='\t') + if ISPY2: + fh = open(file_path, 'wb') + return csv.writer(fh, delimiter='\t') + else: + fh = open(file_path, 'w', newline='') + return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE) labels = ('chrom', 'median midpoint', @@ -363,7 +372,7 @@ # Iterate over each replicate as "main" main = reps[0] reps.remove(main) - for chromosome in main.chromosomes.values(): + for chromosome in list(main.chromosomes.values()): peaks_by_value = chromosome.peaks[:] # Sort main replicate by value peaks_by_value.sort(key=lambda peak: -peak.value) @@ -379,9 +388,7 @@ continue try: # Lines changed to remove a major bug by Rohit Reja. - window, chrum = get_window(replicate.chromosomes[chromosome.name], - group.peaks.values(), - distance) + window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance) match = METHODS[method](window, peak, chrum) except KeyError: continue @@ -392,9 +399,9 @@ break # Attempt to enlarge existing peak groups for group in peak_groups: - old_peaks = group.peaks.values()[:] + old_peaks = list(group.peaks.values()) search_for_matches(group) - for peak in group.peaks.values(): + for peak in list(group.peaks.values()): if peak not in old_peaks: peak.replicate.chromosomes[chromosome.name].remove_peak(peak) # Attempt to find new peaks groups. For each peak in the @@ -405,7 +412,7 @@ search_for_matches(matches) # Were enough replicates matched? if matches.num_replicates >= num_required: - for peak in matches.peaks.values(): + for peak in list(matches.peaks.values()): peak.replicate.chromosomes[chromosome.name].remove_peak(peak) peak_groups.append(matches) # Zero or less = no stepping @@ -432,11 +439,14 @@ matched_peaks_output.writerow(gff_row(cname=group.chrom, start=group.midpoint, end=group.midpoint + 1, + score=group.normalized_value(med), source='repmatch', - score=group.normalized_value(med), - attrs={'median_distance': group.median_distance, - 'replicates': group.num_replicates, - 'value_sum': group.value_sum})) + stype='.', + strand='.', + phase='.', + attrs=[('median_distance', group.median_distance), + ('value_sum', group.value_sum), + ('replicates', group.num_replicates)])) if output_detail_file: matched_peaks = (group.chrom, group.midpoint,
--- a/test-data/detail_out1.tabular Fri Jan 13 10:52:02 2017 -0500 +++ b/test-data/detail_out1.tabular Wed Jul 12 10:11:53 2017 -0400 @@ -24,7 +24,7 @@ chr1 87029 87030 1009.0689873417721 2 1 2025.0 chr1 87044 87045 1191.0 30 1 chr1 87015 87016 834.0 -28 2 chr1 25305 25306 1178.1332278481013 2 99 2366.0 chr1 25305 25306 1183.0 99 1 chr1 25305 25306 1183.0 99 2 chr1 139230 139231 1174.1496835443038 2 15 2358.0 chr1 139230 139231 1179.0 15 1 chr1 139230 139231 1179.0 15 2 -chr1 335 336 1173.125 2 -5 2356.0 chr1 325 326 1171.0 16 1 chr1 345 346 1185.0 -25 2 +chr1 335 336 1173.125 2 -4 2356.0 chr1 325 326 1171.0 16 1 chr1 345 346 1185.0 -25 2 chr1 55548 55549 952.067088607595 2 86 1912.0 chr1 55548 55549 956.0 86 1 chr1 55548 55549 956.0 86 2 chr1 360 361 888.3591772151899 2 45 1784.0 chr1 370 371 899.0 25 1 chr1 350 351 885.0 66 2 chr1 72795 72796 961.6268987341772 2 9 1932.0 chr1 72805 72806 869.0 29 1 chr1 72786 72787 1063.0 -10 2 @@ -58,8 +58,8 @@ chr1 668 669 317.6876582278481 2 -48 638.0 chr1 668 669 319.0 -48 1 chr1 668 669 319.0 -48 2 chr1 98551 98552 121.49810126582278 2 27 244.0 chr1 98551 98552 122.0 27 1 chr1 98551 98552 122.0 27 2 chr1 180331 180332 96.60094936708862 2 82 194.0 chr1 180331 180332 97.0 82 1 chr1 180331 180332 97.0 82 2 -chr1 230172 230173 42.87658227848101 2 -2 86.0 chr1 230178 230179 56.0 10 1 chr1 230166 230167 30.0 -13 2 -chr1 230133 230134 26.95886075949367 2 -8 54.0 chr1 230125 230126 44.0 10 1 chr1 230142 230143 10.0 -25 2 +chr1 230172 230173 42.87658227848101 2 -1 86.0 chr1 230178 230179 56.0 10 1 chr1 230166 230167 30.0 -13 2 +chr1 230133 230134 26.95886075949367 2 -7 54.0 chr1 230125 230126 44.0 10 1 chr1 230142 230143 10.0 -25 2 chr1 230154 230155 44.69145569620253 2 34 90.0 chr1 230157 230158 15.0 5 1 chr1 230151 230152 75.0 63 2 chr1 96919 96920 11.950632911392404 2 3 24.0 chr1 96919 96920 12.0 3 1 chr1 96919 96920 12.0 3 2 chr1 197535 197536 4.9794303797468356 2 73 10.0 chr1 197535 197536 5.0 73 1 chr1 197535 197536 5.0 73 2
--- a/test-data/matched_peaks_out1.gff Fri Jan 13 10:52:02 2017 -0500 +++ b/test-data/matched_peaks_out1.gff Wed Jul 12 10:11:53 2017 -0400 @@ -23,7 +23,7 @@ chr1 repmatch . 87029 87030 1009.0689873417721 . . median_distance=1;value_sum=2025.0;replicates=2 chr1 repmatch . 25305 25306 1178.1332278481013 . . median_distance=99;value_sum=2366.0;replicates=2 chr1 repmatch . 139230 139231 1174.1496835443038 . . median_distance=15;value_sum=2358.0;replicates=2 -chr1 repmatch . 335 336 1173.125 . . median_distance=-5;value_sum=2356.0;replicates=2 +chr1 repmatch . 335 336 1173.125 . . median_distance=-4;value_sum=2356.0;replicates=2 chr1 repmatch . 55548 55549 952.067088607595 . . median_distance=86;value_sum=1912.0;replicates=2 chr1 repmatch . 360 361 888.3591772151899 . . median_distance=45;value_sum=1784.0;replicates=2 chr1 repmatch . 72795 72796 961.6268987341772 . . median_distance=9;value_sum=1932.0;replicates=2 @@ -57,8 +57,8 @@ chr1 repmatch . 668 669 317.6876582278481 . . median_distance=-48;value_sum=638.0;replicates=2 chr1 repmatch . 98551 98552 121.49810126582278 . . median_distance=27;value_sum=244.0;replicates=2 chr1 repmatch . 180331 180332 96.60094936708862 . . median_distance=82;value_sum=194.0;replicates=2 -chr1 repmatch . 230172 230173 42.87658227848101 . . median_distance=-2;value_sum=86.0;replicates=2 -chr1 repmatch . 230133 230134 26.95886075949367 . . median_distance=-8;value_sum=54.0;replicates=2 +chr1 repmatch . 230172 230173 42.87658227848101 . . median_distance=-1;value_sum=86.0;replicates=2 +chr1 repmatch . 230133 230134 26.95886075949367 . . median_distance=-7;value_sum=54.0;replicates=2 chr1 repmatch . 230154 230155 44.69145569620253 . . median_distance=34;value_sum=90.0;replicates=2 chr1 repmatch . 96919 96920 11.950632911392404 . . median_distance=3;value_sum=24.0;replicates=2 chr1 repmatch . 197535 197536 4.9794303797468356 . . median_distance=73;value_sum=10.0;replicates=2