Mercurial > repos > iuc > resize_coordinate_window
changeset 1:0164d2edba9f draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/resize_coordinate_window commit 7aa2429d3f53a14be7e44dc6021ed3e11dc2f080
author | iuc |
---|---|
date | Tue, 16 Feb 2016 04:05:23 -0500 |
parents | 08b6255afde7 |
children | 541f300f322d |
files | resize_coordinate_window.py resize_coordinate_window.xml test-data/output_discard.gff |
diffstat | 3 files changed, 206 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
--- a/resize_coordinate_window.py Tue Jan 19 09:34:56 2016 -0500 +++ b/resize_coordinate_window.py Tue Feb 16 04:05:23 2016 -0500 @@ -1,41 +1,88 @@ import argparse +import fileinput import sys +# Maximum value of a signed 32 bit integer (2**31 - 1). +MAX_CHROM_LEN = 2147483647 -def stop_err( msg ): - sys.stderr.write( msg ) + +def stop_err(msg): + sys.stderr.write(msg) sys.exit(1) parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help="Input dataset") +parser.add_argument('--start_coordinate', dest='start_coordinate', type=int, help='Chromosome start coordinate, either 0 or 1.') parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.') parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.') -parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.') +parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end instead of from computed midpoint.') +parser.add_argument('--chrom_len_file', dest='chrom_len_file', help="File names of .len files for chromosome lengths") +parser.add_argument('--region_boundaries', dest='region_boundaries', help="Option for handling region boundaries") parser.add_argument('--output', dest='output', help="Output dataset") args = parser.parse_args() extend_existing = args.extend_existing == 'existing' out = open(args.output, 'wb') -for line in open(args.input): - if line.startswith('#'): - continue - items = line.split('\t') - if len(items) != 9: - continue - start = int(items[3]) - end = int(items[4]) - if extend_existing: - start -= args.subtract_from_start - end += args.add_to_end - else: - midpoint = (start + end) // 2 - start = midpoint - args.subtract_from_start - end = midpoint + args.add_to_end - if start < 1: - out.close() - stop_err('Requested expansion places region beyond chromosome bounds.') - new_line = '\t'.join([items[0], items[1], items[2], str(start), str(end), items[5], items[6], items[7], items[8]]) - out.write(new_line) +chrom_start = int(args.start_coordinate) +chrom_lens = dict() +# Determine the length of each chromosome and add it to the chrom_lens dictionary. +len_file_missing = False +len_file_error = None +len_file = fileinput.FileInput(args.chrom_len_file) +try: + for line in len_file: + fields = line.split("\t") + chrom_lens[fields[0]] = int(fields[1]) +except Exception, e: + len_file_error = str(e) + +with open(args.input) as fhi: + for line in fhi: + if line.startswith('#'): + # Skip comments. + continue + items = line.split('\t') + if len(items) != 9: + # Skip invalid gff data. + continue + chrom = items[0] + start = int(items[3]) + end = int(items[4]) + if extend_existing: + new_start = start - args.subtract_from_start + new_end = end + args.add_to_end + else: + midpoint = (start + end) // 2 + new_start = midpoint - args.subtract_from_start + new_end = midpoint + args.add_to_end + # Check start boundary. + if new_start < chrom_start: + if args.region_boundaries == 'discard': + continue + elif args.region_boundaries == 'limit': + new_start = chrom_start + elif args.region_boundaries == 'error': + out.close() + stop_err('Requested expansion places region beyond chromosome start boundary of %d.' % chrom_start) + # Check end boundary. + chrom_len = chrom_lens.get(chrom, None) + if chrom_len is None: + len_file_missing = True + chrom_len = MAX_CHROM_LEN + if new_end > chrom_len: + if args.region_boundaries == 'discard': + continue + elif args.region_boundaries == 'limit': + new_end = chrom_len + elif args.region_boundaries == 'error': + out.close() + stop_err('Requested expansion places region beyond chromosome end boundary of %d.' % chrom_len) + new_line = '\t'.join([chrom, items[1], items[2], str(new_start), str(new_end), items[5], items[6], items[7], items[8]]) + out.write(new_line) out.close() +if len_file_error is not None: + print "All chrom lengths set to %d, error in chrom len file: %s" % (MAX_CHROM_LEN, len_file_error) +if len_file_missing: + print "All chrom lengths set to %d, chrom len files are not installed." % MAX_CHROM_LEN
--- a/resize_coordinate_window.xml Tue Jan 19 09:34:56 2016 -0500 +++ b/resize_coordinate_window.xml Tue Feb 16 04:05:23 2016 -0500 @@ -1,21 +1,34 @@ -<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.0"> +<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.1"> <description>of GFF data</description> <command> python $__tool_directory__/resize_coordinate_window.py --input "$input" + --start_coordinate $start_coordinate --subtract_from_start $subtract_from_start --add_to_end $add_to_end --extend_existing $extend_existing + --chrom_len_file ${chromInfo} + --region_boundaries $region_boundaries --output "$output" </command> <inputs> <param name="input" type="data" format="gff" label="Gff file" /> + <param name="start_coordinate" type="select" label="Start coordinate" help="Input data is 0-based or 1-based"> + <option value="0" selected="True">0</option> + <option value="1">1</option> + </param> <param name="subtract_from_start" type="integer" value="30" min="0" label="Distance to subtract from the start coordinate"/> <param name="add_to_end" type="integer" value="30" min="0" label="Distance to add to the end coordinate"/> <param name="extend_existing" type="select" label="Resize window from" help="The midpoint is computed as (start + end) // 2"> <option value="midpoint" selected="True">the midpoint of the start and end coordinates</option> <option value="existing">the start and end coordinates</option> </param> + <param name="region_boundaries" type="select" label="Handle chromosome boundaries by" help="Expanding the region may result in crossing chromosome start and end coordinate boundaries."> + <option value="discard" selected="True">discarding the region</option> + <option value="limit">keeping the region by limiting the expansion to not cross the start or end coordinate boundary</option> + <option value="nothing">keeping the region by allowing the expansion to cross the start or end coordinate boundary</option> + <option value="error">outputting an error</option> + </param> </inputs> <outputs> <data name="output" format="gff" /> @@ -23,20 +36,43 @@ <tests> <test> <param name="input" value="input.gff" ftype="gff" /> + <param name="start_coordinate" value="1" /> <param name="subtract_from_start" value="13" /> <param name="add_to_end" value="13" /> <param name="extend_existing" value="midpoint" /> + <param name="region_boundaries" value="error" /> <output name="output" file="output.gff" ftype="gff" /> </test> + <test> + <param name="input" value="input.gff" ftype="gff" /> + <param name="start_coordinate" value="0" /> + <param name="subtract_from_start" value="80" /> + <param name="add_to_end" value="80" /> + <param name="extend_existing" value="midpoint" /> + <param name="region_boundaries" value="discard" /> + <output name="output" file="output_discard.gff" ftype="gff" /> + </test> </tests> <help> **What it does** Modifies the start and end coordinates of GFF data such that the new start and end position is based on a -specified window size that is computed either from the existing start and end coordinates or centered on +specified region size that is computed either from the existing start and end coordinates or centered on the midpoint between them. +Region expansion may result in the new start or end coordinates crossing the chromosome boundary. The +chromosome start is set to 0 or 1 using the **Start coordinate** parameter. The end is retrieved from a +file within the Galaxy environment that includes the length of chromosomes for all genome builds. If these +files are missing, the end coordinate is set to 2147483647, which is the maximum value of a signed 32 bit +integer. The **Handle chromosome boundaries by** parameter handles chromosome boundaries that are crossed +by expanding the region using one of the following options. + +* **discarding the region** - the region will be discarded and processing will continue with the next line in the dataset. +* **keeping the region by limiting the expansion to not cross the start or end coordinate boundary** - expansion will be restricted to not cross the chromosome's start or end coordinates for the current region. +* **keeping the region by allowing the expansion to cross the start or end coordinate boundary** - allow defined expansion, crossing the start boundary results in a negative start value. +* **outputting an error** - Stop processing and display an error. + ----- **Example** @@ -47,7 +83,7 @@ chr1 genetrack . 31 51 245 - . stddev=2.66582799529 chr1 genetrack . 40 60 2060 + . stddev=2.7859667372 -Resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces:: +Setting start coordinate to 1 and resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces:: chr1 genetrack . 14 40 918 + . stddev=5.96715849116 chr1 genetrack . 28 54 245 - . stddev=2.66582799529` @@ -57,7 +93,7 @@ <citations> <citation type="bibtex"> @unpublished{None, - author = {}, + author = {Greg Von Kuster}, title = {None}, year = {None}, eprint = {None},
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_discard.gff Tue Feb 16 04:05:23 2016 -0500 @@ -0,0 +1,96 @@ +chr1 genetrack . 3 163 397 + . stddev=0.0 +chr1 genetrack . 19 179 521 + . stddev=0.747112137937 +chr1 genetrack . 53 213 5129 + . stddev=3.01025384354 +chr1 genetrack . 55 215 4659 - . stddev=3.8642622228 +chr1 genetrack . 85 245 897 - . stddev=3.22709952671 +chr1 genetrack . 101 261 956 - . stddev=4.95899971687 +chr1 genetrack . 110 270 1527 + . stddev=4.62574275346 +chr1 genetrack . 115 275 494 - . stddev=1.4255957 +chr1 genetrack . 122 282 2538 + . stddev=5.04731591122 +chr1 genetrack . 136 296 2087 - . stddev=3.6160253713 +chr1 genetrack . 168 328 2496 + . stddev=2.11105291581 +chr1 genetrack . 172 332 5047 - . stddev=3.62629343395 +chr1 genetrack . 184 344 1525 + . stddev=4.46082441647 +chr1 genetrack . 211 371 15 + . stddev=1.74610678049 +chr1 genetrack . 232 392 626 - . stddev=0.0 +chr1 genetrack . 238 398 1544 + . stddev=4.43066151722 +chr1 genetrack . 264 424 533 + . stddev=1.34355443899 +chr1 genetrack . 274 434 726 - . stddev=1.36767079956 +chr1 genetrack . 277 437 286 + . stddev=0.0 +chr1 genetrack . 288 448 792 - . stddev=1.47737416556 +chr1 genetrack . 304 464 608 + . stddev=1.44652711793 +chr1 genetrack . 319 479 126 - . stddev=0.471404520791 +chr1 genetrack . 369 529 618 - . stddev=5.47536569145 +chr1 genetrack . 371 531 1393 + . stddev=4.75587332865 +chr1 genetrack . 391 551 754 - . stddev=3.28891288785 +chr1 genetrack . 413 573 58 + . stddev=0.0 +chr1 genetrack . 468 628 1015 - . stddev=0.0 +chr1 genetrack . 658 818 39 - . stddev=0.0 +chr1 genetrack . 687 847 23 + . stddev=0.0 +chr1 genetrack . 729 889 607 + . stddev=0.0 +chr1 genetrack . 774 934 665 + . stddev=0.0 +chr1 genetrack . 807 967 468 + . stddev=0.0 +chr1 genetrack . 833 993 107 - . stddev=0.0 +chr1 genetrack . 874 1034 2 - . stddev=0.0 +chr1 genetrack . 1022 1182 740 + . stddev=0.0 +chr1 genetrack . 1057 1217 940 - . stddev=3.96036497305 +chr1 genetrack . 1113 1273 25 + . stddev=0.0 +chr1 genetrack . 1221 1381 454 - . stddev=0.0 +chr1 genetrack . 1259 1419 207 - . stddev=0.0 +chr1 genetrack . 1414 1574 584 + . stddev=0.0 +chr1 genetrack . 2005 2165 1181 + . stddev=0.0 +chr1 genetrack . 2032 2192 481 + . stddev=0.0455486534308 +chr1 genetrack . 2055 2215 199 - . stddev=0.0 +chr1 genetrack . 2382 2542 1246 + . stddev=0.0 +chr1 genetrack . 2532 2692 34 + . stddev=0.0 +chr1 genetrack . 2763 2923 1062 + . stddev=1.01561431542 +chr1 genetrack . 2768 2928 1144 - . stddev=1.09438744148 +chr1 genetrack . 2941 3101 1212 - . stddev=0.0 +chr1 genetrack . 3046 3206 555 - . stddev=0.0 +chr1 genetrack . 3060 3220 17 + . stddev=0.0 +chr1 genetrack . 3308 3468 525 - . stddev=0.0 +chr1 genetrack . 3599 3759 845 + . stddev=0.0 +chr1 genetrack . 3715 3875 23 - . stddev=0.0 +chr1 genetrack . 3777 3937 316 - . stddev=0.0 +chr1 genetrack . 3798 3958 491 + . stddev=0.0 +chr1 genetrack . 4027 4187 536 - . stddev=0.0 +chr1 genetrack . 4256 4416 482 + . stddev=0.0 +chr1 genetrack . 4325 4485 3 + . stddev=0.0 +chr1 genetrack . 4391 4551 1110 + . stddev=0.0 +chr1 genetrack . 4430 4590 125 - . stddev=0.0 +chr1 genetrack . 4550 4710 147 + . stddev=0.0 +chr1 genetrack . 4756 4916 1761 + . stddev=4.82408982772 +chr1 genetrack . 4832 4992 710 + . stddev=0.0 +chr1 genetrack . 5040 5200 828 + . stddev=0.0 +chr1 genetrack . 5332 5492 282 - . stddev=0.0 +chr1 genetrack . 5431 5591 75 + . stddev=0.0 +chr1 genetrack . 5637 5797 2 + . stddev=0.0 +chr1 genetrack . 5647 5807 737 - . stddev=0.36608362591 +chr1 genetrack . 6016 6176 646 + . stddev=0.039314009595 +chr1 genetrack . 6028 6188 230 - . stddev=0.0657945476105 +chr1 genetrack . 6117 6277 329 - . stddev=0.0 +chr1 genetrack . 6220 6380 5 + . stddev=0.0 +chr1 genetrack . 6286 6446 285 + . stddev=0.0 +chr1 genetrack . 6310 6470 34 - . stddev=0.0 +chr1 genetrack . 6331 6491 1587 + . stddev=5.61831543503 +chr1 genetrack . 6345 6505 953 - . stddev=3.52372902021 +chr1 genetrack . 6362 6522 742 + . stddev=0.0 +chr1 genetrack . 6426 6586 691 + . stddev=0.0 +chr1 genetrack . 6436 6596 61 - . stddev=1.5137105198 +chr1 genetrack . 6773 6933 28 + . stddev=0.0 +chr1 genetrack . 6988 7148 518 - . stddev=0.0 +chr1 genetrack . 7054 7214 654 + . stddev=0.0 +chr1 genetrack . 7695 7855 714 + . stddev=0.0 +chr1 genetrack . 7777 7937 3 + . stddev=0.0 +chr1 genetrack . 8139 8299 17 + . stddev=0.0 +chr1 genetrack . 8202 8362 2 - . stddev=0.0 +chr1 genetrack . 8389 8549 10 + . stddev=0.0 +chr1 genetrack . 8401 8561 5 - . stddev=0.0 +chr1 genetrack . 8645 8805 5 + . stddev=0.0 +chr1 genetrack . 8764 8924 332 + . stddev=0.0 +chr1 genetrack . 8769 8929 593 - . stddev=0.0 +chr1 genetrack . 8964 9124 24 + . stddev=0.0 +chr1 genetrack . 8988 9148 4 + . stddev=0.0 +chr1 genetrack . 9415 9575 36 + . stddev=0.0 +chr1 genetrack . 9640 9800 480 + . stddev=0.0 +chr1 genetrack . 9853 10013 606 - . stddev=0.0