changeset 1:0164d2edba9f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/resize_coordinate_window commit 7aa2429d3f53a14be7e44dc6021ed3e11dc2f080
author iuc
date Tue, 16 Feb 2016 04:05:23 -0500
parents 08b6255afde7
children 541f300f322d
files resize_coordinate_window.py resize_coordinate_window.xml test-data/output_discard.gff
diffstat 3 files changed, 206 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/resize_coordinate_window.py	Tue Jan 19 09:34:56 2016 -0500
+++ b/resize_coordinate_window.py	Tue Feb 16 04:05:23 2016 -0500
@@ -1,41 +1,88 @@
 import argparse
+import fileinput
 import sys
 
+# Maximum value of a signed 32 bit integer (2**31 - 1).
+MAX_CHROM_LEN = 2147483647
 
-def stop_err( msg ):
-    sys.stderr.write( msg )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
     sys.exit(1)
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--input', dest='input', help="Input dataset")
+parser.add_argument('--start_coordinate', dest='start_coordinate', type=int, help='Chromosome start coordinate, either 0 or 1.')
 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
-parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.')
+parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end instead of from computed midpoint.')
+parser.add_argument('--chrom_len_file', dest='chrom_len_file', help="File names of .len files for chromosome lengths")
+parser.add_argument('--region_boundaries', dest='region_boundaries', help="Option for handling region boundaries")
 parser.add_argument('--output', dest='output', help="Output dataset")
 args = parser.parse_args()
 
 extend_existing = args.extend_existing == 'existing'
 out = open(args.output, 'wb')
 
-for line in open(args.input):
-    if line.startswith('#'):
-        continue
-    items = line.split('\t')
-    if len(items) != 9:
-        continue
-    start = int(items[3])
-    end = int(items[4])
-    if extend_existing:
-        start -= args.subtract_from_start
-        end += args.add_to_end
-    else:
-        midpoint = (start + end) // 2
-        start = midpoint - args.subtract_from_start
-        end = midpoint + args.add_to_end
-    if start < 1:
-        out.close()
-        stop_err('Requested expansion places region beyond chromosome bounds.')
-    new_line = '\t'.join([items[0], items[1], items[2], str(start), str(end), items[5], items[6], items[7], items[8]])
-    out.write(new_line)
+chrom_start = int(args.start_coordinate)
+chrom_lens = dict()
+# Determine the length of each chromosome and add it to the chrom_lens dictionary.
+len_file_missing = False
+len_file_error = None
+len_file = fileinput.FileInput(args.chrom_len_file)
+try:
+    for line in len_file:
+        fields = line.split("\t")
+        chrom_lens[fields[0]] = int(fields[1])
+except Exception, e:
+    len_file_error = str(e)
+
+with open(args.input) as fhi:
+    for line in fhi:
+        if line.startswith('#'):
+            # Skip comments.
+            continue
+        items = line.split('\t')
+        if len(items) != 9:
+            # Skip invalid gff data.
+            continue
+        chrom = items[0]
+        start = int(items[3])
+        end = int(items[4])
+        if extend_existing:
+            new_start = start - args.subtract_from_start
+            new_end = end + args.add_to_end
+        else:
+            midpoint = (start + end) // 2
+            new_start = midpoint - args.subtract_from_start
+            new_end = midpoint + args.add_to_end
+        # Check start boundary.
+        if new_start < chrom_start:
+            if args.region_boundaries == 'discard':
+                continue
+            elif args.region_boundaries == 'limit':
+                new_start = chrom_start
+            elif args.region_boundaries == 'error':
+                out.close()
+                stop_err('Requested expansion places region beyond chromosome start boundary of %d.' % chrom_start)
+        # Check end boundary.
+        chrom_len = chrom_lens.get(chrom, None)
+        if chrom_len is None:
+            len_file_missing = True
+            chrom_len = MAX_CHROM_LEN
+        if new_end > chrom_len:
+            if args.region_boundaries == 'discard':
+                continue
+            elif args.region_boundaries == 'limit':
+                new_end = chrom_len
+            elif args.region_boundaries == 'error':
+                out.close()
+                stop_err('Requested expansion places region beyond chromosome end boundary of %d.' % chrom_len)
+        new_line = '\t'.join([chrom, items[1], items[2], str(new_start), str(new_end), items[5], items[6], items[7], items[8]])
+        out.write(new_line)
 out.close()
 
+if len_file_error is not None:
+    print "All chrom lengths set to %d, error in chrom len file: %s" % (MAX_CHROM_LEN, len_file_error)
+if len_file_missing:
+    print "All chrom lengths set to %d, chrom len files are not installed." % MAX_CHROM_LEN
--- a/resize_coordinate_window.xml	Tue Jan 19 09:34:56 2016 -0500
+++ b/resize_coordinate_window.xml	Tue Feb 16 04:05:23 2016 -0500
@@ -1,21 +1,34 @@
-<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.0">
+<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.1">
     <description>of GFF data</description>
     <command>
         python $__tool_directory__/resize_coordinate_window.py
         --input "$input"
+        --start_coordinate $start_coordinate
         --subtract_from_start $subtract_from_start
         --add_to_end $add_to_end
         --extend_existing $extend_existing
+        --chrom_len_file ${chromInfo}
+        --region_boundaries $region_boundaries
         --output "$output"
     </command>
     <inputs>
         <param name="input" type="data" format="gff" label="Gff file" />
+        <param name="start_coordinate" type="select" label="Start coordinate" help="Input data is 0-based or 1-based">
+            <option value="0" selected="True">0</option>
+            <option value="1">1</option>
+        </param>
         <param name="subtract_from_start" type="integer" value="30" min="0" label="Distance to subtract from the start coordinate"/>
         <param name="add_to_end" type="integer" value="30" min="0" label="Distance to add to the end coordinate"/>
         <param name="extend_existing" type="select" label="Resize window from" help="The midpoint is computed as (start + end) // 2">
             <option value="midpoint" selected="True">the midpoint of the start and end coordinates</option>
             <option value="existing">the start and end coordinates</option>
         </param>
+        <param name="region_boundaries" type="select" label="Handle chromosome boundaries by" help="Expanding the region may result in crossing chromosome start and end coordinate boundaries.">
+            <option value="discard" selected="True">discarding the region</option>
+            <option value="limit">keeping the region by limiting the expansion to not cross the start or end coordinate boundary</option>
+            <option value="nothing">keeping the region by allowing the expansion to cross the start or end coordinate boundary</option>
+            <option value="error">outputting an error</option>
+        </param>
     </inputs>
     <outputs>
         <data name="output" format="gff" />
@@ -23,20 +36,43 @@
     <tests>
         <test>
             <param name="input" value="input.gff" ftype="gff" />
+            <param name="start_coordinate" value="1" />
             <param name="subtract_from_start" value="13" />
             <param name="add_to_end" value="13" />
             <param name="extend_existing" value="midpoint" />
+            <param name="region_boundaries" value="error" />
             <output name="output" file="output.gff" ftype="gff" />
         </test>
+        <test>
+            <param name="input" value="input.gff" ftype="gff" />
+            <param name="start_coordinate" value="0" />
+            <param name="subtract_from_start" value="80" />
+            <param name="add_to_end" value="80" />
+            <param name="extend_existing" value="midpoint" />
+            <param name="region_boundaries" value="discard" />
+            <output name="output" file="output_discard.gff" ftype="gff" />
+        </test>
     </tests>
     <help>
 
 **What it does**
 
 Modifies the start and end coordinates of GFF data such that the new start and end position is based on a
-specified window size that is computed either from the existing start and end coordinates or centered on
+specified region size that is computed either from the existing start and end coordinates or centered on
 the midpoint between them.
 
+Region expansion may result in the new start or end coordinates crossing the chromosome boundary.  The
+chromosome start is set to 0 or 1 using the **Start coordinate** parameter.  The end is retrieved from a
+file within the Galaxy environment that includes the length of chromosomes for all genome builds.  If this
+file is missing or has no entry for the chromosome, the end coordinate is set to 2147483647, which is the
+maximum value of a signed 32-bit integer.  The **Handle chromosome boundaries by** parameter determines what
+happens when expanding a region crosses a chromosome boundary, using one of the following options.
+
+* **discarding the region** - the region will be discarded and processing will continue with the next line in the dataset.
+* **keeping the region by limiting the expansion to not cross the start or end coordinate boundary** - expansion will be restricted to not cross the chromosome's start or end coordinates for the current region.
+* **keeping the region by allowing the expansion to cross the start or end coordinate boundary** - the defined expansion is applied unchanged; crossing the start boundary can result in a zero or negative start value.
+* **outputting an error** - processing will stop and an error will be displayed.
+
 -----
 
 **Example**
@@ -47,7 +83,7 @@
     chr1    genetrack       .       31      51      245     -       .       stddev=2.66582799529
     chr1    genetrack       .       40      60      2060    +       .       stddev=2.7859667372
 
-Resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces::
+Setting start coordinate to 1 and resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces::
 
     chr1    genetrack       .       14      40      918     +       .       stddev=5.96715849116
     chr1    genetrack       .       28      54      245     -       .       stddev=2.66582799529`
@@ -57,7 +93,7 @@
     <citations>
         <citation type="bibtex">
             @unpublished{None,
-            author = {},
+            author = {Greg Von Kuster},
             title = {None},
             year = {None},
             eprint = {None},
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_discard.gff	Tue Feb 16 04:05:23 2016 -0500
@@ -0,0 +1,96 @@
+chr1	genetrack	.	3	163	397	+	.	stddev=0.0
+chr1	genetrack	.	19	179	521	+	.	stddev=0.747112137937
+chr1	genetrack	.	53	213	5129	+	.	stddev=3.01025384354
+chr1	genetrack	.	55	215	4659	-	.	stddev=3.8642622228
+chr1	genetrack	.	85	245	897	-	.	stddev=3.22709952671
+chr1	genetrack	.	101	261	956	-	.	stddev=4.95899971687
+chr1	genetrack	.	110	270	1527	+	.	stddev=4.62574275346
+chr1	genetrack	.	115	275	494	-	.	stddev=1.4255957
+chr1	genetrack	.	122	282	2538	+	.	stddev=5.04731591122
+chr1	genetrack	.	136	296	2087	-	.	stddev=3.6160253713
+chr1	genetrack	.	168	328	2496	+	.	stddev=2.11105291581
+chr1	genetrack	.	172	332	5047	-	.	stddev=3.62629343395
+chr1	genetrack	.	184	344	1525	+	.	stddev=4.46082441647
+chr1	genetrack	.	211	371	15	+	.	stddev=1.74610678049
+chr1	genetrack	.	232	392	626	-	.	stddev=0.0
+chr1	genetrack	.	238	398	1544	+	.	stddev=4.43066151722
+chr1	genetrack	.	264	424	533	+	.	stddev=1.34355443899
+chr1	genetrack	.	274	434	726	-	.	stddev=1.36767079956
+chr1	genetrack	.	277	437	286	+	.	stddev=0.0
+chr1	genetrack	.	288	448	792	-	.	stddev=1.47737416556
+chr1	genetrack	.	304	464	608	+	.	stddev=1.44652711793
+chr1	genetrack	.	319	479	126	-	.	stddev=0.471404520791
+chr1	genetrack	.	369	529	618	-	.	stddev=5.47536569145
+chr1	genetrack	.	371	531	1393	+	.	stddev=4.75587332865
+chr1	genetrack	.	391	551	754	-	.	stddev=3.28891288785
+chr1	genetrack	.	413	573	58	+	.	stddev=0.0
+chr1	genetrack	.	468	628	1015	-	.	stddev=0.0
+chr1	genetrack	.	658	818	39	-	.	stddev=0.0
+chr1	genetrack	.	687	847	23	+	.	stddev=0.0
+chr1	genetrack	.	729	889	607	+	.	stddev=0.0
+chr1	genetrack	.	774	934	665	+	.	stddev=0.0
+chr1	genetrack	.	807	967	468	+	.	stddev=0.0
+chr1	genetrack	.	833	993	107	-	.	stddev=0.0
+chr1	genetrack	.	874	1034	2	-	.	stddev=0.0
+chr1	genetrack	.	1022	1182	740	+	.	stddev=0.0
+chr1	genetrack	.	1057	1217	940	-	.	stddev=3.96036497305
+chr1	genetrack	.	1113	1273	25	+	.	stddev=0.0
+chr1	genetrack	.	1221	1381	454	-	.	stddev=0.0
+chr1	genetrack	.	1259	1419	207	-	.	stddev=0.0
+chr1	genetrack	.	1414	1574	584	+	.	stddev=0.0
+chr1	genetrack	.	2005	2165	1181	+	.	stddev=0.0
+chr1	genetrack	.	2032	2192	481	+	.	stddev=0.0455486534308
+chr1	genetrack	.	2055	2215	199	-	.	stddev=0.0
+chr1	genetrack	.	2382	2542	1246	+	.	stddev=0.0
+chr1	genetrack	.	2532	2692	34	+	.	stddev=0.0
+chr1	genetrack	.	2763	2923	1062	+	.	stddev=1.01561431542
+chr1	genetrack	.	2768	2928	1144	-	.	stddev=1.09438744148
+chr1	genetrack	.	2941	3101	1212	-	.	stddev=0.0
+chr1	genetrack	.	3046	3206	555	-	.	stddev=0.0
+chr1	genetrack	.	3060	3220	17	+	.	stddev=0.0
+chr1	genetrack	.	3308	3468	525	-	.	stddev=0.0
+chr1	genetrack	.	3599	3759	845	+	.	stddev=0.0
+chr1	genetrack	.	3715	3875	23	-	.	stddev=0.0
+chr1	genetrack	.	3777	3937	316	-	.	stddev=0.0
+chr1	genetrack	.	3798	3958	491	+	.	stddev=0.0
+chr1	genetrack	.	4027	4187	536	-	.	stddev=0.0
+chr1	genetrack	.	4256	4416	482	+	.	stddev=0.0
+chr1	genetrack	.	4325	4485	3	+	.	stddev=0.0
+chr1	genetrack	.	4391	4551	1110	+	.	stddev=0.0
+chr1	genetrack	.	4430	4590	125	-	.	stddev=0.0
+chr1	genetrack	.	4550	4710	147	+	.	stddev=0.0
+chr1	genetrack	.	4756	4916	1761	+	.	stddev=4.82408982772
+chr1	genetrack	.	4832	4992	710	+	.	stddev=0.0
+chr1	genetrack	.	5040	5200	828	+	.	stddev=0.0
+chr1	genetrack	.	5332	5492	282	-	.	stddev=0.0
+chr1	genetrack	.	5431	5591	75	+	.	stddev=0.0
+chr1	genetrack	.	5637	5797	2	+	.	stddev=0.0
+chr1	genetrack	.	5647	5807	737	-	.	stddev=0.36608362591
+chr1	genetrack	.	6016	6176	646	+	.	stddev=0.039314009595
+chr1	genetrack	.	6028	6188	230	-	.	stddev=0.0657945476105
+chr1	genetrack	.	6117	6277	329	-	.	stddev=0.0
+chr1	genetrack	.	6220	6380	5	+	.	stddev=0.0
+chr1	genetrack	.	6286	6446	285	+	.	stddev=0.0
+chr1	genetrack	.	6310	6470	34	-	.	stddev=0.0
+chr1	genetrack	.	6331	6491	1587	+	.	stddev=5.61831543503
+chr1	genetrack	.	6345	6505	953	-	.	stddev=3.52372902021
+chr1	genetrack	.	6362	6522	742	+	.	stddev=0.0
+chr1	genetrack	.	6426	6586	691	+	.	stddev=0.0
+chr1	genetrack	.	6436	6596	61	-	.	stddev=1.5137105198
+chr1	genetrack	.	6773	6933	28	+	.	stddev=0.0
+chr1	genetrack	.	6988	7148	518	-	.	stddev=0.0
+chr1	genetrack	.	7054	7214	654	+	.	stddev=0.0
+chr1	genetrack	.	7695	7855	714	+	.	stddev=0.0
+chr1	genetrack	.	7777	7937	3	+	.	stddev=0.0
+chr1	genetrack	.	8139	8299	17	+	.	stddev=0.0
+chr1	genetrack	.	8202	8362	2	-	.	stddev=0.0
+chr1	genetrack	.	8389	8549	10	+	.	stddev=0.0
+chr1	genetrack	.	8401	8561	5	-	.	stddev=0.0
+chr1	genetrack	.	8645	8805	5	+	.	stddev=0.0
+chr1	genetrack	.	8764	8924	332	+	.	stddev=0.0
+chr1	genetrack	.	8769	8929	593	-	.	stddev=0.0
+chr1	genetrack	.	8964	9124	24	+	.	stddev=0.0
+chr1	genetrack	.	8988	9148	4	+	.	stddev=0.0
+chr1	genetrack	.	9415	9575	36	+	.	stddev=0.0
+chr1	genetrack	.	9640	9800	480	+	.	stddev=0.0
+chr1	genetrack	.	9853	10013	606	-	.	stddev=0.0