diff lib/feature_finder.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/feature_finder.pl	Fri Dec 09 11:56:56 2011 -0500
@@ -0,0 +1,241 @@
+#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#           Copyright 2010, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  Jesse Erdmann
+#
+# POD documentation
+#------------------------------------------------------------------------------
+=pod BEGIN
+
+=head1 NAME
+
+  feature_finder.pl - find features nearest to another set of features within a given window.
+
+=head1 SYNOPSIS
+    
+    feature_finder.pl [-help] [-intervals interval_file] [-icol interval_column_config]
+                      [-features feature_file] [-fcol feature_column_config] 
+                      [-results ALL|CLOSEST] [-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM] 
+                      [-window win_size] [-output output_file] [-html html_output_file]
+                      [-db db_name] [-cturl custom_track_url] [-include_capture] 
+                      [-promoter PROMOTER_BP]
+
+=head1 OPTIONS
+
+=over 6
+
+=item B<-help>
+ 
+ Print a usage summary
+
+=item B<-intervals interval_file>
+
+ Specify the file containing the intervals to find the nearest features for,
+ default settings require a BED compatible file
+
+=item B<-features feature_file>
+
+ Specify the file containing the features that will be mapped against, default
+ setting require a BED compatible file
+
+=item B<-results ALL|CLOSEST>
+
+ The type of results to be reported.  ALL will return all features within the
+ specified window size from each interval in the file including any overlapping
+ features.  CLOSEST will return the closest feature within the window include 
+ overlapping features.  The special case ENCOMPASSING FEATURE AND
+ INTERNAL FEATURE override the closest distance.  The first encountered 
+ ENCOMPASSING FEATURE will be returned in CLOSEST mode.  If no ENCOMPASSING
+ FEATURES exist, the first encountered INTERNAL FEATURE will be returned.  If 
+ neither exists, the closest distance is used.  The closest distance is determined 
+ by the shortest distance between either end of the interval and the feature 
+ (e.g. start-start, start-end, end-start, end-end).  
+
+ The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE.  An 
+ encompassing feature is one which starts before an interval and ends
+ after the interval.  An internal feature is opposite where the feature
+ is entirely within the interval.
+
+ DEFAULT:ALL
+
+=item B<-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM>
+
+ Direction is only used when CLOSEST is specified as the result type. BOTH
+ will return both the closest upstream and downstream feature in the window.
+ EITHER will return the closest regardless of whether it is upstream or 
+ downstream.  DOWNSTREAM will return the closest feature with a midpoint to 
+ the right of midpoint of a + strand interval or the left of a - strand
+ interval.  UPSTREAM is just the opposite.
+
+ DEFAULT:BOTH
+
+=item B<-window win_size>
+
+ Window size is the number of base pairs that will be considered upstream
+ and downstream of an interval in which to find features. This also affects
+ the performance of indexing and searching.  Minimal testing suggests
+ 50000 bp to be roughly the optimal size, at least for Mus Musculus mm9/ncbim37.
+
+ MINIMUM:2500 DEFAULT:2500
+
+=item B<-output output_file>
+ 
+ The file to report results to.  The output format is a tab delimited file
+ starting with the original line from the interval file followed by a list
+ of matching features.  Features are reported in the form of their name 
+ followed by the distance away in bp ascending from shortest distance to 
+ longest. e.g:
+
+    MY_INTERVAL_INFO   ENS000001(20 bp distant)   ENS000002 (350 bp distant) ...
+
+ The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE.  An 
+ encompassing feature is one which starts before an interval and ends
+ after the interval.  An internal feature is opposite where the feature
+ is entirely within the interval.
+
+ DEFAULT:OFF
+
+=item B<-include_capture>
+
+ Including capture information adds three columns to the output from 
+ feature_finder.  These are the start and end location to capture all 
+ features found within the given window and the total distance in bp
+ required to capture the entire set.
+
+=item B<-promoter PROMOTOR_BP>
+
+ The start and end of the capture region will be modified to include 
+ the specified amount of the promoter region depending on the orientation
+ of the feature. '+' strand features modify the start position while '-'
+ strand features modify the end position.
+
+ DEFAULT:0
+
+=item B<-icols interval_chrom_column:start:end:label:strand>
+
+ If the interval file is not in BED format, but still tab delimited -icols
+ can be used to specify which columns in the file contain specific 
+ information.  Column numbering begins with 1.
+
+ DEFAULT:1:2:3:4:6
+
+=item B<-fcols feature_chrom_column:start:end:label:strand>
+ 
+ If the interval file is not in BED format, but still tab delimited -fcols
+ can be used to specify which columns in the file contain specific 
+ information.  Column numbering begins with 1.
+
+ DEFAULT:1:2:3:4:6
+
+=item B<-html>
+
+ The output will be an HTML file with links to the features in the and
+ the interval at the UCSC browser.
+
+=item B<-db db_name>
+
+ Set the database for the organism to be used in the UCSC genome browser.
+ Only used if an HTML output is specified.
+
+ DEFAULT:mm9
+
+=item B<-cturl custom_track_url>
+
+ Define the URL to be used by the UCSC Genome Browser to load the custom
+ tracks for display purposes.  If left blank the URLs will show the windows 
+ defined by the input intervals, however the intervals themselves will not 
+ appear.  Only used if an HTML output is specified.
+
+=back
+
+=head1 BUGS
+
+=head1 REFERENCES
+
+=head1 VERSION
+
+ 0.1
+
+=cut
+
+#### END of POD documentation.
+#-----------------------------------------------------------------------------
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+use List::Util qw[min max];
+
+my $path = $0;
+$path =~ s/\/\w*\.pl$//g;
+require "$path/feature_finder_methods.pl";
+
+my $intervals;
+my $features;
+my $req_res_type = "All"; # "Closest"
+my $direction = "Both"; # "Upstream", "Downstream", "Both", "Either" 
+my $out;
+my $window = 2500;
+my %feature_hash;
+my $help_flag;
+my $interval_cols = "1:2:3:4:6";
+my $feature_cols = "1:2:3:4:6";
+my $html;
+my $db = "mm9";
+my $cturl;
+my $debug = 0;
+my $include_capture = 0;
+my $promoter = 0;
+
+my %options = (
+    "direction|d=s" => \$direction,
+    "intervals|i=s" => \$intervals,
+    "features|f=s"  => \$features,
+    "results|r=s"   => \$req_res_type,
+    "output|o=s"    => \$out,
+    "window|w=s"    => \$window,
+    "icols=s"       => \$interval_cols,
+    "fcols=s"       => \$feature_cols,
+    "help|h"        => \$help_flag,
+    "html"          => \$html,
+    "db=s"          => \$db,
+    "cturl=s"       => \$cturl,
+    "include_capture|ic" => \$include_capture,
+    "promoter|p=i"  => \$promoter,
+    "debug"         => \$debug
+    );
+
+GetOptions(%options) or pod2usage(2);
+pod2usage(1) if $help_flag;
+
+pod2usage({ message => "The window specified with -window must be at least 2500.\n", exitval => 2}) if ($window < 2500);
+#pod2usage({ message => "Result type specified with -result must be either $all or $close.\n", exitval => 2}) if (uc $req_res_type ne $all && uc $req_res_type ne $close);
+pod2usage({ message => "Direction sepecified with -direction must be one of BOTH, EITHER, UPSTREAM or DOWNSTREAM.\n", exitval => 2}) if (uc $direction ne "BOTH" && uc $direction ne "EITHER" && uc $direction ne "UPSTREAM" && uc $direction ne "DOWNSTREAM");
+
+my @icols = split(":", $interval_cols);
+pod2usage({ message => "Must specify 5 columns with -icol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#icols != 4);
+my $ichrom = $icols[0]-1;
+my $istart = $icols[1]-1;
+my $iend = $icols[2]-1;
+my $ilabel = $icols[3]-1;
+my $istrand = $icols[4]-1;
+
+my @fcols = split(":", $feature_cols);
+pod2usage({ message => "Must specify 5 columns with -fcol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#fcols != 4);
+my $fchrom = $fcols[0]-1;
+my $fstart = $fcols[1]-1;
+my $fend = $fcols[2]-1;
+my $flabel = $fcols[3]-1;
+my $fstrand = $fcols[4]-1;
+
+
+my $feature_hash_ref = &feature_index(\$features, \$window, \$fchrom, \$fstart, \$fend, \$debug);
+&process_intervals(\$intervals, \$out, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance, \$include_capture, \$promoter);
+exit(0);
+
+