view lib/feature_finder.pl @ 3:17ce4f3bffa2 default tip

Uploaded
author jesse-erdmann
date Tue, 24 Jan 2012 18:33:41 -0500
parents 1437a2df99c0
children
line wrap: on
line source

#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
#
#------------------------------------------------------------------------------
#                         University of Minnesota
#           Copyright 2010, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  Jesse Erdmann
#
# POD documentation
#------------------------------------------------------------------------------
=pod BEGIN

=head1 NAME

  feature_finder.pl - find features nearest to another set of features within a given window.

=head1 SYNOPSIS
    
    feature_finder.pl [-help] [-intervals interval_file] [-icol interval_column_config]
                      [-features feature_file] [-fcol feature_column_config] 
                      [-results ALL|CLOSEST] [-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM] 
                      [-window win_size] [-output output_file] [-html html_output_file]
                      [-db db_name] [-cturl custom_track_url] [-include_capture] 
                      [-promoter PROMOTER_BP]

=head1 OPTIONS

=over 6

=item B<-help>
 
 Print a usage summary

=item B<-intervals interval_file>

 Specify the file containing the intervals to find the nearest features for,
 default settings require a BED compatible file

=item B<-features feature_file>

 Specify the file containing the features that will be mapped against, default
 setting require a BED compatible file

=item B<-results ALL|CLOSEST>

 The type of results to be reported.  ALL will return all features within the
 specified window size from each interval in the file including any overlapping
 features.  CLOSEST will return the closest feature within the window include 
 overlapping features.  The special case ENCOMPASSING FEATURE AND
 INTERNAL FEATURE override the closest distance.  The first encountered 
 ENCOMPASSING FEATURE will be returned in CLOSEST mode.  If no ENCOMPASSING
 FEATURES exist, the first encountered INTERNAL FEATURE will be returned.  If 
 neither exists, the closest distance is used.  The closest distance is determined 
 by the shortest distance between either end of the interval and the feature 
 (e.g. start-start, start-end, end-start, end-end).  

 The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE.  An 
 encompassing feature is one which starts before an interval and ends
 after the interval.  An internal feature is opposite where the feature
 is entirely within the interval.

 DEFAULT:ALL

=item B<-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM>

 Direction is only used when CLOSEST is specified as the result type. BOTH
 will return both the closest upstream and downstream feature in the window.
 EITHER will return the closest regardless of whether it is upstream or 
 downstream.  DOWNSTREAM will return the closest feature with a midpoint to 
 the right of midpoint of a + strand interval or the left of a - strand
 interval.  UPSTREAM is just the opposite.

 DEFAULT:BOTH

=item B<-window win_size>

 Window size is the number of base pairs that will be considered upstream
 and downstream of an interval in which to find features. This also affects
 the performance of indexing and searching.  Minimal testing suggests
 50000 bp to be roughly the optimal size, at least for Mus Musculus mm9/ncbim37.

 MINIMUM:2500 DEFAULT:2500

=item B<-output output_file>
 
 The file to report results to.  The output format is a tab delimited file
 starting with the original line from the interval file followed by a list
 of matching features.  Features are reported in the form of their name 
 followed by the distance away in bp ascending from shortest distance to 
 longest. e.g:

    MY_INTERVAL_INFO   ENS000001(20 bp distant)   ENS000002 (350 bp distant) ...

 The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE.  An 
 encompassing feature is one which starts before an interval and ends
 after the interval.  An internal feature is opposite where the feature
 is entirely within the interval.

 DEFAULT:OFF

=item B<-include_capture>

 Including capture information adds three columns to the output from 
 feature_finder.  These are the start and end location to capture all 
 features found within the given window and the total distance in bp
 required to capture the entire set.

=item B<-promoter PROMOTOR_BP>

 The start and end of the capture region will be modified to include 
 the specified amount of the promoter region depending on the orientation
 of the feature. '+' strand features modify the start position while '-'
 strand features modify the end position.

 DEFAULT:0

=item B<-icols interval_chrom_column:start:end:label:strand>

 If the interval file is not in BED format, but still tab delimited -icols
 can be used to specify which columns in the file contain specific 
 information.  Column numbering begins with 1.

 DEFAULT:1:2:3:4:6

=item B<-fcols feature_chrom_column:start:end:label:strand>
 
 If the interval file is not in BED format, but still tab delimited -fcols
 can be used to specify which columns in the file contain specific 
 information.  Column numbering begins with 1.

 DEFAULT:1:2:3:4:6

=item B<-html>

 The output will be an HTML file with links to the features in the and
 the interval at the UCSC browser.

=item B<-db db_name>

 Set the database for the organism to be used in the UCSC genome browser.
 Only used if an HTML output is specified.

 DEFAULT:mm9

=item B<-cturl custom_track_url>

 Define the URL to be used by the UCSC Genome Browser to load the custom
 tracks for display purposes.  If left blank the URLs will show the windows 
 defined by the input intervals, however the intervals themselves will not 
 appear.  Only used if an HTML output is specified.

=back

=head1 BUGS

=head1 REFERENCES

=head1 VERSION

 0.1

=cut

#### END of POD documentation.
#-----------------------------------------------------------------------------

use strict;
use Getopt::Long;
use Pod::Usage;
use List::Util qw[min max];

my $path = $0;
$path =~ s/\/\w*\.pl$//g;
require "$path/feature_finder_methods.pl";

my $intervals;
my $features;
my $req_res_type = "All"; # "Closest"
my $direction = "Both"; # "Upstream", "Downstream", "Both", "Either" 
my $out;
my $window = 2500;
my %feature_hash;
my $help_flag;
my $interval_cols = "1:2:3:4:6";
my $feature_cols = "1:2:3:4:6";
my $html;
my $db = "mm9";
my $cturl;
my $debug = 0;
my $include_capture = 0;
my $promoter = 0;

my %options = (
    "direction|d=s" => \$direction,
    "intervals|i=s" => \$intervals,
    "features|f=s"  => \$features,
    "results|r=s"   => \$req_res_type,
    "output|o=s"    => \$out,
    "window|w=s"    => \$window,
    "icols=s"       => \$interval_cols,
    "fcols=s"       => \$feature_cols,
    "help|h"        => \$help_flag,
    "html"          => \$html,
    "db=s"          => \$db,
    "cturl=s"       => \$cturl,
    "include_capture|ic" => \$include_capture,
    "promoter|p=i"  => \$promoter,
    "debug"         => \$debug
    );

GetOptions(%options) or pod2usage(2);
pod2usage(1) if $help_flag;

pod2usage({ message => "The window specified with -window must be at least 2500.\n", exitval => 2}) if ($window < 2500);
#pod2usage({ message => "Result type specified with -result must be either $all or $close.\n", exitval => 2}) if (uc $req_res_type ne $all && uc $req_res_type ne $close);
pod2usage({ message => "Direction sepecified with -direction must be one of BOTH, EITHER, UPSTREAM or DOWNSTREAM.\n", exitval => 2}) if (uc $direction ne "BOTH" && uc $direction ne "EITHER" && uc $direction ne "UPSTREAM" && uc $direction ne "DOWNSTREAM");

my @icols = split(":", $interval_cols);
pod2usage({ message => "Must specify 5 columns with -icol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#icols != 4);
my $ichrom = $icols[0]-1;
my $istart = $icols[1]-1;
my $iend = $icols[2]-1;
my $ilabel = $icols[3]-1;
my $istrand = $icols[4]-1;

my @fcols = split(":", $feature_cols);
pod2usage({ message => "Must specify 5 columns with -fcol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#fcols != 4);
my $fchrom = $fcols[0]-1;
my $fstart = $fcols[1]-1;
my $fend = $fcols[2]-1;
my $flabel = $fcols[3]-1;
my $fstrand = $fcols[4]-1;


my $feature_hash_ref = &feature_index(\$features, \$window, \$fchrom, \$fstart, \$fend, \$debug);
&process_intervals(\$intervals, \$out, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance, \$include_capture, \$promoter);
exit(0);