Mercurial > repos > jesse-erdmann > tapdance
diff lib/feature_finder.pl @ 0:1437a2df99c0
Uploaded
| author | jesse-erdmann |
|---|---|
| date | Fri, 09 Dec 2011 11:56:56 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/feature_finder.pl Fri Dec 09 11:56:56 2011 -0500 @@ -0,0 +1,241 @@ +#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2010, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# Jesse Erdmann +# +# POD documentation +#------------------------------------------------------------------------------ +=pod BEGIN + +=head1 NAME + + feature_finder.pl - find features nearest to another set of features within a given window. + +=head1 SYNOPSIS + + feature_finder.pl [-help] [-intervals interval_file] [-icol interval_column_config] + [-features feature_file] [-fcol feature_column_config] + [-results ALL|CLOSEST] [-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM] + [-window win_size] [-output output_file] [-html html_output_file] + [-db db_name] [-cturl custom_track_url] [-include_capture] + [-promoter PROMOTER_BP] + +=head1 OPTIONS + +=over 6 + +=item B<-help> + + Print a usage summary + +=item B<-intervals interval_file> + + Specify the file containing the intervals to find the nearest features for, + default settings require a BED compatible file + +=item B<-features feature_file> + + Specify the file containing the features that will be mapped against, default + setting require a BED compatible file + +=item B<-results ALL|CLOSEST> + + The type of results to be reported. ALL will return all features within the + specified window size from each interval in the file including any overlapping + features. CLOSEST will return the closest feature within the window include + overlapping features. The special case ENCOMPASSING FEATURE AND + INTERNAL FEATURE override the closest distance. The first encountered + ENCOMPASSING FEATURE will be returned in CLOSEST mode. If no ENCOMPASSING + FEATURES exist, the first encountered INTERNAL FEATURE will be returned. If + neither exists, the closest distance is used. The closest distance is determined + by the shortest distance between either end of the interval and the feature + (e.g. start-start, start-end, end-start, end-end). + + The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE. An + encompassing feature is one which starts before an interval and ends + after the interval. An internal feature is opposite where the feature + is entirely within the interval. + + DEFAULT:ALL + +=item B<-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM> + + Direction is only used when CLOSEST is specified as the result type. BOTH + will return both the closest upstream and downstream feature in the window. + EITHER will return the closest regardless of whether it is upstream or + downstream. DOWNSTREAM will return the closest feature with a midpoint to + the right of midpoint of a + strand interval or the left of a - strand + interval. UPSTREAM is just the opposite. + + DEFAULT:BOTH + +=item B<-window win_size> + + Window size is the number of base pairs that will be considered upstream + and downstream of an interval in which to find features. This also affects + the performance of indexing and searching. Minimal testing suggests + 50000 bp to be roughly the optimal size, at least for Mus Musculus mm9/ncbim37. + + MINIMUM:2500 DEFAULT:2500 + +=item B<-output output_file> + + The file to report results to. The output format is a tab delimited file + starting with the original line from the interval file followed by a list + of matching features. Features are reported in the form of their name + followed by the distance away in bp ascending from shortest distance to + longest. e.g: + + MY_INTERVAL_INFO ENS000001(20 bp distant) ENS000002 (350 bp distant) ... + + The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE. An + encompassing feature is one which starts before an interval and ends + after the interval. An internal feature is opposite where the feature + is entirely within the interval. + + DEFAULT:OFF + +=item B<-include_capture> + + Including capture information adds three columns to the output from + feature_finder. These are the start and end location to capture all + features found within the given window and the total distance in bp + required to capture the entire set. + +=item B<-promoter PROMOTOR_BP> + + The start and end of the capture region will be modified to include + the specified amount of the promoter region depending on the orientation + of the feature. '+' strand features modify the start position while '-' + strand features modify the end position. + + DEFAULT:0 + +=item B<-icols interval_chrom_column:start:end:label:strand> + + If the interval file is not in BED format, but still tab delimited -icols + can be used to specify which columns in the file contain specific + information. Column numbering begins with 1. + + DEFAULT:1:2:3:4:6 + +=item B<-fcols feature_chrom_column:start:end:label:strand> + + If the interval file is not in BED format, but still tab delimited -fcols + can be used to specify which columns in the file contain specific + information. Column numbering begins with 1. + + DEFAULT:1:2:3:4:6 + +=item B<-html> + + The output will be an HTML file with links to the features in the and + the interval at the UCSC browser. + +=item B<-db db_name> + + Set the database for the organism to be used in the UCSC genome browser. + Only used if an HTML output is specified. + + DEFAULT:mm9 + +=item B<-cturl custom_track_url> + + Define the URL to be used by the UCSC Genome Browser to load the custom + tracks for display purposes. If left blank the URLs will show the windows + defined by the input intervals, however the intervals themselves will not + appear. Only used if an HTML output is specified. + +=back + +=head1 BUGS + +=head1 REFERENCES + +=head1 VERSION + + 0.1 + +=cut + +#### END of POD documentation. +#----------------------------------------------------------------------------- + +use strict; +use Getopt::Long; +use Pod::Usage; +use List::Util qw[min max]; + +my $path = $0; +$path =~ s/\/\w*\.pl$//g; +require "$path/feature_finder_methods.pl"; + +my $intervals; +my $features; +my $req_res_type = "All"; # "Closest" +my $direction = "Both"; # "Upstream", "Downstream", "Both", "Either" +my $out; +my $window = 2500; +my %feature_hash; +my $help_flag; +my $interval_cols = "1:2:3:4:6"; +my $feature_cols = "1:2:3:4:6"; +my $html; +my $db = "mm9"; +my $cturl; +my $debug = 0; +my $include_capture = 0; +my $promoter = 0; + +my %options = ( + "direction|d=s" => \$direction, + "intervals|i=s" => \$intervals, + "features|f=s" => \$features, + "results|r=s" => \$req_res_type, + "output|o=s" => \$out, + "window|w=s" => \$window, + "icols=s" => \$interval_cols, + "fcols=s" => \$feature_cols, + "help|h" => \$help_flag, + "html" => \$html, + "db=s" => \$db, + "cturl=s" => \$cturl, + "include_capture|ic" => \$include_capture, + "promoter|p=i" => \$promoter, + "debug" => \$debug + ); + +GetOptions(%options) or pod2usage(2); +pod2usage(1) if $help_flag; + +pod2usage({ message => "The window specified with -window must be at least 2500.\n", exitval => 2}) if ($window < 2500); +#pod2usage({ message => "Result type specified with -result must be either $all or $close.\n", exitval => 2}) if (uc $req_res_type ne $all && uc $req_res_type ne $close); +pod2usage({ message => "Direction sepecified with -direction must be one of BOTH, EITHER, UPSTREAM or DOWNSTREAM.\n", exitval => 2}) if (uc $direction ne "BOTH" && uc $direction ne "EITHER" && uc $direction ne "UPSTREAM" && uc $direction ne "DOWNSTREAM"); + +my @icols = split(":", $interval_cols); +pod2usage({ message => "Must specify 5 columns with -icol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#icols != 4); +my $ichrom = $icols[0]-1; +my $istart = $icols[1]-1; +my $iend = $icols[2]-1; +my $ilabel = $icols[3]-1; +my $istrand = $icols[4]-1; + +my @fcols = split(":", $feature_cols); +pod2usage({ message => "Must specify 5 columns with -fcol, chrom:start:end:label:strand.\n", exitval => 2}) if ($#fcols != 4); +my $fchrom = $fcols[0]-1; +my $fstart = $fcols[1]-1; +my $fend = $fcols[2]-1; +my $flabel = $fcols[3]-1; +my $fstrand = $fcols[4]-1; + + +my $feature_hash_ref = &feature_index(\$features, \$window, \$fchrom, \$fstart, \$fend, \$debug); +&process_intervals(\$intervals, \$out, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance, \$include_capture, \$promoter); +exit(0); + +
