|
0
|
1 #!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
|
|
|
2 #
|
|
|
3 #------------------------------------------------------------------------------
|
|
|
4 # University of Minnesota
|
|
|
5 # Copyright 2010, Regents of the University of Minnesota
|
|
|
6 #------------------------------------------------------------------------------
|
|
|
7 # Author:
|
|
|
8 #
|
|
|
9 # Jesse Erdmann
|
|
|
10 #
|
|
|
11 # POD documentation
|
|
|
12 #------------------------------------------------------------------------------
|
|
|
13 =pod BEGIN
|
|
|
14
|
|
|
15 =head1 NAME
|
|
|
16
|
|
|
17 feature_finder.pl - find features nearest to another set of features within a given window.
|
|
|
18
|
|
|
19 =head1 SYNOPSIS
|
|
|
20
|
|
|
21 feature_finder.pl [-help] [-intervals interval_file] [-icols interval_column_config]
|
|
|
22 [-features feature_file] [-fcols feature_column_config]
|
|
|
23 [-results ALL|CLOSEST] [-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM]
|
|
|
24 [-window win_size] [-output output_file] [-html]
|
|
|
25 [-db db_name] [-cturl custom_track_url] [-include_capture]
|
|
|
26 [-promoter PROMOTER_BP]
|
|
|
27
|
|
|
28 =head1 OPTIONS
|
|
|
29
|
|
|
30 =over 6
|
|
|
31
|
|
|
32 =item B<-help>
|
|
|
33
|
|
|
34 Print a usage summary
|
|
|
35
|
|
|
36 =item B<-intervals interval_file>
|
|
|
37
|
|
|
38 Specify the file containing the intervals to find the nearest features for,
|
|
|
39 default settings require a BED compatible file
|
|
|
40
|
|
|
41 =item B<-features feature_file>
|
|
|
42
|
|
|
43 Specify the file containing the features that will be mapped against, default
|
|
|
44 settings require a BED compatible file
|
|
|
45
|
|
|
46 =item B<-results ALL|CLOSEST>
|
|
|
47
|
|
|
48 The type of results to be reported. ALL will return all features within the
|
|
|
49 specified window size from each interval in the file including any overlapping
|
|
|
50 features. CLOSEST will return the closest feature within the window, including
|
|
|
51 overlapping features. The special cases ENCOMPASSING FEATURE and
|
|
|
52 INTERNAL FEATURE override the closest distance. The first encountered
|
|
|
53 ENCOMPASSING FEATURE will be returned in CLOSEST mode. If no ENCOMPASSING
|
|
|
54 FEATURES exist, the first encountered INTERNAL FEATURE will be returned. If
|
|
|
55 neither exists, the closest distance is used. The closest distance is determined
|
|
|
56 by the shortest distance between either end of the interval and the feature
|
|
|
57 (e.g. start-start, start-end, end-start, end-end).
|
|
|
58
|
|
|
59 The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE. An
|
|
|
60 encompassing feature is one which starts before an interval and ends
|
|
|
61 after the interval. An internal feature is opposite where the feature
|
|
|
62 is entirely within the interval.
|
|
|
63
|
|
|
64 DEFAULT:ALL
|
|
|
65
|
|
|
66 =item B<-direction BOTH|EITHER|UPSTREAM|DOWNSTREAM>
|
|
|
67
|
|
|
68 Direction is only used when CLOSEST is specified as the result type. BOTH
|
|
|
69 will return both the closest upstream and downstream feature in the window.
|
|
|
70 EITHER will return the closest regardless of whether it is upstream or
|
|
|
71 downstream. DOWNSTREAM will return the closest feature with a midpoint to
|
|
|
72 the right of midpoint of a + strand interval or the left of a - strand
|
|
|
73 interval. UPSTREAM is just the opposite.
|
|
|
74
|
|
|
75 DEFAULT:BOTH
|
|
|
76
|
|
|
77 =item B<-window win_size>
|
|
|
78
|
|
|
79 Window size is the number of base pairs that will be considered upstream
|
|
|
80 and downstream of an interval in which to find features. This also affects
|
|
|
81 the performance of indexing and searching. Minimal testing suggests
|
|
|
82 50000 bp to be roughly the optimal size, at least for Mus Musculus mm9/ncbim37.
|
|
|
83
|
|
|
84 MINIMUM:2500 DEFAULT:2500
|
|
|
85
|
|
|
86 =item B<-output output_file>
|
|
|
87
|
|
|
88 The file to report results to. The output format is a tab delimited file
|
|
|
89 starting with the original line from the interval file followed by a list
|
|
|
90 of matching features. Features are reported in the form of their name
|
|
|
91 followed by the distance away in bp ascending from shortest distance to
|
|
|
92 longest. e.g:
|
|
|
93
|
|
|
94 MY_INTERVAL_INFO ENS000001(20 bp distant) ENS000002 (350 bp distant) ...
|
|
|
95
|
|
|
96 The distance may also be ENCOMPASSING FEATURE or INTERNAL FEATURE. An
|
|
|
97 encompassing feature is one which starts before an interval and ends
|
|
|
98 after the interval. An internal feature is opposite where the feature
|
|
|
99 is entirely within the interval.
|
|
|
100
|
|
|
101 DEFAULT:OFF
|
|
|
102
|
|
|
103 =item B<-include_capture>
|
|
|
104
|
|
|
105 Including capture information adds three columns to the output from
|
|
|
106 feature_finder. These are the start and end location to capture all
|
|
|
107 features found within the given window and the total distance in bp
|
|
|
108 required to capture the entire set.
|
|
|
109
|
|
|
110 =item B<-promoter PROMOTER_BP>
|
|
|
111
|
|
|
112 The start and end of the capture region will be modified to include
|
|
|
113 the specified amount of the promoter region depending on the orientation
|
|
|
114 of the feature. '+' strand features modify the start position while '-'
|
|
|
115 strand features modify the end position.
|
|
|
116
|
|
|
117 DEFAULT:0
|
|
|
118
|
|
|
119 =item B<-icols interval_chrom_column:start:end:label:strand>
|
|
|
120
|
|
|
121 If the interval file is not in BED format, but still tab delimited -icols
|
|
|
122 can be used to specify which columns in the file contain specific
|
|
|
123 information. Column numbering begins with 1.
|
|
|
124
|
|
|
125 DEFAULT:1:2:3:4:6
|
|
|
126
|
|
|
127 =item B<-fcols feature_chrom_column:start:end:label:strand>
|
|
|
128
|
|
|
129 If the feature file is not in BED format, but still tab delimited -fcols
|
|
|
130 can be used to specify which columns in the file contain specific
|
|
|
131 information. Column numbering begins with 1.
|
|
|
132
|
|
|
133 DEFAULT:1:2:3:4:6
|
|
|
134
|
|
|
135 =item B<-html>
|
|
|
136
|
|
|
137 The output will be an HTML file with links to the features and
|
|
|
138 the interval at the UCSC browser.
|
|
|
139
|
|
|
140 =item B<-db db_name>
|
|
|
141
|
|
|
142 Set the database for the organism to be used in the UCSC genome browser.
|
|
|
143 Only used if an HTML output is specified.
|
|
|
144
|
|
|
145 DEFAULT:mm9
|
|
|
146
|
|
|
147 =item B<-cturl custom_track_url>
|
|
|
148
|
|
|
149 Define the URL to be used by the UCSC Genome Browser to load the custom
|
|
|
150 tracks for display purposes. If left blank the URLs will show the windows
|
|
|
151 defined by the input intervals, however the intervals themselves will not
|
|
|
152 appear. Only used if an HTML output is specified.
|
|
|
153
|
|
|
154 =back
|
|
|
155
|
|
|
156 =head1 BUGS
|
|
|
157
|
|
|
158 =head1 REFERENCES
|
|
|
159
|
|
|
160 =head1 VERSION
|
|
|
161
|
|
|
162 0.1
|
|
|
163
|
|
|
164 =cut
|
|
|
165
|
|
|
166 #### END of POD documentation.
|
|
|
167 #-----------------------------------------------------------------------------
|
|
|
168
|
|
|
169 use strict;
|
|
|
170 use Getopt::Long;
|
|
|
171 use Pod::Usage;
|
|
|
172 use List::Util qw[min max];
|
|
|
173
|
|
|
# Work out the directory this script was started from and load the companion
# file feature_finder_methods.pl from that same directory.
# NOTE(review): that file presumably defines feature_index() and
# process_intervals(), which are called further down -- confirm.
#
# dirname() replaces the former s/\/\w*\.pl$// substitution. The regex only
# matched when $0 contained a slash and the script name consisted solely of
# \w characters, so running "perl feature_finder.pl" from the script's own
# directory left $path equal to the script name and the require failed.
# dirname() returns "." in that case, giving "./feature_finder_methods.pl".
use File::Basename qw(dirname);

my $path = dirname($0);
require "$path/feature_finder_methods.pl";
|
|
|
177
|
|
|
# ---------------------------------------------------------------------------
# Command-line configuration.
#
# Each variable below holds the value of one option (see the POD at the top
# of the file for the user-facing description) and is initialised with that
# option's default. GetOptions() overwrites them in place.
# ---------------------------------------------------------------------------
my $intervals;                         # -intervals: interval file
my $features;                          # -features:  feature file
my $req_res_type    = "All";           # -results:   "All" or "Closest"
my $direction       = "Both";          # -direction: "Upstream", "Downstream", "Both", "Either"
my $out;                               # -output:    result file
my $window          = 2500;            # -window:    window size in bp (minimum 2500)
my %feature_hash;
my $help_flag;                         # -help
my $interval_cols   = "1:2:3:4:6";     # -icols: chrom:start:end:label:strand (1-based)
my $feature_cols    = "1:2:3:4:6";     # -fcols: chrom:start:end:label:strand (1-based)
my $html;                              # -html:  flag, takes no value
my $db              = "mm9";           # -db
my $cturl;                             # -cturl
my $debug           = 0;               # -debug
my $include_capture = 0;               # -include_capture
my $promoter        = 0;               # -promoter: integer number of bp

# Getopt::Long specification: option name(s) and type => destination variable.
# -window is declared as an integer (=i), like -promoter, so that a
# non-numeric value is rejected by GetOptions itself instead of reaching the
# numeric comparison below as a string.
my %options = (
    "direction|d=s"      => \$direction,
    "intervals|i=s"      => \$intervals,
    "features|f=s"       => \$features,
    "results|r=s"        => \$req_res_type,
    "output|o=s"         => \$out,
    "window|w=i"         => \$window,
    "icols=s"            => \$interval_cols,
    "fcols=s"            => \$feature_cols,
    "help|h"             => \$help_flag,
    "html"               => \$html,
    "db=s"               => \$db,
    "cturl=s"            => \$cturl,
    "include_capture|ic" => \$include_capture,
    "promoter|p=i"       => \$promoter,
    "debug"              => \$debug,
);

# Unknown or malformed options print the usage text and exit with status 2;
# -help prints it and exits with status 1.
GetOptions(%options) or pod2usage(2);
pod2usage(1) if $help_flag;

# --- Value validation (all comparisons are case-insensitive) ---------------

pod2usage({
    message => "The window specified with -window must be at least 2500.\n",
    exitval => 2,
}) if ($window < 2500);

# The documented result types are ALL and CLOSEST. This check was previously
# commented out because it referred to undeclared variables.
my %is_valid_result_type = map { $_ => 1 } qw(ALL CLOSEST);
pod2usage({
    message => "Result type specified with -results must be either ALL or CLOSEST.\n",
    exitval => 2,
}) unless $is_valid_result_type{ uc $req_res_type };

my %is_valid_direction = map { $_ => 1 } qw(BOTH EITHER UPSTREAM DOWNSTREAM);
pod2usage({
    message => "Direction specified with -direction must be one of BOTH, EITHER, UPSTREAM or DOWNSTREAM.\n",
    exitval => 2,
}) unless $is_valid_direction{ uc $direction };
|
|
|
219
|
|
|
# ---------------------------------------------------------------------------
# Column configuration.
#
# -icols / -fcols are colon-separated lists of five 1-based column numbers in
# the order chrom:start:end:label:strand. They are split here and converted
# to 0-based array indices.
# ---------------------------------------------------------------------------

# Split one column specification and validate it.
#   $spec        - the raw "c:s:e:l:st" string
#   $option_name - option name used in error messages (e.g. "-icols")
# Returns the five 1-based column numbers. Prints usage and exits with
# status 2 if there are not exactly five fields or if any field is not a
# positive integer. Without the second check a 0, an empty field or text
# would silently become a negative index and select the wrong column.
my $split_column_spec = sub {
    my ($spec, $option_name) = @_;
    my @columns = split(":", $spec);

    pod2usage({
        message => "Must specify 5 columns with $option_name, chrom:start:end:label:strand.\n",
        exitval => 2,
    }) if (@columns != 5);

    pod2usage({
        message => "Column numbers given with $option_name must be positive integers (numbering begins with 1).\n",
        exitval => 2,
    }) if grep { !/\A[1-9][0-9]*\z/ } @columns;

    return @columns;
};

# Interval file columns: @icols keeps the 1-based numbers, the scalars hold
# the corresponding 0-based indices.
my @icols = $split_column_spec->($interval_cols, "-icols");
my ($ichrom, $istart, $iend, $ilabel, $istrand) = map { $_ - 1 } @icols;

# Feature file columns, same layout.
my @fcols = $split_column_spec->($feature_cols, "-fcols");
my ($fchrom, $fstart, $fend, $flabel, $fstrand) = map { $_ - 1 } @fcols;
|
|
|
235
|
|
|
236
|
|
|
# ---------------------------------------------------------------------------
# Main: index the feature file, then process the interval file against it.
#
# feature_index() and process_intervals() are not defined in this file.
# NOTE(review): they are presumably provided by feature_finder_methods.pl,
# which is loaded with require above -- confirm.
# Every setting is handed over as a reference; the value returned by
# feature_index() is passed on unchanged as the third argument of
# process_intervals().
# ---------------------------------------------------------------------------

# $display_distance was passed to process_intervals() without ever being
# declared, which is a compile-time error under "use strict".
# NOTE(review): no command-line option sets it; 0 is chosen to match the
# other flag defaults -- confirm the value process_intervals() expects.
my $display_distance = 0;

my $feature_hash_ref = feature_index(
    \$features, \$window, \$fchrom, \$fstart, \$fend, \$debug
);

process_intervals(
    \$intervals,     \$out,             $feature_hash_ref,
    \$req_res_type,  \$direction,       \$window,
    \$html,          \$cturl,           \$db,
    \$interval_cols, \$feature_cols,    \$debug,
    \$display_distance, \$include_capture, \$promoter
);

exit(0);
|
|
|
240
|
|
|
241
|