Mercurial > repos > jesse-erdmann > tapdance
view lib/feature_finder_methods.pl @ 0:1437a2df99c0
Uploaded
author | jesse-erdmann |
---|---|
date | Fri, 09 Dec 2011 11:56:56 -0500 |
parents | |
children |
line wrap: on
line source
use strict;
use List::Util qw[min max];

# NOTE(review): "use warnings" is intentionally not switched on here because
# it would add new output on STDERR for inputs that currently run silently
# (e.g. non-numeric header rows); confirm how callers treat STDERR first.

# Result-type selectors understood by compare().
my $all = "ALL";
my $close = "CLOSEST";
my $first_idx = 0;

# Search directions relative to the interval midpoint.
my $left = -1;
my $right = 1;
my $both = 0;

# Suffix of the per-window "sorted start positions" entry in the index.
my $keys = "KEYS";

# Pseudo-distances. They sort below every real (non-negative) distance, so
# an encompassing feature beats an internal one, which beats any neighbour.
my $NOT_FOUND = -3;
my $NOT_FOUND_STR = "NOT FOUND";
my $ENCOMPASSING = -2;
#my $ENCOMPASSING_STR = "";
my $ENCOMPASSING_STR = "ENCOMPASSING FEATURE";
my $INTERNAL = -1;
#my $INTERNAL_STR = "";
my $INTERNAL_STR = "INTERNAL FEATURE";

# Build the timestamped progress line printed in debug mode.
sub _ff_progress_line {
    my ($label, $count) = @_;
    my ($sec, $minute, $hour, $day, $mon) = localtime(time);
    return sprintf("%02d-%02d %02d:%02d:%02d %s: %9s\n",
        ($mon + 1), $day, $hour, $minute, $sec, $label, $count);
}

# Return the value behind a scalar reference, or the argument itself when it
# is already a plain scalar (or undef).
sub _ff_scalar_value {
    my ($value) = @_;
    return ref($value) eq 'SCALAR' ? ${$value} : $value;
}

# Return (start, end) of a feature, extended by the promoter length on the
# 5' side: before the start for '+' features, after the end for '-' features.
sub _ff_capture_extent {
    my ($feature, $fstart, $fend, $fstrand, $promoter_len) = @_;
    my $strand = $feature->[${$fstrand}];
    my $start_pos = $feature->[${$fstart}];
    my $end_pos = $feature->[${$fend}];
    if ($strand eq '+') {
        $start_pos -= ${$promoter_len};
    }
    if ($strand eq '-') {
        $end_pos += ${$promoter_len};
    }
    return ($start_pos, $end_pos);
}

# Format one hit: just the label, or the label plus its relationship to the
# interval when detailed output was requested.
sub _ff_describe_match {
    my ($label, $distance, $dir_str, $with_detail) = @_;
    if (!$with_detail) {
        return sprintf("%s", $label);
    }
    if ($distance == $ENCOMPASSING) {
        return sprintf("%s (%s)", $label, $ENCOMPASSING_STR);
    }
    if ($distance == $INTERNAL) {
        return sprintf("%s (%s)", $label, $INTERNAL_STR);
    }
    return sprintf("%s (%s bp %s)", $label, $distance, $dir_str);
}

# Store a hit under $result->{distance}{label} unless the same
# interval/feature pair was already recorded. Returns 1 when stored.
sub _ff_record_match {
    my ($result, $seen, $reskey, $distance, $label, $feature, $dir_str) = @_;
    if (defined($seen->{$reskey})) {
        return 0;
    }
    $seen->{$reskey} = $feature;
    push @{ $result->{$distance}{$label} }, $feature, $dir_str;
    return 1;
}

# feature_index(\$file, \$window, \$chrom_col, \$start_col, \$end_col, \$debug)
#
# Reads a tab-separated features file and returns a hash reference shaped as
#   {chrom}{window_number}{feature_start} = [ \@row, ... ]
#   {chrom}{"<window_number>_KEYS"}       = [ feature starts, ascending ]
# A feature is filed under every window number between int(start/window) and
# int(end/window). All arguments are scalar references; the column arguments
# are 0-based field indexes. Dies when the file cannot be opened.
sub feature_index {
    my ($feature_file, $window, $fchrom_col, $fstart_col, $fend_col, $debug) = @_;

    open(my $features_fh, '<', ${$feature_file})
        || die "Unable to open features file, ${$feature_file}: $!\n";

    my %feature_hash = ();
    my $num_indexed = 0;
    while (my $line = <$features_fh>) {
        if (${$debug} && ($num_indexed % 1000) == 0) {
            print _ff_progress_line("Features Indexed", $num_indexed);
        }
        chomp $line;
        $num_indexed++;
        my @split_line = split(/\t/, $line);
        if (!@split_line) {
            next;
        }
        my $bin_a = int($split_line[${$fstart_col}] / ${$window});
        my $bin_b = int($split_line[${$fend_col}] / ${$window});
        # start/end may be given in either order
        for my $bin (min($bin_a, $bin_b) .. max($bin_a, $bin_b)) {
            push @{ $feature_hash{ $split_line[${$fchrom_col}] }{$bin}{ $split_line[${$fstart_col}] } },
                \@split_line;
        }
    }
    close($features_fh);
    if (${$debug}) {
        print "$num_indexed indexed\n";
    }

    my $min_keys_in_window;
    my $max_keys_in_window;
    my $num_idxs = 0;
    my $total_keys = 0;
    foreach my $chrom (keys %feature_hash) {
        my $windows = $feature_hash{$chrom};
        # keys() is evaluated once up front, so adding the *_KEYS entries
        # below does not disturb this loop.
        foreach my $bin (keys %{$windows}) {
            $num_idxs++;
            my @sorted_keys = sort { $a <=> $b } keys %{ $windows->{$bin} };
            my $num_keys = scalar @sorted_keys;
            $windows->{ join("_", ($bin, $keys)) } = \@sorted_keys;
            $total_keys += $num_keys;
            if (!defined($min_keys_in_window) || $num_keys < $min_keys_in_window) {
                $min_keys_in_window = $num_keys;
            }
            if (!defined($max_keys_in_window) || $num_keys > $max_keys_in_window) {
                $max_keys_in_window = $num_keys;
            }
        }
    }
    if (${$debug}) {
        if ($num_idxs > 0) {
            my $avg_keys = $total_keys / $num_idxs;
            print "Smallest # of keys in window: $min_keys_in_window, Largest: $max_keys_in_window $total_keys in $num_idxs windows for an average of $avg_keys\n";
        }
        else {
            # An empty features file used to die here with a division by zero.
            print "No feature windows were indexed\n";
        }
    }
    return \%feature_hash;
}

# process_intervals(\$intervals_file, \$out_file, $feature_index,
#                   \$result_type, \$direction, \$window, \$html, \$cturl,
#                   \$db, \$icols, \$fcols, \$debug, \$dd, \$capture, \$promoter)
#
# Annotates every row of a tab-separated intervals file with features from
# an index built by feature_index() and writes one output row per interval.
#   result_type  "ALL" reports every feature overlapping the padded window;
#                anything else reports the closest feature(s).
#   direction    "UPSTREAM", "DOWNSTREAM" or "BOTH" (closest mode only).
#   icols/fcols  "chrom:start:end:label:strand" as 1-based column numbers.
#   html         true selects print_res_html, false print_res.
# Intervals on a chromosome absent from the index produce no output row.
# Dies when a file cannot be opened or the output cannot be flushed.
sub process_intervals {
    my ($intervals_file, $out_file, $feature_hash_ref, $req_res_type_ref, $direction_ref, $window_ref, $html_ref, $cturl_ref, $db_ref, $icols_ref, $fcols_ref, $debug_ref, $dd_ref, $capture, $promoter) = @_;

    if (${$debug_ref}) {
        print sprintf("process_intervals(intervals:%s, out:%s, direction:%s, window:%s, icols_ref:%s)\n",
            ${$intervals_file}, ${$out_file}, ${$direction_ref}, ${$window_ref}, ${$icols_ref});
    }

    open(my $intervals_fh, '<', ${$intervals_file})
        || die "Unable to open intervals file, ${$intervals_file}: $!\n";
    open(my $out_h, '>', ${$out_file})
        || die "Unable to open output file, ${$out_file}: $!\n";

    # Column specs arrive 1-based; convert to 0-based array indexes.
    my ($ichrom, $istart, $iend, $ilabel, $istrand) = map { $_ - 1 } (split(":", ${$icols_ref}))[0 .. 4];
    my ($fchrom, $fstart, $fend, $flabel, $fstrand) = map { $_ - 1 } (split(":", ${$fcols_ref}))[0 .. 4];

    # Work on a private copy of the custom-track URL so the caller's value is
    # left alone. The substitution used to target an unrelated, undefined
    # variable and therefore never reached the URL.
    my $cturl;
    my $print = \&print_res;
    if (${$html_ref}) {
        $print = \&print_res_html;
        print {$out_h} "<HTML>\n<HEAD>\n\t<TITLE>Feature Finder Annotation</TITLE>\n</HEAD>\n<BODY>\n<TABLE>\n";
        $cturl = _ff_scalar_value($cturl_ref);
        if (defined($cturl)) {
            $cturl =~ s/X26/\%26/g;
        }
    }

    my $count = 0;
    # Stream the file instead of loading every line into memory first.
    while (my $line = <$intervals_fh>) {
        if (${$debug_ref}) {
            if (($count % 1000) == 0) {
                print _ff_progress_line("Intervals Processed", $count);
            }
            $count++;
        }
        chomp $line;
        my @split_line = split(/\t/, $line);
        if (!@split_line) {
            next;
        }
        if (!defined($feature_hash_ref->{ $split_line[$ichrom] })) {
            next;
        }

        # Pad the interval by the window and work out which index windows to
        # scan (one extra window on each side).
        my $win_start = min($split_line[$istart], $split_line[$iend]) - ${$window_ref};
        if ($win_start < 0) {
            $win_start = 0;
        }
        my $start_idx = int($win_start / ${$window_ref}) - 1;
        if ($start_idx < 0) {
            $start_idx = 0;
        }
        my $win_end = max($split_line[$istart], $split_line[$iend]) + ${$window_ref};
        my $end_idx = int($win_end / ${$window_ref}) + 1;

        my $search = sub {
            my ($type_ref, $side_ref) = @_;
            return compare(\$out_h, $feature_hash_ref, $type_ref, \@split_line, \$split_line[$ichrom],
                \$win_start, \$win_end, \$start_idx, \$end_idx, $side_ref,
                \$ichrom, \$istart, \$iend, \$ilabel, \$istrand,
                \$fchrom, \$fstart, \$fend, \$flabel, $debug_ref);
        };
        my $emit = sub {
            my ($res_ref) = @_;
            $print->(\$out_h, \@split_line, \$istart, \$iend, $res_ref,
                \$flabel, \$fstart, \$fend, \$fstrand, $window_ref, $promoter, $capture,
                $dd_ref, $db_ref, \$cturl, \$ichrom);
        };

        if (uc ${$req_res_type_ref} eq $all) {
            $emit->($search->(\$all, \$both));
            next;
        }

        my $strand = $split_line[$istrand];
        my $wanted = uc ${$direction_ref};
        if ($strand ne "+" && $strand ne "-") {
            warn sprintf("Found \"%s\" as the strand of %s:%s-%s %s, please verify icols match your input data\n",
                $strand, $split_line[$ichrom], $split_line[$istart], $split_line[$iend], $split_line[$ilabel]);
        }

        my %left_res = ();
        my %right_res = ();
        if (($strand eq "+" && $wanted ne "DOWNSTREAM") || ($strand eq "-" && $wanted ne "UPSTREAM")) {
            %left_res = %{ $search->(\$close, \$left) };
        }
        if (($strand eq "-" && $wanted ne "DOWNSTREAM") || ($strand eq "+" && $wanted ne "UPSTREAM")) {
            %right_res = %{ $search->(\$close, \$right) };
        }

        if (keys %left_res == 0) {
            $emit->(\%right_res);
        }
        elsif (keys %right_res == 0) {
            $emit->(\%left_res);
        }
        elsif ($wanted eq "BOTH") {
            # Merge label by label: a plain slice assignment keyed on the
            # distance dropped the left hit whenever both sides tied.
            foreach my $distance (keys %right_res) {
                foreach my $label (keys %{ $right_res{$distance} }) {
                    $left_res{$distance}{$label} = $right_res{$distance}{$label};
                }
            }
            $emit->(\%left_res);
        }
        else {
            # In closest mode each side holds exactly one distance key.
            my @left_keys = keys %left_res;
            my @right_keys = keys %right_res;
            if ($left_keys[0] <= $right_keys[0]) {
                $emit->(\%left_res);
            }
            elsif ($right_keys[0] < $left_keys[0]) {
                $emit->(\%right_res);
            }
        }
    }
    close($intervals_fh);

    if (${$html_ref}) {
        print {$out_h} "</TABLE>\n</BODY>\n</HTML>\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close($out_h) || die "Unable to close output file, ${$out_file}: $!\n";
}

# compare(...)
#
# Scans index windows ${$start_idx}..${$end_idx} on chromosome ${$chrom} and
# returns a hash reference {distance}{feature_label} = [ \@feature_row, dir ],
# where dir is "UPSTREAM" or "DOWNSTREAM" and distance is $ENCOMPASSING,
# $INTERNAL or the smallest gap between any interval end and feature end.
#   ${$res_type} eq "CLOSEST": keeps only the single best (smallest) distance
#       on the side chosen by ${$direction} ($left, $right or $both); the
#       first feature found at that distance wins.
#   otherwise: keeps every feature overlapping window_start..window_end.
# $out_h_ref and $debug are accepted for interface compatibility only.
sub compare {
    my ($out_h_ref, $feature_hash_ref, $res_type, $interval_array_ref, $chrom, $window_start, $window_end, $start_idx, $end_idx, $direction, $ichrom, $istart, $iend, $ilabel, $istrand, $fchrom, $fstart, $fend, $flabel, $debug) = @_;

    my %result = ();
    my %reshash = ();    # interval/feature pairs already recorded
    my $best_distance = $NOT_FOUND;

    # Everything about the interval is constant for the whole scan.
    my $raw_start = $interval_array_ref->[${$istart}];
    my $raw_end = $interval_array_ref->[${$iend}];
    my $interval_start = min($raw_start, $raw_end);
    my $interval_end = max($raw_start, $raw_end);
    my $interval_midp = ($raw_start + $raw_end) / 2;
    my $on_plus = ($interval_array_ref->[${$istrand}] eq "+");
    my $on_minus = ($interval_array_ref->[${$istrand}] eq "-");
    my $closest_only = (${$res_type} eq $close);

    my $windows = $feature_hash_ref->{ ${$chrom} };
    for my $interval_idx (${$start_idx} .. ${$end_idx}) {
        my $feature_keys = $windows->{ join("_", ($interval_idx, $keys)) };
        if (!defined($feature_keys)) {
            next;
        }
        foreach my $feature_idx (@{$feature_keys}) {
            foreach my $feature (@{ $windows->{$interval_idx}->{$feature_idx} }) {
                my $feature_start = min($feature->[${$fstart}], $feature->[${$fend}]);
                my $feature_end = max($feature->[${$fstart}], $feature->[${$fend}]);
                my $label = $feature->[${$flabel}];
                my $reskey = join(":", ($interval_array_ref->[${$ichrom}], ${$window_start}, ${$window_end},
                    $interval_array_ref->[${$ilabel}], $feature->[${$fchrom}], $feature_start, $feature_end, $label));

                my $distance;
                if ($feature_start <= $interval_start && $feature_end >= $interval_end) {
                    $distance = $ENCOMPASSING;
                }
                elsif ($interval_start < $feature_start && $interval_end > $feature_end) {
                    $distance = $INTERNAL;
                }
                else {
                    $distance = min(abs($raw_start - $feature_start), abs($raw_start - $feature_end),
                        abs($raw_end - $feature_start), abs($raw_end - $feature_end));
                }

                # A feature at or left of the interval midpoint is upstream of
                # a '+' interval; to the right it is upstream of a '-' one.
                my $feature_midp = ($feature_start + $feature_end) / 2;
                my $feature_is_left = ($interval_midp >= $feature_midp);
                my $dir_str = "DOWNSTREAM";
                if ($feature_is_left ? $on_plus : $on_minus) {
                    $dir_str = "UPSTREAM";
                }

                if ($closest_only) {
                    if (${$direction} == $left && !$feature_is_left) {
                        next;
                    }
                    if (${$direction} == $right && $feature_is_left) {
                        next;
                    }
                    # Only a strictly better distance replaces the current best.
                    if ($best_distance > $NOT_FOUND && $distance >= $best_distance) {
                        next;
                    }
                    if (defined($reshash{$reskey})) {
                        next;
                    }
                    %result = ();
                    _ff_record_match(\%result, \%reshash, $reskey, $distance, $label, $feature, $dir_str);
                    $best_distance = $distance;
                }
                elsif (${$window_start} <= $feature_end && ${$window_end} >= $feature_start) {
                    # The feature overlaps the padded interval, so it is included.
                    _ff_record_match(\%result, \%reshash, $reskey, $distance, $label, $feature, $dir_str);
                }
            }
        }
    }
    return \%result;
}

# print_res(\$out_h, \@interval_row, \$istart, \$iend, \%result, \$flabel,
#           \$fstart, \$fend, \$fstrand, \$window, \$promoter_len,
#           \$capture, \$dd)
#
# Writes one tab-separated row: the interval's own columns, then a column
# listing the hits (each followed by ", ") in ascending distance order, or
# "No results within <window> bp window" when %result is empty. With
# ${$dd} true each hit also shows its distance/relationship. With
# ${$capture} true three more columns give the left end, right end and
# length of the region spanning the interval and every hit, with each hit
# extended by ${$promoter_len} on its 5' side.
sub print_res {
    my ($out_h_ref, $interval_array_ref, $istart, $iend, $result_hash_ref, $flabel, $fstart, $fend, $fstrand, $window, $promoter_len, $capture, $dd_ref) = @_;

    my $out_h = ${$out_h_ref};
    print {$out_h} join("\t", @{$interval_array_ref});

    my @sorted_keys = sort { $a <=> $b } keys %{$result_hash_ref};
    if (!@sorted_keys) {
        print {$out_h} "\tNo results within ${$window} bp window", "\n";
        return;
    }

    print {$out_h} "\t";
    my $capturing = (defined($capture) && ${$capture});
    my $left_end = $interval_array_ref->[${$istart}];
    my $right_end = $interval_array_ref->[${$iend}];
    foreach my $distance (@sorted_keys) {
        my $matches = $result_hash_ref->{$distance};
        foreach my $label (keys %{$matches}) {
            my ($feature, $dir_str) = @{ $matches->{$label} };
            if ($capturing) {
                my ($fstart_pos, $fend_pos) = _ff_capture_extent($feature, $fstart, $fend, $fstrand, $promoter_len);
                if ($fstart_pos < $left_end) {
                    $left_end = $fstart_pos;
                }
                if ($fend_pos > $right_end) {
                    $right_end = $fend_pos;
                }
            }
            print {$out_h} _ff_describe_match($label, $distance, $dir_str, ${$dd_ref}), ", ";
        }
    }
    if ($capturing) {
        print {$out_h} sprintf("\t%s\t%s\t%s", $left_end, $right_end, $right_end - $left_end);
    }
    print {$out_h} "\n";
    return;
}

# print_res_html(\$out_h, \@interval_row, \$istart, \$iend, \%result,
#                \$flabel, \$fstart, \$fend, \$fstrand, \$window,
#                \$promoter_len, \$capture, \$dd, \$db, \$cturl, \$ichrom)
#
# HTML counterpart of print_res: writes one <TR> whose first cell links to
# the UCSC genome browser for the interval padded by the window, followed by
# one cell per interval column, one cell per hit and, when capturing, the
# three capture cells. $window, $db and $cturl may be scalar references (as
# process_intervals passes them) or plain scalars; the custom-track
# parameter is only added to the link when a URL is actually defined.
sub print_res_html {
    my ($out_h_ref, $interval_array_ref, $istart, $iend, $result_hash_ref, $flabel, $fstart, $fend, $fstrand, $window, $promoter_len, $capture, $dd_ref, $db, $cturl, $ichrom) = @_;

    my $out_h = ${$out_h_ref};
    # These arrive as references; using them directly produced address
    # arithmetic and "SCALAR(0x...)" text in the generated page.
    my $window_bp = _ff_scalar_value($window);
    my $db_name = _ff_scalar_value($db);
    my $track_url = _ff_scalar_value($cturl);
    my $capturing = (defined($capture) && ${$capture});

    my $raw_start = $interval_array_ref->[${$istart}];
    my $raw_end = $interval_array_ref->[${$iend}];
    my $win_start = min($raw_start, $raw_end) - $window_bp;
    my $win_end = max($raw_start, $raw_end) + $window_bp;

    print {$out_h} "<TR>\n\t<TD>";
    print {$out_h} sprintf("<A HREF=\"http://genome.ucsc.edu/cgi-bin/hgTracks?db=%s&position=%s:%s-%s&display_app=ucsc&authz_method=display_at",
        $db_name, $interval_array_ref->[${$ichrom}], $win_start, $win_end);
    if (defined($track_url)) {
        print {$out_h} sprintf("&hgt.customText=%s", $track_url);
    }
    print {$out_h} "\">UCSC Visualization</A></TD><TD>";
    # Close the last interval cell so the result cells are siblings of it.
    print {$out_h} join("</TD><TD>", @{$interval_array_ref}), "</TD>";

    my @sorted_keys = sort { $a <=> $b } keys %{$result_hash_ref};
    if (@sorted_keys) {
        my $left_end = $raw_start;
        my $right_end = $raw_end;
        foreach my $distance (@sorted_keys) {
            my $matches = $result_hash_ref->{$distance};
            foreach my $label (keys %{$matches}) {
                # Element 0 is the feature row, element 1 the direction text.
                my ($feature, $dir_str) = @{ $matches->{$label} };
                if ($capturing) {
                    my ($fstart_pos, $fend_pos) = _ff_capture_extent($feature, $fstart, $fend, $fstrand, $promoter_len);
                    if ($fstart_pos < $left_end) {
                        $left_end = $fstart_pos;
                    }
                    if ($fend_pos > $right_end) {
                        $right_end = $fend_pos;
                    }
                }
                print {$out_h} "<TD>", _ff_describe_match($label, $distance, $dir_str, ${$dd_ref}), "</TD>";
            }
        }
        if ($capturing) {
            print {$out_h} sprintf("<TD>%s</TD><TD>%s</TD><TD>%s</TD>", $left_end, $right_end, $right_end - $left_end);
        }
    }
    else {
        print {$out_h} "<TD>No results within $window_bp bp window</TD>";
    }
    print {$out_h} "\n</TR>\n";
    return;
}

1;