Mercurial > repos > jesse-erdmann > tapdance
view lib/cis_finder.pl @ 3:17ce4f3bffa2 default tip
Uploaded
author | jesse-erdmann |
---|---|
date | Tue, 24 Jan 2012 18:33:41 -0500 |
parents | 1437a2df99c0 |
children |
line wrap: on
line source
#This script takes a set of chromosomes and positions from a text file and generates defined CIS'es #version 5.0 ALS #Nov 15 2010 # Open the presorted NR chromosome file and load it into a hash. require 'config.pl'; $boxSize = $ARGV[0]; open OUT, "> CIS/cis$ARGV[0].txt"; open SOURCE, "< CIS/nr.txt"; while (defined($line = <SOURCE>)) { $count++; chomp $line; @field= split(/\t/, $line); $hash[$count]= $field[1]; $chrom[$count]= $field[0]; $lib[$count]= $field[2]; $orient[$count]=$field[5]; } #Count the number of nr positions within a box the size of which is defined below and add to a parallel hash. foreach $n (1..$count) { $cis_count = 0; @lib_list = ''; @pos_list = ''; $lib_count = ''; $orient_count = '0'; $i =$n; #print "$n\n"; $j = $hash["$n"]+$boxSize; #print "pos=>".$hash[$n]."\n"; while (($hash["$n"] <= $hash["$i"]) and ($j > ($hash["$i"]+0)) and ($i <= $count)) { push(@lib_list,$lib["$i"]); push(@pos_list,$hash["$i"]); if ($orient[$i] eq "+") { $orient_count = $orient_count +1; } $i=$i+1; #print "$i\n"; $cis_count=$cis_count+1; #print "$cis_count\n"; } $cis[$n]=$cis_count; $strand[$n]=$orient_count; #print "Cis_count=$cis_count\t"; #print @lib_list; #This section rather hackily counts the number of libraries represented in the CIS window. %seen=(); @uniq = (); $libstring = ''; foreach $item (@lib_list) { unless ($seen{$item}) { $seen{$item}= 1; push(@uniq,$item) } } #print "\n@uniq\n"; $lib_count = -1; foreach $i (@uniq) { $lib_count++; $libstring = $libstring.$i.'::'; } #print "Library_count=$lib_count"; $lib[$n]=$lib_count; $lib_name[$n]=$libstring; #print "$cis[$n]\t$lib[$n]"; #This section rather hackily counts the the number of regions represented in the CIS window. %seen=(); @uniq = (); $posstring = ''; foreach $item (@pos_list) { unless ($seen{$item}) { $seen{$item}= 1; push(@uniq,$item) } } #print "\n@uniq\n"; $pos_count = -1; foreach $i (@uniq) { $pos_count++; $posstring = $posstring.$i.'::'; } #print "Library_count=$lib_count"; $pos[$n]=$pos_count; $pos_name[$n]=$posstring; #print "\t$pos[$n]\n"; } $cis[0] = 0; # select peaks and export them with complex logic scheme. # update Sept22 seperate this into 4 cases. foreach $n (1..$count) { $prev = $n-1; $this = $n; $next = $n+1; #print "\t$ARGV[1]\n"; if ($ARGV[1] eq library) { #print "$ARGV[1]\t$cis[$this]\t$lib[$n]\n"; $cis[$this]=$lib[$this]; $cis[$next]=$lib[$next]; $cis[$prev]=$lib[$prev]; } # outcome 1 no other cises are relevant i.e. nothing within + or - cis size. if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and (($hash[$next]-$hash[$this] > $boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){ outputpart1(); } # outcome 2 no other cis + cis size cis - cis exists and is smaller. if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){ outputpart1(); } # outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size. if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){ outputpart1(); } # outcome 4 other cises on both sides. if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($hash[$next]-$hash[$this] > 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) { outputpart1(); } #print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n"; } #hack to prevent permanent looping...observedon one occasion unsure why... $iter = 0; $count3 = 1; $count2 = 0; while (($count3 ne $count2) and ($iter < 100)){ print "$iter"; $iter++; close OUT; #print "$count3\t\t\t\t$count2\n\n"; $count3 = $count2; $count2 = 0; %hash= (); %chrom= (); %cis=(); %libraries=(); %full_line=(); #open OUT, "> cis2.txt"; open SOURCE, "< CIS/cis$ARGV[0].txt"; while (defined($line = <SOURCE>)) { $count2++; chomp $line; $full_line[$count2]=$line; @field= split(/\t/, $line); $hash[$count2]= $field[1]; $chrom[$count2]= $field[0]; $cis[$count2]= $field[4]; #$libraries[$count2]=$field[11]; } close SOURCE; open OUT, "> CIS/cis$ARGV[0].txt"; # select peaks and export them with complex logic scheme. # update Sept22 seperate this into 4 cases. foreach $n (1..$count2) { $prev = $n-1; $this = $n; $next = $n+1; # outcome 1 no other cises are relevant i.e. nothing within + or - cis size. if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and (($hash[$this]-$hash[$next] < -$boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){ output(); } # outcome 2 no other cis + cis size cis - cis exists and is smaller. if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){ output(); } # outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size. if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){ output(); } # outcome 4 other cises on both sides. if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) { output(); } #print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n"; } } close OUT; #Subroutines # provide data in the form of poisson('BoxSize','Total','count') sub outputpart1 { $pval = poissonBonf($boxSize,$count,$lib[$this]); $pvalTotal = poissonBonf($boxSize,$count,$cis[$this]); $pvalRegion = poissonBonf($boxSize,$count,$pos[$this]); if ($pval < $CIS_total_pvalue) { $end = $hash[$n]+$boxSize; print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$lib[$this]\t$boxSize\t$pval\t$cis[$this]\t$pvalTotal\t$pos[$this]\t$pvalRegion\t$lib_name[$n]\t$strand[$this]\n"; } } sub output { print OUT "$full_line[$n]\n"; #if ($pval < 0.05) { #$end = $hash[$n]+$boxSize; #print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$cis[$this]\t$boxSize\t$pval\t$bonf\t$libraries[$n]\n"; #} } sub poisson { return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2]))); } sub poissonBonf { return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2])))*$_[1]); } sub fac { my ($n) = @_; if ($n < 2) { return $n; } else { return $n * fac($n-1); } }