diff lib/cis_finder.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/cis_finder.pl	Fri Dec 09 11:56:56 2011 -0500
@@ -0,0 +1,242 @@
+#This script takes a set of chromosomes and positions from a text file and generates defined CIS'es
+#version 5.0 ALS 
+#Nov 15 2010
+
+#  Open the presorted NR chromosome file and load it into a hash.
+require 'config.pl';
+$boxSize = $ARGV[0];
+
+open OUT, "> CIS/cis$ARGV[0].txt";
+open SOURCE, "< CIS/nr.txt";
+while (defined($line = <SOURCE>)) {
+$count++; 
+chomp $line;
+@field= split(/\t/, $line);
+$hash[$count]= $field[1];
+$chrom[$count]= $field[0];
+$lib[$count]= $field[2];
+$orient[$count]=$field[5];
+}
+
+#Count the number of nr positions within a box the size of which is defined below and add to a parallel hash.
+foreach $n (1..$count) {
+$cis_count = 0;
+@lib_list = '';
+@pos_list = '';
+$lib_count = '';
+$orient_count = '0';
+$i =$n;
+#print "$n\n";
+$j = $hash["$n"]+$boxSize;
+#print "pos=>".$hash[$n]."\n";
+while (($hash["$n"] <= $hash["$i"]) and ($j > ($hash["$i"]+0)) and ($i <= $count)) {
+push(@lib_list,$lib["$i"]);
+push(@pos_list,$hash["$i"]);
+if ($orient[$i] eq "+") {
+$orient_count = $orient_count +1;
+}
+
+$i=$i+1;
+#print "$i\n";
+$cis_count=$cis_count+1;
+#print "$cis_count\n";
+
+}
+$cis[$n]=$cis_count;
+$strand[$n]=$orient_count;
+#print "Cis_count=$cis_count\t";
+#print @lib_list;
+#This section rather hackily counts the number of libraries represented in the CIS window.
+%seen=();
+@uniq = ();
+$libstring = '';
+foreach $item (@lib_list) {
+unless ($seen{$item}) {
+$seen{$item}= 1;
+push(@uniq,$item)
+}
+}
+#print "\n@uniq\n";
+
+$lib_count = -1;
+foreach $i (@uniq) {
+$lib_count++;
+$libstring = $libstring.$i.'::';
+}
+
+#print "Library_count=$lib_count";
+
+$lib[$n]=$lib_count;
+$lib_name[$n]=$libstring;
+#print "$cis[$n]\t$lib[$n]";
+
+#This section rather hackily counts the the number of regions represented in the CIS window.
+%seen=();
+@uniq = ();
+$posstring = '';
+foreach $item (@pos_list) {
+unless ($seen{$item}) {
+$seen{$item}= 1;
+push(@uniq,$item)
+}
+}
+#print "\n@uniq\n";
+
+$pos_count = -1;
+foreach $i (@uniq) {
+$pos_count++;
+$posstring = $posstring.$i.'::';
+}
+
+#print "Library_count=$lib_count";
+
+$pos[$n]=$pos_count;
+$pos_name[$n]=$posstring;
+#print "\t$pos[$n]\n";
+
+
+
+}
+
+
+$cis[0] = 0;
+
+# select peaks and export them with complex logic scheme.
+# update Sept22 seperate this into 4 cases.
+
+foreach $n (1..$count) {
+$prev = $n-1;
+$this = $n;
+$next = $n+1;
+#print "\t$ARGV[1]\n";
+
+
+if ($ARGV[1] eq library) {
+#print "$ARGV[1]\t$cis[$this]\t$lib[$n]\n";
+$cis[$this]=$lib[$this];
+$cis[$next]=$lib[$next];
+$cis[$prev]=$lib[$prev];
+}
+# outcome 1 no other cises are relevant i.e. nothing within + or - cis size.
+if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev]))  and (($hash[$next]-$hash[$this] > $boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){ 
+outputpart1();
+}
+
+# outcome 2 no other cis + cis size cis - cis exists and is smaller.
+if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){ 
+outputpart1();
+}
+
+# outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size.
+if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev]))  and ($hash[$next]-$hash[$this] >= 0)  and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){ 
+outputpart1();
+}
+
+# outcome 4 other cises on both sides.
+if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize)  and ($hash[$next]-$hash[$this] > 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) {
+outputpart1();
+}
+#print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n";
+}
+#hack to prevent permanent looping...observedon one occasion unsure why...
+$iter = 0;
+$count3 = 1;
+$count2 = 0;
+while (($count3 ne $count2) and ($iter < 100)){
+print "$iter";
+$iter++;
+close OUT;
+#print "$count3\t\t\t\t$count2\n\n";
+$count3 = $count2;
+$count2 = 0;
+%hash= ();
+%chrom= ();
+%cis=();
+%libraries=();
+%full_line=();
+#open OUT, "> cis2.txt";
+open SOURCE, "< CIS/cis$ARGV[0].txt";
+while (defined($line = <SOURCE>)) {
+$count2++; 
+chomp $line;
+$full_line[$count2]=$line;
+@field= split(/\t/, $line);
+$hash[$count2]= $field[1];
+$chrom[$count2]= $field[0];
+$cis[$count2]= $field[4];
+#$libraries[$count2]=$field[11];
+}
+close SOURCE;
+open OUT, "> CIS/cis$ARGV[0].txt";
+# select peaks and export them with complex logic scheme.
+# update Sept22 seperate this into 4 cases.
+
+foreach $n (1..$count2) {
+$prev = $n-1;
+$this = $n;
+$next = $n+1;
+
+# outcome 1 no other cises are relevant i.e. nothing within + or - cis size.
+if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev]))  and (($hash[$this]-$hash[$next] < -$boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){ 
+output();
+}
+
+# outcome 2 no other cis + cis size cis - cis exists and is smaller.
+if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){ 
+output();
+}
+
+# outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size.
+if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev]))  and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){ 
+output();
+}
+
+# outcome 4 other cises on both sides.
+if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) {
+output();
+}
+#print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n";
+}
+
+}
+close OUT;
+
+#Subroutines
+# provide data in the form of poisson('BoxSize','Total','count')
+sub outputpart1 {
+$pval = poissonBonf($boxSize,$count,$lib[$this]);
+$pvalTotal = poissonBonf($boxSize,$count,$cis[$this]);
+$pvalRegion = poissonBonf($boxSize,$count,$pos[$this]);
+if ($pval < $CIS_total_pvalue) {
+$end = $hash[$n]+$boxSize;
+print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$lib[$this]\t$boxSize\t$pval\t$cis[$this]\t$pvalTotal\t$pos[$this]\t$pvalRegion\t$lib_name[$n]\t$strand[$this]\n";           
+}
+}
+
+sub output {
+print OUT "$full_line[$n]\n";
+#if ($pval < 0.05) {
+#$end = $hash[$n]+$boxSize;
+#print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$cis[$this]\t$boxSize\t$pval\t$bonf\t$libraries[$n]\n";
+#}
+}
+sub poisson {
+return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2])));
+}
+
+sub poissonBonf {
+return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2])))*$_[1]);
+}
+
+sub fac {
+  my ($n) = @_;
+
+  if ($n < 2) {
+    return $n;
+  }
+  else {
+    return $n * fac($n-1);
+  }
+}
+
+