0
|
1 #This script takes a set of chromosomes and positions from a text file and generates defined CIS'es
|
|
2 #version 5.0 ALS
|
|
3 #Nov 15 2010
|
|
4
|
|
5 # Open the presorted NR chromosome file and load it into a hash.
|
|
6 require 'config.pl';
|
|
7 $boxSize = $ARGV[0];
|
|
8
|
|
9 open OUT, "> CIS/cis$ARGV[0].txt";
|
|
10 open SOURCE, "< CIS/nr.txt";
|
|
11 while (defined($line = <SOURCE>)) {
|
|
12 $count++;
|
|
13 chomp $line;
|
|
14 @field= split(/\t/, $line);
|
|
15 $hash[$count]= $field[1];
|
|
16 $chrom[$count]= $field[0];
|
|
17 $lib[$count]= $field[2];
|
|
18 $orient[$count]=$field[5];
|
|
19 }
|
|
20
|
|
21 #Count the number of nr positions within a box the size of which is defined below and add to a parallel hash.
|
|
22 foreach $n (1..$count) {
|
|
23 $cis_count = 0;
|
|
24 @lib_list = '';
|
|
25 @pos_list = '';
|
|
26 $lib_count = '';
|
|
27 $orient_count = '0';
|
|
28 $i =$n;
|
|
29 #print "$n\n";
|
|
30 $j = $hash["$n"]+$boxSize;
|
|
31 #print "pos=>".$hash[$n]."\n";
|
|
32 while (($hash["$n"] <= $hash["$i"]) and ($j > ($hash["$i"]+0)) and ($i <= $count)) {
|
|
33 push(@lib_list,$lib["$i"]);
|
|
34 push(@pos_list,$hash["$i"]);
|
|
35 if ($orient[$i] eq "+") {
|
|
36 $orient_count = $orient_count +1;
|
|
37 }
|
|
38
|
|
39 $i=$i+1;
|
|
40 #print "$i\n";
|
|
41 $cis_count=$cis_count+1;
|
|
42 #print "$cis_count\n";
|
|
43
|
|
44 }
|
|
45 $cis[$n]=$cis_count;
|
|
46 $strand[$n]=$orient_count;
|
|
47 #print "Cis_count=$cis_count\t";
|
|
48 #print @lib_list;
|
|
49 #This section rather hackily counts the number of libraries represented in the CIS window.
|
|
50 %seen=();
|
|
51 @uniq = ();
|
|
52 $libstring = '';
|
|
53 foreach $item (@lib_list) {
|
|
54 unless ($seen{$item}) {
|
|
55 $seen{$item}= 1;
|
|
56 push(@uniq,$item)
|
|
57 }
|
|
58 }
|
|
59 #print "\n@uniq\n";
|
|
60
|
|
61 $lib_count = -1;
|
|
62 foreach $i (@uniq) {
|
|
63 $lib_count++;
|
|
64 $libstring = $libstring.$i.'::';
|
|
65 }
|
|
66
|
|
67 #print "Library_count=$lib_count";
|
|
68
|
|
69 $lib[$n]=$lib_count;
|
|
70 $lib_name[$n]=$libstring;
|
|
71 #print "$cis[$n]\t$lib[$n]";
|
|
72
|
|
73 #This section rather hackily counts the the number of regions represented in the CIS window.
|
|
74 %seen=();
|
|
75 @uniq = ();
|
|
76 $posstring = '';
|
|
77 foreach $item (@pos_list) {
|
|
78 unless ($seen{$item}) {
|
|
79 $seen{$item}= 1;
|
|
80 push(@uniq,$item)
|
|
81 }
|
|
82 }
|
|
83 #print "\n@uniq\n";
|
|
84
|
|
85 $pos_count = -1;
|
|
86 foreach $i (@uniq) {
|
|
87 $pos_count++;
|
|
88 $posstring = $posstring.$i.'::';
|
|
89 }
|
|
90
|
|
91 #print "Library_count=$lib_count";
|
|
92
|
|
93 $pos[$n]=$pos_count;
|
|
94 $pos_name[$n]=$posstring;
|
|
95 #print "\t$pos[$n]\n";
|
|
96
|
|
97
|
|
98
|
|
99 }
|
|
100
|
|
101
|
|
102 $cis[0] = 0;
|
|
103
|
|
104 # select peaks and export them with complex logic scheme.
|
|
105 # update Sept22 seperate this into 4 cases.
|
|
106
|
|
107 foreach $n (1..$count) {
|
|
108 $prev = $n-1;
|
|
109 $this = $n;
|
|
110 $next = $n+1;
|
|
111 #print "\t$ARGV[1]\n";
|
|
112
|
|
113
|
|
114 if ($ARGV[1] eq library) {
|
|
115 #print "$ARGV[1]\t$cis[$this]\t$lib[$n]\n";
|
|
116 $cis[$this]=$lib[$this];
|
|
117 $cis[$next]=$lib[$next];
|
|
118 $cis[$prev]=$lib[$prev];
|
|
119 }
|
|
120 # outcome 1 no other cises are relevant i.e. nothing within + or - cis size.
|
|
121 if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and (($hash[$next]-$hash[$this] > $boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){
|
|
122 outputpart1();
|
|
123 }
|
|
124
|
|
125 # outcome 2 no other cis + cis size cis - cis exists and is smaller.
|
|
126 if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){
|
|
127 outputpart1();
|
|
128 }
|
|
129
|
|
130 # outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size.
|
|
131 if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){
|
|
132 outputpart1();
|
|
133 }
|
|
134
|
|
135 # outcome 4 other cises on both sides.
|
|
136 if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($hash[$next]-$hash[$this] > 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) {
|
|
137 outputpart1();
|
|
138 }
|
|
139 #print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n";
|
|
140 }
|
|
141 #hack to prevent permanent looping...observedon one occasion unsure why...
|
|
142 $iter = 0;
|
|
143 $count3 = 1;
|
|
144 $count2 = 0;
|
|
145 while (($count3 ne $count2) and ($iter < 100)){
|
|
146 print "$iter";
|
|
147 $iter++;
|
|
148 close OUT;
|
|
149 #print "$count3\t\t\t\t$count2\n\n";
|
|
150 $count3 = $count2;
|
|
151 $count2 = 0;
|
|
152 %hash= ();
|
|
153 %chrom= ();
|
|
154 %cis=();
|
|
155 %libraries=();
|
|
156 %full_line=();
|
|
157 #open OUT, "> cis2.txt";
|
|
158 open SOURCE, "< CIS/cis$ARGV[0].txt";
|
|
159 while (defined($line = <SOURCE>)) {
|
|
160 $count2++;
|
|
161 chomp $line;
|
|
162 $full_line[$count2]=$line;
|
|
163 @field= split(/\t/, $line);
|
|
164 $hash[$count2]= $field[1];
|
|
165 $chrom[$count2]= $field[0];
|
|
166 $cis[$count2]= $field[4];
|
|
167 #$libraries[$count2]=$field[11];
|
|
168 }
|
|
169 close SOURCE;
|
|
170 open OUT, "> CIS/cis$ARGV[0].txt";
|
|
171 # select peaks and export them with complex logic scheme.
|
|
172 # update Sept22 seperate this into 4 cases.
|
|
173
|
|
174 foreach $n (1..$count2) {
|
|
175 $prev = $n-1;
|
|
176 $this = $n;
|
|
177 $next = $n+1;
|
|
178
|
|
179 # outcome 1 no other cises are relevant i.e. nothing within + or - cis size.
|
|
180 if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and (($hash[$this]-$hash[$next] < -$boxSize) or ($chrom[$next] ne $chrom[$this])) and ($cis[$this] > 1)){
|
|
181 output();
|
|
182 }
|
|
183
|
|
184 # outcome 2 no other cis + cis size cis - cis exists and is smaller.
|
|
185 if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($hash[$next]-$hash[$this] > $boxSize) and ($cis[$this] > 1)){
|
|
186 output();
|
|
187 }
|
|
188
|
|
189 # outcome 3 no other cis - cis size + cis exists and is smaller are relevant nothing within + or - cis size.
|
|
190 if ((($hash[$this] - $hash[$prev] > $boxSize) or ($chrom[$this] ne $chrom[$prev])) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$this] >= $cis[$next]) and ($cis[$this] > 1)){
|
|
191 output();
|
|
192 }
|
|
193
|
|
194 # outcome 4 other cises on both sides.
|
|
195 if (($hash[$this] - $hash[$prev] >0) and ($hash[$this] - $hash[$prev] <= $boxSize) and ($hash[$next]-$hash[$this] >= 0) and ($hash[$next]-$hash[$this] <= $boxSize) and ($cis[$prev] < $cis[$this]) and ($cis[$this] >= $cis[$next]) and $cis[$this] > 1 ) {
|
|
196 output();
|
|
197 }
|
|
198 #print "$chrom[$n]\t$hash[$n]\t$cis[$n]\n";
|
|
199 }
|
|
200
|
|
201 }
|
|
202 close OUT;
|
|
203
|
|
204 #Subroutines
|
|
205 # provide data in the form of poisson('BoxSize','Total','count')
|
|
206 sub outputpart1 {
|
|
207 $pval = poissonBonf($boxSize,$count,$lib[$this]);
|
|
208 $pvalTotal = poissonBonf($boxSize,$count,$cis[$this]);
|
|
209 $pvalRegion = poissonBonf($boxSize,$count,$pos[$this]);
|
|
210 if ($pval < $CIS_total_pvalue) {
|
|
211 $end = $hash[$n]+$boxSize;
|
|
212 print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$lib[$this]\t$boxSize\t$pval\t$cis[$this]\t$pvalTotal\t$pos[$this]\t$pvalRegion\t$lib_name[$n]\t$strand[$this]\n";
|
|
213 }
|
|
214 }
|
|
215
|
|
216 sub output {
|
|
217 print OUT "$full_line[$n]\n";
|
|
218 #if ($pval < 0.05) {
|
|
219 #$end = $hash[$n]+$boxSize;
|
|
220 #print OUT "$chrom[$n]\t$hash[$n]\t$end\t$chrom[$n]_$hash[$n]\t$cis[$this]\t$boxSize\t$pval\t$bonf\t$libraries[$n]\n";
|
|
221 #}
|
|
222 }
|
|
223 sub poisson {
|
|
224 return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2])));
|
|
225 }
|
|
226
|
|
227 sub poissonBonf {
|
|
228 return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0])))/(fac($_[2])))*$_[1]);
|
|
229 }
|
|
230
|
|
231 sub fac {
|
|
232 my ($n) = @_;
|
|
233
|
|
234 if ($n < 2) {
|
|
235 return $n;
|
|
236 }
|
|
237 else {
|
|
238 return $n * fac($n-1);
|
|
239 }
|
|
240 }
|
|
241
|
|
242
|