2
|
1 #!/usr/bin/perl -w
|
|
2 use strict;
|
|
3
|
|
4 # Define variables
|
|
5 my @temp=();
|
|
6 my $result1;
|
|
7 my $result2;
|
|
8 my $result3;
|
|
9 my $result4;
|
|
10 my $result5;
|
|
11 my $result6;
|
|
12 my $resultfinal;
|
|
13 my $count;
|
|
14 my $coun;
|
|
15 my $cou;
|
|
16 my @digit=();
|
|
17 my $digit;
|
|
18 my $marks;
|
|
19 my $log;
|
|
20 my $coll;
|
|
21 my @scorearray=();
|
|
22 my $scorearray;
|
|
23 my $percent;
|
|
24 my $kount;
|
|
25 my @result=();
|
|
26 my $result;
|
|
27 my %final=();
|
|
28 my $final;
|
|
29 my @c=();
|
|
30 my @matrix1;
|
|
31 my @matrix2;
|
|
32 my $matrix1;
|
|
33 my $matrix2;
|
|
34 $coll=0;
|
|
35 my $count2;
|
|
36 my $var;
|
|
37 my $entry1;
|
|
38 my $entry2;
|
|
39 my $reventry1;
|
|
40 my $reventry2;
|
|
41 my $revvar;
|
|
42 my @revmatrix1;
|
|
43 my $revkount;
|
|
44 my $revcoun;
|
|
45 my $revcount2;
|
|
46 my @revtemp;
|
|
47 my $revcoll;
|
|
48 my @revdigit;
|
|
49 my $revdigit;
|
|
50 my $revmarks;
|
|
51 my $revresult1;
|
|
52 my $revresult2;
|
|
53 my $revresult3;
|
|
54 my $revresult4;
|
|
55 my $revresult5;
|
|
56 my $revresult6;
|
|
57 my $revresultfinal;
|
|
58 my @revscorearray;
|
|
59 my $revscorearray;
|
|
60
|
|
61
|
|
62
|
|
63 #define variables from configuration file
|
|
64 open (IN, "<$ARGV[0]");
|
|
65 open (IN2, "<$ARGV[1]");
|
|
66 open (OUT, ">$ARGV[2]");
|
|
67
|
|
68 #assign arrays to variables from configuration file
|
|
69 my @array5=<IN>;
|
|
70
|
|
71 my @coordinates=<IN2>;
|
|
72
|
|
73
|
|
74 #split the chromosome number and starting position from coordinates file into 2 separate strings
|
|
75
|
|
76 foreach my $coordinates(@coordinates) {
|
|
77
|
|
78 chomp($coordinates);
|
|
79
|
|
80 my @coordinates2=split(/\s+/, $coordinates);
|
|
81
|
|
82 my $coordinates2;
|
|
83
|
|
84 $entry1=$coordinates2[0];
|
|
85 $entry2=$coordinates2[1];
|
|
86
|
|
87 }
|
|
88
|
|
89
|
|
90 print OUT "CTCF Site", "\t", "Chromosome no.", "\t", "Start", "\t", "End", "\t", "Score", "\t", "Strand", "\n";
|
|
91
|
|
92 chomp (@array5);
|
|
93
|
|
94 my $digits=join("", @array5);
|
|
95
|
|
96 my @yeslap = $digits =~ /(?=(\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w))/g;
|
|
97
|
|
98
|
|
99 $var = "@yeslap\n";
|
|
100
|
|
101
|
|
102 @matrix1=qw/87.25 291.25 76.25 459.25 167.25 145.25 414.25 187.25 281.25 49.25 449.25 134.25 56.25 800.25 21.25 36.25 8.25 903.25 0.25 2.25 744.25 13.25 65.25 91.25 40.25 528.25 334.25 11.25 107.25 433.25 48.25 324.25 851.25 11.25 32.25 18.25 5.25 0.25 903.25 3.25 333.25 3.25 566.25 9.25 54.25 12.25 504.25 341.25 12.25 0.25 890.25 8.25 56.25 8.25 775.25 71.25 104.25 733.25 5.25 67.25 372.25 13.25 507.25 17.25 82.25 482.25 307.25 37.25 117.25 322.25 73.25 396.25 402.25 181.25 266.25 59.25/;
|
|
103
|
|
104 $kount=0;
|
|
105 $coun=0;
|
|
106
|
|
107 # Define the pattern for CTCF. Because of pseudocount, a wildcard is allowed at
|
|
108 #each position.
|
|
109 my $pattern = "[ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC]";
|
|
110
|
|
111 # Compare the pattern with the 19 bp nucleotide segments.
|
|
112
|
|
113 while($var =~ m/$pattern/gi)
|
|
114 {
|
|
115 $coun++;
|
|
116 $count2++;
|
|
117 my $endpos = pos $var;
|
|
118
|
|
119 # Get the starting and ending positions of the matched pattern.
|
|
120
|
|
121 my $startpos=($endpos+1)-19;
|
|
122 my $lastpos=$endpos;
|
|
123
|
|
124
|
|
125 my $consensus = substr($var, ($startpos-1), 19);
|
|
126
|
|
127 push(@temp, $consensus, $startpos, $lastpos);
|
|
128
|
|
129 $coll=0;
|
|
130 $kount++;
|
|
131
|
|
132
|
|
133 # Split the matched pattern into 19 single bases.
|
|
134
|
|
135 @digit = split(//, $consensus);
|
|
136
|
|
137
|
|
138 # For each base, if the base is A, calculate the weight score of A according to
|
|
139 #its frequency in the CTCF Position Frequency Matrix.
|
|
140 foreach $digit (@digit)
|
|
141 {
|
|
142
|
|
143
|
|
144 if($digit =~ m/A/)
|
|
145 {
|
|
146
|
|
147
|
|
148 my $ref = \@matrix1;
|
|
149 $marks = @{$ref}[$coll];
|
|
150
|
|
151
|
|
152 $result1 = sqrt(914);
|
|
153
|
|
154 $result2 = $result1*0.3;
|
|
155
|
|
156 $result3 = $result2+$marks;
|
|
157 $result4 = sqrt(914);
|
|
158 $result5 = $result4+914;
|
|
159 $result6 = 0.3;
|
|
160
|
|
161 $resultfinal = log($result3/$result5/$result6)/log(2);
|
|
162
|
|
163
|
|
164
|
|
165
|
|
166
|
|
167 push(@scorearray, $resultfinal);
|
|
168
|
|
169
|
|
170
|
|
171 }
|
|
172
|
|
173 if($digit =~ m/C/)
|
|
174
|
|
175 {
|
|
176
|
|
177 my $ref = \@matrix1;
|
|
178
|
|
179 $marks = @{$ref}[$coll + 1];
|
|
180
|
|
181 $result1 = sqrt(914);
|
|
182 $result2 = $result1*0.2;
|
|
183 $result3 = $result2+$marks;
|
|
184 $result4 = sqrt(914);
|
|
185 $result5 = $result4+914;
|
|
186 $result6 = 0.2;
|
|
187
|
|
188 $resultfinal = log($result3/$result5/$result6)/log(2);
|
|
189
|
|
190 push(@scorearray, $resultfinal);
|
|
191
|
|
192
|
|
193 }
|
|
194
|
|
195
|
|
196 if($digit =~ m/G/)
|
|
197
|
|
198 {
|
|
199
|
|
200 my $ref = \@matrix1;
|
|
201
|
|
202 $marks = @{$ref}[$coll+2];
|
|
203
|
|
204 $result1 = sqrt(914);
|
|
205 $result2 = $result1*0.2;
|
|
206 $result3 = $result2+$marks;
|
|
207 $result4 = sqrt(914);
|
|
208 $result5 = $result4+914;
|
|
209 $result6 = 0.2;
|
|
210
|
|
211 $resultfinal = log($result3/$result5/$result6)/log(2);
|
|
212
|
|
213
|
|
214 push(@scorearray, $resultfinal);
|
|
215
|
|
216 }
|
|
217
|
|
218 if($digit =~ m/T/)
|
|
219
|
|
220 {
|
|
221
|
|
222 my $ref = \@matrix1;
|
|
223
|
|
224 $marks = @{$ref}[$coll+3];
|
|
225
|
|
226 $result1 = sqrt(914);
|
|
227 $result2 = $result1*0.3;
|
|
228 $result3 = $result2+$marks;
|
|
229 $result4 = sqrt(914);
|
|
230 $result5 = $result4+914;
|
|
231 $result6 = 0.3;
|
|
232
|
|
233 $resultfinal = log($result3/$result5/$result6)/log(2);
|
|
234
|
|
235
|
|
236 push(@scorearray, $resultfinal);
|
|
237
|
|
238 }
|
|
239
|
|
240 $coll=$coll + 4;
|
|
241
|
|
242 }
|
|
243
|
|
244 @digit=();
|
|
245 my $tem=0;
|
|
246
|
|
247
|
|
248 foreach $scorearray(@scorearray)
|
|
249 {
|
|
250
|
|
251 $tem = $tem + $scorearray;
|
|
252
|
|
253
|
|
254 }
|
|
255
|
|
256 @scorearray = ();
|
|
257
|
|
258
|
|
259 my $fpercent = $tem;
|
|
260
|
|
261
|
|
262 if ($fpercent >= 18) {
|
|
263
|
|
264 print OUT $consensus, "\t", $entry1, "\t", $entry2 - 18 - $count2, "\t", $entry2 - $count2, "\t", "$fpercent", "\t", "-", "\n";
|
|
265
|
|
266
|
|
267 }
|
|
268
|
|
269 }
|
|
270
|
|
271
|
|
272 close ( OUT );
|
|
273 close ( IN );
|
|
274 close ( IN2 );
|