Mercurial > repos > mkhan1980 > ctcf_analysis
comparison check2.pl @ 2:2cceb9398d33 draft
Uploaded
author | mkhan1980 |
---|---|
date | Mon, 04 Mar 2013 06:38:21 -0500 |
parents | |
children | f1696b304b8d |
comparison
equal
deleted
inserted
replaced
1:e3c4e5ff7f74 | 2:2cceb9398d33 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 | |
4 # Define variables | |
5 my @temp=(); | |
6 my $result1; | |
7 my $result2; | |
8 my $result3; | |
9 my $result4; | |
10 my $result5; | |
11 my $result6; | |
12 my $resultfinal; | |
13 my $count; | |
14 my $coun; | |
15 my $cou; | |
16 my @digit=(); | |
17 my $digit; | |
18 my $marks; | |
19 my $log; | |
20 my $coll; | |
21 my @scorearray=(); | |
22 my $scorearray; | |
23 my $percent; | |
24 my $kount; | |
25 my @result=(); | |
26 my $result; | |
27 my %final=(); | |
28 my $final; | |
29 my @c=(); | |
30 my @matrix1; | |
31 my @matrix2; | |
32 my $matrix1; | |
33 my $matrix2; | |
34 $coll=0; | |
35 my $count2; | |
36 my $var; | |
37 my $entry1; | |
38 my $entry2; | |
39 my $reventry1; | |
40 my $reventry2; | |
41 my $revvar; | |
42 my @revmatrix1; | |
43 my $revkount; | |
44 my $revcoun; | |
45 my $revcount2; | |
46 my @revtemp; | |
47 my $revcoll; | |
48 my @revdigit; | |
49 my $revdigit; | |
50 my $revmarks; | |
51 my $revresult1; | |
52 my $revresult2; | |
53 my $revresult3; | |
54 my $revresult4; | |
55 my $revresult5; | |
56 my $revresult6; | |
57 my $revresultfinal; | |
58 my @revscorearray; | |
59 my $revscorearray; | |
60 | |
61 | |
62 | |
63 #define variables from configuration file | |
64 open (IN, "<$ARGV[0]"); | |
65 open (IN2, "<$ARGV[1]"); | |
66 open (OUT, ">$ARGV[2]"); | |
67 | |
68 #assign arrays to variables from configuration file | |
69 my @array5=<IN>; | |
70 | |
71 my @coordinates=<IN2>; | |
72 | |
73 | |
74 #split the chromosome number and starting position from coordinates file into 2 separate strings | |
75 | |
76 foreach my $coordinates(@coordinates) { | |
77 | |
78 chomp($coordinates); | |
79 | |
80 my @coordinates2=split(/\s+/, $coordinates); | |
81 | |
82 my $coordinates2; | |
83 | |
84 $entry1=$coordinates2[0]; | |
85 $entry2=$coordinates2[1]; | |
86 | |
87 } | |
88 | |
89 | |
90 print OUT "CTCF Site", "\t", "Chromosome no.", "\t", "Start", "\t", "End", "\t", "Score", "\t", "Strand", "\n"; | |
91 | |
92 chomp (@array5); | |
93 | |
94 my $digits=join("", @array5); | |
95 | |
96 my @yeslap = $digits =~ /(?=(\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w))/g; | |
97 | |
98 | |
99 $var = "@yeslap\n"; | |
100 | |
101 | |
102 @matrix1=qw/87.25 291.25 76.25 459.25 167.25 145.25 414.25 187.25 281.25 49.25 449.25 134.25 56.25 800.25 21.25 36.25 8.25 903.25 0.25 2.25 744.25 13.25 65.25 91.25 40.25 528.25 334.25 11.25 107.25 433.25 48.25 324.25 851.25 11.25 32.25 18.25 5.25 0.25 903.25 3.25 333.25 3.25 566.25 9.25 54.25 12.25 504.25 341.25 12.25 0.25 890.25 8.25 56.25 8.25 775.25 71.25 104.25 733.25 5.25 67.25 372.25 13.25 507.25 17.25 82.25 482.25 307.25 37.25 117.25 322.25 73.25 396.25 402.25 181.25 266.25 59.25/; | |
103 | |
104 $kount=0; | |
105 $coun=0; | |
106 | |
107 # Define the pattern for CTCF. Because of pseudocount, a wildcard is allowed at | |
108 #each position. | |
109 my $pattern = "[ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC]"; | |
110 | |
111 # Compare the pattern with the 19 bp nucleotide segments. | |
112 | |
113 while($var =~ m/$pattern/gi) | |
114 { | |
115 $coun++; | |
116 $count2++; | |
117 my $endpos = pos $var; | |
118 | |
119 # Get the starting and ending positions of the matched pattern. | |
120 | |
121 my $startpos=($endpos+1)-19; | |
122 my $lastpos=$endpos; | |
123 | |
124 | |
125 my $consensus = substr($var, ($startpos-1), 19); | |
126 | |
127 push(@temp, $consensus, $startpos, $lastpos); | |
128 | |
129 $coll=0; | |
130 $kount++; | |
131 | |
132 | |
133 # Split the matched pattern into 19 single bases. | |
134 | |
135 @digit = split(//, $consensus); | |
136 | |
137 | |
138 # For each base, if the base is A, calculate the weight score of A according to | |
139 #its frequency in the CTCF Position Frequency Matrix. | |
140 foreach $digit (@digit) | |
141 { | |
142 | |
143 | |
144 if($digit =~ m/A/) | |
145 { | |
146 | |
147 | |
148 my $ref = \@matrix1; | |
149 $marks = @{$ref}[$coll]; | |
150 | |
151 | |
152 $result1 = sqrt(914); | |
153 | |
154 $result2 = $result1*0.3; | |
155 | |
156 $result3 = $result2+$marks; | |
157 $result4 = sqrt(914); | |
158 $result5 = $result4+914; | |
159 $result6 = 0.3; | |
160 | |
161 $resultfinal = log($result3/$result5/$result6)/log(2); | |
162 | |
163 | |
164 | |
165 | |
166 | |
167 push(@scorearray, $resultfinal); | |
168 | |
169 | |
170 | |
171 } | |
172 | |
173 if($digit =~ m/C/) | |
174 | |
175 { | |
176 | |
177 my $ref = \@matrix1; | |
178 | |
179 $marks = @{$ref}[$coll + 1]; | |
180 | |
181 $result1 = sqrt(914); | |
182 $result2 = $result1*0.2; | |
183 $result3 = $result2+$marks; | |
184 $result4 = sqrt(914); | |
185 $result5 = $result4+914; | |
186 $result6 = 0.2; | |
187 | |
188 $resultfinal = log($result3/$result5/$result6)/log(2); | |
189 | |
190 push(@scorearray, $resultfinal); | |
191 | |
192 | |
193 } | |
194 | |
195 | |
196 if($digit =~ m/G/) | |
197 | |
198 { | |
199 | |
200 my $ref = \@matrix1; | |
201 | |
202 $marks = @{$ref}[$coll+2]; | |
203 | |
204 $result1 = sqrt(914); | |
205 $result2 = $result1*0.2; | |
206 $result3 = $result2+$marks; | |
207 $result4 = sqrt(914); | |
208 $result5 = $result4+914; | |
209 $result6 = 0.2; | |
210 | |
211 $resultfinal = log($result3/$result5/$result6)/log(2); | |
212 | |
213 | |
214 push(@scorearray, $resultfinal); | |
215 | |
216 } | |
217 | |
218 if($digit =~ m/T/) | |
219 | |
220 { | |
221 | |
222 my $ref = \@matrix1; | |
223 | |
224 $marks = @{$ref}[$coll+3]; | |
225 | |
226 $result1 = sqrt(914); | |
227 $result2 = $result1*0.3; | |
228 $result3 = $result2+$marks; | |
229 $result4 = sqrt(914); | |
230 $result5 = $result4+914; | |
231 $result6 = 0.3; | |
232 | |
233 $resultfinal = log($result3/$result5/$result6)/log(2); | |
234 | |
235 | |
236 push(@scorearray, $resultfinal); | |
237 | |
238 } | |
239 | |
240 $coll=$coll + 4; | |
241 | |
242 } | |
243 | |
244 @digit=(); | |
245 my $tem=0; | |
246 | |
247 | |
248 foreach $scorearray(@scorearray) | |
249 { | |
250 | |
251 $tem = $tem + $scorearray; | |
252 | |
253 | |
254 } | |
255 | |
256 @scorearray = (); | |
257 | |
258 | |
259 my $fpercent = $tem; | |
260 | |
261 | |
262 if ($fpercent >= 18) { | |
263 | |
264 print OUT $consensus, "\t", $entry1, "\t", $entry2 - 18 - $count2, "\t", $entry2 - $count2, "\t", "$fpercent", "\t", "-", "\n"; | |
265 | |
266 | |
267 } | |
268 | |
269 } | |
270 | |
271 | |
272 close ( OUT ); | |
273 close ( IN ); | |
274 close ( IN2 ); |