0
|
1 #!/usr/bin/perl
|
|
2 # combine microarray pre-preprocessed data and binding data
|
|
3
|
|
4 use Getopt::Long;
|
|
5 $|++;
|
|
6
|
|
7 #
|
|
8 # command line arguments
|
|
9 #
|
|
10 $options = "Usage: ./Integrate_Data.pl <OPTIONS>
|
|
11 -b binding data (required)
|
|
12 -bf binding data format (default 1)
|
|
13 1 - .bed format (<chr> <pkposn> <other information>)
|
|
14 first two columns are required,
|
|
15 <other information> may contain peak ids, enrichments, etc, and is optional but will be retained
|
|
16 2 - Dinner group \"annotation\" format
|
|
17 -e pre-processed expression data (required)
|
|
18 -o output (required)
|
|
19 -d distance (positive number, in kbp, default 100)
|
|
20 -dt distance type (1 = to gene boundaries, 2 = to TSS, default 1)
|
|
21 -v verbose (print progress to stdout: y or n, default y)
|
|
22 \n";
|
|
23
|
|
24 $b = "";
|
|
25 $bf = 1;
|
|
26 $e = "";
|
|
27 $o = "";
|
|
28 $d = 100;
|
|
29 $dt = "1";
|
|
30 $v = "y";
|
|
31
|
|
32 GetOptions('b=s' => \$b,
|
|
33 'bf=i' => \$bf,
|
|
34 'e=s' => \$e,
|
|
35 'o=s' => \$o,
|
|
36 'd=f' => \$d,
|
|
37 'dt=i' => \$dt,
|
|
38 'v=s' => \$v,
|
|
39 ) || die "\n$options\n";
|
|
40
|
|
41 if( $b eq "" ){
|
|
42 print "\nError: set a value for -b\n\n$options";
|
|
43 exit;
|
|
44 }
|
|
45 if( $bf != 1 && $bf != 2 ){
|
|
46 print "\nError: set -bf to be 1 or 2\n\n$options";
|
|
47 exit;
|
|
48 }
|
|
49 if( $e eq "" ){
|
|
50 print "\nError: set a value for -e\n\n$options";
|
|
51 exit;
|
|
52 }
|
|
53 if( $o eq "" ){
|
|
54 print "\nError: set a value for -o\n\n$options";
|
|
55 exit;
|
|
56 }
|
|
57
|
|
58 if( $d < 0 ){
|
|
59 print "\nError: choose -d to be positive\n\n$options";
|
|
60 exit;
|
|
61 }
|
|
62 if( $dt != 1 && $dt != 2 ){
|
|
63 print "\nError: choose -dt to be 1 or 2\n\n$options";
|
|
64 exit;
|
|
65 }
|
|
66 if( $v ne "y" && $v ne "n" ){
|
|
67 print "\nError: choose -v to be y or n\n\n$options";
|
|
68 exit;
|
|
69 }
|
|
70
|
|
71 #
|
|
72 # read in binding data
|
|
73 #
|
|
74 if( $v eq "y" ){
|
|
75 print "\nReading in binding and expression data\n";
|
|
76 }
|
|
77 @peaks = ();
|
|
78 open(IN,"$b") || die "Error: can't open file $b\n";
|
|
79 while($line = <IN>){
|
|
80 if( $line !~ /#/ ){
|
|
81 chomp($line);
|
|
82 @parts = split(' ',$line);
|
|
83 $info = {};
|
|
84 if( $bf == 1 ){
|
|
85 $info->{chr} = $parts[0];
|
|
86 $info->{pk} = $parts[1];
|
|
87 $val = "";
|
|
88 if( $#parts >= 2 ){
|
|
89 $val = $parts[2];
|
|
90 for($i=3; $i<= $#parts; $i++){
|
|
91 $val = sprintf("%s %s", $val, $parts[$i]);
|
|
92 }
|
|
93 }
|
|
94 $info->{rest} = $val;
|
|
95 push(@peaks, $info);
|
|
96 }
|
|
97 if( $bf == 2 ){
|
|
98 if( $#parts == 19 ){
|
|
99 $info->{chr} = $parts[2];
|
|
100 $info->{pk} = $parts[3];
|
|
101 $info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]);
|
|
102 push(@peaks, $info);
|
|
103 }
|
|
104 }
|
|
105 }
|
|
106 }
|
|
107 close(IN);
|
|
108
|
|
109 #
|
|
110 # read in expression data
|
|
111 #
|
|
112 @expr = ();
|
|
113 open(IN,"$e") || die "Error: can't open file $e\n";
|
|
114 while($line = <IN>){
|
|
115 chomp($line);
|
|
116 @parts = split(' ',$line);
|
|
117 $info = {};
|
|
118 $info->{chr} = $parts[2];
|
|
119 $info->{start} = $parts[3];
|
|
120 $info->{end} = $parts[4];
|
|
121 $info->{strand} = $parts[5];
|
|
122 $info->{line} = $line;
|
|
123 push(@expr, $info);
|
|
124 }
|
|
125 close(IN);
|
|
126
|
|
127 if( $v eq "y" ){
|
|
128 printf(" %i peaks, %i genes\n", $#peaks+1, $#expr+1);
|
|
129 printf("\nSorting peaks and genes, indexing chromosomes\n");
|
|
130 }
|
|
131
|
|
132 #
|
|
133 # sort binding and expression data by chr to save some time
|
|
134 #
|
|
135 @speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks;
|
|
136 @sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr;
|
|
137
|
|
138 #
|
|
139 # index starting elements for each chromosome in sexpr
|
|
140 #
|
|
141 @chrinds = ();
|
|
142 $chr = $sexpr[0]->{chr};
|
|
143 $info = {};
|
|
144 $info->{chr} = $chr;
|
|
145 $info->{start} = 0;
|
|
146 for($i=0; $i<= $#sexpr; $i++){
|
|
147 if( $sexpr[$i]->{chr} ne $chr ){
|
|
148 $info->{end} = $i-1;
|
|
149 push(@chrinds, $info);
|
|
150 $info = {};
|
|
151 $chr = $sexpr[$i]->{chr};
|
|
152 $info->{chr} = $chr;
|
|
153 $info->{start} = $i;
|
|
154 }
|
|
155 }
|
|
156 $info->{end} = $i-1;
|
|
157 push(@chrinds, $info);
|
|
158
|
|
159 #
|
|
160 # compile potential targets for each peak and print out
|
|
161 #
|
|
162 if( $v eq "y" ){
|
|
163 printf("\nFinding potential targets\n");
|
|
164 }
|
|
165 open(OUT,">$o");
|
|
166 $count=0;
|
|
167 for($i=0; $i<= $#speaks; $i++){
|
|
168 if( $v eq "y" ){
|
|
169 printf("\r peak %i of %i", $i+1, $#speaks+1);
|
|
170 }
|
|
171 # print out peak info
|
|
172 printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} );
|
|
173
|
|
174 # find the right chromosome and search through all the genes on the same chromosome for possible matches
|
|
175 $j=0;
|
|
176 $end=$#chrinds;
|
|
177 while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;}
|
|
178 if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){
|
|
179 $start = $chrinds[$j]->{start};
|
|
180 $end = $chrinds[$j]->{end};
|
|
181 for($k=$start; $k<= $end; $k++){
|
|
182 if( &distance( $i, $k ) <= $d ){
|
|
183 $count++;
|
|
184 printf OUT ("GENE: %s\n", $sexpr[$k]->{line});
|
|
185 }
|
|
186 }
|
|
187 }
|
|
188 }
|
|
189 close(OUT);
|
|
190 if( $v eq "y" ){
|
|
191 print "\n\nGot $count potential targets\n\n";
|
|
192 }
|
|
193
|
|
194 exit;
|
|
195
|
|
196
|
|
197 # compute distance between peak $_[0] and gene $_[1]
|
|
198 sub distance{
|
|
199 my $dist;
|
|
200 my $tss;
|
|
201 my $ii;
|
|
202 my $jj;
|
|
203 $ii = $_[0];
|
|
204 $jj = $_[1];
|
|
205
|
|
206 if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){
|
|
207 print "\nError: somehow computing distance between peak/gene on different chromosomes\n";
|
|
208 print " peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n";
|
|
209 exit;
|
|
210 }
|
|
211 if( $dt == 1 ){
|
|
212 # distance to closest side of gene
|
|
213 if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){
|
|
214 $dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk};
|
|
215 }
|
|
216 elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){
|
|
217 $dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end};
|
|
218 }
|
|
219 else{
|
|
220 $dist = 0;
|
|
221 }
|
|
222 }
|
|
223 elsif( $dt == 2 ){
|
|
224 # distance to TSS
|
|
225 if( $sexpr[$jj]->{strand} == 1 ){
|
|
226 $tss = $sexpr[$jj]->{start};
|
|
227 }
|
|
228 elsif( $sexpr[$jj]->{strand} == -1 ){
|
|
229 $tss = $sexpr[$jj]->{end};
|
|
230 }
|
|
231 else{
|
|
232 print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n";
|
|
233 exit;
|
|
234 }
|
|
235 $dist = abs($speaks[$ii]->{pk} - $tss);
|
|
236 }
|
|
237 else{
|
|
238 print "\nError: wrong value for dt ($dt)\n";
|
|
239 exit;
|
|
240 }
|
|
241 if( $dist < 0 ){
|
|
242 print "\nError: negative distance\n";
|
|
243 exit;
|
|
244 }
|
|
245 # convert to kbp
|
|
246 $dist/=1000;
|
|
247 return $dist;
|
|
248 }
|
|
249
|
|
250
|
|
251
|
|
252
|