comparison GALAXY_FILES/tools/EMBER/Integrate_Data.pl @ 0:003f802d4c7d

Uploaded
author mmaiensc
date Wed, 29 Feb 2012 15:03:33 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:003f802d4c7d
1 #!/usr/bin/perl
2 # combine microarray pre-preprocessed data and binding data
3
4 use Getopt::Long;
5 $|++;
6
7 #
8 # command line arguments
9 #
10 $options = "Usage: ./Integrate_Data.pl <OPTIONS>
11 -b binding data (required)
12 -bf binding data format (default 1)
13 1 - .bed format (<chr> <pkposn> <other information>)
14 first two columns are required,
15 <other information> may contain peak ids, enrichments, etc, and is optional but will be retained
16 2 - Dinner group \"annotation\" format
17 -e pre-processed expression data (required)
18 -o output (required)
19 -d distance (positive number, in kbp, default 100)
20 -dt distance type (1 = to gene boundaries, 2 = to TSS, default 1)
21 -v verbose (print progress to stdout: y or n, default y)
22 \n";
23
24 $b = "";
25 $bf = 1;
26 $e = "";
27 $o = "";
28 $d = 100;
29 $dt = "1";
30 $v = "y";
31
32 GetOptions('b=s' => \$b,
33 'bf=i' => \$bf,
34 'e=s' => \$e,
35 'o=s' => \$o,
36 'd=f' => \$d,
37 'dt=i' => \$dt,
38 'v=s' => \$v,
39 ) || die "\n$options\n";
40
41 if( $b eq "" ){
42 print "\nError: set a value for -b\n\n$options";
43 exit;
44 }
45 if( $bf != 1 && $bf != 2 ){
46 print "\nError: set -bf to be 1 or 2\n\n$options";
47 exit;
48 }
49 if( $e eq "" ){
50 print "\nError: set a value for -e\n\n$options";
51 exit;
52 }
53 if( $o eq "" ){
54 print "\nError: set a value for -o\n\n$options";
55 exit;
56 }
57
58 if( $d < 0 ){
59 print "\nError: choose -d to be positive\n\n$options";
60 exit;
61 }
62 if( $dt != 1 && $dt != 2 ){
63 print "\nError: choose -dt to be 1 or 2\n\n$options";
64 exit;
65 }
66 if( $v ne "y" && $v ne "n" ){
67 print "\nError: choose -v to be y or n\n\n$options";
68 exit;
69 }
70
71 #
72 # read in binding data
73 #
74 if( $v eq "y" ){
75 print "\nReading in binding and expression data\n";
76 }
77 @peaks = ();
78 open(IN,"$b") || die "Error: can't open file $b\n";
79 while($line = <IN>){
80 if( $line !~ /#/ ){
81 chomp($line);
82 @parts = split(' ',$line);
83 $info = {};
84 if( $bf == 1 ){
85 $info->{chr} = $parts[0];
86 $info->{pk} = $parts[1];
87 $val = "";
88 if( $#parts >= 2 ){
89 $val = $parts[2];
90 for($i=3; $i<= $#parts; $i++){
91 $val = sprintf("%s %s", $val, $parts[$i]);
92 }
93 }
94 $info->{rest} = $val;
95 push(@peaks, $info);
96 }
97 if( $bf == 2 ){
98 if( $#parts == 19 ){
99 $info->{chr} = $parts[2];
100 $info->{pk} = $parts[3];
101 $info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]);
102 push(@peaks, $info);
103 }
104 }
105 }
106 }
107 close(IN);
108
109 #
110 # read in expression data
111 #
112 @expr = ();
113 open(IN,"$e") || die "Error: can't open file $e\n";
114 while($line = <IN>){
115 chomp($line);
116 @parts = split(' ',$line);
117 $info = {};
118 $info->{chr} = $parts[2];
119 $info->{start} = $parts[3];
120 $info->{end} = $parts[4];
121 $info->{strand} = $parts[5];
122 $info->{line} = $line;
123 push(@expr, $info);
124 }
125 close(IN);
126
127 if( $v eq "y" ){
128 printf(" %i peaks, %i genes\n", $#peaks+1, $#expr+1);
129 printf("\nSorting peaks and genes, indexing chromosomes\n");
130 }
131
132 #
133 # sort binding and expression data by chr to save some time
134 #
135 @speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks;
136 @sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr;
137
138 #
139 # index starting elements for each chromosome in sexpr
140 #
141 @chrinds = ();
142 $chr = $sexpr[0]->{chr};
143 $info = {};
144 $info->{chr} = $chr;
145 $info->{start} = 0;
146 for($i=0; $i<= $#sexpr; $i++){
147 if( $sexpr[$i]->{chr} ne $chr ){
148 $info->{end} = $i-1;
149 push(@chrinds, $info);
150 $info = {};
151 $chr = $sexpr[$i]->{chr};
152 $info->{chr} = $chr;
153 $info->{start} = $i;
154 }
155 }
156 $info->{end} = $i-1;
157 push(@chrinds, $info);
158
159 #
160 # compile potential targets for each peak and print out
161 #
162 if( $v eq "y" ){
163 printf("\nFinding potential targets\n");
164 }
165 open(OUT,">$o");
166 $count=0;
167 for($i=0; $i<= $#speaks; $i++){
168 if( $v eq "y" ){
169 printf("\r peak %i of %i", $i+1, $#speaks+1);
170 }
171 # print out peak info
172 printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} );
173
174 # find the right chromosome and search through all the genes on the same chromosome for possible matches
175 $j=0;
176 $end=$#chrinds;
177 while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;}
178 if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){
179 $start = $chrinds[$j]->{start};
180 $end = $chrinds[$j]->{end};
181 for($k=$start; $k<= $end; $k++){
182 if( &distance( $i, $k ) <= $d ){
183 $count++;
184 printf OUT ("GENE: %s\n", $sexpr[$k]->{line});
185 }
186 }
187 }
188 }
189 close(OUT);
190 if( $v eq "y" ){
191 print "\n\nGot $count potential targets\n\n";
192 }
193
194 exit;
195
196
197 # compute distance between peak $_[0] and gene $_[1]
198 sub distance{
199 my $dist;
200 my $tss;
201 my $ii;
202 my $jj;
203 $ii = $_[0];
204 $jj = $_[1];
205
206 if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){
207 print "\nError: somehow computing distance between peak/gene on different chromosomes\n";
208 print " peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n";
209 exit;
210 }
211 if( $dt == 1 ){
212 # distance to closest side of gene
213 if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){
214 $dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk};
215 }
216 elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){
217 $dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end};
218 }
219 else{
220 $dist = 0;
221 }
222 }
223 elsif( $dt == 2 ){
224 # distance to TSS
225 if( $sexpr[$jj]->{strand} == 1 ){
226 $tss = $sexpr[$jj]->{start};
227 }
228 elsif( $sexpr[$jj]->{strand} == -1 ){
229 $tss = $sexpr[$jj]->{end};
230 }
231 else{
232 print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n";
233 exit;
234 }
235 $dist = abs($speaks[$ii]->{pk} - $tss);
236 }
237 else{
238 print "\nError: wrong value for dt ($dt)\n";
239 exit;
240 }
241 if( $dist < 0 ){
242 print "\nError: negative distance\n";
243 exit;
244 }
245 # convert to kbp
246 $dist/=1000;
247 return $dist;
248 }
249
250
251
252