0
|
1 #!/usr/bin/perl
|
|
2 # prepare microarray data for use in EMBER w/ discretized behaviors
|
|
3
|
|
4 use Getopt::Long;
|
|
5 $|++;
|
|
6
|
|
7 #
|
|
8 # command line arguments
|
|
9 #
|
|
10 $options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS>
|
|
11 -i microarray data (required)
|
|
12 format: <id> <value 1> <value 2> ... <value N>
|
|
13 this requires a title line (starting with #):
|
|
14 #ID <condition 1><replicate 1> <condition 2><replicate 2> ...
|
|
15 where <condition> should match the conditions in input -c,
|
|
16 and the <condition><replicate> pair tells what the data in the column is from
|
|
17 -c list of comparisons (required)
|
|
18 format: <condition1> <condition2>
|
|
19 -a annotation file (required)
|
|
20 format: <id> <gene name> <chr> <start> <end> <strand>
|
|
21 -o output (required)
|
|
22 -p percentile threshold (number between 0 and 1, default 0.63)
|
|
23 this value is used as follows: all data are concatenated into one list,
|
|
24 and the pth percentile of that list is taken as the threshold.
|
|
25 Then, a probe is removed if its value is less than the threshold in ALL conditions.
|
|
26 p=1.0 means all probes are retained, p=0.0 means none are
|
|
27 -l log-transform the data (y or n, default n)
|
|
28 -v verbose (print progress to stdout: y or n, default y)
|
|
29 \n";
|
|
30
|
|
31 $in = "";
|
|
32 $c = "";
|
|
33 $a = "";
|
|
34 $o = "";
|
|
35 $p = 0.63;
|
|
36 $l = "n";
|
|
37 $v = "y";
|
|
38
|
|
39 GetOptions('i=s' => \$in,
|
|
40 'c=s' => \$c,
|
|
41 'a=s' => \$a,
|
|
42 'o=s' => \$o,
|
|
43 'p=f' => \$p,
|
|
44 'l=s' => \$l,
|
|
45 'v=s' => \$v,
|
|
46 ) || die "\n$options\n";
|
|
47
|
|
48 if( $in eq "" ){
|
|
49 print "\nError: set a value for -i\n\n$options";
|
|
50 exit;
|
|
51 }
|
|
52 if( $c eq "" ){
|
|
53 print "\nError: set a value for -c\n\n$options";
|
|
54 exit;
|
|
55 }
|
|
56 if( $a eq "" ){
|
|
57 print "\nError: set a value for -a\n\n$options";
|
|
58 exit;
|
|
59 }
|
|
60 if( $o eq "" ){
|
|
61 print "\nError: set a value for -o\n\n$options";
|
|
62 exit;
|
|
63 }
|
|
64
|
|
65 if( $p < 0 || $p > 1 ){
|
|
66 print "\nError: choose -p to be between 0 and 1\n\n$options";
|
|
67 exit;
|
|
68 }
|
|
69 if( $l ne "y" && $l ne "n" ){
|
|
70 print "\nError: choose -l to be y or n\n\n$options";
|
|
71 exit;
|
|
72 }
|
|
73 if( $v ne "y" && $v ne "n" ){
|
|
74 print "\nError: choose -v to be y or n\n\n$options";
|
|
75 exit;
|
|
76 }
|
|
77
|
|
78 #################################################
|
|
79 # SECTION 1 - read in file names and conditions #
|
|
80 #################################################
|
|
81
|
|
82 #
|
|
83 # read in data list (title line of $in) and list of comparisons
|
|
84 # arrays: @names, @comps
|
|
85 #
|
|
86 open(IN,"$in") || die "Error: can't open file $in\n";
|
|
87 $line = <IN>;
|
|
88 close(IN);
|
|
89 @names = ();
|
|
90 @parts = split(' ',$line);
|
|
91 if( substr($parts[0], 0, 1) ne "#" ){
|
|
92 print "Error: no title line detected in file $in\n";
|
|
93 exit;
|
|
94 }
|
|
95 for($i=1; $i<= $#parts; $i++){
|
|
96 if( $parts[$i] !~ /#/ ){
|
|
97 printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]);
|
|
98 exit;
|
|
99 }
|
|
100 ($condition, $replicate) = split('#', $parts[$i]);
|
|
101 push(@names, [($condition, $replicate)] );
|
|
102 }
|
|
103 #while($line = <IN>){
|
|
104 # chomp($line);
|
|
105 # push(@files, $line);
|
|
106 # @parts = split('/',$line);
|
|
107 # ($base, $suff) = split('\.txt', $parts[$#parts]);
|
|
108 # push(@names, $base);
|
|
109 #}
|
|
110 #close(IN);
|
|
111 open(IN,"$c") || die "Error: can't open file $c\n";
|
|
112 @comps = ();
|
|
113 while($line = <IN>){
|
|
114 chomp($line);
|
|
115 @parts = split(' ',$line);
|
|
116 if( $#parts != 1 ){
|
|
117 print "Error: file $c does not have 2 columns\n";
|
|
118 exit;
|
|
119 }
|
|
120 $info1 = {};
|
|
121 $info1->{name} = $parts[0];
|
|
122 $info1->{namei} = 0;
|
|
123 $info2 = {};
|
|
124 $info2->{name} = $parts[1];
|
|
125 $info2->{namei} = 0;
|
|
126 push(@comps, [($info1, $info2)] );
|
|
127 }
|
|
128 close(IN);
|
|
129
|
|
130 #
|
|
131 # count how many conditions we have, and how many replicates of each
|
|
132 # arrays: @conds
|
|
133 #
|
|
134 @conds = ();
|
|
135 for($i=0; $i<= $#comps; $i++){
|
|
136 for($j=0; $j< 2; $j++){
|
|
137 $ok = 1;
|
|
138 $k=-1;
|
|
139 $end = $#conds;
|
|
140 while($k< $end && $ok == 1){
|
|
141 $k++;
|
|
142 if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){
|
|
143 $ok = 0;
|
|
144 }
|
|
145 }
|
|
146 if( $ok == 1 ){
|
|
147 @empty1 = ();
|
|
148 $info = {};
|
|
149 $info->{name} = $comps[$i][$j]->{name};
|
|
150 $info->{reps} = [@empty1];
|
|
151 $info->{nreps} = 0;
|
|
152 push(@conds, $info);
|
|
153 $comps[$i][$j]->{namei} = $#conds;
|
|
154 }
|
|
155 else{
|
|
156 $comps[$i][$j]->{namei} = $k;
|
|
157 }
|
|
158 }
|
|
159 }
|
|
160 for($i=0; $i<= $#conds; $i++){
|
|
161 for($j=0; $j<= $#names; $j++){
|
|
162 if( $names[$j][0] eq $conds[$i]->{name} ){
|
|
163 push(@{$conds[$i]->{reps}}, $j);
|
|
164 $conds[$i]->{nreps}++;
|
|
165 }
|
|
166 }
|
|
167 }
|
|
168
|
|
169 #
|
|
170 # print summary to stdout
|
|
171 #
|
|
172 if( $v eq "y" ){
|
|
173 printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1);
|
|
174 for($i=0; $i<= $#conds; $i++){
|
|
175 printf(" %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps});
|
|
176 }
|
|
177 printf("\nMaking %i comparisons:\n", $#comps+1);
|
|
178 for($i=0; $i<= $#comps; $i++){
|
|
179 printf(" %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name});
|
|
180 }
|
|
181 }
|
|
182
|
|
183 ###########################################################
|
|
184 # SECTION 2 - read in annotation file and microarray data #
|
|
185 ###########################################################
|
|
186
|
|
187 if( $v eq "y" ){
|
|
188 print "\nReading in annotation file and data\n";
|
|
189 }
|
|
190
|
|
191 #
|
|
192 # read in probe annotations
|
|
193 # arrays: @probes
|
|
194 #
|
|
195 open(IN,"$a") || die "Error: can't open file $a\n";
|
|
196 @vals = ();
|
|
197 while($line = <IN>){
|
|
198 chomp($line);
|
|
199 @parts = split(' ',$line);
|
|
200 if( $#parts != 5 ){
|
|
201 print "Error: file $a does not have 6 columns\n";
|
|
202 exit;
|
|
203 }
|
|
204 @empty1 = ();
|
|
205 @empty2 = ();
|
|
206 $info = {};
|
|
207 $info->{id} = $parts[0];
|
|
208 $info->{name} = $parts[1];
|
|
209 $info->{chr} = $parts[2];
|
|
210 $info->{start} = $parts[3];
|
|
211 $info->{end} = $parts[4];
|
|
212 $info->{strand} = $parts[5];
|
|
213 $info->{vals} = [@empty1];
|
|
214 $info->{classes} = [@empty2];
|
|
215 $info->{use} = 0;
|
|
216 push(@vals, $info);
|
|
217 }
|
|
218 close(IN);
|
|
219 @probes = sort{ $a->{id} cmp $b->{id} } @vals;
|
|
220
|
|
221 #
|
|
222 # read in data, sort by ids, and add to @probes
|
|
223 # record average for each file
|
|
224 # arrays: @averages
|
|
225 #
|
|
226 @averages = ();
|
|
227 open(IN,"$in");
|
|
228 $line = <IN>;
|
|
229 @data = ();
|
|
230 while( $line = <IN> ){
|
|
231 chomp($line);
|
|
232 @parts = split(' ',$line);
|
|
233 push(@data, [@parts] );
|
|
234 }
|
|
235 @sdata = sort{ $a->[0] cmp $b->[0] } @data;
|
|
236 close(IN);
|
|
237
|
|
238 $j=0;
|
|
239 $k=0;
|
|
240 $endd = $#sdata;
|
|
241 $endp = $#probes;
|
|
242 while( $j <= $endd && $k<= $endp ){
|
|
243 if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;}
|
|
244 elsif( $probes[$k]->{id} lt $sdata[$j][0] ){
|
|
245 $probes[$k]->{use} = 0;
|
|
246 $k++;
|
|
247 }
|
|
248 elsif( $probes[$k]->{id} eq $sdata[$j][0] ){
|
|
249 for( $i=0; $i<= $#names; $i++){
|
|
250 if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );}
|
|
251 if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );}
|
|
252 }
|
|
253 $probes[$k]->{use} = 1;
|
|
254 $j++;
|
|
255 $k++;
|
|
256 }
|
|
257 else{
|
|
258 printf("Unknown line comparison\n");
|
|
259 exit;
|
|
260 }
|
|
261 }
|
|
262 for($i=0; $i<= $#names; $i++){
|
|
263 $avg=0;
|
|
264 #
|
|
265 # compute average
|
|
266 #
|
|
267 $avg = 0;
|
|
268 $tot = 0;
|
|
269 for($k=0; $k<= $#probes; $k++){
|
|
270 if( $probes[$k]->{use} == 1 ){
|
|
271 $avg+= $probes[$k]->{vals}[$i];
|
|
272 $tot++;
|
|
273 }
|
|
274 }
|
|
275 $avg/= $tot;
|
|
276 push(@averages, $avg);
|
|
277 }
|
|
278
|
|
279 if( $v eq "y" ){
|
|
280 printf(" using %i probes\n", $tot);
|
|
281 }
|
|
282
|
|
283 ##################################################################
|
|
284 # SECTION 3 - determine threshold and label probes to be removed #
|
|
285 ##################################################################
|
|
286
|
|
287 if( $v eq "y" ){
|
|
288 printf("\nDetermining threshold\n");
|
|
289 }
|
|
290 #
|
|
291 # find threshold
|
|
292 # variables: $thresh
|
|
293 #
|
|
294 @allvals = ();
|
|
295 for($i=0; $i<= $#probes; $i++){
|
|
296 for($j=0; $j<= $#names; $j++){
|
|
297 if( $probes[$i]->{use} == 1 ){
|
|
298 push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]);
|
|
299 }
|
|
300 }
|
|
301 }
|
|
302
|
|
303 @sallvals = sort{ $a <=> $b } @allvals;
|
|
304
|
|
305 $end = (1-$p)*$#sallvals;
|
|
306 $i = sprintf("%i", $end);
|
|
307 $thresh = $sallvals[$i];
|
|
308
|
|
309 #
|
|
310 # label probes as on or off
|
|
311 #
|
|
312 $totuse = 0;
|
|
313 for($i=0; $i<= $#probes; $i++){
|
|
314 $j=0;
|
|
315 $probes[$i]->{use} = 0;
|
|
316 while($j<= $#names && $probes[$i]->{use} == 0 ){
|
|
317 if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){
|
|
318 $probes[$i]->{use} = 1;
|
|
319 $totuse++;
|
|
320 }
|
|
321 $j++;
|
|
322 }
|
|
323 }
|
|
324 if( $v eq "y" ){
|
|
325 printf(" %i probes retained for analysis\n", $totuse);
|
|
326 }
|
|
327
|
|
328 #############################################
|
|
329 # SECTION 4 - make discrete classifications #
|
|
330 #############################################
|
|
331
|
|
332 if( $v eq "y" ){
|
|
333 printf("\nClassifying probes\n");
|
|
334 }
|
|
335 #
|
|
336 # for each probe,
|
|
337 # for each comparison,
|
|
338 # compute avg and stdev of first and second conditions
|
|
339 # apply rules for classification
|
|
340 #
|
|
341 for($i=0; $i<= $#probes; $i++){
|
|
342 for($j=0; $j<= $#comps; $j++){
|
|
343 $avg = (0,0);
|
|
344 $sd = (0,0);
|
|
345 for($k=0; $k< 2; $k++){
|
|
346 $condi = $comps[$j][$k]->{namei};
|
|
347 $avg[$k] = 0;
|
|
348 $sd[$k] = 0;
|
|
349 for($l=0; $l< $conds[$condi]->{nreps}; $l++){
|
|
350 $filei = $conds[$condi]->{reps}[$l];
|
|
351 $avg[$k]+= $probes[$i]->{vals}[$filei];
|
|
352 }
|
|
353 $avg[$k] /= $conds[$condi]->{nreps};
|
|
354 for($l=0; $l< $conds[$condi]->{nreps}; $l++){
|
|
355 $filei = $conds[$condi]->{reps}[$l];
|
|
356 $sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2;
|
|
357 }
|
|
358 $sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps}));
|
|
359 }
|
|
360
|
|
361 $s = $sd[0] + $sd[1];
|
|
362 # ++ category
|
|
363 if( $avg[1]-$avg[0] > 3*$s ){
|
|
364 push( @{$probes[$i]->{classes}}, 2);
|
|
365 }
|
|
366 # + category
|
|
367 elsif( $avg[1]-$avg[0] > $s ){
|
|
368 push(@{$probes[$i]->{classes}}, 1);
|
|
369 }
|
|
370 # -- category
|
|
371 elsif( $avg[0]-$avg[1] > 3*$s ){
|
|
372 push(@{$probes[$i]->{classes}}, -2);
|
|
373 }
|
|
374 # 0 category
|
|
375 elsif( $avg[0]-$avg[1] > $s ){
|
|
376 push(@{$probes[$i]->{classes}}, -1);
|
|
377 }
|
|
378 else{
|
|
379 push(@{$probes[$i]->{classes}}, 0);
|
|
380 }
|
|
381 }
|
|
382 }
|
|
383
|
|
384 ############################
|
|
385 # SECTION 5 - print result #
|
|
386 ############################
|
|
387
|
|
388 if( $v eq "y" ){
|
|
389 printf("\nPrinting result\n\n");
|
|
390 }
|
|
391 open(OUT,">$o");
|
|
392 $delim = " ";
|
|
393 for($i=0; $i<= $#probes; $i++){
|
|
394 printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} );
|
|
395 if( $probes[$i]->{use} == 1 ){
|
|
396 for($j=0; $j<= $#comps; $j++){
|
|
397 printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]);
|
|
398 }
|
|
399 print OUT "\n";
|
|
400 }
|
|
401 else{
|
|
402 print OUT "$delim-1\n";
|
|
403 }
|
|
404 }
|
|
405 close(OUT);
|
|
406
|
|
407
|
|
408
|
|
409
|
|
410
|
|
411
|
|
412
|
|
413
|