annotate GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl @ 0:003f802d4c7d

Uploaded
author mmaiensc
date Wed, 29 Feb 2012 15:03:33 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
1 #!/usr/bin/perl
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
2 # prepare microarray data for use in EMBER w/ discretized behaviors
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
3
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
4 use Getopt::Long;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
5 $|++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
6
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
7 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
8 # command line arguments
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
9 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
10 $options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS>
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
11 -i microarray data (required)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
12 format: <id> <value 1> <value 2> ... <value N>
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
13 this requires a title line (starting with #):
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
14 #ID <condition 1><replicate 1> <condition 2><replicate 2> ...
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
15 where <condition> should match the conditions in input -c,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
16 and the <condition><replicate> pair tells what the data in the column is from
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
17 -c list of comparisons (required)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
18 format: <condition1> <condition2>
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
19 -a annotation file (required)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
20 format: <id> <gene name> <chr> <start> <end> <strand>
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
21 -o output (required)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
22 -p percentile threshold (number between 0 and 1, default 0.63)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
23 this value is used as follows: all data are concatenated into one list,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
24 and the pth percentile of that list is taken as the threshold.
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
25 Then, a probe is removed if its value is less than the threshold in ALL conditions.
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
26 p=1.0 means all probes are retained, p=0.0 means none are
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
27 -l log-transform the data (y or n, default n)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
28 -v verbose (print progress to stdout: y or n, default y)
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
29 \n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
30
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
31 $in = "";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
32 $c = "";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
33 $a = "";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
34 $o = "";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
35 $p = 0.63;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
36 $l = "n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
37 $v = "y";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
38
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
39 GetOptions('i=s' => \$in,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
40 'c=s' => \$c,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
41 'a=s' => \$a,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
42 'o=s' => \$o,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
43 'p=f' => \$p,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
44 'l=s' => \$l,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
45 'v=s' => \$v,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
46 ) || die "\n$options\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
47
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
48 if( $in eq "" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
49 print "\nError: set a value for -i\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
50 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
51 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
52 if( $c eq "" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
53 print "\nError: set a value for -c\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
54 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
55 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
56 if( $a eq "" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
57 print "\nError: set a value for -a\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
58 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
59 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
60 if( $o eq "" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
61 print "\nError: set a value for -o\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
62 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
63 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
64
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
65 if( $p < 0 || $p > 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
66 print "\nError: choose -p to be between 0 and 1\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
67 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
68 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
69 if( $l ne "y" && $l ne "n" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
70 print "\nError: choose -l to be y or n\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
71 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
72 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
73 if( $v ne "y" && $v ne "n" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
74 print "\nError: choose -v to be y or n\n\n$options";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
75 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
76 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
77
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
78 #################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
79 # SECTION 1 - read in file names and conditions #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
80 #################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
81
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
82 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
83 # read in data list (title line of $in) and list of comparisons
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
84 # arrays: @names, @comps
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
85 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
86 open(IN,"$in") || die "Error: can't open file $in\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
87 $line = <IN>;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
88 close(IN);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
89 @names = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
90 @parts = split(' ',$line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
91 if( substr($parts[0], 0, 1) ne "#" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
92 print "Error: no title line detected in file $in\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
93 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
94 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
95 for($i=1; $i<= $#parts; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
96 if( $parts[$i] !~ /#/ ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
97 printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
98 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
99 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
100 ($condition, $replicate) = split('#', $parts[$i]);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
101 push(@names, [($condition, $replicate)] );
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
102 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
103 #while($line = <IN>){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
104 # chomp($line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
105 # push(@files, $line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
106 # @parts = split('/',$line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
107 # ($base, $suff) = split('\.txt', $parts[$#parts]);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
108 # push(@names, $base);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
109 #}
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
110 #close(IN);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
111 open(IN,"$c") || die "Error: can't open file $c\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
112 @comps = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
113 while($line = <IN>){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
114 chomp($line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
115 @parts = split(' ',$line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
116 if( $#parts != 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
117 print "Error: file $c does not have 2 columns\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
118 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
119 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
120 $info1 = {};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
121 $info1->{name} = $parts[0];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
122 $info1->{namei} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
123 $info2 = {};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
124 $info2->{name} = $parts[1];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
125 $info2->{namei} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
126 push(@comps, [($info1, $info2)] );
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
127 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
128 close(IN);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
129
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
130 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
131 # count how many conditions we have, and how many replicates of each
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
132 # arrays: @conds
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
133 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
134 @conds = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
135 for($i=0; $i<= $#comps; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
136 for($j=0; $j< 2; $j++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
137 $ok = 1;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
138 $k=-1;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
139 $end = $#conds;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
140 while($k< $end && $ok == 1){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
141 $k++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
142 if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
143 $ok = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
144 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
145 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
146 if( $ok == 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
147 @empty1 = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
148 $info = {};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
149 $info->{name} = $comps[$i][$j]->{name};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
150 $info->{reps} = [@empty1];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
151 $info->{nreps} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
152 push(@conds, $info);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
153 $comps[$i][$j]->{namei} = $#conds;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
154 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
155 else{
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
156 $comps[$i][$j]->{namei} = $k;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
157 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
158 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
159 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
160 for($i=0; $i<= $#conds; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
161 for($j=0; $j<= $#names; $j++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
162 if( $names[$j][0] eq $conds[$i]->{name} ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
163 push(@{$conds[$i]->{reps}}, $j);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
164 $conds[$i]->{nreps}++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
165 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
166 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
167 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
168
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
169 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
170 # print summary to stdout
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
171 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
172 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
173 printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
174 for($i=0; $i<= $#conds; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
175 printf(" %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps});
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
176 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
177 printf("\nMaking %i comparisons:\n", $#comps+1);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
178 for($i=0; $i<= $#comps; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
179 printf(" %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name});
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
180 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
181 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
182
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
183 ###########################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
184 # SECTION 2 - read in annotation file and microarray data #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
185 ###########################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
186
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
187 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
188 print "\nReading in annotation file and data\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
189 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
190
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
191 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
192 # read in probe annotations
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
193 # arrays: @probes
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
194 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
195 open(IN,"$a") || die "Error: can't open file $a\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
196 @vals = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
197 while($line = <IN>){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
198 chomp($line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
199 @parts = split(' ',$line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
200 if( $#parts != 5 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
201 print "Error: file $a does not have 6 columns\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
202 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
203 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
204 @empty1 = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
205 @empty2 = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
206 $info = {};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
207 $info->{id} = $parts[0];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
208 $info->{name} = $parts[1];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
209 $info->{chr} = $parts[2];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
210 $info->{start} = $parts[3];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
211 $info->{end} = $parts[4];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
212 $info->{strand} = $parts[5];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
213 $info->{vals} = [@empty1];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
214 $info->{classes} = [@empty2];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
215 $info->{use} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
216 push(@vals, $info);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
217 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
218 close(IN);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
219 @probes = sort{ $a->{id} cmp $b->{id} } @vals;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
220
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
221 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
222 # read in data, sort by ids, and add to @probes
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
223 # record average for each file
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
224 # arrays: @averages
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
225 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
226 @averages = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
227 open(IN,"$in");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
228 $line = <IN>;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
229 @data = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
230 while( $line = <IN> ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
231 chomp($line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
232 @parts = split(' ',$line);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
233 push(@data, [@parts] );
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
234 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
235 @sdata = sort{ $a->[0] cmp $b->[0] } @data;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
236 close(IN);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
237
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
238 $j=0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
239 $k=0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
240 $endd = $#sdata;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
241 $endp = $#probes;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
242 while( $j <= $endd && $k<= $endp ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
243 if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;}
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
244 elsif( $probes[$k]->{id} lt $sdata[$j][0] ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
245 $probes[$k]->{use} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
246 $k++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
247 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
248 elsif( $probes[$k]->{id} eq $sdata[$j][0] ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
249 for( $i=0; $i<= $#names; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
250 if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );}
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
251 if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );}
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
252 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
253 $probes[$k]->{use} = 1;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
254 $j++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
255 $k++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
256 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
257 else{
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
258 printf("Unknown line comparison\n");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
259 exit;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
260 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
261 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
262 for($i=0; $i<= $#names; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
263 $avg=0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
264 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
265 # compute average
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
266 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
267 $avg = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
268 $tot = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
269 for($k=0; $k<= $#probes; $k++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
270 if( $probes[$k]->{use} == 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
271 $avg+= $probes[$k]->{vals}[$i];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
272 $tot++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
273 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
274 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
275 $avg/= $tot;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
276 push(@averages, $avg);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
277 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
278
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
279 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
280 printf(" using %i probes\n", $tot);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
281 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
282
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
283 ##################################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
284 # SECTION 3 - determine threshold and label probes to be removed #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
285 ##################################################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
286
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
287 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
288 printf("\nDetermining threshold\n");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
289 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
290 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
291 # find threshold
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
292 # variables: $thresh
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
293 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
294 @allvals = ();
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
295 for($i=0; $i<= $#probes; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
296 for($j=0; $j<= $#names; $j++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
297 if( $probes[$i]->{use} == 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
298 push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
299 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
300 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
301 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
302
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
303 @sallvals = sort{ $a <=> $b } @allvals;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
304
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
305 $end = (1-$p)*$#sallvals;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
306 $i = sprintf("%i", $end);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
307 $thresh = $sallvals[$i];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
308
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
309 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
310 # label probes as on or off
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
311 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
312 $totuse = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
313 for($i=0; $i<= $#probes; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
314 $j=0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
315 $probes[$i]->{use} = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
316 while($j<= $#names && $probes[$i]->{use} == 0 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
317 if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
318 $probes[$i]->{use} = 1;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
319 $totuse++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
320 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
321 $j++;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
322 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
323 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
324 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
325 printf(" %i probes retained for analysis\n", $totuse);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
326 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
327
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
328 #############################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
329 # SECTION 4 - make discrete classifications #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
330 #############################################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
331
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
332 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
333 printf("\nClassifying probes\n");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
334 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
335 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
336 # for each probe,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
337 # for each comparison,
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
338 # compute avg and stdev of first and second conditions
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
339 # apply rules for classification
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
340 #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
341 for($i=0; $i<= $#probes; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
342 for($j=0; $j<= $#comps; $j++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
343 $avg = (0,0);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
344 $sd = (0,0);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
345 for($k=0; $k< 2; $k++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
346 $condi = $comps[$j][$k]->{namei};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
347 $avg[$k] = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
348 $sd[$k] = 0;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
349 for($l=0; $l< $conds[$condi]->{nreps}; $l++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
350 $filei = $conds[$condi]->{reps}[$l];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
351 $avg[$k]+= $probes[$i]->{vals}[$filei];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
352 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
353 $avg[$k] /= $conds[$condi]->{nreps};
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
354 for($l=0; $l< $conds[$condi]->{nreps}; $l++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
355 $filei = $conds[$condi]->{reps}[$l];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
356 $sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2;
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
357 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
358 $sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps}));
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
359 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
360
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
361 $s = $sd[0] + $sd[1];
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
362 # ++ category
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
363 if( $avg[1]-$avg[0] > 3*$s ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
364 push( @{$probes[$i]->{classes}}, 2);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
365 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
366 # + category
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
367 elsif( $avg[1]-$avg[0] > $s ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
368 push(@{$probes[$i]->{classes}}, 1);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
369 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
370 # -- category
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
371 elsif( $avg[0]-$avg[1] > 3*$s ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
372 push(@{$probes[$i]->{classes}}, -2);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
373 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
374 # 0 category
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
375 elsif( $avg[0]-$avg[1] > $s ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
376 push(@{$probes[$i]->{classes}}, -1);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
377 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
378 else{
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
379 push(@{$probes[$i]->{classes}}, 0);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
380 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
381 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
382 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
383
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
384 ############################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
385 # SECTION 5 - print result #
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
386 ############################
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
387
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
388 if( $v eq "y" ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
389 printf("\nPrinting result\n\n");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
390 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
391 open(OUT,">$o");
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
392 $delim = " ";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
393 for($i=0; $i<= $#probes; $i++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
394 printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} );
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
395 if( $probes[$i]->{use} == 1 ){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
396 for($j=0; $j<= $#comps; $j++){
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
397 printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
398 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
399 print OUT "\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
400 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
401 else{
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
402 print OUT "$delim-1\n";
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
403 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
404 }
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
405 close(OUT);
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
406
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
407
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
408
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
409
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
410
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
411
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
412
003f802d4c7d Uploaded
mmaiensc
parents:
diff changeset
413