Mercurial > repos > mmaiensc > ember
comparison GALAXY_FILES/tools/EMBER/Integrate_Data.pl @ 0:003f802d4c7d
Uploaded
author | mmaiensc |
---|---|
date | Wed, 29 Feb 2012 15:03:33 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:003f802d4c7d |
---|---|
1 #!/usr/bin/perl | |
2 # combine microarray pre-preprocessed data and binding data | |
3 | |
4 use Getopt::Long; | |
5 $|++; | |
6 | |
7 # | |
8 # command line arguments | |
9 # | |
10 $options = "Usage: ./Integrate_Data.pl <OPTIONS> | |
11 -b binding data (required) | |
12 -bf binding data format (default 1) | |
13 1 - .bed format (<chr> <pkposn> <other information>) | |
14 first two columns are required, | |
15 <other information> may contain peak ids, enrichments, etc, and is optional but will be retained | |
16 2 - Dinner group \"annotation\" format | |
17 -e pre-processed expression data (required) | |
18 -o output (required) | |
19 -d distance (positive number, in kbp, default 100) | |
20 -dt distance type (1 = to gene boundaries, 2 = to TSS, default 1) | |
21 -v verbose (print progress to stdout: y or n, default y) | |
22 \n"; | |
23 | |
24 $b = ""; | |
25 $bf = 1; | |
26 $e = ""; | |
27 $o = ""; | |
28 $d = 100; | |
29 $dt = "1"; | |
30 $v = "y"; | |
31 | |
32 GetOptions('b=s' => \$b, | |
33 'bf=i' => \$bf, | |
34 'e=s' => \$e, | |
35 'o=s' => \$o, | |
36 'd=f' => \$d, | |
37 'dt=i' => \$dt, | |
38 'v=s' => \$v, | |
39 ) || die "\n$options\n"; | |
40 | |
41 if( $b eq "" ){ | |
42 print "\nError: set a value for -b\n\n$options"; | |
43 exit; | |
44 } | |
45 if( $bf != 1 && $bf != 2 ){ | |
46 print "\nError: set -bf to be 1 or 2\n\n$options"; | |
47 exit; | |
48 } | |
49 if( $e eq "" ){ | |
50 print "\nError: set a value for -e\n\n$options"; | |
51 exit; | |
52 } | |
53 if( $o eq "" ){ | |
54 print "\nError: set a value for -o\n\n$options"; | |
55 exit; | |
56 } | |
57 | |
58 if( $d < 0 ){ | |
59 print "\nError: choose -d to be positive\n\n$options"; | |
60 exit; | |
61 } | |
62 if( $dt != 1 && $dt != 2 ){ | |
63 print "\nError: choose -dt to be 1 or 2\n\n$options"; | |
64 exit; | |
65 } | |
66 if( $v ne "y" && $v ne "n" ){ | |
67 print "\nError: choose -v to be y or n\n\n$options"; | |
68 exit; | |
69 } | |
70 | |
71 # | |
72 # read in binding data | |
73 # | |
74 if( $v eq "y" ){ | |
75 print "\nReading in binding and expression data\n"; | |
76 } | |
77 @peaks = (); | |
78 open(IN,"$b") || die "Error: can't open file $b\n"; | |
79 while($line = <IN>){ | |
80 if( $line !~ /#/ ){ | |
81 chomp($line); | |
82 @parts = split(' ',$line); | |
83 $info = {}; | |
84 if( $bf == 1 ){ | |
85 $info->{chr} = $parts[0]; | |
86 $info->{pk} = $parts[1]; | |
87 $val = ""; | |
88 if( $#parts >= 2 ){ | |
89 $val = $parts[2]; | |
90 for($i=3; $i<= $#parts; $i++){ | |
91 $val = sprintf("%s %s", $val, $parts[$i]); | |
92 } | |
93 } | |
94 $info->{rest} = $val; | |
95 push(@peaks, $info); | |
96 } | |
97 if( $bf == 2 ){ | |
98 if( $#parts == 19 ){ | |
99 $info->{chr} = $parts[2]; | |
100 $info->{pk} = $parts[3]; | |
101 $info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]); | |
102 push(@peaks, $info); | |
103 } | |
104 } | |
105 } | |
106 } | |
107 close(IN); | |
108 | |
109 # | |
110 # read in expression data | |
111 # | |
112 @expr = (); | |
113 open(IN,"$e") || die "Error: can't open file $e\n"; | |
114 while($line = <IN>){ | |
115 chomp($line); | |
116 @parts = split(' ',$line); | |
117 $info = {}; | |
118 $info->{chr} = $parts[2]; | |
119 $info->{start} = $parts[3]; | |
120 $info->{end} = $parts[4]; | |
121 $info->{strand} = $parts[5]; | |
122 $info->{line} = $line; | |
123 push(@expr, $info); | |
124 } | |
125 close(IN); | |
126 | |
127 if( $v eq "y" ){ | |
128 printf(" %i peaks, %i genes\n", $#peaks+1, $#expr+1); | |
129 printf("\nSorting peaks and genes, indexing chromosomes\n"); | |
130 } | |
131 | |
132 # | |
133 # sort binding and expression data by chr to save some time | |
134 # | |
135 @speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks; | |
136 @sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr; | |
137 | |
138 # | |
139 # index starting elements for each chromosome in sexpr | |
140 # | |
141 @chrinds = (); | |
142 $chr = $sexpr[0]->{chr}; | |
143 $info = {}; | |
144 $info->{chr} = $chr; | |
145 $info->{start} = 0; | |
146 for($i=0; $i<= $#sexpr; $i++){ | |
147 if( $sexpr[$i]->{chr} ne $chr ){ | |
148 $info->{end} = $i-1; | |
149 push(@chrinds, $info); | |
150 $info = {}; | |
151 $chr = $sexpr[$i]->{chr}; | |
152 $info->{chr} = $chr; | |
153 $info->{start} = $i; | |
154 } | |
155 } | |
156 $info->{end} = $i-1; | |
157 push(@chrinds, $info); | |
158 | |
159 # | |
160 # compile potential targets for each peak and print out | |
161 # | |
162 if( $v eq "y" ){ | |
163 printf("\nFinding potential targets\n"); | |
164 } | |
165 open(OUT,">$o"); | |
166 $count=0; | |
167 for($i=0; $i<= $#speaks; $i++){ | |
168 if( $v eq "y" ){ | |
169 printf("\r peak %i of %i", $i+1, $#speaks+1); | |
170 } | |
171 # print out peak info | |
172 printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} ); | |
173 | |
174 # find the right chromosome and search through all the genes on the same chromosome for possible matches | |
175 $j=0; | |
176 $end=$#chrinds; | |
177 while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;} | |
178 if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){ | |
179 $start = $chrinds[$j]->{start}; | |
180 $end = $chrinds[$j]->{end}; | |
181 for($k=$start; $k<= $end; $k++){ | |
182 if( &distance( $i, $k ) <= $d ){ | |
183 $count++; | |
184 printf OUT ("GENE: %s\n", $sexpr[$k]->{line}); | |
185 } | |
186 } | |
187 } | |
188 } | |
189 close(OUT); | |
190 if( $v eq "y" ){ | |
191 print "\n\nGot $count potential targets\n\n"; | |
192 } | |
193 | |
194 exit; | |
195 | |
196 | |
197 # compute distance between peak $_[0] and gene $_[1] | |
198 sub distance{ | |
199 my $dist; | |
200 my $tss; | |
201 my $ii; | |
202 my $jj; | |
203 $ii = $_[0]; | |
204 $jj = $_[1]; | |
205 | |
206 if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){ | |
207 print "\nError: somehow computing distance between peak/gene on different chromosomes\n"; | |
208 print " peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n"; | |
209 exit; | |
210 } | |
211 if( $dt == 1 ){ | |
212 # distance to closest side of gene | |
213 if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){ | |
214 $dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk}; | |
215 } | |
216 elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){ | |
217 $dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end}; | |
218 } | |
219 else{ | |
220 $dist = 0; | |
221 } | |
222 } | |
223 elsif( $dt == 2 ){ | |
224 # distance to TSS | |
225 if( $sexpr[$jj]->{strand} == 1 ){ | |
226 $tss = $sexpr[$jj]->{start}; | |
227 } | |
228 elsif( $sexpr[$jj]->{strand} == -1 ){ | |
229 $tss = $sexpr[$jj]->{end}; | |
230 } | |
231 else{ | |
232 print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n"; | |
233 exit; | |
234 } | |
235 $dist = abs($speaks[$ii]->{pk} - $tss); | |
236 } | |
237 else{ | |
238 print "\nError: wrong value for dt ($dt)\n"; | |
239 exit; | |
240 } | |
241 if( $dist < 0 ){ | |
242 print "\nError: negative distance\n"; | |
243 exit; | |
244 } | |
245 # convert to kbp | |
246 $dist/=1000; | |
247 return $dist; | |
248 } | |
249 | |
250 | |
251 | |
252 |