0
|
1 #This is the TAp2.pl script
|
|
2 #The goal of this is to count the number of CIS present in a NR set
|
|
3 # one file is required nr.txt sorted in chromosome order!
|
|
4 # Aaron Lyman Sarver Oct 21, 2010
|
|
5 # This script takes over following the bowtie mapping and generates CIS.
|
|
6 # Update NOV15,2010 to pass library information through
|
|
7 #$Fully modified december 1 2010i
|
|
8 #Fully modifed May 1 2010
|
|
9 use File::Copy;
|
|
10 require 'config.pl';
|
|
11 require 'lib/feature_finder_methods.pl';
|
|
12 my $path = $0;
|
|
13 $path =~ s/\/\w*\.pl$//g;
|
|
14 require "$path/project_man.pl";
|
|
15 unless(-d "CIS") {
|
|
16 mkdir("CIS");
|
|
17 }
|
|
18
|
|
19 open CIS, "> results/summary_CIS_$proj.txt";
|
|
20 close CIS;
|
|
21
|
|
22 #generate required tables
|
|
23 $sth = $dbh->prepare("drop table if exists cis_result_final_$proj");
|
|
24 $sth->execute;
|
|
25 $sth = $dbh->prepare("create table cis_result_final_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
|
|
26 $sth->execute;
|
|
27
|
|
28 $sth = $dbh->prepare("drop table if exists sort_$proj");
|
|
29 $sth->execute;
|
|
30 $sth = $dbh->prepare("create table sort_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
|
|
31 $sth->execute;
|
|
32 $sth = $dbh->prepare("drop table if exists sort2_$proj");
|
|
33 $sth->execute;
|
|
34 $sth = $dbh->prepare("create table sort2_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
|
|
35 $sth->execute;
|
|
36
|
|
37 #load concatamer information from file
|
|
38 $sth = $dbh->prepare("drop table if exists chromo_$proj");
|
|
39 $sth->execute;
|
|
40 $sth = $dbh->prepare("create table chromo_$proj (chromo varchar(20) ) ");
|
|
41 $sth->execute;
|
|
42 $sth = $dbh->prepare("load DATA local INFILE 'data/chromo.tab' INTO TABLE chromo_$proj ");
|
|
43 $sth->execute;
|
|
44 $sth = $dbh->prepare("select distinct chromo from chromo_$proj");
|
|
45 $sth->execute;
|
|
46 print "10.3 - Loaded concatamer chromo\n";
|
|
47 #generate donor exclusion portion of the SQL query
|
|
48 @exclude;
|
|
49 $query_chromo= '';
|
|
50 while ((@row) = $sth->fetchrow_array) {
|
|
51 push (@exclude,$row[0]);
|
|
52 print "\n$row[0]";
|
|
53 }
|
|
54 print @exclude;
|
|
55 foreach $desc(@exclude) {
|
|
56 $query_chromo = $query_chromo."and chromo != '".$desc."' ";
|
|
57 }
|
|
58 print $query_chromo;
|
|
59
|
|
60 #10.4 load library metadata
|
|
61 $sth = $dbh->prepare("drop table if exists metadata_$proj");
|
|
62 $sth->execute;
|
|
63 $sth = $dbh->prepare("create table metadata_$proj (library varchar(20), descriptor varchar(20), type varchar (20) ) ");
|
|
64 $sth->execute;
|
|
65 $sth = $dbh->prepare("load DATA local INFILE 'data/metadata.tab' INTO TABLE metadata_$proj ");
|
|
66 $sth->execute;
|
|
67 $sth = $dbh->prepare("select distinct descriptor from metadata_$proj where type = 'cis'");
|
|
68 $sth->execute;
|
|
69 print "10.4 - Loaded metadata\n";
|
|
70
|
|
71 #10.2 load the feature hash
|
|
72 print "10.2 - Loading features...\n";
|
|
73 my ($feature_file, $window, $fchrom, $fstart, $fend, $debug) = ($annotation_file, 20000, 0, 1, 2, 1);
|
|
74 my $feature_hash_ref = &feature_index(\$feature_file, \$window, \$fchrom, \$fstart, \$fend, \$debug);
|
|
75 print "10.2 - Features loaded.\n";
|
|
76
|
|
77 #generate metadata based library portion of the SQL query
|
|
78 @descriptors;
|
|
79 $query_definition = '';
|
|
80 while ((@row) = $sth->fetchrow_array) {
|
|
81 push (@descriptors,$row[0]);
|
|
82 print "\n$row[0]";
|
|
83 }
|
|
84 print @descriptors;
|
|
85 foreach $desc(@descriptors) {
|
|
86 $query_definition = '';
|
|
87 $query = "select distinct library from metadata_$proj where descriptor = '$desc'";
|
|
88 #print "\n$query\n";
|
|
89 $sth = $dbh->prepare("$query");
|
|
90 $sth->execute;
|
|
91
|
|
92 while ((@row) = $sth->fetchrow_array) {
|
|
93 $query_definition = $query_definition."or A.library = '$row[0]' ";
|
|
94 #print "$row[0]";
|
|
95 }
|
|
96 print "\n$desc\n";
|
|
97 #print "$query_definition\n";
|
|
98 #setup the looping featured
|
|
99 @options = ('');
|
|
100 foreach $library(@options) {
|
|
101 @percent =("$library_percent");
|
|
102 foreach $percent(@percent) {
|
|
103
|
|
104 #11.1 query to select regions of interest
|
|
105 $sth = $dbh->prepare("select chromo, start, A.library,count, count/total,orient from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > $percent $query_chromo and (1=2 $query_definition)
|
|
106 order by chromo,start+0 ;");
|
|
107 $sth->execute;
|
|
108
|
|
109 unless(-d "results") {
|
|
110 mkdir("results");
|
|
111 }
|
|
112 unless(-d "results/$desc") {
|
|
113 mkdir("results/$desc");
|
|
114 }
|
|
115
|
|
116 open OUT, "> CIS/nr.txt";
|
|
117 $name = "$desc-nr-$proj-$percent";
|
|
118 open OUT2, "> results/$desc/$name.txt";
|
|
119 open OUT3, "> results/$desc/$name.BED";
|
|
120 print OUT3 "track type ='BED' name='$name' description='$name Insertions' visibility=2 itemRgb='On'";
|
|
121 while ((@row) = $sth->fetchrow_array) {
|
|
122 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
|
|
123 $start = $row[1]-50;
|
|
124 $stop = $row[1]+50;
|
|
125 print OUT2 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
|
|
126 print OUT3 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[5]\t$start\t$stop\t159,0,197";
|
|
127 }
|
|
128 close OUT;
|
|
129
|
|
130 #11.2 Anotate the inserts from the .txt file
|
|
131 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("results/$desc/$name.txt", "results/$desc/$name.ann.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0);
|
|
132 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
|
|
133 unlink("results/$desc/$name.txt");
|
|
134 #generate a list of gense with high density inserts
|
|
135 #if (($percent == '0.01') and ($library eq 'library')) {
|
|
136 #system("perl lib/convert_nr_2_annot.pl results/$desc/$name.txt results/$desc/$name.ingenuity.txt");
|
|
137 #}
|
|
138
|
|
139 $count = 0;
|
|
140 open SOURCE, "< CIS/nr.txt";
|
|
141 while (defined($line = <SOURCE>)) {
|
|
142 $count++;
|
|
143 }
|
|
144 @libSizes =();
|
|
145 open CIS, ">> results/summary_CIS_$proj.txt";
|
|
146 print "\n\n11.3 - $name Total number of NR regions $count\n";
|
|
147 print CIS "\n\n$name\nExcluded chromosomes $query_chromo\nTotal number of NR regions $count\n";
|
|
148 print CIS "CIS analyses calculate window sizes based on NR count\n";
|
|
149
|
|
150 #11.4 Ideal window size calculation
|
|
151 $n = 10000;
|
|
152 $f = 3;
|
|
153 while ($n < 300000) {
|
|
154 $stat =poissonBonf($n,$count,$f);
|
|
155 if ($stat > 0.05) {
|
|
156 $nf=$n-1000;
|
|
157 $stat2=poissonBonf($nf,$count,$f);
|
|
158 if ($stat2 < 0.05) {
|
|
159 print "$stat2\t$nf\t$f\n";
|
|
160 print CIS "$stat2\t$nf\t$f\n";
|
|
161 unshift(@libSizes, $nf);
|
|
162 }
|
|
163 $f=$f+1;
|
|
164 }
|
|
165 $n = $n+1000;
|
|
166 }
|
|
167 $nf='301000';
|
|
168 unshift(@libSizes, $nf);
|
|
169 print @libSizes;
|
|
170 print CIS @libsizes;
|
|
171 close CIS;
|
|
172 if ($count < 2000) {
|
|
173 @libSizes =('301000','200000','100000','50000','25000','12500');
|
|
174 }
|
|
175 if ($count > 200000) {
|
|
176 @libSizes =('301000','200000','100000','50000','25000','12500');
|
|
177 }
|
|
178
|
|
179 #11.5
|
|
180 foreach $lib(@libSizes) {
|
|
181
|
|
182 #11.5 call the cis_finder.pl script to calculate the CIS.
|
|
183 system("perl lib/cis_finder.pl $lib");
|
|
184 print "\n$lib\n";
|
|
185
|
|
186
|
|
187 #11.6 call the feature_finder.pl script to annotat the CIS regions
|
|
188 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("CIS/cis$lib.txt", "CIS/final_cis_named_$lib.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0);
|
|
189 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
|
|
190 }
|
|
191
|
|
192 #load the results into the database.and resolve overlap
|
|
193 $sth = $dbh->prepare("delete from sort_$proj");
|
|
194 $sth->execute;
|
|
195 $sth = $dbh->prepare("delete from sort2_$proj");
|
|
196 $sth->execute;
|
|
197
|
|
198 #11.7
|
|
199 foreach $lib(@libSizes) {
|
|
200 $sth = $dbh->prepare("load DATA local INFILE 'CIS/final_cis_named_$lib.txt' INTO TABLE sort2_$proj");
|
|
201 $sth->execute;
|
|
202 resolve_cis();
|
|
203 }
|
|
204
|
|
205 #MOve and rename CIS to cis_result_final table
|
|
206 $filename = $desc."_".$percent.$library;
|
|
207 $query2 = "delete from cis_result_final_$proj where type= '$filename'";
|
|
208 print "\n$query2\n";
|
|
209 $sth = $dbh->prepare("$query2");
|
|
210 $sth->execute;
|
|
211 $sth = $dbh->prepare("insert into cis_result_final_$proj select * from sort_$proj");
|
|
212 $sth->execute;
|
|
213 $query3 = "update cis_result_final_$proj set type = '$filename' where type is NULL";
|
|
214 print "$query3";
|
|
215 $sth = $dbh->prepare("$query3");
|
|
216 $sth->execute;
|
|
217
|
|
218 #Export CIS list to the results subdirectory
|
|
219 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;";
|
|
220 $sth = $dbh->prepare("$query4");
|
|
221 $sth->execute;
|
|
222 open CIS, "> results/$desc/cis_$name.xls";
|
|
223 print CIS "$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\tnumber of inserts drive transcription on positive strand";
|
|
224 while ((@row) = $sth->fetchrow_array) {
|
|
225 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
|
|
226 }
|
|
227 close CIS;
|
|
228
|
|
229
|
|
230 open SOURCE, "< results/$desc/cis_$name.xls";
|
|
231 open CIS, "> results/$desc/plot_$name.wig";
|
|
232 print CIS "track type=wiggle_0 name=\"$proj.$filename\" description=\"$proj.$filename\" visibility=full autoScale=off";
|
|
233 while (defined($line = <SOURCE>)) {
|
|
234 chomp $line;
|
|
235 @field= split(/\t/, $line);
|
|
236 @chromo=split(/:|-/, $field[1]);
|
|
237
|
|
238 if ($field[1] =~ m/chr/) {
|
|
239 if ($field[7] < 1.0e-100) {
|
|
240 $field[7] = 1.0e-100;}
|
|
241
|
|
242 $log= -(log($field[7])/log(10));
|
|
243 print CIS "\n$chromo[0]\t$chromo[1]\t$chromo[2]\t$log";
|
|
244 }
|
|
245 }
|
|
246 close CIS;
|
|
247
|
|
248 #11.11 Generate the cis summary file of top 25 library CIS
|
|
249 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;";
|
|
250 print "$query4";
|
|
251 $sth = $dbh->prepare("$query4");
|
|
252 $sth->execute;
|
|
253 open CIS, ">> results/summary_CIS_$proj.txt";
|
|
254 print CIS "\n\n$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\t+strand\t";
|
|
255 while ((@row) = $sth->fetchrow_array) {
|
|
256 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
|
|
257 }
|
|
258 close CIS;
|
|
259
|
|
260 #generate matrix and carry out association analyses
|
|
261 if (($desc eq 'all') and ($percent == 0.0001) ) {
|
|
262 print "\n\n Starting Association analyses for $name";
|
|
263 mkdir "FISH";
|
|
264 mkdir "results/Assoc";
|
|
265 system ('perl lib/visualize.pl');
|
|
266 copy("FISH/cis4cluster.txt", "results/Assoc/cis4cluster.txt") or die "Unable to copy file: $!\n";
|
|
267 #system ('module load R');
|
|
268 system ('perl lib/matrix2fisher.pl FISH/cis4cluster.txt');
|
|
269 copy("FISH/R_result.txt", "results/Assoc/R_result.txt") or die "Unable to copy file: $!\n";
|
|
270 copy("FISH/Fisher_pre_named.txt", "results/Assoc/Fisher_pre_named.txt") or die "Unable to copy file: $!\n";
|
|
271 system ('perl lib/Fisher_clean.pl');
|
|
272 }
|
|
273
|
|
274 }#close the threshhold loop
|
|
275 }#close the libraries loop
|
|
276 }#close the metadata loop
|
|
277
|
|
278 &set_project_status($proj, 0, 1);
|
|
279
|
|
280 exit(0);
|
|
281
|
|
282 sub resolve_cis {
|
|
283 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue > B.pvalue");
|
|
284 $sth->execute;
|
|
285
|
|
286 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue > B.pvalue");
|
|
287 $sth->execute;
|
|
288 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue < B.pvalue; ");
|
|
289 $sth->execute;
|
|
290 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue <=
|
|
291 B.pvalue; ");
|
|
292 $sth->execute;
|
|
293
|
|
294 $sth = $dbh->prepare("insert into sort_$proj select * from sort2_$proj");
|
|
295 $sth->execute;
|
|
296
|
|
297 $sth = $dbh->prepare("delete from sort2_$proj");
|
|
298 $sth->execute;
|
|
299 }
|
|
300
|
|
301 sub poisson {
|
|
302 return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0
|
|
303 ])))/(fac($_[2])));
|
|
304 }
|
|
305
|
|
306 sub poissonBonf {
|
|
307 return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[
|
|
308 0])))/(fac($_[2])))*$_[1]);
|
|
309 }
|
|
310
|
|
311 sub fac {
|
|
312 my ($n) = @_;
|
|
313
|
|
314 if ($n < 2) {
|
|
315 return $n;
|
|
316 }
|
|
317 else {
|
|
318 return $n * fac($n-1);
|
|
319 }
|
|
320 }
|
|
321
|