comparison lib/TAP2.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1437a2df99c0
1 #This is the TAp2.pl script
2 #The goal of this is to count the number of CIS present in a NR set
3 # one file is required nr.txt sorted in chromosome order!
4 # Aaron Lyman Sarver Oct 21, 2010
5 # This script takes over following the bowtie mapping and generates CIS.
6 # Update NOV15,2010 to pass library information through
7 #$Fully modified december 1 2010i
8 #Fully modifed May 1 2010
9 use File::Copy;
10 require 'config.pl';
11 require 'lib/feature_finder_methods.pl';
12 my $path = $0;
13 $path =~ s/\/\w*\.pl$//g;
14 require "$path/project_man.pl";
15 unless(-d "CIS") {
16 mkdir("CIS");
17 }
18
19 open CIS, "> results/summary_CIS_$proj.txt";
20 close CIS;
21
22 #generate required tables
23 $sth = $dbh->prepare("drop table if exists cis_result_final_$proj");
24 $sth->execute;
25 $sth = $dbh->prepare("create table cis_result_final_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
26 $sth->execute;
27
28 $sth = $dbh->prepare("drop table if exists sort_$proj");
29 $sth->execute;
30 $sth = $dbh->prepare("create table sort_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
31 $sth->execute;
32 $sth = $dbh->prepare("drop table if exists sort2_$proj");
33 $sth->execute;
34 $sth = $dbh->prepare("create table sort2_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))");
35 $sth->execute;
36
37 #load concatamer information from file
38 $sth = $dbh->prepare("drop table if exists chromo_$proj");
39 $sth->execute;
40 $sth = $dbh->prepare("create table chromo_$proj (chromo varchar(20) ) ");
41 $sth->execute;
42 $sth = $dbh->prepare("load DATA local INFILE 'data/chromo.tab' INTO TABLE chromo_$proj ");
43 $sth->execute;
44 $sth = $dbh->prepare("select distinct chromo from chromo_$proj");
45 $sth->execute;
46 print "10.3 - Loaded concatamer chromo\n";
47 #generate donor exclusion portion of the SQL query
48 @exclude;
49 $query_chromo= '';
50 while ((@row) = $sth->fetchrow_array) {
51 push (@exclude,$row[0]);
52 print "\n$row[0]";
53 }
54 print @exclude;
55 foreach $desc(@exclude) {
56 $query_chromo = $query_chromo."and chromo != '".$desc."' ";
57 }
58 print $query_chromo;
59
60 #10.4 load library metadata
61 $sth = $dbh->prepare("drop table if exists metadata_$proj");
62 $sth->execute;
63 $sth = $dbh->prepare("create table metadata_$proj (library varchar(20), descriptor varchar(20), type varchar (20) ) ");
64 $sth->execute;
65 $sth = $dbh->prepare("load DATA local INFILE 'data/metadata.tab' INTO TABLE metadata_$proj ");
66 $sth->execute;
67 $sth = $dbh->prepare("select distinct descriptor from metadata_$proj where type = 'cis'");
68 $sth->execute;
69 print "10.4 - Loaded metadata\n";
70
71 #10.2 load the feature hash
72 print "10.2 - Loading features...\n";
73 my ($feature_file, $window, $fchrom, $fstart, $fend, $debug) = ($annotation_file, 20000, 0, 1, 2, 1);
74 my $feature_hash_ref = &feature_index(\$feature_file, \$window, \$fchrom, \$fstart, \$fend, \$debug);
75 print "10.2 - Features loaded.\n";
76
77 #generate metadata based library portion of the SQL query
78 @descriptors;
79 $query_definition = '';
80 while ((@row) = $sth->fetchrow_array) {
81 push (@descriptors,$row[0]);
82 print "\n$row[0]";
83 }
84 print @descriptors;
85 foreach $desc(@descriptors) {
86 $query_definition = '';
87 $query = "select distinct library from metadata_$proj where descriptor = '$desc'";
88 #print "\n$query\n";
89 $sth = $dbh->prepare("$query");
90 $sth->execute;
91
92 while ((@row) = $sth->fetchrow_array) {
93 $query_definition = $query_definition."or A.library = '$row[0]' ";
94 #print "$row[0]";
95 }
96 print "\n$desc\n";
97 #print "$query_definition\n";
98 #setup the looping featured
99 @options = ('');
100 foreach $library(@options) {
101 @percent =("$library_percent");
102 foreach $percent(@percent) {
103
104 #11.1 query to select regions of interest
105 $sth = $dbh->prepare("select chromo, start, A.library,count, count/total,orient from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > $percent $query_chromo and (1=2 $query_definition)
106 order by chromo,start+0 ;");
107 $sth->execute;
108
109 unless(-d "results") {
110 mkdir("results");
111 }
112 unless(-d "results/$desc") {
113 mkdir("results/$desc");
114 }
115
116 open OUT, "> CIS/nr.txt";
117 $name = "$desc-nr-$proj-$percent";
118 open OUT2, "> results/$desc/$name.txt";
119 open OUT3, "> results/$desc/$name.BED";
120 print OUT3 "track type ='BED' name='$name' description='$name Insertions' visibility=2 itemRgb='On'";
121 while ((@row) = $sth->fetchrow_array) {
122 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
123 $start = $row[1]-50;
124 $stop = $row[1]+50;
125 print OUT2 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
126 print OUT3 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[5]\t$start\t$stop\t159,0,197";
127 }
128 close OUT;
129
130 #11.2 Anotate the inserts from the .txt file
131 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("results/$desc/$name.txt", "results/$desc/$name.ann.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0);
132 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
133 unlink("results/$desc/$name.txt");
134 #generate a list of gense with high density inserts
135 #if (($percent == '0.01') and ($library eq 'library')) {
136 #system("perl lib/convert_nr_2_annot.pl results/$desc/$name.txt results/$desc/$name.ingenuity.txt");
137 #}
138
139 $count = 0;
140 open SOURCE, "< CIS/nr.txt";
141 while (defined($line = <SOURCE>)) {
142 $count++;
143 }
144 @libSizes =();
145 open CIS, ">> results/summary_CIS_$proj.txt";
146 print "\n\n11.3 - $name Total number of NR regions $count\n";
147 print CIS "\n\n$name\nExcluded chromosomes $query_chromo\nTotal number of NR regions $count\n";
148 print CIS "CIS analyses calculate window sizes based on NR count\n";
149
150 #11.4 Ideal window size calculation
151 $n = 10000;
152 $f = 3;
153 while ($n < 300000) {
154 $stat =poissonBonf($n,$count,$f);
155 if ($stat > 0.05) {
156 $nf=$n-1000;
157 $stat2=poissonBonf($nf,$count,$f);
158 if ($stat2 < 0.05) {
159 print "$stat2\t$nf\t$f\n";
160 print CIS "$stat2\t$nf\t$f\n";
161 unshift(@libSizes, $nf);
162 }
163 $f=$f+1;
164 }
165 $n = $n+1000;
166 }
167 $nf='301000';
168 unshift(@libSizes, $nf);
169 print @libSizes;
170 print CIS @libsizes;
171 close CIS;
172 if ($count < 2000) {
173 @libSizes =('301000','200000','100000','50000','25000','12500');
174 }
175 if ($count > 200000) {
176 @libSizes =('301000','200000','100000','50000','25000','12500');
177 }
178
179 #11.5
180 foreach $lib(@libSizes) {
181
182 #11.5 call the cis_finder.pl script to calculate the CIS.
183 system("perl lib/cis_finder.pl $lib");
184 print "\n$lib\n";
185
186
187 #11.6 call the feature_finder.pl script to annotat the CIS regions
188 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("CIS/cis$lib.txt", "CIS/final_cis_named_$lib.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0);
189 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
190 }
191
192 #load the results into the database.and resolve overlap
193 $sth = $dbh->prepare("delete from sort_$proj");
194 $sth->execute;
195 $sth = $dbh->prepare("delete from sort2_$proj");
196 $sth->execute;
197
198 #11.7
199 foreach $lib(@libSizes) {
200 $sth = $dbh->prepare("load DATA local INFILE 'CIS/final_cis_named_$lib.txt' INTO TABLE sort2_$proj");
201 $sth->execute;
202 resolve_cis();
203 }
204
205 #MOve and rename CIS to cis_result_final table
206 $filename = $desc."_".$percent.$library;
207 $query2 = "delete from cis_result_final_$proj where type= '$filename'";
208 print "\n$query2\n";
209 $sth = $dbh->prepare("$query2");
210 $sth->execute;
211 $sth = $dbh->prepare("insert into cis_result_final_$proj select * from sort_$proj");
212 $sth->execute;
213 $query3 = "update cis_result_final_$proj set type = '$filename' where type is NULL";
214 print "$query3";
215 $sth = $dbh->prepare("$query3");
216 $sth->execute;
217
218 #Export CIS list to the results subdirectory
219 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;";
220 $sth = $dbh->prepare("$query4");
221 $sth->execute;
222 open CIS, "> results/$desc/cis_$name.xls";
223 print CIS "$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\tnumber of inserts drive transcription on positive strand";
224 while ((@row) = $sth->fetchrow_array) {
225 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
226 }
227 close CIS;
228
229
230 open SOURCE, "< results/$desc/cis_$name.xls";
231 open CIS, "> results/$desc/plot_$name.wig";
232 print CIS "track type=wiggle_0 name=\"$proj.$filename\" description=\"$proj.$filename\" visibility=full autoScale=off";
233 while (defined($line = <SOURCE>)) {
234 chomp $line;
235 @field= split(/\t/, $line);
236 @chromo=split(/:|-/, $field[1]);
237
238 if ($field[1] =~ m/chr/) {
239 if ($field[7] < 1.0e-100) {
240 $field[7] = 1.0e-100;}
241
242 $log= -(log($field[7])/log(10));
243 print CIS "\n$chromo[0]\t$chromo[1]\t$chromo[2]\t$log";
244 }
245 }
246 close CIS;
247
248 #11.11 Generate the cis summary file of top 25 library CIS
249 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;";
250 print "$query4";
251 $sth = $dbh->prepare("$query4");
252 $sth->execute;
253 open CIS, ">> results/summary_CIS_$proj.txt";
254 print CIS "\n\n$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\t+strand\t";
255 while ((@row) = $sth->fetchrow_array) {
256 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
257 }
258 close CIS;
259
260 #generate matrix and carry out association analyses
261 if (($desc eq 'all') and ($percent == 0.0001) ) {
262 print "\n\n Starting Association analyses for $name";
263 mkdir "FISH";
264 mkdir "results/Assoc";
265 system ('perl lib/visualize.pl');
266 copy("FISH/cis4cluster.txt", "results/Assoc/cis4cluster.txt") or die "Unable to copy file: $!\n";
267 #system ('module load R');
268 system ('perl lib/matrix2fisher.pl FISH/cis4cluster.txt');
269 copy("FISH/R_result.txt", "results/Assoc/R_result.txt") or die "Unable to copy file: $!\n";
270 copy("FISH/Fisher_pre_named.txt", "results/Assoc/Fisher_pre_named.txt") or die "Unable to copy file: $!\n";
271 system ('perl lib/Fisher_clean.pl');
272 }
273
274 }#close the threshhold loop
275 }#close the libraries loop
276 }#close the metadata loop
277
278 &set_project_status($proj, 0, 1);
279
280 exit(0);
281
282 sub resolve_cis {
283 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue > B.pvalue");
284 $sth->execute;
285
286 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue > B.pvalue");
287 $sth->execute;
288 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue < B.pvalue; ");
289 $sth->execute;
290 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue <=
291 B.pvalue; ");
292 $sth->execute;
293
294 $sth = $dbh->prepare("insert into sort_$proj select * from sort2_$proj");
295 $sth->execute;
296
297 $sth = $dbh->prepare("delete from sort2_$proj");
298 $sth->execute;
299 }
300
301 sub poisson {
302 return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0
303 ])))/(fac($_[2])));
304 }
305
306 sub poissonBonf {
307 return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[
308 0])))/(fac($_[2])))*$_[1]);
309 }
310
311 sub fac {
312 my ($n) = @_;
313
314 if ($n < 2) {
315 return $n;
316 }
317 else {
318 return $n * fac($n-1);
319 }
320 }
321