Mercurial > repos > jesse-erdmann > tapdance
comparison lib/TAP2.pl @ 0:1437a2df99c0
Uploaded
| author | jesse-erdmann |
|---|---|
| date | Fri, 09 Dec 2011 11:56:56 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1437a2df99c0 |
|---|---|
| 1 #This is the TAp2.pl script | |
| 2 #The goal of this is to count the number of CIS present in a NR set | |
| 3 # one file is required nr.txt sorted in chromosome order! | |
| 4 # Aaron Lyman Sarver Oct 21, 2010 | |
| 5 # This script takes over following the bowtie mapping and generates CIS. | |
| 6 # Update NOV15,2010 to pass library information through | |
| 7 #$Fully modified december 1 2010i | |
| 8 #Fully modifed May 1 2010 | |
| 9 use File::Copy; | |
| 10 require 'config.pl'; | |
| 11 require 'lib/feature_finder_methods.pl'; | |
| 12 my $path = $0; | |
| 13 $path =~ s/\/\w*\.pl$//g; | |
| 14 require "$path/project_man.pl"; | |
| 15 unless(-d "CIS") { | |
| 16 mkdir("CIS"); | |
| 17 } | |
| 18 | |
| 19 open CIS, "> results/summary_CIS_$proj.txt"; | |
| 20 close CIS; | |
| 21 | |
| 22 #generate required tables | |
| 23 $sth = $dbh->prepare("drop table if exists cis_result_final_$proj"); | |
| 24 $sth->execute; | |
| 25 $sth = $dbh->prepare("create table cis_result_final_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); | |
| 26 $sth->execute; | |
| 27 | |
| 28 $sth = $dbh->prepare("drop table if exists sort_$proj"); | |
| 29 $sth->execute; | |
| 30 $sth = $dbh->prepare("create table sort_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); | |
| 31 $sth->execute; | |
| 32 $sth = $dbh->prepare("drop table if exists sort2_$proj"); | |
| 33 $sth->execute; | |
| 34 $sth = $dbh->prepare("create table sort2_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); | |
| 35 $sth->execute; | |
| 36 | |
| 37 #load concatamer information from file | |
| 38 $sth = $dbh->prepare("drop table if exists chromo_$proj"); | |
| 39 $sth->execute; | |
| 40 $sth = $dbh->prepare("create table chromo_$proj (chromo varchar(20) ) "); | |
| 41 $sth->execute; | |
| 42 $sth = $dbh->prepare("load DATA local INFILE 'data/chromo.tab' INTO TABLE chromo_$proj "); | |
| 43 $sth->execute; | |
| 44 $sth = $dbh->prepare("select distinct chromo from chromo_$proj"); | |
| 45 $sth->execute; | |
| 46 print "10.3 - Loaded concatamer chromo\n"; | |
| 47 #generate donor exclusion portion of the SQL query | |
| 48 @exclude; | |
| 49 $query_chromo= ''; | |
| 50 while ((@row) = $sth->fetchrow_array) { | |
| 51 push (@exclude,$row[0]); | |
| 52 print "\n$row[0]"; | |
| 53 } | |
| 54 print @exclude; | |
| 55 foreach $desc(@exclude) { | |
| 56 $query_chromo = $query_chromo."and chromo != '".$desc."' "; | |
| 57 } | |
| 58 print $query_chromo; | |
| 59 | |
| 60 #10.4 load library metadata | |
| 61 $sth = $dbh->prepare("drop table if exists metadata_$proj"); | |
| 62 $sth->execute; | |
| 63 $sth = $dbh->prepare("create table metadata_$proj (library varchar(20), descriptor varchar(20), type varchar (20) ) "); | |
| 64 $sth->execute; | |
| 65 $sth = $dbh->prepare("load DATA local INFILE 'data/metadata.tab' INTO TABLE metadata_$proj "); | |
| 66 $sth->execute; | |
| 67 $sth = $dbh->prepare("select distinct descriptor from metadata_$proj where type = 'cis'"); | |
| 68 $sth->execute; | |
| 69 print "10.4 - Loaded metadata\n"; | |
| 70 | |
| 71 #10.2 load the feature hash | |
| 72 print "10.2 - Loading features...\n"; | |
| 73 my ($feature_file, $window, $fchrom, $fstart, $fend, $debug) = ($annotation_file, 20000, 0, 1, 2, 1); | |
| 74 my $feature_hash_ref = &feature_index(\$feature_file, \$window, \$fchrom, \$fstart, \$fend, \$debug); | |
| 75 print "10.2 - Features loaded.\n"; | |
| 76 | |
| 77 #generate metadata based library portion of the SQL query | |
| 78 @descriptors; | |
| 79 $query_definition = ''; | |
| 80 while ((@row) = $sth->fetchrow_array) { | |
| 81 push (@descriptors,$row[0]); | |
| 82 print "\n$row[0]"; | |
| 83 } | |
| 84 print @descriptors; | |
| 85 foreach $desc(@descriptors) { | |
| 86 $query_definition = ''; | |
| 87 $query = "select distinct library from metadata_$proj where descriptor = '$desc'"; | |
| 88 #print "\n$query\n"; | |
| 89 $sth = $dbh->prepare("$query"); | |
| 90 $sth->execute; | |
| 91 | |
| 92 while ((@row) = $sth->fetchrow_array) { | |
| 93 $query_definition = $query_definition."or A.library = '$row[0]' "; | |
| 94 #print "$row[0]"; | |
| 95 } | |
| 96 print "\n$desc\n"; | |
| 97 #print "$query_definition\n"; | |
| 98 #setup the looping featured | |
| 99 @options = (''); | |
| 100 foreach $library(@options) { | |
| 101 @percent =("$library_percent"); | |
| 102 foreach $percent(@percent) { | |
| 103 | |
| 104 #11.1 query to select regions of interest | |
| 105 $sth = $dbh->prepare("select chromo, start, A.library,count, count/total,orient from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > $percent $query_chromo and (1=2 $query_definition) | |
| 106 order by chromo,start+0 ;"); | |
| 107 $sth->execute; | |
| 108 | |
| 109 unless(-d "results") { | |
| 110 mkdir("results"); | |
| 111 } | |
| 112 unless(-d "results/$desc") { | |
| 113 mkdir("results/$desc"); | |
| 114 } | |
| 115 | |
| 116 open OUT, "> CIS/nr.txt"; | |
| 117 $name = "$desc-nr-$proj-$percent"; | |
| 118 open OUT2, "> results/$desc/$name.txt"; | |
| 119 open OUT3, "> results/$desc/$name.BED"; | |
| 120 print OUT3 "track type ='BED' name='$name' description='$name Insertions' visibility=2 itemRgb='On'"; | |
| 121 while ((@row) = $sth->fetchrow_array) { | |
| 122 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]"; | |
| 123 $start = $row[1]-50; | |
| 124 $stop = $row[1]+50; | |
| 125 print OUT2 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[4]\t$row[5]"; | |
| 126 print OUT3 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[5]\t$start\t$stop\t159,0,197"; | |
| 127 } | |
| 128 close OUT; | |
| 129 | |
| 130 #11.2 Anotate the inserts from the .txt file | |
| 131 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("results/$desc/$name.txt", "results/$desc/$name.ann.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0); | |
| 132 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance); | |
| 133 unlink("results/$desc/$name.txt"); | |
| 134 #generate a list of gense with high density inserts | |
| 135 #if (($percent == '0.01') and ($library eq 'library')) { | |
| 136 #system("perl lib/convert_nr_2_annot.pl results/$desc/$name.txt results/$desc/$name.ingenuity.txt"); | |
| 137 #} | |
| 138 | |
| 139 $count = 0; | |
| 140 open SOURCE, "< CIS/nr.txt"; | |
| 141 while (defined($line = <SOURCE>)) { | |
| 142 $count++; | |
| 143 } | |
| 144 @libSizes =(); | |
| 145 open CIS, ">> results/summary_CIS_$proj.txt"; | |
| 146 print "\n\n11.3 - $name Total number of NR regions $count\n"; | |
| 147 print CIS "\n\n$name\nExcluded chromosomes $query_chromo\nTotal number of NR regions $count\n"; | |
| 148 print CIS "CIS analyses calculate window sizes based on NR count\n"; | |
| 149 | |
| 150 #11.4 Ideal window size calculation | |
| 151 $n = 10000; | |
| 152 $f = 3; | |
| 153 while ($n < 300000) { | |
| 154 $stat =poissonBonf($n,$count,$f); | |
| 155 if ($stat > 0.05) { | |
| 156 $nf=$n-1000; | |
| 157 $stat2=poissonBonf($nf,$count,$f); | |
| 158 if ($stat2 < 0.05) { | |
| 159 print "$stat2\t$nf\t$f\n"; | |
| 160 print CIS "$stat2\t$nf\t$f\n"; | |
| 161 unshift(@libSizes, $nf); | |
| 162 } | |
| 163 $f=$f+1; | |
| 164 } | |
| 165 $n = $n+1000; | |
| 166 } | |
| 167 $nf='301000'; | |
| 168 unshift(@libSizes, $nf); | |
| 169 print @libSizes; | |
| 170 print CIS @libsizes; | |
| 171 close CIS; | |
| 172 if ($count < 2000) { | |
| 173 @libSizes =('301000','200000','100000','50000','25000','12500'); | |
| 174 } | |
| 175 if ($count > 200000) { | |
| 176 @libSizes =('301000','200000','100000','50000','25000','12500'); | |
| 177 } | |
| 178 | |
| 179 #11.5 | |
| 180 foreach $lib(@libSizes) { | |
| 181 | |
| 182 #11.5 call the cis_finder.pl script to calculate the CIS. | |
| 183 system("perl lib/cis_finder.pl $lib"); | |
| 184 print "\n$lib\n"; | |
| 185 | |
| 186 | |
| 187 #11.6 call the feature_finder.pl script to annotat the CIS regions | |
| 188 my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("CIS/cis$lib.txt", "CIS/final_cis_named_$lib.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0); | |
| 189 &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance); | |
| 190 } | |
| 191 | |
| 192 #load the results into the database.and resolve overlap | |
| 193 $sth = $dbh->prepare("delete from sort_$proj"); | |
| 194 $sth->execute; | |
| 195 $sth = $dbh->prepare("delete from sort2_$proj"); | |
| 196 $sth->execute; | |
| 197 | |
| 198 #11.7 | |
| 199 foreach $lib(@libSizes) { | |
| 200 $sth = $dbh->prepare("load DATA local INFILE 'CIS/final_cis_named_$lib.txt' INTO TABLE sort2_$proj"); | |
| 201 $sth->execute; | |
| 202 resolve_cis(); | |
| 203 } | |
| 204 | |
| 205 #MOve and rename CIS to cis_result_final table | |
| 206 $filename = $desc."_".$percent.$library; | |
| 207 $query2 = "delete from cis_result_final_$proj where type= '$filename'"; | |
| 208 print "\n$query2\n"; | |
| 209 $sth = $dbh->prepare("$query2"); | |
| 210 $sth->execute; | |
| 211 $sth = $dbh->prepare("insert into cis_result_final_$proj select * from sort_$proj"); | |
| 212 $sth->execute; | |
| 213 $query3 = "update cis_result_final_$proj set type = '$filename' where type is NULL"; | |
| 214 print "$query3"; | |
| 215 $sth = $dbh->prepare("$query3"); | |
| 216 $sth->execute; | |
| 217 | |
| 218 #Export CIS list to the results subdirectory | |
| 219 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;"; | |
| 220 $sth = $dbh->prepare("$query4"); | |
| 221 $sth->execute; | |
| 222 open CIS, "> results/$desc/cis_$name.xls"; | |
| 223 print CIS "$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\tnumber of inserts drive transcription on positive strand"; | |
| 224 while ((@row) = $sth->fetchrow_array) { | |
| 225 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t"; | |
| 226 } | |
| 227 close CIS; | |
| 228 | |
| 229 | |
| 230 open SOURCE, "< results/$desc/cis_$name.xls"; | |
| 231 open CIS, "> results/$desc/plot_$name.wig"; | |
| 232 print CIS "track type=wiggle_0 name=\"$proj.$filename\" description=\"$proj.$filename\" visibility=full autoScale=off"; | |
| 233 while (defined($line = <SOURCE>)) { | |
| 234 chomp $line; | |
| 235 @field= split(/\t/, $line); | |
| 236 @chromo=split(/:|-/, $field[1]); | |
| 237 | |
| 238 if ($field[1] =~ m/chr/) { | |
| 239 if ($field[7] < 1.0e-100) { | |
| 240 $field[7] = 1.0e-100;} | |
| 241 | |
| 242 $log= -(log($field[7])/log(10)); | |
| 243 print CIS "\n$chromo[0]\t$chromo[1]\t$chromo[2]\t$log"; | |
| 244 } | |
| 245 } | |
| 246 close CIS; | |
| 247 | |
| 248 #11.11 Generate the cis summary file of top 25 library CIS | |
| 249 $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;"; | |
| 250 print "$query4"; | |
| 251 $sth = $dbh->prepare("$query4"); | |
| 252 $sth->execute; | |
| 253 open CIS, ">> results/summary_CIS_$proj.txt"; | |
| 254 print CIS "\n\n$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\t+strand\t"; | |
| 255 while ((@row) = $sth->fetchrow_array) { | |
| 256 print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t"; | |
| 257 } | |
| 258 close CIS; | |
| 259 | |
| 260 #generate matrix and carry out association analyses | |
| 261 if (($desc eq 'all') and ($percent == 0.0001) ) { | |
| 262 print "\n\n Starting Association analyses for $name"; | |
| 263 mkdir "FISH"; | |
| 264 mkdir "results/Assoc"; | |
| 265 system ('perl lib/visualize.pl'); | |
| 266 copy("FISH/cis4cluster.txt", "results/Assoc/cis4cluster.txt") or die "Unable to copy file: $!\n"; | |
| 267 #system ('module load R'); | |
| 268 system ('perl lib/matrix2fisher.pl FISH/cis4cluster.txt'); | |
| 269 copy("FISH/R_result.txt", "results/Assoc/R_result.txt") or die "Unable to copy file: $!\n"; | |
| 270 copy("FISH/Fisher_pre_named.txt", "results/Assoc/Fisher_pre_named.txt") or die "Unable to copy file: $!\n"; | |
| 271 system ('perl lib/Fisher_clean.pl'); | |
| 272 } | |
| 273 | |
| 274 }#close the threshhold loop | |
| 275 }#close the libraries loop | |
| 276 }#close the metadata loop | |
| 277 | |
| 278 &set_project_status($proj, 0, 1); | |
| 279 | |
| 280 exit(0); | |
| 281 | |
| 282 sub resolve_cis { | |
| 283 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue > B.pvalue"); | |
| 284 $sth->execute; | |
| 285 | |
| 286 $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue > B.pvalue"); | |
| 287 $sth->execute; | |
| 288 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue < B.pvalue; "); | |
| 289 $sth->execute; | |
| 290 $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue <= | |
| 291 B.pvalue; "); | |
| 292 $sth->execute; | |
| 293 | |
| 294 $sth = $dbh->prepare("insert into sort_$proj select * from sort2_$proj"); | |
| 295 $sth->execute; | |
| 296 | |
| 297 $sth = $dbh->prepare("delete from sort2_$proj"); | |
| 298 $sth->execute; | |
| 299 } | |
| 300 | |
| 301 sub poisson { | |
| 302 return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0 | |
| 303 ])))/(fac($_[2]))); | |
| 304 } | |
| 305 | |
| 306 sub poissonBonf { | |
| 307 return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[ | |
| 308 0])))/(fac($_[2])))*$_[1]); | |
| 309 } | |
| 310 | |
| 311 sub fac { | |
| 312 my ($n) = @_; | |
| 313 | |
| 314 if ($n < 2) { | |
| 315 return $n; | |
| 316 } | |
| 317 else { | |
| 318 return $n * fac($n-1); | |
| 319 } | |
| 320 } | |
| 321 |
