Mercurial > repos > jesse-erdmann > tapdance
view lib/TAP2.pl @ 3:17ce4f3bffa2 default tip
Uploaded
author | jesse-erdmann |
---|---|
date | Tue, 24 Jan 2012 18:33:41 -0500 |
parents | 1437a2df99c0 |
children |
line wrap: on
line source
#This is the TAp2.pl script #The goal of this is to count the number of CIS present in a NR set # one file is required nr.txt sorted in chromosome order! # Aaron Lyman Sarver Oct 21, 2010 # This script takes over following the bowtie mapping and generates CIS. # Update NOV15,2010 to pass library information through #$Fully modified december 1 2010i #Fully modifed May 1 2010 use File::Copy; require 'config.pl'; require 'lib/feature_finder_methods.pl'; my $path = $0; $path =~ s/\/\w*\.pl$//g; require "$path/project_man.pl"; unless(-d "CIS") { mkdir("CIS"); } open CIS, "> results/summary_CIS_$proj.txt"; close CIS; #generate required tables $sth = $dbh->prepare("drop table if exists cis_result_final_$proj"); $sth->execute; $sth = $dbh->prepare("create table cis_result_final_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); $sth->execute; $sth = $dbh->prepare("drop table if exists sort_$proj"); $sth->execute; $sth = $dbh->prepare("create table sort_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); $sth->execute; $sth = $dbh->prepare("drop table if exists sort2_$proj"); $sth->execute; $sth = $dbh->prepare("create table sort2_$proj (chromo varchar(20), start int, stop int, name varchar(20), number varchar(20), size varchar(20), pvalue double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type varchar(20))"); $sth->execute; #load concatamer information from file $sth = $dbh->prepare("drop table if exists chromo_$proj"); $sth->execute; $sth = $dbh->prepare("create table chromo_$proj (chromo varchar(20) ) "); $sth->execute; $sth = $dbh->prepare("load DATA local INFILE 'data/chromo.tab' INTO TABLE chromo_$proj "); $sth->execute; $sth = $dbh->prepare("select distinct chromo from chromo_$proj"); $sth->execute; print "10.3 - Loaded concatamer chromo\n"; #generate donor exclusion portion of the SQL query @exclude; $query_chromo= ''; while ((@row) = $sth->fetchrow_array) { push (@exclude,$row[0]); print "\n$row[0]"; } print @exclude; foreach $desc(@exclude) { $query_chromo = $query_chromo."and chromo != '".$desc."' "; } print $query_chromo; #10.4 load library metadata $sth = $dbh->prepare("drop table if exists metadata_$proj"); $sth->execute; $sth = $dbh->prepare("create table metadata_$proj (library varchar(20), descriptor varchar(20), type varchar (20) ) "); $sth->execute; $sth = $dbh->prepare("load DATA local INFILE 'data/metadata.tab' INTO TABLE metadata_$proj "); $sth->execute; $sth = $dbh->prepare("select distinct descriptor from metadata_$proj where type = 'cis'"); $sth->execute; print "10.4 - Loaded metadata\n"; #10.2 load the feature hash print "10.2 - Loading features...\n"; my ($feature_file, $window, $fchrom, $fstart, $fend, $debug) = ($annotation_file, 20000, 0, 1, 2, 1); my $feature_hash_ref = &feature_index(\$feature_file, \$window, \$fchrom, \$fstart, \$fend, \$debug); print "10.2 - Features loaded.\n"; #generate metadata based library portion of the SQL query @descriptors; $query_definition = ''; while ((@row) = $sth->fetchrow_array) { push (@descriptors,$row[0]); print "\n$row[0]"; } print @descriptors; foreach $desc(@descriptors) { $query_definition = ''; $query = "select distinct library from metadata_$proj where descriptor = '$desc'"; #print "\n$query\n"; $sth = $dbh->prepare("$query"); $sth->execute; while ((@row) = $sth->fetchrow_array) { $query_definition = $query_definition."or A.library = '$row[0]' "; #print "$row[0]"; } print "\n$desc\n"; #print "$query_definition\n"; #setup the looping featured @options = (''); foreach $library(@options) { @percent =("$library_percent"); foreach $percent(@percent) { #11.1 query to select regions of interest $sth = $dbh->prepare("select chromo, start, A.library,count, count/total,orient from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > $percent $query_chromo and (1=2 $query_definition) order by chromo,start+0 ;"); $sth->execute; unless(-d "results") { mkdir("results"); } unless(-d "results/$desc") { mkdir("results/$desc"); } open OUT, "> CIS/nr.txt"; $name = "$desc-nr-$proj-$percent"; open OUT2, "> results/$desc/$name.txt"; open OUT3, "> results/$desc/$name.BED"; print OUT3 "track type ='BED' name='$name' description='$name Insertions' visibility=2 itemRgb='On'"; while ((@row) = $sth->fetchrow_array) { print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]"; $start = $row[1]-50; $stop = $row[1]+50; print OUT2 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[4]\t$row[5]"; print OUT3 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[5]\t$start\t$stop\t159,0,197"; } close OUT; #11.2 Anotate the inserts from the .txt file my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("results/$desc/$name.txt", "results/$desc/$name.ann.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0); &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance); unlink("results/$desc/$name.txt"); #generate a list of gense with high density inserts #if (($percent == '0.01') and ($library eq 'library')) { #system("perl lib/convert_nr_2_annot.pl results/$desc/$name.txt results/$desc/$name.ingenuity.txt"); #} $count = 0; open SOURCE, "< CIS/nr.txt"; while (defined($line = <SOURCE>)) { $count++; } @libSizes =(); open CIS, ">> results/summary_CIS_$proj.txt"; print "\n\n11.3 - $name Total number of NR regions $count\n"; print CIS "\n\n$name\nExcluded chromosomes $query_chromo\nTotal number of NR regions $count\n"; print CIS "CIS analyses calculate window sizes based on NR count\n"; #11.4 Ideal window size calculation $n = 10000; $f = 3; while ($n < 300000) { $stat =poissonBonf($n,$count,$f); if ($stat > 0.05) { $nf=$n-1000; $stat2=poissonBonf($nf,$count,$f); if ($stat2 < 0.05) { print "$stat2\t$nf\t$f\n"; print CIS "$stat2\t$nf\t$f\n"; unshift(@libSizes, $nf); } $f=$f+1; } $n = $n+1000; } $nf='301000'; unshift(@libSizes, $nf); print @libSizes; print CIS @libsizes; close CIS; if ($count < 2000) { @libSizes =('301000','200000','100000','50000','25000','12500'); } if ($count > 200000) { @libSizes =('301000','200000','100000','50000','25000','12500'); } #11.5 foreach $lib(@libSizes) { #11.5 call the cis_finder.pl script to calculate the CIS. system("perl lib/cis_finder.pl $lib"); print "\n$lib\n"; #11.6 call the feature_finder.pl script to annotat the CIS regions my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("CIS/cis$lib.txt", "CIS/final_cis_named_$lib.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0); &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance); } #load the results into the database.and resolve overlap $sth = $dbh->prepare("delete from sort_$proj"); $sth->execute; $sth = $dbh->prepare("delete from sort2_$proj"); $sth->execute; #11.7 foreach $lib(@libSizes) { $sth = $dbh->prepare("load DATA local INFILE 'CIS/final_cis_named_$lib.txt' INTO TABLE sort2_$proj"); $sth->execute; resolve_cis(); } #MOve and rename CIS to cis_result_final table $filename = $desc."_".$percent.$library; $query2 = "delete from cis_result_final_$proj where type= '$filename'"; print "\n$query2\n"; $sth = $dbh->prepare("$query2"); $sth->execute; $sth = $dbh->prepare("insert into cis_result_final_$proj select * from sort_$proj"); $sth->execute; $query3 = "update cis_result_final_$proj set type = '$filename' where type is NULL"; print "$query3"; $sth = $dbh->prepare("$query3"); $sth->execute; #Export CIS list to the results subdirectory $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;"; $sth = $dbh->prepare("$query4"); $sth->execute; open CIS, "> results/$desc/cis_$name.xls"; print CIS "$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\tnumber of inserts drive transcription on positive strand"; while ((@row) = $sth->fetchrow_array) { print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t"; } close CIS; open SOURCE, "< results/$desc/cis_$name.xls"; open CIS, "> results/$desc/plot_$name.wig"; print CIS "track type=wiggle_0 name=\"$proj.$filename\" description=\"$proj.$filename\" visibility=full autoScale=off"; while (defined($line = <SOURCE>)) { chomp $line; @field= split(/\t/, $line); @chromo=split(/:|-/, $field[1]); if ($field[1] =~ m/chr/) { if ($field[7] < 1.0e-100) { $field[7] = 1.0e-100;} $log= -(log($field[7])/log(10)); print CIS "\n$chromo[0]\t$chromo[1]\t$chromo[2]\t$log"; } } close CIS; #11.11 Generate the cis summary file of top 25 library CIS $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename' and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;"; print "$query4"; $sth = $dbh->prepare("$query4"); $sth->execute; open CIS, ">> results/summary_CIS_$proj.txt"; print CIS "\n\n$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\t+strand\t"; while ((@row) = $sth->fetchrow_array) { print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t"; } close CIS; #generate matrix and carry out association analyses if (($desc eq 'all') and ($percent == 0.0001) ) { print "\n\n Starting Association analyses for $name"; mkdir "FISH"; mkdir "results/Assoc"; system ('perl lib/visualize.pl'); copy("FISH/cis4cluster.txt", "results/Assoc/cis4cluster.txt") or die "Unable to copy file: $!\n"; #system ('module load R'); system ('perl lib/matrix2fisher.pl FISH/cis4cluster.txt'); copy("FISH/R_result.txt", "results/Assoc/R_result.txt") or die "Unable to copy file: $!\n"; copy("FISH/Fisher_pre_named.txt", "results/Assoc/Fisher_pre_named.txt") or die "Unable to copy file: $!\n"; system ('perl lib/Fisher_clean.pl'); } }#close the threshhold loop }#close the libraries loop }#close the metadata loop &set_project_status($proj, 0, 1); exit(0); sub resolve_cis { $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue > B.pvalue"); $sth->execute; $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue > B.pvalue"); $sth->execute; $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue < B.pvalue; "); $sth->execute; $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue <= B.pvalue; "); $sth->execute; $sth = $dbh->prepare("insert into sort_$proj select * from sort2_$proj"); $sth->execute; $sth = $dbh->prepare("delete from sort2_$proj"); $sth->execute; } sub poisson { return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0 ])))/(fac($_[2]))); } sub poissonBonf { return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[ 0])))/(fac($_[2])))*$_[1]); } sub fac { my ($n) = @_; if ($n < 2) { return $n; } else { return $n * fac($n-1); } }