diff lib/TAP2.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/TAP2.pl	Fri Dec 09 11:56:56 2011 -0500
@@ -0,0 +1,321 @@
+#This is the TAp2.pl script 
+#The goal of this is to count the number of CIS present in a NR set
+# one file is required nr.txt sorted in chromosome order!
+# Aaron Lyman Sarver Oct 21, 2010
+# This script takes over following the bowtie mapping and generates CIS.
+# Update NOV15,2010 to pass library information through 
+#$Fully modified december 1 2010i
+#Fully modifed May 1 2010
+use File::Copy;
+require 'config.pl';
+require 'lib/feature_finder_methods.pl';
+my $path = $0;
+$path =~ s/\/\w*\.pl$//g;
+require "$path/project_man.pl";
+unless(-d "CIS") {
+    mkdir("CIS");
+}
+
+open CIS, "> results/summary_CIS_$proj.txt";
+close CIS;
+
+#generate required tables
+$sth = $dbh->prepare("drop table if exists cis_result_final_$proj");
+$sth->execute;
+$sth = $dbh->prepare("create table cis_result_final_$proj (chromo varchar(20), start int, stop int, name  varchar(20), number varchar(20), size  varchar(20), pvalue  double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type  varchar(20))");
+$sth->execute;
+
+$sth = $dbh->prepare("drop table if exists sort_$proj");
+$sth->execute;
+$sth = $dbh->prepare("create table sort_$proj (chromo varchar(20), start int, stop int, name  varchar(20), number varchar(20), size  varchar(20), pvalue  double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type  varchar(20))");
+$sth->execute;
+$sth = $dbh->prepare("drop table if exists sort2_$proj");
+$sth->execute;
+$sth = $dbh->prepare("create table sort2_$proj (chromo varchar(20), start int, stop int, name  varchar(20), number varchar(20), size  varchar(20), pvalue  double, total varchar(20), pvaluetotal double,region varchar(20), pvalueregion double, library_name varchar(5000), strand int, gene_name varchar(1000), type  varchar(20))");
+$sth->execute;
+
+#load concatamer information from file
+$sth = $dbh->prepare("drop table if exists chromo_$proj");
+$sth->execute;
+$sth = $dbh->prepare("create table chromo_$proj (chromo varchar(20) ) ");
+$sth->execute;
+$sth = $dbh->prepare("load DATA local INFILE 'data/chromo.tab' INTO TABLE chromo_$proj ");
+$sth->execute;
+$sth = $dbh->prepare("select distinct chromo from chromo_$proj");
+$sth->execute;
+print "10.3 - Loaded concatamer chromo\n";
+#generate donor exclusion portion of the SQL query
+@exclude;
+$query_chromo= '';
+while ((@row) = $sth->fetchrow_array) {
+       push (@exclude,$row[0]);
+       print "\n$row[0]";
+}
+print @exclude;
+foreach $desc(@exclude) {
+    $query_chromo = $query_chromo."and chromo != '".$desc."' ";
+}
+print $query_chromo;
+
+#10.4 load library metadata
+$sth = $dbh->prepare("drop table if exists metadata_$proj");
+$sth->execute;
+$sth = $dbh->prepare("create table metadata_$proj (library varchar(20), descriptor varchar(20), type varchar (20) ) ");
+$sth->execute;
+$sth = $dbh->prepare("load DATA local INFILE 'data/metadata.tab' INTO TABLE metadata_$proj ");
+$sth->execute;
+$sth = $dbh->prepare("select distinct descriptor from metadata_$proj where type = 'cis'");
+$sth->execute;
+print "10.4 - Loaded metadata\n";
+
+#10.2 load the feature hash
+print "10.2 - Loading features...\n";
+my ($feature_file, $window, $fchrom, $fstart, $fend, $debug) = ($annotation_file, 20000, 0, 1, 2, 1);
+my $feature_hash_ref = &feature_index(\$feature_file, \$window, \$fchrom, \$fstart, \$fend, \$debug);
+print "10.2 - Features loaded.\n";
+
+#generate metadata based library portion of the SQL query
+@descriptors;
+$query_definition = '';
+while ((@row) = $sth->fetchrow_array) {
+       push (@descriptors,$row[0]);
+       print "\n$row[0]";
+}
+print @descriptors;
+foreach $desc(@descriptors) {
+    $query_definition = '';
+    $query = "select distinct library from metadata_$proj where descriptor = '$desc'";
+#print "\n$query\n";
+    $sth = $dbh->prepare("$query");
+    $sth->execute;
+    
+    while ((@row) = $sth->fetchrow_array) {
+	$query_definition = $query_definition."or A.library = '$row[0]'  "; 
+#print "$row[0]";
+    }
+    print "\n$desc\n";
+#print "$query_definition\n";
+#setup the looping featured
+    @options = ('');
+    foreach $library(@options) {
+	@percent =("$library_percent");
+	foreach $percent(@percent) {
+
+#11.1 query to select regions of interest
+	    $sth = $dbh->prepare("select chromo, start, A.library,count, count/total,orient  from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > $percent $query_chromo and (1=2 $query_definition) 
+order by chromo,start+0 ;");
+	    $sth->execute;
+	    
+	    unless(-d "results") {
+		mkdir("results");
+	    }
+	    unless(-d "results/$desc") {
+		mkdir("results/$desc");
+	    }
+	    
+	    open OUT, "> CIS/nr.txt";
+	    $name = "$desc-nr-$proj-$percent";
+	    open OUT2, "> results/$desc/$name.txt";
+	    open OUT3, "> results/$desc/$name.BED";
+	    print OUT3 "track type ='BED' name='$name' description='$name Insertions' visibility=2 itemRgb='On'";
+	    while ((@row) = $sth->fetchrow_array) {
+		print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
+		$start = $row[1]-50;
+		$stop = $row[1]+50;
+		print OUT2 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[4]\t$row[5]";
+		print OUT3 "\n$row[0]\t$start\t$stop\t$row[2]\t$row[3]\t$row[5]\t$start\t$stop\t159,0,197";
+	    }
+	    close OUT;
+	    
+#11.2 Anotate the inserts from the .txt file
+	    my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("results/$desc/$name.txt", "results/$desc/$name.ann.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0);
+	    &process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
+	    unlink("results/$desc/$name.txt");
+            #generate a list of gense with high density inserts
+            #if (($percent == '0.01') and ($library eq 'library')) {
+            #system("perl lib/convert_nr_2_annot.pl results/$desc/$name.txt results/$desc/$name.ingenuity.txt");
+            #}
+
+	    $count = 0;
+	    open SOURCE, "< CIS/nr.txt";
+	    while (defined($line = <SOURCE>)) {
+		$count++;
+	    }
+	    @libSizes =();
+	    open CIS, ">> results/summary_CIS_$proj.txt";
+	    print "\n\n11.3 - $name  Total number of NR regions $count\n";
+	    print CIS "\n\n$name\nExcluded chromosomes $query_chromo\nTotal number of NR regions $count\n";
+	    print CIS "CIS analyses calculate window sizes based on NR count\n";
+
+#11.4 Ideal window size calculation
+	    $n = 10000;
+	    $f = 3;
+	    while ($n < 300000) {
+		$stat =poissonBonf($n,$count,$f);
+		if ($stat > 0.05) {
+		    $nf=$n-1000;
+		    $stat2=poissonBonf($nf,$count,$f);
+		    if ($stat2 < 0.05) {
+			print "$stat2\t$nf\t$f\n";
+			print CIS "$stat2\t$nf\t$f\n";
+			unshift(@libSizes, $nf);
+		    }
+		    $f=$f+1;
+		}
+		$n = $n+1000;
+	    }
+	    $nf='301000';
+	    unshift(@libSizes, $nf);
+	    print @libSizes;
+	    print CIS @libsizes;
+	    close CIS;
+	    if ($count < 2000) {
+		@libSizes =('301000','200000','100000','50000','25000','12500');
+	    }
+	    if ($count > 200000) {
+		@libSizes =('301000','200000','100000','50000','25000','12500');
+	    }
+
+#11.5 
+	    foreach $lib(@libSizes) {
+		
+#11.5 call the cis_finder.pl script to calculate the CIS.
+		system("perl lib/cis_finder.pl $lib");
+		print "\n$lib\n";
+
+		
+#11.6 call the feature_finder.pl script to annotat the CIS regions
+		my ($interval_file, $out_file, $req_res_type, $direction, $interval_cols, $feature_cols, $display_distance) = ("CIS/cis$lib.txt", "CIS/final_cis_named_$lib.txt", "ALL", "BOTH", "1:2:3:4:10", "1:2:3:4:6", 0); 
+		&process_intervals(\$interval_file, \$out_file, $feature_hash_ref, \$req_res_type, \$direction, \$window, \$html, \$cturl, \$db, \$interval_cols, \$feature_cols, \$debug, \$display_distance);
+	    }
+	    
+#load the results into the database.and resolve overlap
+	    $sth = $dbh->prepare("delete from sort_$proj");
+	    $sth->execute;
+	    $sth = $dbh->prepare("delete from sort2_$proj");
+	    $sth->execute;
+	    
+#11.7
+	    foreach $lib(@libSizes) {
+		$sth = $dbh->prepare("load DATA local INFILE 'CIS/final_cis_named_$lib.txt' INTO TABLE sort2_$proj");
+$sth->execute;
+		resolve_cis();
+}
+
+#MOve and rename CIS to cis_result_final table 
+	    $filename = $desc."_".$percent.$library;
+	    $query2 = "delete from cis_result_final_$proj where type= '$filename'";
+	    print "\n$query2\n";
+	    $sth = $dbh->prepare("$query2");
+	    $sth->execute;
+	    $sth = $dbh->prepare("insert into cis_result_final_$proj select * from sort_$proj");
+	    $sth->execute;
+	    $query3 = "update cis_result_final_$proj set type = '$filename' where type is NULL";
+	    print "$query3";
+	    $sth = $dbh->prepare("$query3");
+	    $sth->execute;
+	    
+#Export CIS list to the results subdirectory
+	    $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename'  and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;";
+	    $sth = $dbh->prepare("$query4");
+	    $sth->execute;
+	    open CIS, "> results/$desc/cis_$name.xls";
+	    print CIS "$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\tnumber of inserts drive transcription on positive strand";
+	    while ((@row) = $sth->fetchrow_array) {
+		print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
+	    }
+	    close CIS;
+	    
+	    
+	    open SOURCE, "< results/$desc/cis_$name.xls";
+	    open CIS, "> results/$desc/plot_$name.wig";
+	    print CIS "track type=wiggle_0 name=\"$proj.$filename\" description=\"$proj.$filename\" visibility=full autoScale=off";
+	    while (defined($line = <SOURCE>)) {
+		chomp $line;
+		@field= split(/\t/, $line);
+		@chromo=split(/:|-/, $field[1]);
+		
+		if ($field[1] =~ m/chr/) {
+		    if ($field[7] < 1.0e-100) {
+			$field[7] = 1.0e-100;}
+		    
+		    $log= -(log($field[7])/log(10));
+		    print CIS "\n$chromo[0]\t$chromo[1]\t$chromo[2]\t$log";
+		}
+	    }
+	    close CIS;
+
+#11.11 Generate the cis summary file of top 25 library CIS
+	    $query4 = "select number,CONCAT(chromo,':',start,'-',stop)as pos, total, pvaluetotal,number, pvalue,region,pvalueregion,gene_name,library_name,strand from cis_result_final_$proj where type = '$filename'  and pvalueregion < '$CIS_region_pvalue' and pvaluetotal < '$CIS_total_pvalue' and pvalue < '$CIS_library_pvalue' and gene_name not like '%BAD%' order by pvalueregion;"; 
+	    print "$query4";
+	    $sth = $dbh->prepare("$query4");
+	    $sth->execute;
+	    open CIS, ">> results/summary_CIS_$proj.txt";
+	    print CIS "\n\n$filename\nnumber\tpos\t#inserts\tpvalueinsert#\t#library\tpvaluelibrary#\t#regions\tpvalueregion#\tgene_name\tlibrary_name\t+strand\t";
+	    while ((@row) = $sth->fetchrow_array) {        
+		print CIS "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]\t$row[9]\t$row[10]\t$row[11]\t";
+	    }
+	    close CIS;
+
+#generate matrix and carry out association analyses
+	    if (($desc eq 'all') and ($percent == 0.0001) ) {
+		print "\n\n Starting Association analyses for $name";
+		mkdir "FISH";
+		mkdir "results/Assoc";
+		system ('perl lib/visualize.pl');
+		copy("FISH/cis4cluster.txt", "results/Assoc/cis4cluster.txt") or die "Unable to copy file: $!\n";
+		#system ('module load R');
+		system ('perl lib/matrix2fisher.pl FISH/cis4cluster.txt');
+		copy("FISH/R_result.txt", "results/Assoc/R_result.txt") or die "Unable to copy file: $!\n";
+		copy("FISH/Fisher_pre_named.txt", "results/Assoc/Fisher_pre_named.txt") or die "Unable to copy file: $!\n";
+		system ('perl lib/Fisher_clean.pl');
+	    }
+
+	}#close the threshhold loop
+    }#close the libraries loop
+}#close the metadata loop
+
+&set_project_status($proj, 0, 1);
+
+exit(0);
+
+sub resolve_cis {
+    $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue >  B.pvalue");
+    $sth->execute;
+    
+    $sth = $dbh->prepare("delete from A using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue >  B.pvalue");
+    $sth->execute;
+    $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.start and A.stop >= B.start and A.pvalue <  B.pvalue; ");
+    $sth->execute;
+    $sth = $dbh->prepare("delete from B using sort_$proj A, sort2_$proj B where A.chromo = B.chromo and A.start <= B.stop and A.stop >= B.stop and A.pvalue <=
+  B.pvalue; ");
+    $sth->execute;
+    
+    $sth = $dbh->prepare("insert into sort_$proj select * from sort2_$proj");
+    $sth->execute;
+    
+    $sth = $dbh->prepare("delete from sort2_$proj");
+    $sth->execute;
+}
+
+sub poisson {
+    return ((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[0
+								     ])))/(fac($_[2])));
+}
+
+sub poissonBonf {
+    return (((($_[1]/(2393726033/$_[0]))**($_[2]))*(2.71828**-($_[1]/(2393726033/$_[
+									  0])))/(fac($_[2])))*$_[1]);
+}
+
+sub fac {
+    my ($n) = @_;
+    
+    if ($n < 2) {
+	return $n;
+    }
+    else {
+	return $n * fac($n-1);
+    }
+}
+