diff check2.pl @ 2:2cceb9398d33 draft

Uploaded
author mkhan1980
date Mon, 04 Mar 2013 06:38:21 -0500
parents
children f1696b304b8d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/check2.pl	Mon Mar 04 06:38:21 2013 -0500
@@ -0,0 +1,149 @@
+#!/usr/bin/perl
+#
+# check2.pl - score every 19 bp window of a nucleotide sequence against a
+# CTCF position frequency matrix and report the windows whose summed
+# log-odds score reaches the cutoff.
+#
+# Usage: check2.pl SEQUENCE_FILE COORDINATES_FILE OUTPUT_FILE
+#
+#   SEQUENCE_FILE     the sequence, possibly split over several lines.
+#                     Lines starting with '>' are skipped on the
+#                     assumption that they are FASTA-style headers.
+#   COORDINATES_FILE  whitespace-separated "name position" lines; the last
+#                     usable line wins.  The name is echoed in the
+#                     "Chromosome no." column; the position is the anchor
+#                     from which window numbers are subtracted.
+#                     NOTE(review): presumably the end coordinate of the
+#                     scanned region, since every hit is labelled with
+#                     strand "-" - confirm against the caller.
+#   OUTPUT_FILE       tab-separated report; overwritten on every run.
+
+use strict;
+use warnings;
+
+# Motif length, and the minimum score a window needs to be reported.
+my $SITE_LENGTH  = 19;
+my $SCORE_CUTOFF = 18;
+
+# Normalising total used in the weight formula below.
+# NOTE(review): the first matrix rows sum to exactly this value but the
+# last few sum to slightly less (909-911) - confirm the intended total.
+my $TOTAL_COUNT = 914;
+
+# Order of the bases within each matrix row, and the background frequency
+# assumed for each base.
+my @BASES      = qw(A C G T);
+my %BACKGROUND = ( A => 0.3, C => 0.2, G => 0.2, T => 0.3 );
+
+# CTCF position frequency matrix: one row of four counts (A, C, G, T) per
+# motif position.
+my @FREQUENCIES = qw(
+     87.25 291.25  76.25 459.25
+    167.25 145.25 414.25 187.25
+    281.25  49.25 449.25 134.25
+     56.25 800.25  21.25  36.25
+      8.25 903.25   0.25   2.25
+    744.25  13.25  65.25  91.25
+     40.25 528.25 334.25  11.25
+    107.25 433.25  48.25 324.25
+    851.25  11.25  32.25  18.25
+      5.25   0.25 903.25   3.25
+    333.25   3.25 566.25   9.25
+     54.25  12.25 504.25 341.25
+     12.25   0.25 890.25   8.25
+     56.25   8.25 775.25  71.25
+    104.25 733.25   5.25  67.25
+    372.25  13.25 507.25  17.25
+     82.25 482.25 307.25  37.25
+    117.25 322.25  73.25 396.25
+    402.25 181.25 266.25  59.25
+);
+
+die "Usage: $0 SEQUENCE_FILE COORDINATES_FILE OUTPUT_FILE\n"
+    unless @ARGV >= 3;
+my ( $sequence_file, $coordinates_file, $output_file ) = @ARGV;
+
+# Read the name and the anchor position.  Blank or incomplete lines are
+# ignored so that a trailing empty line cannot wipe out the values read
+# before it.
+open my $coord_fh, '<', $coordinates_file
+    or die "Cannot open coordinates file '$coordinates_file': $!\n";
+my ( $region_name, $anchor );
+while ( my $line = <$coord_fh> ) {
+    my @fields = split ' ', $line;
+    next if @fields < 2;
+    ( $region_name, $anchor ) = @fields[ 0, 1 ];
+}
+close $coord_fh;
+die "No \"name position\" line found in '$coordinates_file'\n"
+    unless defined $anchor;
+
+# Read the sequence into one continuous string.
+open my $seq_fh, '<', $sequence_file
+    or die "Cannot open sequence file '$sequence_file': $!\n";
+my $sequence = '';
+while ( my $line = <$seq_fh> ) {
+    $line =~ s/[\r\n]+\z//;    # also copes with CRLF line endings
+    next if $line =~ /\A>/;    # header line, not sequence
+    $sequence .= $line;
+}
+close $seq_fh;
+
+# Pre-compute the weight of every base at every motif position so that the
+# scan below only has to add numbers up:
+#
+#   weight = log2( (count + sqrt(N) * bg) / (N + sqrt(N)) / bg )
+#
+# with N the normalising total and bg the background frequency of the base.
+my $pseudocount = sqrt($TOTAL_COUNT);
+my @weight_at;    # $weight_at[$position]{$base}
+for my $position ( 0 .. $SITE_LENGTH - 1 ) {
+    for my $index ( 0 .. $#BASES ) {
+        my $base       = $BASES[$index];
+        my $background = $BACKGROUND{$base};
+        my $count      = $FREQUENCIES[ $position * scalar(@BASES) + $index ];
+        $weight_at[$position]{$base} =
+            log( ( $pseudocount * $background + $count )
+                / ( $pseudocount + $TOTAL_COUNT )
+                / $background ) / log(2);
+    }
+}
+
+open my $out_fh, '>', $output_file
+    or die "Cannot open output file '$output_file': $!\n";
+print {$out_fh} join( "\t",
+    'CTCF Site', 'Chromosome no.', 'Start', 'End', 'Score', 'Strand' ),
+    "\n";
+
+# The lookahead consumes nothing, so the scan advances one base at a time
+# and visits every overlapping window made up of A, C, G and T only.
+my $window_re = qr/(?=([ACGT]{$SITE_LENGTH}))/i;
+while ( $sequence =~ /$window_re/g ) {
+    my $site = $1;
+
+    # 1-based number of the window within the sequence.  Taking it from
+    # the match offset keeps the coordinates right even after windows
+    # that were skipped because they contain N or other characters.
+    my $window_number = $-[0] + 1;
+
+    # Add the weights up from left to right.  The bases are upper-cased
+    # for the lookup so that lower-case input is scored as well.
+    my @bases = split //, uc $site;
+    my $score = 0;
+    for my $position ( 0 .. $#bases ) {
+        $score = $score + $weight_at[$position]{ $bases[$position] };
+    }
+    next unless $score >= $SCORE_CUTOFF;
+
+    print {$out_fh} join( "\t",
+        $site,
+        $region_name,
+        $anchor - ( $SITE_LENGTH - 1 ) - $window_number,
+        $anchor - $window_number,
+        $score,
+        '-' ),
+        "\n";
+}
+
+close $out_fh
+    or die "Cannot close output file '$output_file': $!\n";