diff carpet-src-1/tools/CARPET/MapToExon_RefSeqMat_new.pl @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children 78770028dcf1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/carpet-src-1/tools/CARPET/MapToExon_RefSeqMat_new.pl	Tue Jun 07 16:50:41 2011 -0400
@@ -0,0 +1,412 @@
+#!/usr/bin/perl
+
+# Copyright 2009 Matteo Cesaroni, Lucilla Luzi
+#
+# This program is free software; ; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+
+$|=1;
+my $infile = $ARGV[0];
+my $infile2=$ARGV[1];
+my $definition_pro=$ARGV[2];
+my $definition_tre=$ARGV[3];
+my $priority=$ARGV[4];
+my $file_output=$ARGV[5];
+
+open (INFILE, "<$infile");
+open (INFILE2, "<$infile2");
+open (OUTFILE1, ">$file_output") or die "Cannot find file $file_output\n";   
+
+$campi_t=0;
+$definition_out=$definition_pro-2000;
+
+while (defined (my $line_down = <INFILE2>)) {
+		$line_down=~ s/\#//g;
+		chomp($line_down);
+		$campi_t++;
+		my @tmp_down=split(/\s+/, $line_down);
+		if($campi_t==1){
+			$z=0;
+			foreach $campo_t(@tmp_down){
+				if(($campo_t eq "name") || ($campo_t eq "qName") || ($campo_t eq "repClass")){
+					$zRef=$z;
+				}
+				if(($campo_t eq "txStart") || ($campo_t eq "tStart") || ($campo_t eq "chromStart")|| ($campo_t eq "genoStart")){
+					$ztxStart=$z;
+				}
+				if(($campo_t eq "txEnd") || ($campo_t eq "tEnd") || ($campo_t eq "chromEnd")|| ($campo_t eq "genoEnd")){
+					$ztxEnd=$z;
+				}
+				if($campo_t eq "strand"){
+					$zstrand=$z;
+				}
+				if(($campo_t eq "chrom") || ($campo_t eq "tName")|| ($campo_t eq "genoName")){
+					$zchrom=$z;
+				}
+				if(($campo_t eq "exonStarts") || ($campo_t eq "tStarts")){
+					$zexonstart=$z;
+				}
+				if($campo_t eq "exonEnds"){
+					$zexonend=$z;
+				}
+				if($campo_t eq "blockSizes"){
+					$zblocksize=$z;
+				}
+				if(($campo_t eq "name2") || ($campo_t eq "repFamily")){
+					$zname=$z;
+				}
+				if(($campo_t eq "exonCount")||($campo_t eq "blockCount")){
+					$zcount=$z;
+				}
+				$z++;
+			}
+			if(!$zname){
+				$zname=$zRef;
+			}
+			if(!$zexonstart){
+				$zexonstart=$ztxStart;
+			}
+			if(!$zexonend){
+				$zexonend=$ztxEnd;
+			}
+			if(($zRef eq "") || ($ztxStart eq "") || ($zstrand eq "") || ($zchrom eq "")){
+				print "Annotation file is not in the accepted format\n";
+				exit;
+			}else{print "promoter=$definition_pro, priority=$priority";}
+		next;
+		} 
+   		chomp $tmp_down[$zchrom];
+   		$tab_ann{$tmp_down[$zchrom]}.="$line_down\n";
+}
+
+while (defined (my $line_down = <INFILE>)) {
+   		my @tmp_down = split("\t", $line_down);  
+   		chomp $tmp_down[0];
+   		$tab_probe{$tmp_down[0]}.=$line_down; 
+}
+
+@chrom_probes= keys(%tab_probe); 
+
+&chip;
+
+exit 0;
+
+###########
+#subrutine#
+###########
+
+sub chip
+{
+foreach $chromosoma (@chrom_probes){
+
+@file1=split("\n", $tab_probe{$chromosoma});
+
+foreach $line(@file1) {
+	chomp $line;
+	#chop $line;
+	if ($line=~/track/g){next;}
+    if ($line=~/#/g){next;}
+    if ($line=~/^\s+$/g){next;}
+	my @Line=split(/\t/, $line);
+	my $Chrom=$Line[0];
+	my $Start=$Line[3];
+	my $Stop=$Line[4];
+	#my $value=$Line[5];
+	my $ProbeName=$Line[5];
+	my $feature="ciccio";
+	my $check=0;
+	my $relative_dist=10000000;
+	$double_check=0;
+	@file2=split("\n", $tab_ann{$chromosoma});
+	foreach $linea(@file2) {
+		chomp $linea;
+		$linea=~ s/\#//g;
+		my @kEle=split("\t", $linea);
+		$ref=$kEle[$zRef];
+		$chrom=$kEle[$zchrom];
+		$strand=$kEle[$zstrand];
+		$transcriptStart=$kEle[$ztxStart];
+		$transcriptStop=$kEle[$ztxEnd];
+		if($zcount){
+			$exoncount=$kEle[$zcount];
+		}
+		else
+		{
+			$exoncount=1;
+		}
+		$geneName=$kEle[$zname];
+		$exonStartref=$kEle[$zexonstart];
+		my $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand";
+		
+		my @exonStart=split(",", $exonStartref);
+		
+		if (!$zblocksize){
+			$exonEndref=$kEle[$zexonend];
+		}
+		else {
+			@blockStop=split(",", $kEle[$zblocksize]);
+			$exonEndref="";
+			for ($jj=0; $jj<=$#exonStart; $jj++){
+				$end_block=$exonStart[$jj]+$blockStop[$jj];
+				$exonEndref.="$end_block,";
+			
+			}	
+		}
+		
+		my @exonStop=split(",", $exonEndref);		
+		
+		#print "$ref / $chrom / $strand / $transcriptStart/$transcriptStop/$exoncount/$geneName [$#exonStop]\n";
+		#print "pippo $exonStart[0] - $exonStop[0],$exonStart[1] - $exonStop[1],$exonStart[2] - $exonStop[2] \n";
+		
+		
+		if ($Chrom eq $chrom)  {
+			#print "cazzo";
+			if ($strand eq "+"){
+				$promotore=$transcriptStart+$definition_pro;
+				$distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart);
+				$trepr=$transcriptStop+$definition_tre;
+				$rel_start=$promotore;
+				$rel_stop=$trepr;
+			}
+			if ($strand eq "-"){
+				$promotore=$transcriptStop-$definition_pro;
+				$distanzaTSS=int($transcriptStop-(($Start+$Stop)/2));
+				$trepr=$transcriptStart-$definition_tre;
+				$rel_start=$trepr;
+				$rel_stop=$promotore;
+			}
+			#print OUTFILE1 "$ref\t$distanzaTSS\n";
+			
+			#if(($Start<=$transcriptStart && $Stop>$transcriptStart) || ($Start>=$promotore && $Stop<=$transcriptStart) || ($Start>=$transcriptStop && $Stop<=$promotore) || ($Start<=$promotore && $Stop>$promotore) || ($Start<$transcriptStop && $Stop>=$transcriptStop) || ($Start>=$transcriptStart && $Stop<=$transcriptStop) ){
+				#print "sono entrato con start $Start stop $Stop e $transcriptStart e $transcriptStop\n";
+		
+			if($Start<=$rel_stop && $rel_start<=$Stop){
+		
+				for(my $i=0;$i<=$#exonStart;$i++) {
+					
+					if ($strand eq "+"){
+						$exoncount1=$i+1;
+						$exoncount2=$exoncount1;
+						$introncount=$exoncount1;
+						if($i==$#exonStart) {
+							$exoncount2="last";
+							$introncount="last";
+						}
+						if($i==$#exonStart-1) {
+							$introncount="last";
+						}
+					}
+					if ($strand eq "-"){
+						$exoncount1=($#exonStart+1)-$i;
+						$exoncount2=$exoncount1;
+						$introncount=$exoncount1-1;
+						#if(($i==0) && ($#exonStart != 0)) {$exoncount2="last";}
+						if($i==0) {
+							$exoncount2="last";
+							$introncount="last";
+						}
+					}
+					#print "esone start $exonStart[$i] e $exonStop[$i] e start $Start\n";
+					
+					#print OUTFILE1 "$ref\t$exonStart[$i]\t$exonStop[$i]\t$Start\t$Stop\t$i\t$#exonStart\t$priority\n";
+					
+					
+					
+					#if($priority eq "prom" && $check==1){
+					#		last;
+					#}
+					#if($priority eq "gene" && $check==2){
+					#		next;
+					#}
+					
+					
+					if(($Start<=$exonStart[$i]) && ($Stop>$exonStart[$i])){
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintronexon $exoncount2\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=2;
+						}
+						else{
+							$check=1;
+							#last;
+						}
+					}
+					if(($Start>=$exonStart[$i]) && ($Stop<=$exonStop[$i])){
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texon $exoncount2\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=2;
+						}
+						else{
+							$check=1;
+							#last;
+						}
+					}
+					if(($Start<$exonStop[$i]) && ($Stop>$exonStop[$i])){
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texonintron $introncount\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=2;
+						}
+						else{
+							$check=1;
+							#last;
+						}
+					}
+					if($priority eq "prom"){
+						if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1]) && ($check==0)){
+							$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS";
+							#print "intron\n";
+							$check=2;
+						}
+					}
+					else{
+						if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1])){
+							$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS";
+							#print "intron\n";
+							$check=2;
+						}
+					}
+					
+					
+					
+					if (($strand eq "+") && ($Start>=$transcriptStop)) {
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS";
+						$distanzaTSS=int($transcriptStop-(($Start+$Stop)/2));
+						if($priority eq "prom"){
+							$check=2;
+							#last;
+						}
+						else{$check=1;}
+					}
+					if (($strand eq "-") && ($Stop<=$transcriptStart)) {
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS";
+						$distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart);
+						if($priority eq "prom"){
+							$check=2;
+							#last;
+						}
+						else{$check=1;}
+					}
+					
+					
+					
+					
+					if (($strand eq "+") && (($Start<=$promotore && $Stop>$promotore) || ($Start>$promotore && $Stop<=$transcriptStart))) {
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=1;
+							#last;
+						}
+						else{$check=2;}
+					}
+					if (($strand eq "-") && (($Start<=$promotore && $Stop>=$promotore) || ($Start>=$transcriptStop && $Stop<$promotore))) {
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=1;
+							#last;
+						}
+						else{$check=2;}
+					}
+					
+					if(($Start<=$exonStart[$i])  && ($i==0) && ($strand eq "+") && ($Stop>=$exonStart[$i])){
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS";
+						#print "prom-exon e $exoncount1\n";
+						if($priority eq "prom"){
+							$check=1;
+							#last;
+						}
+						else{$check=2;}
+					}
+					if(($Start<=$exonStop[$i])  && ($i==$#exonStart) && ($strand eq "-") && ($Stop>=$exonStop[$i])){
+						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS";
+						if($priority eq "prom"){
+							$check=1;
+							#last;
+						}
+						else{$check=2;}
+					}
+					
+				}
+			#if ($check==1){
+			#	print "$chrom\t$Start\t$Stop\t$geneName\t$exoncount\t$ref\t$strand\t$feature\n";
+			#}
+			}else{next;}
+			#exit;
+		}else{next;}
+		
+		#print OUTFILE1 "$ref\t$feature\n";
+		#print "$geneName\t$transcriptStart\t$Chrom\t$Start\t$Stop\t$distanzaTSS\t$feature\n";
+		
+		if (($priority eq "gene") && ($relative_dist>abs($distanzaTSS))){
+			$relative_dist=abs($distanzaTSS);
+			$stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
+			$double_check=1;
+		}
+		if (($check==1) && ($priority eq "prom") && ($relative_dist>abs($distanzaTSS))){
+			$relative_dist=abs($distanzaTSS);
+			$double_check=1;
+			$stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
+		}
+	}
+	
+	
+	
+	if($double_check==1){
+		print OUTFILE1 "$stampa";
+		next;
+	}
+	if (($check==2) && ($priority eq "prom")){
+		print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
+		next;
+	}
+	if($check==0){
+		print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\tintergenic\tintergenic\t$Start\t$Stop\t+\t0\tOUT\t$definition_out\n";
+		next;	
+	}
+}
+}
+close INFILE;
+close INFILE2;
+}
+
+
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+
+