view carpet-src-1/tools/CARPET/MapToExon_RefSeqMat_new.pl @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children 78770028dcf1
line wrap: on
line source

#!/usr/bin/perl

# Copyright 2009 Matteo Cesaroni, Lucilla Luzi
#
# This program is free software; ; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


$|=1;
my $infile = $ARGV[0];
my $infile2=$ARGV[1];
my $definition_pro=$ARGV[2];
my $definition_tre=$ARGV[3];
my $priority=$ARGV[4];
my $file_output=$ARGV[5];

open (INFILE, "<$infile");
open (INFILE2, "<$infile2");
open (OUTFILE1, ">$file_output") or die "Cannot find file $file_output\n";   

$campi_t=0;
$definition_out=$definition_pro-2000;

while (defined (my $line_down = <INFILE2>)) {
		$line_down=~ s/\#//g;
		chomp($line_down);
		$campi_t++;
		my @tmp_down=split(/\s+/, $line_down);
		if($campi_t==1){
			$z=0;
			foreach $campo_t(@tmp_down){
				if(($campo_t eq "name") || ($campo_t eq "qName") || ($campo_t eq "repClass")){
					$zRef=$z;
				}
				if(($campo_t eq "txStart") || ($campo_t eq "tStart") || ($campo_t eq "chromStart")|| ($campo_t eq "genoStart")){
					$ztxStart=$z;
				}
				if(($campo_t eq "txEnd") || ($campo_t eq "tEnd") || ($campo_t eq "chromEnd")|| ($campo_t eq "genoEnd")){
					$ztxEnd=$z;
				}
				if($campo_t eq "strand"){
					$zstrand=$z;
				}
				if(($campo_t eq "chrom") || ($campo_t eq "tName")|| ($campo_t eq "genoName")){
					$zchrom=$z;
				}
				if(($campo_t eq "exonStarts") || ($campo_t eq "tStarts")){
					$zexonstart=$z;
				}
				if($campo_t eq "exonEnds"){
					$zexonend=$z;
				}
				if($campo_t eq "blockSizes"){
					$zblocksize=$z;
				}
				if(($campo_t eq "name2") || ($campo_t eq "repFamily")){
					$zname=$z;
				}
				if(($campo_t eq "exonCount")||($campo_t eq "blockCount")){
					$zcount=$z;
				}
				$z++;
			}
			if(!$zname){
				$zname=$zRef;
			}
			if(!$zexonstart){
				$zexonstart=$ztxStart;
			}
			if(!$zexonend){
				$zexonend=$ztxEnd;
			}
			if(($zRef eq "") || ($ztxStart eq "") || ($zstrand eq "") || ($zchrom eq "")){
				print "Annotation file is not in the accepted format\n";
				exit;
			}else{print "promoter=$definition_pro, priority=$priority";}
		next;
		} 
   		chomp $tmp_down[$zchrom];
   		$tab_ann{$tmp_down[$zchrom]}.="$line_down\n";
}

while (defined (my $line_down = <INFILE>)) {
   		my @tmp_down = split("\t", $line_down);  
   		chomp $tmp_down[0];
   		$tab_probe{$tmp_down[0]}.=$line_down; 
}

@chrom_probes= keys(%tab_probe); 

&chip;

exit 0;

###########
#subrutine#
###########

sub chip
{
foreach $chromosoma (@chrom_probes){

@file1=split("\n", $tab_probe{$chromosoma});

foreach $line(@file1) {
	chomp $line;
	#chop $line;
	if ($line=~/track/g){next;}
    if ($line=~/#/g){next;}
    if ($line=~/^\s+$/g){next;}
	my @Line=split(/\t/, $line);
	my $Chrom=$Line[0];
	my $Start=$Line[3];
	my $Stop=$Line[4];
	#my $value=$Line[5];
	my $ProbeName=$Line[5];
	my $feature="ciccio";
	my $check=0;
	my $relative_dist=10000000;
	$double_check=0;
	@file2=split("\n", $tab_ann{$chromosoma});
	foreach $linea(@file2) {
		chomp $linea;
		$linea=~ s/\#//g;
		my @kEle=split("\t", $linea);
		$ref=$kEle[$zRef];
		$chrom=$kEle[$zchrom];
		$strand=$kEle[$zstrand];
		$transcriptStart=$kEle[$ztxStart];
		$transcriptStop=$kEle[$ztxEnd];
		if($zcount){
			$exoncount=$kEle[$zcount];
		}
		else
		{
			$exoncount=1;
		}
		$geneName=$kEle[$zname];
		$exonStartref=$kEle[$zexonstart];
		my $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand";
		
		my @exonStart=split(",", $exonStartref);
		
		if (!$zblocksize){
			$exonEndref=$kEle[$zexonend];
		}
		else {
			@blockStop=split(",", $kEle[$zblocksize]);
			$exonEndref="";
			for ($jj=0; $jj<=$#exonStart; $jj++){
				$end_block=$exonStart[$jj]+$blockStop[$jj];
				$exonEndref.="$end_block,";
			
			}	
		}
		
		my @exonStop=split(",", $exonEndref);		
		
		#print "$ref / $chrom / $strand / $transcriptStart/$transcriptStop/$exoncount/$geneName [$#exonStop]\n";
		#print "pippo $exonStart[0] - $exonStop[0],$exonStart[1] - $exonStop[1],$exonStart[2] - $exonStop[2] \n";
		
		
		if ($Chrom eq $chrom)  {
			#print "cazzo";
			if ($strand eq "+"){
				$promotore=$transcriptStart+$definition_pro;
				$distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart);
				$trepr=$transcriptStop+$definition_tre;
				$rel_start=$promotore;
				$rel_stop=$trepr;
			}
			if ($strand eq "-"){
				$promotore=$transcriptStop-$definition_pro;
				$distanzaTSS=int($transcriptStop-(($Start+$Stop)/2));
				$trepr=$transcriptStart-$definition_tre;
				$rel_start=$trepr;
				$rel_stop=$promotore;
			}
			#print OUTFILE1 "$ref\t$distanzaTSS\n";
			
			#if(($Start<=$transcriptStart && $Stop>$transcriptStart) || ($Start>=$promotore && $Stop<=$transcriptStart) || ($Start>=$transcriptStop && $Stop<=$promotore) || ($Start<=$promotore && $Stop>$promotore) || ($Start<$transcriptStop && $Stop>=$transcriptStop) || ($Start>=$transcriptStart && $Stop<=$transcriptStop) ){
				#print "sono entrato con start $Start stop $Stop e $transcriptStart e $transcriptStop\n";
		
			if($Start<=$rel_stop && $rel_start<=$Stop){
		
				for(my $i=0;$i<=$#exonStart;$i++) {
					
					if ($strand eq "+"){
						$exoncount1=$i+1;
						$exoncount2=$exoncount1;
						$introncount=$exoncount1;
						if($i==$#exonStart) {
							$exoncount2="last";
							$introncount="last";
						}
						if($i==$#exonStart-1) {
							$introncount="last";
						}
					}
					if ($strand eq "-"){
						$exoncount1=($#exonStart+1)-$i;
						$exoncount2=$exoncount1;
						$introncount=$exoncount1-1;
						#if(($i==0) && ($#exonStart != 0)) {$exoncount2="last";}
						if($i==0) {
							$exoncount2="last";
							$introncount="last";
						}
					}
					#print "esone start $exonStart[$i] e $exonStop[$i] e start $Start\n";
					
					#print OUTFILE1 "$ref\t$exonStart[$i]\t$exonStop[$i]\t$Start\t$Stop\t$i\t$#exonStart\t$priority\n";
					
					
					
					#if($priority eq "prom" && $check==1){
					#		last;
					#}
					#if($priority eq "gene" && $check==2){
					#		next;
					#}
					
					
					if(($Start<=$exonStart[$i]) && ($Stop>$exonStart[$i])){
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintronexon $exoncount2\t$distanzaTSS";
						if($priority eq "prom"){
							$check=2;
						}
						else{
							$check=1;
							#last;
						}
					}
					if(($Start>=$exonStart[$i]) && ($Stop<=$exonStop[$i])){
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texon $exoncount2\t$distanzaTSS";
						if($priority eq "prom"){
							$check=2;
						}
						else{
							$check=1;
							#last;
						}
					}
					if(($Start<$exonStop[$i]) && ($Stop>$exonStop[$i])){
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texonintron $introncount\t$distanzaTSS";
						if($priority eq "prom"){
							$check=2;
						}
						else{
							$check=1;
							#last;
						}
					}
					if($priority eq "prom"){
						if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1]) && ($check==0)){
							$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS";
							#print "intron\n";
							$check=2;
						}
					}
					else{
						if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1])){
							$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS";
							#print "intron\n";
							$check=2;
						}
					}
					
					
					
					if (($strand eq "+") && ($Start>=$transcriptStop)) {
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS";
						$distanzaTSS=int($transcriptStop-(($Start+$Stop)/2));
						if($priority eq "prom"){
							$check=2;
							#last;
						}
						else{$check=1;}
					}
					if (($strand eq "-") && ($Stop<=$transcriptStart)) {
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS";
						$distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart);
						if($priority eq "prom"){
							$check=2;
							#last;
						}
						else{$check=1;}
					}
					
					
					
					
					if (($strand eq "+") && (($Start<=$promotore && $Stop>$promotore) || ($Start>$promotore && $Stop<=$transcriptStart))) {
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS";
						if($priority eq "prom"){
							$check=1;
							#last;
						}
						else{$check=2;}
					}
					if (($strand eq "-") && (($Start<=$promotore && $Stop>=$promotore) || ($Start>=$transcriptStop && $Stop<$promotore))) {
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS";
						if($priority eq "prom"){
							$check=1;
							#last;
						}
						else{$check=2;}
					}
					
					if(($Start<=$exonStart[$i])  && ($i==0) && ($strand eq "+") && ($Stop>=$exonStart[$i])){
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS";
						#print "prom-exon e $exoncount1\n";
						if($priority eq "prom"){
							$check=1;
							#last;
						}
						else{$check=2;}
					}
					if(($Start<=$exonStop[$i])  && ($i==$#exonStart) && ($strand eq "-") && ($Stop>=$exonStop[$i])){
						$feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS";
						if($priority eq "prom"){
							$check=1;
							#last;
						}
						else{$check=2;}
					}
					
				}
			#if ($check==1){
			#	print "$chrom\t$Start\t$Stop\t$geneName\t$exoncount\t$ref\t$strand\t$feature\n";
			#}
			}else{next;}
			#exit;
		}else{next;}
		
		#print OUTFILE1 "$ref\t$feature\n";
		#print "$geneName\t$transcriptStart\t$Chrom\t$Start\t$Stop\t$distanzaTSS\t$feature\n";
		
		if (($priority eq "gene") && ($relative_dist>abs($distanzaTSS))){
			$relative_dist=abs($distanzaTSS);
			$stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
			$double_check=1;
		}
		if (($check==1) && ($priority eq "prom") && ($relative_dist>abs($distanzaTSS))){
			$relative_dist=abs($distanzaTSS);
			$double_check=1;
			$stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
		}
	}
	
	
	
	if($double_check==1){
		print OUTFILE1 "$stampa";
		next;
	}
	if (($check==2) && ($priority eq "prom")){
		print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n";
		next;
	}
	if($check==0){
		print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\tintergenic\tintergenic\t$Start\t$Stop\t+\t0\tOUT\t$definition_out\n";
		next;	
	}
}
}
close INFILE;
close INFILE2;
}