Mercurial > repos > matces > carpet_toolsuite
diff carpet-src-1/tools/CARPET/MapToExon_RefSeqMat_new.pl @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children | 78770028dcf1 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/carpet-src-1/tools/CARPET/MapToExon_RefSeqMat_new.pl Tue Jun 07 16:50:41 2011 -0400 @@ -0,0 +1,412 @@ +#!/usr/bin/perl + +# Copyright 2009 Matteo Cesaroni, Lucilla Luzi +# +# This program is free software; ; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + + +$|=1; +my $infile = $ARGV[0]; +my $infile2=$ARGV[1]; +my $definition_pro=$ARGV[2]; +my $definition_tre=$ARGV[3]; +my $priority=$ARGV[4]; +my $file_output=$ARGV[5]; + +open (INFILE, "<$infile"); +open (INFILE2, "<$infile2"); +open (OUTFILE1, ">$file_output") or die "Cannot find file $file_output\n"; + +$campi_t=0; +$definition_out=$definition_pro-2000; + +while (defined (my $line_down = <INFILE2>)) { + $line_down=~ s/\#//g; + chomp($line_down); + $campi_t++; + my @tmp_down=split(/\s+/, $line_down); + if($campi_t==1){ + $z=0; + foreach $campo_t(@tmp_down){ + if(($campo_t eq "name") || ($campo_t eq "qName") || ($campo_t eq "repClass")){ + $zRef=$z; + } + if(($campo_t eq "txStart") || ($campo_t eq "tStart") || ($campo_t eq "chromStart")|| ($campo_t eq "genoStart")){ + $ztxStart=$z; + } + if(($campo_t eq "txEnd") || ($campo_t eq "tEnd") || ($campo_t eq "chromEnd")|| ($campo_t eq "genoEnd")){ + $ztxEnd=$z; + } + if($campo_t eq "strand"){ + $zstrand=$z; + } + if(($campo_t eq "chrom") || ($campo_t eq "tName")|| ($campo_t eq "genoName")){ + $zchrom=$z; + } + if(($campo_t eq "exonStarts") || ($campo_t eq "tStarts")){ + $zexonstart=$z; + } + if($campo_t eq "exonEnds"){ + $zexonend=$z; + } + if($campo_t eq "blockSizes"){ + $zblocksize=$z; + } + if(($campo_t eq "name2") || ($campo_t eq "repFamily")){ + $zname=$z; + } + if(($campo_t eq "exonCount")||($campo_t eq "blockCount")){ + $zcount=$z; + } + $z++; + } + if(!$zname){ + $zname=$zRef; + } + if(!$zexonstart){ + $zexonstart=$ztxStart; + } + if(!$zexonend){ + $zexonend=$ztxEnd; + } + if(($zRef eq "") || ($ztxStart eq "") || ($zstrand eq "") || ($zchrom eq "")){ + print "Annotation file is not in the accepted format\n"; + exit; + }else{print "promoter=$definition_pro, priority=$priority";} + next; + } + chomp $tmp_down[$zchrom]; + $tab_ann{$tmp_down[$zchrom]}.="$line_down\n"; +} + +while (defined (my $line_down = <INFILE>)) { + my @tmp_down = split("\t", $line_down); + chomp $tmp_down[0]; + $tab_probe{$tmp_down[0]}.=$line_down; +} + +@chrom_probes= keys(%tab_probe); + +&chip; + +exit 0; + +########### +#subrutine# +########### + +sub chip +{ +foreach $chromosoma (@chrom_probes){ + +@file1=split("\n", $tab_probe{$chromosoma}); + +foreach $line(@file1) { + chomp $line; + #chop $line; + if ($line=~/track/g){next;} + if ($line=~/#/g){next;} + if ($line=~/^\s+$/g){next;} + my @Line=split(/\t/, $line); + my $Chrom=$Line[0]; + my $Start=$Line[3]; + my $Stop=$Line[4]; + #my $value=$Line[5]; + my $ProbeName=$Line[5]; + my $feature="ciccio"; + my $check=0; + my $relative_dist=10000000; + $double_check=0; + @file2=split("\n", $tab_ann{$chromosoma}); + foreach $linea(@file2) { + chomp $linea; + $linea=~ s/\#//g; + my @kEle=split("\t", $linea); + $ref=$kEle[$zRef]; + $chrom=$kEle[$zchrom]; + $strand=$kEle[$zstrand]; + $transcriptStart=$kEle[$ztxStart]; + $transcriptStop=$kEle[$ztxEnd]; + if($zcount){ + $exoncount=$kEle[$zcount]; + } + else + { + $exoncount=1; + } + $geneName=$kEle[$zname]; + $exonStartref=$kEle[$zexonstart]; + my $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand"; + + my @exonStart=split(",", $exonStartref); + + if (!$zblocksize){ + $exonEndref=$kEle[$zexonend]; + } + else { + @blockStop=split(",", $kEle[$zblocksize]); + $exonEndref=""; + for ($jj=0; $jj<=$#exonStart; $jj++){ + $end_block=$exonStart[$jj]+$blockStop[$jj]; + $exonEndref.="$end_block,"; + + } + } + + my @exonStop=split(",", $exonEndref); + + #print "$ref / $chrom / $strand / $transcriptStart/$transcriptStop/$exoncount/$geneName [$#exonStop]\n"; + #print "pippo $exonStart[0] - $exonStop[0],$exonStart[1] - $exonStop[1],$exonStart[2] - $exonStop[2] \n"; + + + if ($Chrom eq $chrom) { + #print "cazzo"; + if ($strand eq "+"){ + $promotore=$transcriptStart+$definition_pro; + $distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart); + $trepr=$transcriptStop+$definition_tre; + $rel_start=$promotore; + $rel_stop=$trepr; + } + if ($strand eq "-"){ + $promotore=$transcriptStop-$definition_pro; + $distanzaTSS=int($transcriptStop-(($Start+$Stop)/2)); + $trepr=$transcriptStart-$definition_tre; + $rel_start=$trepr; + $rel_stop=$promotore; + } + #print OUTFILE1 "$ref\t$distanzaTSS\n"; + + #if(($Start<=$transcriptStart && $Stop>$transcriptStart) || ($Start>=$promotore && $Stop<=$transcriptStart) || ($Start>=$transcriptStop && $Stop<=$promotore) || ($Start<=$promotore && $Stop>$promotore) || ($Start<$transcriptStop && $Stop>=$transcriptStop) || ($Start>=$transcriptStart && $Stop<=$transcriptStop) ){ + #print "sono entrato con start $Start stop $Stop e $transcriptStart e $transcriptStop\n"; + + if($Start<=$rel_stop && $rel_start<=$Stop){ + + for(my $i=0;$i<=$#exonStart;$i++) { + + if ($strand eq "+"){ + $exoncount1=$i+1; + $exoncount2=$exoncount1; + $introncount=$exoncount1; + if($i==$#exonStart) { + $exoncount2="last"; + $introncount="last"; + } + if($i==$#exonStart-1) { + $introncount="last"; + } + } + if ($strand eq "-"){ + $exoncount1=($#exonStart+1)-$i; + $exoncount2=$exoncount1; + $introncount=$exoncount1-1; + #if(($i==0) && ($#exonStart != 0)) {$exoncount2="last";} + if($i==0) { + $exoncount2="last"; + $introncount="last"; + } + } + #print "esone start $exonStart[$i] e $exonStop[$i] e start $Start\n"; + + #print OUTFILE1 "$ref\t$exonStart[$i]\t$exonStop[$i]\t$Start\t$Stop\t$i\t$#exonStart\t$priority\n"; + + + + #if($priority eq "prom" && $check==1){ + # last; + #} + #if($priority eq "gene" && $check==2){ + # next; + #} + + + if(($Start<=$exonStart[$i]) && ($Stop>$exonStart[$i])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintronexon $exoncount2\t$distanzaTSS"; + if($priority eq "prom"){ + $check=2; + } + else{ + $check=1; + #last; + } + } + if(($Start>=$exonStart[$i]) && ($Stop<=$exonStop[$i])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texon $exoncount2\t$distanzaTSS"; + if($priority eq "prom"){ + $check=2; + } + else{ + $check=1; + #last; + } + } + if(($Start<$exonStop[$i]) && ($Stop>$exonStop[$i])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\texonintron $introncount\t$distanzaTSS"; + if($priority eq "prom"){ + $check=2; + } + else{ + $check=1; + #last; + } + } + if($priority eq "prom"){ + if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1]) && ($check==0)){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS"; + #print "intron\n"; + $check=2; + } + } + else{ + if(($Start>=$exonStop[$i]) && ($Stop<=$exonStart[$i+1])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tintron $introncount\t$distanzaTSS"; + #print "intron\n"; + $check=2; + } + } + + + + if (($strand eq "+") && ($Start>=$transcriptStop)) { + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS"; + $distanzaTSS=int($transcriptStop-(($Start+$Stop)/2)); + if($priority eq "prom"){ + $check=2; + #last; + } + else{$check=1;} + } + if (($strand eq "-") && ($Stop<=$transcriptStart)) { + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\t3_prime\t$distanzaTSS"; + $distanzaTSS=int((($Start+$Stop)/2)-$transcriptStart); + if($priority eq "prom"){ + $check=2; + #last; + } + else{$check=1;} + } + + + + + if (($strand eq "+") && (($Start<=$promotore && $Stop>$promotore) || ($Start>$promotore && $Stop<=$transcriptStart))) { + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS"; + if($priority eq "prom"){ + $check=1; + #last; + } + else{$check=2;} + } + if (($strand eq "-") && (($Start<=$promotore && $Stop>=$promotore) || ($Start>=$transcriptStop && $Stop<$promotore))) { + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tpromoter\t$distanzaTSS"; + if($priority eq "prom"){ + $check=1; + #last; + } + else{$check=2;} + } + + if(($Start<=$exonStart[$i]) && ($i==0) && ($strand eq "+") && ($Stop>=$exonStart[$i])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS"; + #print "prom-exon e $exoncount1\n"; + if($priority eq "prom"){ + $check=1; + #last; + } + else{$check=2;} + } + if(($Start<=$exonStop[$i]) && ($i==$#exonStart) && ($strand eq "-") && ($Stop>=$exonStop[$i])){ + $feature="$ref\t$geneName\t$transcriptStart\t$transcriptStop\t$strand\t$exoncount\tprom_exon $exoncount2\t$distanzaTSS"; + if($priority eq "prom"){ + $check=1; + #last; + } + else{$check=2;} + } + + } + #if ($check==1){ + # print "$chrom\t$Start\t$Stop\t$geneName\t$exoncount\t$ref\t$strand\t$feature\n"; + #} + }else{next;} + #exit; + }else{next;} + + #print OUTFILE1 "$ref\t$feature\n"; + #print "$geneName\t$transcriptStart\t$Chrom\t$Start\t$Stop\t$distanzaTSS\t$feature\n"; + + if (($priority eq "gene") && ($relative_dist>abs($distanzaTSS))){ + $relative_dist=abs($distanzaTSS); + $stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n"; + $double_check=1; + } + if (($check==1) && ($priority eq "prom") && ($relative_dist>abs($distanzaTSS))){ + $relative_dist=abs($distanzaTSS); + $double_check=1; + $stampa="$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n"; + } + } + + + + if($double_check==1){ + print OUTFILE1 "$stampa"; + next; + } + if (($check==2) && ($priority eq "prom")){ + print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\t$feature\n"; + next; + } + if($check==0){ + print OUTFILE1 "$Chrom\t$Start\t$Stop\t$ProbeName\tintergenic\tintergenic\t$Start\t$Stop\t+\t0\tOUT\t$definition_out\n"; + next; + } +} +} +close INFILE; +close INFILE2; +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +