Mercurial > repos > clairetn > detrprok_scripts

diff detrprok_scripts/splitTranscriptGff.pl @ 0:a53eb951b164 draft
Uploaded
author: clairetn
date: Mon, 25 Mar 2013 05:39:29 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detrprok_scripts/splitTranscriptGff.pl	Mon Mar 25 05:39:29 2013 -0400
@@ -0,0 +1,197 @@
+#!/usr/bin/perl -w
+###
+# But : croiser
+# 
+# Entrees : 2 fichiers gff à croiser
+#
+# Sortie : gff affiche a l'ecran
+#
+###------------------------------------------------------
+use vars qw($USAGE);                      
+use strict;                               
+
+=head1 NAME
+
+splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements
+
+=head1 SYNOPSIS
+
+% intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h] 
+
+=head1 DESCRIPTION
+This script will sort 2 gff files and compute distance between 2 successives lines.
+
+    -i|--input1 fileName   gff input file name: included elements
+    -j|--input2 fileName   gff input file name: extended elements
+   [-s|--strand] [s|d]	   s for single strand (colinear) or d for double strands (antisense) [default d]
+   [-h|--help]             help mode then die                              
+
+=head1 USECASE
+Compare 2 input gff files: an annotations file (included elements) and transcription units file (extended elements).
+For each couple of overlapping elements, split the transcription unit in 5'utr, CDS, 3'utr or "operon" in case of successives genes included in the transcription unit.
+
+=head1 KWON BUG
+Fisrt and last elements on the genome sequence can be misclassified in the result file.
+
+=head1 AUTHOR
+Claire Toffano-Nioche - sep.11
+
+=cut
+#-----------------------
+sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
+		#print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
+	for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element 
+			#print "....$info[$i+2]\n";
+		if ($info[$i+3] =~ /\+/) {
+			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # Forward sequence
+		} else {
+			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # Backward sequence
+		}
+	}
+		#print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
+		#print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
+}
+#-----------------------
+sub recupInfo {	my ($pInfo, @lines) = @_ ;
+    my $i=0 ;
+    while ($i<=$#lines){
+	chomp($lines[$i]);
+	my @line = split("\t",$lines[$i]);
+	if (defined ($line[0])) {
+		if (!($line[0] =~ m/^\s*$|^#/)) { # skip both null and comment lines
+			push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=name, 3=begin, 4=end, 6=strand
+		} 
+	}
+	$i=$i+1 ;
+    }
+    	#print "recupInfo::end=", $i, "\n" ;
+}
+#-----------------------
+sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;    
+	my $tagN=$seqN.$strand.$posB."..".$posE;
+		#print "tagName::",join("_",@_)," and tagName:$tagN\n";
+return $tagN;
+}
+#-----------------------
+sub transitionAnalysis {
+my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
+    my $enCours = @$ptag[$pos] ; 
+    my $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ; 
+    if ($enCours ne $precedant) {
+    	#print "transi...:: $s, $pos, $precedant, $enCours\n";
+    	#print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
+    	SWITCH: for ($precedant.$enCours) {
+               	/01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
+                /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
+                /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ; 
+                		if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
+                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ; 
+                			#if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
+                		} elsif ($$pfinAval!=$$pdebAval) {
+                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ; 
+                			#if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
+                		}
+                		$$pdebAval = 0 ; $$pfinAval = 0 ;
+                		last SWITCH ;
+                	 };
+                /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
+                		my $type="utr5";
+                		if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
+                			$$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ; 
+                			$type="operon" ;
+                		}
+                		if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
+                			printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ; 
+                			# if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
+                		} elsif ($$pfinAmont!=$$pdebAmont) {
+	                		printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ; 
+                			#if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
+                		}
+                		$$pdebAmont = 0 ; $$pfinAmont = 0 ;
+                		last SWITCH ;
+                	 };
+                /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; 
+                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
+                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; 
+                		} elsif ($$pfinIn!=$$pdebIn) {
+                		    printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; 
+                		}
+                		$$pdebIn = 0 ; $$pfinIn = 0 ;
+                		last SWITCH ;
+                	 };
+                /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; 
+                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
+                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; 
+                		} elsif ($$pfinIn!=$$pdebIn) {
+                			printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
+                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; 
+                		}
+                		#$$pdebIn = 0 ; $$pfinIn = 0 ;
+                		last SWITCH ;
+                	 };
+          }
+    }
+ }
+#-----------------------	
+my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
+# command line check
+foreach my $num (0 .. $#ARGV) {
+        SWITCH: for ($ARGV[$num]) {
+        /--input1|-i/ && do { 
+			$fileNameI=$ARGV[$num+1]; 
+			open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ; 
+			last };
+	/--input2|-j/ && do { 
+			$fileNameE=$ARGV[$num+1]; 
+			open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ; 
+			last };
+        /--strand|-s/ && do { 
+			if ($ARGV[$num+1] eq "s") { $strand=1}; 
+			last };
+        /--help|-h/ && do { exec("pod2text $0\n") ; die };
+        }
+}
+# memory declarations:
+my @infoI ; my @infoE ;
+my $seqName ;
+my @tagF ; my @tagB ; # for Forward and Backward sequences
+# data retrieval:
+my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
+close fichierGffI ; close fichierGffE ;
+	#print "gff files read ; number of lines : ",$#linesI+1," + ",$#linesE+1,"\n";
+# positions management:
+&recupInfo(\@infoI, @linesI) ; 
+	#print "number of treated elements:",($#infoI+1)/4,"\n";
+&recupInfo(\@infoE, @linesE) ;
+	#print "number of treated elements:",($#infoE+1)/4,"\n";
+# treatement: 
+# transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
+if (($#infoI) and ($#infoE)) { 
+	$seqName=$infoI[0] ;
+		#print "end : $infoE[$#infoE-1]\n";
+	for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
+		#print "seqName : $seqName\n" ;
+	&feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
+	&feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
+		#print join("", @tagF), "\n";
+		#print join("", @tagB), "\n";
+	# transition management:
+	my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) 
+		= (0, 0, 0, 0, 0, 0) ;
+	for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
+		&transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
+	}
+	($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) 
+		= ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
+	for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
+		&transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
+	}
+}
+exit(0) ;
author	clairetn
date	Mon, 25 Mar 2013 05:39:29 -0400
parents
children