Mercurial > repos > clairetn > detrprok_scripts
diff detrprok_scripts/splitTranscriptGff.pl @ 0:a53eb951b164 draft
Uploaded
author | clairetn |
---|---|
date | Mon, 25 Mar 2013 05:39:29 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/detrprok_scripts/splitTranscriptGff.pl Mon Mar 25 05:39:29 2013 -0400 @@ -0,0 +1,197 @@ +#!/usr/bin/perl -w +### +# But : croiser +# +# Entrees : 2 fichiers gff à croiser +# +# Sortie : gff affiche a l'ecran +# +###------------------------------------------------------ +use vars qw($USAGE); +use strict; + +=head1 NAME + +splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements + +=head1 SYNOPSIS + +% intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h] + +=head1 DESCRIPTION +This script will sort 2 gff files and compute distance between 2 successives lines. + + -i|--input1 fileName gff input file name: included elements + -j|--input2 fileName gff input file name: extended elements + [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d] + [-h|--help] help mode then die + +=head1 USECASE +Compare 2 input gff files: an annotations file (included elements) and transcription units file (extended elements). +For each couple of overlapping elements, split the transcription unit in 5'utr, CDS, 3'utr or "operon" in case of successives genes included in the transcription unit. + +=head1 KWON BUG +Fisrt and last elements on the genome sequence can be misclassified in the result file. + +=head1 AUTHOR +Claire Toffano-Nioche - sep.11 + +=cut +#----------------------- +sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ; + #print "feedPositionTab::$#info, ", ($#info+1)/4," \n"; + for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element + #print "....$info[$i+2]\n"; + if ($info[$i+3] =~ /\+/) { + for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # Forward sequence + } else { + for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # Backward sequence + } + } + #print "feedPos...:: ", join(".", @$pF[0..100]), "\n"; + #print "feedPos...:: ", join(".", @$pB[0..100]), "\n"; +} +#----------------------- +sub recupInfo { my ($pInfo, @lines) = @_ ; + my $i=0 ; + while ($i<=$#lines){ + chomp($lines[$i]); + my @line = split("\t",$lines[$i]); + if (defined ($line[0])) { + if (!($line[0] =~ m/^\s*$|^#/)) { # skip both null and comment lines + push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=name, 3=begin, 4=end, 6=strand + } + } + $i=$i+1 ; + } + #print "recupInfo::end=", $i, "\n" ; +} +#----------------------- +sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ; + my $tagN=$seqN.$strand.$posB."..".$posE; + #print "tagName::",join("_",@_)," and tagName:$tagN\n"; +return $tagN; +} +#----------------------- +sub transitionAnalysis { +my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ; + my $enCours = @$ptag[$pos] ; + my $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ; + if ($enCours ne $precedant) { + #print "transi...:: $s, $pos, $precedant, $enCours\n"; + #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n"; + SWITCH: for ($precedant.$enCours) { + /01/ && do { $$pdebAmont = $pos ; last SWITCH ;}; + /02/ && do { $$pdebIn = $pos ; last SWITCH ;}; + /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ; + if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) { + printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ; + #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"}; + } elsif ($$pfinAval!=$$pdebAval) { + printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ; + #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"}; + } + $$pdebAval = 0 ; $$pfinAval = 0 ; + last SWITCH ; + }; + /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ; + my $type="utr5"; + if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3 + $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ; + $type="operon" ; + } + if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) { + printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ; + # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"}; + } elsif ($$pfinAmont!=$$pdebAmont) { + printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ; + #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ; + } + $$pdebAmont = 0 ; $$pfinAmont = 0 ; + last SWITCH ; + }; + /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; + if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { + printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; + } elsif ($$pfinIn!=$$pdebIn) { + printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; + } + $$pdebIn = 0 ; $$pfinIn = 0 ; + last SWITCH ; + }; + /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; + if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { + printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; + } elsif ($$pfinIn!=$$pdebIn) { + printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", + $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; + } + #$$pdebIn = 0 ; $$pfinIn = 0 ; + last SWITCH ; + }; + } + } + } +#----------------------- +my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ; +# command line check +foreach my $num (0 .. $#ARGV) { + SWITCH: for ($ARGV[$num]) { + /--input1|-i/ && do { + $fileNameI=$ARGV[$num+1]; + open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ; + last }; + /--input2|-j/ && do { + $fileNameE=$ARGV[$num+1]; + open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ; + last }; + /--strand|-s/ && do { + if ($ARGV[$num+1] eq "s") { $strand=1}; + last }; + /--help|-h/ && do { exec("pod2text $0\n") ; die }; + } +} +# memory declarations: +my @infoI ; my @infoE ; +my $seqName ; +my @tagF ; my @tagB ; # for Forward and Backward sequences +# data retrieval: +my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ; +close fichierGffI ; close fichierGffE ; + #print "gff files read ; number of lines : ",$#linesI+1," + ",$#linesE+1,"\n"; +# positions management: +&recupInfo(\@infoI, @linesI) ; + #print "number of treated elements:",($#infoI+1)/4,"\n"; +&recupInfo(\@infoE, @linesE) ; + #print "number of treated elements:",($#infoE+1)/4,"\n"; +# treatement: +# transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements +if (($#infoI) and ($#infoE)) { + $seqName=$infoI[0] ; + #print "end : $infoE[$#infoE-1]\n"; + for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions + #print "seqName : $seqName\n" ; + &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements + &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements + #print join("", @tagF), "\n"; + #print join("", @tagB), "\n"; + # transition management: + my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) + = (0, 0, 0, 0, 0, 0) ; + for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) { + &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ; + } + ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) + = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ; + for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) { + &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ; + } +} +exit(0) ;