Mercurial > repos > yufei-luo > s_mart
view SMART/bacteriaRegulatoryRegion_Detection/splitTranscriptGff.pl @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/perl -w ### # Main : defining utr and intergenic operonic intervalles from a transcripts file following a referencies file # # Input : 2 gff files to intersect, transcript queries vs referencies # # Output : resulting gff file printing to standard output # ###------------------------------------------------------ use vars qw($USAGE); use strict; =head1 NAME splitTranscriptGff.pl - compare 2 input gff files and define utr and intergenic operonic intervalles by couple of overlapping elements =head1 SYNOPSIS % intervallsExtractorGff.pl -i referencies.gff -j transcriptQueries.gff -s strand [-h] =head1 DESCRIPTION This script will intersect 2 gff files and compute distance between 2 successives lines. Take care both of sorting by positions the input files and of that referencies are included in transcriptQueries. -i|--input1 fileName gff input file name: included elements -j|--input2 fileName gff input file name: extended elements [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d] [-h|--help] help mode then die =head1 USECASE Define many fragments for each extended element (transcript): UTR5, gene, UTR3, "inOperon" for intergenomic region between 2 genes intervallsExtractorGff.pl -i CDSannotations.gff -j RNAseqTranscripts.gff > UTRsGenesOperonsLists.gff; =head1 KWON BUGS No disjonction of overlapping elements of the included elements (-i file). In usecase, overlapping genes are fused in one long gene. =head1 AUTHOR Claire Toffano-Nioche - sep.11 =cut #----------------------- sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ; #print "feedPositionTab::$#info, ", ($#info+1)/4," \n"; for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element #print "....$info[$i+2]\n"; if ($info[$i+3] =~ /\+/) { for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # sequence Forward } else { for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # sequence Backward } } #print "feedPos...:: ", join(".", @$pF[0..100]), "\n"; #print "feedPos...:: ", join(".", @$pB[0..100]), "\n"; } #----------------------- sub recupInfo { my ($pInfo, @lines) = @_ ; for (my $i=0 ; $i <= ($#lines+1)*4-1 ; $i+=4) { my @line = split("\t",$lines[$i/4]); push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=nom, 3=debut, 4=fin, 6=sens } #print "recupInfo::fin=", ($#lines+1)*4, "\n" ; } #----------------------- sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ; my $tagN=$seqN.$strand.$posB."..".$posE; #print "tagName:",join("_",@_)," et tagName:$tagN\n"; return $tagN; } #----------------------- sub transitionAnalysis { my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ; my $enCours = 0 ; my $precedant = 0 ; $enCours = @$ptag[$pos] ; $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ; if ($enCours ne $precedant) { #print "transi...:: $s, $pos, $precedant, $enCours\n"; #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n"; SWITCH: for ($precedant.$enCours) { /01/ && do { $$pdebAmont = $pos ; last SWITCH ;}; /02/ && do { $$pdebIn = $pos ; last SWITCH ;}; /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ; if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) { printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ; #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"}; } elsif ($$pfinAval!=$$pdebAval) { printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ; #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"}; } $$pdebAval = 0 ; $$pfinAval = 0 ; last SWITCH ; }; /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ; my $type="utr5"; if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3 $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ; $type="inOperon" ; } if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) { printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ; # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"}; } elsif ($$pfinAmont!=$$pdebAmont) { printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ; #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ; } $$pdebAmont = 0 ; $$pfinAmont = 0 ; last SWITCH ; }; /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; } elsif ($$pfinIn!=$$pdebIn) { printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; } $$pdebIn = 0 ; $$pfinIn = 0 ; last SWITCH ; }; /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; } elsif ($$pfinIn!=$$pdebIn) { printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; } #$$pdebIn = 0 ; $$pfinIn = 0 ; last SWITCH ; }; } } } #----------------------- my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ; # command line check foreach my $num (0 .. $#ARGV) { SWITCH: for ($ARGV[$num]) { /--input1|-i/ && do { $fileNameI=$ARGV[$num+1]; open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ; last }; /--input2|-j/ && do { $fileNameE=$ARGV[$num+1]; open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ; last }; /--strand|-s/ && do { if ($ARGV[$num+1] eq "s") { $strand=1}; last }; /--help|-h/ && do { exec("pod2text $0\n") ; die }; } } # memory declarations: my @infoI ; my @infoE ; my $seqName ; my @tagF ; my @tagB ; # Forward and Backward sequence # data retrieval: my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ; close fichierGffI ; close fichierGffE ; #print "gff files read ; number of lines : $#lines1 + $#lines2\n"; # positions management &recupInfo(\@infoI, @linesI) ; &recupInfo(\@infoE, @linesE) ; # treatement: # transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements if (($#infoI) and ($#infoE)) { $seqName=$infoI[0] ; #print "fin : $infoE[$#infoE-1]\n"; for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions #print "seqName : $seqName\n" ; &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements #print join("", @tagF), "\n"; #print join("", @tagB), "\n"; # transition management: my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) = (0, 0, 0, 0, 0, 0) ; for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) { &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ; } ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ; for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) { &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ; } } exit(0) ;