comparison SMART/bacteriaRegulatoryRegion_Detection/splitTranscriptGff.pl @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/perl -w
2 ###
3 # Main : defining utr and intergenic operonic intervalles from a transcripts file following a referencies file
4 #
5 # Input : 2 gff files to intersect, transcript queries vs referencies
6 #
7 # Output : resulting gff file printing to standard output
8 #
9 ###------------------------------------------------------
10 use vars qw($USAGE);
11 use strict;
12
13 =head1 NAME
14
15 splitTranscriptGff.pl - compare 2 input gff files and define utr and intergenic operonic intervalles by couple of overlapping elements
16
17 =head1 SYNOPSIS
18
19 % intervallsExtractorGff.pl -i referencies.gff -j transcriptQueries.gff -s strand [-h]
20
21 =head1 DESCRIPTION
22 This script will intersect 2 gff files and compute distance between 2 successives lines. Take care both of sorting by positions the input files and of that referencies are included in transcriptQueries.
23
24 -i|--input1 fileName gff input file name: included elements
25 -j|--input2 fileName gff input file name: extended elements
26 [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d]
27 [-h|--help] help mode then die
28
29 =head1 USECASE
30 Define many fragments for each extended element (transcript): UTR5, gene, UTR3, "inOperon" for intergenomic region between 2 genes
31 intervallsExtractorGff.pl -i CDSannotations.gff -j RNAseqTranscripts.gff > UTRsGenesOperonsLists.gff;
32
33 =head1 KWON BUGS
34 No disjonction of overlapping elements of the included elements (-i file).
35 In usecase, overlapping genes are fused in one long gene.
36
37 =head1 AUTHOR
38 Claire Toffano-Nioche - sep.11
39
40 =cut
41 #-----------------------
42 sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
43 #print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
44 for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element
45 #print "....$info[$i+2]\n";
46 if ($info[$i+3] =~ /\+/) {
47 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # sequence Forward
48 } else {
49 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # sequence Backward
50 }
51 }
52 #print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
53 #print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
54 }
55 #-----------------------
56 sub recupInfo { my ($pInfo, @lines) = @_ ;
57 for (my $i=0 ; $i <= ($#lines+1)*4-1 ; $i+=4) {
58 my @line = split("\t",$lines[$i/4]);
59 push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=nom, 3=debut, 4=fin, 6=sens
60 }
61 #print "recupInfo::fin=", ($#lines+1)*4, "\n" ;
62 }
63 #-----------------------
64 sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;
65 my $tagN=$seqN.$strand.$posB."..".$posE;
66 #print "tagName:",join("_",@_)," et tagName:$tagN\n";
67 return $tagN;
68 }
69 #-----------------------
70 sub transitionAnalysis {
71 my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
72 my $enCours = 0 ; my $precedant = 0 ;
73 $enCours = @$ptag[$pos] ;
74 $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ;
75 if ($enCours ne $precedant) {
76 #print "transi...:: $s, $pos, $precedant, $enCours\n";
77 #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
78 SWITCH: for ($precedant.$enCours) {
79 /01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
80 /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
81 /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ;
82 if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
83 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
84 $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ;
85 #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
86 } elsif ($$pfinAval!=$$pdebAval) {
87 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
88 $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ;
89 #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
90 }
91 $$pdebAval = 0 ; $$pfinAval = 0 ;
92 last SWITCH ;
93 };
94 /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
95 my $type="utr5";
96 if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
97 $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ;
98 $type="inOperon" ;
99 }
100 if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
101 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
102 $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ;
103 # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
104 } elsif ($$pfinAmont!=$$pdebAmont) {
105 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
106 $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ;
107 #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
108 }
109 $$pdebAmont = 0 ; $$pfinAmont = 0 ;
110 last SWITCH ;
111 };
112 /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
113 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
114 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
115 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
116 } elsif ($$pfinIn!=$$pdebIn) {
117 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
118 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
119 }
120 $$pdebIn = 0 ; $$pfinIn = 0 ;
121 last SWITCH ;
122 };
123 /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
124 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
125 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
126 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
127 } elsif ($$pfinIn!=$$pdebIn) {
128 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
129 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
130 }
131 #$$pdebIn = 0 ; $$pfinIn = 0 ;
132 last SWITCH ;
133 };
134 }
135 }
136 }
137 #-----------------------
138 my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
139 # command line check
140 foreach my $num (0 .. $#ARGV) {
141 SWITCH: for ($ARGV[$num]) {
142 /--input1|-i/ && do {
143 $fileNameI=$ARGV[$num+1];
144 open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ;
145 last };
146 /--input2|-j/ && do {
147 $fileNameE=$ARGV[$num+1];
148 open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ;
149 last };
150 /--strand|-s/ && do {
151 if ($ARGV[$num+1] eq "s") { $strand=1};
152 last };
153 /--help|-h/ && do { exec("pod2text $0\n") ; die };
154 }
155 }
156 # memory declarations:
157 my @infoI ; my @infoE ;
158 my $seqName ;
159 my @tagF ; my @tagB ; # Forward and Backward sequence
160 # data retrieval:
161 my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
162 close fichierGffI ; close fichierGffE ;
163 #print "gff files read ; number of lines : $#lines1 + $#lines2\n";
164 # positions management
165 &recupInfo(\@infoI, @linesI) ;
166 &recupInfo(\@infoE, @linesE) ;
167 # treatement:
168 # transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
169 if (($#infoI) and ($#infoE)) {
170 $seqName=$infoI[0] ;
171 #print "fin : $infoE[$#infoE-1]\n";
172 for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
173 #print "seqName : $seqName\n" ;
174 &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
175 &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
176 #print join("", @tagF), "\n";
177 #print join("", @tagB), "\n";
178 # transition management:
179 my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
180 = (0, 0, 0, 0, 0, 0) ;
181 for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
182 &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
183 }
184 ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
185 for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
186 &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
187 }
188 }
189 exit(0) ;