0
|
1 #!/usr/bin/perl -w
|
|
2 ###
|
|
3 # But : croiser
|
|
4 #
|
|
5 # Entrees : 2 fichiers gff à croiser
|
|
6 #
|
|
7 # Sortie : gff affiche a l'ecran
|
|
8 #
|
|
9 ###------------------------------------------------------
|
|
10 use vars qw($USAGE);
|
|
11 use strict;
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 % intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h]
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22 This script will sort 2 gff files and compute distance between 2 successives lines.
|
|
23
|
|
24 -i|--input1 fileName gff input file name: included elements
|
|
25 -j|--input2 fileName gff input file name: extended elements
|
|
26 [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d]
|
|
27 [-h|--help] help mode then die
|
|
28
|
|
29 =head1 USECASE
|
|
30 Compare 2 input gff files: an annotations file (included elements) and transcription units file (extended elements).
|
|
31 For each couple of overlapping elements, split the transcription unit in 5'utr, CDS, 3'utr or "operon" in case of successives genes included in the transcription unit.
|
|
32
|
|
33 =head1 KWON BUG
|
|
34 Fisrt and last elements on the genome sequence can be misclassified in the result file.
|
|
35
|
|
36 =head1 AUTHOR
|
|
37 Claire Toffano-Nioche - sep.11
|
|
38
|
|
39 =cut
|
|
40 #-----------------------
|
|
41 sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
|
|
42 #print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
|
|
43 for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element
|
|
44 #print "....$info[$i+2]\n";
|
|
45 if ($info[$i+3] =~ /\+/) {
|
|
46 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # Forward sequence
|
|
47 } else {
|
|
48 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # Backward sequence
|
|
49 }
|
|
50 }
|
|
51 #print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
|
|
52 #print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
|
|
53 }
|
|
54 #-----------------------
|
|
55 sub recupInfo { my ($pInfo, @lines) = @_ ;
|
|
56 my $i=0 ;
|
|
57 while ($i<=$#lines){
|
|
58 chomp($lines[$i]);
|
|
59 my @line = split("\t",$lines[$i]);
|
|
60 if (defined ($line[0])) {
|
|
61 if (!($line[0] =~ m/^\s*$|^#/)) { # skip both null and comment lines
|
|
62 push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=name, 3=begin, 4=end, 6=strand
|
|
63 }
|
|
64 }
|
|
65 $i=$i+1 ;
|
|
66 }
|
|
67 #print "recupInfo::end=", $i, "\n" ;
|
|
68 }
|
|
69 #-----------------------
|
|
70 sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;
|
|
71 my $tagN=$seqN.$strand.$posB."..".$posE;
|
|
72 #print "tagName::",join("_",@_)," and tagName:$tagN\n";
|
|
73 return $tagN;
|
|
74 }
|
|
75 #-----------------------
|
|
76 sub transitionAnalysis {
|
|
77 my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
|
|
78 my $enCours = @$ptag[$pos] ;
|
|
79 my $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ;
|
|
80 if ($enCours ne $precedant) {
|
|
81 #print "transi...:: $s, $pos, $precedant, $enCours\n";
|
|
82 #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
|
|
83 SWITCH: for ($precedant.$enCours) {
|
|
84 /01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
|
|
85 /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
|
|
86 /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ;
|
|
87 if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
|
|
88 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
89 $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ;
|
|
90 #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
|
|
91 } elsif ($$pfinAval!=$$pdebAval) {
|
|
92 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
93 $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ;
|
|
94 #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
|
|
95 }
|
|
96 $$pdebAval = 0 ; $$pfinAval = 0 ;
|
|
97 last SWITCH ;
|
|
98 };
|
|
99 /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
|
|
100 my $type="utr5";
|
|
101 if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
|
|
102 $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ;
|
|
103 $type="operon" ;
|
|
104 }
|
|
105 if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
|
|
106 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
107 $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ;
|
|
108 # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
|
|
109 } elsif ($$pfinAmont!=$$pdebAmont) {
|
|
110 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
111 $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ;
|
|
112 #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
|
|
113 }
|
|
114 $$pdebAmont = 0 ; $$pfinAmont = 0 ;
|
|
115 last SWITCH ;
|
|
116 };
|
|
117 /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
|
|
118 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
|
|
119 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
120 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
|
|
121 } elsif ($$pfinIn!=$$pdebIn) {
|
|
122 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
123 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
|
|
124 }
|
|
125 $$pdebIn = 0 ; $$pfinIn = 0 ;
|
|
126 last SWITCH ;
|
|
127 };
|
|
128 /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
|
|
129 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
|
|
130 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
131 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
|
|
132 } elsif ($$pfinIn!=$$pdebIn) {
|
|
133 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
|
|
134 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
|
|
135 }
|
|
136 #$$pdebIn = 0 ; $$pfinIn = 0 ;
|
|
137 last SWITCH ;
|
|
138 };
|
|
139 }
|
|
140 }
|
|
141 }
|
|
142 #-----------------------
|
|
143 my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
|
|
144 # command line check
|
|
145 foreach my $num (0 .. $#ARGV) {
|
|
146 SWITCH: for ($ARGV[$num]) {
|
|
147 /--input1|-i/ && do {
|
|
148 $fileNameI=$ARGV[$num+1];
|
|
149 open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ;
|
|
150 last };
|
|
151 /--input2|-j/ && do {
|
|
152 $fileNameE=$ARGV[$num+1];
|
|
153 open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ;
|
|
154 last };
|
|
155 /--strand|-s/ && do {
|
|
156 if ($ARGV[$num+1] eq "s") { $strand=1};
|
|
157 last };
|
|
158 /--help|-h/ && do { exec("pod2text $0\n") ; die };
|
|
159 }
|
|
160 }
|
|
161 # memory declarations:
|
|
162 my @infoI ; my @infoE ;
|
|
163 my $seqName ;
|
|
164 my @tagF ; my @tagB ; # for Forward and Backward sequences
|
|
165 # data retrieval:
|
|
166 my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
|
|
167 close fichierGffI ; close fichierGffE ;
|
|
168 #print "gff files read ; number of lines : ",$#linesI+1," + ",$#linesE+1,"\n";
|
|
169 # positions management:
|
|
170 &recupInfo(\@infoI, @linesI) ;
|
|
171 #print "number of treated elements:",($#infoI+1)/4,"\n";
|
|
172 &recupInfo(\@infoE, @linesE) ;
|
|
173 #print "number of treated elements:",($#infoE+1)/4,"\n";
|
|
174 # treatement:
|
|
175 # transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
|
|
176 if (($#infoI) and ($#infoE)) {
|
|
177 $seqName=$infoI[0] ;
|
|
178 #print "end : $infoE[$#infoE-1]\n";
|
|
179 for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
|
|
180 #print "seqName : $seqName\n" ;
|
|
181 &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
|
|
182 &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
|
|
183 #print join("", @tagF), "\n";
|
|
184 #print join("", @tagB), "\n";
|
|
185 # transition management:
|
|
186 my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
|
|
187 = (0, 0, 0, 0, 0, 0) ;
|
|
188 for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
|
|
189 &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
|
|
190 }
|
|
191 ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
|
|
192 = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
|
|
193 for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
|
|
194 &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
|
|
195 }
|
|
196 }
|
|
197 exit(0) ;
|