comparison detrprok_scripts/splitTranscriptGff.pl @ 0:a53eb951b164 draft

Uploaded
author clairetn
date Mon, 25 Mar 2013 05:39:29 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a53eb951b164
1 #!/usr/bin/perl -w
2 ###
3 # But : croiser
4 #
5 # Entrees : 2 fichiers gff à croiser
6 #
7 # Sortie : gff affiche a l'ecran
8 #
9 ###------------------------------------------------------
10 use vars qw($USAGE);
11 use strict;
12
13 =head1 NAME
14
15 splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements
16
17 =head1 SYNOPSIS
18
19 % intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h]
20
21 =head1 DESCRIPTION
22 This script will sort 2 gff files and compute distance between 2 successives lines.
23
24 -i|--input1 fileName gff input file name: included elements
25 -j|--input2 fileName gff input file name: extended elements
26 [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d]
27 [-h|--help] help mode then die
28
29 =head1 USECASE
30 Compare 2 input gff files: an annotations file (included elements) and transcription units file (extended elements).
31 For each couple of overlapping elements, split the transcription unit in 5'utr, CDS, 3'utr or "operon" in case of successives genes included in the transcription unit.
32
33 =head1 KWON BUG
34 Fisrt and last elements on the genome sequence can be misclassified in the result file.
35
36 =head1 AUTHOR
37 Claire Toffano-Nioche - sep.11
38
39 =cut
40 #-----------------------
41 sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
42 #print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
43 for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element
44 #print "....$info[$i+2]\n";
45 if ($info[$i+3] =~ /\+/) {
46 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # Forward sequence
47 } else {
48 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # Backward sequence
49 }
50 }
51 #print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
52 #print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
53 }
54 #-----------------------
55 sub recupInfo { my ($pInfo, @lines) = @_ ;
56 my $i=0 ;
57 while ($i<=$#lines){
58 chomp($lines[$i]);
59 my @line = split("\t",$lines[$i]);
60 if (defined ($line[0])) {
61 if (!($line[0] =~ m/^\s*$|^#/)) { # skip both null and comment lines
62 push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=name, 3=begin, 4=end, 6=strand
63 }
64 }
65 $i=$i+1 ;
66 }
67 #print "recupInfo::end=", $i, "\n" ;
68 }
69 #-----------------------
70 sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;
71 my $tagN=$seqN.$strand.$posB."..".$posE;
72 #print "tagName::",join("_",@_)," and tagName:$tagN\n";
73 return $tagN;
74 }
75 #-----------------------
76 sub transitionAnalysis {
77 my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
78 my $enCours = @$ptag[$pos] ;
79 my $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ;
80 if ($enCours ne $precedant) {
81 #print "transi...:: $s, $pos, $precedant, $enCours\n";
82 #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
83 SWITCH: for ($precedant.$enCours) {
84 /01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
85 /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
86 /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ;
87 if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
88 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
89 $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ;
90 #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
91 } elsif ($$pfinAval!=$$pdebAval) {
92 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
93 $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ;
94 #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
95 }
96 $$pdebAval = 0 ; $$pfinAval = 0 ;
97 last SWITCH ;
98 };
99 /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
100 my $type="utr5";
101 if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
102 $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ;
103 $type="operon" ;
104 }
105 if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
106 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
107 $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ;
108 # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
109 } elsif ($$pfinAmont!=$$pdebAmont) {
110 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
111 $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ;
112 #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
113 }
114 $$pdebAmont = 0 ; $$pfinAmont = 0 ;
115 last SWITCH ;
116 };
117 /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
118 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
119 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
120 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
121 } elsif ($$pfinIn!=$$pdebIn) {
122 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
123 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
124 }
125 $$pdebIn = 0 ; $$pfinIn = 0 ;
126 last SWITCH ;
127 };
128 /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
129 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
130 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
131 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
132 } elsif ($$pfinIn!=$$pdebIn) {
133 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
134 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
135 }
136 #$$pdebIn = 0 ; $$pfinIn = 0 ;
137 last SWITCH ;
138 };
139 }
140 }
141 }
142 #-----------------------
143 my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
144 # command line check
145 foreach my $num (0 .. $#ARGV) {
146 SWITCH: for ($ARGV[$num]) {
147 /--input1|-i/ && do {
148 $fileNameI=$ARGV[$num+1];
149 open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ;
150 last };
151 /--input2|-j/ && do {
152 $fileNameE=$ARGV[$num+1];
153 open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ;
154 last };
155 /--strand|-s/ && do {
156 if ($ARGV[$num+1] eq "s") { $strand=1};
157 last };
158 /--help|-h/ && do { exec("pod2text $0\n") ; die };
159 }
160 }
161 # memory declarations:
162 my @infoI ; my @infoE ;
163 my $seqName ;
164 my @tagF ; my @tagB ; # for Forward and Backward sequences
165 # data retrieval:
166 my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
167 close fichierGffI ; close fichierGffE ;
168 #print "gff files read ; number of lines : ",$#linesI+1," + ",$#linesE+1,"\n";
169 # positions management:
170 &recupInfo(\@infoI, @linesI) ;
171 #print "number of treated elements:",($#infoI+1)/4,"\n";
172 &recupInfo(\@infoE, @linesE) ;
173 #print "number of treated elements:",($#infoE+1)/4,"\n";
174 # treatement:
175 # transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
176 if (($#infoI) and ($#infoE)) {
177 $seqName=$infoI[0] ;
178 #print "end : $infoE[$#infoE-1]\n";
179 for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
180 #print "seqName : $seqName\n" ;
181 &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
182 &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
183 #print join("", @tagF), "\n";
184 #print join("", @tagB), "\n";
185 # transition management:
186 my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
187 = (0, 0, 0, 0, 0, 0) ;
188 for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
189 &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
190 }
191 ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
192 = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
193 for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
194 &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
195 }
196 }
197 exit(0) ;