50
|
1 #!/usr/bin/perl -w
|
|
2 #Filename:
|
|
3 #Author: Chen Tingting
|
|
4 #Email: chentt@big.ac.cn
|
|
5 #Date: 2014/5/13
|
|
6 #Modified:
|
|
7 #Description: cluster annotate
|
|
8 my $version=1.00;
|
|
9
|
|
10 use strict;
|
|
11 use Getopt::Long;
|
|
12
|
|
13 my %opts;
|
|
14 GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
|
|
15 if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments
|
|
16 &usage;
|
|
17 }
|
|
18
|
|
19 #my %gene;
|
|
20 my %gene1;
|
|
21 open IN,"<$opts{g}";
|
|
22 open OUT ,">$opts{l}";
|
|
23 print OUT "#ID\tchr\tstart\tend\tstrand\n";
|
|
24 my $n=1;
|
|
25 while (my $aline=<IN>) {
|
|
26 chomp $aline;
|
|
27 next if($aline=~/^\#/);
|
|
28 my @tmp=split/\t/,$aline;
|
|
29 my $ID;
|
|
30 if ($tmp[2] eq "gene") {
|
|
31 $tmp[0]=~s/Chr\./Chr/;
|
|
32 #$tmp[0]=~s/Chr/chr/;
|
|
33 my @infor=split/;/,$tmp[8];
|
|
34 for (my $i=0;$i<@infor ;$i++) {
|
|
35 if ($infor[$i]=~/Alias\=(\S+)$/) {
|
|
36 $ID=$1;
|
|
37 last;
|
|
38 }
|
|
39 else {
|
|
40 $ID="unknown$n";
|
|
41 $n++;
|
|
42 }
|
|
43 }
|
|
44 #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
|
|
45 push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
|
|
46 print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
|
|
47 }
|
|
48 }
|
|
49 #while (my $aline=<IN>) {
|
|
50 # chomp $aline;
|
|
51 # next if($aline=~/^\#/);
|
|
52 # my @tmp=split/\t/,$aline;#ID chr start end strand
|
|
53 # push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
|
|
54 # #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
|
|
55 #}
|
|
56 close IN;
|
|
57 close OUT;
|
|
58
|
|
59 my %nat;
|
|
60 open TMP,">$opts{t}";
|
|
61 print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
|
|
62
|
|
63 open IN,"<$opts{n}";
|
|
64 my $nat_n=1;
|
|
65 while (my $aline=<IN>) {
|
|
66 next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
|
|
67 chomp $aline;
|
|
68 my @arr=split /\t/,$aline;
|
|
69 my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
|
|
70 if ($arr[11]=~/Nearby/) {
|
|
71 ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
|
|
72 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
|
|
73 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
|
|
74 $nat_n++;
|
|
75 }else{
|
|
76 $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
|
|
77 $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
|
|
78 $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
|
|
79 $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
|
|
80 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
|
|
81 push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
|
|
82 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
|
|
83 $nat_n++;
|
|
84 }
|
|
85 }
|
|
86 close IN;
|
|
87 close TMP;
|
|
88
|
|
89 my %repeat;
|
|
90 open IN,"<$opts{'r'}";
|
|
91 my $first=<IN>;
|
|
92 $first=<IN>;
|
|
93 $first=<IN>;
|
|
94 while (my $aline=<IN>) {
|
|
95 chomp $aline;
|
|
96 $aline=~s/^\s+//;
|
|
97 my @tmp=split/\s+/,$aline;
|
|
98 $tmp[4]=~s/chr0/Chr/;
|
|
99 $tmp[4]=~s/chr/Chr/;
|
|
100 push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
|
|
101 #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
|
|
102 }
|
|
103 close IN;
|
|
104
|
|
105 my %phase;
|
|
106 open IN,"<$opts{'p'}";
|
|
107 while (my $aline=<IN>) {
|
|
108 chomp $aline;
|
|
109 next if($aline=~/^\#/);
|
|
110 my @tmp=split/\t/,$aline;
|
|
111 if ($tmp[5]>=25) {
|
|
112 $phase{$tmp[0]}=$tmp[5];
|
|
113 }
|
|
114 }
|
|
115 close IN;
|
|
116
|
|
117 my $filein=$opts{'i'};
|
|
118 my $fileout=$opts{'o'};
|
|
119 open IN,"<$filein"; #input file
|
|
120 open OUT,">$fileout"; #output file
|
|
121 while (my $aline=<IN>) {
|
|
122 chomp $aline;
|
|
123 if($aline=~/^\#/){
|
|
124 print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
|
|
125 next;
|
|
126 }
|
|
127 my @tmp=split/\t/,$aline;
|
|
128 my @inf=split/\:/,$tmp[0];
|
|
129 my @pos=split/\-/,$inf[1];
|
|
130 my $chr=$inf[0];
|
|
131 $chr=~s/Chr0/Chr/;
|
|
132 my $start=$pos[0];
|
|
133 my $end=$pos[1];
|
|
134 #=========Repeat============
|
|
135 my @repeat;
|
|
136 if (defined(@{$repeat{$chr}})) {
|
|
137 my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
|
|
138 for (my $i=0;$i<@r_array ;$i++) {
|
|
139 if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
|
|
140 push @repeat,$r_array[$i][2];
|
|
141 }
|
|
142 elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
|
|
143 push @repeat,$r_array[$i][2];
|
|
144
|
|
145 }
|
|
146 elsif($end<$r_array[$i][0]){
|
|
147 last;
|
|
148 }
|
|
149 else{next;}
|
|
150 }
|
|
151 }
|
|
152 my $repeat;
|
|
153 if (@repeat==0) {
|
|
154 $repeat="\/";
|
|
155 }
|
|
156 else{
|
|
157 $repeat=join ";",@repeat;
|
|
158 }
|
|
159 #=========nat===============
|
|
160 my @nat1;#class 1
|
|
161 my @nat2;#class 2
|
|
162 my @nat;#nat ID
|
|
163 #foreach my $chr (keys %nat) {
|
|
164 my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
|
|
165 for (my $i=0;$i<@n_array ;$i++) {
|
|
166 if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
|
|
167 push @nat1,$n_array[$i][2];
|
|
168 push @nat2,$n_array[$i][3];
|
|
169 push @nat,$n_array[$i][4];
|
|
170 }
|
|
171 elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
|
|
172 push @nat1,$n_array[$i][2];
|
|
173 push @nat2,$n_array[$i][3];
|
|
174 push @nat,$n_array[$i][4];
|
|
175 }
|
|
176 elsif($end<$n_array[$i][0]){
|
|
177 last;
|
|
178 }
|
|
179 else{next;}
|
|
180 }
|
|
181 #}
|
|
182
|
|
183 my $nat1;
|
|
184 my $nat2;
|
|
185 my $nat;
|
|
186 if (@nat1==0) {
|
|
187 $nat1="\/";
|
|
188 }
|
|
189 else{
|
|
190 $nat1=join ";",@nat1;
|
|
191 }
|
|
192 if (@nat2==0) {
|
|
193 $nat2="\/";
|
|
194 }
|
|
195 else{
|
|
196 $nat2=join ";",@nat2;
|
|
197 }
|
|
198 if (@nat==0) {
|
|
199 $nat="\/";
|
|
200 }
|
|
201 else{
|
|
202 $nat=join ";",@nat;
|
|
203 }
|
|
204 #========phase==============
|
|
205 my $phase="\/";
|
|
206 if (defined($phase{$tmp[0]})) {
|
|
207 $phase="phase";
|
|
208 }
|
|
209 #=========long===============
|
|
210 my $long="\/";
|
|
211 if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
|
|
212 $long="long";
|
|
213 }
|
|
214 print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
|
|
215 }
|
|
216
|
|
217 close IN;
|
|
218 close OUT;
|
|
219
|
|
220 sub nearby{
|
|
221 my @p=@_;
|
|
222 my ($s,$e)=(0,0);
|
|
223 if ($p[1]<$p[2]) {
|
|
224 $s=$p[1];
|
|
225 $e=$p[2];
|
|
226 }else{
|
|
227 $s=$p[3];
|
|
228 $e=$p[0];
|
|
229 }
|
|
230 return ($s,$e);
|
|
231 }
|
|
232
|
|
233 sub usage{
|
|
234 print <<"USAGE";
|
|
235 Version $version
|
|
236 Usage:
|
|
237 $0 -i -o -g -n -r -p -t -l
|
|
238 options:
|
|
239 -i input file
|
|
240 -g gff file
|
|
241 -n NATs file
|
|
242 -r repeat file
|
|
243 -p phase file
|
|
244 -o output file
|
|
245 -t nat output file
|
|
246 -l genelist output file
|
|
247 -h help
|
|
248 USAGE
|
|
249 exit(1);
|
|
250 }
|
|
251
|