Mercurial > repos > big-tiandm > sirna_plant
comparison ClassAnnotate.pl @ 0:07745c0958dd draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 18 Sep 2014 21:40:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:07745c0958dd |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Chen Tingting | |
4 #Email: chentt@big.ac.cn | |
5 #Date: 2014/5/13 | |
6 #Modified: | |
7 #Description: cluster annotate | |
8 my $version=1.00; | |
9 | |
10 use strict; | |
11 use Getopt::Long; | |
12 | |
13 my %opts; | |
14 GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h"); | |
15 if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments | |
16 &usage; | |
17 } | |
18 | |
19 #my %gene; | |
20 my %gene1; | |
21 open IN,"<$opts{g}"; | |
22 open OUT ,">$opts{l}"; | |
23 print OUT "#ID\tchr\tstart\tend\tstrand\n"; | |
24 my $n=1; | |
25 while (my $aline=<IN>) { | |
26 chomp $aline; | |
27 next if($aline=~/^\#/); | |
28 my @tmp=split/\t/,$aline; | |
29 my $ID; | |
30 if ($tmp[2] eq "gene") { | |
31 $tmp[0]=~s/Chr\./Chr/; | |
32 #$tmp[0]=~s/Chr/chr/; | |
33 my @infor=split/;/,$tmp[8]; | |
34 for (my $i=0;$i<@infor ;$i++) { | |
35 if ($infor[$i]=~/Alias\=(\S+)$/) { | |
36 $ID=$1; | |
37 last; | |
38 } | |
39 else { | |
40 $ID="unknown$n"; | |
41 $n++; | |
42 } | |
43 } | |
44 #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] | |
45 push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; | |
46 print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; | |
47 } | |
48 } | |
49 #while (my $aline=<IN>) { | |
50 # chomp $aline; | |
51 # next if($aline=~/^\#/); | |
52 # my @tmp=split/\t/,$aline;#ID chr start end strand | |
53 # push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; | |
54 # #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]]; | |
55 #} | |
56 close IN; | |
57 close OUT; | |
58 | |
59 my %nat; | |
60 open TMP,">$opts{t}"; | |
61 print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n"; | |
62 | |
63 open IN,"<$opts{n}"; | |
64 my $nat_n=1; | |
65 while (my $aline=<IN>) { | |
66 next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt - | |
67 chomp $aline; | |
68 my @arr=split /\t/,$aline; | |
69 my ($ns,$ne,$ns2,$ne2)=(0,0,0,0); | |
70 if ($arr[11]=~/Nearby/) { | |
71 ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]); | |
72 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n]; | |
73 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n"; | |
74 $nat_n++; | |
75 }else{ | |
76 $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1; | |
77 $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1; | |
78 $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1; | |
79 $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1; | |
80 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2 | |
81 push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"]; | |
82 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n"; | |
83 $nat_n++; | |
84 } | |
85 } | |
86 close IN; | |
87 close TMP; | |
88 | |
89 my %repeat; | |
90 open IN,"<$opts{'r'}"; | |
91 my $first=<IN>; | |
92 $first=<IN>; | |
93 $first=<IN>; | |
94 while (my $aline=<IN>) { | |
95 chomp $aline; | |
96 $aline=~s/^\s+//; | |
97 my @tmp=split/\s+/,$aline; | |
98 $tmp[4]=~s/chr0/Chr/; | |
99 $tmp[4]=~s/chr/Chr/; | |
100 push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class | |
101 #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n"; | |
102 } | |
103 close IN; | |
104 | |
105 my %phase; | |
106 open IN,"<$opts{'p'}"; | |
107 while (my $aline=<IN>) { | |
108 chomp $aline; | |
109 next if($aline=~/^\#/); | |
110 my @tmp=split/\t/,$aline; | |
111 if ($tmp[5]>=25) { | |
112 $phase{$tmp[0]}=$tmp[5]; | |
113 } | |
114 } | |
115 close IN; | |
116 | |
117 my $filein=$opts{'i'}; | |
118 my $fileout=$opts{'o'}; | |
119 open IN,"<$filein"; #input file | |
120 open OUT,">$fileout"; #output file | |
121 while (my $aline=<IN>) { | |
122 chomp $aline; | |
123 if($aline=~/^\#/){ | |
124 print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n"; | |
125 next; | |
126 } | |
127 my @tmp=split/\t/,$aline; | |
128 my @inf=split/\:/,$tmp[0]; | |
129 my @pos=split/\-/,$inf[1]; | |
130 my $chr=$inf[0]; | |
131 $chr=~s/Chr0/Chr/; | |
132 my $start=$pos[0]; | |
133 my $end=$pos[1]; | |
134 #=========Repeat============ | |
135 my @repeat; | |
136 if (defined(@{$repeat{$chr}})) { | |
137 my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}}; | |
138 for (my $i=0;$i<@r_array ;$i++) { | |
139 if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) { | |
140 push @repeat,$r_array[$i][2]; | |
141 } | |
142 elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){ | |
143 push @repeat,$r_array[$i][2]; | |
144 | |
145 } | |
146 elsif($end<$r_array[$i][0]){ | |
147 last; | |
148 } | |
149 else{next;} | |
150 } | |
151 } | |
152 my $repeat; | |
153 if (@repeat==0) { | |
154 $repeat="\/"; | |
155 } | |
156 else{ | |
157 $repeat=join ";",@repeat; | |
158 } | |
159 #=========nat=============== | |
160 my @nat1;#class 1 | |
161 my @nat2;#class 2 | |
162 my @nat;#nat ID | |
163 #foreach my $chr (keys %nat) { | |
164 my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}}; | |
165 for (my $i=0;$i<@n_array ;$i++) { | |
166 if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) { | |
167 push @nat1,$n_array[$i][2]; | |
168 push @nat2,$n_array[$i][3]; | |
169 push @nat,$n_array[$i][4]; | |
170 } | |
171 elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){ | |
172 push @nat1,$n_array[$i][2]; | |
173 push @nat2,$n_array[$i][3]; | |
174 push @nat,$n_array[$i][4]; | |
175 } | |
176 elsif($end<$n_array[$i][0]){ | |
177 last; | |
178 } | |
179 else{next;} | |
180 } | |
181 #} | |
182 | |
183 my $nat1; | |
184 my $nat2; | |
185 my $nat; | |
186 if (@nat1==0) { | |
187 $nat1="\/"; | |
188 } | |
189 else{ | |
190 $nat1=join ";",@nat1; | |
191 } | |
192 if (@nat2==0) { | |
193 $nat2="\/"; | |
194 } | |
195 else{ | |
196 $nat2=join ";",@nat2; | |
197 } | |
198 if (@nat==0) { | |
199 $nat="\/"; | |
200 } | |
201 else{ | |
202 $nat=join ";",@nat; | |
203 } | |
204 #========phase============== | |
205 my $phase="\/"; | |
206 if (defined($phase{$tmp[0]})) { | |
207 $phase="phase"; | |
208 } | |
209 #=========long=============== | |
210 my $long="\/"; | |
211 if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) { | |
212 $long="long"; | |
213 } | |
214 print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n"; | |
215 } | |
216 | |
217 close IN; | |
218 close OUT; | |
219 | |
220 sub nearby{ | |
221 my @p=@_; | |
222 my ($s,$e)=(0,0); | |
223 if ($p[1]<$p[2]) { | |
224 $s=$p[1]; | |
225 $e=$p[2]; | |
226 }else{ | |
227 $s=$p[3]; | |
228 $e=$p[0]; | |
229 } | |
230 return ($s,$e); | |
231 } | |
232 | |
233 sub usage{ | |
234 print <<"USAGE"; | |
235 Version $version | |
236 Usage: | |
237 $0 -i -o -g -n -r -p -t -l | |
238 options: | |
239 -i input file | |
240 -g gff file | |
241 -n NATs file | |
242 -r repeat file | |
243 -p phase file | |
244 -o output file | |
245 -t nat output file | |
246 -l genelist output file | |
247 -h help | |
248 USAGE | |
249 exit(1); | |
250 } | |
251 |