Mercurial > repos > big-tiandm > sirna_plant
diff ClassAnnotate.pl @ 0:07745c0958dd draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 18 Sep 2014 21:40:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ClassAnnotate.pl Thu Sep 18 21:40:25 2014 -0400 @@ -0,0 +1,251 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chen Tingting +#Email: chentt@big.ac.cn +#Date: 2014/5/13 +#Modified: +#Description: cluster annotate +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h"); +if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments +&usage; +} + +#my %gene; +my %gene1; +open IN,"<$opts{g}"; +open OUT ,">$opts{l}"; +print OUT "#ID\tchr\tstart\tend\tstrand\n"; +my $n=1; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + my $ID; + if ($tmp[2] eq "gene") { + $tmp[0]=~s/Chr\./Chr/; + #$tmp[0]=~s/Chr/chr/; + my @infor=split/;/,$tmp[8]; + for (my $i=0;$i<@infor ;$i++) { + if ($infor[$i]=~/Alias\=(\S+)$/) { + $ID=$1; + last; + } + else { + $ID="unknown$n"; + $n++; + } + } + #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] + push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; + print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; + } +} +#while (my $aline=<IN>) { +# chomp $aline; +# next if($aline=~/^\#/); +# my @tmp=split/\t/,$aline;#ID chr start end strand +# push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; +# #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]]; +#} +close IN; +close OUT; + +my %nat; +open TMP,">$opts{t}"; +print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n"; + +open IN,"<$opts{n}"; +my $nat_n=1; +while (my $aline=<IN>) { + next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt - + chomp $aline; + my @arr=split /\t/,$aline; + my ($ns,$ne,$ns2,$ne2)=(0,0,0,0); + if ($arr[11]=~/Nearby/) { + ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]); + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n"; + $nat_n++; + }else{ + $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1; + $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1; + $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1; + $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1; + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2 + push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n"; + $nat_n++; + } +} +close IN; +close TMP; + +my %repeat; +open IN,"<$opts{'r'}"; +my $first=<IN>; +$first=<IN>; +$first=<IN>; +while (my $aline=<IN>) { + chomp $aline; + $aline=~s/^\s+//; + my @tmp=split/\s+/,$aline; + $tmp[4]=~s/chr0/Chr/; + $tmp[4]=~s/chr/Chr/; + push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class + #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n"; +} +close IN; + +my %phase; +open IN,"<$opts{'p'}"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + if ($tmp[5]>=25) { + $phase{$tmp[0]}=$tmp[5]; + } +} +close IN; + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +while (my $aline=<IN>) { + chomp $aline; + if($aline=~/^\#/){ + print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n"; + next; + } + my @tmp=split/\t/,$aline; + my @inf=split/\:/,$tmp[0]; + my @pos=split/\-/,$inf[1]; + my $chr=$inf[0]; + $chr=~s/Chr0/Chr/; + my $start=$pos[0]; + my $end=$pos[1]; + #=========Repeat============ + my @repeat; + if (defined(@{$repeat{$chr}})) { + my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}}; + for (my $i=0;$i<@r_array ;$i++) { + if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) { + push @repeat,$r_array[$i][2]; + } + elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){ + push @repeat,$r_array[$i][2]; + + } + elsif($end<$r_array[$i][0]){ + last; + } + else{next;} + } + } + my $repeat; + if (@repeat==0) { + $repeat="\/"; + } + else{ + $repeat=join ";",@repeat; + } + #=========nat=============== + my @nat1;#class 1 + my @nat2;#class 2 + my @nat;#nat ID + #foreach my $chr (keys %nat) { + my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}}; + for (my $i=0;$i<@n_array ;$i++) { + if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) { + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){ + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($end<$n_array[$i][0]){ + last; + } + else{next;} + } + #} + + my $nat1; + my $nat2; + my $nat; + if (@nat1==0) { + $nat1="\/"; + } + else{ + $nat1=join ";",@nat1; + } + if (@nat2==0) { + $nat2="\/"; + } + else{ + $nat2=join ";",@nat2; + } + if (@nat==0) { + $nat="\/"; + } + else{ + $nat=join ";",@nat; + } + #========phase============== + my $phase="\/"; + if (defined($phase{$tmp[0]})) { + $phase="phase"; + } + #=========long=============== + my $long="\/"; + if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) { + $long="long"; + } + print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n"; +} + +close IN; +close OUT; + +sub nearby{ + my @p=@_; + my ($s,$e)=(0,0); + if ($p[1]<$p[2]) { + $s=$p[1]; + $e=$p[2]; + }else{ + $s=$p[3]; + $e=$p[0]; + } + return ($s,$e); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -g -n -r -p -t -l +options: +-i input file + -g gff file + -n NATs file + -r repeat file + -p phase file +-o output file +-t nat output file +-l genelist output file +-h help +USAGE +exit(1); +} +