Mercurial > repos > big-tiandm > sirna_plant
view ClassAnnotate.pl @ 27:f2e18b4b1f3a draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 06 Nov 2014 02:49:50 -0500 |
parents | 07745c0958dd |
children |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Chen Tingting #Email: chentt@big.ac.cn #Date: 2014/5/13 #Modified: #Description: cluster annotate my $version=1.00; use strict; use Getopt::Long; my %opts; GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h"); if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments &usage; } #my %gene; my %gene1; open IN,"<$opts{g}"; open OUT ,">$opts{l}"; print OUT "#ID\tchr\tstart\tend\tstrand\n"; my $n=1; while (my $aline=<IN>) { chomp $aline; next if($aline=~/^\#/); my @tmp=split/\t/,$aline; my $ID; if ($tmp[2] eq "gene") { $tmp[0]=~s/Chr\./Chr/; #$tmp[0]=~s/Chr/chr/; my @infor=split/;/,$tmp[8]; for (my $i=0;$i<@infor ;$i++) { if ($infor[$i]=~/Alias\=(\S+)$/) { $ID=$1; last; } else { $ID="unknown$n"; $n++; } } #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; } } #while (my $aline=<IN>) { # chomp $aline; # next if($aline=~/^\#/); # my @tmp=split/\t/,$aline;#ID chr start end strand # push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; # #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]]; #} close IN; close OUT; my %nat; open TMP,">$opts{t}"; print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n"; open IN,"<$opts{n}"; my $nat_n=1; while (my $aline=<IN>) { next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt - chomp $aline; my @arr=split /\t/,$aline; my ($ns,$ne,$ns2,$ne2)=(0,0,0,0); if ($arr[11]=~/Nearby/) { ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]); push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n]; print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n"; $nat_n++; }else{ $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1; $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1; $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1; $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1; push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2 push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"]; print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n"; $nat_n++; } } close IN; close TMP; my %repeat; open IN,"<$opts{'r'}"; my $first=<IN>; $first=<IN>; $first=<IN>; while (my $aline=<IN>) { chomp $aline; $aline=~s/^\s+//; my @tmp=split/\s+/,$aline; $tmp[4]=~s/chr0/Chr/; $tmp[4]=~s/chr/Chr/; push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n"; } close IN; my %phase; open IN,"<$opts{'p'}"; while (my $aline=<IN>) { chomp $aline; next if($aline=~/^\#/); my @tmp=split/\t/,$aline; if ($tmp[5]>=25) { $phase{$tmp[0]}=$tmp[5]; } } close IN; my $filein=$opts{'i'}; my $fileout=$opts{'o'}; open IN,"<$filein"; #input file open OUT,">$fileout"; #output file while (my $aline=<IN>) { chomp $aline; if($aline=~/^\#/){ print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n"; next; } my @tmp=split/\t/,$aline; my @inf=split/\:/,$tmp[0]; my @pos=split/\-/,$inf[1]; my $chr=$inf[0]; $chr=~s/Chr0/Chr/; my $start=$pos[0]; my $end=$pos[1]; #=========Repeat============ my @repeat; if (defined(@{$repeat{$chr}})) { my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}}; for (my $i=0;$i<@r_array ;$i++) { if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) { push @repeat,$r_array[$i][2]; } elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){ push @repeat,$r_array[$i][2]; } elsif($end<$r_array[$i][0]){ last; } else{next;} } } my $repeat; if (@repeat==0) { $repeat="\/"; } else{ $repeat=join ";",@repeat; } #=========nat=============== my @nat1;#class 1 my @nat2;#class 2 my @nat;#nat ID #foreach my $chr (keys %nat) { my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}}; for (my $i=0;$i<@n_array ;$i++) { if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) { push @nat1,$n_array[$i][2]; push @nat2,$n_array[$i][3]; push @nat,$n_array[$i][4]; } elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){ push @nat1,$n_array[$i][2]; push @nat2,$n_array[$i][3]; push @nat,$n_array[$i][4]; } elsif($end<$n_array[$i][0]){ last; } else{next;} } #} my $nat1; my $nat2; my $nat; if (@nat1==0) { $nat1="\/"; } else{ $nat1=join ";",@nat1; } if (@nat2==0) { $nat2="\/"; } else{ $nat2=join ";",@nat2; } if (@nat==0) { $nat="\/"; } else{ $nat=join ";",@nat; } #========phase============== my $phase="\/"; if (defined($phase{$tmp[0]})) { $phase="phase"; } #=========long=============== my $long="\/"; if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) { $long="long"; } print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n"; } close IN; close OUT; sub nearby{ my @p=@_; my ($s,$e)=(0,0); if ($p[1]<$p[2]) { $s=$p[1]; $e=$p[2]; }else{ $s=$p[3]; $e=$p[0]; } return ($s,$e); } sub usage{ print <<"USAGE"; Version $version Usage: $0 -i -o -g -n -r -p -t -l options: -i input file -g gff file -n NATs file -r repeat file -p phase file -o output file -t nat output file -l genelist output file -h help USAGE exit(1); }