diff ClassAnnotate.pl @ 0:07745c0958dd draft

Uploaded
author big-tiandm
date Thu, 18 Sep 2014 21:40:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ClassAnnotate.pl	Thu Sep 18 21:40:25 2014 -0400
@@ -0,0 +1,251 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chen Tingting
+#Email: chentt@big.ac.cn
+#Date: 2014/5/13
+#Modified:
+#Description: cluster annotate
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t}  and defined $opts{l}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+#my %gene;
+my %gene1;
+open IN,"<$opts{g}";
+open OUT ,">$opts{l}";
+print OUT "#ID\tchr\tstart\tend\tstrand\n";
+my $n=1;
+while (my $aline=<IN>) {
+	chomp $aline;
+	next if($aline=~/^\#/);
+	my @tmp=split/\t/,$aline;
+	my $ID;
+	if ($tmp[2] eq "gene") {
+		$tmp[0]=~s/Chr\./Chr/;
+		#$tmp[0]=~s/Chr/chr/;
+		my @infor=split/;/,$tmp[8];
+		for (my $i=0;$i<@infor ;$i++) {
+			if ($infor[$i]=~/Alias\=(\S+)$/) {
+				$ID=$1;
+				last;
+			}
+			else {
+				$ID="unknown$n";
+				$n++;
+			}
+		}
+		#$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+		push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+		print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+	}
+}
+#while (my $aline=<IN>) {
+#	chomp $aline;
+#	next if($aline=~/^\#/);
+#	my @tmp=split/\t/,$aline;#ID chr start end strand
+#	push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+#	#$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
+#}
+close IN;
+close OUT;
+
+my %nat;
+open TMP,">$opts{t}";
+print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
+
+open IN,"<$opts{n}";
+my $nat_n=1;
+while (my $aline=<IN>) {
+	next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
+	chomp $aline;
+	my @arr=split /\t/,$aline;
+	my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
+	if ($arr[11]=~/Nearby/) {
+		($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
+		push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
+		print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
+		$nat_n++;
+	}else{
+		$ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
+		$ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
+		$ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
+		$ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
+		push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
+		push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
+		print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
+		$nat_n++;
+	}
+}
+close IN;
+close TMP;
+
+my %repeat;
+open IN,"<$opts{'r'}";
+my $first=<IN>;
+$first=<IN>;
+$first=<IN>;
+while (my $aline=<IN>) {
+	chomp $aline;
+	$aline=~s/^\s+//;
+	my @tmp=split/\s+/,$aline;
+	$tmp[4]=~s/chr0/Chr/;
+	$tmp[4]=~s/chr/Chr/;
+	push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
+	#print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
+}
+close IN;
+
+my %phase;
+open IN,"<$opts{'p'}";
+while (my $aline=<IN>) {
+	chomp $aline;
+	next if($aline=~/^\#/);
+	my @tmp=split/\t/,$aline;
+	if ($tmp[5]>=25) {
+		$phase{$tmp[0]}=$tmp[5];
+	}
+}
+close IN;
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+	chomp $aline;
+	if($aline=~/^\#/){
+		print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
+		next;
+	}
+	my @tmp=split/\t/,$aline;
+	my @inf=split/\:/,$tmp[0];
+	my @pos=split/\-/,$inf[1];
+	my $chr=$inf[0];
+	$chr=~s/Chr0/Chr/;
+	my $start=$pos[0];
+	my $end=$pos[1];
+	#=========Repeat============
+	my @repeat;
+	if (defined(@{$repeat{$chr}})) {
+		my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
+		for (my $i=0;$i<@r_array ;$i++) {
+			if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
+				push @repeat,$r_array[$i][2];
+			}
+			elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
+				push @repeat,$r_array[$i][2];
+
+			}
+			elsif($end<$r_array[$i][0]){
+				last;
+			}
+			else{next;}
+		}
+	}
+	my $repeat;
+	if (@repeat==0) {
+		$repeat="\/";
+	}
+	else{
+		$repeat=join ";",@repeat;
+	}
+	#=========nat===============
+	my @nat1;#class 1
+	my @nat2;#class 2
+	my @nat;#nat ID
+	#foreach my $chr (keys %nat) {
+		my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
+		for (my $i=0;$i<@n_array ;$i++) {
+			if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
+				push @nat1,$n_array[$i][2];
+				push @nat2,$n_array[$i][3];
+				push @nat,$n_array[$i][4];
+			}
+			elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
+				push @nat1,$n_array[$i][2];
+				push @nat2,$n_array[$i][3];
+				push @nat,$n_array[$i][4];
+			}
+			elsif($end<$n_array[$i][0]){
+				last;
+			}
+			else{next;}
+		}
+	#}
+
+	my $nat1;
+	my $nat2;
+	my $nat;
+	if (@nat1==0) {
+		$nat1="\/";
+	}
+	else{
+		$nat1=join ";",@nat1;
+	}
+	if (@nat2==0) {
+		$nat2="\/";
+	}
+	else{
+		$nat2=join ";",@nat2;
+	}
+	if (@nat==0) {
+		$nat="\/";
+	}
+	else{
+		$nat=join ";",@nat;
+	}
+	#========phase==============
+	my $phase="\/";
+	if (defined($phase{$tmp[0]})) {
+		$phase="phase";
+	}
+	#=========long===============
+	my $long="\/";
+	if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
+		$long="long";
+	}
+	print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
+}
+
+close IN;
+close OUT;
+
+sub nearby{
+	my @p=@_;
+	my ($s,$e)=(0,0);
+	if ($p[1]<$p[2]) {
+		$s=$p[1];
+		$e=$p[2];
+	}else{
+		$s=$p[3];
+		$e=$p[0];
+		}
+	return ($s,$e);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -n -r -p -t -l
+options:
+-i input file
+	-g gff file
+	-n NATs file
+	-r repeat file
+	-p phase file
+-o output file
+-t nat output file
+-l genelist output file
+-h help
+USAGE
+exit(1);
+}
+