diff Annotate.pl @ 0:07745c0958dd draft

Uploaded
author big-tiandm
date Thu, 18 Sep 2014 21:40:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Annotate.pl	Thu Sep 18 21:40:25 2014 -0400
@@ -0,0 +1,178 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/4/10
+#Modified:
+#Description: cluster annotate by priority
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+#my $genelistout=$opts{'t'};
+my $dis=defined $opts{'d'}? $opts{'d'}:1000;
+my %gene;
+
+#open OUT,">$genelistout"; #output file  
+#print OUT "#ID\tchr\tstart\tend\tstrand\ns";
+open IN,"<$opts{g}";
+while (my $aline=<IN>) {
+	chomp $aline;
+	next if($aline=~/^\#/);
+	my @tmp=split/\t/,$aline;#ID chr start end strand
+	#push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+	$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]];
+}
+#while (my $aline=<IN>) {
+#	chomp $aline;
+#	next if($aline=~/^\#/);
+#	my @tmp=split/\t/,$aline;
+#	my $ID;
+#	if ($tmp[2] eq "gene") {
+#		$tmp[0]=~s/Chr\./Chr/;
+#		$tmp[0]=~s/Chr/chr/;
+#		my @infor=split/;/,$tmp[8];
+#		for (my $i=0;$i<@infor ;$i++) {
+#			if ($infor[$i]=~/Alias\=(\S+)$/) {
+#				$ID=$1;
+#				last;
+#			}
+#		}
+#		$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+#		print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+#	}
+#}
+close IN;
+#close OUT;
+
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+	chomp $aline;
+	my @tmp=split/\t/,$aline;
+	if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next}
+	my @result;
+	#shift @tmp;
+	my @id=split/:/,$tmp[0];
+	$id[0]=~s/Chr0/Chr/;
+	my @posi=split/-/,$id[1];
+	foreach my $key (keys %{$gene{$id[0]}}) {
+		if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) {
+			push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}";
+			next;
+		}
+		#if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) {
+		if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) {
+			push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+			push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+			next;
+		}
+		#if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) {
+		if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) {
+			push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+			push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+			next;
+		}
+	}
+	my $result;
+	if (!(@result)) {
+		$result="intergenic";
+	}
+	elsif($#result==0){
+		$result=$result[0];
+	
+	}
+	else{
+		$result=join "\t",@result;
+	}
+#	else{
+#		my $te_num=0;
+#		my @te_overlap;
+#		my @te_up_down;
+#		my @non_overlap;
+#		my @non_up_down;
+#		for (my $k=0;$k<@result ;$k++) {
+#			my @rr=split/\;/,$result[$k];
+#			if ($rr[3] eq "Y") {
+#				$te_num++;
+#				if ($rr[0] eq "overlap") {
+#					push @te_overlap,$result[$k];
+#				}
+#				else{
+#					push @te_up_down,$result[$k];
+#				}
+#			}
+#			else{
+#				if ($rr[0] eq "overlap") {
+#					push @non_overlap,$result[$k];
+#				}
+#				else{
+#					push @non_up_down,$result[$k];
+#				}
+#			}
+#		}
+#		if ($te_num==0) {#non TE 
+#			if (!(@te_overlap)) {#down up
+#				if ($#non_up_down==0) {
+#					$result=$non_up_down[0];
+#				}
+#				else{#overlap
+#					my $all_2=join "\t",@non_up_down;
+#					$result="up&down1-kb\t".$all_2;
+#				}
+#			}
+#			else{
+#				$result=join "\t",@non_overlap;
+#				if ($#non_overlap>=1) {
+#					print "$aline\t$result\n";
+#				}
+#			}
+#		}
+#		else{#TE
+#			if (!(@te_overlap)) {#down up
+#				if ($#te_up_down==0) {
+#					$result=$te_up_down[0];
+#				}
+#				else{#overlap
+#					my $all_2=join "\t",@te_up_down;
+#					$result="up&down1-kb\t".$all_2;
+#				}
+#			}
+#			else{
+#				$result=join "\t",@te_overlap;
+#				if ($#te_overlap>=1) {
+#					print "$aline\t$result\n";
+#				}
+#			}
+#		}
+#	}
+	print OUT "$aline\t$result\n";
+}
+
+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -d 
+options:
+-i input file
+-g genelist file
+-d int the length of the upstream and downstream,default 1000
+-o output file
+-h help
+USAGE
+exit(1);
+}
+