view Annotate.pl @ 9:e885b3d4444e draft

Uploaded
author big-tiandm
date Wed, 29 Oct 2014 04:18:03 -0400
parents 07745c0958dd
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Chentt
#Email: chentt@big.ac.cn
#Date: 2014/4/10
#Modified:
#Description: cluster annotate by priority
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h");
if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
&usage;
}
#my $genelistout=$opts{'t'};
my $dis=defined $opts{'d'}? $opts{'d'}:1000;
my %gene;

#open OUT,">$genelistout"; #output file  
#print OUT "#ID\tchr\tstart\tend\tstrand\ns";
open IN,"<$opts{g}";
while (my $aline=<IN>) {
	chomp $aline;
	next if($aline=~/^\#/);
	my @tmp=split/\t/,$aline;#ID chr start end strand
	#push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
	$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]];
}
#while (my $aline=<IN>) {
#	chomp $aline;
#	next if($aline=~/^\#/);
#	my @tmp=split/\t/,$aline;
#	my $ID;
#	if ($tmp[2] eq "gene") {
#		$tmp[0]=~s/Chr\./Chr/;
#		$tmp[0]=~s/Chr/chr/;
#		my @infor=split/;/,$tmp[8];
#		for (my $i=0;$i<@infor ;$i++) {
#			if ($infor[$i]=~/Alias\=(\S+)$/) {
#				$ID=$1;
#				last;
#			}
#		}
#		$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
#		print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
#	}
#}
close IN;
#close OUT;


my $filein=$opts{'i'};
my $fileout=$opts{'o'};

open IN,"<$filein"; #input file  
open OUT,">$fileout"; #output file  
while (my $aline=<IN>) {
	chomp $aline;
	my @tmp=split/\t/,$aline;
	if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next}
	my @result;
	#shift @tmp;
	my @id=split/:/,$tmp[0];
	$id[0]=~s/Chr0/Chr/;
	my @posi=split/-/,$id[1];
	foreach my $key (keys %{$gene{$id[0]}}) {
		if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) {
			push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}";
			next;
		}
		#if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) {
		if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) {
			push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
			push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
			next;
		}
		#if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) {
		if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) {
			push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
			push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
			next;
		}
	}
	my $result;
	if (!(@result)) {
		$result="intergenic";
	}
	elsif($#result==0){
		$result=$result[0];
	
	}
	else{
		$result=join "\t",@result;
	}
#	else{
#		my $te_num=0;
#		my @te_overlap;
#		my @te_up_down;
#		my @non_overlap;
#		my @non_up_down;
#		for (my $k=0;$k<@result ;$k++) {
#			my @rr=split/\;/,$result[$k];
#			if ($rr[3] eq "Y") {
#				$te_num++;
#				if ($rr[0] eq "overlap") {
#					push @te_overlap,$result[$k];
#				}
#				else{
#					push @te_up_down,$result[$k];
#				}
#			}
#			else{
#				if ($rr[0] eq "overlap") {
#					push @non_overlap,$result[$k];
#				}
#				else{
#					push @non_up_down,$result[$k];
#				}
#			}
#		}
#		if ($te_num==0) {#non TE 
#			if (!(@te_overlap)) {#down up
#				if ($#non_up_down==0) {
#					$result=$non_up_down[0];
#				}
#				else{#overlap
#					my $all_2=join "\t",@non_up_down;
#					$result="up&down1-kb\t".$all_2;
#				}
#			}
#			else{
#				$result=join "\t",@non_overlap;
#				if ($#non_overlap>=1) {
#					print "$aline\t$result\n";
#				}
#			}
#		}
#		else{#TE
#			if (!(@te_overlap)) {#down up
#				if ($#te_up_down==0) {
#					$result=$te_up_down[0];
#				}
#				else{#overlap
#					my $all_2=join "\t",@te_up_down;
#					$result="up&down1-kb\t".$all_2;
#				}
#			}
#			else{
#				$result=join "\t",@te_overlap;
#				if ($#te_overlap>=1) {
#					print "$aline\t$result\n";
#				}
#			}
#		}
#	}
	print OUT "$aline\t$result\n";
}

close IN;
close OUT;
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -g -d 
options:
-i input file
-g genelist file
-d int the length of the upstream and downstream,default 1000
-o output file
-h help
USAGE
exit(1);
}