view genephys/extractgenesfromsegment.pl @ 0:448b10ffb095 draft

Uploaded
author mcharles
date Tue, 12 Aug 2014 05:49:33 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;

my $input_gene_file = $ARGV[0];
my $input_segment_file = $ARGV[1];
my $output_seq_nuc = $ARGV[2];
my $output_seq_prot = $ARGV[3];

open(IG, $input_gene_file)  or die("Can't open $input_gene_file\n");
open(IS, $input_segment_file)  or die("Can't open $input_segment_file\n");
open (ON,">$output_seq_nuc") or die ("Can't open $output_seq_nuc\n");
open (OP,">$output_seq_prot") or die ("Can't open $output_seq_prot\n");

my $current_annotation="";
my @gene_annotation;
my @list_gene;
my @current_gene;
my %current_gene_annotation;

# while ((my $line=<IG>)&&($#list_gene<5)){
while (my $line=<IG>){
	if ($line =~/\<Gene\>/){
		if (@current_gene){
			my %current_gene_annotation = %{&extract_annotation(\@current_gene)};

			
			push(@list_gene,\%current_gene_annotation);
			undef @current_gene;
		}
		push (@current_gene,$line);
		
	}
	else {
		push (@current_gene,$line);
	}
}
close(IG);

# for (my $i=0;$i<=$#list_gene;$i++){	
	# my %current_gene_annotation = %{$list_gene[$i]};
	# foreach my $key (keys %current_gene_annotation){
		# print "TEST ",$key,"\t",$current_gene_annotation{$key},"\n";
	# }
# }

# my @segment_chr;
# my @segment_start;
# my @segment_stop;

while (my $line=<IS>){
	print "\n$line";
	if ($line =~/(.*?)\:(\d+)\.\.(\d+)/){
		my $chr = $1;
		my $start = $2;
		my $stop = $3;
		
		my @list_gene_selected = @{&extract_gene_from_position($chr,$start,$stop,\@list_gene)};

		if ($#list_gene_selected>=0){
			for (my $i=0;$i<=$#list_gene_selected;$i++){
				my %current = %{$list_gene_selected[$i]},"\n";
				print $current{"00 BN_Id"},"\t",$current{"01 BN_Position"},"\t",$current{"02 ATH_Function"},"\t",$current{"03 ATH_Id"},"\n";
				
				my $seq = $current{"04 Sequence"};
				my $formated_seq;
				my @SEQ = split(//,$seq);
				my $compt_seq=0;
				for (my $i=0;$i<=$#SEQ;$i++){
					if ($SEQ[$i] =~ /[ATGNCXatgcnx]/){
						if ($compt_seq == 60){
							$formated_seq .="\n";
							$compt_seq=0;
						}
						$formated_seq.= $SEQ[$i];
						$compt_seq ++;
					}
				} 
				print ON ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_seq,"\n";
				
				my $prot = $current{"05 Protein"};
				my $formated_prot;
				my @PROT = split(//,$prot);
				my $compt_prot=0;
				for (my $i=0;$i<=$#PROT;$i++){
					if ($PROT[$i] =~ /[A-Za-z\*\+]/){
						if ($compt_prot == 60){
							$formated_prot .="\n";
							$compt_prot=0;
						}
						$formated_prot.= $PROT[$i];
						$compt_prot ++;
					}
				} 
				print OP ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_prot,"\n";
				
				# foreach my $key (sort keys %current){
					# print "   ",$key,"\t",$current{$key},"\n";
				# }
				# print "\n";
			}
		}
		else {
			print "   NO GENE FOUND\n";
		}
	}
	else {
		print "Error Parsing n°2 : $line\n";
	}
}

close (IS);

close (ON);
close (OP);


# my @list_gene_selected = @{&extract_gene_from_position("chrA01",1437,3000,\@list_gene)};


# for (my $i=0;$i<=$#list_gene_selected;$i++){
	# my %current = %{$list_gene_selected[$i]},"\n";
	# foreach my $key (keys %current){
		# print $key,"\t",$current{$key},"\n";
	# }
# }





	

sub extract_annotation{
	my $ref = shift;
	my @gene = @$ref;
	my %gene_annotation;
	for (my $i=0;$i<=$#gene;$i++){
		#print "TEST : $gene[$i]\n";
		if ($gene[$i]=~/\<Id\>(.*?)\<\/Id\>/){
			$gene_annotation{"00 BN_Id"} = $1;
		}
		elsif ($gene[$i]=~/\<Position\>(.*?)\<\/Position\>/){
			$gene_annotation{"01 BN_Position"} = $1;
		}
		elsif ($gene[$i]=~/\<ATH_Function\>(.*?)\<\/ATH_Function\>/){
			$gene_annotation{"02 ATH_Function"} = $1;
		}
		elsif ($gene[$i]=~/\<SId\>(.*?)\<\/SId\>/){
			$gene_annotation{"03 ATH_Id"} = $1;
		}
		elsif ($gene[$i]=~/\<CDS_Sequence\>(.*?)\<\/CDS_Sequence\>/){ #modif 1.11
			$gene_annotation{"04 Sequence"} = $1;
		}
		elsif ($gene[$i]=~/\<Protein\>(.*?)\<\/Protein\>/){
			$gene_annotation{"05 Protein"} = $1;
			# print "TEST : $1\n";
			# exit (0);
		}
	}
	if ((!$gene_annotation{"00 BN_Id"})||(!$gene_annotation{"01 BN_Position"})||(!$gene_annotation{"04 Sequence"})||(!$gene_annotation{"05 Protein"})){
		
		print "Erreur Parsing n°3\n";
		print "Id :",$gene_annotation{"00 BN_Id"},"\n";
		print "Position : ",$gene_annotation{"01 BN_Position"},"\n";
		print "ATH Function : ",$gene_annotation{"02 ATH_Function"},"\n";
		print "ATH Id : ",$gene_annotation{"03 ATH_Id"},"\n";
		print "CDS seq : ",$gene_annotation{"04 Sequence"},"\n";
		print "CDS prot : ",$gene_annotation{"05 Protein"},"\n";
		for (my $i=0;$i<=$#gene;$i++){
			print $gene[$i],"\n";
		}
		
		exit(0);
		
	}
	
	return \%gene_annotation;
}


sub extract_gene_from_position{
	my $chr = shift;
	my $start = shift;
	my $end = shift;
	
	my $ref = shift;
	my @list_gene = @$ref;
	my @list_gene_selected;
	
	for (my $i=0;$i<=$#list_gene;$i++){	
		my %current_gene_annotation = %{$list_gene[$i]};
		my $current_position = $current_gene_annotation{"01 BN_Position"};
		my $current_chr;
		my $current_start;
		my $current_end;
		
		#Extraction de la position
		if ($current_position =~ /^(.*?)\:(\d+)[\.]+(\d+)/){ # modif 1.11
			$current_chr = $1;
			$current_start = $2;
			$current_end = $3;
			if ($current_start > $current_end){
				($current_start,$current_end) = ($current_end,$current_start);
			}
		}
		else {
			print "Erreur Parsing n°1\npos : $current_position\n";
			exit(0);
		}
		#Test de selection
		if ($chr eq $current_chr){
			if (
				($current_end>=$start)&&($current_end<=$end) ||
				($current_start>=$start)&&($current_start<=$end)
			)
			{
				push(@list_gene_selected,$list_gene[$i]);
			}
		}
		
	}
	return \@list_gene_selected;
}