Mercurial > repos > mcharles > genephys

diff genephys/extractgenesfromsegment.pl @ 0:448b10ffb095 draft
Uploaded
author: mcharles
date: Tue, 12 Aug 2014 05:49:33 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genephys/extractgenesfromsegment.pl	Tue Aug 12 05:49:33 2014 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/perl
+use strict;
+
+my $input_gene_file = $ARGV[0];
+my $input_segment_file = $ARGV[1];
+my $output_seq_nuc = $ARGV[2];
+my $output_seq_prot = $ARGV[3];
+
+open(IG, $input_gene_file)  or die("Can't open $input_gene_file\n");
+open(IS, $input_segment_file)  or die("Can't open $input_segment_file\n");
+open (ON,">$output_seq_nuc") or die ("Can't open $output_seq_nuc\n");
+open (OP,">$output_seq_prot") or die ("Can't open $output_seq_prot\n");
+
+my $current_annotation="";
+my @gene_annotation;
+my @list_gene;
+my @current_gene;
+my %current_gene_annotation;
+
+# while ((my $line=<IG>)&&($#list_gene<5)){
+while (my $line=<IG>){
+	if ($line =~/\<Gene\>/){
+		if (@current_gene){
+			my %current_gene_annotation = %{&extract_annotation(\@current_gene)};
+
+			
+			push(@list_gene,\%current_gene_annotation);
+			undef @current_gene;
+		}
+		push (@current_gene,$line);
+		
+	}
+	else {
+		push (@current_gene,$line);
+	}
+}
+close(IG);
+
+# for (my $i=0;$i<=$#list_gene;$i++){	
+	# my %current_gene_annotation = %{$list_gene[$i]};
+	# foreach my $key (keys %current_gene_annotation){
+		# print "TEST ",$key,"\t",$current_gene_annotation{$key},"\n";
+	# }
+# }
+
+# my @segment_chr;
+# my @segment_start;
+# my @segment_stop;
+
+while (my $line=<IS>){
+	print "\n$line";
+	if ($line =~/(.*?)\:(\d+)\.\.(\d+)/){
+		my $chr = $1;
+		my $start = $2;
+		my $stop = $3;
+		
+		my @list_gene_selected = @{&extract_gene_from_position($chr,$start,$stop,\@list_gene)};
+
+		if ($#list_gene_selected>=0){
+			for (my $i=0;$i<=$#list_gene_selected;$i++){
+				my %current = %{$list_gene_selected[$i]},"\n";
+				print $current{"00 BN_Id"},"\t",$current{"01 BN_Position"},"\t",$current{"02 ATH_Function"},"\t",$current{"03 ATH_Id"},"\n";
+				
+				my $seq = $current{"04 Sequence"};
+				my $formated_seq;
+				my @SEQ = split(//,$seq);
+				my $compt_seq=0;
+				for (my $i=0;$i<=$#SEQ;$i++){
+					if ($SEQ[$i] =~ /[ATGNCXatgcnx]/){
+						if ($compt_seq == 60){
+							$formated_seq .="\n";
+							$compt_seq=0;
+						}
+						$formated_seq.= $SEQ[$i];
+						$compt_seq ++;
+					}
+				} 
+				print ON ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_seq,"\n";
+				
+				my $prot = $current{"05 Protein"};
+				my $formated_prot;
+				my @PROT = split(//,$prot);
+				my $compt_prot=0;
+				for (my $i=0;$i<=$#PROT;$i++){
+					if ($PROT[$i] =~ /[A-Za-z\*\+]/){
+						if ($compt_prot == 60){
+							$formated_prot .="\n";
+							$compt_prot=0;
+						}
+						$formated_prot.= $PROT[$i];
+						$compt_prot ++;
+					}
+				} 
+				print OP ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_prot,"\n";
+				
+				# foreach my $key (sort keys %current){
+					# print "   ",$key,"\t",$current{$key},"\n";
+				# }
+				# print "\n";
+			}
+		}
+		else {
+			print "   NO GENE FOUND\n";
+		}
+	}
+	else {
+		print "Error Parsing n°2 : $line\n";
+	}
+}
+
+close (IS);
+
+close (ON);
+close (OP);
+
+
+# my @list_gene_selected = @{&extract_gene_from_position("chrA01",1437,3000,\@list_gene)};
+
+
+# for (my $i=0;$i<=$#list_gene_selected;$i++){
+	# my %current = %{$list_gene_selected[$i]},"\n";
+	# foreach my $key (keys %current){
+		# print $key,"\t",$current{$key},"\n";
+	# }
+# }
+
+
+
+
+
+	
+
+sub extract_annotation{
+	my $ref = shift;
+	my @gene = @$ref;
+	my %gene_annotation;
+	for (my $i=0;$i<=$#gene;$i++){
+		#print "TEST : $gene[$i]\n";
+		if ($gene[$i]=~/\<Id\>(.*?)\<\/Id\>/){
+			$gene_annotation{"00 BN_Id"} = $1;
+		}
+		elsif ($gene[$i]=~/\<Position\>(.*?)\<\/Position\>/){
+			$gene_annotation{"01 BN_Position"} = $1;
+		}
+		elsif ($gene[$i]=~/\<ATH_Function\>(.*?)\<\/ATH_Function\>/){
+			$gene_annotation{"02 ATH_Function"} = $1;
+		}
+		elsif ($gene[$i]=~/\<SId\>(.*?)\<\/SId\>/){
+			$gene_annotation{"03 ATH_Id"} = $1;
+		}
+		elsif ($gene[$i]=~/\<CDS_Sequence\>(.*?)\<\/CDS_Sequence\>/){ #modif 1.11
+			$gene_annotation{"04 Sequence"} = $1;
+		}
+		elsif ($gene[$i]=~/\<Protein\>(.*?)\<\/Protein\>/){
+			$gene_annotation{"05 Protein"} = $1;
+			# print "TEST : $1\n";
+			# exit (0);
+		}
+	}
+	if ((!$gene_annotation{"00 BN_Id"})||(!$gene_annotation{"01 BN_Position"})||(!$gene_annotation{"04 Sequence"})||(!$gene_annotation{"05 Protein"})){
+		
+		print "Erreur Parsing n°3\n";
+		print "Id :",$gene_annotation{"00 BN_Id"},"\n";
+		print "Position : ",$gene_annotation{"01 BN_Position"},"\n";
+		print "ATH Function : ",$gene_annotation{"02 ATH_Function"},"\n";
+		print "ATH Id : ",$gene_annotation{"03 ATH_Id"},"\n";
+		print "CDS seq : ",$gene_annotation{"04 Sequence"},"\n";
+		print "CDS prot : ",$gene_annotation{"05 Protein"},"\n";
+		for (my $i=0;$i<=$#gene;$i++){
+			print $gene[$i],"\n";
+		}
+		
+		exit(0);
+		
+	}
+	
+	return \%gene_annotation;
+}
+
+
+sub extract_gene_from_position{
+	my $chr = shift;
+	my $start = shift;
+	my $end = shift;
+	
+	my $ref = shift;
+	my @list_gene = @$ref;
+	my @list_gene_selected;
+	
+	for (my $i=0;$i<=$#list_gene;$i++){	
+		my %current_gene_annotation = %{$list_gene[$i]};
+		my $current_position = $current_gene_annotation{"01 BN_Position"};
+		my $current_chr;
+		my $current_start;
+		my $current_end;
+		
+		#Extraction de la position
+		if ($current_position =~ /^(.*?)\:(\d+)[\.]+(\d+)/){ # modif 1.11
+			$current_chr = $1;
+			$current_start = $2;
+			$current_end = $3;
+			if ($current_start > $current_end){
+				($current_start,$current_end) = ($current_end,$current_start);
+			}
+		}
+		else {
+			print "Erreur Parsing n°1\npos : $current_position\n";
+			exit(0);
+		}
+		#Test de selection
+		if ($chr eq $current_chr){
+			if (
+				($current_end>=$start)&&($current_end<=$end) ||
+				($current_start>=$start)&&($current_start<=$end)
+			)
+			{
+				push(@list_gene_selected,$list_gene[$i]);
+			}
+		}
+		
+	}
+	return \@list_gene_selected;
+}
\ No newline at end of file
author	mcharles
date	Tue, 12 Aug 2014 05:49:33 -0400
parents
children