Mercurial > repos > mcharles > genephys
view genephys/extractgenesfromsegment.pl @ 0:448b10ffb095 draft
Uploaded
author | mcharles |
---|---|
date | Tue, 12 Aug 2014 05:49:33 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; my $input_gene_file = $ARGV[0]; my $input_segment_file = $ARGV[1]; my $output_seq_nuc = $ARGV[2]; my $output_seq_prot = $ARGV[3]; open(IG, $input_gene_file) or die("Can't open $input_gene_file\n"); open(IS, $input_segment_file) or die("Can't open $input_segment_file\n"); open (ON,">$output_seq_nuc") or die ("Can't open $output_seq_nuc\n"); open (OP,">$output_seq_prot") or die ("Can't open $output_seq_prot\n"); my $current_annotation=""; my @gene_annotation; my @list_gene; my @current_gene; my %current_gene_annotation; # while ((my $line=<IG>)&&($#list_gene<5)){ while (my $line=<IG>){ if ($line =~/\<Gene\>/){ if (@current_gene){ my %current_gene_annotation = %{&extract_annotation(\@current_gene)}; push(@list_gene,\%current_gene_annotation); undef @current_gene; } push (@current_gene,$line); } else { push (@current_gene,$line); } } close(IG); # for (my $i=0;$i<=$#list_gene;$i++){ # my %current_gene_annotation = %{$list_gene[$i]}; # foreach my $key (keys %current_gene_annotation){ # print "TEST ",$key,"\t",$current_gene_annotation{$key},"\n"; # } # } # my @segment_chr; # my @segment_start; # my @segment_stop; while (my $line=<IS>){ print "\n$line"; if ($line =~/(.*?)\:(\d+)\.\.(\d+)/){ my $chr = $1; my $start = $2; my $stop = $3; my @list_gene_selected = @{&extract_gene_from_position($chr,$start,$stop,\@list_gene)}; if ($#list_gene_selected>=0){ for (my $i=0;$i<=$#list_gene_selected;$i++){ my %current = %{$list_gene_selected[$i]},"\n"; print $current{"00 BN_Id"},"\t",$current{"01 BN_Position"},"\t",$current{"02 ATH_Function"},"\t",$current{"03 ATH_Id"},"\n"; my $seq = $current{"04 Sequence"}; my $formated_seq; my @SEQ = split(//,$seq); my $compt_seq=0; for (my $i=0;$i<=$#SEQ;$i++){ if ($SEQ[$i] =~ /[ATGNCXatgcnx]/){ if ($compt_seq == 60){ $formated_seq .="\n"; $compt_seq=0; } $formated_seq.= $SEQ[$i]; $compt_seq ++; } } print ON ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_seq,"\n"; my $prot = $current{"05 Protein"}; my $formated_prot; my @PROT = split(//,$prot); my $compt_prot=0; for (my $i=0;$i<=$#PROT;$i++){ if ($PROT[$i] =~ /[A-Za-z\*\+]/){ if ($compt_prot == 60){ $formated_prot .="\n"; $compt_prot=0; } $formated_prot.= $PROT[$i]; $compt_prot ++; } } print OP ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_prot,"\n"; # foreach my $key (sort keys %current){ # print " ",$key,"\t",$current{$key},"\n"; # } # print "\n"; } } else { print " NO GENE FOUND\n"; } } else { print "Error Parsing n°2 : $line\n"; } } close (IS); close (ON); close (OP); # my @list_gene_selected = @{&extract_gene_from_position("chrA01",1437,3000,\@list_gene)}; # for (my $i=0;$i<=$#list_gene_selected;$i++){ # my %current = %{$list_gene_selected[$i]},"\n"; # foreach my $key (keys %current){ # print $key,"\t",$current{$key},"\n"; # } # } sub extract_annotation{ my $ref = shift; my @gene = @$ref; my %gene_annotation; for (my $i=0;$i<=$#gene;$i++){ #print "TEST : $gene[$i]\n"; if ($gene[$i]=~/\<Id\>(.*?)\<\/Id\>/){ $gene_annotation{"00 BN_Id"} = $1; } elsif ($gene[$i]=~/\<Position\>(.*?)\<\/Position\>/){ $gene_annotation{"01 BN_Position"} = $1; } elsif ($gene[$i]=~/\<ATH_Function\>(.*?)\<\/ATH_Function\>/){ $gene_annotation{"02 ATH_Function"} = $1; } elsif ($gene[$i]=~/\<SId\>(.*?)\<\/SId\>/){ $gene_annotation{"03 ATH_Id"} = $1; } elsif ($gene[$i]=~/\<CDS_Sequence\>(.*?)\<\/CDS_Sequence\>/){ #modif 1.11 $gene_annotation{"04 Sequence"} = $1; } elsif ($gene[$i]=~/\<Protein\>(.*?)\<\/Protein\>/){ $gene_annotation{"05 Protein"} = $1; # print "TEST : $1\n"; # exit (0); } } if ((!$gene_annotation{"00 BN_Id"})||(!$gene_annotation{"01 BN_Position"})||(!$gene_annotation{"04 Sequence"})||(!$gene_annotation{"05 Protein"})){ print "Erreur Parsing n°3\n"; print "Id :",$gene_annotation{"00 BN_Id"},"\n"; print "Position : ",$gene_annotation{"01 BN_Position"},"\n"; print "ATH Function : ",$gene_annotation{"02 ATH_Function"},"\n"; print "ATH Id : ",$gene_annotation{"03 ATH_Id"},"\n"; print "CDS seq : ",$gene_annotation{"04 Sequence"},"\n"; print "CDS prot : ",$gene_annotation{"05 Protein"},"\n"; for (my $i=0;$i<=$#gene;$i++){ print $gene[$i],"\n"; } exit(0); } return \%gene_annotation; } sub extract_gene_from_position{ my $chr = shift; my $start = shift; my $end = shift; my $ref = shift; my @list_gene = @$ref; my @list_gene_selected; for (my $i=0;$i<=$#list_gene;$i++){ my %current_gene_annotation = %{$list_gene[$i]}; my $current_position = $current_gene_annotation{"01 BN_Position"}; my $current_chr; my $current_start; my $current_end; #Extraction de la position if ($current_position =~ /^(.*?)\:(\d+)[\.]+(\d+)/){ # modif 1.11 $current_chr = $1; $current_start = $2; $current_end = $3; if ($current_start > $current_end){ ($current_start,$current_end) = ($current_end,$current_start); } } else { print "Erreur Parsing n°1\npos : $current_position\n"; exit(0); } #Test de selection if ($chr eq $current_chr){ if ( ($current_end>=$start)&&($current_end<=$end) || ($current_start>=$start)&&($current_start<=$end) ) { push(@list_gene_selected,$list_gene[$i]); } } } return \@list_gene_selected; }