Mercurial > repos > mcharles > genephys
diff genephys/extractgenesfromsegment.pl @ 0:448b10ffb095 draft
Uploaded
author | mcharles |
---|---|
date | Tue, 12 Aug 2014 05:49:33 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenesfromsegment.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,223 @@ +#!/usr/bin/perl +use strict; + +my $input_gene_file = $ARGV[0]; +my $input_segment_file = $ARGV[1]; +my $output_seq_nuc = $ARGV[2]; +my $output_seq_prot = $ARGV[3]; + +open(IG, $input_gene_file) or die("Can't open $input_gene_file\n"); +open(IS, $input_segment_file) or die("Can't open $input_segment_file\n"); +open (ON,">$output_seq_nuc") or die ("Can't open $output_seq_nuc\n"); +open (OP,">$output_seq_prot") or die ("Can't open $output_seq_prot\n"); + +my $current_annotation=""; +my @gene_annotation; +my @list_gene; +my @current_gene; +my %current_gene_annotation; + +# while ((my $line=<IG>)&&($#list_gene<5)){ +while (my $line=<IG>){ + if ($line =~/\<Gene\>/){ + if (@current_gene){ + my %current_gene_annotation = %{&extract_annotation(\@current_gene)}; + + + push(@list_gene,\%current_gene_annotation); + undef @current_gene; + } + push (@current_gene,$line); + + } + else { + push (@current_gene,$line); + } +} +close(IG); + +# for (my $i=0;$i<=$#list_gene;$i++){ + # my %current_gene_annotation = %{$list_gene[$i]}; + # foreach my $key (keys %current_gene_annotation){ + # print "TEST ",$key,"\t",$current_gene_annotation{$key},"\n"; + # } +# } + +# my @segment_chr; +# my @segment_start; +# my @segment_stop; + +while (my $line=<IS>){ + print "\n$line"; + if ($line =~/(.*?)\:(\d+)\.\.(\d+)/){ + my $chr = $1; + my $start = $2; + my $stop = $3; + + my @list_gene_selected = @{&extract_gene_from_position($chr,$start,$stop,\@list_gene)}; + + if ($#list_gene_selected>=0){ + for (my $i=0;$i<=$#list_gene_selected;$i++){ + my %current = %{$list_gene_selected[$i]},"\n"; + print $current{"00 BN_Id"},"\t",$current{"01 BN_Position"},"\t",$current{"02 ATH_Function"},"\t",$current{"03 ATH_Id"},"\n"; + + my $seq = $current{"04 Sequence"}; + my $formated_seq; + my @SEQ = split(//,$seq); + my $compt_seq=0; + for (my $i=0;$i<=$#SEQ;$i++){ + if ($SEQ[$i] =~ /[ATGNCXatgcnx]/){ + if ($compt_seq == 60){ + $formated_seq .="\n"; + $compt_seq=0; + } + $formated_seq.= $SEQ[$i]; + $compt_seq ++; + } + } + print ON ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_seq,"\n"; + + my $prot = $current{"05 Protein"}; + my $formated_prot; + my @PROT = split(//,$prot); + my $compt_prot=0; + for (my $i=0;$i<=$#PROT;$i++){ + if ($PROT[$i] =~ /[A-Za-z\*\+]/){ + if ($compt_prot == 60){ + $formated_prot .="\n"; + $compt_prot=0; + } + $formated_prot.= $PROT[$i]; + $compt_prot ++; + } + } + print OP ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_prot,"\n"; + + # foreach my $key (sort keys %current){ + # print " ",$key,"\t",$current{$key},"\n"; + # } + # print "\n"; + } + } + else { + print " NO GENE FOUND\n"; + } + } + else { + print "Error Parsing n°2 : $line\n"; + } +} + +close (IS); + +close (ON); +close (OP); + + +# my @list_gene_selected = @{&extract_gene_from_position("chrA01",1437,3000,\@list_gene)}; + + +# for (my $i=0;$i<=$#list_gene_selected;$i++){ + # my %current = %{$list_gene_selected[$i]},"\n"; + # foreach my $key (keys %current){ + # print $key,"\t",$current{$key},"\n"; + # } +# } + + + + + + + +sub extract_annotation{ + my $ref = shift; + my @gene = @$ref; + my %gene_annotation; + for (my $i=0;$i<=$#gene;$i++){ + #print "TEST : $gene[$i]\n"; + if ($gene[$i]=~/\<Id\>(.*?)\<\/Id\>/){ + $gene_annotation{"00 BN_Id"} = $1; + } + elsif ($gene[$i]=~/\<Position\>(.*?)\<\/Position\>/){ + $gene_annotation{"01 BN_Position"} = $1; + } + elsif ($gene[$i]=~/\<ATH_Function\>(.*?)\<\/ATH_Function\>/){ + $gene_annotation{"02 ATH_Function"} = $1; + } + elsif ($gene[$i]=~/\<SId\>(.*?)\<\/SId\>/){ + $gene_annotation{"03 ATH_Id"} = $1; + } + elsif ($gene[$i]=~/\<CDS_Sequence\>(.*?)\<\/CDS_Sequence\>/){ #modif 1.11 + $gene_annotation{"04 Sequence"} = $1; + } + elsif ($gene[$i]=~/\<Protein\>(.*?)\<\/Protein\>/){ + $gene_annotation{"05 Protein"} = $1; + # print "TEST : $1\n"; + # exit (0); + } + } + if ((!$gene_annotation{"00 BN_Id"})||(!$gene_annotation{"01 BN_Position"})||(!$gene_annotation{"04 Sequence"})||(!$gene_annotation{"05 Protein"})){ + + print "Erreur Parsing n°3\n"; + print "Id :",$gene_annotation{"00 BN_Id"},"\n"; + print "Position : ",$gene_annotation{"01 BN_Position"},"\n"; + print "ATH Function : ",$gene_annotation{"02 ATH_Function"},"\n"; + print "ATH Id : ",$gene_annotation{"03 ATH_Id"},"\n"; + print "CDS seq : ",$gene_annotation{"04 Sequence"},"\n"; + print "CDS prot : ",$gene_annotation{"05 Protein"},"\n"; + for (my $i=0;$i<=$#gene;$i++){ + print $gene[$i],"\n"; + } + + exit(0); + + } + + return \%gene_annotation; +} + + +sub extract_gene_from_position{ + my $chr = shift; + my $start = shift; + my $end = shift; + + my $ref = shift; + my @list_gene = @$ref; + my @list_gene_selected; + + for (my $i=0;$i<=$#list_gene;$i++){ + my %current_gene_annotation = %{$list_gene[$i]}; + my $current_position = $current_gene_annotation{"01 BN_Position"}; + my $current_chr; + my $current_start; + my $current_end; + + #Extraction de la position + if ($current_position =~ /^(.*?)\:(\d+)[\.]+(\d+)/){ # modif 1.11 + $current_chr = $1; + $current_start = $2; + $current_end = $3; + if ($current_start > $current_end){ + ($current_start,$current_end) = ($current_end,$current_start); + } + } + else { + print "Erreur Parsing n°1\npos : $current_position\n"; + exit(0); + } + #Test de selection + if ($chr eq $current_chr){ + if ( + ($current_end>=$start)&&($current_end<=$end) || + ($current_start>=$start)&&($current_start<=$end) + ) + { + push(@list_gene_selected,$list_gene[$i]); + } + } + + } + return \@list_gene_selected; +} \ No newline at end of file