Mercurial > repos > fgiacomoni > downloader_bank_hmdb
view lib/hmdb_api.pm @ 1:4373f936111d draft
" master branch Updating with tag :CI_COMMIT_TAG - - Fxx"
author | fgiacomoni |
---|---|
date | Tue, 21 Jan 2020 16:09:45 -0500 |
parents | 7c9269bded0e |
children | be504ccbc41c |
line wrap: on
line source
package hmdb_api ;

use strict;
use warnings ;
use Exporter ;
use Carp ;

use Data::Dumper ;
use XML::Twig ;
use csv ;

our $VERSION = "1.0";
our @ISA = qw(Exporter);
our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz);
our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] );

=head1 NAME

hmdb_api - read HMDB metabocard flat files and metabolite XML files

=head1 SYNOPSIS

    use hmdb_api ;

    my $api = hmdb_api->new() ;

    ## XML source
    my ( $metabolites, $count ) = $api->getMetaboliteFeatures( $xmlFile ) ;
    $metabolites = $api->setMetaboliteAcurrateMzToModesMz( 'XML', $metabolites, $proton_mass, $electron_mass, 1 ) ;
    my $rows = $api->buildMetabolitesArray( $metabolites ) ;

    ## metabocard flat file source
    my ( $handler, $entries_nb ) = $api->cowmetdb_handle( $flat ) ;
    my $entries = $api->cowmetdb_hash( $handler ) ;
    $entries = $api->setMetaboliteAcurrateMzToModesMz( 'CARD', $entries, $proton_mass, $electron_mass, 1 ) ;
    my $tsv = $api->cowmetdb_hash_to_inhouse_format( $entries ) ;

=head1 DESCRIPTION

Every sub of this module is written as a method : its first argument is
shifted off as C<$self> and never used afterwards. The subs extract a fixed
set of fields from metabocard flat files or from metabolite XML files,
compute the [M+H]+ and [M-H]- masses from the monoisotopic mass and build
arrays of rows (header row first).

=head1 METHODS

Methods are :

=head2 METHOD new

    ## Description : build a new hmdb_api object (an empty blessed hash)
    ## Input : none
    ## Output : $self, blessed into the class (or the class of the object) new() was called on
    ## Usage : my $api = hmdb_api->new() ;

=cut

sub new {
    my ( $class ) = @_ ;

    ## Two-argument bless so that subclasses get objects of their own class.
    ## Falls back on this package when new() is called as a plain function.
    $class = ref($class) || $class || __PACKAGE__ ;

    my $self = {} ;
    bless( $self, $class ) ;
    return $self ;
}
### END of SUB

=head2 METHOD cowmetdb_handle

    ## Description : open a flat file and push its content in memory - compute the number of entries.
    ## Input : $flat
    ## Output : $handler (ref on the array of chomped lines), $entries (ref on the SCALAR holding the number of entries)
    ## Usage : my ( $handler, $entries ) = cowmetdb_handle( $flat ) ; print $$entries ;
    ## Croaks : when $flat is undefined, does not exist or cannot be opened

=cut

## START of SUB
sub cowmetdb_handle {
    ## Retrieve Values
    my $self = shift ;
    my ( $flat ) = @_ ;

    if ( !defined $flat ) {
        croak "Can't find the source file : no file name given\n" ;
    }
    if ( !-e $flat ) {
        croak "Can't find the source file $flat\n" ;
    }

    ## Three-argument open on a lexical handle : the mode cannot be
    ## injected through the file name and the handle closes with its scope.
    open( my $fh, '<', $flat ) or croak "Can't read the file $flat: $!\n" ;

    my @handle = () ;
    my $entries = 0 ;
    my $in_card = 0 ; ## true between a #BEGIN_METABOCARD and its #END_METABOCARD

    while ( my $line = <$fh> ) {
        chomp $line ;
        push( @handle, $line ) ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            $in_card = 1 ;
        }
        elsif ( $in_card and ( $line =~ /^#END_METABOCARD/ ) ) {
            ## count entries : an END only counts when a BEGIN was seen before
            $entries++ ;
            $in_card = 0 ;
        }
    }
    close( $fh ) ;

    ## The counter is returned by reference (historical interface).
    return ( \@handle, \$entries ) ;
}
## END of SUB

=head2 METHOD cowmetdb_hash

    ## Description : work on a metabocard flat text handler and build a hash for each found entry (selected fields only).
    ##               The value of a field is the line that FOLLOWS its "# tag:" line.
    ## Input : $handler (ref on an array of lines, as returned by cowmetdb_handle)
    ## Output : $entries (ref on an array of hash refs)
    ## Usage : my ( $entries ) = cowmetdb_hash( $handler ) ;
    ## Croaks : when $handler is undefined

=cut

## START of SUB
sub cowmetdb_hash {
    ## Retrieve Values
    my $self = shift ;
    my ( $handle ) = @_ ;

    if ( !defined $handle ) {
        croak "Handle is not defined : parsing step impossible\n" ;
    }

    ## metabocard tag => key used in the entry hash
    my %key_for = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    my @entries = () ;
    my %entry = () ;

    foreach my $pos ( 0 .. $#{$handle} ) {
        my $data = $handle->[$pos] ;

        if ( $data =~ /^#BEGIN_METABOCARD/ ) {
            %entry = () ;
        }
        elsif ( $data =~ /^#END_METABOCARD/ ) {
            ## push a copy : %entry is reused for the next card
            push( @entries, { %entry } ) ;
        }
        elsif ( ( $data =~ /^# ([a-z_]+):/ ) and ( exists $key_for{$1} ) ) {
            ## undef when the tag is the very last line of the handler
            $entry{ $key_for{$1} } = $handle->[$pos+1] ;
        }
    }

    return ( \@entries ) ;
}
## END of SUB

=head2 METHOD getMetaboliteFeatures

    ## Description : get metabolites features (name, chemical formula, monoisotopic weight, inchikey) from a xml file
    ## Input : $xmlFile
    ## Output : $metabolites (ref on a hash indexed by accession), $X (number of accessions found)
    ## Usage : my ( $metabolites, $X ) = getMetaboliteFeatures( $xmlFile ) ;
    ## Note : a missing file is not fatal : a warning is emitted and an empty hash and 0 are returned

=cut

sub getMetaboliteFeatures {
    ## Retrieve Values
    my $self = shift ;
    my ( $xmlFile ) = @_ ;

    my %metabolites = () ;

    if ( ( !defined $xmlFile ) or ( !-e $xmlFile ) ) {
        carp "Can't find the xml file " . ( defined $xmlFile ? $xmlFile : '(undefined)' ) . " : no metabolite extracted\n" ;
        return ( \%metabolites, 0 ) ;
    }

    ## accession of the metabolite currently parsed, undef outside of one
    my $id = undef ;

    ## xml path => key used in the metabolite hash
    my %field_for = (
        'metabolite/name'                          => 'metabolite_name',
        'metabolite/chemical_formula'              => 'chemical_formula',
        'metabolite/monisotopic_molecular_weight'  => 'monisotopic_molecular_weight',
        ## misspelled tag, flagged as the "general case" by the original author
        'metabolite/monisotopic_moleculate_weight' => 'monisotopic_molecular_weight',
        'metabolite/inchikey'                      => 'inchikey',
    ) ;

    my %handlers = (
        'metabolite/accession' => sub {
            $id = $_[1]->text_only ;
            $metabolites{$id} = undef ;
            return 1 ;
        },
        ## End of a metabolite : forget its accession so that a following
        ## record without accession cannot overwrite it, and free the
        ## already-parsed part of the tree to keep memory bounded.
        'metabolite' => sub {
            $id = undef ;
            $_[0]->purge ;
            return 1 ;
        },
    ) ;

    foreach my $path ( keys %field_for ) {
        my $field = $field_for{$path} ;
        $handlers{$path} = sub {
            ## skip a field met before any accession (nothing to attach it to)
            $metabolites{$id}{$field} = $_[1]->text_only if ( defined $id ) ;
            return 1 ;
        } ;
    }

    my $twig = XML::Twig->nparse_ppe(
        twig_handlers => \%handlers,
        pretty_print  => 'indented',
        error_context => 1,
        $xmlFile
    ) ;
    $twig->purge ;

    ## get number of entries:
    my $X = keys %metabolites ;

    return ( \%metabolites, $X ) ;
}
### END of SUB

=head2 METHOD setMetaboliteAcurrateMzToModesMz

    ## Description : set M+H and M-H masses from a metabolite (M) accurate mass.
    ##               [M+H]+ = ( M + $proton_mass - $electron_mass ) * $charge
    ##               [M-H]- = ( M - $proton_mass + $electron_mass ) * $charge
    ## Input : $format ('XML' : hash from getMetaboliteFeatures / 'CARD' : array from cowmetdb_hash),
    ##         $metabolites, $proton_mass, $electron_mass, $charge (optional, 1 by default)
    ## Output : $mzsMetabolites (the very same structure, completed in place)
    ## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;
    ## Note : entries without monoisotopic mass are left untouched and a warning is emitted ;
    ##        an unknown format leaves the structure untouched and a warning is emitted

=cut

## START of SUB
sub setMetaboliteAcurrateMzToModesMz {
    ## Retrieve Values
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;

    ## An omitted charge used to multiply every mass by undef (i.e. 0).
    $charge = 1  unless ( defined $charge ) ;
    $format = '' unless ( defined $format ) ;

    if ( $format eq 'XML' ) {
        foreach my $id ( sort keys %{$metabolites} ) {
            my $mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
            if ( $mass ) {
                ( $metabolites->{$id}{'[M+H]+'}, $metabolites->{$id}{'[M-H]-'} )
                    = _mode_masses( $mass, $proton_mass, $electron_mass, $charge ) ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ( $format eq 'CARD' ) {
        foreach my $entry ( @{$metabolites} ) {
            my $mass = $entry->{'MZ_MONO'} ;
            if ( $mass ) {
                ( $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'} )
                    = _mode_masses( $mass, $proton_mass, $electron_mass, $charge ) ;
            }
            else {
                warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
            }
        }
    }
    else {
        carp "Unknown format '$format' (expected 'XML' or 'CARD') : no mass computed\n" ;
    }

    return ( $metabolites ) ;
}
### END of SUB

## Private helper : returns the ( [M+H]+, [M-H]- ) pair for one accurate mass.
sub _mode_masses {
    my ( $mass, $proton_mass, $electron_mass, $charge ) = @_ ;

    my $positive = ( $mass + $proton_mass - $electron_mass ) * $charge ;
    my $negative = ( $mass - $proton_mass + $electron_mass ) * $charge ;

    return ( $positive, $negative ) ;
}

=head2 METHOD buildMetabolitesArray

    ## Description : build a metabolite list (one row per accession, sorted by accession) from xml extraction
    ## Input : $metabolites, $headers (optional ref on an array of column names)
    ## Output : $metabolitesSorted (ref on an array of rows, the header row comes first)
    ## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ;
    ## Note : whatever $headers is, each row holds the same 7 values, in the order of the default header :
    ##        ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey']

=cut

## START of SUB
sub buildMetabolitesArray {
    ## Retrieve Values
    my $self = shift ;
    my ( $metabolites, $headers ) = @_;

    if ( !defined $headers ) {
        $headers = ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;
    }

    ## keys read for each metabolite, in column order (after the id)
    my @features = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;

    my @metabolitesSorted = ( $headers ) ;

    foreach my $id ( sort keys %{$metabolites} ) {
        ## an accession without any feature is stored as undef : read it
        ## through an empty hash instead of autovivifying the caller's data
        my $features = $metabolites->{$id} || {} ;
        push( @metabolitesSorted, [ $id, @{$features}{@features} ] ) ;
    }

    return ( \@metabolitesSorted ) ;
}
### END of SUB

=head2 METHOD cowmetdb_hash_to_inhouse_format

    ## Description : adapter from hash cowmetdb entries to the inhouse format (array of rows)
    ## Input : $entries (ref on an array of hash refs, as returned by cowmetdb_hash)
    ## Output : $tsv_handler (ref on an array of rows, the first row holds the field names)
    ## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ;

=cut

## START of SUB
sub cowmetdb_hash_to_inhouse_format {
    ## Retrieve Values
    my $self = shift ;
    my ( $entries ) = @_ ;

    my @fields_name = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ;

    my @tsv_handler = ( \@fields_name ) ; ## first line

    foreach my $entry ( @{$entries} ) {
        ## one entry by one line, columns in the order of @fields_name ;
        ## a missing field gives an undef cell
        push( @tsv_handler, [ @{$entry}{@fields_name} ] ) ;
    }

    return ( \@tsv_handler ) ;
}
## END of SUB

1 ;

__END__

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc hmdb_api.pm

=head1 Exports

=over 4

=item :ALL is getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz

=back

=head1 AUTHOR

Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 VERSION

version 1 : xx / xx / 201x

version 2 : ??

=cut