Mercurial > repos > fgiacomoni > downloader_bank_hmdb
diff lib/hmdb_api.pm @ 0:7c9269bded0e draft
Init repository for [downloader_bank_hmdb]
author | fgiacomoni |
---|---|
date | Tue, 14 Jan 2020 05:21:23 -0500 |
parents | |
children | be504ccbc41c |
line wrap: on
line diff
package hmdb_api ;

use strict;
use warnings ;
use Exporter ;
use Carp ;

use Data::Dumper ;
use XML::Twig ;

use csv ;

use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);

our $VERSION = "1.0";
our @ISA = qw(Exporter);
our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz);
our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] );

=head1 NAME

hmdb_api - read HMDB metabocard flat files and metabolite XML files

=head1 SYNOPSIS

    use hmdb_api ;
    my $api = hmdb_api->new() ;

    ## metabocard flat file
    my ( $handle, $entries_nb ) = $api->cowmetdb_handle( $flat ) ;
    my $entries = $api->cowmetdb_hash( $handle ) ;
    $entries = $api->setMetaboliteAcurrateMzToModesMz( 'CARD', $entries, $proton_mass, $electron_mass, $charge ) ;
    my $tsv = $api->cowmetdb_hash_to_inhouse_format( $entries ) ;

    ## metabolite XML file
    my ( $metabolites, $nb ) = $api->getMetaboliteFeatures( $xmlFile ) ;
    $metabolites = $api->setMetaboliteAcurrateMzToModesMz( 'XML', $metabolites, $proton_mass, $electron_mass, $charge ) ;
    my $rows = $api->buildMetabolitesArray( $metabolites, $headers ) ;

=head1 DESCRIPTION

This module loads metabolite records either from a flat "metabocard" text
file or from a metabolite XML file, adds [M+H]+ and [M-H]- masses computed
from the monoisotopic mass, and converts the records into arrays of rows.

All subs are written as methods: their first argument is the invocant.

=head1 METHODS

Methods are :

=head2 METHOD new

    ## Description : build a new (empty) hmdb_api object
    ## Input : none (the class name or an object is the invocant)
    ## Ouput : $self blessed into the invoking class
    ## Usage : my $api = hmdb_api->new() ;

=cut

sub new {
    ## Retrieve Values
    my $invocant = shift ;

    ## Bless into the caller's class so that subclasses get objects of their
    ## own type; fall back on this package when called as a plain function.
    my $class = ref($invocant) || $invocant || __PACKAGE__ ;

    my $self = {} ;
    bless( $self, $class ) ;
    return $self ;
}
### END of SUB


=head2 METHOD cowmetdb_handle

    ## Description : open a flat file and push its content (chomped lines) in memory - count the entries.
    ##               An entry is a #BEGIN_METABOCARD line followed later by a #END_METABOCARD line.
    ## Input : $flat (path of the flat file)
    ## Output : $handler (ref to the array of lines), $entries (ref to the number of entries)
    ## Dies : if the file does not exist or cannot be opened
    ## Usage : my ( $handler, $entries ) = cowmetdb_handle( $flat ) ;

=cut
## START of SUB
sub cowmetdb_handle {
    ## Retrieve Values
    my $self = shift ;
    my ( $flat ) = @_ ;

    my @handle = () ;
    my $entries = 0 ;
    my $in_card = 0 ; ## true between a BEGIN marker and its END marker

    unless ( ( defined $flat ) and ( -e $flat ) ) {
        ## keep the historical message, without "uninitialized" warnings
        my $shown = ( defined $flat ) ? $flat : '' ;
        croak "Can't find the source file $shown\n" ;
    }

    ## three-argument open + lexical handle: the file name is never parsed
    ## for a mode and the handle is closed even on an early exit.
    open( my $fh, '<', $flat ) or die "Cant' read the file $flat\n" ;
    while ( my $line = <$fh> ) {
        chomp $line ;
        push( @handle, $line ) ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            $in_card = 1 ;
        }
        elsif ( ( $in_card ) and ( $line =~ /^#END_METABOCARD/ ) ) {
            ## a complete card has been read: count it
            $entries++ ;
            $in_card = 0 ;
        }
    }
    close( $fh ) ;

    return( \@handle, \$entries ) ;
}
## END of SUB

=head2 METHOD cowmetdb_hash

    ## Description : work on a flat text handler (array of lines) and build one hash per entry
    ##               with the selected fields. The value of a field is the line that follows
    ##               its "# label:" line.
    ## Input : $handler (ref to an array of lines)
    ## Output : $entries (ref to an array of hash refs)
    ## Dies : if $handler is not defined
    ## Usage : my ( $entries ) = cowmetdb_hash( $handler ) ;

=cut
## START of SUB
sub cowmetdb_hash {
    ## Retrieve Values
    my $self = shift ;
    my ( $handle ) = @_ ;

    ## flat file label => key used in the entry hash
    my %key_of_label = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    if ( !defined $handle ) {
        croak "Handle is not defined : parsing step impossible\n" ;
    }

    my @entries = () ;
    my %entry = () ;

    foreach my $pos ( 0 .. $#{$handle} ) {
        my $data = $handle->[$pos] ;

        if ( $data =~ /^#BEGIN_METABOCARD/ ) {
            %entry = () ;
        }
        elsif ( $data =~ /^#END_METABOCARD/ ) {
            ## store a copy: %entry is reused for the next card
            my %temp = %entry ;
            push( @entries, \%temp ) ;
        }
        elsif ( ( $data =~ /^# ([a-z_]+):/ ) and ( exists $key_of_label{$1} ) ) {
            ## the value sits on the next line (undef after the last line)
            $entry{ $key_of_label{$1} } = $handle->[ $pos + 1 ] ;
        }
    }

    return( \@entries ) ;
}
## END of SUB



=head2 METHOD getMetaboliteFeatures

    ## Description : get metabolites features (name, formula, monoisotopic weight, inchikey)
    ##               from a xml file. Features are indexed by the metabolite accession.
    ##               When the file does not exist, an empty hash and 0 are returned.
    ## Input : $xmlFile
    ## Output : $metabolites (hash ref: accession => features hash), $X (number of accessions)
    ## Usage : my ( $metabolites, $X ) = getMetaboliteFeatures( $xmlFile ) ;

=cut
sub getMetaboliteFeatures {
    ## Retrieve Values
    my $self = shift ;
    my ( $xmlFile ) = @_ ;

    my %metabolites = () ;
    my $id = undef ; ## accession of the metabolite being parsed

    if ( ( defined $xmlFile ) and ( -e $xmlFile ) ) {

        ## element path => key used in the features hash.
        ## Both spellings of the weight element feed the same key.
        my %feature_of_path = (
            'metabolite/name'                          => 'metabolite_name',
            'metabolite/chemical_formula'              => 'chemical_formula',
            'metabolite/monisotopic_molecular_weight'  => 'monisotopic_molecular_weight', ## general case
            'metabolite/monisotopic_moleculate_weight' => 'monisotopic_molecular_weight',
            'metabolite/inchikey'                      => 'inchikey',
        ) ;

        my %handlers = (
            'metabolite/accession' => sub { $id = $_ -> text_only ; $metabolites{$id} = undef ; } ,
            ## a metabolite is complete: release the parsed elements so that
            ## memory use does not grow with the size of the file
            'metabolite' => sub { $_[0] -> purge ; 1 ; } ,
        ) ;

        foreach my $path ( keys %feature_of_path ) {
            my $feature = $feature_of_path{$path} ;
            $handlers{$path} = sub { $metabolites{$id}{$feature} = $_ -> text_only ; } ;
        }

        my $twig = XML::Twig->nparse_ppe(
            twig_handlers => \%handlers,
            pretty_print => 'indented',
            error_context => 1, $xmlFile
        );

        $twig->purge ;
    }

    ## get number of entries:
    my $X = keys %metabolites ;

    return (\%metabolites, $X) ;
}
### END of SUB

## Private helper: returns ( [M+H]+ , [M-H]- ) for one mass.
## The operation order is the historical one, so results are bit-identical.
sub _mz_for_modes {
    my ( $mass, $proton_mass, $electron_mass, $charge ) = @_ ;

    my $positive = ( $mass + $proton_mass - $electron_mass ) * $charge ;
    my $negative = ( $mass - $proton_mass + $electron_mass ) * $charge ;

    return ( $positive, $negative ) ;
}

=head2 METHOD setMetaboliteAcurrateMzToModesMz

    ## Description : set M+H and M-H masses from a metabolite (M) accurate mass:
    ##                   [M+H]+ = ( M + $proton_mass - $electron_mass ) * $charge
    ##                   [M-H]- = ( M - $proton_mass + $electron_mass ) * $charge
    ##               With format 'XML', $metabolites is a hash ref (as returned by getMetaboliteFeatures),
    ##               M is 'monisotopic_molecular_weight' and keys '[M+H]+' / '[M-H]-' are added.
    ##               With format 'CARD', $metabolites is an array ref (as returned by cowmetdb_hash),
    ##               M is 'MZ_MONO' and keys 'MZ_[M+H]+' / 'MZ_[M-H]-' are added.
    ##               Any other format leaves $metabolites untouched.
    ##               A warning is emitted for each metabolite without a (true) mass.
    ## Input : $format, $metabolites, $proton_mass, $electron_mass, $charge
    ## Output : $mzsMetabolites (the same reference as $metabolites, modified in place)
    ## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;

=cut
## START of SUB
sub setMetaboliteAcurrateMzToModesMz {
    ## Retrieve Values
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;

    ## an undefined format is handled like any unknown format
    $format = '' unless ( defined $format ) ;

    if ( $format eq 'XML' ) {
        foreach my $id ( sort keys %{$metabolites} ) {
            my $mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
            if ( $mass ) {
                ( $metabolites->{$id}{'[M+H]+'}, $metabolites->{$id}{'[M-H]-'} )
                    = _mz_for_modes( $mass, $proton_mass, $electron_mass, $charge ) ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ( $format eq 'CARD' ) {
        foreach my $entry ( @$metabolites ) {
            my $mass = $entry->{'MZ_MONO'} ;
            if ( $mass ) {
                ( $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'} )
                    = _mz_for_modes( $mass, $proton_mass, $electron_mass, $charge ) ;
            }
            else {
                warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
            }
        }
    }

    return ($metabolites) ;
}
### END of SUB

=head2 METHOD buildMetabolitesArray

    ## Description : build a metabolite list (array of rows) from xml extraction.
    ##               The first row is $headers when defined, else a default header row.
    ##               Following rows are sorted by metabolite id.
    ## Input : $metabolites, $headers
    ## Output : $metabolitesSorted
    ## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ;

=cut
## START of SUB
sub buildMetabolitesArray {
    ## Retrieve Values
    my $self = shift ;
    my ( $metabolites, $headers ) = @_;

    ## features written after the id, in column order
    my @features = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;

    ## default header format is ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey']
    my $first_row = ( defined $headers )
        ? $headers
        : ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;

    my @metabolitesSorted = ( $first_row ) ;

    foreach my $id ( sort keys %{$metabolites} ) {
        my @row = ( $id, map { $metabolites->{$id}{$_} } @features ) ;
        push ( @metabolitesSorted, \@row ) ;
    }

    return (\@metabolitesSorted) ;
}
### END of SUB

=head2 METHOD cowmetdb_hash_to_inhouse_format

    ## Description : adaptator from hash cowmetdb entries to inhouse format:
    ##               one row of field names, then one row of values by entry.
    ## Input : $entries (ref to an array of hash refs)
    ## Output : $tsv_handler (ref to an array of array refs)
    ## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ;

=cut
## START of SUB
sub cowmetdb_hash_to_inhouse_format {
    ## Retrieve Values
    my $self = shift ;
    my ( $entries ) = @_ ;

    my @fields_name = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ;

    my @tsv_handler = ( \@fields_name ) ; ## first line

    foreach my $entry ( @$entries ) {
        ## one entry by one line, values in the order of the field names
        my @row = map { $entry->{$_} } @fields_name ;
        push ( @tsv_handler, \@row ) ;
    }

    return(\@tsv_handler) ;
}
## END of SUB


1 ;


__END__

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc hmdb_api.pm

=head1 Exports

=over 4

=item :ALL is getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz

=back

=head1 AUTHOR

Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 VERSION

version 1 : xx / xx / 201x

version 2 : ??

=cut