Mercurial > repos > fgiacomoni > downloader_bank_hmdb

package hmdb_api ;

use strict;
use warnings ;
use Exporter ;
use Carp ;

use Data::Dumper ;
use XML::Twig ;

use csv ;

use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);

our $VERSION = "1.0";
our @ISA = qw(Exporter);
our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz);
our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] );

=head1 NAME

hmdb_api - parse HMDB metabolite data (metabocard flat files and XML) into Perl structures

=head1 SYNOPSIS

    use hmdb_api;
    my $api = hmdb_api->new();
    my ( $metabolites, $count ) = $api->getMetaboliteFeatures( $xmlFile );

=head1 DESCRIPTION

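This module reads HMDB metabolite data, either from a metabocard flat file
(cowmetdb_* methods) or from an XML file (getMetaboliteFeatures), computes
the [M+H]+ and [M-H]- masses from the monoisotopic mass and formats the
results as arrays of rows.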

=head1 METHODS

Methods are :

=head2 METHOD new

	## Description : new
	## Input : $self
	## Output : bless $self ;
	## Usage : new() ;

=cut

sub new {
    ## Constructor : returns an empty hash-based object.
    ## Input  : the invocant (class name or existing object); optional when
    ##          called as a plain function.
    ## Output : $self blessed into the invocant's class.
    my ( $class ) = @_ ;

    ## Two-argument bless so that subclasses get objects of their own class;
    ## fall back on this package when no invocant is given ( new() ).
    $class = ref($class) || $class || __PACKAGE__ ;

    my $self = {} ;
    return bless( $self, $class ) ;
}
### END of SUB


=head2 METHOD cowmetdb_handle

	## Description : open a flat file, load its lines in memory and count the metabocard entries.
	## Input : $flat
	## Output : $handler (array ref of lines), $entries (scalar ref to the entries count)
	## Usage : my ( $handler, $entries ) = cowmetdb_handle( $flat ) ;

=cut
## START of SUB
sub cowmetdb_handle {
    ## Load a metabocard flat file in memory and count its entries.
    ## Input  : $flat, path of the flat file.
    ## Output : ( \@handle, \$entries ) : the chomped lines of the file and a
    ##          reference to the number of complete entries.
    ## Dies   : croaks when the file does not exist, dies when it can't be opened.
    my $self = shift ;
    my ( $flat ) = @_ ;

    my @handle = () ;
    my $entries = 0 ;
    my $begin = 0 ; ## true while inside a #BEGIN_METABOCARD ... #END_METABOCARD pair

    if ( ( defined $flat ) and ( -e $flat ) ) {
        ## Three-argument open on a lexical handle : the file name is taken
        ## literally (no whitespace stripping, no mode injection) and no
        ## global FILE handle is clobbered.
        open( my $fh, '<', $flat ) or die "Can't read the file $flat: $!\n" ;
        while ( my $line = <$fh> ) {
            chomp $line ;
            push( @handle, $line ) ;
            if ( $line =~ /^#BEGIN_METABOCARD/ ) { $begin = 1 ; }
            ## an entry is counted only when its END marker follows a BEGIN marker
            elsif ( ( $line =~ /^#END_METABOCARD/ ) and ( $begin == 1 ) ) { $entries++ ; $begin = 0 ; }
        }
        close( $fh ) ;
    }
    else {
        my $name = ( defined $flat ) ? $flat : '' ;
        croak "Can't find the source file $name\n" ;
    }

    return( \@handle, \$entries ) ;
}
## END of SUB

=head2 METHOD cowmetdb_hash

	## Description : work on a hmdb flat text handler and field data (selected fields), build a hash for each found entry
	## Input : $handler
	## Output : $entries
	## Usage : my ( $entries ) = cowmetdb_hash( $handler ) ;

=cut
## START of SUB
sub cowmetdb_hash {
    ## Turn the lines of a metabocard flat file into one hash per entry.
    ## Input  : $handle, array ref of lines.
    ## Output : array ref of hash refs, one per #END_METABOCARD line met.
    ## Dies   : croaks when $handle is not defined.
    my ( $self, $handle ) = @_ ;

    croak "Handle is not defined : parsing step impossible\n" unless ( defined $handle ) ;

    ## flat file field label => key used in the entry hash
    my %field_key = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    my @cards = () ;
    my %card = () ;

    for my $idx ( 0 .. $#{$handle} ) {
        my $line = $handle->[$idx] ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            %card = () ; ## new entry : forget the previous fields
        }
        elsif ( $line =~ /^#END_METABOCARD/ ) {
            push( @cards, { %card } ) ; ## store a copy of the current fields
        }
        elsif ( ( $line =~ /^# ([a-z_]+):/ ) and ( exists $field_key{$1} ) ) {
            ## the value of a field is the line following its label
            $card{ $field_key{$1} } = $handle->[ $idx + 1 ] ;
        }
    }

    return( \@cards ) ;
}
## END of SUB


=head2 METHOD getMetaboliteFeatures

	## Description : get metabolites features from a xml file
	## Input : $xmlFile
	## Output : $metabolites (hash ref keyed by accession), $count (number of accessions found)
	## Usage : my ( $metabolites, $count ) = getMetaboliteFeatures( $xmlFile ) ;

=cut
sub getMetaboliteFeatures {
    ## Extract a few features of each metabolite found in a XML file.
    ## Input  : $xmlFile, path of the XML file.
    ## Output : ( \%metabolites, $X ) : a hash keyed by accession whose values
    ##          are hash refs of features (undef when no feature was found)
    ##          and the number of accessions.
    ## Note   : a missing file is not an error, an empty hash and 0 are returned.
    my $self = shift ;
    my ( $xmlFile ) = @_ ;

    my %metabolites = () ;
    my $twig = undef ;

    ## accession and features of the metabolite element currently being parsed
    my ( $id, $features ) = ( undef, undef ) ;

    ## child element of metabolite => key used in the features hash
    my %feature_of = (
        'name'                          => 'metabolite_name',
        'chemical_formula'              => 'chemical_formula',
        'monisotopic_molecular_weight'  => 'monisotopic_molecular_weight', ## general case
        'monisotopic_moleculate_weight' => 'monisotopic_molecular_weight', ## alternate spelling of the tag
        'inchikey'                      => 'inchikey',
    ) ;

    if ( ( defined $xmlFile ) and ( -e $xmlFile ) ) {

        my %handlers = (
            'metabolite/accession' => sub { $id = $_ -> text_only ; },
            ## Closing tag of a metabolite : all its children have been seen,
            ## whatever their order. Commit the record, reset the state so
            ## nothing leaks into the next metabolite, then release the parsed
            ## part of the document to keep memory usage bounded.
            'metabolite' => sub {
                my ( $t ) = @_ ;
                if ( defined $id ) { $metabolites{$id} = $features ; }
                ( $id, $features ) = ( undef, undef ) ;
                $t->purge ;
            },
        ) ;

        foreach my $tag ( keys %feature_of ) {
            my $key = $feature_of{$tag} ;
            $handlers{"metabolite/$tag"} = sub { $features->{$key} = $_ -> text_only ; } ;
        }

        $twig = XML::Twig->nparse_ppe(
            twig_handlers => \%handlers,
            pretty_print => 'indented',
            error_context => 1, $xmlFile
        );

        $twig->purge ;
    }

    ## get number of entries:
    my $X = keys %metabolites ;

    return (\%metabolites, $X) ;
}
### END of SUB

=head2 METHOD setMetaboliteAcurrateMzToModesMz

	## Description : set M+H and M-H masses from a metabolite (M) accurate mass
	## Input : $format ('XML' or 'CARD'), $metabolites, $proton_mass, $electron_mass, $charge
	## Output : $mzsMetabolites
	## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;

=cut
## START of SUB
sub setMetaboliteAcurrateMzToModesMz {
    ## Add the [M+H]+ and [M-H]- masses to each metabolite having a
    ## monoisotopic mass. The structure is modified in place.
    ## Input  : $format        'XML'  : $metabolites is a hash ref keyed by id,
    ##                                  mass read from 'monisotopic_molecular_weight',
    ##                                  results stored in '[M+H]+' and '[M-H]-'.
    ##                         'CARD' : $metabolites is an array ref of entries,
    ##                                  mass read from 'MZ_MONO',
    ##                                  results stored in 'MZ_[M+H]+' and 'MZ_[M-H]-'.
    ##                         any other value : nothing is computed.
    ##          $metabolites, $proton_mass, $electron_mass
    ##          $charge        multiplier applied to both masses, 1 when omitted.
    ## Output : $metabolites (same reference as the input).
    ## Warns  : for each metabolite without a (true) mass value.
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;

    ## Without this default an omitted charge multiplies every mass by undef
    ## and silently stores 0.
    $charge = 1 unless ( defined $charge ) ;
    $format = '' unless ( defined $format ) ;

    if ( $format eq 'XML' ) {
        foreach my $id ( sort keys %{$metabolites} ) {
            if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) {
                my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
                $metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
                $metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ( $format eq 'CARD' ) {
        foreach my $entry ( @{$metabolites} ) {
            if ( $entry->{'MZ_MONO'} ) {
                my $tmp_mass = $entry->{'MZ_MONO'} ;
                $entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
                $entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
            }
            else {
                warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
            }
        }
    }

    return ($metabolites) ;
}
### END of SUB

=head2 METHOD buildMetabolitesArray

	## Description : build a metabolite list from xml extraction
	## Input : $metabolites, $headers
	## Output : $metabolitesSorted
	## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ;

=cut
## START of SUB
sub buildMetabolitesArray {
    ## Flatten the metabolites hash into an array of rows.
    ## Input  : $metabolites, hash ref keyed by metabolite id.
    ##          $headers, optional array ref used as first row.
    ## Output : array ref of rows : the header row, then one row per
    ##          metabolite, ordered by sorted id.
    my ( $self, $metabolites, $headers ) = @_ ;

    ## features written after the id, in column order
    my @columns = (
        'monisotopic_molecular_weight',
        '[M+H]+',
        '[M-H]-',
        'metabolite_name',
        'chemical_formula',
        'inchikey',
    ) ;

    ## first row : the given headers or the default ones
    my $first_row = ( defined $headers )
        ? $headers
        : ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;

    my @rows = ( $first_row ) ;

    for my $accession ( sort keys %{$metabolites} ) {
        my @row = ( $accession, map { $metabolites->{$accession}{$_} } @columns ) ;
        push( @rows, \@row ) ;
    }

    return ( \@rows ) ;
}
### END of SUB

=head2 METHOD cowmetdb_hash_to_inhouse_format

	## Description : adapter from cowmetdb entry hashes to the in-house format
	## Input : $entries
	## Output : $tsv_handler
	## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ;

=cut
## START of SUB
sub cowmetdb_hash_to_inhouse_format {
    ## Convert entry hashes into rows with a fixed column order.
    ## Input  : $entries, array ref of entry hash refs.
    ## Output : array ref of rows : the field names first, then one row per
    ##          entry (undef for each field missing from the entry).
    my ( $self, $entries ) = @_ ;

    ## column order of the in-house format
    my @fields_name = (
        'HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA',
        'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-',
        'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC',
    ) ;

    my @rows = ( \@fields_name ) ; ## first line

    for my $card ( @{$entries} ) {
        my @row = map { $card->{$_} } @fields_name ;
        push( @rows, \@row ) ; ## one entry by one line
    }

    return( \@rows ) ;
}
## END of SUB


1 ;


__END__

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

 perldoc hmdb_api.pm

=head1 Exports

=over 4

=item :ALL is ...

=back

=head1 AUTHOR

Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 VERSION

version 1 : xx / xx / 201x

version 2 : ??

=cut
author	fgiacomoni
date	Tue, 21 Jan 2020 16:09:45 -0500
parents	7c9269bded0e
children	be504ccbc41c