downloader_bank_hmdb: lib/hmdb

comparison lib/hmdb_api.pm @ 0:7c9269bded0e draft

Init repository for [downloader_bank_hmdb]

author	fgiacomoni
date	Tue, 14 Jan 2020 05:21:23 -0500
parents
children	be504ccbc41c

comparison

equal deleted inserted replaced

--1:000000000000
+:7c9269bded0e
+package hmdb_api ;
+use strict;
+use warnings ;
+use Exporter ;
+use Carp ;
+use Data::Dumper ;
+use XML::Twig ;
+use csv ;
+use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);
+## Module version string.
+our $VERSION = "1.0";
+## Inherit from Exporter so that the lists below drive what callers import.
+our @ISA = ('Exporter');
+## Symbols exported by default.
+our @EXPORT = qw(
+	getMetaboliteFeatures
+	cowmetdb_handle
+	cowmetdb_hash
+	cowmetdb_hash_to_inhouse_format
+	buildMetabolitesArray
+	setMetaboliteAcurrateMzToModesMz
+);
+## The :ALL tag names exactly the same symbols as the default export list.
+our %EXPORT_TAGS = ( ALL => [ @EXPORT ] );
+=head1 NAME
+hmdb_api - parse HMDB metabolite data (MetaboCard flat files and XML) into Perl structures
+=head1 SYNOPSIS
+use hmdb_api ;
+my $api = hmdb_api->new() ;
+my ( $metabolites, $count ) = $api->getMetaboliteFeatures( $xmlFile ) ;
+=head1 DESCRIPTION
+This module reads HMDB metabolite records, either from a MetaboCard flat file
+or from an XML file, computes the [M+H]+ and [M-H]- masses of each metabolite
+and formats the result as arrays of rows.
+=head1 METHODS
+Methods are :
+=head2 METHOD new
+	## Description : new
+	## Input : $self
+	## Output : bless $self ;
+	## Usage : new() ;
+=cut
+sub new {
+	## Constructor : build an empty object.
+	## Input  : the invocant (class name or existing object) - optional.
+	## Output : a blessed reference to an empty hash.
+	my ( $invocant ) = @_ ;
+	## Two-argument bless keeps the invocant's class, so a subclass (or
+	## $object->new) gets an object of its own class. A plain new() call
+	## without invocant still falls back to this package.
+	my $class = ref($invocant) || $invocant || __PACKAGE__ ;
+	my $self = {} ;
+	bless( $self, $class ) ;
+	return $self ;
+}
+### END of SUB
+=head2 METHOD cowmetdb_handle
+	## Description : open a flat file, load its content in memory and count its entries.
+	## Input : $flat
+	## Output : $handler, $entries
+	## Usage : my ( $handler, $entries ) = cowmetdb_handle( $flat ) ;
+=cut
+## START of SUB
+sub cowmetdb_handle {
+	## Load a MetaboCard flat file in memory and count its complete entries.
+	## Input  : $flat - path of the flat file.
+	## Output : ( \@handle, \$entries ) - reference to the chomped lines of the
+	##          file and a SCALAR reference to the number of entries, an entry
+	##          being a #BEGIN_METABOCARD line later closed by #END_METABOCARD.
+	## Raises : croaks when the file is not given or not found, dies when it
+	##          cannot be opened.
+	my $self = shift ;
+	my ( $flat ) = @_ ;
+	my @handle = () ;
+	my $entries = 0 ;
+	my $in_card = 0 ; ## true between a #BEGIN_METABOCARD and its #END_METABOCARD
+
+	if ( !defined $flat ) {
+		croak "Can't find the source file : no file name given\n" ;
+	}
+	if ( !-e $flat ) {
+		croak "Can't find the source file $flat\n" ;
+	}
+	## Three-argument open with a lexical handle : the file name is never parsed
+	## for a mode or trimmed, and no global FILE handle is clobbered.
+	open( my $fh, '<', $flat ) or die "Cant' read the file $flat: $!\n" ;
+	while ( my $line = <$fh> ) {
+		chomp $line ;
+		push( @handle, $line ) ;
+		if ( $line =~ /^#BEGIN_METABOCARD/ ) {
+			$in_card = 1 ;
+		}
+		elsif ( ( $line =~ /^#END_METABOCARD/ ) and $in_card ) {
+			## count entries : an END line only counts when it closes a BEGIN
+			$entries++ ;
+			$in_card = 0 ;
+		}
+	}
+	close( $fh ) ;
+	return( \@handle, \$entries ) ;
+}
+## END of SUB
+=head2 METHOD cowmetdb_hash
+	## Description : work on a hmdb flat text handler and field data (selected fields), build a hash for each found entry
+	## Input : $handler
+	## Output : $entries
+	## Usage : my ( $entries ) = cowmetdb_hash( $handler ) ;
+=cut
+## START of SUB
+sub cowmetdb_hash {
+	## Turn the in-memory lines of a MetaboCard flat file into one hash per entry.
+	## Input  : $handle - array reference of the file lines.
+	## Output : array reference holding one hash reference for each
+	##          #END_METABOCARD line met.
+	## Raises : croaks when $handle is not defined.
+	my $self = shift ;
+	my ( $handle ) = @_ ;
+	croak "Handle is not defined : parsing step impossible\n" unless ( defined $handle ) ;
+
+	## Ordered rules : pattern of a field label line => key kept in the entry.
+	my @field_rules = (
+		[ qr/^# name:/,                 'COMMON_NAME' ],
+		[ qr/^# iupac:/,                'IUPAC' ],
+		[ qr/^# kegg_compound_id:/,     'KEGG_ID' ],
+		[ qr/^# chemical_formula:/,     'FORMULA' ],
+		[ qr/^# taxonomy_super_class:/, 'TAXONOMY' ],
+		[ qr/^# cas_number:/,           'CAS' ],
+		[ qr/^# biofluid_location:/,    'LOCATION' ],
+		[ qr/^# inchi_identifier:/,     'INCHI' ],
+		[ qr/^# weight_average:/,       'MZ_AVERAGE' ],
+		[ qr/^# weight_mono:/,          'MZ_MONO' ],
+		[ qr/^# biocyc_id:/,            'BIOCYC_ID' ],
+		[ qr/^# hmdb_id:/,              'HMDB_ID' ],
+	) ;
+
+	my @cards = () ;
+	my %current = () ;
+	for my $index ( 0 .. $#{$handle} ) {
+		my $line = $handle->[$index] ;
+		if ( $line =~ /^#BEGIN_METABOCARD/ ) {
+			## a new entry starts : forget the previous fields
+			%current = () ;
+		}
+		elsif ( $line =~ /^#END_METABOCARD/ ) {
+			## store a copy, %current is reused for the next entry
+			push( @cards, { %current } ) ;
+		}
+		else {
+			foreach my $rule ( @field_rules ) {
+				my ( $pattern, $key ) = @{$rule} ;
+				next unless ( $line =~ $pattern ) ;
+				## the value of a field is the line following its label
+				$current{$key} = $handle->[ $index + 1 ] ;
+				last ;
+			}
+		}
+	}
+	return( \@cards ) ;
+}
+## END of SUB
+=head2 METHOD getMetaboliteFeatures
+	## Description : get metabolites features from a xml file
+	## Input : $xmlFile,
+	## Output : $metabolites, $count (number of metabolites found)
+	## Usage : my ( $metabolites, $count ) = getMetaboliteFeatures($xmlFile) ;
+=cut
+sub getMetaboliteFeatures {
+	## Extract selected features of each <metabolite> element of a xml file.
+	## Input  : $xmlFile - path of the xml file.
+	## Output : ( \%metabolites, $X ) - hash keyed by the text of
+	##          metabolite/accession, holding the keys 'metabolite_name',
+	##          'chemical_formula', 'monisotopic_molecular_weight' and 'inchikey'
+	##          when the matching element exists, and the number of keys of
+	##          that hash.
+	##          A missing file gives an empty hash and 0, with a warning.
+	my $self = shift ;
+	my ( $xmlFile ) = @_ ;
+	my %metabolites = () ;
+	my $twig = undef ;
+	my $id = undef ; ## accession of the metabolite being read
+
+	if ( ( !defined $xmlFile ) or ( !-e $xmlFile ) ) {
+		## the returned values stay those of an empty extraction, but the
+		## caller is now told why nothing was read
+		my $shown = ( defined $xmlFile ) ? $xmlFile : '(undefined)' ;
+		carp "Can't find the xml file $shown : no metabolite extracted" ;
+		return ( \%metabolites, 0 ) ;
+	}
+
+	## child element of <metabolite> => key used in the returned hash
+	my %feature_of = (
+		'name'                          => 'metabolite_name',
+		'chemical_formula'              => 'chemical_formula',
+		'monisotopic_molecular_weight'  => 'monisotopic_molecular_weight', ## general case
+		'monisotopic_moleculate_weight' => 'monisotopic_molecular_weight', ## other spelling, same key
+		'inchikey'                      => 'inchikey',
+	) ;
+	my %handlers = (
+		'metabolite/accession' => sub { $id = $_ -> text_only ; $metabolites{$id} = undef ; },
+		## release each finished <metabolite> : without this the whole
+		## document stays in memory until the end of the parsing
+		'metabolite' => sub { $_[0] -> purge ; },
+	) ;
+	foreach my $tag ( keys %feature_of ) {
+		my $key = $feature_of{$tag} ; ## fresh lexical, captured by the closure
+		$handlers{"metabolite/$tag"} = sub { $metabolites{$id}{$key} = $_ -> text_only ; } ;
+	}
+
+	$twig = XML::Twig->nparse_ppe(
+		twig_handlers => \%handlers,
+		pretty_print => 'indented',
+		error_context => 1, $xmlFile
+	) ;
+	$twig->purge ;
+
+	## get number of entries:
+	my $X = keys %metabolites ;
+	return ( \%metabolites, $X ) ;
+}
+### END of SUB
+=head2 METHOD setMetaboliteAcurrateMzToModesMz
+	## Description : set M+H and M-H masses from a metabolite (M) accurate mass
+	## Input : $format ('XML' or 'CARD'), $metabolites, $proton_mass, $electron_mass, $charge
+	## Output : $mzsMetabolites
+	## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;
+=cut
+## START of SUB
+sub setMetaboliteAcurrateMzToModesMz {
+	## Add the M+H and M-H masses to each metabolite having a mono-isotopic mass.
+	## Input  : $format        - 'XML' ($metabolites is a hash reference keyed by
+	##                           id) or 'CARD' ($metabolites is an array
+	##                           reference of entry hashes),
+	##          $metabolites   - the structure to complete, modified in place,
+	##          $proton_mass, $electron_mass - masses used by the computation,
+	##          $charge        - multiplier applied to both masses, 1 when omitted.
+	## Output : $metabolites, the very reference received.
+	## Keys written : '[M+H]+' and '[M-H]-' (XML), 'MZ_[M+H]+' and 'MZ_[M-H]-' (CARD).
+	my $self = shift ;
+	my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;
+	## an omitted charge used to multiply every mass by undef, i.e. to store 0
+	$charge = 1 unless ( defined $charge ) ;
+	$format = '' unless ( defined $format ) ;
+
+	if ( $format eq 'XML' ) {
+		foreach my $id ( sort keys %{$metabolites} ) {
+			if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) {
+				my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
+				$metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
+				$metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
+			}
+			else {
+				warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
+			}
+		}
+	}
+	elsif ( $format eq 'CARD' ) {
+		foreach my $entry ( @{$metabolites} ) {
+			if ( $entry->{'MZ_MONO'} ) {
+				my $tmp_mass = $entry->{'MZ_MONO'} ;
+				$entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
+				$entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
+			}
+			else {
+				warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
+			}
+		}
+	}
+	else {
+		## the structure is returned untouched, as before, but no longer silently
+		carp "Unsupported format '$format' : no mass computed (XML or CARD expected)" ;
+	}
+	return ($metabolites) ;
+}
+### END of SUB
+=head2 METHOD buildMetabolitesArray
+	## Description : build a metabolite list from xml extraction
+	## Input : $metabolites, $headers
+	## Output : $metabolitesSorted
+	## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ;
+=cut
+## START of SUB
+sub buildMetabolitesArray {
+	## Flatten the metabolites hash into an array of rows, header row first.
+	## Input  : $metabolites - hash reference keyed by metabolite id,
+	##          $headers     - array reference used as first row; the default
+	##                         header row below is used when it is not defined.
+	## Output : reference to an array of array references, one row per id,
+	##          ids in sorted order.
+	my $self = shift ;
+	my ( $metabolites, $headers ) = @_;
+	## keys read for each metabolite, in column order after the id
+	my @columns = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;
+	my $first_row = ( defined $headers )
+		? $headers
+		: ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;
+	my @rows = ( $first_row ) ;
+	foreach my $id ( sort keys %{$metabolites} ) {
+		## a missing feature gives an undefined cell, the row keeps its width
+		push( @rows, [ $id, map { $metabolites->{$id}{$_} } @columns ] ) ;
+	}
+	return (\@rows) ;
+}
+### END of SUB
+=head2 METHOD cowmetdb_hash_to_inhouse_format
+	## Description : adapter from cowmetdb entry hashes to the inhouse format
+	## Input : $entries
+	## Output : $tsv_handler
+	## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ;
+=cut
+## START of SUB
+sub cowmetdb_hash_to_inhouse_format {
+	## Convert the entry hashes into rows following a fixed column order.
+	## Input  : $entries - array reference of entry hashes.
+	## Output : reference to an array of array references : the column names
+	##          first, then one row per entry.
+	my $self = shift ;
+	my ( $entries ) = @_ ;
+	## column names, also the keys read in each entry
+	my @columns = qw( HMDB_ID COMMON_NAME CAS FORMULA MZ_MONO MZ_AVERAGE MZ_[M+H]+ MZ_[M-H]- KEGG_ID BIOCYC_ID INCHI LOCATION TAXONOMY IUPAC ) ;
+	my @table = ( \@columns ) ; ## first line
+	foreach my $card ( @{$entries} ) {
+		## one entry by one line, a missing field gives an undefined cell
+		push( @table, [ map { $card->{$_} } @columns ] ) ;
+	}
+	return( \@table ) ;
+}
+## END of SUB
+1 ;
+__END__
+=head1 SUPPORT
+You can find documentation for this module with the perldoc command.
+perldoc hmdb_api.pm
+=head1 Exports
+=over 4
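+=item :ALL is getMetaboliteFeatures, cowmetdb_handle, cowmetdb_hash, cowmetdb_hash_to_inhouse_format, buildMetabolitesArray, setMetaboliteAcurrateMzToModesMz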
+=back
+=head1 AUTHOR
+Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
+=head1 LICENSE
+This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
+=head1 VERSION
+version 1 : xx / xx / 201x
+version 2 : ??
+=cut

Mercurial > repos > fgiacomoni > downloader_bank_hmdb

comparison lib/hmdb_api.pm @ 0:7c9269bded0e draft