Mercurial > repos > fgiacomoni > hmdb_ms_search
view lib/hmdb.pm @ 23:2d8a310e86ce draft
Prod branch Updating with v.:CI_COMMIT_TAG- - Fxx
author | fgiacomoni |
---|---|
date | Thu, 19 May 2022 13:43:09 +0000 |
parents | 453fbe98925a |
children | d8e2ede293a6 |
line wrap: on
line source
package lib::hmdb ; use strict; use warnings ; use Exporter ; use Carp ; use LWP::Simple; use LWP::UserAgent; use WWW::Mechanize qw(); use URI::URL; use SOAP::Lite; use Encode qw(encode_utf8); use HTML::Template ; use XML::Twig ; use Text::CSV ; use Data::Dumper ; use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS); our $VERSION = "1.0"; our @ISA = qw(Exporter); our @EXPORT = qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass ); our %EXPORT_TAGS = ( ALL => [qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass )] ); =head1 NAME My::Module - An example module =head1 SYNOPSIS use My::Module; my $object = My::Module->new(); print $object->as_string; =head1 DESCRIPTION This module does not really exist, it was made for the sole purpose of demonstrating how POD works. =head1 METHODS Methods are : =head2 METHOD new ## Description : new ## Input : $self ## Ouput : bless $self ; ## Usage : new() ; =cut sub new { ## Variables my $self={}; bless($self) ; return $self ; } ### END of SUB =head2 METHOD extract_sub_mz_lists ## Description : extract a couples of sublist from a long mz list (more than $HMDB_LIMITS) ## Input : $HMDB_LIMITS, $masses ## Output : $sublists ## Usage : my ( $sublists ) = extract_sub_mz_lists( $HMDB_LIMITS, $masses ) ; =cut ## START of SUB sub extract_sub_mz_lists { ## Retrieve Values my $self = shift ; my ( $masses, $HMDB_LIMITS ) = @_ ; my ( @sublists, @sublist ) = ( (), () ) ; my $nb_mz = 0 ; my $nb_total_mzs = scalar(@{$masses}) ; if ($nb_total_mzs == 0) { die "The provided mzs list is empty" ; } for ( my $current_pos = 0 ; $current_pos < $nb_total_mzs ; $current_pos++ ) { if ( $nb_mz < $HMDB_LIMITS ) { if ( $masses->[$current_pos] ) { push (@sublist, $masses->[$current_pos]) ; $nb_mz++ ; } # build sub list } elsif ( $nb_mz == $HMDB_LIMITS ) { my @tmp = @sublist ; push (@sublists, \@tmp) ; @sublist = () ; $nb_mz = 0 ; $current_pos-- ; } if ($current_pos == $nb_total_mzs-1) { my @tmp = @sublist ; push (@sublists, \@tmp) ; } } return(\@sublists) ; } ## END of SUB =head2 METHOD prepare_multi_masses_query ## Description : Generate the adapted format of the mz list for HMDB ## Input : $masses ## Output : $hmdb_masses ## Usage : my ( $hmdb_masses ) = prepare_multi_masses_query( $masses ) ; =cut ## START of SUB sub prepare_multi_masses_query { ## Retrieve Values my $self = shift ; my ( $masses ) = @_ ; my $hmdb_masses = undef ; my $sep = '%0D%0A' ; ## retour chariot encode my ($nb_masses, $i) = (0, 0) ; if ( defined $masses ) { my @masses = @{$masses} ; my $nb_masses = scalar ( @masses ) ; if ( $nb_masses == 0 ) { croak "The input method parameter mass list is empty" ; } elsif ( $nb_masses >= 700 ) { croak "Your mass list is too long : HMDB allows maximum 700 query masses per request \n" ; } ## Del it --- temporary patch foreach my $mass (@masses) { if ($i < $nb_masses) { $hmdb_masses .= $mass.$sep ; } elsif ( $i == $nb_masses ) { $hmdb_masses .= $mass ; } else { last ; } $i ++ ; } } else { croak "No mass list found \n" ; } return($hmdb_masses, $nb_masses) ; } ## END of SUB =head2 METHOD prepareAdductListFormat ## Description : prepare a adduct list well formatted for https queries ## Input : $adductString ## Output : $formattedAdductString ## Usage : my ( $formattedAdductString ) = prepareAdductListFormat ( $adductString ) ; =cut ## START of SUB sub prepareAdductListFormat { ## Retrieve Values my $self = shift ; my ( $adductString ) = @_; my ( $formattedAdductString, $nbAdducts ) = ( undef, 0 ) ; ## Formatting is converting [+] in %2B, [-] as - and [,] in converted space as 'M%2BH%202M%2BH' for 'M+H,2M+2H' # print "\t$adductString ..." ; if (defined $adductString) { ## counting selected adducts $nbAdducts = scalar( my @adducts = ( split (/,/, $adductString) ) ) ; ## Converting string into http post format $adductString =~ s/\+/%2B/g ; $adductString =~ s/,/%20/g ; $formattedAdductString = $adductString ; } else { warn "\t[WARN]the adduct type is not defined...It will set to 'Unknown'\n" ; } # print "->$formattedAdductString\n" ; return ($formattedAdductString, $nbAdducts) ; } ### END of SUB =head2 METHOD test_matches_from_hmdb_ua DEPRECATED ## Description : [DEPRECATED]test a single query with tests parameters on hmdb - get the status of the complete server infra. ## Input : none ## Output : $status_line ## Usage : my ( $status_line ) = test_matches_from_hmdb_ua( ) ; =cut ## START of SUB sub test_matches_from_hmdb_ua { ## Retrieve Values my $self = shift ; my @page = () ; my $ua = new LWP::UserAgent; $ua->agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0"); my $url = 'http://specdb.wishartlab.com/ms/search.csv'; my $header = ['Connection' => 'keep-alive', 'Content-Type' => 'application/x-www-form-urlencoded', 'Referer' => 'http://www.hmdb.ca/spectra/ms/search', 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8']; my $req = HTTP::Request->new('POST', $url, $header); # print Dumper $req ; ## Query format for HMDB 4.0 #"utf8=✓&authenticity_token=K1Ys7oyMKmVNm9n8p0jiTxBlh4G4OO0cqKZYnQKDCw0pM6zmm/CiBxv+/cXhuRsVFV98LLeAMJRN5dCyhIWlAA==&query_masses=175.01 238.19 420.16 780.32 956.25 1100.45&ms_search_ion_mode=positive&adduct_type[]=Unknown&tolerance=0.05&tolerance_units=Da&commit=Search" $req->content('utf8=TRUE&mode=positive&query_masses=420.159317&tolerance=0.000001&database=HMDB&commit=Download Results As CSV'); # print Dumper $req ; my $res = $ua->request($req); print $res->as_string; my $status_line = $res->status_line ; ($status_line) = ($status_line =~ /(\d+)/); return (\$status_line) ; } ## END of SUB =head2 METHOD checkHmdbUrlEntries ## Description : check a list of HMDB_IDs by testing what is the status of their uri (HTTP 200 or not) ## Input : $urlRoot, $entries ## Output : $validEntries ## Usage : my ( $validEntries ) = checkHmdbUrlEntries ( $entries ) ; =cut ## START of SUB sub checkHmdbUrlEntries { ## Retrieve Values my $self = shift ; my ( $urlRoot, $clusters, $maxQuery ) = @_; my ( @ValidResults, @validFeature ) = ( (), () ) ; foreach my $resultsByMass (@{$clusters}) { my $currentQuery = 0 ; foreach my $feature (@{$resultsByMass}) { if ($feature->{ENTRY_ENTRY_ID}) { my $url = $urlRoot.$feature->{ENTRY_ENTRY_ID} ; if ( get($url)) { $currentQuery ++ ; if ($currentQuery > $maxQuery) { last ; } else { push (@validFeature, $feature) ; # print "Url is valid ($url) - - add to final results\n" ; } } else { # print "Url is NOT valid ($url) - - Deleting to final results\n" ; next ; } } } # END FOREACH FEATURE my @tmp = @validFeature ; push (@ValidResults, \@tmp) ; @validFeature = () ; } # END FOREACH RESULTS return (\@ValidResults) ; } ### END of SUB =head2 METHOD testMatchesFromHmdbWithUA ## Description : [DEPRECATED] test a single query with tests parameters on hmdb - get the status of the complete server infra. ## Input : none ## Output : $status_line ## Usage : my ( $status_line ) = testMatchesFromHmdbWithUA( ) ; =cut ## START of SUB sub testMatchesFromHmdbWithUA { ## Retrieve Values my $self = shift ; my @page = () ; #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and my $mech = WWW::Mechanize->new( # agent => 'wonderbot for W4M 1.01', agent => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:93.0) Gecko/20100101 Firefox/93.0' , autocheck => 0, ); my $statusGetLine = 0 ; my $statusPostLine = 0 ; my $csrftoken = undef ; #receiving cookies and authentication token (CFRS) my $reqInit = $mech->get("http://www.hmdb.ca/spectra/ms/search"); $statusGetLine = $mech->status() ; if ($statusGetLine == 200 ) { die 'no CSRF_REQUEST_TOKEN_VALUE in page found' unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; $csrftoken = $1; print "\nTOKEN: $csrftoken\n" ; $mech->add_header("X-CSRFToken", $csrftoken); $mech->add_header('Host', 'specdb.wishartlab.com'); $mech->add_header('Connection', 'keep-alive'); $mech->add_header('Upgrade-Insecure-Requests', '1'); $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); $mech->add_header('Accept-Language', 'en-US,en;q=0.5'); $mech->add_header('Accept-Encoding', 'gzip, deflate'); # $mech->add_header('Content-Length', "300"); $mech->add_header('Origin', 'null'); $mech->add_header('DNT', '1'); $mech->add_header('Referer', 'https://hmdb.ca/spectra/ms/search'); # $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'); } ## POST test my $top = 1 ; while ($statusPostLine != 200 ) { # Fix a limit at 3 tries... if ($top < 4) { print "\tTesting HMDB server connexion ($top time(s) )...\n" ; eval { $mech->post( "http://specdb.wishartlab.com/ms/search.csv", Content => 'utf8=TRUE&authenticity_token='.$csrftoken.'&mode=positive&adduct_type=M%2BH%202M%2BH&query_masses=125.0089&tolerance=0.001&database=HMDB&commit=Download Results As CSV' ); } ; # print Dumper $mech ; $statusPostLine = $mech->status() ; print "Status: $statusPostLine" ; } else { last ; } $top++ ; }## End While return (\$statusPostLine) ; } ## END of SUB =head2 METHOD testMatchesFromHmdb5WithUA ## Description : test a single query with tests parameters on hmdb - get the status of the complete server infra (API V5.0 compliant). ## Input : none ## Output : $status_line ## Usage : my ( $status_line ) = testMatchesFromHmdb5WithUA( ) ; =cut ## START of SUB sub testMatchesFromHmdb5WithUA { ## Retrieve Values my $self = shift ; my $mech = WWW::Mechanize->new( agent => 'wonderbot for W4M 3.0', autocheck => 1, timeout => 2400, ); my $statusGetLine = 0 ; my $statusPostLine = 0 ; #receiving cookies and authentication token (CFRS) my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search"); $statusGetLine = $mech->status() ; if ($statusGetLine == 200 ) { die 'no CSRF_REQUEST_TOKEN_VALUE in page found' unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; my $csrftoken = $1; # print "\nTOKEN: $csrftoken\n" ; $mech->add_header("X-CSRFToken", $csrftoken); $mech->add_header('Connection', 'keep-alive'); $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); $mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search'); $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); } ## POST test my $top = 1 ; while ($statusPostLine != 200 ) { # Fix a limit at 3 tries... if ($top < 4) { print "\tTesting HMDB server connexion ($top time(s) )...\n" ; eval { my $res = $mech->get( 'https://hmdb.ca/spectra/ms/generate_csv.csv?' .'results%5Baction%5D=search' .'&results%5Badduct_type%5D%5B%5D=M%2BH%202M%2BH' #.'&results%5Bauthenticity_token%5D=' .'&results%5Bccs_predictors%5D=' .'&results%5Bccs_tolerance%5D=' .'&results%5Bcommit%5D=Search' .'&results%5Bcontroller%5D=specdb%2Fms' .'&results%5Bms_search_ion_mode%5D=positive' .'&results%5Bquery_masses%5D=125.0089' .'&results%5Btolerance%5D=0.001' .'&results%5Btolerance_units%5D=Da' .'&results%5Butf8%5D=%E2%9C%93' ); } ; # print Dumper $mech ; $statusPostLine = $mech->status() ; print "Status: $statusPostLine" ; } else { last ; } $top++ ; }## End While return (\$statusPostLine) ; } ## END of SUB =head2 METHOD check_state_from_hmdb_ua ## Description : check the thhp status of hmdb and kill correctly the script if necessary. ## Input : $status ## Output : none ## Usage : check_state_from_hmdb_ua($status) ; =cut ## START of SUB sub check_state_from_hmdb_ua { ## Retrieve Values my $self = shift ; my ($status) = @_ ; if (!defined $$status) { croak "No https status is defined for the distant server" ; } else { unless ( $$status == 200 ) { if ( $$status == 502 ) { croak "Bad Gateway (502): The HMDB server, while acting as a gateway or proxy, received an invalid response from the upstream server. The Hmdb tool is stopped with error." ; } if ( $$status == 504 ) { croak "Gateway Timeout (504): The HMDB server was acting as a gateway or proxy and did not receive a timely response from the upstream server. The Hmdb tool is stopped with error." ; } if ( $$status == 500 ) { croak "Internal Server Error (500): The HMDB server returns an unexpected internal server error. The Hmdb tool is stopped with error." ; } else { ## None supported http code error ## croak "Internal Server Error $$status..." ; } } if ( $$status == 200 ) { print "\tThe HMDB server returns that your request (connexion test) was fulfilled\n" ; print "\tAll searches should be sent successfully to HMDB...(Set verbose to \"High\" for more information!)\n" ; } } return (1) ; } ## END of SUB =head2 METHOD get_matches_from_hmdb_ua DEPRECATED ## Description : [DEPRECATED]HMDB querying via an user agent with parameters : mz, delta and molecular species (neutral, pos, neg) ## Input : $mass, $delta, $mode ## Output : $results ## Usage : my ( $results ) = get_matches_from_hmdb( $mass, $delta, $mode ) ; =cut ## START of SUB sub get_matches_from_hmdb_ua { ## Retrieve Values my $self = shift ; my ( $masses, $delta, $mode ) = @_ ; ## Added May, 2022 warn "[DEPRECATED Methode] method get_matches_from_hmdb_ua is deprecated and not compatible with HMDB 4.0" ; return ([], 500) ; my @page = () ; my $ua = LWP::UserAgent->new( keep_alive => 10 ); $ua->agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0"); $ua->timeout(2400) ; # Cookies # my $cookie = new HTTP::Cookies( ignore_discard => 1 ); # $ua->cookie_jar( $cookie ); # my $req = HTTP::Request->new( # POST => 'http://specdb.wishartlab.com/ms/search.csv'); my $url = 'http://specdb.wishartlab.com/ms/search.csv'; my $header = ['Content-Type' => 'application/x-www-form-urlencoded', 'Referer' => 'http://www.hmdb.ca/spectra/ms/search', 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8']; my $data = {mode => $mode, query_masses => $masses, 'tolerance' => $delta, database => 'HMDB', commit => 'Download Results As CSV'}; my $encoded_data = encode_utf8($data); my $req = HTTP::Request->new('POST', $url, $header, $encoded_data); # $req->content_type('application/x-www-form-urlencoded'); $req->content('utf8=TRUE&mode='.$mode.'&query_masses='.$masses.'&tolerance='.$delta.'&database=HMDB&commit=Download Results As CSV'); # print Dumper $req ; my $res = $ua->request($req); my $status_line = undef ; print $res->as_string; if ($res->is_success) { @page = split ( /\n/, $res->decoded_content ) ; $status_line = 'OK' ; } else { $status_line = $res->status_line ; ($status_line) = ($status_line =~ /(\d+)/); warn "\t[HMDB service issue !! the server returned a $status_line HTTP error]" ; } return (\@page, $status_line) ; } ## END of SUB =head2 METHOD getMatchesFromHmdbWithUA ## Description : HMDB querying via an user agent with parameters : mz, delta and molecular species (neutral, pos, neg) ## Input : $mass, $delta, $mode, adducts ## Output : $results ## Usage : my ( $results ) = getMatchesFromHmdbWithUA( $mass, $delta, $mode ) ; =cut ## START of SUB sub getMatchesFromHmdbWithUA { ## Retrieve Values my $self = shift ; my ( $masses, $delta, $mode, $adducts ) = @_ ; ## Added May, 2022 warn "[DEPRECATED Methode] method getMatchesFromHmdbWithUA is deprecated and not compatible with HMDB 5.0" ; return ([], 500) ; my @page = () ; #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and my $mech = WWW::Mechanize->new( agent => 'wonderbot for W4M 1.01', autocheck => 1, timeout => 2400, ); my $statusGetLine = 0 ; my $statusPostLine = 0 ; #receiving cookies and authentication token (CFRS) my $reqInit = $mech->get("http://www.hmdb.ca/spectra/ms/search"); $statusGetLine = $mech->status() ; if ($statusGetLine == 200 ) { die 'no CSRF_REQUEST_TOKEN_VALUE in page found' unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; my $csrftoken = $1; # print "\nTOKEN: $csrftoken\n" ; $mech->add_header("X-CSRFToken", $csrftoken); $mech->add_header('Connection', 'keep-alive'); $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); $mech->add_header('Referer', 'http://www.hmdb.ca/spectra/ms/search'); $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); } ## adduct format is adduct_type=M%2BH%202M%2BH if ( (!defined $adducts) or ( $adducts eq '') ) { $adducts = 'Unknown' ; } my $res = $mech->post( "http://specdb.wishartlab.com/ms/search.csv", Content => 'utf8=TRUE&mode=' .$mode.'&adduct_type=' .$adducts.'&query_masses=' .$masses.'&tolerance=' .$delta.'&database=HMDB&commit=Download Results As CSV' ); if ($mech->success) { @page = split ( /\n/, $res->decoded_content ) ; $statusPostLine = 'OK' ; } else { $statusPostLine = $mech->status() ; warn "\t[HMDB service issue !! the server returned a $statusPostLine HTTP error]" ; } # print Dumper $res->decoded_content ; return (\@page, $statusPostLine) ; } ## END of SUB =head2 METHOD getMatchesFromHmdb5WithUA ## Description : HMDB v5.0 querying via an user agent with parameters : mz, delta and molecular species (neutral, pos, neg) ## Input : $mass, $delta, $mode, adducts ## Output : $results ## Usage : my ( $results ) = getMatchesFromHmdbWithUA( $mass, $delta, $mode ) ; =cut ## START of SUB sub getMatchesFromHmdb5WithUA { ## Retrieve Values my $self = shift ; my ( $masses, $delta, $mode, $adducts ) = @_ ; my @page = () ; #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and my $mech = WWW::Mechanize->new( agent => 'wonderbot for W4M 3.0', autocheck => 1, timeout => 2400, ); my $statusGetLine = 0 ; my $statusPostLine = 0 ; #receiving cookies and authentication token (CFRS) my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search"); $statusGetLine = $mech->status() ; if ($statusGetLine == 200 ) { die 'no CSRF_REQUEST_TOKEN_VALUE in page found' unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; my $csrftoken = $1; # print "\nTOKEN: $csrftoken\n" ; $mech->add_header("X-CSRFToken", $csrftoken); $mech->add_header('Connection', 'keep-alive'); $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); $mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search'); $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); } ## adduct format is adduct_type=M%2BH%202M%2BH if ( (!defined $adducts) or ( $adducts eq '') ) { $adducts = 'Unknown' ; } my $res = $mech->get( 'https://hmdb.ca/spectra/ms/generate_csv.csv?' .'results%5Baction%5D=search' .'&results%5Badduct_type%5D%5B%5D='.$adducts #.'&results%5Bauthenticity_token%5D=' .'&results%5Bccs_predictors%5D=' .'&results%5Bccs_tolerance%5D=' .'&results%5Bcommit%5D=Search' .'&results%5Bcontroller%5D=specdb%2Fms' .'&results%5Bms_search_ion_mode%5D='.$mode .'&results%5Bquery_masses%5D='.$masses .'&results%5Btolerance%5D='.$delta .'&results%5Btolerance_units%5D=Da' .'&results%5Butf8%5D=%E2%9C%93' ); $statusGetLine = $mech->status() ; if ($mech->success) { @page = split ( /\n/, $res->decoded_content ) ; $statusPostLine = 'OK' ; } else { $statusPostLine = $mech->status() ; warn "\t[HMDB service issue !! the server returned a $statusPostLine HTTP error]" ; } # print Dumper $res->decoded_content ; return (\@page, $statusPostLine) ; } ## END of SUB =head2 METHOD parseHmdb5CSVResults ## Description : parse the csv results and get data - API 5.0 compliant ## Input : $csv ## Output : $results ## Usage : my ( $results ) = parseHmdb5CSVResults( $csv ) ; =cut ## START of SUB sub parseHmdb5CSVResults { ## Retrieve Values my $self = shift ; my ( $csv, $masses, $max_query ) = @_ ; my $test = 0 ; my ($query_mass,$compound_id,$formula,$compound_mass,$adduct,$adduct_type,$adduct_mass,$delta) = (0, undef, undef, undef, undef, undef, undef, undef) ; my %result_by_entry = () ; my %features = () ; # print Dumper $csv ; # print Dumper $masses ; # print Dumper $max_query ; foreach my $line (@{$csv}) { ## NEW HMDB format V5.0 - May2022 if ($line !~ /query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m\/z,"delta\(ppm\),",ccs_value/) { #query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m/z,"delta(ppm),",ccs_value', if ( $line =~ /(\d+\.\d+),(\w+),(.*),(\w+),(\d+\.\d+),([\w|n\/a|\s+]+)\s*,(\+|\-),(\d+\.\d+),(\d+),(\d*)/ ) { print "$line\n" ; #if ( $line =~ /(\d+\.\d+),(\w+),(.*),([\w|n\/a|\s+]+)\s*,(\w+),(\d+\.\d+),(.*),(\+|\-),(\d+\.\d+),(\d+)/ ) { my @entry = ("$1","$2","$3","$4","$5","$6","$7","$8","$9","$10") ; if ( !exists $result_by_entry{$entry[0]} ) { $result_by_entry{$entry[0]} = [] ; } $features{ENTRY_ENTRY_ID} = $entry[1] ; $features{ENTRY_ENTRY_NAME} = $entry[2] ; $features{ENTRY_FORMULA} = $entry[3] ; $features{ENTRY_CPD_MZ} = $entry[4] ; $features{ENTRY_ADDUCT} = $entry[5] ; $features{ENTRY_ADDUCT_TYPE} = $entry[6] ; $features{ENTRY_ADDUCT_MZ} = $entry[7] ; $features{ENTRY_DELTA} = $entry[8] ; my %temp = %features ; push (@{$result_by_entry{$entry[0]} }, \%temp) ; } # elsif ($line =~ /(\d+\.\d+)/) { # # ## 288.082286511284,HMDB0002255,R-Methylmalonyl-CoA, C01213 ,C25H40N7O19P3S,867.131252359,M-3H,-,288.036475,159 # ## 283.108004472276,"Bicyclo_3,1,1heptane-2,3-diol,2,6,6_trimethyl","2,3-Pinanediol",n/a,C10H18O2,170.13067982,M+TFA-H,-,283.116266,29 # ## 174.034120330029,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M-H20-H,-,174.055503,123 # ## 193.139160745841,HMDB0012109,"7-[(1R,2R,3R,5S)-3,5-Dihydroxy-2-[(1E,3S)-3-hydroxyoct-1-en-1-yl]cyclopentyl]-5,6-dihydroxyheptanoic acid", C06475,C20H36O7,388.246103506,M-2H,-,193.115776,121 # ## 214.018826827064,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M+Na-2H,-,214.048559,139 # } # else { # # warn "The parsed line ($line) does not match your pattern\n " ; # } } else { print "Header detected...Parsing is starting...\n" ; next ; } } ## end foreach ## manage per query_mzs (keep query masses order by array) my @results = () ; foreach (@{$masses}) { if ($result_by_entry{$_}) { ## cut all entries > $max_query - all entries were already sorted...by hmdb my @temp_entries = @{$result_by_entry{$_}} ; my @temp_cut = () ; my $current_query = 0 ; foreach (@temp_entries) { $current_query ++ ; if ($current_query > $max_query) { last ; } else { push (@temp_cut, $_) ; } } push (@results, \@temp_cut) ; } else { push (@results, [] ) ; } ; } return(\@results) ; } ## END of SUB =head2 METHOD parse_hmdb_csv_results ## Description : [DEPRECATED] parse the csv results and get data ## Input : $csv ## Output : $results ## Usage : my ( $results ) = parse_hmdb_csv_results( $csv ) ; =cut ## START of SUB sub parse_hmdb_csv_results { ## Retrieve Values my $self = shift ; my ( $csv, $masses, $max_query ) = @_ ; my $test = 0 ; my ($query_mass,$compound_id,$formula,$compound_mass,$adduct,$adduct_type,$adduct_mass,$delta) = (0, undef, undef, undef, undef, undef, undef, undef) ; my %result_by_entry = () ; my %features = () ; # print Dumper $csv ; # print Dumper $masses ; # print Dumper $max_query ; foreach my $line (@{$csv}) { ## NEW HMDB format V4.0 - dec2017 if ($line !~ /query_mass,compound_id,compound_name,kegg_id,formula,monoisotopic_mass,adduct,adduct_type,adduct_m\/z,delta\(ppm\)/) { if ( $line =~ /(\d+\.\d+),(\w+),(.*),([\w|n\/a|\s+]+)\s*,(\w+),(\d+\.\d+),(.*),(\+|\-),(\d+\.\d+),(\d+)/ ) { my @entry = ("$1","$2","$3","$4","$5","$6","$7","$8","$9","$10") ; if ( !exists $result_by_entry{$entry[0]} ) { $result_by_entry{$entry[0]} = [] ; } $features{ENTRY_ENTRY_ID} = $entry[1] ; $features{ENTRY_ENTRY_NAME} = $entry[2] ; $features{ENTRY_FORMULA} = $entry[4] ; $features{ENTRY_CPD_MZ} = $entry[5] ; $features{ENTRY_ADDUCT} = $entry[6] ; $features{ENTRY_ADDUCT_TYPE} = $entry[7] ; $features{ENTRY_ADDUCT_MZ} = $entry[8] ; $features{ENTRY_DELTA} = $entry[9] ; my %temp = %features ; push (@{$result_by_entry{$entry[0]} }, \%temp) ; } # elsif ($line =~ /(\d+\.\d+)/) { # # ## 288.082286511284,HMDB0002255,R-Methylmalonyl-CoA, C01213 ,C25H40N7O19P3S,867.131252359,M-3H,-,288.036475,159 # ## 283.108004472276,"Bicyclo_3,1,1heptane-2,3-diol,2,6,6_trimethyl","2,3-Pinanediol",n/a,C10H18O2,170.13067982,M+TFA-H,-,283.116266,29 # ## 174.034120330029,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M-H20-H,-,174.055503,123 # ## 193.139160745841,HMDB0012109,"7-[(1R,2R,3R,5S)-3,5-Dihydroxy-2-[(1E,3S)-3-hydroxyoct-1-en-1-yl]cyclopentyl]-5,6-dihydroxyheptanoic acid", C06475,C20H36O7,388.246103506,M-2H,-,193.115776,121 # ## 214.018826827064,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M+Na-2H,-,214.048559,139 # } # else { # # warn "The parsed line ($line) does not match your pattern\n " ; # } } else { next ; } } ## end foreach ## manage per query_mzs (keep query masses order by array) my @results = () ; foreach (@{$masses}) { if ($result_by_entry{$_}) { ## cut all entries > $max_query - all entries were already sorted...by hmdb my @temp_entries = @{$result_by_entry{$_}} ; my @temp_cut = () ; my $current_query = 0 ; foreach (@temp_entries) { $current_query ++ ; if ($current_query > $max_query) { last ; } else { push (@temp_cut, $_) ; } } push (@results, \@temp_cut) ; } else { push (@results, [] ) ; } ; } return(\@results) ; } ## END of SUB =head2 METHOD parse_hmdb_page_results ## Description : [DEPRECATED] old HMDB html page parser ## Input : $page ## Output : $results ## Usage : my ( $results ) = parse_hmdb_page_result( $pages ) ; =cut ## START of SUB sub parse_hmdb_page_results { ## Retrieve Values my $self = shift ; my ( $page ) = @_ ; my @results = () ; my ($catch_table, $catch_name) = (0, 0) ; my ($name, $adduct, $adduct_mw, $cpd_mw, $delta) = (undef, undef, undef, undef, undef) ; if ( defined $page ) { my @page = @{$page} ; my $ID = undef ; my @result_by_mz = () ; my %result_by_entry = () ; foreach my $line (@page) { #Section de la page contenant les resultat if( $line =~/<table>/ ) { $catch_table = 1 ; } ## Si il existe un resultat : if($catch_table == 1) { #Id de la molecule, et creation du lien if( $line =~ /<a href=\"\/metabolites\/(\w+)\" (.*)>/ ) { $ID = $1 ; $catch_name = 0 ; next ; } #Nom de la molecule ONLY!! if ( $catch_name == 0 ) { if( $line =~ /<td>(.+)<\/td>/ ) { if ( !defined $name ) { $name = $1 ; $result_by_entry{'ENTRY_ENTRY_ID'} = $ID ; $result_by_entry{'ENTRY_NAME'} = $name ; next ; } if ( !defined $adduct ) { $adduct = $1 ; $result_by_entry{'ENTRY_ADDUCT'} = $adduct ; next ; } if ( !defined $adduct_mw ) { $adduct_mw = $1 ; $result_by_entry{'ENTRY_ADDUCT_MZ'} = $adduct_mw ; next ; } if ( !defined $cpd_mw ) { $cpd_mw = $1 ; $result_by_entry{'ENTRY_CPD_MZ'} = $cpd_mw ; next ; } if ( !defined $delta ) { $delta = $1 ; $result_by_entry{'ENTRY_DELTA'} = $delta ; $catch_name = 1 ; my %tmp = %result_by_entry ; push (@result_by_mz, \%tmp) ; %result_by_entry = () ; ( $name, $cpd_mw, $delta, $adduct, $adduct_mw ) = ( undef, undef, undef, undef, undef ) ; next ; } } } } #Fin de la section contenant les resultats if( $line =~ /<\/table>/ ) { $catch_table = 0 ; my @Tmp = @result_by_mz ; push(@results, \@Tmp) ; @result_by_mz = () ; } } } return(\@results) ; } ## END of SUB =head2 METHOD get_unik_ids_from_results ## Description : get all unik ids from the hmdb result object ## Input : $results ## Output : $ids, $idsNumber ## Usage : my ( $ids ) = get_unik_ids_from_results ( $results ) ; =cut ## START of SUB sub get_unik_ids_from_results { ## Retrieve Values my $self = shift ; my ( $results ) = @_; my ( %ids ) = ( () ) ; foreach my $result (@{$results}) { foreach my $entries (@{$result}) { if ( ($entries->{'ENTRY_ENTRY_ID'}) and ($entries->{'ENTRY_ENTRY_ID'} ne '' ) ) { $ids{$entries->{'ENTRY_ENTRY_ID'}} = 1 ; } } } my $idsNumber = keys %ids ; return (\%ids, $idsNumber) ; } ### END of SUB =head2 METHOD get_hmdb_metabocard_from_id ## Description : get a metabocard (xml format from an ID on HMDB) ## Input : $ids ## Output : $metabocard_features ## Usage : my ( $metabocard_features ) = get_hmdb_metabocard_from_id ( $ids ) ; =cut ## START of SUB sub get_hmdb_metabocard_from_id { ## Retrieve Values my $self = shift ; my ( $ids, $hmdb_url ) = @_; my ( %metabocard_features ) = ( () ) ; my $query = undef ; ## structure %metabocard_features # metabolite_id = ( # 'metabolite_name' => '__name__', # 'metabolite_inchi' => '__inchi__', # 'metabolite_logp' => '__logp-ALOGPS__', # # ) if( (defined $ids) and ($ids > 0 ) ) { foreach my $id (keys %{$ids}) { print "$id...\n" ; my $twig = undef ; if (defined $hmdb_url) { $query = $hmdb_url.$id.'.xml' ; ## test the header if exists my $response = head($query) ; if (!defined $response) { $metabocard_features{$id}{'STATUS'} = 'NOT_EXISTING' ; $metabocard_features{$id}{'metabolite_name'} = undef ; $metabocard_features{$id}{'metabolite_inchi'} = undef ; $metabocard_features{$id}{'metabolite_logp'} = undef ; ## Need to be improve to manage http 404 or other response diff than 200 } ## IF error elsif ( $response->is_success ) { $twig = XML::Twig->nparse_ppe( twig_handlers => { # metabolite name 'metabolite/name' => sub { $metabocard_features{$id}{'metabolite_name'} = $_ -> text_only ; $metabocard_features{$id}{'STATUS'} = 'EXISTING' ; } , # metabolite inchi 'metabolite/inchi' => sub { $metabocard_features{$id}{'metabolite_inchi'} = $_ -> text_only ; $metabocard_features{$id}{'STATUS'} = 'EXISTING' ;} , ## metabolite logP 'metabolite/predicted_properties/property' => sub { my ($kind, $source, $value ) = ( undef, undef, undef ) ; if (defined $_->children ) { foreach my $field ($_->children) { if ( $field->name eq 'kind') { $kind = $field->text ; } elsif ( $field->name eq 'source') { $source = $field->text ; } elsif ( $field->name eq 'value') { $value = $field->text ; } if (defined $source ) { if ( ( $kind eq 'logp' ) and ( $source eq 'ALOGPS' ) ) { $metabocard_features{$id}{'metabolite_logp'} = $value ; $metabocard_features{$id}{'STATUS'} = 'EXISTING' ; } ($kind, $source, $value ) = ( undef, undef, undef ) ; } } } } }, pretty_print => 'indented', error_context => 1, $query ); # $twig->print; $twig->purge ; # if (!$@) { # # } # else { # warn $@ ; # } } ## ELSIF success } # END if defined URL else { warn "\tThe hmdb metabocard url is not defined\n" ; last; } } } ## End IF defined ids else { warn "The HMDB ids list from HMDB is empty - No metabocard found\n" ; } # print Dumper %metabocard_features ; return (\%metabocard_features) ; } ### END of SUB =head2 METHOD map_suppl_data_on_hmdb_results ## Description : map supplementary data with already collected results with hmdb search - delete the entry if hmdb card doesn't exist... ## Input : $results, $features ## Output : $results ## Usage : my ( $results ) = map_suppl_data_on_hmdb_results ( $results, $features ) ; =cut ## START of SUB sub map_suppl_data_on_hmdb_results { ## Retrieve Values my $self = shift ; my ( $results, $features ) = @_; my ( @moreResults ) = ( () ) ; foreach my $result (@{$results}) { my @newResult = () ; foreach my $entry (@{$result}) { if ( ($entry->{'ENTRY_ENTRY_ID'}) and ($entry->{'ENTRY_ENTRY_ID'} ne '' ) ) { my $current_id = $entry->{'ENTRY_ENTRY_ID'} ; my $newCompletedEntry = $entry ; ## If the id exists in feature hash and its status is not NOT_EXISTING if ( ($features->{"$current_id"} ) and ( $features->{"$current_id"}{STATUS} eq 'EXISTING' ) ) { ## Metabolite NAME if (defined $features->{"$current_id"}{'metabolite_name'} ) { $newCompletedEntry->{'ENTRY_ENTRY_NAME'} = $features->{"$current_id"}{'metabolite_name'} } else { $newCompletedEntry->{'ENTRY_ENTRY_NAME'} = 'UNKNOWN' ; } ## Metabolite INCHI if (defined $features->{"$current_id"}{'metabolite_inchi'} ) { $newCompletedEntry->{'ENTRY_ENTRY_INCHI'} = $features->{"$current_id"}{'metabolite_inchi'} } else { $newCompletedEntry->{'ENTRY_ENTRY_INCHI'} = 'NA' ; } ## Metabolite LOGP if (defined $features->{"$current_id"}{'metabolite_logp'} ) { $newCompletedEntry->{'ENTRY_ENTRY_LOGP'} = $features->{"$current_id"}{'metabolite_logp'} } else { $newCompletedEntry->{'ENTRY_ENTRY_LOGP'} = 'NA' ; } push (@newResult, $newCompletedEntry) ; } elsif ( ($features->{"$current_id"} ) and ( $features->{"$current_id"}{STATUS} eq 'NOT_EXISTING' ) ) { $newCompletedEntry = undef ; next ; } ## In cas no features are given else { $newCompletedEntry->{'ENTRY_ENTRY_INCHI'} = 'NONEDATA' ; $newCompletedEntry->{'ENTRY_ENTRY_LOGP'} = 'NONEDATA' ; push (@newResult, $newCompletedEntry) ; } } } ## END FOREACH ENTRIES push (@moreResults, \@newResult) ; } ## END FOREACH RESULT return (\@moreResults) ; } ### END of SUB =head2 METHOD set_html_tbody_object ## Description : initializes and build the tbody object (perl array) needed to html template ## Input : $nb_pages, $nb_items_per_page ## Output : $tbody_object ## Usage : my ( $tbody_object ) = set_html_tbody_object($nb_pages, $nb_items_per_page) ; =cut ## START of SUB sub set_html_tbody_object { my $self = shift ; my ( $nb_pages, $nb_items_per_page ) = @_ ; my ( @tbody_object ) = ( ) ; for ( my $i = 1 ; $i <= $nb_pages ; $i++ ) { my %pages = ( # tbody feature PAGE_NB => $i, MASSES => [], ## end MASSES ) ; ## end TBODY N push (@tbody_object, \%pages) ; } return(\@tbody_object) ; } ## END of SUB =head2 METHOD add_mz_to_tbody_object ## Description : initializes and build the mz object (perl array) needed to html template ## Input : $tbody_object, $nb_items_per_page, $mz_list ## Output : $tbody_object ## Usage : my ( $tbody_object ) = add_mz_to_tbody_object( $tbody_object, $nb_items_per_page, $mz_list ) ; =cut ## START of SUB sub add_mz_to_tbody_object { my $self = shift ; my ( $tbody_object, $nb_items_per_page, $mz_list, $ids_list ) = @_ ; my ( $current_page, $mz_index ) = ( 0, 0 ) ; foreach my $page ( @{$tbody_object} ) { my @colors = ('white', 'green') ; my ( $current_index, , $icolor ) = ( 0, 0 ) ; for ( my $i = 1 ; $i <= $nb_items_per_page ; $i++ ) { # if ( $current_index > $nb_items_per_page ) { ## manage exact mz per html page $current_index = 0 ; last ; ## } else { $current_index++ ; if ( $icolor > 1 ) { $icolor = 0 ; } if ( exists $mz_list->[$mz_index] ) { my %mz = ( # mass feature MASSES_ID_QUERY => $ids_list->[$mz_index], MASSES_MZ_QUERY => $mz_list->[$mz_index], MZ_COLOR => $colors[$icolor], MASSES_NB => $mz_index+1, ENTRIES => [] , ) ; push ( @{ $tbody_object->[$current_page]{MASSES} }, \%mz ) ; # Html attr for mass $icolor++ ; } } $mz_index++ ; } ## foreach mz $current_page++ ; } return($tbody_object) ; } ## END of SUB =head2 METHOD add_entries_to_tbody_object ## Description : initializes and build the entries object (perl array) needed to html template ## Input : $tbody_object, $nb_items_per_page, $mz_list, $entries ## Output : $tbody_object ## Usage : my ( $tbody_object ) = add_entries_to_tbody_object( $tbody_object, $nb_items_per_page, $mz_list, $entries ) ; =cut ## START of SUB sub add_entries_to_tbody_object { ## Retrieve Values my $self = shift ; my ( $tbody_object, $nb_items_per_page, $mz_list, $entries ) = @_ ; my $index_page = 0 ; my $index_mz_continous = 0 ; foreach my $page (@{$tbody_object}) { my $index_mz = 0 ; foreach my $mz (@{ $tbody_object->[$index_page]{MASSES} }) { my $index_entry = 0 ; my @anti_redondant = ('N/A') ; my $check_rebond = 0 ; my $check_noentry = 0 ; foreach my $entry (@{ $entries->[$index_mz_continous] }) { $check_noentry ++ ; ## dispo anti doublons des entries foreach my $rebond (@anti_redondant) { if ( $rebond eq $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_ID} ) { $check_rebond = 1 ; last ; } } if ( $check_rebond == 0 ) { push ( @anti_redondant, $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_ID} ) ; my %entry = ( ENTRY_COLOR => $tbody_object->[$index_page]{MASSES}[$index_mz]{MZ_COLOR}, ENTRY_ENTRY_NAME => $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_NAME}, ENTRY_ENTRY_ID => $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_ID}, ENTRY_ENTRY_ID2 => $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_ID}, ENTRY_FORMULA => $entries->[$index_mz_continous][$index_entry]{ENTRY_FORMULA}, ENTRY_CPD_MZ => $entries->[$index_mz_continous][$index_entry]{ENTRY_CPD_MZ}, ENTRY_ADDUCT => $entries->[$index_mz_continous][$index_entry]{ENTRY_ADDUCT}, ENTRY_ADDUCT_TYPE => $entries->[$index_mz_continous][$index_entry]{ENTRY_ADDUCT_TYPE}, ENTRY_ADDUCT_MZ => $entries->[$index_mz_continous][$index_entry]{ENTRY_ADDUCT_MZ}, ENTRY_DELTA => $entries->[$index_mz_continous][$index_entry]{ENTRY_DELTA}, ENTRY_ENTRY_INCHI => $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_INCHI}, ENTRY_ENTRY_LOGP => $entries->[$index_mz_continous][$index_entry]{ENTRY_ENTRY_LOGP}, ) ; push ( @{ $tbody_object->[$index_page]{MASSES}[$index_mz]{ENTRIES} }, \%entry) ; } $check_rebond = 0 ; ## reinit double control $index_entry++ ; } ## end foreach if ($check_noentry == 0 ) { my %entry = ( ENTRY_COLOR => $tbody_object->[$index_page]{MASSES}[$index_mz]{MZ_COLOR}, ENTRY_ENTRY_NAME => 'UNKNOWN', ENTRY_ENTRY_ID => 'NONE', ENTRY_ENTRY_ID2 => '', ENTRY_FORMULA => 'n/a', ENTRY_CPD_MZ => 'n/a', ENTRY_ADDUCT => 'n/a', ENTRY_ADDUCT_TYPE => 'n/a', ENTRY_ADDUCT_MZ => 'n/a', ENTRY_DELTA => 0, ENTRY_ENTRY_INCHI => 'n/a', ENTRY_ENTRY_LOGP => 'n/a', ) ; push ( @{ $tbody_object->[$index_page]{MASSES}[$index_mz]{ENTRIES} }, \%entry) ; } $index_mz ++ ; $index_mz_continous ++ ; } $index_page++ ; } return($tbody_object) ; } ## END of SUB =head2 METHOD write_html_skel ## Description : prepare and write the html output file ## Input : $html_file_name, $html_object, $html_template ## Output : $html_file_name ## Usage : my ( $html_file_name ) = write_html_skel( $html_file_name, $html_object ) ; =cut ## START of SUB sub write_html_skel { ## Retrieve Values my $self = shift ; my ( $html_file_name, $html_object, $pages , $search_condition, $html_template, $js_path, $css_path ) = @_ ; my $html_file = $$html_file_name ; if ( defined $html_file ) { open ( HTML, ">$html_file" ) or die "Can't create the output file $html_file " ; if (-e $html_template) { my $ohtml = HTML::Template->new(filename => $html_template); $ohtml->param( JS_GALAXY_PATH => $js_path, CSS_GALAXY_PATH => $css_path ) ; $ohtml->param( CONDITIONS => $search_condition ) ; $ohtml->param( PAGES_NB => $pages ) ; $ohtml->param( PAGES => $html_object ) ; print HTML $ohtml->output ; } else { croak "Can't fill any html output : No template available ($html_template)\n" ; } close (HTML) ; } else { croak "No output file name available to write HTML file\n" ; } return(\$html_file) ; } ## END of SUB =head2 METHOD set_lm_matrix_object ## Description : build the hmdb_row under its ref form ## Input : $header, $init_mzs, $entries ## Output : $hmdb_matrix ## Usage : my ( $hmdb_matrix ) = set_lm_matrix_object( $header, $init_mzs, $entries ) ; =cut ## START of SUB sub set_lm_matrix_object { ## Retrieve Values my $self = shift ; my ( $header, $init_mzs, $entries ) = @_ ; my @hmdb_matrix = () ; if ( defined $header ) { my @headers = () ; push @headers, $header ; push @hmdb_matrix, \@headers ; } my $index_mz = 0 ; foreach my $mz ( @{$init_mzs} ) { my $index_entries = 0 ; my @clusters = () ; my $cluster_col = undef ; my @anti_redondant = ('N/A') ; my $check_rebond = 0 ; my $nb_entries = scalar (@{ $entries->[$index_mz] }) ; foreach my $entry (@{ $entries->[$index_mz] }) { ## dispo anti doublons des entries foreach my $rebond (@anti_redondant) { if ( $rebond eq $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ) { $check_rebond = 1 ; last ; } } if ( $check_rebond == 0 ) { push ( @anti_redondant, $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ) ; my $delta = $entries->[$index_mz][$index_entries]{ENTRY_DELTA} ; my $formula = $entries->[$index_mz][$index_entries]{ENTRY_FORMULA} ; my $hmdb_id = $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ; ## METLIN data display model ## entry1=VAR1::VAR2::VAR3::VAR4|entry2=VAR1::VAR2::VAR3::VAR4|... # manage final pipe if ($index_entries < $nb_entries-1 ) { $cluster_col .= $delta.'::('.$formula.')::'.$hmdb_id.'|' ; } else { $cluster_col .= $delta.'::('.$formula.')::'.$hmdb_id ; } } $check_rebond = 0 ; ## reinit double control $index_entries++ ; } ## end foreach if ( !defined $cluster_col ) { $cluster_col = 'NONE' ; } push (@clusters, $cluster_col) ; push (@hmdb_matrix, \@clusters) ; $index_mz++ ; } return(\@hmdb_matrix) ; } ## END of SUB =head2 METHOD set_hmdb_matrix_object_with_ids ## Description : build the hmdb_row under its ref form (IDS only) ## Input : $header, $init_mzs, $entries ## Output : $hmdb_matrix ## Usage : my ( $hmdb_matrix ) = set_hmdb_matrix_object_with_ids( $header, $init_mzs, $entries ) ; =cut ## START of SUB sub set_hmdb_matrix_object_with_ids { ## Retrieve Values my $self = shift ; my ( $header, $init_mzs, $entries ) = @_ ; # print Dumper $entries ; my @hmdb_matrix = () ; if ( defined $header ) { my @headers = () ; ## redefined the header hmdb(delta::name::mz::formula::adduct::id) $header = 'hmdb(delta::name::mz::formula::adduct::id)' ; push @headers, $header ; push @hmdb_matrix, \@headers ; } my $index_mz = 0 ; foreach my $mz ( @{$init_mzs} ) { my $index_entries = 0 ; my @clusters = () ; my $cluster_col = undef ; my @anti_redondant = ('N/A') ; my $check_rebond = 0 ; my $nb_entries = scalar (@{ $entries->[$index_mz] }) ; foreach my $entry (@{ $entries->[$index_mz] }) { ## dispo anti doublons des entries foreach my $rebond (@anti_redondant) { if ( $rebond eq $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ) { $check_rebond = 1 ; last ; } } if ( $check_rebond == 0 ) { push ( @anti_redondant, $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ) ; ## my ($hmdb_name, $hmdb_id, $hmdb_formula, $hmdb_cpd_mz, $hmdb_adduct, $hmdb_delta) = (undef, undef, undef, undef, undef, undef) ; if ($entries->[$index_mz][$index_entries]{ENTRY_ENTRY_NAME} ) { $hmdb_name = $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_NAME} ; } else { $hmdb_name = 'UNKNOWN' ; } if ($entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ) { $hmdb_id = $entries->[$index_mz][$index_entries]{ENTRY_ENTRY_ID} ; } else { $hmdb_id = 0 ; } if ($entries->[$index_mz][$index_entries]{ENTRY_FORMULA} ) { $hmdb_formula = $entries->[$index_mz][$index_entries]{ENTRY_FORMULA} ; } else { $hmdb_formula = 'N/A' ; } if ($entries->[$index_mz][$index_entries]{ENTRY_CPD_MZ} ) { $hmdb_cpd_mz = $entries->[$index_mz][$index_entries]{ENTRY_CPD_MZ} ; } else { $hmdb_cpd_mz = 'N/A' ; } if ($entries->[$index_mz][$index_entries]{ENTRY_ADDUCT} ) { $hmdb_adduct = $entries->[$index_mz][$index_entries]{ENTRY_ADDUCT} ; } else { $hmdb_adduct = 'N/A' ; } if ($entries->[$index_mz][$index_entries]{ENTRY_DELTA} ) { $hmdb_delta = $entries->[$index_mz][$index_entries]{ENTRY_DELTA} ; } else { $hmdb_delta = 0 ; } ## METLIN data display model ## entry1= ENTRY_DELTA::ENTRY_ENTRY_NAME::ENTRY_CPD_MZ::ENTRY_FORMULA::ENTRY_ADDUCT::ENTRY_ENTRY_ID | entry2=VAR1::VAR2::VAR3::VAR4|... my $entry = $hmdb_delta.'::['."$hmdb_name".']::'.$hmdb_cpd_mz.'::'.$hmdb_formula.'::['.$hmdb_adduct.']::'.$hmdb_id ; # manage final pipe if ($index_entries < $nb_entries-1 ) { $cluster_col .= $entry.' | ' ; } else { $cluster_col .= $entry ; } } $check_rebond = 0 ; ## reinit double control $index_entries++ ; } ## end foreach if ( !defined $cluster_col ) { $cluster_col = 'NONE' ; } push (@clusters, $cluster_col) ; push (@hmdb_matrix, \@clusters) ; $index_mz++ ; } return(\@hmdb_matrix) ; } ## END of SUB =head2 METHOD add_lm_matrix_to_input_matrix ## Description : build a full matrix (input + lm column) ## Input : $input_matrix_object, $lm_matrix_object, $nb_header ## Output : $output_matrix_object ## Usage : my ( $output_matrix_object ) = add_lm_matrix_to_input_matrix( $input_matrix_object, $lm_matrix_object, $nb_header ) ; =cut ## START of SUB sub add_lm_matrix_to_input_matrix { ## Retrieve Values my $self = shift ; my ( $input_matrix_object, $lm_matrix_object, $nb_header ) = @_ ; my @output_matrix_object = () ; my $index_row = 0 ; my $line = 0 ; foreach my $row ( @{$input_matrix_object} ) { my @init_row = @{$row} ; $line++; if ( ( defined $nb_header ) and ( $line <= $nb_header) ) { push (@output_matrix_object, \@init_row) ; next ; } if ( $lm_matrix_object->[$index_row] ) { my $dim = scalar(@{$lm_matrix_object->[$index_row]}) ; if ($dim > 1) { warn "the add method can't manage more than one column\n" ;} my $lm_col = $lm_matrix_object->[$index_row][$dim-1] ; push (@init_row, $lm_col) ; $index_row++ ; } push (@output_matrix_object, \@init_row) ; } return(\@output_matrix_object) ; } ## END of SUB =head2 METHOD write_csv_skel ## Description : prepare and write csv output file ## Input : $csv_file, $rows ## Output : $csv_file ## Usage : my ( $csv_file ) = write_csv_skel( $csv_file, $rows ) ; =cut ## START of SUB sub write_csv_skel { ## Retrieve Values my $self = shift ; my ( $csv_file, $rows ) = @_ ; # my $ocsv = lib::csv::new( {is_binary => 1 , quote_binary => 0, quote_char => undef }) ; my $ocsv = lib::csv::new() ; my $csv = $ocsv->get_csv_object("\t") ; $ocsv->write_csv_from_arrays($csv, $$csv_file, $rows) ; return($csv_file) ; } ## END of SUB =head2 METHOD write_csv_one_mass ## Description : print a cvs file ## Input : $masses, $ids, $results, $file ## Output : N/A ## Usage : write_csv_one_mass( $ids, $results, $file ) ; =cut ## START of SUB sub write_csv_one_mass { ## Retrieve Values my $self = shift ; my ( $masses, $ids, $results, $file, ) = @_ ; open(CSV, '>:utf8', "$file") or die "Cant' create the file $file\n" ; print CSV "ID\tQuery(Da)\tDelta(ppm)\tMetabolite_Name\tCpd_MW(Da)\tFormula\tAdduct\tAdduct_MW(Da)\tHMDB_ID\n" ; my $i = 0 ; foreach my $id (@{$ids}) { my $mass = undef ; if ( $masses->[$i] ) { $mass = $masses->[$i] ; } else { last ; } if ( $results->[$i] ) { ## an requested id has a result in the list of hashes $results. my @anti_redondant = ('N/A') ; my $check_rebond = 0 ; my $check_noentry = 0 ; foreach my $entry (@{$results->[$i]}) { $check_noentry ++ ; ## dispo anti doublons des entries foreach my $rebond (@anti_redondant) { if ( $rebond eq $entry->{ENTRY_ENTRY_ID} ) { $check_rebond = 1 ; last ; } } # print "\n-----------------------" ; # print Dumper $entry->{ENTRY_ENTRY_ID} ; # print "-------------------------$check_rebond\n" ; # print Dumper @anti_redondant ; if ( $check_rebond == 0 ) { push ( @anti_redondant, $entry->{ENTRY_ENTRY_ID} ) ; print CSV "$id\t$mass\t" ; ## print delta if ( $entry->{ENTRY_DELTA} ) { print CSV "$entry->{ENTRY_DELTA}\t" ; } else { print CSV "0\t" ; } ## print cpd name if ( $entry->{ENTRY_ENTRY_NAME} ) { print CSV "[$entry->{ENTRY_ENTRY_NAME}]\t" ; } else { print CSV "UNKNOWN\t" ; } ## print cpd mz if ( $entry->{ENTRY_CPD_MZ} ) { print CSV "$entry->{ENTRY_CPD_MZ}\t" ; } else { print CSV "N/A\t" ; } ## print cpd formula if ( $entry->{ENTRY_FORMULA} ) { print CSV "$entry->{ENTRY_FORMULA}\t" ; } else { print CSV "N/A\t" ; } ## print adduct if ( $entry->{ENTRY_ADDUCT} ) { print CSV "[$entry->{ENTRY_ADDUCT}]\t" ; } else { print CSV "N/A\t" ; } ## print adduct mz if ( $entry->{ENTRY_ADDUCT_MZ} ) { print CSV "$entry->{ENTRY_ADDUCT_MZ}\t" ; } else { print CSV "N/A\t" ; } ## print cpd id if ( $entry->{ENTRY_ENTRY_ID} ) { print CSV "$entry->{ENTRY_ENTRY_ID}\n" ; } else { print CSV "N/A\n" ; } } $check_rebond = 0 ; ## reinit double control } ## end foreach if ($check_noentry == 0 ) { print CSV "$id\t$mass\t0\tUNKNOWN\tN/A\tN/A\tN/A\tN/A\tN/A\n" ; } } $i++ ; } close(CSV) ; return() ; } ## END of SUB 1 ; __END__ =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc hmdb.pm =head1 Exports =over 4 =item :ALL is ... =back =head1 AUTHOR Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 VERSION version 1 : 06 / 06 / 2013 version 2 : 27 / 01 / 2014 version 3 : 19 / 11 / 2014 version 4 : 28 / 01 / 2016 version 5 : 02 / 11 /2016 =cut