Mercurial > repos > fgiacomoni > hmdb_ms_search
diff lib/hmdb.pm @ 23:2d8a310e86ce draft
Prod branch Updating with v.:CI_COMMIT_TAG- - Fxx
author | fgiacomoni |
---|---|
date | Thu, 19 May 2022 13:43:09 +0000 |
parents | 453fbe98925a |
children | d8e2ede293a6 |
line wrap: on
line diff
--- a/lib/hmdb.pm Fri Nov 20 17:29:18 2020 +0000 +++ b/lib/hmdb.pm Thu May 19 13:43:09 2022 +0000 @@ -21,8 +21,8 @@ our $VERSION = "1.0"; our @ISA = qw(Exporter); -our @EXPORT = qw( map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass ); -our %EXPORT_TAGS = ( ALL => [qw( map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass )] ); +our @EXPORT = qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass ); +our %EXPORT_TAGS = ( ALL => [qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass )] ); =head1 NAME @@ -273,7 +273,7 @@ =head2 METHOD testMatchesFromHmdbWithUA - ## Description : test a single query with tests parameters on hmdb - get the status of the complete server infra. + ## Description : [DEPRECATED] test a single query with tests parameters on hmdb - get the status of the complete server infra. ## Input : none ## Output : $status_line ## Usage : my ( $status_line ) = testMatchesFromHmdbWithUA( ) ; @@ -288,12 +288,14 @@ #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and my $mech = WWW::Mechanize->new( - agent => 'wonderbot for W4M 1.01', - autocheck => 1, +# agent => 'wonderbot for W4M 1.01', + agent => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:93.0) Gecko/20100101 Firefox/93.0' , + autocheck => 0, ); my $statusGetLine = 0 ; my $statusPostLine = 0 ; + my $csrftoken = undef ; #receiving cookies and authentication token (CFRS) my $reqInit = $mech->get("http://www.hmdb.ca/spectra/ms/search"); @@ -302,13 +304,23 @@ if ($statusGetLine == 200 ) { die 'no CSRF_REQUEST_TOKEN_VALUE in page found' unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; - my $csrftoken = $1; -# print "\nTOKEN: $csrftoken\n" ; + $csrftoken = $1; + print "\nTOKEN: $csrftoken\n" ; $mech->add_header("X-CSRFToken", $csrftoken); + $mech->add_header('Host', 'specdb.wishartlab.com'); $mech->add_header('Connection', 'keep-alive'); + $mech->add_header('Upgrade-Insecure-Requests', '1'); $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); - $mech->add_header('Referer', 'http://www.hmdb.ca/spectra/ms/search'); - $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); + $mech->add_header('Accept-Language', 'en-US,en;q=0.5'); + $mech->add_header('Accept-Encoding', 'gzip, deflate'); +# $mech->add_header('Content-Length', "300"); + $mech->add_header('Origin', 'null'); + $mech->add_header('DNT', '1'); + $mech->add_header('Referer', 'https://hmdb.ca/spectra/ms/search'); +# $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); + $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'); + + } ## POST test @@ -318,13 +330,15 @@ # Fix a limit at 3 tries... if ($top < 4) { print "\tTesting HMDB server connexion ($top time(s) )...\n" ; - $mech->post( - "http://specdb.wishartlab.com/ms/search.csv", - Content => 'utf8=TRUE&mode=positive&adduct_type=M%2BH%202M%2BH&query_masses=125.0089&tolerance=0.001&database=HMDB&commit=Download Results As CSV' - ); - + eval { + $mech->post( + "http://specdb.wishartlab.com/ms/search.csv", + Content => 'utf8=TRUE&authenticity_token='.$csrftoken.'&mode=positive&adduct_type=M%2BH%202M%2BH&query_masses=125.0089&tolerance=0.001&database=HMDB&commit=Download Results As CSV' + ); + } ; # print Dumper $mech ; $statusPostLine = $mech->status() ; + print "Status: $statusPostLine" ; } else { last ; @@ -336,7 +350,82 @@ ## END of SUB +=head2 METHOD testMatchesFromHmdb5WithUA + ## Description : test a single query with tests parameters on hmdb - get the status of the complete server infra (API V5.0 compliant). + ## Input : none + ## Output : $status_line + ## Usage : my ( $status_line ) = testMatchesFromHmdb5WithUA( ) ; + +=cut +## START of SUB +sub testMatchesFromHmdb5WithUA { + ## Retrieve Values + my $self = shift ; + +my $mech = WWW::Mechanize->new( + agent => 'wonderbot for W4M 3.0', + autocheck => 1, + timeout => 2400, + ); + + my $statusGetLine = 0 ; + my $statusPostLine = 0 ; + + #receiving cookies and authentication token (CFRS) + my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search"); + $statusGetLine = $mech->status() ; + + if ($statusGetLine == 200 ) { + die 'no CSRF_REQUEST_TOKEN_VALUE in page found' + unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; + my $csrftoken = $1; +# print "\nTOKEN: $csrftoken\n" ; + $mech->add_header("X-CSRFToken", $csrftoken); + $mech->add_header('Connection', 'keep-alive'); + $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); + $mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search'); + $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); + } + + + ## POST test + my $top = 1 ; + while ($statusPostLine != 200 ) { + + # Fix a limit at 3 tries... + if ($top < 4) { + print "\tTesting HMDB server connexion ($top time(s) )...\n" ; + eval { + my $res = $mech->get( + 'https://hmdb.ca/spectra/ms/generate_csv.csv?' + .'results%5Baction%5D=search' + .'&results%5Badduct_type%5D%5B%5D=M%2BH%202M%2BH' + #.'&results%5Bauthenticity_token%5D=' + .'&results%5Bccs_predictors%5D=' + .'&results%5Bccs_tolerance%5D=' + .'&results%5Bcommit%5D=Search' + .'&results%5Bcontroller%5D=specdb%2Fms' + .'&results%5Bms_search_ion_mode%5D=positive' + .'&results%5Bquery_masses%5D=125.0089' + .'&results%5Btolerance%5D=0.001' + .'&results%5Btolerance_units%5D=Da' + .'&results%5Butf8%5D=%E2%9C%93' + + ); + } ; +# print Dumper $mech ; + $statusPostLine = $mech->status() ; + print "Status: $statusPostLine" ; + } + else { + last ; + } + $top++ ; + }## End While + return (\$statusPostLine) ; +} +## END of SUB =head2 METHOD check_state_from_hmdb_ua @@ -353,7 +442,7 @@ my ($status) = @_ ; if (!defined $$status) { - croak "No http status is defined for the distant server" ; + croak "No https status is defined for the distant server" ; } else { unless ( $$status == 200 ) { @@ -389,6 +478,10 @@ my $self = shift ; my ( $masses, $delta, $mode ) = @_ ; + ## Added May, 2022 + warn "[DEPRECATED Methode] method get_matches_from_hmdb_ua is deprecated and not compatible with HMDB 4.0" ; + return ([], 500) ; + my @page = () ; my $ua = LWP::UserAgent->new( keep_alive => 10 ); @@ -443,6 +536,10 @@ my $self = shift ; my ( $masses, $delta, $mode, $adducts ) = @_ ; + ## Added May, 2022 + warn "[DEPRECATED Methode] method getMatchesFromHmdbWithUA is deprecated and not compatible with HMDB 5.0" ; + return ([], 500) ; + my @page = () ; #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and @@ -501,10 +598,185 @@ } ## END of SUB +=head2 METHOD getMatchesFromHmdb5WithUA + + ## Description : HMDB v5.0 querying via an user agent with parameters : mz, delta and molecular species (neutral, pos, neg) + ## Input : $mass, $delta, $mode, adducts + ## Output : $results + ## Usage : my ( $results ) = getMatchesFromHmdbWithUA( $mass, $delta, $mode ) ; + +=cut +## START of SUB +sub getMatchesFromHmdb5WithUA { + ## Retrieve Values + my $self = shift ; + my ( $masses, $delta, $mode, $adducts ) = @_ ; + + my @page = () ; + + #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and + + my $mech = WWW::Mechanize->new( + agent => 'wonderbot for W4M 3.0', + autocheck => 1, + timeout => 2400, + ); + + my $statusGetLine = 0 ; + my $statusPostLine = 0 ; + + #receiving cookies and authentication token (CFRS) + my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search"); + $statusGetLine = $mech->status() ; + + if ($statusGetLine == 200 ) { + die 'no CSRF_REQUEST_TOKEN_VALUE in page found' + unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ; + my $csrftoken = $1; +# print "\nTOKEN: $csrftoken\n" ; + $mech->add_header("X-CSRFToken", $csrftoken); + $mech->add_header('Connection', 'keep-alive'); + $mech->add_header('Content-Type', 'application/x-www-form-urlencoded'); + $mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search'); + $mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'); + } + ## adduct format is adduct_type=M%2BH%202M%2BH + + if ( (!defined $adducts) or ( $adducts eq '') ) { + $adducts = 'Unknown' ; + } + + my $res = $mech->get( + 'https://hmdb.ca/spectra/ms/generate_csv.csv?' + .'results%5Baction%5D=search' + .'&results%5Badduct_type%5D%5B%5D='.$adducts + #.'&results%5Bauthenticity_token%5D=' + .'&results%5Bccs_predictors%5D=' + .'&results%5Bccs_tolerance%5D=' + .'&results%5Bcommit%5D=Search' + .'&results%5Bcontroller%5D=specdb%2Fms' + .'&results%5Bms_search_ion_mode%5D='.$mode + .'&results%5Bquery_masses%5D='.$masses + .'&results%5Btolerance%5D='.$delta + .'&results%5Btolerance_units%5D=Da' + .'&results%5Butf8%5D=%E2%9C%93' + + ); + $statusGetLine = $mech->status() ; + + if ($mech->success) { + @page = split ( /\n/, $res->decoded_content ) ; + $statusPostLine = 'OK' ; + } + else { + $statusPostLine = $mech->status() ; + warn "\t[HMDB service issue !! the server returned a $statusPostLine HTTP error]" ; + } + +# print Dumper $res->decoded_content ; + + return (\@page, $statusPostLine) ; +} +## END of SUB + + +=head2 METHOD parseHmdb5CSVResults + + ## Description : parse the csv results and get data - API 5.0 compliant + ## Input : $csv + ## Output : $results + ## Usage : my ( $results ) = parseHmdb5CSVResults( $csv ) ; + +=cut +## START of SUB +sub parseHmdb5CSVResults { + ## Retrieve Values + my $self = shift ; + my ( $csv, $masses, $max_query ) = @_ ; + + my $test = 0 ; + my ($query_mass,$compound_id,$formula,$compound_mass,$adduct,$adduct_type,$adduct_mass,$delta) = (0, undef, undef, undef, undef, undef, undef, undef) ; + + my %result_by_entry = () ; + my %features = () ; + +# print Dumper $csv ; +# print Dumper $masses ; +# print Dumper $max_query ; + + foreach my $line (@{$csv}) { + ## NEW HMDB format V5.0 - May2022 + if ($line !~ /query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m\/z,"delta\(ppm\),",ccs_value/) { + #query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m/z,"delta(ppm),",ccs_value', + + if ( $line =~ /(\d+\.\d+),(\w+),(.*),(\w+),(\d+\.\d+),([\w|n\/a|\s+]+)\s*,(\+|\-),(\d+\.\d+),(\d+),(\d*)/ ) { + print "$line\n" ; + #if ( $line =~ /(\d+\.\d+),(\w+),(.*),([\w|n\/a|\s+]+)\s*,(\w+),(\d+\.\d+),(.*),(\+|\-),(\d+\.\d+),(\d+)/ ) { + my @entry = ("$1","$2","$3","$4","$5","$6","$7","$8","$9","$10") ; + + if ( !exists $result_by_entry{$entry[0]} ) { $result_by_entry{$entry[0]} = [] ; } + + $features{ENTRY_ENTRY_ID} = $entry[1] ; + $features{ENTRY_ENTRY_NAME} = $entry[2] ; + $features{ENTRY_FORMULA} = $entry[3] ; + $features{ENTRY_CPD_MZ} = $entry[4] ; + $features{ENTRY_ADDUCT} = $entry[5] ; + $features{ENTRY_ADDUCT_TYPE} = $entry[6] ; + $features{ENTRY_ADDUCT_MZ} = $entry[7] ; + $features{ENTRY_DELTA} = $entry[8] ; + + my %temp = %features ; + push (@{$result_by_entry{$entry[0]} }, \%temp) ; + } +# elsif ($line =~ /(\d+\.\d+)/) { # +# ## 288.082286511284,HMDB0002255,R-Methylmalonyl-CoA, C01213 ,C25H40N7O19P3S,867.131252359,M-3H,-,288.036475,159 +# ## 283.108004472276,"Bicyclo_3,1,1heptane-2,3-diol,2,6,6_trimethyl","2,3-Pinanediol",n/a,C10H18O2,170.13067982,M+TFA-H,-,283.116266,29 +# ## 174.034120330029,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M-H20-H,-,174.055503,123 +# ## 193.139160745841,HMDB0012109,"7-[(1R,2R,3R,5S)-3,5-Dihydroxy-2-[(1E,3S)-3-hydroxyoct-1-en-1-yl]cyclopentyl]-5,6-dihydroxyheptanoic acid", C06475,C20H36O7,388.246103506,M-2H,-,193.115776,121 +# ## 214.018826827064,HMDB0011723,2-Methylhippuric acid, C01586,C10H11NO3,193.073893223,M+Na-2H,-,214.048559,139 +# } +# else { +# +# warn "The parsed line ($line) does not match your pattern\n " ; +# } + } + else { + print "Header detected...Parsing is starting...\n" ; + next ; + } + } ## end foreach + + ## manage per query_mzs (keep query masses order by array) + my @results = () ; + foreach (@{$masses}) { + if ($result_by_entry{$_}) { + + ## cut all entries > $max_query - all entries were already sorted...by hmdb + my @temp_entries = @{$result_by_entry{$_}} ; + my @temp_cut = () ; + my $current_query = 0 ; + foreach (@temp_entries) { + $current_query ++ ; + if ($current_query > $max_query) { + last ; + } + else { + push (@temp_cut, $_) ; + } + } + push (@results, \@temp_cut) ; + } + else { push (@results, [] ) ; } ; + + } + return(\@results) ; +} +## END of SUB + =head2 METHOD parse_hmdb_csv_results - ## Description : parse the csv results and get data + ## Description : [DEPRECATED] parse the csv results and get data ## Input : $csv ## Output : $results ## Usage : my ( $results ) = parse_hmdb_csv_results( $csv ) ;