diff lib/hmdb.pm @ 23:2d8a310e86ce draft

Prod branch Updating with v.:CI_COMMIT_TAG- - Fxx
author fgiacomoni
date Thu, 19 May 2022 13:43:09 +0000
parents 453fbe98925a
children d8e2ede293a6
line wrap: on
line diff
--- a/lib/hmdb.pm	Fri Nov 20 17:29:18 2020 +0000
+++ b/lib/hmdb.pm	Thu May 19 13:43:09 2022 +0000
@@ -21,8 +21,8 @@
 
 our $VERSION = "1.0";
 our @ISA = qw(Exporter);
-our @EXPORT = qw( map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass );
-our %EXPORT_TAGS = ( ALL => [qw( map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass )] );
+our @EXPORT = qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass );
+our %EXPORT_TAGS = ( ALL => [qw( parseHmdb5CSVResults getMatchesFromHmdb5WithUA map_suppl_data_on_hmdb_results get_unik_ids_from_results get_hmdb_metabocard_from_id extract_sub_mz_lists test_matches_from_hmdb_ua prepare_multi_masses_query get_matches_from_hmdb_ua parse_hmdb_csv_results set_html_tbody_object add_mz_to_tbody_object add_entries_to_tbody_object write_html_skel set_lm_matrix_object set_hmdb_matrix_object_with_ids add_lm_matrix_to_input_matrix write_csv_skel write_csv_one_mass )] );
 
 =head1 NAME
 
@@ -273,7 +273,7 @@
 
 =head2 METHOD testMatchesFromHmdbWithUA
 
-	## Description : test a single query with tests parameters on hmdb - get the status of the complete server infra.
+	## Description : [DEPRECATED] test a single query with tests parameters on hmdb - get the status of the complete server infra.
 	## Input : none
 	## Output : $status_line
 	## Usage : my ( $status_line ) = testMatchesFromHmdbWithUA( ) ;
@@ -288,12 +288,14 @@
     #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and
 
 	my $mech = WWW::Mechanize->new(
-		agent => 'wonderbot for W4M 1.01',
-		autocheck => 1,	
+#		agent => 'wonderbot for W4M 1.01',
+		agent => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:93.0) Gecko/20100101 Firefox/93.0' ,
+		autocheck => 0,
 	);
 	
 	my $statusGetLine = 0 ;
 	my $statusPostLine = 0 ;
+	my $csrftoken = undef ;
 	
 	#receiving cookies and authentication token (CFRS)
 	my $reqInit = $mech->get("http://www.hmdb.ca/spectra/ms/search");
@@ -302,13 +304,23 @@
 	if ($statusGetLine == 200 ) {
 		die 'no CSRF_REQUEST_TOKEN_VALUE in page found'
 		unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ;
-		my $csrftoken = $1;
-#		print "\nTOKEN: $csrftoken\n" ;
+		$csrftoken = $1;
+		print "\nTOKEN: $csrftoken\n" ;
 		$mech->add_header("X-CSRFToken", $csrftoken);
+		$mech->add_header('Host', 'specdb.wishartlab.com');
 		$mech->add_header('Connection', 'keep-alive');
+		$mech->add_header('Upgrade-Insecure-Requests', '1');
 		$mech->add_header('Content-Type', 'application/x-www-form-urlencoded');
-		$mech->add_header('Referer', 'http://www.hmdb.ca/spectra/ms/search');
-		$mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
+		$mech->add_header('Accept-Language', 'en-US,en;q=0.5');
+		$mech->add_header('Accept-Encoding', 'gzip, deflate');
+#		$mech->add_header('Content-Length', "300");
+		$mech->add_header('Origin', 'null');
+		$mech->add_header('DNT', '1');
+		$mech->add_header('Referer', 'https://hmdb.ca/spectra/ms/search');
+#		$mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
+		$mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8');
+		
+		
 	}
 	
 	## POST test
@@ -318,13 +330,15 @@
 		# Fix a limit at 3 tries...
 		if ($top < 4) {
 			print "\tTesting HMDB server connexion ($top time(s) )...\n" ;
-			$mech->post(
-				"http://specdb.wishartlab.com/ms/search.csv",
-		    	Content => 'utf8=TRUE&mode=positive&adduct_type=M%2BH%202M%2BH&query_masses=125.0089&tolerance=0.001&database=HMDB&commit=Download Results As CSV'
-			);
-			
+			eval {
+				$mech->post(
+					"http://specdb.wishartlab.com/ms/search.csv",
+			    	Content => 'utf8=TRUE&authenticity_token='.$csrftoken.'&mode=positive&adduct_type=M%2BH%202M%2BH&query_masses=125.0089&tolerance=0.001&database=HMDB&commit=Download Results As CSV'
+				);
+			} ;
 #		print Dumper $mech ;
 			$statusPostLine = $mech->status() ;
+			print "Status: $statusPostLine" ;
 		}
 		else {
 			last ;
@@ -336,7 +350,82 @@
 ## END of SUB
 
 
+=head2 METHOD testMatchesFromHmdb5WithUA
 
+	## Description : test a single query with tests parameters on hmdb - get the status of the complete server infra (API V5.0 compliant).
+	## Input : none
+	## Output : $status_line
+	## Usage : my ( $status_line ) = testMatchesFromHmdb5WithUA( ) ;
+	
+=cut
+## START of SUB
+sub testMatchesFromHmdb5WithUA {
+	## Retrieve Values
+    my $self = shift ;
+    
+my $mech = WWW::Mechanize->new(
+		agent => 'wonderbot for W4M 3.0',
+		autocheck => 1,
+		timeout => 2400,
+	);
+	
+	my $statusGetLine = 0 ;
+	my $statusPostLine = 0 ;
+	
+	#receiving cookies and authentication token (CFRS)
+	my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search");
+	$statusGetLine = $mech->status() ;
+	
+	if ($statusGetLine == 200 ) {
+		die 'no CSRF_REQUEST_TOKEN_VALUE in page found'
+		unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ;
+		my $csrftoken = $1;
+#		print "\nTOKEN: $csrftoken\n" ;
+		$mech->add_header("X-CSRFToken", $csrftoken);
+		$mech->add_header('Connection', 'keep-alive');
+		$mech->add_header('Content-Type', 'application/x-www-form-urlencoded');
+		$mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search');
+		$mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
+	}
+	
+	
+	## POST test
+	my $top = 1 ;
+	while ($statusPostLine != 200 ) {
+		
+		# Fix a limit at 3 tries...
+		if ($top < 4) {
+			print "\tTesting HMDB server connexion ($top time(s) )...\n" ;
+			eval {
+				my $res = $mech->get(
+					'https://hmdb.ca/spectra/ms/generate_csv.csv?'
+					.'results%5Baction%5D=search'
+					.'&results%5Badduct_type%5D%5B%5D=M%2BH%202M%2BH'
+					#.'&results%5Bauthenticity_token%5D='
+					.'&results%5Bccs_predictors%5D='
+					.'&results%5Bccs_tolerance%5D='
+					.'&results%5Bcommit%5D=Search'
+					.'&results%5Bcontroller%5D=specdb%2Fms'
+					.'&results%5Bms_search_ion_mode%5D=positive'
+					.'&results%5Bquery_masses%5D=125.0089'
+					.'&results%5Btolerance%5D=0.001'
+					.'&results%5Btolerance_units%5D=Da'
+					.'&results%5Butf8%5D=%E2%9C%93'
+				
+				);
+			} ;
+#		print Dumper $mech ;
+			$statusPostLine = $mech->status() ;
+			print "Status: $statusPostLine" ;
+		}
+		else {
+			last ;
+		}
+		$top++ ;
+	}## End While
+	return (\$statusPostLine) ;
+}
+## END of SUB
 
 =head2 METHOD check_state_from_hmdb_ua
 
@@ -353,7 +442,7 @@
     my ($status) = @_ ;
     
     if (!defined $$status) {
-    	croak "No http status is defined for the distant server" ;
+    	croak "No https status is defined for the distant server" ;
     }
     else {
     	unless ( $$status == 200 ) {
@@ -389,6 +478,10 @@
     my $self = shift ;
     my ( $masses, $delta, $mode ) = @_ ;
     
+    ## Added May, 2022
+    warn "[DEPRECATED Methode] method get_matches_from_hmdb_ua is deprecated and not compatible with HMDB 4.0" ;
+    return ([], 500) ;
+    
     my @page = () ;
 
 	my $ua = LWP::UserAgent->new( keep_alive => 10 );
@@ -443,6 +536,10 @@
     my $self = shift ;
     my ( $masses, $delta, $mode, $adducts ) = @_ ;
     
+    ## Added May, 2022
+    warn "[DEPRECATED Methode] method getMatchesFromHmdbWithUA is deprecated and not compatible with HMDB 5.0" ;
+    return ([], 500) ;
+    
     my @page = () ;
     
     #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and
@@ -501,10 +598,185 @@
 }
 ## END of SUB
 
+=head2 METHOD getMatchesFromHmdb5WithUA
+
+	## Description : HMDB v5.0 querying via an user agent with parameters : mz, delta and molecular species (neutral, pos, neg)
+	## Input : $mass, $delta, $mode, adducts
+	## Output : $results
+	## Usage : my ( $results ) = getMatchesFromHmdbWithUA( $mass, $delta, $mode ) ;
+	
+=cut
+## START of SUB
+sub getMatchesFromHmdb5WithUA {
+	## Retrieve Values
+    my $self = shift ;
+    my ( $masses, $delta, $mode, $adducts ) = @_ ;
+    
+    my @page = () ;
+    
+    #based on https://stackoverflow.com/questions/17732916/perl-post-automation-and
+
+	my $mech = WWW::Mechanize->new(
+		agent => 'wonderbot for W4M 3.0',
+		autocheck => 1,
+		timeout => 2400,
+	);
+	
+	my $statusGetLine = 0 ;
+	my $statusPostLine = 0 ;
+	
+	#receiving cookies and authentication token (CFRS)
+	my $reqInit = $mech->get("https://www.hmdb.ca/spectra/ms/search");
+	$statusGetLine = $mech->status() ;
+	
+	if ($statusGetLine == 200 ) {
+		die 'no CSRF_REQUEST_TOKEN_VALUE in page found'
+		unless ($reqInit->decoded_content =~ /\"csrf-token\"\s+content=\"(.*)\"/) ;
+		my $csrftoken = $1;
+#		print "\nTOKEN: $csrftoken\n" ;
+		$mech->add_header("X-CSRFToken", $csrftoken);
+		$mech->add_header('Connection', 'keep-alive');
+		$mech->add_header('Content-Type', 'application/x-www-form-urlencoded');
+		$mech->add_header('Referer', 'https://www.hmdb.ca/spectra/ms/search');
+		$mech->add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
+	}
+	## adduct format is adduct_type=M%2BH%202M%2BH
+	
+	if ( (!defined $adducts) or ( $adducts eq '')  ) {
+		$adducts = 'Unknown' ;
+	}
+		
+	my $res = $mech->get(
+		'https://hmdb.ca/spectra/ms/generate_csv.csv?'
+		.'results%5Baction%5D=search'
+		.'&results%5Badduct_type%5D%5B%5D='.$adducts
+		#.'&results%5Bauthenticity_token%5D='
+		.'&results%5Bccs_predictors%5D='
+		.'&results%5Bccs_tolerance%5D='
+		.'&results%5Bcommit%5D=Search'
+		.'&results%5Bcontroller%5D=specdb%2Fms'
+		.'&results%5Bms_search_ion_mode%5D='.$mode
+		.'&results%5Bquery_masses%5D='.$masses
+		.'&results%5Btolerance%5D='.$delta
+		.'&results%5Btolerance_units%5D=Da'
+		.'&results%5Butf8%5D=%E2%9C%93'
+	
+	);
+	$statusGetLine = $mech->status() ;
+	
+	if ($mech->success) {
+		@page = split ( /\n/, $res->decoded_content ) ;
+		$statusPostLine = 'OK' ;
+	}
+	else {
+		$statusPostLine = $mech->status() ;
+		warn "\t[HMDB service issue !! the server returned a $statusPostLine HTTP error]" ;
+	}
+	
+#	print Dumper $res->decoded_content ;
+
+	return (\@page, $statusPostLine) ;
+}
+## END of SUB
+
+
+=head2 METHOD parseHmdb5CSVResults
+
+	## Description : parse the csv results and get data - API 5.0 compliant
+	## Input : $csv
+	## Output : $results
+	## Usage : my ( $results ) = parseHmdb5CSVResults( $csv ) ;
+	
+=cut
+## START of SUB
+sub parseHmdb5CSVResults {
+	## Retrieve Values
+    my $self = shift ;
+    my ( $csv, $masses, $max_query ) = @_ ;
+    
+    my $test = 0 ;
+    my ($query_mass,$compound_id,$formula,$compound_mass,$adduct,$adduct_type,$adduct_mass,$delta) = (0, undef, undef, undef, undef, undef, undef, undef) ;
+    
+    my %result_by_entry = () ;
+    my %features = () ;
+    
+#    print Dumper $csv ;
+#    print Dumper $masses ;
+#    print Dumper $max_query ;
+    
+    foreach my $line (@{$csv}) {
+    	## NEW HMDB format V5.0 - May2022
+		if ($line !~ /query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m\/z,"delta\(ppm\),",ccs_value/) {
+			#query_mass,compound_id,compound_name,formula,monoisotopic_mass,adduct,adduct_type,adduct_m/z,"delta(ppm),",ccs_value',
+    		
+    		if ( $line =~ /(\d+\.\d+),(\w+),(.*),(\w+),(\d+\.\d+),([\w|n\/a|\s+]+)\s*,(\+|\-),(\d+\.\d+),(\d+),(\d*)/   ) {
+    			print "$line\n" ;
+    		#if ( $line =~ /(\d+\.\d+),(\w+),(.*),([\w|n\/a|\s+]+)\s*,(\w+),(\d+\.\d+),(.*),(\+|\-),(\d+\.\d+),(\d+)/  ) {
+    			my @entry = ("$1","$2","$3","$4","$5","$6","$7","$8","$9","$10") ;
+
+    			if ( !exists $result_by_entry{$entry[0]} ) { $result_by_entry{$entry[0]} = [] ; }
+    			    		
+	    		$features{ENTRY_ENTRY_ID} = $entry[1] ;
+	    		$features{ENTRY_ENTRY_NAME} = $entry[2] ;
+	    		$features{ENTRY_FORMULA} = $entry[3] ;
+	    		$features{ENTRY_CPD_MZ} = $entry[4] ;
+	    		$features{ENTRY_ADDUCT} = $entry[5] ;
+	    		$features{ENTRY_ADDUCT_TYPE} = $entry[6] ;
+	    		$features{ENTRY_ADDUCT_MZ} = $entry[7] ;
+	    		$features{ENTRY_DELTA} = $entry[8] ;
+	    		
+	    		my %temp = %features ;
+	    		push (@{$result_by_entry{$entry[0]} }, \%temp) ;
+    		}
+#    		elsif ($line =~ /(\d+\.\d+)/) { #
+#    			## 288.082286511284,HMDB0002255,R-Methylmalonyl-CoA, C01213 ,C25H40N7O19P3S,867.131252359,M-3H,-,288.036475,159
+#    			## 283.108004472276,"Bicyclo_3,1,1heptane-2,3-diol,2,6,6_trimethyl","2,3-Pinanediol",n/a,C10H18O2,170.13067982,M+TFA-H,-,283.116266,29
+#    			## 174.034120330029,HMDB0011723,2-Methylhippuric acid,	 C01586,C10H11NO3,193.073893223,M-H20-H,-,174.055503,123
+#    			## 193.139160745841,HMDB0012109,"7-[(1R,2R,3R,5S)-3,5-Dihydroxy-2-[(1E,3S)-3-hydroxyoct-1-en-1-yl]cyclopentyl]-5,6-dihydroxyheptanoic acid", C06475,C20H36O7,388.246103506,M-2H,-,193.115776,121
+#    			## 214.018826827064,HMDB0011723,2-Methylhippuric acid,	 C01586,C10H11NO3,193.073893223,M+Na-2H,-,214.048559,139
+#    		}
+#    		else {
+#    			
+#    			warn "The parsed line ($line) does not match your pattern\n " ;
+#    		}
+    	}
+    	else {
+    		print "Header detected...Parsing is starting...\n" ;
+    		next ;
+    	}
+    } ## end foreach
+    
+    ## manage per query_mzs (keep query masses order by array)
+    my @results = () ;
+    foreach (@{$masses}) {
+    	if ($result_by_entry{$_}) { 
+    	
+    		## cut all entries > $max_query	- all entries were already sorted...by hmdb
+    		my @temp_entries = @{$result_by_entry{$_}} ;
+    		my @temp_cut = () ;
+    		my $current_query = 0 ;
+    		foreach (@temp_entries) {
+    			$current_query ++ ;
+    			if ($current_query > $max_query) {
+    				last ;
+    			}
+    			else {
+    				push (@temp_cut, $_) ;
+    			}	
+    		}
+    		push (@results, \@temp_cut) ; 
+    	}
+    	else { push (@results, [] ) ; } ;
+
+    }
+    return(\@results) ;
+}
+## END of SUB
+
 
 =head2 METHOD parse_hmdb_csv_results
 
-	## Description : parse the csv results and get data
+	## Description : [DEPRECATED] parse the csv results and get data
 	## Input : $csv
 	## Output : $results
 	## Usage : my ( $results ) = parse_hmdb_csv_results( $csv ) ;