diff query_metexp.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.py	Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the 
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about 
+related items found in the selected MetExp DB.   
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+    
+def _process_data(data):
+    
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+    '''
+    This method will iterate over the record in the input_data and
+    will enrich them with the related information found (if any) in the 
+    MetExp Database.
+    
+    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+    
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+        
+        # read the molecular mass and formula:
+        cas_id = input_data_record[casid_col]
+        formula = input_data_record[formula_col]
+        molecular_mass = input_data_record[molecular_mass_col]
+        
+        # search for related records in MetExp:
+        data_found = None
+        if cas_id != "undef": 
+            # 1- search for other experiments where this CAS id has been found:
+            query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "CAS"
+        if data_found == None:
+            # 2- search for other experiments where this FORMULA has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "FORMULA"
+        if data_found == None:
+            # 3- search for other experiments where this MM has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method 
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+                
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '','','','','','','']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+        
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+    
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+    '''
+    This method will go over the data found and will return a 
+    list with the following items:
+    - Experiment details where hits have been found :
+        'organism', 'tissue','experiment_name','user_name','column_type'
+    - Link that executes same query
+        
+    '''
+    # set() makes a unique list:
+    organism_set = []
+    tissue_set = []
+    experiment_name_set = []
+    user_name_set = []
+    column_type_set = []
+    cas_nr_set = []
+    
+    if 'organism' in data_found:
+        organism_set = set(data_found['organism'])
+    if 'tissue' in data_found:
+        tissue_set = set(data_found['tissue'])
+    if 'experiment_name' in data_found:
+        experiment_name_set = set(data_found['experiment_name'])
+    if 'user_name' in data_found:
+        user_name_set = set(data_found['user_name'])
+    if 'column_type' in data_found:
+        column_type_set = set(data_found['column_type'])
+    if 'CAS' in data_found:
+        cas_nr_set = set(data_found['CAS'])        
+    
+    
+    result = [data_type_found,
+              _to_xsv(organism_set),
+              _to_xsv(tissue_set),
+              _to_xsv(experiment_name_set),
+              _to_xsv(user_name_set),
+              _to_xsv(column_type_set),
+              _to_xsv(cas_nr_set),              
+              #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): 
+              "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+    return result
+
+
+def _to_xsv(data_set):
+    result = ""
+    for item in data_set:
+        result = result + str(item) + "|"    
+    return result
+
+
+def _fire_query_and_return_dict(url):
+    '''
+    This method will fire the query as a web-service call and 
+    return the results as a list of dictionary objects
+    '''
+    
+    try:
+        data = urllib2.urlopen(url).read()
+        
+        # transform to dictionary:
+        result = []
+        data_rows = data.split("\n")
+        
+        # check if there is any data in the response:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '': 
+            # means there is only the header row...so no hits:
+            return None
+        
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+        
+        # return result processed into a dict:
+        return _process_data(result)
+        
+    except urllib2.HTTPError, e:
+        raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):    
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+    
+    
+# alternative: ?    
+#     s = requests.Session()
+#     s.verify = False
+#     #s.auth = (token01, token02)
+#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+#     content = resp.content
+#     # transform to dictionary:
+    
+    
+    
+    
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+    
+    
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)    
+    
+    return record  
+    
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: dictionary containing merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+    '''
+    Read out and return the URL stored in the given file.
+    '''
+    file_input = fileinput.input(metexp_dblink_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # just return the first line that is not a comment line:
+                return line
+    finally:
+        file_input.close()
+    
+
+def main():
+    '''
+    MetExp Query main function
+    
+    The input file can be any tabular file, as long as it contains a column for the molecular mass
+    and one for the formula of the respective identification. These two columns are then
+    used to query against MetExp Database.   
+    '''
+    seconds_start = int(round(time.time()))
+    
+    input_file = sys.argv[1]
+    casid_col = sys.argv[2]
+    formula_col = sys.argv[3]
+    molecular_mass_col = sys.argv[4]
+    metexp_dblink_file = sys.argv[5]
+    separation_method = sys.argv[6]
+    output_result = sys.argv[7]
+
+    # Parse metexp_dblink_file to find the URL to the MetExp service:
+    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+    
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+    
+    # Query data against MetExp DB :
+    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+    headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
+                                   'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+    
+    _save_data(enriched_data, headers, output_result)
+    
+    seconds_end = int(round(time.time()))
+    print "Took " + str(seconds_end - seconds_start) + " seconds"
+                      
+                      
+
+if __name__ == '__main__':
+    main()