view query_metexp.py @ 61:d685210eef3e

fix in pdftotabular tool
author pieter.lukasse@wur.nl
date Fri, 19 Dec 2014 15:30:13 +0100
parents cd4f13119afa
children
line wrap: on
line source

#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of identifications against the METabolomics EXPlorer database.

It will take the input file and for each record it will query the 
molecular mass in the selected MetExp DB. If one or more compounds are found in the
MetExp DB then extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about 
related items found in the selected MetExp DB.   
'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)
    
def _process_data(data):
    
    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output


def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
    '''
    This method will iterate over the record in the input_data and
    will enrich them with the related information found (if any) in the 
    MetExp Database.
    
    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []
    
    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
        
        # read the molecular mass and formula:
        cas_id = input_data_record[casid_col]
        formula = input_data_record[formula_col]
        molecular_mass = input_data_record[molecular_mass_col]
        
        # search for related records in MetExp:
        data_found = None
        if cas_id != "undef": 
            # 1- search for other experiments where this CAS id has been found:
            query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "CAS"
        if data_found == None:
            # 2- search for other experiments where this FORMULA has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "FORMULA"
        if data_found == None:
            # 3- search for other experiments where this MM has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method 
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"
                
        if data_found == None:
            # If still nothing found, just add empty columns
            extra_cols = ['', '','','','','','','']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
        
        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)
    
        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a 
    list with the following items:
    - Experiment details where hits have been found :
        'organism', 'tissue','experiment_name','user_name','column_type'
    - Link that executes same query
        
    '''
    # set() makes a unique list:
    organism_set = []
    tissue_set = []
    experiment_name_set = []
    user_name_set = []
    column_type_set = []
    cas_nr_set = []
    
    if 'organism' in data_found:
        organism_set = set(data_found['organism'])
    if 'tissue' in data_found:
        tissue_set = set(data_found['tissue'])
    if 'experiment_name' in data_found:
        experiment_name_set = set(data_found['experiment_name'])
    if 'user_name' in data_found:
        user_name_set = set(data_found['user_name'])
    if 'column_type' in data_found:
        column_type_set = set(data_found['column_type'])
    if 'CAS' in data_found:
        cas_nr_set = set(data_found['CAS'])        
    
    
    result = [data_type_found,
              _to_xsv(organism_set),
              _to_xsv(tissue_set),
              _to_xsv(experiment_name_set),
              _to_xsv(user_name_set),
              _to_xsv(column_type_set),
              _to_xsv(cas_nr_set),              
              #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): 
              "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"    
    return result


def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and 
    return the results as a list of dictionary objects
    '''
    
    try:
        data = urllib2.urlopen(url).read()
        
        # transform to dictionary:
        result = []
        data_rows = data.split("\n")
        
        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '': 
            # means there is only the header row...so no hits:
            return None
        
        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)
        
        # return result processed into a dict:
        return _process_data(result)
        
    except urllib2.HTTPError, e:
        raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")

def _str_to_list(data_row, delimiter='\t'):    
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result
    
    
# alternative: ?    
#     s = requests.Session()
#     s.verify = False
#     #s.auth = (token01, token02)
#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
#     content = resp.content
#     # transform to dictionary:
    
    
    
    
def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])
    
    
    # add extra columns
    for column in extra_cols:
        record.append(column)    
    
    return record  
    

def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: dictionary containing merged/enriched dataset
    @param out_csv: output csv file
    '''

    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_metexp_URL(metexp_dblink_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(metexp_dblink_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line:
                return line
    finally:
        file_input.close()
    

def main():
    '''
    MetExp Query main function
    
    The input file can be any tabular file, as long as it contains a column for the molecular mass
    and one for the formula of the respective identification. These two columns are then
    used to query against MetExp Database.   
    '''
    seconds_start = int(round(time.time()))
    
    input_file = sys.argv[1]
    casid_col = sys.argv[2]
    formula_col = sys.argv[3]
    molecular_mass_col = sys.argv[4]
    metexp_dblink_file = sys.argv[5]
    separation_method = sys.argv[6]
    output_result = sys.argv[7]

    # Parse metexp_dblink_file to find the URL to the MetExp service:
    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
    
    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)
    
    # Query data against MetExp DB :
    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
    headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
                                   'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
    
    _save_data(enriched_data, headers, output_result)
    
    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"
                      
                      

if __name__ == '__main__':
    main()