Mercurial > repos > pieterlukasse > prims_metabolomics2
diff query_metexp.py @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): + ''' + This method will iterate over the record in the input_data and + will enrich them with the related information found (if any) in the + MetExp Database. + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass and formula: + cas_id = input_data_record[casid_col] + formula = input_data_record[formula_col] + molecular_mass = input_data_record[molecular_mass_col] + + # search for related records in MetExp: + data_found = None + if cas_id != "undef": + # 1- search for other experiments where this CAS id has been found: + query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "CAS" + if data_found == None: + # 2- search for other experiments where this FORMULA has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "FORMULA" + if data_found == None: + # 3- search for other experiments where this MM has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - Experiment details where hits have been found : + 'organism', 'tissue','experiment_name','user_name','column_type' + - Link that executes same query + + ''' + # set() makes a unique list: + organism_set = [] + tissue_set = [] + experiment_name_set = [] + user_name_set = [] + column_type_set = [] + cas_nr_set = [] + + if 'organism' in data_found: + organism_set = set(data_found['organism']) + if 'tissue' in data_found: + tissue_set = set(data_found['tissue']) + if 'experiment_name' in data_found: + experiment_name_set = set(data_found['experiment_name']) + if 'user_name' in data_found: + user_name_set = set(data_found['user_name']) + if 'column_type' in data_found: + column_type_set = set(data_found['column_type']) + if 'CAS' in data_found: + cas_nr_set = set(data_found['CAS']) + + + result = [data_type_found, + _to_xsv(organism_set), + _to_xsv(tissue_set), + _to_xsv(experiment_name_set), + _to_xsv(user_name_set), + _to_xsv(column_type_set), + _to_xsv(cas_nr_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. + ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_metexp_URL(metexp_dblink_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(metexp_dblink_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + MetExp Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass + and one for the formula of the respective identification. These two columns are then + used to query against MetExp Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + casid_col = sys.argv[2] + formula_col = sys.argv[3] + molecular_mass_col = sys.argv[4] + metexp_dblink_file = sys.argv[5] + separation_method = sys.argv[6] + output_result = sys.argv[7] + + # Parse metexp_dblink_file to find the URL to the MetExp service: + metexp_dblink = _get_metexp_URL(metexp_dblink_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against MetExp DB : + enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) + headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', + 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main()