Mercurial > repos > pieterlukasse > prims_metabolomics2
view query_metexp.py @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python # encoding: utf-8 ''' Module to query a set of identifications against the METabolomics EXPlorer database. It will take the input file and for each record it will query the molecular mass in the selected MetExp DB. If one or more compounds are found in the MetExp DB then extra information regarding these compounds is added to the output file. The output file is thus the input file enriched with information about related items found in the selected MetExp DB. ''' import csv import sys import fileinput import urllib2 import time from collections import OrderedDict __author__ = "Pieter Lukasse" __contact__ = "pieter.lukasse@wur.nl" __copyright__ = "Copyright, 2014, Plant Research International, WUR" __license__ = "Apache v2" def _process_file(in_xsv, delim='\t'): ''' Generic method to parse a tab-separated file returning a dictionary with named columns @param in_csv: input filename to be parsed ''' data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) return _process_data(data) def _process_data(data): header = data.pop(0) # Create dictionary with column name as key output = OrderedDict() for index in xrange(len(header)): output[header[index]] = [row[index] for row in data] return output def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): ''' This method will iterate over the record in the input_data and will enrich them with the related information found (if any) in the MetExp Database. # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies ''' merged = [] for i in xrange(len(input_data[input_data.keys()[0]])): # Get the record in same dictionary format as input_data, but containing # a value at each column instead of a list of all values of all records: input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) # read the molecular mass and formula: cas_id = input_data_record[casid_col] formula = input_data_record[formula_col] molecular_mass = input_data_record[molecular_mass_col] # search for related records in MetExp: data_found = None if cas_id != "undef": # 1- search for other experiments where this CAS id has been found: query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") data_type_found = "CAS" if data_found == None: # 2- search for other experiments where this FORMULA has been found: query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") data_type_found = "FORMULA" if data_found == None: # 3- search for other experiments where this MM has been found: query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") data_type_found = "MM" if data_found == None: # If still nothing found, just add empty columns extra_cols = ['', '','','','','','',''] else: # Add info found: extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) # Take all data and merge it into a "flat"/simple array of values: field_values_list = _merge_data(input_data_record, extra_cols) merged.append(field_values_list) # return the merged/enriched records: return merged def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): ''' This method will go over the data found and will return a list with the following items: - Experiment details where hits have been found : 'organism', 'tissue','experiment_name','user_name','column_type' - Link that executes same query ''' # set() makes a unique list: organism_set = [] tissue_set = [] experiment_name_set = [] user_name_set = [] column_type_set = [] cas_nr_set = [] if 'organism' in data_found: organism_set = set(data_found['organism']) if 'tissue' in data_found: tissue_set = set(data_found['tissue']) if 'experiment_name' in data_found: experiment_name_set = set(data_found['experiment_name']) if 'user_name' in data_found: user_name_set = set(data_found['user_name']) if 'column_type' in data_found: column_type_set = set(data_found['column_type']) if 'CAS' in data_found: cas_nr_set = set(data_found['CAS']) result = [data_type_found, _to_xsv(organism_set), _to_xsv(tissue_set), _to_xsv(experiment_name_set), _to_xsv(user_name_set), _to_xsv(column_type_set), _to_xsv(cas_nr_set), #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] return result def _to_xsv(data_set): result = "" for item in data_set: result = result + str(item) + "|" return result def _fire_query_and_return_dict(url): ''' This method will fire the query as a web-service call and return the results as a list of dictionary objects ''' try: data = urllib2.urlopen(url).read() # transform to dictionary: result = [] data_rows = data.split("\n") # check if there is any data in the response: if len(data_rows) <= 1 or data_rows[1].strip() == '': # means there is only the header row...so no hits: return None for data_row in data_rows: if not data_row.strip() == '': row_as_list = _str_to_list(data_row, delimiter='\t') result.append(row_as_list) # return result processed into a dict: return _process_data(result) except urllib2.HTTPError, e: raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) except urllib2.URLError, e: raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") def _str_to_list(data_row, delimiter='\t'): result = [] for column in data_row.split(delimiter): result.append(column) return result # alternative: ? # s = requests.Session() # s.verify = False # #s.auth = (token01, token02) # resp = s.get(url, params={'name': 'anonymous'}, stream=True) # content = resp.content # # transform to dictionary: def _merge_data(input_data_record, extra_cols): ''' Adds the extra information to the existing data record and returns the combined new record. ''' record = [] for column in input_data_record: record.append(input_data_record[column]) # add extra columns for column in extra_cols: record.append(column) return record def _save_data(data_rows, headers, out_csv): ''' Writes tab-separated data to file @param data_rows: dictionary containing merged/enriched dataset @param out_csv: output csv file ''' # Open output file for writing outfile_single_handle = open(out_csv, 'wb') output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") # Write headers output_single_handle.writerow(headers) # Write one line for each row for data_row in data_rows: output_single_handle.writerow(data_row) def _get_metexp_URL(metexp_dblink_file): ''' Read out and return the URL stored in the given file. ''' file_input = fileinput.input(metexp_dblink_file) try: for line in file_input: if line[0] != '#': # just return the first line that is not a comment line: return line finally: file_input.close() def main(): ''' MetExp Query main function The input file can be any tabular file, as long as it contains a column for the molecular mass and one for the formula of the respective identification. These two columns are then used to query against MetExp Database. ''' seconds_start = int(round(time.time())) input_file = sys.argv[1] casid_col = sys.argv[2] formula_col = sys.argv[3] molecular_mass_col = sys.argv[4] metexp_dblink_file = sys.argv[5] separation_method = sys.argv[6] output_result = sys.argv[7] # Parse metexp_dblink_file to find the URL to the MetExp service: metexp_dblink = _get_metexp_URL(metexp_dblink_file) # Parse tabular input file into dictionary/array: input_data = _process_file(input_file) # Query data against MetExp DB : enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] _save_data(enriched_data, headers, output_result) seconds_end = int(round(time.time())) print "Took " + str(seconds_end - seconds_start) + " seconds" if __name__ == '__main__': main()