#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of identifications against the METabolomics EXPlorer (MetExp) database.

It takes the input file and, for each record, queries the CAS number, molecular formula
or molecular mass in the selected MetExp DB. If one or more compounds are found in the
MetExp DB, extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected MetExp DB.
'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

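# Illustrative command-line invocation (argument order matches main() below); the script
# name, file names, column header names and separation method shown here are placeholders,
# not values prescribed by this module:
#
#   python query_metexp.py input_identifications.tsv CAS FORMULA MM \
#       metexp_dblink_file.txt GC output_enriched.tsv
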
def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a delimited (tab-separated by default) file, returning a
    dictionary with named columns.
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):

    header = data.pop(0)
    # Create dictionary with column name as key and the column's list of values as value:
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output
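
# For illustration (hypothetical data): an input file with header "Name<TAB>MM" and two
# data rows would be returned by _process_data as:
#   OrderedDict([('Name', ['alanine', 'glycine']), ('MM', ['89.09', '75.07'])])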


def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    MetExp Database.

    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the CAS id, formula and molecular mass:
        cas_id = input_data_record[casid_col]
        formula = input_data_record[formula_col]
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in MetExp:
        data_found = None
        if cas_id != "undef":
            # 1- search for other experiments where this CAS id has been found:
            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "CAS"
        if data_found is None:
            # 2- search for other experiments where this FORMULA has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "FORMULA"
        if data_found is None:
            # 3- search for other experiments where this MM has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If still nothing found, just add empty columns
            extra_cols = ['', '', '', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged
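
# For illustration, the first query assembled above looks like the following (hypothetical
# values; "http://metexp.example.org/metexp" stands in for the URL read from the dblink file):
#   http://metexp.example.org/metexp/find_entries/query?cas_nr=56-40-6&method=GC&_format_result=tsv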


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - Experiment details where hits have been found:
      'organism', 'tissue', 'experiment_name', 'user_name', 'column_type'
    - Link that executes the same query

    '''
    # set() makes a unique list:
    organism_set = []
    tissue_set = []
    experiment_name_set = []
    user_name_set = []
    column_type_set = []
    cas_nr_set = []

    if 'organism' in data_found:
        organism_set = set(data_found['organism'])
    if 'tissue' in data_found:
        tissue_set = set(data_found['tissue'])
    if 'experiment_name' in data_found:
        experiment_name_set = set(data_found['experiment_name'])
    if 'user_name' in data_found:
        user_name_set = set(data_found['user_name'])
    if 'column_type' in data_found:
        column_type_set = set(data_found['column_type'])
    if 'CAS' in data_found:
        cas_nr_set = set(data_found['CAS'])


    result = [data_type_found,
              _to_xsv(organism_set),
              _to_xsv(tissue_set),
              _to_xsv(experiment_name_set),
              _to_xsv(user_name_set),
              _to_xsv(column_type_set),
              _to_xsv(cas_nr_set),
              # To let Excel interpret it as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result
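
# Illustrative return value (hypothetical data), matching the 8 extra columns that are
# appended to each record:
#   ['CAS', 'Arabidopsis thaliana|', 'leaf|', 'exp1|', 'jdoe|', 'DB5|', '56-40-6|',
#    '=HYPERLINK("<query_link>", "Link to entries found in DB ")']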


def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result
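
# For example (note that element order depends on set iteration order and is therefore
# not guaranteed): _to_xsv(set(['a', 'b'])) may yield "a|b|"; an empty set yields "".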


def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary of columns (see _process_data),
    or None when no hits are found.
    '''

    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
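
# The MetExp service is expected to respond with tab-separated text: a header row followed
# by data rows, e.g. (hypothetical response):
#   organism<TAB>tissue<TAB>experiment_name<TAB>user_name<TAB>column_type<TAB>CAS
#   Arabidopsis thaliana<TAB>leaf<TAB>exp1<TAB>jdoe<TAB>DB5<TAB>56-40-6
# A header-only response is interpreted as "no hits" and yields None.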

def _str_to_list(data_row, delimiter='\t'):
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result


# alternative: ?
# s = requests.Session()
# s.verify = False
# #s.auth = (token01, token02)
# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
# content = resp.content
# # transform to dictionary:


def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add extra columns
    for column in extra_cols:
        record.append(column)

    return record


def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: list of records (rows) of the merged/enriched dataset
    @param headers: list of column names to write as the header row
    @param out_csv: output csv file
    '''

    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

    # Close the file so all rows are flushed to disk:
    outfile_single_handle.close()

def _get_metexp_URL(metexp_dblink_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(metexp_dblink_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line,
                # stripped of surrounding whitespace/newline so it can be
                # used directly in URL concatenation:
                return line.strip()
    finally:
        file_input.close()
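
# The dblink file is expected to contain comment lines (starting with '#') followed by the
# base URL of the MetExp service, e.g. (hypothetical contents):
#   # MetExp service location
#   http://metexp.example.org/metexp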


def main():
    '''
    MetExp Query main function

    The input file can be any tabular file, as long as it contains columns for the CAS number,
    the molecular formula and the molecular mass of the respective identification. These
    columns are then used to query against the MetExp database.
    '''
    seconds_start = int(round(time.time()))

    input_file = sys.argv[1]
    casid_col = sys.argv[2]
    formula_col = sys.argv[3]
    molecular_mass_col = sys.argv[4]
    metexp_dblink_file = sys.argv[5]
    separation_method = sys.argv[6]
    output_result = sys.argv[7]

    # Parse metexp_dblink_file to find the URL to the MetExp service:
    metexp_dblink = _get_metexp_URL(metexp_dblink_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against MetExp DB :
    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types',
                                   'METEXP hits: CAS nrs', 'Link to METEXP hits']

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"


if __name__ == '__main__':
    main()