#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values detected by high-resolution mass spectrometers
against various repositories/services such as the METabolomics EXPlorer database or the
MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

It will take the input file and, for each record, query the
molecular mass in the selected repository/service. If one or more compounds are found,
extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected repository/service.

The service should implement the following interface:

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth   (txth means there is guaranteed to be a header line before the data)

The output should be tab-separated and should contain the following columns (in this order):
db-name	molecular-formula	dbe	formula-weight	id	description

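For illustration, a query such as

http://service_url/mass?targetMs=180.0634&margin=1&marginUnit=ppm&output=txth

might then return something like the following (a hypothetical example entry, not
output of a real service; C6H12O6 has a monoisotopic mass of ~180.0634):

# db-name	molecular-formula	dbe	formula-weight	id	description
ExampleDB	C6H12O6	1.0	180.06339	EX000001	D-glucose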

'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns.
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):

    header = data.pop(0)
    # Create dictionary with column name as key:
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]

    return output

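# For illustration only (hypothetical data, not part of the original module):
#
#   _process_data([['mass', 'name'], ['180.0634', 'glucose'], ['342.1162', 'sucrose']])
#
# returns the column-oriented dictionary
#
#   OrderedDict([('mass', ['180.0634', '342.1162']), ('name', ['glucose', 'sucrose'])])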

def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    chosen repository/service.

    # TODO: could optimize this with multi-threading, see also the nice example at
    # http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the molecular mass:
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "":
            molecular_mass = float(molecular_mass)

            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"

            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If nothing was found, just add empty columns:
            extra_cols = ['', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged

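# A possible implementation of the multi-threading mentioned in the TODO above
# (sketch only, not used by this module; assumes the per-record queries are
# independent and that query_links holds the prepared query URLs):
#
#   from multiprocessing.dummy import Pool  # thread-based Pool with the multiprocessing.Pool API
#   pool = Pool(4)
#   results = pool.map(_fire_query_and_return_dict, query_links)
#   pool.close()
#   pool.join()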

def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - details of the hits found: db-name, molecular-formula, id, description
    - a link that executes the same query
    '''
    # set() makes the items unique:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []

    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])

    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result

def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result

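# For illustration: _to_xsv(['C6H12O6', 'C12H22O11']) returns 'C6H12O6|C12H22O11|'
# (note the trailing '|'; for a set the order of the items is arbitrary).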

def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary with named columns.
    '''
    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # remove comment lines, if any (only leave the one that has the word
        # "molecular-formula" in it ... compatible with the Kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)

        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server.")

def _str_to_list(data_row, delimiter='\t'):
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result


# Alternative implementation of the web-service call above, using the
# 'requests' library (left here as a commented-out sketch, not used):
#
# s = requests.Session()
# s.verify = False
# # s.auth = (token01, token02)
# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
# content = resp.content
# # transform to dictionary:

def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add the extra columns:
    for column in extra_cols:
        record.append(column)

    return record


def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file.
    @param data_rows: list of records (the merged/enriched dataset)
    @param headers: list of column names to write as the header line
    @param out_csv: output csv file name
    '''
    # Open output file for writing:
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers:
    output_single_handle.writerow(headers)

    # Write one line for each row:
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line
                # (stripped, so a trailing newline cannot end up in the query URL):
                return line.strip()
    finally:
        file_input.close()

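# For illustration, a repository file as read by _get_repository_URL could look
# like this (an assumption based on the parsing above, not a documented format):
#
#   # URL of the repository/service to query (first non-comment line is used):
#   http://service_url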

def main():
    '''
    Query main function

    The input file can be any tabular file, as long as it contains a column for the molecular mass.
    This column is then used to query against the chosen repository/service database.
    '''
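    # Example invocation (hypothetical file names; the script name is an assumption):
    #
    #   python query_mass_repos.py input.tsv MM repository_url.txt 3 ppm output.tsv
    #
    # where 'MM' is the name of the molecular-mass column in input.tsv and
    # repository_url.txt contains the base URL of the service to query.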
    seconds_start = int(round(time.time()))

    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]

    # Parse repository_file to find the URL to the service:
    repository_dblink = _get_repository_URL(repository_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against repository:
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas',
                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']  # TODO: add min and max formula-weight columns

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"


if __name__ == '__main__':
    main()