comparison query_metexp.py @ 22:cd4f13119afa

Small fix in filters part and improvement in query_metexp time logging
author pieter.lukasse@wur.nl
date Thu, 06 Mar 2014 14:29:55 +0100
parents 19d8fd10248e
children
comparison
Legend: equal, deleted, inserted, replaced
21:19d8fd10248e 22:cd4f13119afa
12 ''' 12 '''
13 import csv 13 import csv
14 import sys 14 import sys
15 import fileinput 15 import fileinput
16 import urllib2 16 import urllib2
17 import time
17 from collections import OrderedDict 18 from collections import OrderedDict
18 19
19 __author__ = "Pieter Lukasse" 20 __author__ = "Pieter Lukasse"
20 __contact__ = "pieter.lukasse@wur.nl" 21 __contact__ = "pieter.lukasse@wur.nl"
21 __copyright__ = "Copyright, 2014, Plant Research International, WUR" 22 __copyright__ = "Copyright, 2014, Plant Research International, WUR"
42 def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): 43 def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
43 ''' 44 '''
44 This method will iterate over the record in the input_data and 45 This method will iterate over the record in the input_data and
45 will enrich them with the related information found (if any) in the 46 will enrich them with the related information found (if any) in the
46 MetExp Database. 47 MetExp Database.
48
49 # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
47 ''' 50 '''
48 merged = [] 51 merged = []
49 52
50 for i in xrange(len(input_data[input_data.keys()[0]])): 53 for i in xrange(len(input_data[input_data.keys()[0]])):
51 # Get the record in same dictionary format as input_data, but containing 54 # Get the record in same dictionary format as input_data, but containing
245 248
246 The input file can be any tabular file, as long as it contains a column for the molecular mass 249 The input file can be any tabular file, as long as it contains a column for the molecular mass
247 and one for the formula of the respective identification. These two columns are then 250 and one for the formula of the respective identification. These two columns are then
248 used to query against MetExp Database. 251 used to query against MetExp Database.
249 ''' 252 '''
253 seconds_start = int(round(time.time()))
254
250 input_file = sys.argv[1] 255 input_file = sys.argv[1]
251 casid_col = sys.argv[2] 256 casid_col = sys.argv[2]
252 formula_col = sys.argv[3] 257 formula_col = sys.argv[3]
253 molecular_mass_col = sys.argv[4] 258 molecular_mass_col = sys.argv[4]
254 metexp_dblink_file = sys.argv[5] 259 metexp_dblink_file = sys.argv[5]
265 enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) 270 enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
266 headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', 271 headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
267 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] 272 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
268 273
269 _save_data(enriched_data, headers, output_result) 274 _save_data(enriched_data, headers, output_result)
270 275
276 seconds_end = int(round(time.time()))
277 print "Took " + str(seconds_end - seconds_start) + " seconds"
278
279
271 280
272 if __name__ == '__main__': 281 if __name__ == '__main__':
273 main() 282 main()