comparison sirius_csifingerid.py @ 2:856b3761277d draft

"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit 3e3dee9a853b6133cf089b3c063f53c52b39463d"
author computational-metabolomics
date Thu, 02 Jul 2020 11:01:45 -0400
parents 9e6bf7278257
children 4cbfd3d0a4c4
comparison
equal deleted inserted replaced
1:1db83da40c54 2:856b3761277d
23 parser.add_argument('--polarity') 23 parser.add_argument('--polarity')
24 parser.add_argument('--results_name') 24 parser.add_argument('--results_name')
25 parser.add_argument('--out_dir') 25 parser.add_argument('--out_dir')
26 parser.add_argument('--tool_directory') 26 parser.add_argument('--tool_directory')
27 parser.add_argument('--temp_dir') 27 parser.add_argument('--temp_dir')
28
29 parser.add_argument('--meta_select_col', default='all') 28 parser.add_argument('--meta_select_col', default='all')
30 parser.add_argument('--cores_top_level', default=1) 29 parser.add_argument('--cores_top_level', default=1)
31 parser.add_argument('--chunks', default=1) 30 parser.add_argument('--chunks', default=1)
32 parser.add_argument('--minMSMSpeaks', default=1) 31 parser.add_argument('--minMSMSpeaks', default=1)
32 parser.add_argument('--rank_filter', default=0)
33 parser.add_argument('--schema', default='msp') 33 parser.add_argument('--schema', default='msp')
34 parser.add_argument('-a', '--adducts', action='append', nargs=1,
35 required=False, default=[], help='Adducts used')
36
34 args = parser.parse_args() 37 args = parser.parse_args()
35 print(args) 38 print(args)
36 if os.stat(args.input_pth).st_size == 0: 39 if os.stat(args.input_pth).st_size == 0:
37 print('Input file empty') 40 print('Input file empty')
38 exit() 41 exit()
45 os.mkdir(wd) 48 os.mkdir(wd)
46 else: 49 else:
47 td = tempfile.mkdtemp() 50 td = tempfile.mkdtemp()
48 wd = os.path.join(td, str(uuid.uuid4())) 51 wd = os.path.join(td, str(uuid.uuid4()))
49 os.mkdir(wd) 52 os.mkdir(wd)
53
54 print(args.adducts)
55 if args.adducts:
56 adducts_from_cli = [
57 a[0].replace('__ob__', '[').replace('__cb__', ']') for a in
58 args.adducts
59 ]
60 else:
61 adducts_from_cli = []
50 62
51 ###################################################################### 63 ######################################################################
52 # Setup regular expressions for MSP parsing dictionary 64 # Setup regular expressions for MSP parsing dictionary
53 ###################################################################### 65 ######################################################################
54 regex_msp = {} 66 regex_msp = {}
60 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] 72 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
61 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', 73 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
62 r'^adduct(?:=|:)(.*)$', 74 r'^adduct(?:=|:)(.*)$',
63 r'^ADDUCTIONNAME(?:=|:)(.*)$'] 75 r'^ADDUCTIONNAME(?:=|:)(.*)$']
64 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] 76 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
77 regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$',
78 r'^rt(?:=|:)\s*(.*)$',
79 r'^time(?:=|:)\s*(.*)$']
80 # From example winter_pos.mspy from kristian
81 regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$']
82
65 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format 83 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format
66 84
67 regex_massbank = {} 85 regex_massbank = {}
68 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] 86 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
69 regex_massbank['polarity'] = \ 87 regex_massbank['polarity'] = \
71 regex_massbank['precursor_mz'] = \ 89 regex_massbank['precursor_mz'] = \
72 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] 90 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
73 regex_massbank['precursor_type'] = \ 91 regex_massbank['precursor_type'] = \
74 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] 92 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
75 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] 93 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
94 regex_massbank['retention_time'] = [
95 r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*']
76 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] 96 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
77 regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$'] # Flag for massbank 97 regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$'] # Flag for massbank
98
78 99
79 if args.schema == 'msp': 100 if args.schema == 'msp':
80 meta_regex = regex_msp 101 meta_regex = regex_msp
81 elif args.schema == 'massbank': 102 elif args.schema == 'massbank':
82 meta_regex = regex_massbank 103 meta_regex = regex_massbank
139 # the msPurity pipeline) choose between getting additional details to 160 # the msPurity pipeline) choose between getting additional details to
140 # add as columns as either all meta data from msp, just details from the 161 # add as columns as either all meta data from msp, just details from the
141 # record name (i.e. when using msPurity and we have the columns 162 # record name (i.e. when using msPurity and we have the columns
142 # coded into the name) or just the spectra index (spectrac) 163 # coded into the name) or just the spectra index (spectrac)
143 paramd = init_paramd(args) 164 paramd = init_paramd(args)
165 meta_info = {k: v for k, v in meta_info.items() if k
166 not in ['msp', 'massbank', 'cols']}
144 167
145 if args.meta_select_col == 'name': 168 if args.meta_select_col == 'name':
146 # have additional column of just the name 169 # have additional column of just the name
147 paramd['additional_details'] = {'name': meta_info['name']} 170 paramd['additional_details'] = {'name': meta_info['name']}
148 elif args.meta_select_col == 'name_split': 171 elif args.meta_select_col == 'name_split':
175 198
176 # =============== Update param based on MSP metadata ====================== 199 # =============== Update param based on MSP metadata ======================
177 # Replace param details with details from MSP if required 200 # Replace param details with details from MSP if required
178 if 'precursor_type' in meta_info and meta_info['precursor_type']: 201 if 'precursor_type' in meta_info and meta_info['precursor_type']:
179 paramd["cli"]["--ion"] = meta_info['precursor_type'] 202 paramd["cli"]["--ion"] = meta_info['precursor_type']
203 adduct = meta_info['precursor_type']
180 else: 204 else:
181 if paramd["default_ion"]: 205 if paramd["default_ion"]:
182 paramd["cli"]["--ion"] = paramd["default_ion"] 206 paramd["cli"]["--ion"] = paramd["default_ion"]
207 adduct = paramd["default_ion"]
183 else: 208 else:
184 paramd["cli"]["--auto-charge"] = '' 209 paramd["cli"]["--auto-charge"] = ''
185 210
186 if 'precursor_mz' in meta_info and meta_info['precursor_mz']: 211 if 'precursor_mz' in meta_info and meta_info['precursor_mz']:
187 paramd["cli"]["--precursor"] = meta_info['precursor_mz'] 212 paramd["cli"]["--precursor"] = meta_info['precursor_mz']
213
214 if not ('precursor_type' in paramd['additional_details'] or 'adduct'
215 in paramd['additional_details']):
216 # If possible always good to have the adduct in output as a column
217 paramd['additional_details']['adduct'] = adduct
188 218
189 # ============== Create CLI cmd for metfrag =============================== 219 # ============== Create CLI cmd for metfrag ===============================
190 cmd = "sirius --fingerid" 220 cmd = "sirius --fingerid"
191 for k, v in six.iteritems(paramd["cli"]): 221 for k, v in six.iteritems(paramd["cli"]):
192 cmd += " {} {}".format(str(k), str(v)) 222 cmd += " {} {}".format(str(k), str(v))
246 276
247 peaklist.append(save_line) 277 peaklist.append(save_line)
248 278
249 elif plinesread and plinesread == pnumlines: 279 elif plinesread and plinesread == pnumlines:
250 # ======= Get sample name and additional details for output ======= 280 # ======= Get sample name and additional details for output =======
251 spectrac += 1 281 if adducts_from_cli:
252 paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac) 282 for adduct in adducts_from_cli:
253 283 print(adduct)
254 paramds[paramd["SampleName"]] = paramd 284 spectrac += 1
255 cmds.append(cmd) 285 meta_info['precursor_type'] = adduct
286 paramd, cmd = run_sirius(meta_info, peaklist, args, wd,
287 spectrac)
288
289 paramds[paramd["SampleName"]] = paramd
290 cmds.append(cmd)
291 else:
292 spectrac += 1
293 paramd, cmd = run_sirius(meta_info, peaklist, args, wd,
294 spectrac)
295
296 paramds[paramd["SampleName"]] = paramd
297 cmds.append(cmd)
256 298
257 meta_info = {} 299 meta_info = {}
258 pnumlines = 0 300 pnumlines = 0
259 plinesread = 0 301 plinesread = 0
260 302
261 # end of file. Check if there is a MSP spectra to 303 # end of file. Check if there is a MSP spectra to
262 # run metfrag on still 304 # run metfrag on still
263 305
264 if plinesread and plinesread == pnumlines: 306 if plinesread and plinesread == pnumlines:
265 paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac + 1) 307 if adducts_from_cli:
266 308 for adduct in adducts_from_cli:
267 paramds[paramd["SampleName"]] = paramd 309 print(adduct)
268 cmds.append(cmd) 310 spectrac += 1
311 meta_info['precursor_type'] = adduct
312 paramd, cmd = run_sirius(meta_info, peaklist, args, wd,
313 spectrac)
314
315 paramds[paramd["SampleName"]] = paramd
316 cmds.append(cmd)
317 else:
318 spectrac += 1
319 paramd, cmd = run_sirius(meta_info, peaklist, args, wd,
320 spectrac)
321
322 paramds[paramd["SampleName"]] = paramd
323 cmds.append(cmd)
269 324
270 # Perform multiprocessing on command line call level 325 # Perform multiprocessing on command line call level
271 if int(args.cores_top_level) > 1: 326 if int(args.cores_top_level) > 1:
272 cmds_chunks = [cmds[x:x + int(args.chunks)] 327 cmds_chunks = [cmds[x:x + int(args.chunks)]
273 for x in list(range(0, len(cmds), int(args.chunks)))] 328 for x in list(range(0, len(cmds), int(args.chunks)))]
319 reader = csv.DictReader(infile, delimiter='\t') 374 reader = csv.DictReader(infile, delimiter='\t')
320 375
321 ad = paramds[fn.split(os.sep)[-3]]['additional_details'] 376 ad = paramds[fn.split(os.sep)[-3]]['additional_details']
322 377
323 for line in reader: 378 for line in reader:
379 if 0 < int(args.rank_filter) < int(line['rank']):
380 # filter out those annotations greater than rank filter
381 # If rank_filter is zero then skip
382 continue
324 line.update(ad) 383 line.update(ad)
325 # round score to 5 d.p. 384 # round score to 5 d.p.
326 line['score'] = round(float(line['score']), 5) 385 line['score'] = round(float(line['score']), 5)
327 386
328 dwriter.writerow(line) 387 dwriter.writerow(line)