Mercurial > repos > pieterlukasse > nist_wrapper
diff utils.py @ 0:cce6989ed423
new NIST wrapper demo tools
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 22 Jan 2015 16:14:57 +0100 |
parents | |
children | 8c20185752da |
line wrap: on
line diff
'''
Created on 31 dec. 2014

Helper functions for the NIST wrapper tools: copying files, looking up
processes, converting between TSV files and column dictionaries, and parsing
NIST search results and MSP spectra files.

@author: lukas007
'''
import csv
import shutil
import subprocess
import sys
from collections import OrderedDict

# True when running under Python 3; used to pick file modes that work for the
# csv module and for text decoding on both major Python versions.
_PY3 = sys.version_info[0] >= 3

# Columns of the dictionary returned by get_nist_out_as_dict, in output order.
_NIST_COLUMNS = ('id', 'compound_name', 'formula', 'lib_name', 'id_in_lib',
                 'mf', 'rmf', 'prob', 'cas', 'mw')

# (prefix of a ';'-separated field in a hit line, output column it fills).
_NIST_FIELDS = (
    (' MF: ', 'mf'),
    (' RMF: ', 'rmf'),
    (' Prob: ', 'prob'),
    (' CAS:', 'cas'),
    (' Mw: ', 'mw'),
    (' Id: ', 'id_in_lib'),
)


def _open_csv(path, mode):
    '''
    Opens a file in the way the csv module expects on the running Python.
    @param path: file to open
    @param mode: 'r' for reading or 'w' for writing
    '''
    if _PY3:
        # newline='' lets the csv module do its own line-ending handling
        return open(path, mode, newline='', encoding='utf-8')
    # Python 2: csv wants universal newlines for reading, binary for writing
    return open(path, 'rU' if mode == 'r' else mode + 'b')


def _open_text(path):
    '''
    Opens a text file for reading. On Python 3 the content is decoded as
    UTF-8 with undecodable bytes replaced; on Python 2 raw strings are read.
    @param path: file to open
    '''
    if _PY3:
        return open(path, encoding='utf-8', errors='replace')
    return open(path)


def _to_text(value):
    '''
    Returns value as unicode text, decoding byte strings as UTF-8 and
    replacing undecodable bytes.
    see http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
    '''
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def copy_dir(src, dst):
    '''
    Recursively copies directory src to dst using shutil.copytree.
    @param src: directory to copy
    @param dst: target directory
    '''
    shutil.copytree(src, dst)


def copy_file(src, dst):
    '''
    Copies file src to dst using shutil.copy.
    @param src: file to copy
    @param dst: target file or directory
    '''
    shutil.copy(src, dst)


def get_process_list():
    '''
    Returns the output of "ps -A" as a list of text lines.
    '''
    # universal_newlines=True yields text (not bytes) on Python 3 as well, so
    # callers can compare the lines against ordinary strings
    p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE,
                         universal_newlines=True)
    out, _ = p.communicate()
    return out.splitlines()


def get_process_pid(process_name):
    '''
    Returns the pid found in the first column of the last "ps -A" line that
    contains process_name, or -1 when no such line exists.
    @param process_name: text to look for in the process list lines
    '''
    pid = -1
    for line in get_process_list():
        if process_name not in line:
            continue
        columns = line.split(None, 1)
        # skip lines without a numeric first column (e.g. the ps header)
        if columns and columns[0].isdigit():
            pid = int(columns[0])
    return pid


def get_as_dict(in_tsv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns.
    The first row is used as header; every header name maps to the list of
    values found in that column. Completely empty rows are skipped and an
    empty file yields an empty dictionary.
    @param in_tsv: input filename to be parsed
    '''
    with _open_csv(in_tsv, 'r') as in_file:
        data = [row for row in csv.reader(in_file, delimiter='\t') if row]
    # Create dictionary with column name as key (in file column order)
    output = OrderedDict()
    if not data:
        return output
    header = data.pop(0)
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output


def save_dict_as_tsv(dict, out_tsv):
    '''
    Writes tab-separated data to file: one header row with the dictionary
    keys, followed by one row per record. The number of records is taken
    from the first column.
    @param dict: dictionary mapping column names to lists of values
    @param out_tsv: output tsv file
    '''
    # The parameter name shadows the builtin and is only kept for backward
    # compatibility with existing callers.
    data = dict
    columns = list(data.keys())
    record_count = len(data[columns[0]]) if columns else 0

    # The with-block guarantees the file is flushed and closed
    with _open_csv(out_tsv, 'w') as out_file:
        output_writer = csv.writer(out_file, delimiter="\t")

        # Write headers
        output_writer.writerow(columns)

        # Write records
        for record_index in range(record_count):
            output_writer.writerow([data[k][record_index] for k in columns])


def _parse_nist_hit(title_row, hit):
    '''
    Converts one hit line into a dictionary with one value per output column.
    @param title_row: text of the most recent "Unknown" line
    @param hit: the hit line split on '<<'
    '''
    if len(hit) < 4:
        raise ValueError('Error: unexpected hit line in NIST output: ' + '<<'.join(hit))

    compound_name = hit[1].partition('>>')[0]
    formula, _, formula_tail = hit[2].partition('>>')
    lib_name, _, lib_tail = hit[3].partition('>>')

    record = {
        'id': title_row.split(': ')[1].split(' ')[0],
        'compound_name': _to_text(compound_name),
        'formula': formula,
        'lib_name': lib_name,
    }

    for field in (formula_tail + lib_tail).split(';'):
        for prefix, column in _NIST_FIELDS:
            if field.startswith(prefix):
                if column in record:
                    raise ValueError('Error: duplicate field in NIST output: ' + field)
                record[column] = field[len(prefix):]
                break
        else:
            if field != '' and field != ' Lib: ':
                raise ValueError('Error: unexpected field in NIST output: ' + field)

    if len(record) != len(_NIST_COLUMNS):
        raise ValueError('Error: did not find all expected fields in NIST output')

    # the library id is terminated by a '.', which is not part of the id
    record['id_in_lib'] = record['id_in_lib'].rstrip().rstrip('.')
    return record


def get_nist_out_as_dict(nist_result_file):
    '''
    Method to parse NIST specific output into a dictionary.
    Every line starting with "Hit" adds one value to each of the columns
    id, compound_name, formula, lib_name, id_in_lib, mf, rmf, prob, cas and
    mw; the id is taken from the preceding line starting with "Unknown".
    Other lines are ignored.
    @param nist_result_file: result file as produced by NIST nistms$.exe
    @raise ValueError: when a hit line precedes any "Unknown" line, or has
                       unexpected, duplicate or missing fields
    '''
    # Create dictionary with column name as key
    output = OrderedDict((column, []) for column in _NIST_COLUMNS)

    title_row = None
    with _open_text(nist_result_file) as result_file:
        for line in result_file:
            row = line.rstrip('\r\n').split('<<')
            if row[0].startswith('Unknown'):
                title_row = row[0]
                continue
            if not row[0].startswith('Hit'):
                # neither a title nor a hit (e.g. a blank line): nothing to add
                continue
            if title_row is None:
                raise ValueError('Error: hit found before any "Unknown" line in NIST output')

            # parse the complete hit first so that a malformed line cannot
            # leave the columns with different lengths
            record = _parse_nist_hit(title_row, row)
            for column in _NIST_COLUMNS:
                output[column].append(record[column])

    return output


def get_spectra_file_as_dict(spectrum_file):
    '''
    Method to parse spectra file in NIST MSP input format into a dictionary.
    The idea is to parse the following :

    Name: spectrum1
    DB#: 1
    Num Peaks: 87
    14 8; 15 15; 27 18; 28 15; 29 15;
    30 11; 32 19; 39 32; 40 12; 41 68;

    into:

    dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"

    Peak lines (lines starting with a digit) are concatenated exactly as
    found, without their line ending; no separator is inserted between them.
    Peak lines that are not preceded by a "Name: " line are ignored.

    @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
    '''
    output = OrderedDict()
    name = ''
    peak_lines = []
    with _open_text(spectrum_file) as spectra:
        for raw_line in spectra:
            line = raw_line.rstrip('\r\n')
            if line.startswith('Name: '):
                if name != '':
                    # store spectrum:
                    output[name] = ''.join(peak_lines)
                name = line[len('Name: '):]
                peak_lines = []
            elif line[:1].isdigit():
                # parse spectra:
                peak_lines.append(line)

    # store also last spectrum (if there was any):
    if name != '':
        output[name] = ''.join(peak_lines)

    return output