Mercurial > repos > pieterlukasse > nist_wrapper
view utils.py @ 2:f6da901ffff8
jinja solution
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 22 Jan 2015 16:55:19 +0100 |
parents | cce6989ed423 |
children | 8c20185752da |
line wrap: on
line source
'''
Created on 31 dec. 2014

@author: lukas007

Small helper functions: file/directory copying, process lookup via ``ps``,
reading/writing tab-separated files and parsing NIST result / MSP files.
'''
import shutil
import subprocess
import csv
import io
import sys
from collections import OrderedDict

# The csv module wants differently opened files on Python 2 and Python 3.
_PY3 = sys.version_info[0] >= 3

# Columns of the dictionary returned by get_nist_out_as_dict, in output order.
_NIST_COLUMNS = ('id', 'compound_name', 'formula', 'lib_name', 'id_in_lib',
                 'mf', 'rmf', 'prob', 'cas', 'mw')

# (prefix the ';'-separated field starts with, token preceding the value,
#  output column). Note that 'CAS:' has no trailing space in the NIST output.
_NIST_HIT_FIELDS = (
    (' MF: ', 'MF: ', 'mf'),
    (' RMF: ', 'RMF: ', 'rmf'),
    (' Prob: ', 'Prob: ', 'prob'),
    (' CAS:', 'CAS:', 'cas'),
    (' Mw: ', 'Mw: ', 'mw'),
    (' Id: ', 'Id: ', 'id_in_lib'),
)


def _open_tsv(path, write=False):
    '''
    Opens a file the way the csv module expects it for the running
    Python version.
    @param path: name of the file to open
    @param write: True to open for writing, False (default) for reading
    '''
    if _PY3:
        # Python 3: text mode, newline handling left to the csv module
        return open(path, 'w' if write else 'r', newline='')
    # Python 2: same modes as this module has always used
    return open(path, 'wb' if write else 'rU')


def copy_dir(src, dst):
    '''
    Recursively copies directory src to dst (thin wrapper around
    shutil.copytree).
    '''
    shutil.copytree(src, dst)


def copy_file(src, dst):
    '''
    Copies file src to dst (thin wrapper around shutil.copy).
    '''
    shutil.copy(src, dst)


def get_process_list():
    '''
    Runs "ps -A" and returns its output as a list of text lines.
    '''
    # universal_newlines=True makes the output text (not bytes) on Python 3,
    # so that callers can do plain string matching on the lines.
    p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE,
                         universal_newlines=True)
    out, _ = p.communicate()
    return out.splitlines()


def get_process_pid(process_name):
    '''
    Returns the pid found in the first column of the last "ps -A" line that
    contains process_name, or -1 when no such line exists.
    @param process_name: (part of) the process name to look for
    '''
    pid = -1
    for line in get_process_list():
        if process_name in line:
            columns = line.split(None, 1)
            # Skip the header line (or blank lines) that happen to match
            if columns and columns[0].isdigit():
                pid = int(columns[0])
    return pid


def get_as_dict(in_tsv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with
    named columns. The first row is used as header; an empty file yields an
    empty dictionary.
    @param in_tsv: input filename to be parsed
    '''
    with _open_tsv(in_tsv) as in_file:
        data = list(csv.reader(in_file, delimiter='\t'))
    if not data:
        return {}
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output


def save_dict_as_tsv(dict, out_tsv):
    '''
    Writes tab-separated data to file: one header row with the keys followed
    by one row per record. The number of records is taken from the first
    column.
    @param dict: dictionary mapping column name to list of column values
                 (the name shadows the builtin but is kept so that existing
                 callers keep working)
    @param out_tsv: output tsv file
    '''
    data = dict
    column_names = list(data.keys())
    record_count = len(data[column_names[0]]) if column_names else 0
    # The with-block guarantees the file is flushed and closed
    with _open_tsv(out_tsv, write=True) as out_file:
        output_writer = csv.writer(out_file, delimiter="\t")
        # Write headers
        output_writer.writerow(column_names)
        # Write records
        for record_index in range(record_count):
            output_writer.writerow([data[k][record_index]
                                    for k in column_names])


def _append_nist_hit(output, title_row, hit):
    '''
    Parses one "Hit" line (already split on '<<') and appends its values to
    the column lists in output.
    @param output: dictionary with one list per NIST column
    @param title_row: the "Unknown: ..." text this hit belongs to
    @param hit: the hit line split on '<<'
    '''
    output['id'].append(title_row.split(': ')[1].split(' ')[0])
    output['compound_name'].append(hit[1].split('>>')[0])
    output['formula'].append(hit[2].split('>>')[0])
    output['lib_name'].append(hit[3].split('>>')[0])
    other_fields_list = (hit[2].split('>>')[1] +
                         hit[3].split('>>')[1]).split(';')
    count = 0
    for field in other_fields_list:
        for prefix, token, column in _NIST_HIT_FIELDS:
            if field.startswith(prefix):
                value = field.split(token)[1]
                if column == 'id_in_lib':
                    # The Id is the last field of the line: drop the line
                    # ending and the closing '.' without assuming that both
                    # are always present.
                    value = value.rstrip()
                    if value.endswith('.'):
                        value = value[:-1]
                output[column].append(value)
                count += 1
                break
        else:
            if field != '' and field != ' Lib: ':
                raise Exception('Error: unexpected field in NIST output: ' +
                                field)
    if count != 6:
        raise Exception('Error: did not find all expected fields in NIST output')


def get_nist_out_as_dict(nist_result_file):
    '''
    Method to parse NIST specific output into a dictionary.
    The file is read as UTF-8; bytes that can not be decoded are replaced,
    so compound names never make the parsing fail.
    @param nist_result_file: result file as produced by NIST nistms$.exe
    @return: OrderedDict with one list per column (id, compound_name,
             formula, lib_name, id_in_lib, mf, rmf, prob, cas, mw)
    @raise Exception: when a hit line has unexpected or missing fields, or
                      when a hit appears before any "Unknown" line
    '''
    # Create dictionary with column name as key
    output = OrderedDict()
    for column in _NIST_COLUMNS:
        output[column] = []

    title_row = None
    with io.open(nist_result_file, encoding='utf-8',
                 errors='replace') as nist_file:
        for line in nist_file:
            row = line.split('<<')
            if row[0].startswith('Unknown'):
                title_row = row[0]
            elif row[0].startswith('Hit'):
                if title_row is None:
                    raise Exception('Error: found a hit before any '
                                    '"Unknown" line in NIST output')
                _append_nist_hit(output, title_row, row)
    return output


def get_spectra_file_as_dict(spectrum_file):
    '''
    Method to parse spectra file in NIST MSP input format into a dictionary.
    The idea is to parse the following :

        Name: spectrum1
        DB#: 1
        Num Peaks: 87
        14 8; 15 15; 27 18; 28 15; 29 15;
        30 11; 32 19; 39 32; 40 12; 41 68;

    into:

        dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"

    @param spectrum_file: spectra file in MSP format (e.g. also the format
                          returned by MsClust)
    '''
    output = OrderedDict()
    name = ''
    peak_lines = []
    with open(spectrum_file) as spectra:
        for line in spectra:
            if line.startswith('Name: '):
                if name != '':
                    # store spectrum:
                    output[name] = ''.join(peak_lines)
                name = line.split('Name: ')[1].replace('\n', '')
                peak_lines = []
            elif line[:1].isdigit():
                # parse spectra:
                peak_lines.append(line.replace('\n', ''))
    # store also last spectrum (as before, this also happens when the file
    # contained no "Name: " line at all, giving an entry with key ''):
    output[name] = ''.join(peak_lines)
    return output