utils.py @ 0:cce6989ed423
new NIST wrapper demo tools

author    | pieter.lukasse@wur.nl
date      | Thu, 22 Jan 2015 16:14:57 +0100
parents   | (none)
children  | 8c20185752da
'''
Created on 31 dec. 2014

@author: lukas007
'''
import shutil
import subprocess
import csv
from collections import OrderedDict

def copy_dir(src, dst):
    shutil.copytree(src, dst)


def copy_file(src, dst):
    shutil.copy(src, dst)

def get_process_list():
    # return the output lines of 'ps -A' (one entry per running process)
    p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
    out, err = p.communicate()
    return out.splitlines()

def get_process_pid(process_name):
    # return the pid of the last process whose 'ps' line contains
    # process_name, or -1 if no matching process is found
    pid = -1
    for line in get_process_list():
        if process_name in line:
            pid = int(line.split(None, 1)[0])
    return pid

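# Usage sketch (not part of the original module; the process name below is an
# assumption based on the nistms$.exe engine mentioned further down): poll
# until the NIST search process has exited.
def _example_wait_for_process(process_name='nistms$'):
    import time
    while get_process_pid(process_name) != -1:
        time.sleep(1)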

def get_as_dict(in_tsv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns
    @param in_tsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_tsv, 'rU'), delimiter='\t'))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output

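# Usage sketch (not part of the original module; the file name and column
# names are hypothetical): for a TSV file 'compounds.txt' with the header row
# "name<TAB>mass" and two data rows, get_as_dict maps each column header to
# the list of string values in that column.
def _example_get_as_dict_usage():
    columns = get_as_dict('compounds.txt')
    return columns['name'], columns['mass']  # e.g. (['alanine', 'glycine'], ['89.09', '75.07'])
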
def save_dict_as_tsv(dict, out_tsv):
    '''
    Writes tab-separated data to file
    @param dict: dictionary containing the dataset, one list of values per column name
    @param out_tsv: output tsv file
    '''

    # Open output file for writing
    out_file = open(out_tsv, 'wb')
    output_writer = csv.writer(out_file, delimiter="\t")

    # Write headers
    output_writer.writerow(list(dict.keys()))

    # Write one row per record, preserving the column order of the keys
    for record_index in xrange(len(dict[dict.keys()[0]])):
        row = [dict[k][record_index] for k in dict]
        output_writer.writerow(row)


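# Usage sketch (not part of the original module; values and file name are
# hypothetical): build a small column dictionary and write it out. An
# OrderedDict keeps the column order stable in the resulting file.
def _example_save_dict_as_tsv_usage():
    columns = OrderedDict()
    columns['id'] = ['1', '2']
    columns['compound_name'] = ['alanine', 'glycine']
    save_dict_as_tsv(columns, 'example_out.txt')
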
def get_nist_out_as_dict(nist_result_file):
    '''
    Method to parse NIST specific output into a dictionary.
    @param nist_result_file: result file as produced by NIST nistms$.exe
    '''
    # Create dictionary with column name as key
    output = OrderedDict()
    output['id'] = []
    output['compound_name'] = []
    output['formula'] = []
    output['lib_name'] = []
    output['id_in_lib'] = []
    output['mf'] = []
    output['rmf'] = []
    output['prob'] = []
    output['cas'] = []
    output['mw'] = []


    for line in open(nist_result_file):
        row = line.split('<<')
        if row[0].startswith('Unknown'):
            title_row = row[0]
            continue
        elif row[0].startswith('Hit'):
            hit = row

            output['id'].append(title_row.split(': ')[1].split(' ')[0])
            output['compound_name'].append((hit[1].split('>>')[0]).decode('utf-8', 'replace'))  # see http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
            output['formula'].append(hit[2].split('>>')[0])
            output['lib_name'].append(hit[3].split('>>')[0])

            other_fields_list = (hit[2].split('>>')[1] + hit[3].split('>>')[1]).split(';')
            count = 0
            for field in other_fields_list:
                if field.startswith(' MF: '):
                    count += 1
                    output['mf'].append(field.split('MF: ')[1])
                elif field.startswith(' RMF: '):
                    count += 1
                    output['rmf'].append(field.split('RMF: ')[1])
                elif field.startswith(' Prob: '):
                    count += 1
                    output['prob'].append(field.split('Prob: ')[1])
                elif field.startswith(' CAS:'):
                    count += 1
                    output['cas'].append(field.split('CAS:')[1])
                elif field.startswith(' Mw: '):
                    count += 1
                    output['mw'].append(field.split('Mw: ')[1])
                elif field.startswith(' Id: '):
                    count += 1
                    output['id_in_lib'].append(field.split('Id: ')[1][0:-2])  # [0:-2] strips the trailing '.' and newline
                elif field != '' and field != ' Lib: ':
                    raise Exception('Error: unexpected field in NIST output: ' + field)

            if count != 6:
                raise Exception('Error: did not find all expected fields in NIST output')

    return output

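# Usage sketch (not part of the original module). The two sample lines below
# are inferred from the parsing logic above and only approximate what
# nistms$.exe actually writes; real result files may differ in detail.
def _example_get_nist_out_as_dict_usage():
    sample = ('Unknown: spec1 Compound in Library Factor = -313\n'
              'Hit 1  : <<Alanine>>; <<C3H7NO2>>; MF: 678; RMF: 701; '
              'Prob: 23.41; CAS:56-41-7; Mw: 89; Lib: <<mainlib>>; Id: 1234.\n')
    with open('example_nist_out.txt', 'w') as sample_file:
        sample_file.write(sample)
    result = get_nist_out_as_dict('example_nist_out.txt')
    # result['id'] == ['spec1'], result['compound_name'] == [u'Alanine'],
    # result['mf'] == ['678'], result['id_in_lib'] == ['1234'], ...
    return result
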
def get_spectra_file_as_dict(spectrum_file):
    '''
    Method to parse a spectra file in NIST MSP input format into a dictionary.
    The idea is to parse the following:

    Name: spectrum1
    DB#: 1
    Num Peaks: 87
    14 8; 15 15; 27 18; 28 15; 29 15;
    30 11; 32 19; 39 32; 40 12; 41 68;

    into:

    dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"

    @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
    '''

    output = OrderedDict()
    name = ''
    spectrum = ''
    for line in open(spectrum_file):
        if line.startswith('Name: '):
            if name != '':
                # store the previous spectrum:
                output[name] = spectrum
            name = line.split('Name: ')[1].replace('\n', '')
            spectrum = ''
        elif line[0].isdigit():
            # append this line of peaks to the current spectrum:
            spectrum += line.replace('\n', '')

    # also store the last spectrum:
    output[name] = spectrum

    return output

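# Usage sketch (not part of the original module; file names are placeholders,
# and it is an assumption that the 'Unknown' ids in the NIST result match the
# 'Name:' entries of the MSP file): combine the helpers above into a small
# report that adds each unknown's peak list to its NIST hits and saves a TSV.
def _example_workflow():
    spectra = get_spectra_file_as_dict('input_spectra.msp')
    nist_hits = get_nist_out_as_dict('nist_result.txt')
    nist_hits['spectrum'] = [spectra.get(unknown_id, '') for unknown_id in nist_hits['id']]
    save_dict_as_tsv(nist_hits, 'nist_hits.txt')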