annotate utils.py @ 11:a62374fcee9c

change
author pieter.lukasse@wur.nl
date Fri, 23 Jan 2015 09:48:58 +0100
parents 8c20185752da
children c21e96bb68c8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
1 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
2 Created on 31 dec. 2014
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
3
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
4 @author: lukas007
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
5 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
6 import shutil
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
7 import subprocess
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
8 import csv
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
9 from collections import OrderedDict
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
10
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
11 def copy_dir(src, dst):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
12 shutil.copytree(src, dst)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
13
9
8c20185752da better logging
pieter.lukasse@wur.nl
parents: 0
diff changeset
14 def log_message(log_file, log_message):
8c20185752da better logging
pieter.lukasse@wur.nl
parents: 0
diff changeset
15 with open(log_file, "a") as text_file:
8c20185752da better logging
pieter.lukasse@wur.nl
parents: 0
diff changeset
16 text_file.write(log_message + "\n")
8c20185752da better logging
pieter.lukasse@wur.nl
parents: 0
diff changeset
17
0
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
18 def copy_file(src, dst):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
19 shutil.copy(src, dst)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
20
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
21 def get_process_list():
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
22 p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
23 out, err = p.communicate()
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
24 return out.splitlines()
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
25
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
26 def get_process_pid(process_name):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
27 pid = -1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
28 for line in get_process_list():
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
29 if process_name in line:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
30 pid = int(line.split(None, 1)[0])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
31 return pid
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
32
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
33
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
34 def get_as_dict(in_tsv):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
35 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
36 Generic method to parse a tab-separated file returning a dictionary with named columns
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
37 @param in_tsv: input filename to be parsed
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
38 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
39 data = list(csv.reader(open(in_tsv, 'rU'), delimiter='\t'))
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
40 header = data.pop(0)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
41 # Create dictionary with column name as key
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
42 output = {}
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
43 for index in xrange(len(header)):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
44 output[header[index]] = [row[index] for row in data]
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
45 return output
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
46
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
47 def save_dict_as_tsv(dict, out_tsv):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
48 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
49 Writes tab-separated data to file
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
50 @param data: dictionary containing merged dataset
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
51 @param out_tsv: output tsv file
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
52 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
53
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
54 # Open output file for writing
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
55 out_file = open(out_tsv, 'wb')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
56 output_writer = csv.writer(out_file, delimiter="\t")
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
57
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
58 # Write headers
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
59 output_writer.writerow(list(dict.keys()))
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
60
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
61 # Write
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
62 for record_index in xrange(len(dict[dict.keys()[0]])):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
63 row = [dict[k][record_index] for k in dict]
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
64 output_writer.writerow(row)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
65
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
66
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
67
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
68
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
69 def get_nist_out_as_dict(nist_result_file):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
70 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
71 Method to parse NIST specific output into a dictionary.
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
72 @param nist_result_file: result file as produced by NIST nistms$.exe
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
73 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
74 # Create dictionary with column name as key
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
75 output = OrderedDict()
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
76 output['id'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
77 output['compound_name'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
78 output['formula'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
79 output['lib_name'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
80 output['id_in_lib'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
81 output['mf'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
82 output['rmf'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
83 output['prob'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
84 output['cas'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
85 output['mw'] = []
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
86
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
87
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
88 for line in open(nist_result_file):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
89 row = line.split('<<')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
90 if row[0].startswith('Unknown'):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
91 title_row = row[0]
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
92 continue
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
93 elif row[0].startswith('Hit'):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
94 hit = row
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
95
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
96 output['id'].append(title_row.split(': ')[1].split(' ')[0])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
97 output['compound_name'].append((hit[1].split('>>')[0]).decode('utf-8', errors='replace')) # see http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
98 output['formula'].append(hit[2].split('>>')[0])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
99 output['lib_name'].append(hit[3].split('>>')[0])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
100
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
101 other_fields_list = (hit[2].split('>>')[1] + hit[3].split('>>')[1]).split(';')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
102 count = 0
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
103 for field in other_fields_list:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
104 if field.startswith(' MF: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
105 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
106 output['mf'].append(field.split('MF: ')[1])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
107 elif field.startswith(' RMF: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
108 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
109 output['rmf'].append(field.split('RMF: ')[1])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
110 elif field.startswith(' Prob: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
111 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
112 output['prob'].append(field.split('Prob: ')[1])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
113 elif field.startswith(' CAS:'):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
114 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
115 output['cas'].append(field.split('CAS:')[1])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
116 elif field.startswith(' Mw: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
117 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
118 output['mw'].append(field.split('Mw: ')[1])
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
119 elif field.startswith(' Id: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
120 count += 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
121 output['id_in_lib'].append(field.split('Id: ')[1][0:-2]) # the [0:-2] is to avoid the last 2 characters, namely a '.' and a \n
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
122 elif field != '' and field != ' Lib: ':
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
123 raise Exception('Error: unexpected field in NIST output: ' + field)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
124
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
125 if count != 6:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
126 raise Exception('Error: did not find all expected fields in NIST output')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
127
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
128 return output
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
129
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
130 def get_spectra_file_as_dict(spectrum_file):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
131 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
132 Method to parse spectra file in NIST MSP input format into a dictionary.
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
133 The idea is to parse the following :
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
134
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
135 Name: spectrum1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
136 DB#: 1
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
137 Num Peaks: 87
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
138 14 8; 15 15; 27 18; 28 15; 29 15;
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
139 30 11; 32 19; 39 32; 40 12; 41 68;
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
140
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
141 into:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
142
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
143 dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
144
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
145 @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
146 '''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
147
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
148 output = OrderedDict()
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
149 name = ''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
150 spectrum = ''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
151 for line in open(spectrum_file):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
152 if line.startswith('Name: '):
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
153 if name != '':
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
154 # store spectrum:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
155 output[name] = spectrum
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
156 name = line.split('Name: ')[1].replace('\n','')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
157 spectrum = ''
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
158 elif line[0].isdigit():
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
159 # parse spectra:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
160 spectrum += line.replace('\n','')
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
161
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
162 # store also last spectrum:
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
163 output[name] = spectrum
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
164
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
165 return output
cce6989ed423 new NIST wrapper demo tools
pieter.lukasse@wur.nl
parents:
diff changeset
166