1 from __future__ import division
2 import os
3 import sys
4 import pandas as pd
5 import collections
6 import pickle as pk
7 import argparse
8 from sklearn.cluster import KMeans
9 import matplotlib
10 matplotlib.use('GTKAgg')
11 import matplotlib.pyplot as plt
13 ########################## argparse ###########################################
def process_args(args):
    """Build the tool's command-line parser and return the parsed options.

    The ``args`` parameter is accepted for call-site compatibility but is not
    consulted: ``parse_args()`` is invoked without an explicit argument list,
    so argparse reads the options from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: one attribute per option declared below.
    """
    # (flags, keyword arguments) for every option, in help-output order.
    option_table = (
        (('-rs', '--rules_selector'),
         {'type': str,
          'default': 'HMRcore',
          'choices': ['HMRcore', 'Recon', 'Custom'],
          'help': 'chose which type of dataset you want use'}),
        (('-cr', '--custom'),
         {'type': str,
          'help': 'your dataset if you want custom rules'}),
        (('-ch', '--cond_hier'),
         {'type': str,
          'default': 'no',
          'choices': ['no', 'yes'],
          'help': 'chose if you wanna hierical dendrogram'}),
        (('-lk', '--k_min'),
         {'type': int,
          'help': 'min number of cluster'}),
        (('-uk', '--k_max'),
         {'type': int,
          'help': 'max number of cluster'}),
        (('-li', '--linkage'),
         {'type': str,
          'choices': ['single', 'complete', 'average'],
          'help': 'linkage hierarchical cluster'}),
        (('-d', '--data'),
         {'type': str,
          'required': True,
          'help': 'input dataset'}),
        (('-n', '--none'),
         {'type': str,
          'default': 'true',
          'choices': ['true', 'false'],
          'help': 'compute Nan values'}),
        (('-td', '--tool_dir'),
         {'type': str,
          'required': True,
          'help': 'your tool directory'}),
        (('-na', '--name'),
         {'type': str,
          'help': 'name of dataset'}),
        (('-de', '--dendro'),
         {'help': "Dendrogram out"}),
        (('-ol', '--out_log'),
         {'help': "Output log"}),
        (('-el', '--elbow'),
         {'help': "Out elbow"}),
    )
    parser = argparse.ArgumentParser(
        usage='%(prog)s [options]',
        description="process some value's genes to create class.")
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
67 ########################### warning ###########################################
def warning(s):
    """Append the message ``s`` to the log file given by ``--out_log``.

    The command line is re-parsed on every call to find the log path.
    """
    log_path = process_args(sys.argv).out_log
    with open(log_path, 'a') as handle:
        handle.write(s)
74 ############################ dataset input ####################################
def read_dataset(data, name):
    """Load a tab-separated table (first row = header) into a DataFrame.

    Args:
        data: path or buffer passed straight to ``pandas.read_csv``.
        name: dataset label, used only in the error message.

    Returns:
        pandas.DataFrame: the parsed table, with at least two columns.

    Raises:
        SystemExit: if the input is empty, cannot be tokenized as a table
            (e.g. rows with more fields than the header), or has fewer
            than two columns.
    """
    try:
        dataset = pd.read_csv(data, sep='\t', header=0)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # A ragged/malformed table is a format error too: report it with the
        # same message instead of letting the pandas traceback escape.
        sys.exit('Execution aborted: wrong format of ' + name + '\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong format of ' + name + '\n')
    return dataset
85 ############################ dataset name #####################################
def name_dataset(name_data, count):
    """Return the dataset label as a string.

    The placeholder label ``'Dataset'`` gets ``'_<count>'`` appended; any
    other label is returned unchanged (after conversion to ``str``).
    """
    label = str(name_data)
    if label != 'Dataset':
        return label
    return label + '_' + str(count)
############################ load ids and rules ###############################
def load_id_rules(reactions):
    """Split a mapping into parallel lists of its keys and its values.

    Returns:
        tuple: ``(ids, rules)`` where ``ids[i]`` is the key whose value is
        ``rules[i]``, in the mapping's iteration order.
    """
    pairs = list(reactions.items())
    ids = [reaction_id for reaction_id, _ in pairs]
    rules = [rule for _, rule in pairs]
    return (ids, rules)
102 ############################ check_methods ####################################
def gene_type(l, name):
    """Classify the gene identifier ``l`` by its textual format.

    The checks run in a fixed order (HGNC, Ensembl, symbol, Entrez) and the
    first one that accepts ``l`` decides the result.

    Args:
        l: gene identifier string to classify.
        name: dataset label, used only in the error message.

    Returns:
        str: ``'hugo_id'``, ``'ensembl_gene_id'``, ``'symbol'`` or
        ``'entrez_id'``.

    Raises:
        SystemExit: if none of the checks accepts ``l``.
    """
    detectors = (
        (check_hgnc, 'hugo_id'),
        (check_ensembl, 'ensembl_gene_id'),
        (check_symbol, 'symbol'),
        (check_entrez, 'entrez_id'),
    )
    for accepts, label in detectors:
        if accepts(l):
            return label
    # The original message read "Supported IDtypes" (missing space between
    # the concatenated literals) and misspelled Ensembl.
    sys.exit('Execution aborted:\n' +
             'gene ID type in ' + name + ' not supported. Supported ID ' +
             'types are: HUGO ID, Ensembl ID, HUGO symbol, Entrez ID\n')
def check_hgnc(l):
    """Return True if ``l`` is ``HGNC:`` (any case) followed only by digits."""
    if len(l) <= 5:
        return False
    if not l.upper().startswith('HGNC:'):
        return False
    return l[5:].isdigit()
def check_ensembl(l):
    """Return True if ``l`` has the shape of an Ensembl identifier.

    Accepted shape: exactly 15 characters, starting with ``ENS`` (any case),
    with only digits from the fifth character onwards.
    """
    if len(l) != 15:
        return False
    if not l.upper().startswith('ENS'):
        return False
    return l[4:].isdigit()
def check_symbol(l):
    """Return True if ``l`` is a letter followed by alphanumeric characters.

    A single-character string is rejected because the remainder after the
    first character is empty and ``''.isalnum()`` is False.
    """
    if len(l) == 0:
        return False
    return l[0].isalpha() and l[1:].isalnum()
def check_entrez(l):
    """Return True if ``l`` is a non-empty string made only of digits."""
    return l.isdigit() if len(l) > 0 else False
def check_bool(b):
    """Map ``'true'``/``'false'`` to the matching bool; anything else gives None."""
    if b == 'true':
        return True
    if b == 'false':
        return False
    return None
157 ############################ make recon #######################################
def check_and_doWord(l):
    """Tokenize a rule given as a list of characters, checking parentheses.

    The list ``l`` is consumed from the front while scanning. Spaces are
    dropped, each parenthesis becomes its own token, and any other run of
    characters up to the next space or parenthesis becomes one word token.

    Returns:
        tuple | bool: ``(tokens, genes)`` where ``genes`` holds every word
        token other than ``'or'``/``'and'``; ``False`` if a closing
        parenthesis precedes its opener or the parentheses do not balance.
    """
    delimiters = (' ', '(', ')')
    tokens = []
    genes = []
    depth = 0
    while l:
        if depth < 0:
            # More ')' than '(' so far, with input still pending.
            return False
        head = l.pop(0)
        if head == '(':
            depth += 1
            tokens.append(head)
        elif head == ')':
            depth -= 1
            tokens.append(head)
        elif head != ' ':
            letters = [head]
            while l and l[0] not in delimiters:
                letters.append(l.pop(0))
            word = ''.join(letters)
            tokens.append(word)
            if word not in ('or', 'and'):
                genes.append(word)
    if depth != 0:
        return False
    return (tokens, genes)
def brackets_to_list(l):
    """Turn a flat token list with parentheses into nested lists.

    Tokens are consumed from the front of ``l``. Each ``'('`` starts a
    sub-list that ``resolve_brackets`` builds up to the matching ``')'``;
    every other token is copied as is.
    """
    nested = []
    while l:
        token = l.pop(0)
        nested.append(resolve_brackets(l) if token == '(' else token)
    return nested
def resolve_brackets(l):
    """Collect tokens up to the matching ``')'`` into a (possibly nested) list.

    Called just after an opening parenthesis has been removed from ``l``.
    Tokens are consumed from the front of ``l``, including the closing
    parenthesis; nested ``'('`` tokens recurse. Running out of tokens before
    a ``')'`` raises ``IndexError``.
    """
    group = []
    while True:
        token = l.pop(0)
        if token == ')':
            return group
        if token == '(':
            group.append(resolve_brackets(l))
        else:
            group.append(token)
def priorityAND(l):
    """Group ``and`` chains so that ``and`` binds tighter than ``or``.

    ``l`` is a nested token list (operands, ``'and'``/``'or'`` strings and
    sub-lists). Sub-lists are processed recursively. A run of operands
    joined by ``'and'`` is wrapped in its own list whenever an ``'or'`` has
    been seen at this level; when the level contains only ``'and'``
    operators the run is spliced in flat.

    Returns:
        list: a new nested token list; ``l`` itself is not modified.
    """
    def lift(item):
        # Recurse into bracketed sub-expressions, pass plain tokens through.
        return priorityAND(item) if isinstance(item, list) else item

    grouped = []
    only_and = True
    while l:
        if len(l) == 1:
            grouped.append(lift(l[0]))
            l = l[1:]
        elif l[0] == 'or':
            grouped.append(l[0])
            only_and = False
            l = l[1:]
        elif l[1] == 'or':
            grouped.append(lift(l[0]))
            grouped.append(l[1])
            only_and = False
            l = l[2:]
        elif l[1] == 'and':
            and_chain = [lift(l[0]), l[1], lift(l[2])]
            l = l[3:]
            while l:
                if l[0] == 'and':
                    and_chain.append(l[0])
                    and_chain.append(lift(l[1]))
                    l = l[2:]
                elif l[0] == 'or':
                    only_and = False
                    break
            if only_and:
                # Only AND operators at this level so far: keep the chain flat.
                grouped.extend(and_chain)
            else:
                grouped.append(and_chain)
    return grouped
def checkRule(l):
    """Validate the shape of a nested rule token list.

    A one-element list is valid unless its element is a sub-list that fails
    this same check. Lists longer than two elements are delegated to
    ``checkRule2``. Empty and two-element lists are invalid.

    Returns:
        bool: True if the rule is accepted, False otherwise.
    """
    size = len(l)
    if size == 1:
        only = l[0]
        return not (isinstance(only, list) and checkRule(only) is False)
    if size > 2:
        return checkRule2(l) is not False
    return False
def checkRule2(l):
    """Check that ``l`` reads ``operand (operator operand)*``.

    Operators are the strings ``'and'`` and ``'or'``. An operand is either a
    plain token that is not an operator, or a sub-list accepted by
    ``checkRule``.

    Compared with the previous implementation this version:
      * never indexes past the end of ``l`` (inputs such as
        ``['A', 'and', 'B', 'and', 'or']`` used to raise ``IndexError``);
      * rejects sequences with a missing operator between operands
        (``A and B C and D``) or a leading operator (``and B or C``), which
        were accepted even though ``priorityAND`` has no branch that
        consumes them and therefore never terminates on them.

    Returns:
        bool: True if the sequence is well formed (an empty list is
        accepted, as before), False otherwise.
    """
    operators = ('and', 'or')

    def is_operator(token):
        return not isinstance(token, list) and token in operators

    def is_valid_operand(token):
        if isinstance(token, list):
            return checkRule(token) is not False
        return token not in operators

    at_start = True
    while l:
        if len(l) == 1:
            return False
        if at_start:
            # Leading triple: operand operator operand.
            if len(l) < 3 or not is_operator(l[1]):
                return False
            if not (is_valid_operand(l[0]) and is_valid_operand(l[2])):
                return False
            l = l[3:]
            at_start = False
        else:
            # Every continuation is a pair: operator operand.
            if not is_operator(l[0]) or not is_valid_operand(l[1]):
                return False
            l = l[2:]
    return True
def do_rules(rules):
    """Parse every rule string into a nested, AND-prioritized token list.

    Each rule is tokenized (``check_and_doWord``), nested by parentheses
    (``brackets_to_list``), validated (``checkRule``) and regrouped
    (``priorityAND``). Empty rules and rules that fail tokenization or
    validation yield ``[]``; the failing ones are reported together through
    ``warning``.

    Args:
        rules: sequence of rule strings.

    Returns:
        tuple: ``(split_rules, genes)`` where ``split_rules[i]`` is the
        parsed form of ``rules[i]`` and ``genes`` lists (without duplicates,
        in arbitrary order) the gene tokens found in rules that could be
        tokenized.
    """
    split_rules = []
    err_rules = []
    genes_in_rules = set()
    for rule in rules:
        chars = list(rule)
        if not chars:
            split_rules.append([])
            continue
        parsed = check_and_doWord(chars)
        if parsed is False:
            # The tokenizer signals unbalanced parentheses with a bare False;
            # unpacking it directly raised TypeError before this branch
            # could ever run.
            split_rules.append([])
            err_rules.append(rule)
            continue
        tokens, rule_genes = parsed
        genes_in_rules.update(rule_genes)
        nested = brackets_to_list(tokens)
        if checkRule(nested):
            split_rules.append(priorityAND(nested))
        else:
            split_rules.append([])
            err_rules.append(rule)
    if err_rules:
        warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
    return (split_rules, list(genes_in_rules))
def make_recon(data):
    """Load custom GPR rules from an SBML model or a two-column TSV file.

    ``data`` is first read as an SBML model through cobra. If cobra raises
    ``CobraSBMLError`` the same path is read as a tab-separated table whose
    first column holds the reaction ids and whose second column holds the
    rules (missing cells become empty strings; extra columns are ignored
    after a warning).

    Returns:
        tuple: ``(ids, split_rules, gene_in_rule)`` where ``split_rules`` is
        the output of ``do_rules`` and ``gene_in_rule`` maps every gene found
        in the rules to ``'ok'``.

    Raises:
        SystemExit: if the tabular fallback is empty, cannot be parsed, or
            has fewer than two columns.
    """
    try:
        import cobra as cb
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            recon = cb.io.read_sbml_model(data)
        reactions = recon.reactions
        rules = []
        ids = []
        for position in range(len(reactions)):
            reaction = reactions[position]
            rules.append(reaction.gene_reaction_rule)
            ids.append(reaction.id)
    except cb.io.sbml3.CobraSBMLError:
        try:
            table = pd.read_csv(data, sep='\t', dtype=str).fillna('')
            column_count = len(table.columns)
            if column_count < 2:
                sys.exit('Execution aborted: wrong format of ' +
                         'custom GPR rules\n')
            if column_count != 2:
                warning('WARNING: more than 2 columns in custom GPR rules.\n' +
                        'Extra columns have been disregarded\n')
            ids = list(table.iloc[:, 0])
            rules = list(table.iloc[:, 1])
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            sys.exit('Execution aborted: wrong format of custom GPR rules\n')
    split_rules, tmp_genes = do_rules(rules)
    gene_in_rule = dict.fromkeys(tmp_genes, 'ok')
    return (ids, split_rules, gene_in_rule)
361 ############################ resolve_methods ##################################
def replace_gene_value(l, d):
    """Replace the gene tokens of a nested rule with their values from ``d``.

    Sub-lists are processed recursively; every other token goes through
    ``replace_gene``.

    Returns:
        tuple: ``(values, missing)`` where ``values`` mirrors the structure
        of ``l`` and ``missing`` lists the tokens for which ``replace_gene``
        returned ``None``.
    """
    values = []
    missing = []
    for token in l:
        if isinstance(token, list):
            sub_values, sub_missing = replace_gene_value(token, d)
            values.append(sub_values)
            missing.extend(sub_missing)
        else:
            value = replace_gene(token, d)
            values.append(value)
            if value is None:
                missing.append(token)
    return (values, missing)
def replace_gene(l, d):
    """Look up the value of token ``l`` in the mapping ``d``.

    The operators ``'and'`` and ``'or'`` are returned unchanged. Any other
    token is looked up in ``d``.

    Returns:
        The operator string, the numeric value found in ``d``, or ``None``
        when ``l`` is not a key of ``d``.

    Raises:
        SystemExit: if the stored value is neither ``None`` nor an
            ``int``/``float``.
    """
    if l == 'and' or l == 'or':
        return l
    value = d.get(l, None)
    if not (value is None or isinstance(value, (int, float))):
        # str() keeps the clean exit for non-string invalid values; plain
        # concatenation raised TypeError for them instead of this message.
        sys.exit('Execution aborted: ' + str(value) + ' value not valid\n')
    return value
def compute(val1, op, val2, cn):
    """Combine two optional values with a rule operator.

    With both values present: ``'and'`` gives their minimum, any other
    operator gives their sum. With a value missing (``None``): ``'and'``
    yields the remaining value only when ``cn`` is exactly ``True`` and
    ``None`` otherwise; any other operator yields the remaining value.
    ``None`` is returned when both values are missing.
    """
    has_first = val1 is not None
    has_second = val2 is not None
    if has_first and has_second:
        return min(val1, val2) if op == 'and' else val1 + val2
    if op == 'and' and cn is not True:
        return None
    # At most one operand is present here; fall back to it (or to None).
    return val1 if has_first else val2
def control(ris, l, cn):
    """Evaluate a nested rule whose genes were replaced by values.

    A one-element list evaluates to its element when that is a number or
    ``None``, or recursively when it is a sub-list. Lists longer than two
    elements are handed to ``control_list``. Every other shape is invalid.

    Returns:
        The computed value (a number or ``None``), or ``False`` when the
        rule is malformed.
    """
    size = len(l)
    if size > 2:
        return control_list(ris, l, cn)
    if size != 1:
        return False
    only = l[0]
    if isinstance(only, (float, int)) or only is None:
        return only
    if isinstance(only, list):
        return control(None, only, cn)
    return False
def control_list(ris, l, cn):
    """Evaluate a flat sequence of operands and operators left to right.

    ``l`` is consumed in slices: a leading ``operand operator operand``
    triple (3 items) or an ``operator operand`` continuation (2 items).
    Operands are numbers, ``None`` or sub-lists; sub-lists are evaluated
    with ``control``. Each step is combined through ``compute``.

    Args:
        ris: running result, used as the left operand of continuations.
        l: list of values, ``'and'``/``'or'`` strings and sub-lists.
        cn: flag forwarded unchanged to ``compute`` and ``control``.

    Returns:
        The final running result, or ``False`` as soon as an unexpected
        shape is met or a sub-list evaluates to ``False``.
    """
    while l:
        if len(l) == 1:
            # A lone trailing item cannot form a triple or a continuation.
            return False
        elif (isinstance(l[0], (float, int)) or
              l[0] == None) and l[1] in ['and', 'or']:
            # Triple that starts with a plain value.
            if isinstance(l[2], (float, int)) or l[2] == None:
                ris = compute(l[0], l[1], l[2], cn)
            elif isinstance(l[2], list):
                tmp = control(None, l[2], cn)
                if tmp is False:
                    return False
                else:
                    ris = compute(l[0], l[1], tmp, cn)
            else:
                return False
            l = l[3:]
        elif l[0] in ['and', 'or']:
            # Continuation: combine the running result with the next operand.
            if isinstance(l[1], (float, int)) or l[1] == None:
                ris = compute(ris, l[0], l[1], cn)
            elif isinstance(l[1], list):
                tmp = control(None,l[1], cn)
                if tmp is False:
                    return False
                else:
                    ris = compute(ris, l[0], tmp, cn)
            else:
                return False
            l = l[2:]
        elif isinstance(l[0], list) and l[1] in ['and', 'or']:
            # Triple that starts with a bracketed sub-expression.
            if isinstance(l[2], (float, int)) or l[2] == None:
                tmp = control(None, l[0], cn)
                if tmp is False:
                    return False
                else:
                    ris = compute(tmp, l[1], l[2], cn)
            elif isinstance(l[2], list):
                tmp = control(None, l[0], cn)
                tmp2 = control(None, l[2], cn)
                if tmp is False or tmp2 is False:
                    return False
                else:
                    ris = compute(tmp, l[1], tmp2, cn)
            else:
                return False
            l = l[3:]
        else:
            return False
    return ris
475 ############################ gene #############################################
def data_gene(gene, type_gene, name, gene_custom):
    """Validate the expression table and convert it to nested dicts.

    Gene identifiers in the first column that start or end with a space are
    stripped in place. Duplicated identifiers abort the run only when they
    are genes used by the selected rules (bundled pickle for
    HMRcore/Recon, or ``gene_custom`` for custom rules).

    Args:
        gene: pandas DataFrame; first column holds gene ids (strings), the
            remaining columns hold one sample each.
        type_gene: id type key used to select the gene set inside the
            bundled pickle.
        name: dataset label used in error messages.
        gene_custom: mapping gene -> 'ok' for custom rules, or None to use
            the bundled gene set chosen by ``--rules_selector``.

    Returns:
        dict: ``{column: {gene_id: value}}`` as produced by
        ``DataFrame.set_index(first column).to_dict()``.

    Raises:
        SystemExit: on duplicated rule genes or duplicated column labels.
    """
    args = process_args(sys.argv)
    for row in range(len(gene)):
        gene_id = gene.iloc[row, 0]
        if gene_id.startswith(' ') or gene_id.endswith(' '):
            gene.iloc[row, 0] = gene_id.strip()
    gene_dup = [item for item, count in
                collections.Counter(gene[gene.columns[0]]).items()
                if count > 1]
    pat_dup = [item for item, count in
               collections.Counter(list(gene.columns)).items() if count > 1]
    if gene_dup:
        if gene_custom is None:
            if args.rules_selector == 'HMRcore':
                genes_pickle = '/local/HMRcore_genes.p'
            elif args.rules_selector == 'Recon':
                genes_pickle = '/local/Recon_genes.p'
            # NOTE: pickle must only be used on trusted files; these ship
            # with the tool. The context manager closes the handle, which
            # was previously left open.
            with open(args.tool_dir + genes_pickle, 'rb') as handle:
                gene_in_rule = pk.load(handle)
            gene_in_rule = gene_in_rule.get(type_gene)
        else:
            gene_in_rule = gene_custom
        duplicated_rule_genes = [item for item in gene_dup
                                 if gene_in_rule.get(item) == 'ok']
        if duplicated_rule_genes:
            sys.exit('Execution aborted because gene ID '
                     + str(duplicated_rule_genes) + ' in ' + name
                     + ' is duplicated\n')
    if pat_dup:
        sys.exit('Execution aborted: duplicated label\n'
                 + str(pat_dup) + ' in ' + name + '\n')
    return (gene.set_index(gene.columns[0])).to_dict()
510 ############################ resolve ##########################################
def resolve(genes, rules, ids, resolve_none, name):
    """Score every rule for every sample.

    For each ``(sample, expression)`` pair in ``genes`` and each rule, the
    gene tokens are replaced by their expression values
    (``replace_gene_value``) and the rule is evaluated (``control``). Empty
    rules, malformed rules and rules without a computable value score
    ``None``.

    Args:
        genes: mapping sample -> {gene: value}.
        rules: list of parsed rules (nested token lists, possibly empty).
        ids: reaction ids (not used by this function).
        resolve_none: flag forwarded to ``control``.
        name: dataset label used in the error message.

    Returns:
        tuple: ``(scores, missing)`` where ``scores[sample]`` is the list of
        rule scores and ``missing`` lists, without duplicates, the genes that
        had no value.

    Raises:
        SystemExit: if no rule produced a score for any sample.
    """
    scores_by_sample = {}
    missing_genes = []
    scored_any = False
    for sample, expression in genes.items():
        sample_scores = []
        for rule in rules:
            if not rule:
                sample_scores.append(None)
                continue
            substituted, missing = replace_gene_value(rule, expression)
            if missing:
                missing_genes.extend(missing)
            score = control(None, substituted, resolve_none)
            if score is False or score is None:
                sample_scores.append(None)
            else:
                sample_scores.append(score)
                scored_any = True
        scores_by_sample[sample] = sample_scores
    if not scored_any:
        sys.exit('Execution aborted: no computable score' +
                 ' (due to missing gene values) for class '
                 + name + ', the class has been disregarded\n')
    return (scores_by_sample, list(set(missing_genes)))
539 ################################# clustering ##################################
def f_cluster(resolve_rules):
    """Cluster samples with K-means for each k in the requested range.

    Builds a samples x rules table from ``resolve_rules``, drops the rule
    columns whose first-row value is ``None``, then for every k between
    ``--k_min`` and ``--k_max`` (swapped if given in the wrong order) fits
    K-means and writes the assignments to
    ``cluster_out/K=<k>_<name>.tsv``. The inertia of each fit is plotted
    (elbow plot) to ``--elbow``; when ``--cond_hier`` is ``'yes'`` a
    hierarchical dendrogram is saved to ``--dendro``.

    Args:
        resolve_rules: mapping sample -> list of rule scores.

    Returns:
        None
    """
    # Tolerate an existing output directory; a bare makedirs raised on
    # re-runs in the same working directory.
    if not os.path.isdir('cluster_out'):
        os.makedirs('cluster_out')
    args = process_args(sys.argv)
    k_min = args.k_min
    k_max = args.k_max
    if k_min > k_max:
        warning('k range boundaries inverted.\n')
        k_min, k_max = k_max, k_min
    else:
        warning('k range correct.\n')
    cluster_data = pd.DataFrame.from_dict(resolve_rules, orient='index')
    # Explicit positional access: `series[0]` on a label-indexed Series
    # relies on a positional fallback that recent pandas deprecates.
    unscored = [column for column in cluster_data.columns
                if cluster_data[column].iloc[0] is None]
    cluster_data = cluster_data.drop(columns=unscored)
    k_values = range(k_min, k_max + 1)
    distorsion = []
    for k in k_values:
        tmp_kmeans = KMeans(n_clusters=k,
                            n_init=100,
                            max_iter=300,
                            random_state=0).fit(cluster_data)
        distorsion.append(tmp_kmeans.inertia_)
        # Report classes starting from 1 rather than 0.
        predict = [label + 1 for label in tmp_kmeans.predict(cluster_data)]
        classe = pd.DataFrame(
            list(zip(cluster_data.index, predict))).astype(str)
        dest = 'cluster_out/K=' + str(k) + '_' + args.name + '.tsv'
        classe.to_csv(dest, sep='\t', index=False,
                      header=['Patient_ID', 'Class'])
    plt.figure(0)
    plt.plot(k_values, distorsion, marker='o')
    plt.xlabel('Number of cluster')
    plt.ylabel('Distorsion')
    plt.savefig(args.elbow, dpi=240, format='pdf')
    if args.cond_hier == 'yes':
        import scipy.cluster.hierarchy as hier
        lin = hier.linkage(cluster_data, args.linkage)
        # One figure with the wanted size; previously an unused figure 1 was
        # opened and immediately superseded by a second, sized one.
        plt.figure(1, figsize=(10, 5))
        # A plain list avoids truth-testing a pandas Index inside scipy
        # (NOTE(review): only some scipy versions do that -- confirm).
        hier.dendrogram(lin, leaf_font_size=2,
                        labels=list(cluster_data.index))
        plt.savefig(args.dendro, dpi=480, format='pdf')
    return None
585 ################################# main ########################################
def main():
    """Run the whole pipeline: load rules and data, score, then cluster.

    Steps: parse the command line; load the bundled rule set (HMRcore or
    Recon pickle under ``<tool_dir>/local``) or build one from ``--custom``;
    read the dataset and detect the gene id type from its first row; score
    every rule for every sample; log genes without values; cluster the
    scores; log success.

    Returns:
        None
    """
    args = process_args(sys.argv)
    bundled_rules = {'HMRcore': '/local/HMRcore_rules.p',
                     'Recon': '/local/Recon_rules.p'}
    if args.rules_selector in bundled_rules:
        # NOTE: pickle must only be used on trusted files; these ship with
        # the tool. The context manager closes the handle, which was
        # previously left open.
        rules_path = args.tool_dir + bundled_rules[args.rules_selector]
        with open(rules_path, 'rb') as handle:
            recon = pk.load(handle)
    elif args.rules_selector == 'Custom':
        ids, rules, gene_in_rule = make_recon(args.custom)
    resolve_none = check_bool(args.none)
    dataset = read_dataset(args.data, args.name)
    # Replace the id column by label: changing a column's dtype through
    # `iloc[:, 0] = ...` is deprecated in recent pandas releases.
    id_column = dataset.columns[0]
    dataset[id_column] = dataset[id_column].astype(str)
    type_gene = gene_type(dataset.iloc[0, 0], args.name)
    if args.rules_selector != 'Custom':
        genes = data_gene(dataset, type_gene, args.name, None)
        ids, rules = load_id_rules(recon.get(type_gene))
    elif args.rules_selector == 'Custom':
        genes = data_gene(dataset, type_gene, args.name, gene_in_rule)
    resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name)
    if err:
        warning('WARNING: gene\n' + str(err) + '\nnot found in class '
                + args.name + ', the expression level for this gene ' +
                'will be considered NaN\n')
    f_cluster(resolve_rules)
    warning('Execution succeeded')
    return None
613 ###############################################################################
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()