0
|
1 from __future__ import division
|
|
2 import sys
|
|
3 import pandas as pd
|
|
4 import itertools as it
|
|
5 import scipy.stats as st
|
|
6 import collections
|
|
7 import lxml.etree as ET
|
|
8 import pickle as pk
|
|
9 import math
|
|
10 import os
|
|
11 import argparse
|
|
12 from svglib.svglib import svg2rlg
|
|
13 from reportlab.graphics import renderPDF
|
|
14
|
16
|
15 ########################## argparse ##########################################
|
0
|
16
|
|
def process_args(args):
    """Build the command-line parser and parse the tool's options.

    Args:
        args: argv-style sequence whose first item is the program name
            (every caller in this file passes ``sys.argv``). ``None``
            falls back to ``sys.argv``.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'process some value\'s'+
                                     ' genes to create a comparison\'s map.')
    parser.add_argument('-rs', '--rules_selector',
                        type = str,
                        default = 'HMRcore',
                        choices = ['HMRcore', 'Recon', 'Custom'],
                        help = 'chose which type of dataset you want use')
    parser.add_argument('-cr', '--custom',
                        type = str,
                        help='your dataset if you want custom rules')
    parser.add_argument('-n', '--none',
                        type = str,
                        default = 'true',
                        choices = ['true', 'false'],
                        help = 'compute Nan values')
    parser.add_argument('-pv' ,'--pValue',
                        type = float,
                        default = 0.1,
                        help = 'P-Value threshold (default: %(default)s)')
    parser.add_argument('-fc', '--fChange',
                        type = float,
                        default = 1.5,
                        help = 'Fold-Change threshold (default: %(default)s)')
    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')
    parser.add_argument('-op', '--option',
                        type = str,
                        choices = ['datasets', 'dataset_class'],
                        help='dataset or dataset and class')
    parser.add_argument('-ol', '--out_log',
                        help = "Output log")
    parser.add_argument('-id', '--input_data',
                        type = str,
                        help = 'input dataset')
    parser.add_argument('-ic', '--input_class',
                        type = str,
                        help = 'sample group specification')
    parser.add_argument('-cm', '--custom_map',
                        type = str,
                        help = 'custom map')
    parser.add_argument('-yn', '--yes_no',
                        type = str,
                        choices = ['yes', 'no'],
                        help = 'if make or not custom map')
    parser.add_argument('-gs', '--generate_svg',
                        type = str,
                        default = 'true',
                        choices = ['true', 'false'],
                        help = 'generate svg map')
    parser.add_argument('-gp', '--generate_pdf',
                        type = str,
                        default = 'true',
                        choices = ['true', 'false'],
                        help = 'generate pdf map')
    parser.add_argument('-on', '--control',
                        type = str)
    # NOTE(review): the default '1vs1' is not one of the choices; argparse
    # does not validate defaults against choices, so it is accepted as-is.
    parser.add_argument('-co', '--comparison',
                        type = str,
                        default = '1vs1',
                        choices = ['manyvsmany', 'onevsrest', 'onevsmany'])
    parser.add_argument('-ids', '--input_datas',
                        type = str,
                        nargs = '+',
                        help = 'input datasets')
    parser.add_argument('-na', '--names',
                        type = str,
                        nargs = '+',
                        help = 'input names')

    # The parameter used to be ignored (parse_args() always read sys.argv).
    # Honour it, dropping the leading program name as argparse itself does.
    argv = sys.argv if args is None else args
    return parser.parse_args(list(argv)[1:])
|
|
92
|
|
93 ########################### warning ###########################################
|
|
94
|
|
def warning(s):
    """Append the message ``s`` to the log file named by ``--out_log``.

    The command line is re-parsed on every call to find the log path.
    ``--out_log`` is optional; when it is missing the message goes to
    standard error instead of failing inside ``open(None)``.

    Args:
        s: text to write (callers include the trailing newline).
    """
    args = process_args(sys.argv)
    if args.out_log is None:
        sys.stderr.write(s)
        return
    with open(args.out_log, 'a') as log:
        log.write(s)
|
|
99
|
|
100 ############################ dataset input ####################################
|
|
101
|
|
def read_dataset(data, name):
    """Read a tab-separated table with a header row.

    Args:
        data: path or file-like object accepted by ``pandas.read_csv``.
        name: label used in the error message.

    Returns:
        pandas.DataFrame with at least two columns.

    Raises:
        SystemExit: if the input is empty, cannot be parsed, or has fewer
            than two columns.
    """
    try:
        dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python')
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # ParserError (ragged rows) is reported like an empty file, as
        # make_recon already does for custom rule tables.
        sys.exit('Execution aborted: wrong format of ' + name + '\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong format of ' + name + '\n')
    return dataset
|
|
110
|
|
111 ############################ dataset name #####################################
|
|
112
|
|
def name_dataset(name_data, count):
    """Return the dataset label, numbering the generic name 'Dataset'.

    Args:
        name_data: user-supplied dataset name (converted with ``str``).
        count: number appended, after an underscore, to the generic name.

    Returns:
        ``'Dataset_<count>'`` when the name is exactly ``'Dataset'``,
        otherwise the name itself as a string.
    """
    label = str(name_data)
    if label != 'Dataset':
        return label
    return '_'.join((label, str(count)))
|
|
118
|
|
############################ load ids and rules ###############################
|
|
120
|
|
def load_id_rules(reactions):
    """Split a mapping into parallel lists of keys and values.

    Args:
        reactions: mapping whose items are iterated once, in its own order.

    Returns:
        Tuple ``(ids, rules)``: the keys and the matching values.
    """
    pairs = list(reactions.items())
    ids = [reaction_id for reaction_id, _ in pairs]
    rules = [rule for _, rule in pairs]
    return (ids, rules)
|
|
127
|
|
128 ############################ check_methods ####################################
|
|
129
|
|
def gene_type(l, name):
    """Classify a gene identifier by trying each supported format in turn.

    The checks run in a fixed order (HGNC, Ensembl, symbol, Entrez) and
    the first one that matches wins.

    Args:
        l: gene identifier string.
        name: dataset label used in the error message.

    Returns:
        One of 'hugo_id', 'ensembl_gene_id', 'symbol', 'entrez_id'.

    Raises:
        SystemExit: if no check recognises the identifier.
    """
    if check_hgnc(l):
        return 'hugo_id'
    elif check_ensembl(l):
        return 'ensembl_gene_id'
    elif check_symbol(l):
        return 'symbol'
    elif check_entrez(l):
        return 'entrez_id'
    else:
        # A space was missing between "ID" and "types" in the message.
        sys.exit('Execution aborted:\n' +
                 'gene ID type in ' + name + ' not supported. Supported ID ' +
                 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
|
|
143
|
|
def check_hgnc(l):
    """Return True if ``l`` is 'HGNC:' (any case) followed only by digits."""
    has_prefix = len(l) > 5 and l.upper().startswith('HGNC:')
    return l[5:].isdigit() if has_prefix else False
|
|
152
|
|
def check_ensembl(l):
    """Return True for a 15-character ID starting with 'ENS' (any case)
    whose characters from the fifth onward are all digits."""
    if len(l) != 15 or not l.upper().startswith('ENS'):
        return False
    return l[4:].isdigit()
|
|
161
|
|
def check_symbol(l):
    """Return True if ``l`` starts with a letter and the remainder is a
    non-empty alphanumeric string (a single character therefore fails)."""
    if not len(l) > 0:
        return False
    head, tail = l[0], l[1:]
    return bool(head.isalpha() and tail.isalnum())
|
|
170
|
|
def check_entrez(l):
    """Return True if ``l`` is a non-empty string made only of digits."""
    return l.isdigit() if len(l) > 0 else False
|
|
176
|
|
def check_bool(b):
    """Map the strings 'true' / 'false' to booleans.

    Returns:
        True for 'true', False for 'false', None for anything else.
    """
    if b == 'true':
        return True
    return False if b == 'false' else None
|
|
182
|
|
183 ############################ resolve_methods ##################################
|
|
184
|
|
def replace_gene_value(l, d):
    """Replace every gene name in a (possibly nested) rule with its value.

    Args:
        l: rule as a list of gene names, 'and'/'or' and nested lists.
        d: mapping gene name -> value, looked up through ``replace_gene``.

    Returns:
        Tuple ``(rule, missing)``: the rule with the same nesting and
        genes replaced (None where no value exists), and the list of gene
        names that resolved to None.
    """
    substituted = []
    missing = []
    for item in l:
        if isinstance(item, list):
            inner_rule, inner_missing = replace_gene_value(item, d)
            substituted.append(inner_rule)
            missing.extend(inner_missing)
            continue
        value = replace_gene(item, d)
        substituted.append(value)
        if value is None:
            missing.append(item)
    return (substituted, missing)
|
|
200
|
35
|
201
|
0
|
def replace_gene(l, d):
    """Return the value of gene ``l`` in ``d``; operators pass through.

    Args:
        l: a gene name, or the operator 'and' / 'or'.
        d: mapping gene name -> numeric value.

    Returns:
        ``l`` unchanged when it is an operator, otherwise the mapped
        number, or None when the gene is absent.

    Raises:
        SystemExit: if the mapped value is neither None nor an int/float.
    """
    if l =='and' or l == 'or':
        return l
    else:
        value = d.get(l, None)
        if not(value == None or isinstance(value, (int, float))):
            # str() is needed: the offending value is by definition not a
            # number and may not be a string, so plain concatenation
            # raised TypeError instead of printing this message.
            sys.exit('Execution aborted: ' + str(value) + ' value not valid\n')
        return value
|
|
210
|
|
def computes(val1, op, val2, cn):
    """Combine two scores with 'and' (minimum) or any other operator (sum).

    Args:
        val1, val2: numbers, or None for a missing score.
        op: 'and' selects the minimum; anything else is treated as 'or'.
        cn: when exactly True, an 'and' with one missing operand returns
            the available one; otherwise such an 'and' returns None.

    Returns:
        The combined score, the single available operand, or None.
    """
    have_both = val1 is not None and val2 is not None
    if have_both:
        return min(val1, val2) if op == 'and' else val1 + val2
    # From here on at least one operand is missing.
    if op == 'and' and cn is not True:
        return None
    return val2 if val1 is None else val1
|
|
234
|
|
def control(ris, l, cn):
    """Evaluate a rule whose genes were already replaced by values.

    Args:
        ris: running result handed to ``control_list``.
        l: rule as a list of numbers/None, 'and'/'or' and nested lists.
        cn: missing-value policy forwarded to the evaluation.

    Returns:
        The score (a number or None), or False when the rule is
        malformed (empty, two items long, or a single invalid item).
    """
    size = len(l)
    if size > 2:
        return control_list(ris, l, cn)
    if size != 1:
        return False
    only = l[0]
    if isinstance(only, (float, int)) or only is None:
        return only
    if isinstance(only, list):
        return control(None, only, cn)
    return False
|
|
247
|
|
def control_list(ris, l, cn):
    """Evaluate a rule of three or more items, left to right.

    Items are consumed from the front of ``l``: an "operand operator
    operand" triple (either operand may be a nested list), or an
    "operator operand" pair that combines with the running result.

    Args:
        ris: running result used by a leading "operator operand" pair.
        l: list of numbers/None, 'and'/'or' strings and nested lists.
        cn: missing-value policy forwarded to ``computes``/``control``.

    Returns:
        The final score (number or None), or False when the sequence
        does not match any accepted pattern.
    """
    while l:
        if len(l) == 1:
            # A lone trailing item has no operator to combine with.
            return False
        elif (isinstance(l[0], (float, int)) or
              l[0] == None) and l[1] in ['and', 'or']:
            # value <op> value-or-list
            if isinstance(l[2], (float, int)) or l[2] == None:
                ris = computes(l[0], l[1], l[2], cn)
            elif isinstance(l[2], list):
                tmp = control(None, l[2], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(l[0], l[1], tmp, cn)
            else:
                return False
            l = l[3:]
        elif l[0] in ['and', 'or']:
            # <op> value-or-list, combined with the running result
            if isinstance(l[1], (float, int)) or l[1] == None:
                ris = computes(ris, l[0], l[1], cn)
            elif isinstance(l[1], list):
                tmp = control(None,l[1], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(ris, l[0], tmp, cn)
            else:
                return False
            l = l[2:]
        elif isinstance(l[0], list) and l[1] in ['and', 'or']:
            # list <op> value-or-list
            if isinstance(l[2], (float, int)) or l[2] == None:
                tmp = control(None, l[0], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(tmp, l[1], l[2], cn)
            elif isinstance(l[2], list):
                tmp = control(None, l[0], cn)
                tmp2 = control(None, l[2], cn)
                if tmp is False or tmp2 is False:
                    return False
                else:
                    ris = computes(tmp, l[1], tmp2, cn)
            else:
                return False
            l = l[3:]
        else:
            return False
    return ris
|
|
297
|
|
298 ############################ map_methods ######################################
|
|
299
|
|
def fold_change(avg1, avg2):
    """Return log2(avg1 / avg2), with markers for zero averages.

    Returns:
        0 when both averages are zero, the string '-INF' when only
        ``avg1`` is zero, 'INF' when only ``avg2`` is zero, otherwise the
        base-2 logarithm of the ratio.
    """
    first_is_zero = avg1 == 0
    second_is_zero = avg2 == 0
    if first_is_zero:
        return 0 if second_is_zero else '-INF'
    if second_is_zero:
        return 'INF'
    return math.log(avg1 / avg2, 2)
|
|
309
|
|
def fix_style(l, col, width, dash):
    """Set stroke colour, width and dash pattern in a CSS style string.

    Existing 'stroke:', 'stroke-width:' and 'stroke-dasharray:'
    declarations are overwritten; missing ones are appended in that
    order. Other declarations are kept untouched.

    Args:
        l: ';'-separated style string. ``None`` or '' (an element with no
            style) is treated as an empty declaration list instead of
            raising or producing a leading ';'.
        col, width, dash: replacement values, already strings.

    Returns:
        The updated style string.
    """
    tmp = l.split(';') if l else []
    flag_col = False
    flag_width = False
    flag_dash = False
    for i, declaration in enumerate(tmp):
        # The three prefixes are mutually exclusive, so elif is safe.
        if declaration.startswith('stroke:'):
            tmp[i] = 'stroke:' + col
            flag_col = True
        elif declaration.startswith('stroke-width:'):
            tmp[i] = 'stroke-width:' + width
            flag_width = True
        elif declaration.startswith('stroke-dasharray:'):
            tmp[i] = 'stroke-dasharray:' + dash
            flag_dash = True
    if not flag_col:
        tmp.append('stroke:' + col)
    if not flag_width:
        tmp.append('stroke-width:' + width)
    if not flag_dash:
        tmp.append('stroke-dasharray:' + dash)
    return ';'.join(tmp)
|
|
332
|
|
def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
    """Restyle reaction elements of a map from comparison results.

    Every element whose id starts with 'R_' and whose remaining id is a
    key of ``d`` gets a new stroke:
      * p-value below the threshold: solid line; grey and thin if the
        fold change is under the threshold, otherwise blue (negative or
        '-INF') or red (positive or 'INF') with width scaled by the fold
        change;
      * otherwise: grey, thin, dashed.

    Args:
        d: mapping reaction id -> [p_value, log2 fold change or
            'INF'/'-INF'].
        core_map: tree or element exposing ``iter()``; modified in place.
        threshold_P_V: p-value threshold.
        threshold_F_C: fold-change threshold (compared on a log2 scale).
        max_F_C: largest absolute log2 fold change, used to scale widths.

    Returns:
        ``core_map`` (the same object).
    """
    maxT = 12
    minT = 2
    grey = '#BEBEBE'
    blue = '#0000FF'
    red = '#E41A1C'
    for el in core_map.iter():
        el_id = str(el.get('id'))
        if not el_id.startswith('R_'):
            continue
        tmp = d.get(el_id[2:])
        if tmp is None:
            continue
        p_val = tmp[0]
        f_c = tmp[1]
        if p_val < threshold_P_V:
            dash = 'none'
            if isinstance(f_c, str):
                if f_c == '-INF':
                    col = blue
                elif f_c == 'INF':
                    col = red
                else:
                    # Unknown marker: neutral colour rather than reusing
                    # the colour left over from a previous element.
                    col = grey
                width = str(maxT)
            elif abs(f_c) < math.log(threshold_F_C, 2):
                col = grey
                width = str(minT)
            else:
                if f_c < 0:
                    col = blue
                elif f_c > 0:
                    col = red
                else:
                    # Zero (possible when threshold_F_C <= 1) or NaN used
                    # to leave ``col`` unbound or stale.
                    col = grey
                if col == grey or not max_F_C:
                    # Also avoids dividing by a zero max_F_C.
                    width = str(minT)
                else:
                    width = str(max((abs(f_c) * maxT) / max_F_C, minT))
        else:
            dash = '5,5'
            col = grey
            width = str(minT)
        # An element without a style attribute yields None; pass '' so
        # the style helper always receives a string.
        el.set('style', fix_style(el.get('style') or '', col, width, dash))
    return core_map
|
|
370
|
|
371 ############################ make recon #######################################
|
|
372
|
|
def check_and_doWord(l):
    """Tokenise a rule given as a sequence of characters.

    Parentheses become single tokens, spaces separate words, and every
    word other than 'and'/'or' is also recorded as a gene.

    The input is no longer consumed (the old version emptied the
    caller's list with repeated ``pop(0)``, which is also quadratic).

    Args:
        l: sequence of single-character strings, e.g. ``list(rule)``.

    Returns:
        ``(tokens, genes)`` when parentheses are balanced and never close
        before opening, otherwise ``False``.
    """
    chars = list(l)
    size = len(chars)
    delimiters = (' ', '(', ')')
    tokens = []
    genes = []
    depth = 0
    pos = 0
    while pos < size:
        if depth < 0:
            # A ')' appeared before its '('.
            return False
        current = chars[pos]
        if current == '(':
            depth += 1
            tokens.append(current)
            pos += 1
        elif current == ')':
            depth -= 1
            tokens.append(current)
            pos += 1
        elif current == ' ':
            pos += 1
        else:
            end = pos
            while end < size and chars[end] not in delimiters:
                end += 1
            word = ''.join(chars[pos:end])
            tokens.append(word)
            if word not in ('or', 'and'):
                genes.append(word)
            pos = end
    if depth == 0:
        return (tokens, genes)
    return False
|
|
407
|
|
def brackets_to_list(l):
    """Turn a flat token list with parentheses into nested lists.

    The input list is consumed: tokens are popped from its front until
    it is empty. Each '(' starts a sub-list built by
    ``resolve_brackets``.

    Args:
        l: token list as produced by ``check_and_doWord``.

    Returns:
        The nested rule.
    """
    nested = []
    while len(l) > 0:
        head = l.pop(0)
        nested.append(resolve_brackets(l) if head == '(' else head)
    return nested
|
|
418
|
|
def resolve_brackets(l):
    """Collect tokens up to the matching ')' into a (nested) list.

    Called after an opening '(' has been removed. Tokens are popped
    from the front of ``l``, including the closing ')'.

    Raises:
        IndexError: if ``l`` runs out before a closing ')' is found.
    """
    group = []
    head = l.pop(0)
    while head != ')':
        group.append(resolve_brackets(l) if head == '(' else head)
        head = l.pop(0)
    return group
|
|
430
|
|
def priorityAND(l):
    """Give 'and' precedence over 'or' by nesting runs of 'and'.

    When the level contains an 'or', every maximal run of operands
    joined by 'and' is wrapped in its own sub-list; a level made only of
    'and' is returned flat. Nested lists are processed recursively.

    A token sequence that is not "operand (operator operand)*" used to
    loop forever or raise IndexError; it now raises ValueError.

    Args:
        l: nested rule as a list of operands, 'and'/'or' and sub-lists.

    Returns:
        A new list with the grouping applied.

    Raises:
        ValueError: if an operator or operand is missing.
    """
    def grouped(operand):
        # Sub-expressions get the same treatment; plain tokens pass through.
        return priorityAND(operand) if isinstance(operand, list) else operand

    tmp = []
    flag = True  # stays True while no 'or' has been met at this level
    while l:
        if len(l) == 1:
            tmp.append(grouped(l[0]))
            l = l[1:]
        elif l[0] == 'or':
            tmp.append(l[0])
            flag = False
            l = l[1:]
        elif l[1] == 'or':
            tmp.append(grouped(l[0]))
            tmp.append(l[1])
            flag = False
            l = l[2:]
        elif l[1] == 'and' and len(l) > 2:
            tmpAnd = [grouped(l[0]), l[1], grouped(l[2])]
            l = l[3:]
            while l:
                if l[0] == 'and' and len(l) > 1:
                    tmpAnd.append(l[0])
                    tmpAnd.append(grouped(l[1]))
                    l = l[2:]
                elif l[0] == 'or':
                    flag = False
                    break
                else:
                    raise ValueError('malformed rule: operator or operand '
                                     'missing near ' + str(l[0]))
            if flag:
                # only 'and' at this level: keep it flat
                tmp.extend(tmpAnd)
            else:
                tmp.append(tmpAnd)
        else:
            raise ValueError('malformed rule: operator or operand '
                             'missing near ' + str(l[0]))
    return tmp
|
|
481
|
|
def checkRule(l):
    """Check the shape of a nested rule.

    A one-item rule is valid unless that item is a list that is itself
    invalid; rules longer than two items are delegated to
    ``checkRule2``; empty and two-item rules are invalid.

    Returns:
        True when the rule is accepted, False otherwise.
    """
    size = len(l)
    if size == 1:
        only = l[0]
        return not (isinstance(only, list) and checkRule(only) is False)
    if size > 2:
        return checkRule2(l) is not False
    return False
|
|
493
|
|
def checkRule2(l):
    """Validate a rule level of the form operand (operator operand)+.

    Operands sit at even positions and must not be 'and'/'or'; nested
    lists are validated with ``checkRule``. Operators sit at odd
    positions and must be exactly 'and' or 'or'.

    The previous scan accepted sequences with a missing operator (for
    example ``a and b c and d``) or a leading operator, and raised
    IndexError on truncated ones; all of these now return False. An
    empty list also returns False now (it previously returned True).

    Args:
        l: list of tokens and nested lists.

    Returns:
        True when the level is well formed, False otherwise.
    """
    operators = ('and', 'or')
    # A binary expression chain always has odd length, at least three.
    if len(l) < 3 or len(l) % 2 == 0:
        return False
    for position, token in enumerate(l):
        if position % 2:
            if isinstance(token, list) or token not in operators:
                return False
        elif isinstance(token, list):
            if checkRule(token) is False:
                return False
        elif token in operators:
            return False
    return True
|
|
518
|
|
def do_rules(rules):
    """Parse rule strings into nested lists and collect the genes used.

    For every rule the result list gets the parsed rule, or ``[]`` when
    the rule is empty or malformed. Malformed rules are reported once
    through ``warning``.

    The tokeniser returns ``False`` for unbalanced parentheses; that
    value is now checked before unpacking (unpacking it directly raised
    TypeError, so such rules crashed instead of being reported).

    Args:
        rules: sequence of rule strings.

    Returns:
        Tuple ``(split_rules, genes)``: one entry per input rule, and
        the de-duplicated list of gene names found.
    """
    split_rules = []
    err_rules = []
    tmp_gene_in_rule = []
    for rule in rules:
        chars = list(rule)
        if not chars:
            split_rules.append([])
            continue
        parsed = check_and_doWord(chars)
        if parsed is False:
            split_rules.append([])
            err_rules.append(rule)
            continue
        tokens, genes = parsed
        tmp_gene_in_rule.extend(genes)
        nested = brackets_to_list(tokens)
        if checkRule(nested):
            split_rules.append(priorityAND(nested))
        else:
            split_rules.append([])
            err_rules.append(rule)
    if err_rules:
        warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
    return (split_rules, list(set(tmp_gene_in_rule)))
|
|
543
|
|
def make_recon(data):
    """Load custom rules from an SBML model or a two-column TSV table.

    The file is first read with cobra as an SBML model; if cobra reports
    an SBML error it is read as a tab-separated table whose first column
    holds the ids and second column the rules.

    Args:
        data: path of the custom rules file.

    Returns:
        Tuple ``(ids, split_rules, gene_in_rule)``: reaction ids, parsed
        rules from ``do_rules``, and a dict mapping each gene to 'ok'.

    Raises:
        SystemExit: if the table is empty, unparsable or has fewer than
            two columns.
    """
    # NOTE(review): if ``import cobra`` itself fails, ``cb`` is unbound
    # when the except clause below is evaluated.
    try:
        import cobra as cb
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            recon = cb.io.read_sbml_model(data)
        react = recon.reactions
        rules = [reaction.gene_reaction_rule for reaction in react]
        ids = [reaction.id for reaction in react]
    except cb.io.sbml3.CobraSBMLError:
        try:
            data = (pd.read_csv(data, sep = '\t', dtype = str, engine='python')).fillna('')
            n_columns = len(data.columns)
            if n_columns < 2:
                sys.exit('Execution aborted: wrong format of '+
                         'custom datarules\n')
            if n_columns != 2:
                warning('Warning: more than 2 columns in custom datarules.\n' +
                        'Extra columns have been disregarded\n')
            ids = list(data.iloc[:, 0])
            rules = list(data.iloc[:, 1])
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            sys.exit('Execution aborted: wrong format of custom datarules\n')
    split_rules, tmp_genes = do_rules(rules)
    gene_in_rule = dict.fromkeys(tmp_genes, 'ok')
    return (ids, split_rules, gene_in_rule)
|
|
574
|
|
575 ############################ gene #############################################
|
|
576
|
|
def data_gene(gene, type_gene, name, gene_custom):
    """Clean a gene table and convert it to ``{sample: {gene: value}}``.

    Gene ids in the first column are stripped when they start or end
    with a space. Duplicated gene ids abort the run if they are used by
    the selected rules; duplicated column labels only produce a warning.

    Args:
        gene: pandas.DataFrame, gene ids in the first column.
        type_gene: key selecting the id type in the bundled gene pickle.
        name: dataset label used in messages.
        gene_custom: mapping gene -> 'ok' for custom rules, or None to
            use the pickle bundled for ``--rules_selector``.

    Returns:
        ``gene.set_index(first column).to_dict()``.

    Raises:
        SystemExit: on a duplicated gene id used by the rules, or when
            duplicates must be checked and no gene set is available for
            the selected rules.
    """
    args = process_args(sys.argv)
    for row, value in enumerate(list(gene.iloc[:, 0])):
        if value.startswith(' ') or value.endswith(' '):
            gene.iloc[row, 0] = value.strip()
    gene_dup = [item for item, count in
                collections.Counter(gene[gene.columns[0]]).items() if count > 1]
    pat_dup = [item for item, count in
               collections.Counter(list(gene.columns)).items() if count > 1]

    if gene_dup:
        if gene_custom is None:
            pickles = {'HMRcore': '/local/HMRcore_genes.p',
                       'Recon': '/local/Recon_genes.p'}
            if args.rules_selector not in pickles:
                # Previously fell through with ``gene_in_rule`` unbound.
                sys.exit('Execution aborted: no gene set available for ' +
                         str(args.rules_selector) + ' rules\n')
            # ``with`` closes the handle; it was left open before.
            with open(args.tool_dir + pickles[args.rules_selector],
                      'rb') as handle:
                gene_in_rule = pk.load(handle)
            gene_in_rule = gene_in_rule.get(type_gene)
        else:
            gene_in_rule = gene_custom
        tmp = [i for i in gene_dup if gene_in_rule.get(i) == 'ok']
        if tmp:
            sys.exit('Execution aborted because gene ID '
                     +str(tmp)+' in '+name+' is duplicated\n')
    if pat_dup:
        warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name +
                '\n')

    return (gene.set_index(gene.columns[0])).to_dict()
|
|
611
|
|
612 ############################ resolve ##########################################
|
|
613
|
|
def resolve(genes, rules, ids, resolve_none, name):
    """Score every rule for every sample.

    Args:
        genes: mapping sample -> {gene: value}.
        rules: parsed rules; an empty rule scores None.
        ids: not used by this function.
        resolve_none: missing-value policy forwarded to ``control``.
        name: class label used in the warning message.

    Returns:
        ``(scores, missing)`` where ``scores`` maps each sample to its
        list of rule scores (None when not computable) and ``missing``
        lists genes without a value; or ``(None, None)`` after a warning
        when no score at all could be computed.
    """
    resolve_rules = {}
    not_found = []
    any_score = False
    for sample, gene_values in genes.items():
        scores = []
        for rule in rules:
            if not rule:
                scores.append(None)
                continue
            valued_rule, missing = replace_gene_value(rule, gene_values)
            if missing:
                not_found.extend(missing)
            outcome = control(None, valued_rule, resolve_none)
            if outcome is False or outcome is None:
                scores.append(None)
            else:
                scores.append(outcome)
                any_score = True
        resolve_rules[sample] = scores
    if not any_score:
        warning('Warning: no computable score (due to missing gene values)' +
                'for class ' + name + ', the class has been disregarded\n')
        return (None, None)
    return (resolve_rules, list(set(not_found)))
|
|
640
|
|
641 ############################ split class ######################################
|
|
642
|
|
def split_class(classes, resolve_rules):
    """Group per-sample score lists by class and transpose them.

    The table is scanned once and is no longer modified (the previous
    version overwrote the class column with None while scanning, and
    re-scanned the table for every class).

    Args:
        classes: pandas.DataFrame, sample ids in the first column and
            class labels in the second; null labels are skipped.
        resolve_rules: mapping sample id -> list of scores.

    Returns:
        Dict class -> list with one entry per score position, each entry
        holding that score for all samples of the class. Classes with no
        known sample are reported through ``warning`` and left out.
    """
    samples_by_class = collections.OrderedDict()
    for pat_id, classe in zip(classes.iloc[:, 0], classes.iloc[:, 1]):
        if pd.isnull(classe):
            continue
        found = samples_by_class.setdefault(classe, [])
        tmp = resolve_rules.get(pat_id, None)
        if tmp is not None:
            found.append(tmp)

    class_pat = {}
    for classe, profiles in samples_by_class.items():
        if profiles:
            class_pat[classe] = list(map(list, zip(*profiles)))
        else:
            # str() keeps the message usable for non-string labels.
            warning('Warning: no sample found in class ' + str(classe) +
                    ', the class has been disregarded\n')
    return class_pat
|
|
662
|
|
663 ############################ map ##############################################
|
|
664
|
47
|
def _compare_reactions(first, second, ids):
    """Compare two classes reaction by reaction.

    Returns ``(results, max_F_C)``: ``results`` maps a reaction id to
    ``[p_value, log2 fold change]`` and ``max_F_C`` is the largest
    absolute numeric fold change. Reactions raising TypeError or
    ZeroDivisionError are skipped.
    """
    results = {}
    max_F_C = 0
    for count, (l1, l2) in enumerate(zip(first, second)):
        try:
            p_value = st.ks_2samp(l1, l2)[1]
            avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
            if not isinstance(avg, str):
                if max_F_C < abs(avg):
                    max_F_C = abs(avg)
            results[ids[count]] = [float(p_value), avg]
        except (TypeError, ZeroDivisionError):
            continue
    return results, max_F_C


def _pool_other_classes(class_pat, excluded):
    """Merge, reaction by reaction, the samples of every class but one."""
    others = [class_pat.get(k) for k in class_pat.keys() if k != excluded]
    return [list(it.chain.from_iterable(per_reaction))
            for per_reaction in zip(*others)]


def _write_comparison(results, max_F_C, table_stem, map_stem, core_map,
                      threshold_P_V, threshold_F_C, create_svg, create_pdf,
                      style_map):
    """Write the table and, if requested, the SVG/PDF map of one comparison."""
    import copy

    header = ['ids', 'P_Value', 'Log2(fold change)']
    rows = [[reaction, values[0], values[1]]
            for reaction, values in results.items()]
    # Explicit columns keep the header valid when no reaction was comparable.
    tab = 'result/' + table_stem + ' (Tabular Result).tsv'
    pd.DataFrame(rows, columns = header).to_csv(tab, sep = '\t', index = False)

    if not (create_svg or create_pdf):
        return
    if style_map:
        # Style a copy so reactions missing from this comparison do not
        # keep the colours of a previous one.
        core_map = copy.deepcopy(core_map)
        fix_map(results, core_map, threshold_P_V, threshold_F_C, max_F_C)
    file_svg = 'result/' + map_stem + ' (SVG Map).svg'
    with open(file_svg, 'wb') as new_map:
        new_map.write(ET.tostring(core_map))
    if create_pdf:
        file_pdf = 'result/' + map_stem + ' (PDF Map).pdf'
        renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
        if not create_svg:
            # The SVG was only needed to build the PDF and was not
            # requested by the user, so remove it.
            os.remove(file_svg)


def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf, comparison, control):
    """Run the class comparisons and write their results under 'result/'.

    Args:
        core_map: parsed SVG map used as the template for every map.
        class_pat: mapping class -> per-reaction lists of sample scores.
        ids: reaction ids, aligned with the per-reaction lists.
        threshold_P_V: p-value threshold for map styling.
        threshold_F_C: fold-change threshold for map styling.
        create_svg: keep the SVG map of each comparison.
        create_pdf: also render each map to PDF.
        comparison: 'manyvsmany' (all pairs), 'onevsrest' (each class
            against all the others pooled) or 'onevsmany' (only pairs
            that include ``control``). Any other value writes nothing.
        control: class name used by 'onevsmany'.

    Returns:
        None.

    Raises:
        SystemExit: if fewer than two classes are available.
    """
    args = process_args(sys.argv)
    if (not class_pat) or (len(class_pat.keys()) < 2):
        sys.exit('Execution aborted: classes provided for comparisons are ' +
                 'less than two\n')

    style_map = (args.rules_selector == 'HMRcore' or
                 (args.rules_selector == 'Custom' and args.yes_no == 'yes'))

    def emit(first, second, table_stem, map_stem):
        results, max_F_C = _compare_reactions(first, second, ids)
        _write_comparison(results, max_F_C, table_stem, map_stem, core_map,
                          threshold_P_V, threshold_F_C, create_svg,
                          create_pdf, style_map)

    if comparison == "manyvsmany":
        for i, j in it.combinations(class_pat.keys(), 2):
            stem = i + '_vs_' + j
            emit(class_pat.get(i), class_pat.get(j), stem, stem)
    elif comparison == "onevsrest":
        for single_cluster in class_pat.keys():
            # Pool the other classes per reaction. Appending their
            # matrices end to end made zip() pair the single class with
            # the first other class only.
            rest = _pool_other_classes(class_pat, single_cluster)
            # File names (including the space in '_vs_ rest') are kept
            # exactly as before.
            emit(rest, class_pat.get(single_cluster),
                 single_cluster + '_vs_rest', single_cluster + '_vs_ rest')
    elif comparison == "onevsmany":
        for i, j in it.combinations(class_pat.keys(), 2):
            if i != control and j != control:
                # Neither class is the control: skip the pair.
                continue
            stem = i + '_vs_' + j
            emit(class_pat.get(i), class_pat.get(j), stem, stem)

    return None
|
|
815
|
|
816 ############################ MAIN #############################################
|
|
817
|
|
def _load_ras_table(path, name):
    """Read a score table; return (reaction ids, {sample: [float or None]})."""
    table = read_dataset(path, name)
    table.iloc[:, 0] = (table.iloc[:, 0]).astype(str)
    ids = pd.Series.tolist(table.iloc[:, 0])
    table = table.drop(table.columns[[0]], axis=1)
    table = table.replace({'None': None})
    table = table.to_dict('list')
    # Convert the values from str to float, keeping None as is.
    to_float = lambda x: float(x) if (x != None) else None
    return ids, {k: list(map(to_float, v)) for k, v in table.items()}


def main():
    """Entry point: parse options, load the scores, write the comparisons.

    Reads the score tables named on the command line (several datasets,
    or one dataset plus a class file), chooses the SVG map, then calls
    ``maps`` to write the results under 'result/'.

    Returns:
        None.
    """
    args = process_args(sys.argv)

    create_svg = check_bool(args.generate_svg)
    create_pdf = check_bool(args.generate_pdf)

    if not os.path.isdir('result'):
        os.makedirs('result')

    # Defined up front so the call to maps() cannot hit an unbound name
    # when no input option was given.
    ids = []

    # The bundled rule pickles are loaded as before, with handles closed
    # through ``with``; their content is not used further here.
    if args.rules_selector == 'HMRcore':
        with open(args.tool_dir + '/local/HMRcore_rules.p', 'rb') as handle:
            pk.load(handle)
    elif args.rules_selector == 'Recon':
        with open(args.tool_dir + '/local/Recon_rules.p', 'rb') as handle:
            pk.load(handle)
    elif args.rules_selector == 'Custom':
        ids, rules, gene_in_rule = make_recon(args.custom)

    class_pat = {}

    if args.option == 'datasets':
        for num, (i, j) in enumerate(zip(args.input_datas, args.names), 1):
            name = name_dataset(j, num)
            ids, resolve_rules_float = _load_ras_table(i, name)
            class_pat[name] = list(map(list, zip(*resolve_rules_float.values())))

    if args.option == 'dataset_class':
        ids, resolve_rules_float = _load_ras_table(args.input_data, 'RAS')
        classes = read_dataset(args.input_class, 'class')
        classes = classes.astype(str)
        class_pat = split_class(classes, resolve_rules_float)

    if args.rules_selector == 'Custom' and args.yes_no == 'yes':
        try:
            core_map = ET.parse(args.custom_map)
        except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
            sys.exit('Execution aborted: custom map in wrong format')
    else:
        # Also covers Custom rules without --yes_no, which previously
        # left ``core_map`` unbound.
        core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')

    maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf, args.comparison, args.control)

    print('Execution succeded')

    return None
|
16
|
904
|
0
|
905
|
|
906 ###############################################################################
|
|
907
|
|
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
|