comparison Marea/ras_generator.py @ 47:3af9d394367c draft

Uploaded
author bimib
date Wed, 19 Feb 2020 10:44:52 -0500
parents 5d5d01ef1d68
children 3b0e71e28c0b
comparison
equal deleted inserted replaced
46:5d5d01ef1d68 47:3af9d394367c
1 from __future__ import division 1 from __future__ import division
2 import sys 2 import sys
3 import pandas as pd 3 import pandas as pd
4 import itertools as it
5 import scipy.stats as st
6 import collections 4 import collections
7 import lxml.etree as ET
8 import pickle as pk 5 import pickle as pk
9 import math 6 import math
10 import os
11 import argparse 7 import argparse
12 from svglib.svglib import svg2rlg
13 from reportlab.graphics import renderPDF
14 8
15 ########################## argparse ########################################## 9 ########################## argparse ##########################################
16 10
17 def process_args(args): 11 def process_args(args):
18 parser = argparse.ArgumentParser(usage = '%(prog)s [options]', 12 parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
24 choices = ['HMRcore', 'Recon', 'Custom'], 18 choices = ['HMRcore', 'Recon', 'Custom'],
25 help = 'chose which type of dataset you want use') 19 help = 'chose which type of dataset you want use')
26 parser.add_argument('-cr', '--custom', 20 parser.add_argument('-cr', '--custom',
27 type = str, 21 type = str,
28 help='your dataset if you want custom rules') 22 help='your dataset if you want custom rules')
29 parser.add_argument('-na', '--names',
30 type = str,
31 nargs = '+',
32 help = 'input names')
33 parser.add_argument('-n', '--none', 23 parser.add_argument('-n', '--none',
34 type = str, 24 type = str,
35 default = 'true', 25 default = 'true',
36 choices = ['true', 'false'], 26 choices = ['true', 'false'],
37 help = 'compute Nan values') 27 help = 'compute Nan values')
38 parser.add_argument('-pv' ,'--pValue',
39 type = float,
40 default = 0.05,
41 help = 'P-Value threshold (default: %(default)s)')
42 parser.add_argument('-fc', '--fChange',
43 type = float,
44 default = 1.5,
45 help = 'Fold-Change threshold (default: %(default)s)')
46 parser.add_argument('-td', '--tool_dir', 28 parser.add_argument('-td', '--tool_dir',
47 type = str, 29 type = str,
48 required = True, 30 required = True,
49 help = 'your tool directory') 31 help = 'your tool directory')
50 parser.add_argument('-op', '--option',
51 type = str,
52 choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
53 help='dataset or dataset and class')
54 parser.add_argument('-ol', '--out_log', 32 parser.add_argument('-ol', '--out_log',
55 help = "Output log") 33 help = "Output log")
56 parser.add_argument('-ids', '--input_datas', 34 parser.add_argument('-id', '--input',
57 type = str,
58 nargs = '+',
59 help = 'input datasets')
60 parser.add_argument('-id', '--input_data',
61 type = str, 35 type = str,
62 help = 'input dataset') 36 help = 'input dataset')
63 parser.add_argument('-ic', '--input_class', 37 parser.add_argument('-ra', '--ras_output',
64 type = str,
65 help = 'sample group specification')
66 parser.add_argument('-cm', '--custom_map',
67 type = str,
68 help = 'custom map')
69 parser.add_argument('-yn', '--yes_no',
70 type = str, 38 type = str,
71 choices = ['yes', 'no'], 39 required = True,
72 help = 'if make or not custom map') 40 help = 'ras output')
73 parser.add_argument('-gs', '--generate_svg', 41
74 type = str,
75 default = 'true',
76 choices = ['true', 'false'],
77 help = 'generate svg map')
78 parser.add_argument('-gp', '--generate_pdf',
79 type = str,
80 default = 'true',
81 choices = ['true', 'false'],
82 help = 'generate pdf map')
83 parser.add_argument('-gr', '--generate_ras',
84 type = str,
85 default = 'true',
86 choices = ['true', 'false'],
87 help = 'generate reaction activity score')
88 parser.add_argument('-sr', '--single_ras_file',
89 type = str,
90 help = 'file that will contain ras')
91
92 args = parser.parse_args() 42 args = parser.parse_args()
93 return args 43 return args
94 44
95 ########################### warning ########################################### 45 ########################### warning ###########################################
96 46
294 return False 244 return False
295 l = l[3:] 245 l = l[3:]
296 else: 246 else:
297 return False 247 return False
298 return ris 248 return ris
299
300 ############################ map_methods ######################################
301
302 def fold_change(avg1, avg2):
303 if avg1 == 0 and avg2 == 0:
304 return 0
305 elif avg1 == 0:
306 return '-INF'
307 elif avg2 == 0:
308 return 'INF'
309 else:
310 return math.log(avg1 / avg2, 2)
311
312 def fix_style(l, col, width, dash):
313 tmp = l.split(';')
314 flag_col = False
315 flag_width = False
316 flag_dash = False
317 for i in range(len(tmp)):
318 if tmp[i].startswith('stroke:'):
319 tmp[i] = 'stroke:' + col
320 flag_col = True
321 if tmp[i].startswith('stroke-width:'):
322 tmp[i] = 'stroke-width:' + width
323 flag_width = True
324 if tmp[i].startswith('stroke-dasharray:'):
325 tmp[i] = 'stroke-dasharray:' + dash
326 flag_dash = True
327 if not flag_col:
328 tmp.append('stroke:' + col)
329 if not flag_width:
330 tmp.append('stroke-width:' + width)
331 if not flag_dash:
332 tmp.append('stroke-dasharray:' + dash)
333 return ';'.join(tmp)
334
335 def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
336 maxT = 12
337 minT = 2
338 grey = '#BEBEBE'
339 blue = '#0000FF'
340 red = '#E41A1C'
341 for el in core_map.iter():
342 el_id = str(el.get('id'))
343 if el_id.startswith('R_'):
344 tmp = d.get(el_id[2:])
345 if tmp != None:
346 p_val = tmp[0]
347 f_c = tmp[1]
348 if p_val < threshold_P_V:
349 if not isinstance(f_c, str):
350 if abs(f_c) < math.log(threshold_F_C, 2):
351 col = grey
352 width = str(minT)
353 else:
354 if f_c < 0:
355 col = blue
356 elif f_c > 0:
357 col = red
358 width = str(max((abs(f_c) * maxT) / max_F_C, minT))
359 else:
360 if f_c == '-INF':
361 col = blue
362 elif f_c == 'INF':
363 col = red
364 width = str(maxT)
365 dash = 'none'
366 else:
367 dash = '5,5'
368 col = grey
369 width = str(minT)
370 el.set('style', fix_style(el.get('style'), col, width, dash))
371 return core_map
372 249
373 ############################ make recon ####################################### 250 ############################ make recon #######################################
374 251
375 def check_and_doWord(l): 252 def check_and_doWord(l):
376 tmp = [] 253 tmp = []
613 490
614 ############################ resolve ########################################## 491 ############################ resolve ##########################################
615 492
616 def resolve(genes, rules, ids, resolve_none, name): 493 def resolve(genes, rules, ids, resolve_none, name):
617 resolve_rules = {} 494 resolve_rules = {}
618 names_array = []
619 not_found = [] 495 not_found = []
620 flag = False 496 flag = False
621 for key, value in genes.items(): 497 for key, value in genes.items():
622 tmp_resolve = [] 498 tmp_resolve = []
623 for i in range(len(rules)): 499 for i in range(len(rules)):
650 if not pd.isnull(classe): 526 if not pd.isnull(classe):
651 l = [] 527 l = []
652 for j in range(i, len(classes)): 528 for j in range(i, len(classes)):
653 if classes.iloc[j, 1] == classe: 529 if classes.iloc[j, 1] == classe:
654 pat_id = classes.iloc[j, 0] 530 pat_id = classes.iloc[j, 0]
655 tmp = resolve_rules.get(pat_id, None)
656 if tmp != None: 531 if tmp != None:
657 l.append(tmp) 532 l.append(tmp)
658 classes.iloc[j, 1] = None 533 classes.iloc[j, 1] = None
659 if l: 534 if l:
660 class_pat[classe] = list(map(list, zip(*l))) 535 class_pat[classe] = list(map(list, zip(*l)))
663 ', the class has been disregarded\n') 538 ', the class has been disregarded\n')
664 return class_pat 539 return class_pat
665 540
666 ############################ create_ras ####################################### 541 ############################ create_ras #######################################
667 542
668 def create_ras (resolve_rules, dataset_name, single_ras, rules, ids): 543 def create_ras (resolve_rules, dataset_name, rules, ids, file):
669 544
670 if resolve_rules == None: 545 if resolve_rules == None:
671 warning("Couldn't generate RAS for current dataset: " + dataset_name) 546 warning("Couldn't generate RAS for current dataset: " + dataset_name)
672 547
673 for geni in resolve_rules.values(): 548 for geni in resolve_rules.values():
678 output_ras = pd.DataFrame.from_dict(resolve_rules) 553 output_ras = pd.DataFrame.from_dict(resolve_rules)
679 554
680 output_ras.insert(0, 'Reactions', ids) 555 output_ras.insert(0, 'Reactions', ids)
681 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) 556 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
682 557
683 if (single_ras): 558 text_file = open(file, "w")
684 args = process_args(sys.argv)
685 text_file = open(args.single_ras_file, "w")
686 else:
687 text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
688 559
689 text_file.write(output_to_csv) 560 text_file.write(output_to_csv)
690 text_file.close() 561 text_file.close()
691 562
692 ############################ map ##############################################
693
694 def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
695 args = process_args(sys.argv)
696 if (not class_pat) or (len(class_pat.keys()) < 2):
697 sys.exit('Execution aborted: classes provided for comparisons are ' +
698 'less than two\n')
699 for i, j in it.combinations(class_pat.keys(), 2):
700 tmp = {}
701 count = 0
702 max_F_C = 0
703 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
704 try:
705 stat_D, p_value = st.ks_2samp(l1, l2)
706 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
707 if not isinstance(avg, str):
708 if max_F_C < abs(avg):
709 max_F_C = abs(avg)
710 tmp[ids[count]] = [float(p_value), avg]
711 count += 1
712 except (TypeError, ZeroDivisionError):
713 count += 1
714 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
715 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
716 tmp_csv = tmp_csv.reset_index()
717 header = ['ids', 'P_Value', 'Log2(fold change)']
718 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
719
720 if create_svg or create_pdf:
721 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
722 and args.yes_no == 'yes'):
723 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
724 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
725 with open(file_svg, 'wb') as new_map:
726 new_map.write(ET.tostring(core_map))
727
728
729 if create_pdf:
730 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
731 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
732
733 if not create_svg:
734 #Ho utilizzato il file svg per generare il pdf,
735 #ma l'utente non ne ha richiesto il ritorno, quindi
736 #lo elimino
737 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
738
739 return None
740
741 ############################ MAIN ############################################# 563 ############################ MAIN #############################################
742 564
743 def main(): 565 def main():
744 args = process_args(sys.argv) 566 args = process_args(sys.argv)
745 567
746 create_svg = check_bool(args.generate_svg)
747 create_pdf = check_bool(args.generate_pdf)
748 generate_ras = check_bool(args.generate_ras)
749
750 os.makedirs('result')
751
752 if generate_ras:
753 os.makedirs('ras')
754
755 if args.rules_selector == 'HMRcore': 568 if args.rules_selector == 'HMRcore':
756 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) 569 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
757 elif args.rules_selector == 'Recon': 570 elif args.rules_selector == 'Recon':
758 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) 571 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
759 elif args.rules_selector == 'Custom': 572 elif args.rules_selector == 'Custom':
760 ids, rules, gene_in_rule = make_recon(args.custom) 573 ids, rules, gene_in_rule = make_recon(args.custom)
761 574
762 resolve_none = check_bool(args.none) 575 resolve_none = check_bool(args.none)
763 576
764 class_pat = {} 577
765 578 name = "RAS Dataset"
766 if args.option == 'datasets_rasonly': 579 dataset = read_dataset(args.input, "dataset")
767 name = "RAS Dataset" 580
768 dataset = read_dataset(args.input_datas[0],"dataset") 581 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
769 582
770 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) 583 type_gene = gene_type(dataset.iloc[0, 0], name)
771
772 type_gene = gene_type(dataset.iloc[0, 0], name)
773
774 if args.rules_selector != 'Custom':
775 genes = data_gene(dataset, type_gene, name, None)
776 ids, rules = load_id_rules(recon.get(type_gene))
777 elif args.rules_selector == 'Custom':
778 genes = data_gene(dataset, type_gene, name, gene_in_rule)
779 584
780 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) 585 if args.rules_selector != 'Custom':
781 586 genes = data_gene(dataset, type_gene, name, None)
782 create_ras(resolve_rules, name, True, rules, ids) 587 ids, rules = load_id_rules(recon.get(type_gene))
783 588 elif args.rules_selector == 'Custom':
784 if err != None and err: 589 genes = data_gene(dataset, type_gene, name, gene_in_rule)
785 warning('Warning: gene\n' + str(err) + '\nnot found in class ' 590
786 + name + ', the expression level for this gene ' + 591 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
787 'will be considered NaN\n') 592
788 593 create_ras(resolve_rules, name, rules, ids, args.ras_output)
789 print('execution succeded') 594
790 return None 595 if err != None and err:
791 596 warning('Warning: gene\n' + str(err) + '\nnot found in class '
792 597 + name + ', the expression level for this gene ' +
793 elif args.option == 'datasets': 598 'will be considered NaN\n')
794 num = 1 599
795 for i, j in zip(args.input_datas, args.names): 600
796
797 name = name_dataset(j, num)
798 dataset = read_dataset(i, name)
799
800 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
801
802 type_gene = gene_type(dataset.iloc[0, 0], name)
803
804 if args.rules_selector != 'Custom':
805 genes = data_gene(dataset, type_gene, name, None)
806 ids, rules = load_id_rules(recon.get(type_gene))
807 elif args.rules_selector == 'Custom':
808 genes = data_gene(dataset, type_gene, name, gene_in_rule)
809
810
811 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
812
813 if generate_ras:
814 create_ras(resolve_rules, name, False, rules, ids)
815
816 if err != None and err:
817 warning('Warning: gene\n' + str(err) + '\nnot found in class '
818 + name + ', the expression level for this gene ' +
819 'will be considered NaN\n')
820 if resolve_rules != None:
821 class_pat[name] = list(map(list, zip(*resolve_rules.values())))
822 num += 1
823 elif args.option == 'dataset_class':
824 name = 'RNAseq'
825 dataset = read_dataset(args.input_data, name)
826 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
827 type_gene = gene_type(dataset.iloc[0, 0], name)
828 classes = read_dataset(args.input_class, 'class')
829 if not len(classes.columns) == 2:
830 warning('Warning: more than 2 columns in class file. Extra' +
831 'columns have been disregarded\n')
832 classes = classes.astype(str)
833 if args.rules_selector != 'Custom':
834 genes = data_gene(dataset, type_gene, name, None)
835 ids, rules = load_id_rules(recon.get(type_gene))
836 elif args.rules_selector == 'Custom':
837 genes = data_gene(dataset, type_gene, name, gene_in_rule)
838 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
839 if err != None and err:
840 warning('Warning: gene\n'+str(err)+'\nnot found in class '
841 + name + ', the expression level for this gene ' +
842 'will be considered NaN\n')
843 if resolve_rules != None:
844 class_pat = split_class(classes, resolve_rules)
845 if generate_ras:
846 create_ras(resolve_rules, name, False, rules, ids)
847
848
849 if args.rules_selector == 'Custom':
850 if args.yes_no == 'yes':
851 try:
852 core_map = ET.parse(args.custom_map)
853 except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
854 sys.exit('Execution aborted: custom map in wrong format')
855 elif args.yes_no == 'no':
856 core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
857 else:
858 core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
859
860 maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
861
862 print('Execution succeded') 601 print('Execution succeded')
863 602
864 return None 603 return None
865 604
866 ############################################################################### 605 ###############################################################################