comparison Marea/marea.py @ 47:3af9d394367c draft

Uploaded
author bimib
date Wed, 19 Feb 2020 10:44:52 -0500
parents 7b1971251c63
children e4235b5231e4
comparison
equal deleted inserted replaced
46:5d5d01ef1d68 47:3af9d394367c
24 choices = ['HMRcore', 'Recon', 'Custom'], 24 choices = ['HMRcore', 'Recon', 'Custom'],
25 help = 'chose which type of dataset you want use') 25 help = 'chose which type of dataset you want use')
26 parser.add_argument('-cr', '--custom', 26 parser.add_argument('-cr', '--custom',
27 type = str, 27 type = str,
28 help='your dataset if you want custom rules') 28 help='your dataset if you want custom rules')
29 parser.add_argument('-na', '--names',
30 type = str,
31 nargs = '+',
32 help = 'input names')
33 parser.add_argument('-n', '--none', 29 parser.add_argument('-n', '--none',
34 type = str, 30 type = str,
35 default = 'true', 31 default = 'true',
36 choices = ['true', 'false'], 32 choices = ['true', 'false'],
37 help = 'compute Nan values') 33 help = 'compute Nan values')
38 parser.add_argument('-pv' ,'--pValue', 34 parser.add_argument('-pv' ,'--pValue',
39 type = float, 35 type = float,
40 default = 0.05, 36 default = 0.1,
41 help = 'P-Value threshold (default: %(default)s)') 37 help = 'P-Value threshold (default: %(default)s)')
42 parser.add_argument('-fc', '--fChange', 38 parser.add_argument('-fc', '--fChange',
43 type = float, 39 type = float,
44 default = 1.5, 40 default = 1.5,
45 help = 'Fold-Change threshold (default: %(default)s)') 41 help = 'Fold-Change threshold (default: %(default)s)')
47 type = str, 43 type = str,
48 required = True, 44 required = True,
49 help = 'your tool directory') 45 help = 'your tool directory')
50 parser.add_argument('-op', '--option', 46 parser.add_argument('-op', '--option',
51 type = str, 47 type = str,
52 choices = ['datasets', 'dataset_class', 'datasets_rasonly'], 48 choices = ['datasets', 'dataset_class'],
53 help='dataset or dataset and class') 49 help='dataset or dataset and class')
54 parser.add_argument('-ol', '--out_log', 50 parser.add_argument('-ol', '--out_log',
55 help = "Output log") 51 help = "Output log")
56 parser.add_argument('-ids', '--input_datas',
57 type = str,
58 nargs = '+',
59 help = 'input datasets')
60 parser.add_argument('-id', '--input_data', 52 parser.add_argument('-id', '--input_data',
61 type = str, 53 type = str,
62 help = 'input dataset') 54 help = 'input dataset')
63 parser.add_argument('-ic', '--input_class', 55 parser.add_argument('-ic', '--input_class',
64 type = str, 56 type = str,
78 parser.add_argument('-gp', '--generate_pdf', 70 parser.add_argument('-gp', '--generate_pdf',
79 type = str, 71 type = str,
80 default = 'true', 72 default = 'true',
81 choices = ['true', 'false'], 73 choices = ['true', 'false'],
82 help = 'generate pdf map') 74 help = 'generate pdf map')
83 parser.add_argument('-gr', '--generate_ras', 75 parser.add_argument('-on', '--control',
76 type = str)
77 parser.add_argument('-co', '--comparison',
78 type = str,
79 default = '1vs1',
80 choices = ['manyvsmany', 'onevsrest', 'onevsmany'])
81 parser.add_argument('-ids', '--input_datas',
84 type = str, 82 type = str,
85 default = 'true', 83 nargs = '+',
86 choices = ['true', 'false'], 84 help = 'input datasets')
87 help = 'generate reaction activity score') 85 parser.add_argument('-na', '--names',
88 parser.add_argument('-sr', '--single_ras_file', 86 type = str,
89 type = str, 87 nargs = '+',
90 help = 'file that will contain ras') 88 help = 'input names')
91 89
92 args = parser.parse_args() 90 args = parser.parse_args()
93 return args 91 return args
94 92
95 ########################### warning ########################################### 93 ########################### warning ###########################################
96 94
613 611
614 ############################ resolve ########################################## 612 ############################ resolve ##########################################
615 613
616 def resolve(genes, rules, ids, resolve_none, name): 614 def resolve(genes, rules, ids, resolve_none, name):
617 resolve_rules = {} 615 resolve_rules = {}
618 names_array = []
619 not_found = [] 616 not_found = []
620 flag = False 617 flag = False
621 for key, value in genes.items(): 618 for key, value in genes.items():
622 tmp_resolve = [] 619 tmp_resolve = []
623 for i in range(len(rules)): 620 for i in range(len(rules)):
661 else: 658 else:
662 warning('Warning: no sample found in class ' + classe + 659 warning('Warning: no sample found in class ' + classe +
663 ', the class has been disregarded\n') 660 ', the class has been disregarded\n')
664 return class_pat 661 return class_pat
665 662
666 ############################ create_ras #######################################
667
668 def create_ras (resolve_rules, dataset_name, single_ras, rules, ids):
669
670 if resolve_rules == None:
671 warning("Couldn't generate RAS for current dataset: " + dataset_name)
672
673 for geni in resolve_rules.values():
674 for i, valori in enumerate(geni):
675 if valori == None:
676 geni[i] = 'None'
677
678 output_ras = pd.DataFrame.from_dict(resolve_rules)
679
680 output_ras.insert(0, 'Reactions', ids)
681 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
682
683 if (single_ras):
684 args = process_args(sys.argv)
685 text_file = open(args.single_ras_file, "w")
686 else:
687 text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
688
689 text_file.write(output_to_csv)
690 text_file.close()
691
692 ############################ map ############################################## 663 ############################ map ##############################################
693 664
694 def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf): 665 def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf, comparison, control):
695 args = process_args(sys.argv) 666 args = process_args(sys.argv)
696 if (not class_pat) or (len(class_pat.keys()) < 2): 667 if (not class_pat) or (len(class_pat.keys()) < 2):
697 sys.exit('Execution aborted: classes provided for comparisons are ' + 668 sys.exit('Execution aborted: classes provided for comparisons are ' +
698 'less than two\n') 669 'less than two\n')
699 for i, j in it.combinations(class_pat.keys(), 2): 670
700 tmp = {} 671 if comparison == "manyvsmany":
701 count = 0 672 for i, j in it.combinations(class_pat.keys(), 2):
702 max_F_C = 0 673
703 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): 674 tmp = {}
704 try: 675 count = 0
705 stat_D, p_value = st.ks_2samp(l1, l2) 676 max_F_C = 0
706 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) 677 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
707 if not isinstance(avg, str): 678 try:
708 if max_F_C < abs(avg): 679 stat_D, p_value = st.ks_2samp(l1, l2)
709 max_F_C = abs(avg) 680 #sum(l1) da errore secondo me perchè ha null
710 tmp[ids[count]] = [float(p_value), avg] 681 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
711 count += 1 682 if not isinstance(avg, str):
712 except (TypeError, ZeroDivisionError): 683 if max_F_C < abs(avg):
713 count += 1 684 max_F_C = abs(avg)
714 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' 685 tmp[ids[count]] = [float(p_value), avg]
715 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") 686 count += 1
716 tmp_csv = tmp_csv.reset_index() 687 except (TypeError, ZeroDivisionError):
717 header = ['ids', 'P_Value', 'Log2(fold change)'] 688 count += 1
718 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) 689 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
690 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
691 tmp_csv = tmp_csv.reset_index()
692 header = ['ids', 'P_Value', 'Log2(fold change)']
693 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
694
695 if create_svg or create_pdf:
696 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
697 and args.yes_no == 'yes'):
698 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
699 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
700 with open(file_svg, 'wb') as new_map:
701 new_map.write(ET.tostring(core_map))
702
703
704 if create_pdf:
705 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
706 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
707
708 if not create_svg:
709 #Ho utilizzato il file svg per generare il pdf,
710 #ma l'utente non ne ha richiesto il ritorno, quindi
711 #lo elimino
712
713 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
714 elif comparison == "onevsrest":
715 for single_cluster in class_pat.keys():
716 t = []
717 for k in class_pat.keys():
718 if k != single_cluster:
719 t.append(class_pat.get(k))
720 rest = []
721 for i in t:
722 rest = rest + i
723
724 tmp = {}
725 count = 0
726 max_F_C = 0
727
728 for l1, l2 in zip(rest, class_pat.get(single_cluster)):
729 try:
730 stat_D, p_value = st.ks_2samp(l1, l2)
731 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
732 if not isinstance(avg, str):
733 if max_F_C < abs(avg):
734 max_F_C = abs(avg)
735 tmp[ids[count]] = [float(p_value), avg]
736 count += 1
737 except (TypeError, ZeroDivisionError):
738 count += 1
739 tab = 'result/' + single_cluster + '_vs_rest (Tabular Result).tsv'
740 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
741 tmp_csv = tmp_csv.reset_index()
742 header = ['ids', 'P_Value', 'Log2(fold change)']
743 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
744
745 if create_svg or create_pdf:
746 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
747 and args.yes_no == 'yes'):
748 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
749 file_svg = 'result/' + single_cluster + '_vs_ rest (SVG Map).svg'
750 with open(file_svg, 'wb') as new_map:
751 new_map.write(ET.tostring(core_map))
752
753
754 if create_pdf:
755 file_pdf = 'result/' + single_cluster + '_vs_ rest (PDF Map).pdf'
756 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
757
758 if not create_svg:
759 os.remove('result/' + single_cluster + '_vs_ rest (SVG Map).svg')
760
761 elif comparison == "onevsmany":
762 for i, j in it.combinations(class_pat.keys(), 2):
763
764 if i != control and j != control:
765 print(str(control) + " " + str(i) + " " + str(j))
766 #Se è un confronto fra elementi diversi dal controllo, skippo
767 continue
768
769 print('vado')
770 tmp = {}
771 count = 0
772 max_F_C = 0
773 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
774 try:
775 stat_D, p_value = st.ks_2samp(l1, l2)
776 #sum(l1) da errore secondo me perchè ha null
777 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
778 if not isinstance(avg, str):
779 if max_F_C < abs(avg):
780 max_F_C = abs(avg)
781 tmp[ids[count]] = [float(p_value), avg]
782 count += 1
783 except (TypeError, ZeroDivisionError):
784 count += 1
785 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
786 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
787 tmp_csv = tmp_csv.reset_index()
788 header = ['ids', 'P_Value', 'Log2(fold change)']
789 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
790
791 if create_svg or create_pdf:
792 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
793 and args.yes_no == 'yes'):
794 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
795 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
796 with open(file_svg, 'wb') as new_map:
797 new_map.write(ET.tostring(core_map))
798
799
800 if create_pdf:
801 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
802 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
803
804 if not create_svg:
805 #Ho utilizzato il file svg per generare il pdf,
806 #ma l'utente non ne ha richiesto il ritorno, quindi
807 #lo elimino
808
809 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
719 810
720 if create_svg or create_pdf: 811
721 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' 812
722 and args.yes_no == 'yes'): 813
723 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
724 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
725 with open(file_svg, 'wb') as new_map:
726 new_map.write(ET.tostring(core_map))
727
728
729 if create_pdf:
730 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
731 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
732
733 if not create_svg:
734 #Ho utilizzato il file svg per generare il pdf,
735 #ma l'utente non ne ha richiesto il ritorno, quindi
736 #lo elimino
737 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
738
739 return None 814 return None
740 815
741 ############################ MAIN ############################################# 816 ############################ MAIN #############################################
742 817
743 def main(): 818 def main():
744 args = process_args(sys.argv) 819 args = process_args(sys.argv)
745 820
746 create_svg = check_bool(args.generate_svg) 821 create_svg = check_bool(args.generate_svg)
747 create_pdf = check_bool(args.generate_pdf) 822 create_pdf = check_bool(args.generate_pdf)
748 generate_ras = check_bool(args.generate_ras) 823
749 824 if os.path.isdir('result') == False:
750 os.makedirs('result') 825 os.makedirs('result')
751
752 if generate_ras:
753 os.makedirs('ras')
754 826
755 if args.rules_selector == 'HMRcore': 827 if args.rules_selector == 'HMRcore':
756 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) 828 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
757 elif args.rules_selector == 'Recon': 829 elif args.rules_selector == 'Recon':
758 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) 830 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
759 elif args.rules_selector == 'Custom': 831 elif args.rules_selector == 'Custom':
760 ids, rules, gene_in_rule = make_recon(args.custom) 832 ids, rules, gene_in_rule = make_recon(args.custom)
761 833
762 resolve_none = check_bool(args.none)
763
764 class_pat = {} 834 class_pat = {}
765 835
766 if args.option == 'datasets_rasonly': 836 if args.option == 'datasets':
767 name = "RAS Dataset"
768 dataset = read_dataset(args.input_datas[0],"dataset")
769
770 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
771
772 type_gene = gene_type(dataset.iloc[0, 0], name)
773
774 if args.rules_selector != 'Custom':
775 genes = data_gene(dataset, type_gene, name, None)
776 ids, rules = load_id_rules(recon.get(type_gene))
777 elif args.rules_selector == 'Custom':
778 genes = data_gene(dataset, type_gene, name, gene_in_rule)
779
780 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
781
782 create_ras(resolve_rules, name, True, rules, ids)
783
784 if err != None and err:
785 warning('Warning: gene\n' + str(err) + '\nnot found in class '
786 + name + ', the expression level for this gene ' +
787 'will be considered NaN\n')
788
789 print('execution succeded')
790 return None
791
792
793 elif args.option == 'datasets':
794 num = 1 837 num = 1
795 for i, j in zip(args.input_datas, args.names): 838 for i, j in zip(args.input_datas, args.names):
796
797 name = name_dataset(j, num) 839 name = name_dataset(j, num)
798 dataset = read_dataset(i, name) 840 resolve_rules = read_dataset(i, name)
799 841
800 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) 842 resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
801 843
802 type_gene = gene_type(dataset.iloc[0, 0], name) 844 ids = pd.Series.tolist(resolve_rules.iloc[:, 0])
803 845
804 if args.rules_selector != 'Custom': 846 resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
805 genes = data_gene(dataset, type_gene, name, None) 847 resolve_rules = resolve_rules.replace({'None': None})
806 ids, rules = load_id_rules(recon.get(type_gene)) 848 resolve_rules = resolve_rules.to_dict('list')
807 elif args.rules_selector == 'Custom': 849
808 genes = data_gene(dataset, type_gene, name, gene_in_rule) 850 #Converto i valori da str a float
809 851 to_float = lambda x: float(x) if (x != None) else None
810 852
811 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) 853 resolve_rules_float = {}
812 854
813 if generate_ras: 855 for k in resolve_rules:
814 create_ras(resolve_rules, name, False, rules, ids) 856 resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
815 857
816 if err != None and err:
817 warning('Warning: gene\n' + str(err) + '\nnot found in class '
818 + name + ', the expression level for this gene ' +
819 'will be considered NaN\n')
820 if resolve_rules != None: 858 if resolve_rules != None:
821 class_pat[name] = list(map(list, zip(*resolve_rules.values()))) 859 class_pat[name] = list(map(list, zip(*resolve_rules_float.values())))
860
822 num += 1 861 num += 1
823 elif args.option == 'dataset_class': 862
824 name = 'RNAseq' 863 if args.option == 'dataset_class':
825 dataset = read_dataset(args.input_data, name) 864 name = 'RAS'
826 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) 865 resolve_rules = read_dataset(args.input_data, name)
827 type_gene = gene_type(dataset.iloc[0, 0], name) 866 resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
867
868 ids = pd.Series.tolist(resolve_rules.iloc[:, 0])
869
870 resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
871 resolve_rules = resolve_rules.replace({'None': None})
872 resolve_rules = resolve_rules.to_dict('list')
873
874 #Converto i valori da str a float
875 to_float = lambda x: float(x) if (x != None) else None
876
877 resolve_rules_float = {}
878
879 for k in resolve_rules:
880 resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
881
828 classes = read_dataset(args.input_class, 'class') 882 classes = read_dataset(args.input_class, 'class')
829 if not len(classes.columns) == 2:
830 warning('Warning: more than 2 columns in class file. Extra' +
831 'columns have been disregarded\n')
832 classes = classes.astype(str) 883 classes = classes.astype(str)
833 if args.rules_selector != 'Custom': 884
834 genes = data_gene(dataset, type_gene, name, None) 885 if resolve_rules_float != None:
835 ids, rules = load_id_rules(recon.get(type_gene)) 886 class_pat = split_class(classes, resolve_rules_float)
836 elif args.rules_selector == 'Custom':
837 genes = data_gene(dataset, type_gene, name, gene_in_rule)
838 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
839 if err != None and err:
840 warning('Warning: gene\n'+str(err)+'\nnot found in class '
841 + name + ', the expression level for this gene ' +
842 'will be considered NaN\n')
843 if resolve_rules != None:
844 class_pat = split_class(classes, resolve_rules)
845 if generate_ras:
846 create_ras(resolve_rules, name, False, rules, ids)
847
848 887
849 if args.rules_selector == 'Custom': 888 if args.rules_selector == 'Custom':
850 if args.yes_no == 'yes': 889 if args.yes_no == 'yes':
851 try: 890 try:
852 core_map = ET.parse(args.custom_map) 891 core_map = ET.parse(args.custom_map)
855 elif args.yes_no == 'no': 894 elif args.yes_no == 'no':
856 core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg') 895 core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
857 else: 896 else:
858 core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg') 897 core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
859 898
860 maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf) 899 maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf, args.comparison, args.control)
861 900
862 print('Execution succeded') 901 print('Execution succeded')
863 902
864 return None 903 return None
904
865 905
866 ############################################################################### 906 ###############################################################################
867 907
868 if __name__ == "__main__": 908 if __name__ == "__main__":
869 main() 909 main()