Mercurial > repos > bimib > marea
diff Marea/marea.py @ 47:3af9d394367c draft
Uploaded
author | bimib |
---|---|
date | Wed, 19 Feb 2020 10:44:52 -0500 |
parents | 7b1971251c63 |
children | e4235b5231e4 |
line wrap: on
line diff
--- a/Marea/marea.py Wed Jan 22 11:50:54 2020 -0500 +++ b/Marea/marea.py Wed Feb 19 10:44:52 2020 -0500 @@ -26,10 +26,6 @@ parser.add_argument('-cr', '--custom', type = str, help='your dataset if you want custom rules') - parser.add_argument('-na', '--names', - type = str, - nargs = '+', - help = 'input names') parser.add_argument('-n', '--none', type = str, default = 'true', @@ -37,7 +33,7 @@ help = 'compute Nan values') parser.add_argument('-pv' ,'--pValue', type = float, - default = 0.05, + default = 0.1, help = 'P-Value threshold (default: %(default)s)') parser.add_argument('-fc', '--fChange', type = float, @@ -49,14 +45,10 @@ help = 'your tool directory') parser.add_argument('-op', '--option', type = str, - choices = ['datasets', 'dataset_class', 'datasets_rasonly'], + choices = ['datasets', 'dataset_class'], help='dataset or dataset and class') parser.add_argument('-ol', '--out_log', help = "Output log") - parser.add_argument('-ids', '--input_datas', - type = str, - nargs = '+', - help = 'input datasets') parser.add_argument('-id', '--input_data', type = str, help = 'input dataset') @@ -80,15 +72,21 @@ default = 'true', choices = ['true', 'false'], help = 'generate pdf map') - parser.add_argument('-gr', '--generate_ras', + parser.add_argument('-on', '--control', + type = str) + parser.add_argument('-co', '--comparison', + type = str, + default = '1vs1', + choices = ['manyvsmany', 'onevsrest', 'onevsmany']) + parser.add_argument('-ids', '--input_datas', type = str, - default = 'true', - choices = ['true', 'false'], - help = 'generate reaction activity score') - parser.add_argument('-sr', '--single_ras_file', - type = str, - help = 'file that will contain ras') - + nargs = '+', + help = 'input datasets') + parser.add_argument('-na', '--names', + type = str, + nargs = '+', + help = 'input names') + args = parser.parse_args() return args @@ -615,7 +613,6 @@ def resolve(genes, rules, ids, resolve_none, name): resolve_rules = {} - names_array = [] not_found = [] flag = False for key, value in genes.items(): @@ -663,79 +660,157 @@ ', the class has been disregarded\n') return class_pat -############################ create_ras ####################################### - -def create_ras (resolve_rules, dataset_name, single_ras, rules, ids): - - if resolve_rules == None: - warning("Couldn't generate RAS for current dataset: " + dataset_name) - - for geni in resolve_rules.values(): - for i, valori in enumerate(geni): - if valori == None: - geni[i] = 'None' - - output_ras = pd.DataFrame.from_dict(resolve_rules) - - output_ras.insert(0, 'Reactions', ids) - output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) - - if (single_ras): - args = process_args(sys.argv) - text_file = open(args.single_ras_file, "w") - else: - text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") - - text_file.write(output_to_csv) - text_file.close() - ############################ map ############################################## -def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf): +def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf, comparison, control): args = process_args(sys.argv) if (not class_pat) or (len(class_pat.keys()) < 2): sys.exit('Execution aborted: classes provided for comparisons are ' + 'less than two\n') - for i, j in it.combinations(class_pat.keys(), 2): - tmp = {} - count = 0 - max_F_C = 0 - for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): - try: - stat_D, p_value = st.ks_2samp(l1, l2) - avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) - if not isinstance(avg, str): - if max_F_C < abs(avg): - max_F_C = abs(avg) - tmp[ids[count]] = [float(p_value), avg] - count += 1 - except (TypeError, ZeroDivisionError): - count += 1 - tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' - tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") - tmp_csv = tmp_csv.reset_index() - header = ['ids', 'P_Value', 'Log2(fold change)'] - tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) + + if comparison == "manyvsmany": + for i, j in it.combinations(class_pat.keys(), 2): + + tmp = {} + count = 0 + max_F_C = 0 + for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): + try: + stat_D, p_value = st.ks_2samp(l1, l2) + #sum(l1) da errore secondo me perchè ha null + avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) + if not isinstance(avg, str): + if max_F_C < abs(avg): + max_F_C = abs(avg) + tmp[ids[count]] = [float(p_value), avg] + count += 1 + except (TypeError, ZeroDivisionError): + count += 1 + tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' + tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") + tmp_csv = tmp_csv.reset_index() + header = ['ids', 'P_Value', 'Log2(fold change)'] + tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) + + if create_svg or create_pdf: + if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' + and args.yes_no == 'yes'): + fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) + file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg' + with open(file_svg, 'wb') as new_map: + new_map.write(ET.tostring(core_map)) + + + if create_pdf: + file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf' + renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) + + if not create_svg: + #Ho utilizzato il file svg per generare il pdf, + #ma l'utente non ne ha richiesto il ritorno, quindi + #lo elimino + + os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg') + elif comparison == "onevsrest": + for single_cluster in class_pat.keys(): + t = [] + for k in class_pat.keys(): + if k != single_cluster: + t.append(class_pat.get(k)) + rest = [] + for i in t: + rest = rest + i + + tmp = {} + count = 0 + max_F_C = 0 + + for l1, l2 in zip(rest, class_pat.get(single_cluster)): + try: + stat_D, p_value = st.ks_2samp(l1, l2) + avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) + if not isinstance(avg, str): + if max_F_C < abs(avg): + max_F_C = abs(avg) + tmp[ids[count]] = [float(p_value), avg] + count += 1 + except (TypeError, ZeroDivisionError): + count += 1 + tab = 'result/' + single_cluster + '_vs_rest (Tabular Result).tsv' + tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") + tmp_csv = tmp_csv.reset_index() + header = ['ids', 'P_Value', 'Log2(fold change)'] + tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) + + if create_svg or create_pdf: + if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' + and args.yes_no == 'yes'): + fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) + file_svg = 'result/' + single_cluster + '_vs_ rest (SVG Map).svg' + with open(file_svg, 'wb') as new_map: + new_map.write(ET.tostring(core_map)) + + + if create_pdf: + file_pdf = 'result/' + single_cluster + '_vs_ rest (PDF Map).pdf' + renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) + + if not create_svg: + os.remove('result/' + single_cluster + '_vs_ rest (SVG Map).svg') + + elif comparison == "onevsmany": + for i, j in it.combinations(class_pat.keys(), 2): + + if i != control and j != control: + print(str(control) + " " + str(i) + " " + str(j)) + #Se è un confronto fra elementi diversi dal controllo, skippo + continue + + print('vado') + tmp = {} + count = 0 + max_F_C = 0 + for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): + try: + stat_D, p_value = st.ks_2samp(l1, l2) + #sum(l1) da errore secondo me perchè ha null + avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) + if not isinstance(avg, str): + if max_F_C < abs(avg): + max_F_C = abs(avg) + tmp[ids[count]] = [float(p_value), avg] + count += 1 + except (TypeError, ZeroDivisionError): + count += 1 + tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' + tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") + tmp_csv = tmp_csv.reset_index() + header = ['ids', 'P_Value', 'Log2(fold change)'] + tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) + + if create_svg or create_pdf: + if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' + and args.yes_no == 'yes'): + fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) + file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg' + with open(file_svg, 'wb') as new_map: + new_map.write(ET.tostring(core_map)) + + + if create_pdf: + file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf' + renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) + + if not create_svg: + #Ho utilizzato il file svg per generare il pdf, + #ma l'utente non ne ha richiesto il ritorno, quindi + #lo elimino + + os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg') - if create_svg or create_pdf: - if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' - and args.yes_no == 'yes'): - fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) - file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg' - with open(file_svg, 'wb') as new_map: - new_map.write(ET.tostring(core_map)) - - - if create_pdf: - file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf' - renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) - - if not create_svg: - #Ho utilizzato il file svg per generare il pdf, - #ma l'utente non ne ha richiesto il ritorno, quindi - #lo elimino - os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg') - + + + return None ############################ MAIN ############################################# @@ -745,12 +820,9 @@ create_svg = check_bool(args.generate_svg) create_pdf = check_bool(args.generate_pdf) - generate_ras = check_bool(args.generate_ras) - os.makedirs('result') - - if generate_ras: - os.makedirs('ras') + if os.path.isdir('result') == False: + os.makedirs('result') if args.rules_selector == 'HMRcore': recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) @@ -758,93 +830,60 @@ recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) elif args.rules_selector == 'Custom': ids, rules, gene_in_rule = make_recon(args.custom) - - resolve_none = check_bool(args.none) - + class_pat = {} - if args.option == 'datasets_rasonly': - name = "RAS Dataset" - dataset = read_dataset(args.input_datas[0],"dataset") - - dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) - - type_gene = gene_type(dataset.iloc[0, 0], name) - - if args.rules_selector != 'Custom': - genes = data_gene(dataset, type_gene, name, None) - ids, rules = load_id_rules(recon.get(type_gene)) - elif args.rules_selector == 'Custom': - genes = data_gene(dataset, type_gene, name, gene_in_rule) - - resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) - - create_ras(resolve_rules, name, True, rules, ids) - - if err != None and err: - warning('Warning: gene\n' + str(err) + '\nnot found in class ' - + name + ', the expression level for this gene ' + - 'will be considered NaN\n') - - print('execution succeded') - return None - - - elif args.option == 'datasets': + if args.option == 'datasets': num = 1 for i, j in zip(args.input_datas, args.names): - name = name_dataset(j, num) - dataset = read_dataset(i, name) - - dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) - - type_gene = gene_type(dataset.iloc[0, 0], name) + resolve_rules = read_dataset(i, name) + + resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str) - if args.rules_selector != 'Custom': - genes = data_gene(dataset, type_gene, name, None) - ids, rules = load_id_rules(recon.get(type_gene)) - elif args.rules_selector == 'Custom': - genes = data_gene(dataset, type_gene, name, gene_in_rule) - - - resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) + ids = pd.Series.tolist(resolve_rules.iloc[:, 0]) - if generate_ras: - create_ras(resolve_rules, name, False, rules, ids) + resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1) + resolve_rules = resolve_rules.replace({'None': None}) + resolve_rules = resolve_rules.to_dict('list') - if err != None and err: - warning('Warning: gene\n' + str(err) + '\nnot found in class ' - + name + ', the expression level for this gene ' + - 'will be considered NaN\n') + #Converto i valori da str a float + to_float = lambda x: float(x) if (x != None) else None + + resolve_rules_float = {} + + for k in resolve_rules: + resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float + if resolve_rules != None: - class_pat[name] = list(map(list, zip(*resolve_rules.values()))) + class_pat[name] = list(map(list, zip(*resolve_rules_float.values()))) + num += 1 - elif args.option == 'dataset_class': - name = 'RNAseq' - dataset = read_dataset(args.input_data, name) - dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) - type_gene = gene_type(dataset.iloc[0, 0], name) + + if args.option == 'dataset_class': + name = 'RAS' + resolve_rules = read_dataset(args.input_data, name) + resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str) + + ids = pd.Series.tolist(resolve_rules.iloc[:, 0]) + + resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1) + resolve_rules = resolve_rules.replace({'None': None}) + resolve_rules = resolve_rules.to_dict('list') + + #Converto i valori da str a float + to_float = lambda x: float(x) if (x != None) else None + + resolve_rules_float = {} + + for k in resolve_rules: + resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float + classes = read_dataset(args.input_class, 'class') - if not len(classes.columns) == 2: - warning('Warning: more than 2 columns in class file. Extra' + - 'columns have been disregarded\n') classes = classes.astype(str) - if args.rules_selector != 'Custom': - genes = data_gene(dataset, type_gene, name, None) - ids, rules = load_id_rules(recon.get(type_gene)) - elif args.rules_selector == 'Custom': - genes = data_gene(dataset, type_gene, name, gene_in_rule) - resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) - if err != None and err: - warning('Warning: gene\n'+str(err)+'\nnot found in class ' - + name + ', the expression level for this gene ' + - 'will be considered NaN\n') - if resolve_rules != None: - class_pat = split_class(classes, resolve_rules) - if generate_ras: - create_ras(resolve_rules, name, False, rules, ids) - + + if resolve_rules_float != None: + class_pat = split_class(classes, resolve_rules_float) if args.rules_selector == 'Custom': if args.yes_no == 'yes': @@ -857,11 +896,12 @@ else: core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg') - maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf) + maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf, args.comparison, args.control) print('Execution succeded') + + return None - return None ###############################################################################