# HG changeset patch
# User bimib
# Date 1571156476 14400
# Node ID e88efefbd015de3356cd9f0a85f80bd7775021d1
# Parent 9fcb0e8d6d47e63ec4adce79de8c12f767b884cd
fix changes
diff -r 9fcb0e8d6d47 -r e88efefbd015 Desktop/Marea/marea.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Desktop/Marea/marea.py Tue Oct 15 12:21:16 2019 -0400
@@ -0,0 +1,861 @@
+from __future__ import division
+import sys
+import pandas as pd
+import itertools as it
+import scipy.stats as st
+import collections
+import lxml.etree as ET
+import shutil
+import pickle as pk
+import math
+import os
+import argparse
+from svglib.svglib import svg2rlg
+from reportlab.graphics import renderPDF
+
+########################## argparse ##########################################
+
+def process_args(args):
+ parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
+ description = 'process some value\'s'+
+ ' genes to create a comparison\'s map.')
+ parser.add_argument('-rs', '--rules_selector',
+ type = str,
+ default = 'HMRcore',
+ choices = ['HMRcore', 'Recon', 'Custom'],
+ help = 'chose which type of dataset you want use')
+ parser.add_argument('-cr', '--custom',
+ type = str,
+ help='your dataset if you want custom rules')
+ parser.add_argument('-na', '--names',
+ type = str,
+ nargs = '+',
+ help = 'input names')
+ parser.add_argument('-n', '--none',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'compute Nan values')
+ parser.add_argument('-pv' ,'--pValue',
+ type = float,
+ default = 0.05,
+ help = 'P-Value threshold (default: %(default)s)')
+ parser.add_argument('-fc', '--fChange',
+ type = float,
+ default = 1.5,
+ help = 'Fold-Change threshold (default: %(default)s)')
+ parser.add_argument('-td', '--tool_dir',
+ type = str,
+ required = True,
+ help = 'your tool directory')
+ parser.add_argument('-op', '--option',
+ type = str,
+ choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
+ help='dataset or dataset and class')
+ parser.add_argument('-ol', '--out_log',
+ help = "Output log")
+ parser.add_argument('-ids', '--input_datas',
+ type = str,
+ nargs = '+',
+ help = 'input datasets')
+ parser.add_argument('-id', '--input_data',
+ type = str,
+ help = 'input dataset')
+ parser.add_argument('-ic', '--input_class',
+ type = str,
+ help = 'sample group specification')
+ parser.add_argument('-cm', '--custom_map',
+ type = str,
+ help = 'custom map')
+ parser.add_argument('-yn', '--yes_no',
+ type = str,
+ choices = ['yes', 'no'],
+ help = 'if make or not custom map')
+ parser.add_argument('-gs', '--generate_svg',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate svg map')
+ parser.add_argument('-gp', '--generate_pdf',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate pdf map')
+ parser.add_argument('-gr', '--generate_ras',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate reaction activity score')
+ parser.add_argument('-sr', '--single_ras_file',
+ type = str,
+ help = 'file that will contain ras')
+
+ args = parser.parse_args()
+ return args
+
+########################### warning ###########################################
+
+def warning(s):
+ args = process_args(sys.argv)
+ with open(args.out_log, 'a') as log:
+ log.write(s)
+
+############################ dataset input ####################################
+
+def read_dataset(data, name):
+ try:
+ dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python')
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of ' + name + '\n')
+ if len(dataset.columns) < 2:
+ sys.exit('Execution aborted: wrong format of ' + name + '\n')
+ return dataset
+
+############################ dataset name #####################################
+
+def name_dataset(name_data, count):
+ if str(name_data) == 'Dataset':
+ return str(name_data) + '_' + str(count)
+ else:
+ return str(name_data)
+
+############################ load id e rules ##################################
+
+def load_id_rules(reactions):
+ ids, rules = [], []
+ for key, value in reactions.items():
+ ids.append(key)
+ rules.append(value)
+ return (ids, rules)
+
+############################ check_methods ####################################
+
+def gene_type(l, name):
+ if check_hgnc(l):
+ return 'hugo_id'
+ elif check_ensembl(l):
+ return 'ensembl_gene_id'
+ elif check_symbol(l):
+ return 'symbol'
+ elif check_entrez(l):
+ return 'entrez_id'
+ else:
+ sys.exit('Execution aborted:\n' +
+ 'gene ID type in ' + name + ' not supported. Supported ID'+
+ 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
+
+def check_hgnc(l):
+ if len(l) > 5:
+ if (l.upper()).startswith('HGNC:'):
+ return l[5:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_ensembl(l):
+ if len(l) == 15:
+ if (l.upper()).startswith('ENS'):
+ return l[4:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_symbol(l):
+ if len(l) > 0:
+ if l[0].isalpha() and l[1:].isalnum():
+ return True
+ else:
+ return False
+ else:
+ return False
+
+def check_entrez(l):
+ if len(l) > 0:
+ return l.isdigit()
+ else:
+ return False
+
+def check_bool(b):
+ if b == 'true':
+ return True
+ elif b == 'false':
+ return False
+
+############################ resolve_methods ##################################
+
+def replace_gene_value(l, d):
+ tmp = []
+ err = []
+ while l:
+ if isinstance(l[0], list):
+ tmp_rules, tmp_err = replace_gene_value(l[0], d)
+ tmp.append(tmp_rules)
+ err.extend(tmp_err)
+ else:
+ value = replace_gene(l[0], d)
+ tmp.append(value)
+ if value == None:
+ err.append(l[0])
+ l = l[1:]
+ return (tmp, err)
+
+def replace_gene(l, d):
+ if l =='and' or l == 'or':
+ return l
+ else:
+ value = d.get(l, None)
+ if not(value == None or isinstance(value, (int, float))):
+ sys.exit('Execution aborted: ' + value + ' value not valid\n')
+ return value
+
+def computes(val1, op, val2, cn):
+ if val1 != None and val2 != None:
+ if op == 'and':
+ return min(val1, val2)
+ else:
+ return val1 + val2
+ elif op == 'and':
+ if cn is True:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+ else:
+ return None
+ else:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+
+def control(ris, l, cn):
+ if len(l) == 1:
+ if isinstance(l[0], (float, int)) or l[0] == None:
+ return l[0]
+ elif isinstance(l[0], list):
+ return control(None, l[0], cn)
+ else:
+ return False
+ elif len(l) > 2:
+ return control_list(ris, l, cn)
+ else:
+ return False
+
+def control_list(ris, l, cn):
+ while l:
+ if len(l) == 1:
+ return False
+ elif (isinstance(l[0], (float, int)) or
+ l[0] == None) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ ris = computes(l[0], l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[2], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(l[0], l[1], tmp, cn)
+ else:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], (float, int)) or l[1] == None:
+ ris = computes(ris, l[0], l[1], cn)
+ elif isinstance(l[1], list):
+ tmp = control(None,l[1], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(ris, l[0], tmp, cn)
+ else:
+ return False
+ l = l[2:]
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ tmp = control(None, l[0], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(tmp, l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[0], cn)
+ tmp2 = control(None, l[2], cn)
+ if tmp is False or tmp2 is False:
+ return False
+ else:
+ ris = computes(tmp, l[1], tmp2, cn)
+ else:
+ return False
+ l = l[3:]
+ else:
+ return False
+ return ris
+
+############################ map_methods ######################################
+
+def fold_change(avg1, avg2):
+ if avg1 == 0 and avg2 == 0:
+ return 0
+ elif avg1 == 0:
+ return '-INF'
+ elif avg2 == 0:
+ return 'INF'
+ else:
+ return math.log(avg1 / avg2, 2)
+
+def fix_style(l, col, width, dash):
+ tmp = l.split(';')
+ flag_col = False
+ flag_width = False
+ flag_dash = False
+ for i in range(len(tmp)):
+ if tmp[i].startswith('stroke:'):
+ tmp[i] = 'stroke:' + col
+ flag_col = True
+ if tmp[i].startswith('stroke-width:'):
+ tmp[i] = 'stroke-width:' + width
+ flag_width = True
+ if tmp[i].startswith('stroke-dasharray:'):
+ tmp[i] = 'stroke-dasharray:' + dash
+ flag_dash = True
+ if not flag_col:
+ tmp.append('stroke:' + col)
+ if not flag_width:
+ tmp.append('stroke-width:' + width)
+ if not flag_dash:
+ tmp.append('stroke-dasharray:' + dash)
+ return ';'.join(tmp)
+
+def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
+ maxT = 12
+ minT = 2
+ grey = '#BEBEBE'
+ blue = '#0000FF'
+ red = '#E41A1C'
+ for el in core_map.iter():
+ el_id = str(el.get('id'))
+ if el_id.startswith('R_'):
+ tmp = d.get(el_id[2:])
+ if tmp != None:
+ p_val = tmp[0]
+ f_c = tmp[1]
+ if p_val < threshold_P_V:
+ if not isinstance(f_c, str):
+ if abs(f_c) < math.log(threshold_F_C, 2):
+ col = grey
+ width = str(minT)
+ else:
+ if f_c < 0:
+ col = blue
+ elif f_c > 0:
+ col = red
+ width = str(max((abs(f_c) * maxT) / max_F_C, minT))
+ else:
+ if f_c == '-INF':
+ col = blue
+ elif f_c == 'INF':
+ col = red
+ width = str(maxT)
+ dash = 'none'
+ else:
+ dash = '5,5'
+ col = grey
+ width = str(minT)
+ el.set('style', fix_style(el.get('style'), col, width, dash))
+ return core_map
+
+############################ make recon #######################################
+
+def check_and_doWord(l):
+ tmp = []
+ tmp_genes = []
+ count = 0
+ while l:
+ if count >= 0:
+ if l[0] == '(':
+ count += 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ')':
+ count -= 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ' ':
+ l.pop(0)
+ else:
+ word = []
+ while l:
+ if l[0] in [' ', '(', ')']:
+ break
+ else:
+ word.append(l[0])
+ l.pop(0)
+ word = ''.join(word)
+ tmp.append(word)
+ if not(word in ['or', 'and']):
+ tmp_genes.append(word)
+ else:
+ return False
+ if count == 0:
+ return (tmp, tmp_genes)
+ else:
+ return False
+
+def brackets_to_list(l):
+ tmp = []
+ while l:
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ return tmp
+
+def resolve_brackets(l):
+ tmp = []
+ while l[0] != ')':
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ l.pop(0)
+ return tmp
+
+def priorityAND(l):
+ tmp = []
+ flag = True
+ while l:
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ l = l[1:]
+ elif l[0] == 'or':
+ tmp.append(l[0])
+ flag = False
+ l = l[1:]
+ elif l[1] == 'or':
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ tmp.append(l[1])
+ flag = False
+ l = l[2:]
+ elif l[1] == 'and':
+ tmpAnd = []
+ if isinstance(l[0], list):
+ tmpAnd.append(priorityAND(l[0]))
+ else:
+ tmpAnd.append(l[0])
+ tmpAnd.append(l[1])
+ if isinstance(l[2], list):
+ tmpAnd.append(priorityAND(l[2]))
+ else:
+ tmpAnd.append(l[2])
+ l = l[3:]
+ while l:
+ if l[0] == 'and':
+ tmpAnd.append(l[0])
+ if isinstance(l[1], list):
+ tmpAnd.append(priorityAND(l[1]))
+ else:
+ tmpAnd.append(l[1])
+ l = l[2:]
+ elif l[0] == 'or':
+ flag = False
+ break
+ if flag == True: #when there are only AND in list
+ tmp.extend(tmpAnd)
+ elif flag == False:
+ tmp.append(tmpAnd)
+ return tmp
+
+def checkRule(l):
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ if checkRule(l[0]) is False:
+ return False
+ elif len(l) > 2:
+ if checkRule2(l) is False:
+ return False
+ else:
+ return False
+ return True
+
+def checkRule2(l):
+ while l:
+ if len(l) == 1:
+ return False
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if checkRule(l[0]) is False:
+ return False
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[1] in ['and', 'or']:
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], list):
+ if checkRule(l[1]) is False:
+ return False
+ l = l[2:]
+ else:
+ return False
+ return True
+
+def do_rules(rules):
+ split_rules = []
+ err_rules = []
+ tmp_gene_in_rule = []
+ for i in range(len(rules)):
+ tmp = list(rules[i])
+ if tmp:
+ tmp, tmp_genes = check_and_doWord(tmp)
+ tmp_gene_in_rule.extend(tmp_genes)
+ if tmp is False:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ tmp = brackets_to_list(tmp)
+ if checkRule(tmp):
+ split_rules.append(priorityAND(tmp))
+ else:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ split_rules.append([])
+ if err_rules:
+ warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
+ return (split_rules, list(set(tmp_gene_in_rule)))
+
+def make_recon(data):
+ try:
+ import cobra as cb
+ import warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+ recon = cb.io.read_sbml_model(data)
+ react = recon.reactions
+ rules = [react[i].gene_reaction_rule for i in range(len(react))]
+ ids = [react[i].id for i in range(len(react))]
+ except cb.io.sbml3.CobraSBMLError:
+ try:
+ data = (pd.read_csv(data, sep = '\t', dtype = str, engine='python')).fillna('')
+ if len(data.columns) < 2:
+ sys.exit('Execution aborted: wrong format of '+
+ 'custom datarules\n')
+ if not len(data.columns) == 2:
+ warning('Warning: more than 2 columns in custom datarules.\n' +
+ 'Extra columns have been disregarded\n')
+ ids = list(data.iloc[:, 0])
+ rules = list(data.iloc[:, 1])
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of custom datarules\n')
+ except pd.errors.ParserError:
+ sys.exit('Execution aborted: wrong format of custom datarules\n')
+ split_rules, tmp_genes = do_rules(rules)
+ gene_in_rule = {}
+ for i in tmp_genes:
+ gene_in_rule[i] = 'ok'
+ return (ids, split_rules, gene_in_rule)
+
+############################ gene #############################################
+
+def data_gene(gene, type_gene, name, gene_custom):
+ args = process_args(sys.argv)
+ for i in range(len(gene)):
+ tmp = gene.iloc[i, 0]
+ if tmp.startswith(' ') or tmp.endswith(' '):
+ gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
+ gene_dup = [item for item, count in
+ collections.Counter(gene[gene.columns[0]]).items() if count > 1]
+ pat_dup = [item for item, count in
+ collections.Counter(list(gene.columns)).items() if count > 1]
+ if gene_dup:
+ if gene_custom == None:
+ if args.rules_selector == 'HMRcore':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/HMRcore_genes.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/Recon_genes.p', 'rb'))
+ gene_in_rule = gene_in_rule.get(type_gene)
+ else:
+ gene_in_rule = gene_custom
+ tmp = []
+ for i in gene_dup:
+ if gene_in_rule.get(i) == 'ok':
+ tmp.append(i)
+ if tmp:
+ sys.exit('Execution aborted because gene ID '
+ +str(tmp)+' in '+name+' is duplicated\n')
+ if pat_dup:
+ warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name +
+ '\n')
+ return (gene.set_index(gene.columns[0])).to_dict()
+
+############################ resolve ##########################################
+
+def resolve(genes, rules, ids, resolve_none, name):
+ resolve_rules = {}
+ not_found = []
+ flag = False
+ for key, value in genes.items():
+ tmp_resolve = []
+ for i in range(len(rules)):
+ tmp = rules[i]
+ if tmp:
+ tmp, err = replace_gene_value(tmp, value)
+ if err:
+ not_found.extend(err)
+ ris = control(None, tmp, resolve_none)
+ if ris is False or ris == None:
+ tmp_resolve.append(None)
+ else:
+ tmp_resolve.append(ris)
+ flag = True
+ else:
+ tmp_resolve.append(None)
+ resolve_rules[key] = tmp_resolve
+ if flag is False:
+ warning('Warning: no computable score (due to missing gene values)' +
+ 'for class ' + name + ', the class has been disregarded\n')
+ return (None, None)
+ return (resolve_rules, list(set(not_found)))
+
+############################ split class ######################################
+
+def split_class(classes, resolve_rules):
+ class_pat = {}
+ for i in range(len(classes)):
+ classe = classes.iloc[i, 1]
+ if not pd.isnull(classe):
+ l = []
+ for j in range(i, len(classes)):
+ if classes.iloc[j, 1] == classe:
+ pat_id = classes.iloc[j, 0]
+ tmp = resolve_rules.get(pat_id, None)
+ if tmp != None:
+ l.append(tmp)
+ classes.iloc[j, 1] = None
+ if l:
+ class_pat[classe] = list(map(list, zip(*l)))
+ else:
+ warning('Warning: no sample found in class ' + classe +
+ ', the class has been disregarded\n')
+ return class_pat
+
+############################ create_ras #######################################
+
+def create_ras (resolve_rules, dataset_name, single_ras):
+
+ if resolve_rules == None:
+ warning("Couldn't generate RAS for current dataset: " + dataset_name)
+
+ for geni in resolve_rules.values():
+ for i, valori in enumerate(geni):
+ if valori == None:
+ geni[i] = 'None'
+
+ output_ras = pd.DataFrame.from_dict(resolve_rules)
+ output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
+
+ if (single_ras):
+ args = process_args(sys.argv)
+ text_file = open(args.single_ras_file, "w")
+ else:
+ text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+
+ text_file.write(output_to_csv)
+ text_file.close()
+
+############################ map ##############################################
+
+def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
+ args = process_args(sys.argv)
+ if (not class_pat) or (len(class_pat.keys()) < 2):
+ sys.exit('Execution aborted: classes provided for comparisons are ' +
+ 'less than two\n')
+ for i, j in it.combinations(class_pat.keys(), 2):
+ tmp = {}
+ count = 0
+ max_F_C = 0
+ for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
+ try:
+ stat_D, p_value = st.ks_2samp(l1, l2)
+ avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
+ if not isinstance(avg, str):
+ if max_F_C < abs(avg):
+ max_F_C = abs(avg)
+ tmp[ids[count]] = [float(p_value), avg]
+ count += 1
+ except (TypeError, ZeroDivisionError):
+ count += 1
+ tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
+ tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
+ tmp_csv = tmp_csv.reset_index()
+ header = ['ids', 'P_Value', 'Log2(fold change)']
+ tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+
+ if create_svg or create_pdf:
+ if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+ and args.yes_no == 'yes'):
+ fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+ file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
+ with open(file_svg, 'wb') as new_map:
+ new_map.write(ET.tostring(core_map))
+
+
+ if create_pdf:
+ file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
+ renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+
+ if not create_svg:
+ #Ho utilizzato il file svg per generare il pdf,
+ #ma l'utente non ne ha richiesto il ritorno, quindi
+ #lo elimino
+ os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
+
+ return None
+
+############################ MAIN #############################################
+
+def main():
+ args = process_args(sys.argv)
+
+ create_svg = check_bool(args.generate_svg)
+ create_pdf = check_bool(args.generate_pdf)
+ generate_ras = check_bool(args.generate_ras)
+
+ os.makedirs('result')
+
+ if generate_ras:
+ os.makedirs('ras')
+
+ if args.rules_selector == 'HMRcore':
+ recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
+ elif args.rules_selector == 'Custom':
+ ids, rules, gene_in_rule = make_recon(args.custom)
+
+ resolve_none = check_bool(args.none)
+
+ class_pat = {}
+
+ if args.option == 'datasets_rasonly':
+ name = "RAS Dataset"
+ dataset = read_dataset(args.input_datas[0],"dataset")
+
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+ create_ras(resolve_rules, name, True)
+
+ if err != None and err:
+ warning('Warning: gene\n' + str(err) + '\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+
+ print('execution succeded')
+ return None
+
+
+ elif args.option == 'datasets':
+ num = 1
+ for i, j in zip(args.input_datas, args.names):
+
+ name = name_dataset(j, num)
+ dataset = read_dataset(i, name)
+
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+ if generate_ras:
+ create_ras(resolve_rules, name, False)
+
+ if err != None and err:
+ warning('Warning: gene\n' + str(err) + '\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+ if resolve_rules != None:
+ class_pat[name] = list(map(list, zip(*resolve_rules.values())))
+ num += 1
+ elif args.option == 'dataset_class':
+ name = 'RNAseq'
+ dataset = read_dataset(args.input_data, name)
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+ classes = read_dataset(args.input_class, 'class')
+ if not len(classes.columns) == 2:
+ warning('Warning: more than 2 columns in class file. Extra' +
+ 'columns have been disregarded\n')
+ classes = classes.astype(str)
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+ if err != None and err:
+ warning('Warning: gene\n'+str(err)+'\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+ if resolve_rules != None:
+ class_pat = split_class(classes, resolve_rules)
+
+
+ if args.rules_selector == 'Custom':
+ if args.yes_no == 'yes':
+ try:
+ core_map = ET.parse(args.custom_map)
+ except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
+ sys.exit('Execution aborted: custom map in wrong format')
+ elif args.yes_no == 'no':
+ core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
+ else:
+ core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
+
+ maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
+
+ print('Execution succeded')
+
+ return None
+
+###############################################################################
+
+if __name__ == "__main__":
+ main()
diff -r 9fcb0e8d6d47 -r e88efefbd015 Desktop/Marea/marea.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Desktop/Marea/marea.xml Tue Oct 15 12:21:16 2019 -0400
@@ -0,0 +1,274 @@
+
+
+
+ marea_macros.xml
+
+
+ pandas
+ scipy
+ cobra
+ lxml
+ svglib
+ reportlab
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cond['type_selector'] == "datasets_rasonly"
+
+
+ cond['type_selector'] == "datasets" or cond['type_selector'] == "dataset_class"
+
+
+
+ cond['type_selector'] != "datasets_rasonly" and cond['advanced']['choice'] and cond['advanced']['generateRas']
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 9fcb0e8d6d47 -r e88efefbd015 Desktop/Marea/marea_cluster.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Desktop/Marea/marea_cluster.py Tue Oct 15 12:21:16 2019 -0400
@@ -0,0 +1,401 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 3 19:51:00 2019
+@author: Narger
+"""
+
+import sys
+import argparse
+import os
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster
+import matplotlib
+matplotlib.use('agg')
+import matplotlib.pyplot as plt
+import scipy.cluster.hierarchy as shc
+import matplotlib.cm as cm
+import numpy as np
+import pandas as pd
+
+################################# process args ###############################
+
+def process_args(args):
+ parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
+ description = 'process some value\'s' +
+ ' genes to create class.')
+
+ parser.add_argument('-ol', '--out_log',
+ help = "Output log")
+
+ parser.add_argument('-in', '--input',
+ type = str,
+ help = 'input dataset')
+
+ parser.add_argument('-cy', '--cluster_type',
+ type = str,
+ choices = ['kmeans', 'meanshift', 'dbscan', 'hierarchy'],
+ default = 'kmeans',
+ help = 'choose clustering algorythm')
+
+ parser.add_argument('-k1', '--k_min',
+ type = int,
+ default = 2,
+ help = 'choose minimun cluster number to be generated')
+
+ parser.add_argument('-k2', '--k_max',
+ type = int,
+ default = 7,
+ help = 'choose maximum cluster number to be generated')
+
+ parser.add_argument('-el', '--elbow',
+ type = str,
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want to generate an elbow plot for kmeans')
+
+ parser.add_argument('-si', '--silhouette',
+ type = str,
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want silhouette plots')
+
+ parser.add_argument('-db', '--davies',
+ type = str,
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want davies bouldin scores')
+
+ parser.add_argument('-td', '--tool_dir',
+ type = str,
+ required = True,
+ help = 'your tool directory')
+
+ parser.add_argument('-ms', '--min_samples',
+ type = int,
+ help = 'min samples for dbscan (optional)')
+
+ parser.add_argument('-ep', '--eps',
+ type = int,
+ help = 'eps for dbscan (optional)')
+
+ parser.add_argument('-bc', '--best_cluster',
+ type = str,
+ help = 'output of best cluster tsv')
+
+
+
+ args = parser.parse_args()
+ return args
+
+########################### warning ###########################################
+
+def warning(s):
+ args = process_args(sys.argv)
+ with open(args.out_log, 'a') as log:
+ log.write(s + "\n\n")
+ print(s)
+
+########################## read dataset ######################################
+
+def read_dataset(dataset):
+ try:
+ dataset = pd.read_csv(dataset, sep = '\t', header = 0)
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of dataset\n')
+ if len(dataset.columns) < 2:
+ sys.exit('Execution aborted: wrong format of dataset\n')
+ return dataset
+
+############################ rewrite_input ###################################
+
+def rewrite_input(dataset):
+ #Riscrivo il dataset come dizionario di liste,
+ #non come dizionario di dizionari
+
+ for key, val in dataset.items():
+ l = []
+ for i in val:
+ if i == 'None':
+ l.append(None)
+ else:
+ l.append(float(i))
+
+ dataset[key] = l
+
+ return dataset
+
+############################## write to csv ##################################
+
+def write_to_csv (dataset, labels, name):
+ #labels = predict
+ predict = [x+1 for x in labels]
+
+ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+
+ dest = name
+ classe.to_csv(dest, sep = '\t', index = False,
+ header = ['Patient_ID', 'Class'])
+
+########################### trova il massimo in lista ########################
+def max_index (lista):
+ best = -1
+ best_index = 0
+ for i in range(len(lista)):
+ if lista[i] > best:
+ best = lista [i]
+ best_index = i
+
+ return best_index
+
+################################ kmeans #####################################
+
+def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster):
+ if not os.path.exists('clustering'):
+ os.makedirs('clustering')
+
+
+ if elbow == 'true':
+ elbow = True
+ else:
+ elbow = False
+
+ if silhouette == 'true':
+ silhouette = True
+ else:
+ silhouette = False
+
+ if davies == 'true':
+ davies = True
+ else:
+ davies = False
+
+
+ range_n_clusters = [i for i in range(k_min, k_max+1)]
+ distortions = []
+ scores = []
+ all_labels = []
+
+ clusterer = KMeans(n_clusters=1, random_state=10)
+ distortions.append(clusterer.fit(dataset).inertia_)
+
+
+ for n_clusters in range_n_clusters:
+ clusterer = KMeans(n_clusters=n_clusters, random_state=10)
+ cluster_labels = clusterer.fit_predict(dataset)
+
+ all_labels.append(cluster_labels)
+ if n_clusters == 1:
+ silhouette_avg = 0
+ else:
+ silhouette_avg = silhouette_score(dataset, cluster_labels)
+ scores.append(silhouette_avg)
+ distortions.append(clusterer.fit(dataset).inertia_)
+
+ best = max_index(scores) + k_min
+
+ for i in range(len(all_labels)):
+ prefix = ''
+ if (i + k_min == best):
+ prefix = '_BEST'
+
+ write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+
+
+ if (prefix == '_BEST'):
+ labels = all_labels[i]
+ predict = [x+1 for x in labels]
+ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+ classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+
+
+ if davies:
+ with np.errstate(divide='ignore', invalid='ignore'):
+ davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
+ warning("\nFor n_clusters = " + str(i + k_min) +
+ " The average davies bouldin score is: " + str(davies_bouldin))
+
+
+ if silhouette:
+ silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
+
+
+ if elbow:
+ elbow_plot(distortions, k_min,k_max)
+
+
+
+
+
+############################## elbow_plot ####################################
+
+def elbow_plot (distortions, k_min, k_max):
+ plt.figure(0)
+ x = list(range(k_min, k_max + 1))
+ x.insert(0, 1)
+ plt.plot(x, distortions, marker = 'o')
+ plt.xlabel('Number of clusters (k)')
+ plt.ylabel('Distortion')
+ s = 'clustering/elbow_plot.png'
+ fig = plt.gcf()
+ fig.set_size_inches(18.5, 10.5, forward = True)
+ fig.savefig(s, dpi=100)
+
+
+############################## silhouette plot ###############################
+def silihouette_draw(dataset, labels, n_clusters, path):
+ if n_clusters == 1:
+ return None
+
+ silhouette_avg = silhouette_score(dataset, labels)
+ warning("For n_clusters = " + str(n_clusters) +
+ " The average silhouette_score is: " + str(silhouette_avg))
+
+ plt.close('all')
+ # Create a subplot with 1 row and 2 columns
+ fig, (ax1) = plt.subplots(1, 1)
+
+ fig.set_size_inches(18, 7)
+
+ # The 1st subplot is the silhouette plot
+ # The silhouette coefficient can range from -1, 1 but in this example all
+ # lie within [-0.1, 1]
+ ax1.set_xlim([-1, 1])
+ # The (n_clusters+1)*10 is for inserting blank space between silhouette
+ # plots of individual clusters, to demarcate them clearly.
+ ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
+
+ # Compute the silhouette scores for each sample
+ sample_silhouette_values = silhouette_samples(dataset, labels)
+
+ y_lower = 10
+ for i in range(n_clusters):
+ # Aggregate the silhouette scores for samples belonging to
+ # cluster i, and sort them
+ ith_cluster_silhouette_values = \
+ sample_silhouette_values[labels == i]
+
+ ith_cluster_silhouette_values.sort()
+
+ size_cluster_i = ith_cluster_silhouette_values.shape[0]
+ y_upper = y_lower + size_cluster_i
+
+ color = cm.nipy_spectral(float(i) / n_clusters)
+ ax1.fill_betweenx(np.arange(y_lower, y_upper),
+ 0, ith_cluster_silhouette_values,
+ facecolor=color, edgecolor=color, alpha=0.7)
+
+ # Label the silhouette plots with their cluster numbers at the middle
+ ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+ # Compute the new y_lower for next plot
+ y_lower = y_upper + 10 # 10 for the 0 samples
+
+ ax1.set_title("The silhouette plot for the various clusters.")
+ ax1.set_xlabel("The silhouette coefficient values")
+ ax1.set_ylabel("Cluster label")
+
+ # The vertical line for average silhouette score of all the values
+ ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+ ax1.set_yticks([]) # Clear the yaxis labels / ticks
+ ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+
+ plt.suptitle(("Silhouette analysis for clustering on sample data "
+ "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
+
+
+ plt.savefig(path, bbox_inches='tight')
+
+######################## dbscan ##############################################
+
+def dbscan(dataset, eps, min_samples):
+ if not os.path.exists('clustering'):
+ os.makedirs('clustering')
+
+ if eps is not None:
+ clusterer = DBSCAN(eps = eps, min_samples = min_samples)
+ else:
+ clusterer = DBSCAN()
+
+ clustering = clusterer.fit(dataset)
+
+ core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
+ core_samples_mask[clustering.core_sample_indices_] = True
+ labels = clustering.labels_
+
+ # Number of clusters in labels, ignoring noise if present.
+ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+
+
+ ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
+
+
+ write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
+
+########################## hierachical #######################################
+
+def hierachical_agglomerative(dataset, k_min, k_max):
+
+ if not os.path.exists('clustering'):
+ os.makedirs('clustering')
+
+ plt.figure(figsize=(10, 7))
+ plt.title("Customer Dendograms")
+ shc.dendrogram(shc.linkage(dataset, method='ward'))
+ fig = plt.gcf()
+ fig.savefig('clustering/dendogram.png', dpi=200)
+
+ range_n_clusters = [i for i in range(k_min, k_max+1)]
+
+ for n_clusters in range_n_clusters:
+
+ cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
+ cluster.fit_predict(dataset)
+ cluster_labels = cluster.labels_
+
+ silhouette_avg = silhouette_score(dataset, cluster_labels)
+ write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+ #warning("For n_clusters =", n_clusters,
+ #"The average silhouette_score is :", silhouette_avg)
+
+
+
+
+
+############################# main ###########################################
+
+
+def main():
+ if not os.path.exists('clustering'):
+ os.makedirs('clustering')
+
+ args = process_args(sys.argv)
+
+ #Data read
+
+ X = read_dataset(args.input)
+ X = pd.DataFrame.to_dict(X, orient='list')
+ X = rewrite_input(X)
+ X = pd.DataFrame.from_dict(X, orient = 'index')
+
+ for i in X.columns:
+ tmp = X[i][0]
+ if tmp == None:
+ X = X.drop(columns=[i])
+
+
+ if args.cluster_type == 'kmeans':
+ kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
+
+ if args.cluster_type == 'dbscan':
+ dbscan(X, args.eps, args.min_samples)
+
+ if args.cluster_type == 'hierarchy':
+ hierachical_agglomerative(X, args.k_min, args.k_max)
+
+##############################################################################
+
+if __name__ == "__main__":
+ main()
diff -r 9fcb0e8d6d47 -r e88efefbd015 Desktop/Marea/marea_cluster.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Desktop/Marea/marea_cluster.xml Tue Oct 15 12:21:16 2019 -0400
@@ -0,0 +1,95 @@
+
+
+
+ marea_macros.xml
+
+
+ pandas
+ scipy
+ cobra
+ scikit-learn
+ matplotlib
+ numpy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 9fcb0e8d6d47 -r e88efefbd015 Desktop/Marea/marea_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Desktop/Marea/marea_macros.xml Tue Oct 15 12:21:16 2019 -0400
@@ -0,0 +1,92 @@
+
+
+
+
+
+
+
+
+
+
+
+
++--------------------+-------------------------------+
+| id | rule (with entrez-id) |
++====================+===============================+
+| SHMT1 | 155060 or 10357 |
++--------------------+-------------------------------+
+| NIT2 | 155060 or 100134869 |
++--------------------+-------------------------------+
+| GOT1_GOT2_GOT1L1_2 | 155060 and 100134869 or 10357 |
++--------------------+-------------------------------+
+
+|
+
+
+
+
+
++------------+------------+------------+------------+
+| Hugo_ID | TCGAA62670 | TCGAA62671 | TCGAA62672 |
++============+============+============+============+
+| HGNC:24086 | 0.523167 | 0.371355 | 0.925661 |
++------------+------------+------------+------------+
+| HGNC:24086 | 0.568765 | 0.765567 | 0.456789 |
++------------+------------+------------+------------+
+| HGNC:9876 | 0.876545 | 0.768933 | 0.987654 |
++------------+------------+------------+------------+
+| HGNC:9 | 0.456788 | 0.876543 | 0.876542 |
++------------+------------+------------+------------+
+| HGNC:23 | 0.876543 | 0.786543 | 0.897654 |
++------------+------------+------------+------------+
+
+|
+
+
+
+
+
++-------------+------------+------------+------------+
+| Hugo_Symbol | TCGAA62670 | TCGAA62671 | TCGAA62672 |
++=============+============+============+============+
+| A1BG | 0.523167 | 0.371355 | 0.925661 |
++-------------+------------+------------+------------+
+| A1CF | 0.568765 | 0.765567 | 0.456789 |
++-------------+------------+------------+------------+
+| A2M | 0.876545 | 0.768933 | 0.987654 |
++-------------+------------+------------+------------+
+| A4GALT | 0.456788 | 0.876543 | 0.876542 |
++-------------+------------+------------+------------+
+| M664Y65 | 0.876543 | 0.786543 | 0.897654 |
++-------------+------------+------------+------------+
+
+|
+
+
+
+
+
+This tool is developed by the `BIMIB`_ at the `Department of Informatics, Systems and Communications`_ of `University of Milan - Bicocca`_.
+
+.. _BIMIB: http://sito di bio.org
+.. _Department of Informatics, Systems and Communications: http://www.disco.unimib.it/go/Home/English
+.. _University of Milan - Bicocca: https://www.unimib.it/
+
+
+
+
+
+
+@online{lh32017,
+ author = {Alex Graudenzi, Davide Maspero, Cluadio Isella, Marzia Di Filippo, Giancarlo Mauri, Enzo Medico, Marco Antoniotti, Chiara Damiani},
+ year = {2018},
+ title = {MaREA: Metabolic feature extraction, enrichment and visualization of RNAseq},
+ publisher = {bioRxiv},
+ journal = {bioRxiv},
+ url = {https://www.biorxiv.org/content/early/2018/01/16/248724},
+}
+
+
+
+
+