# HG changeset patch
# User bimib
# Date 1541492181 18000
# Node ID 23ac9cf127885e6919d958c2847c10723f610e23
Uploaded
diff -r 000000000000 -r 23ac9cf12788 Marea/local/HMRcoreMap.svg
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/local/HMRcoreMap.svg Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,7702 @@
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 23ac9cf12788 Marea/local/HMRcore_genes.p
Binary file Marea/local/HMRcore_genes.p has changed
diff -r 000000000000 -r 23ac9cf12788 Marea/local/HMRcore_rules.p
Binary file Marea/local/HMRcore_rules.p has changed
diff -r 000000000000 -r 23ac9cf12788 Marea/local/Recon_genes.p
Binary file Marea/local/Recon_genes.p has changed
diff -r 000000000000 -r 23ac9cf12788 Marea/local/Recon_rules.p
Binary file Marea/local/Recon_rules.p has changed
diff -r 000000000000 -r 23ac9cf12788 Marea/marea.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/marea.py Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,760 @@
+
+from __future__ import division
+import sys
+import pandas as pd
+import itertools as it
+import scipy.stats as st
+import collections
+import lxml.etree as ET
+import pickle as pk
+import math
+import os
+import argparse
+from svglib.svglib import svg2rlg
+from reportlab.graphics import renderPDF
+
+########################## argparse ###########################################
+
+def process_args(args):
+ parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
+ description = 'process some value\'s'+
+ ' genes to create a comparison\'s map.')
+ parser.add_argument('-rs', '--rules_selector',
+ type = str,
+ default = 'HMRcore',
+ choices = ['HMRcore', 'Recon', 'Custom'],
+ help = 'chose which type of dataset you want use')
+ parser.add_argument('-cr', '--custom',
+ type = str,
+ help='your dataset if you want custom rules')
+ parser.add_argument('-na', '--names',
+ type = str,
+ nargs = '+',
+ help = 'input names')
+ parser.add_argument('-n', '--none',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'compute Nan values')
+ parser.add_argument('-pv' ,'--pValue',
+ type = float,
+ default = 0.05,
+ help = 'P-Value threshold (default: %(default)s)')
+ parser.add_argument('-fc', '--fChange',
+ type = float,
+ default = 1.5,
+ help = 'Fold-Change threshold (default: %(default)s)')
+ parser.add_argument('-td', '--tool_dir',
+ type = str,
+ required = True,
+ help = 'your tool directory')
+ parser.add_argument('-op', '--option',
+ type = str,
+ choices = ['datasets', 'dataset_class'],
+ help='dataset or dataset and class')
+ parser.add_argument('-ol', '--out_log',
+ help = "Output log")
+ parser.add_argument('-ids', '--input_datas',
+ type = str,
+ nargs = '+',
+ help = 'input datasets')
+ parser.add_argument('-id', '--input_data',
+ type = str,
+ help = 'input dataset')
+ parser.add_argument('-ic', '--input_class',
+ type = str,
+ help = 'sample group specification')
+ parser.add_argument('-cm', '--custom_map',
+ type = str,
+ help = 'custom map')
+ parser.add_argument('-yn', '--yes_no',
+ type = str,
+ choices = ['yes', 'no'],
+ help = 'if make or not custom map')
+ args = parser.parse_args()
+ return args
+
+########################### warning ###########################################
+
+def warning(s):
+ args = process_args(sys.argv)
+ with open(args.out_log, 'a') as log:
+ log.write(s)
+
+############################ dataset input ####################################
+
+def read_dataset(data, name):
+ try:
+ dataset = pd.read_csv(data, sep = '\t', header = 0)
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of ' + name + '\n')
+ if len(dataset.columns) < 2:
+ sys.exit('Execution aborted: wrong format of ' + name + '\n')
+ return dataset
+
+############################ dataset name #####################################
+
+def name_dataset(name_data, count):
+ if str(name_data) == 'Dataset':
+ return str(name_data) + '_' + str(count)
+ else:
+ return str(name_data)
+
+############################ load id e rules ##################################
+
+def load_id_rules(reactions):
+ ids, rules = [], []
+ for key, value in reactions.items():
+ ids.append(key)
+ rules.append(value)
+ return (ids, rules)
+
+############################ check_methods ####################################
+
+def gene_type(l, name):
+ if check_hgnc(l):
+ return 'hugo_id'
+ elif check_ensembl(l):
+ return 'ensembl_gene_id'
+ elif check_symbol(l):
+ return 'symbol'
+ elif check_entrez(l):
+ return 'entrez_id'
+ else:
+ sys.exit('Execution aborted:\n' +
+ 'gene ID type in ' + name + ' not supported. Supported ID'+
+ 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
+
+def check_hgnc(l):
+ if len(l) > 5:
+ if (l.upper()).startswith('HGNC:'):
+ return l[5:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_ensembl(l):
+ if len(l) == 15:
+ if (l.upper()).startswith('ENS'):
+ return l[4:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_symbol(l):
+ if len(l) > 0:
+ if l[0].isalpha() and l[1:].isalnum():
+ return True
+ else:
+ return False
+ else:
+ return False
+
+def check_entrez(l):
+ if len(l) > 0:
+ return l.isdigit()
+ else:
+ return False
+
+def check_bool(b):
+ if b == 'true':
+ return True
+ elif b == 'false':
+ return False
+
+############################ resolve_methods ##################################
+
+def replace_gene_value(l, d):
+ tmp = []
+ err = []
+ while l:
+ if isinstance(l[0], list):
+ tmp_rules, tmp_err = replace_gene_value(l[0], d)
+ tmp.append(tmp_rules)
+ err.extend(tmp_err)
+ else:
+ value = replace_gene(l[0], d)
+ tmp.append(value)
+ if value == None:
+ err.append(l[0])
+ l = l[1:]
+ return (tmp, err)
+
+def replace_gene(l, d):
+ if l =='and' or l == 'or':
+ return l
+ else:
+ value = d.get(l, None)
+ if not(value == None or isinstance(value, (int, float))):
+ sys.exit('Execution aborted: ' + value + ' value not valid\n')
+ return value
+
+def computes(val1, op, val2, cn):
+ if val1 != None and val2 != None:
+ if op == 'and':
+ return min(val1, val2)
+ else:
+ return val1 + val2
+ elif op == 'and':
+ if cn is True:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+ else:
+ return None
+ else:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+
+def control(ris, l, cn):
+ if len(l) == 1:
+ if isinstance(l[0], (float, int)) or l[0] == None:
+ return l[0]
+ elif isinstance(l[0], list):
+ return control(None, l[0], cn)
+ else:
+ return False
+ elif len(l) > 2:
+ return control_list(ris, l, cn)
+ else:
+ return False
+
+def control_list(ris, l, cn):
+ while l:
+ if len(l) == 1:
+ return False
+ elif (isinstance(l[0], (float, int)) or
+ l[0] == None) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ ris = computes(l[0], l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[2], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(l[0], l[1], tmp, cn)
+ else:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], (float, int)) or l[1] == None:
+ ris = computes(ris, l[0], l[1], cn)
+ elif isinstance(l[1], list):
+ tmp = control(None,l[1], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(ris, l[0], tmp, cn)
+ else:
+ return False
+ l = l[2:]
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ tmp = control(None, l[0], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = computes(tmp, l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[0], cn)
+ tmp2 = control(None, l[2], cn)
+ if tmp is False or tmp2 is False:
+ return False
+ else:
+ ris = computes(tmp, l[1], tmp2, cn)
+ else:
+ return False
+ l = l[3:]
+ else:
+ return False
+ return ris
+
+############################ map_methods ######################################
+
+def fold_change(avg1, avg2):
+ if avg1 == 0 and avg2 == 0:
+ return 0
+ elif avg1 == 0:
+ return '-INF'
+ elif avg2 == 0:
+ return 'INF'
+ else:
+ return math.log(avg1 / avg2, 2)
+
+def fix_style(l, col, width, dash):
+ tmp = l.split(';')
+ flag_col = False
+ flag_width = False
+ flag_dash = False
+ for i in range(len(tmp)):
+ if tmp[i].startswith('stroke:'):
+ tmp[i] = 'stroke:' + col
+ flag_col = True
+ if tmp[i].startswith('stroke-width:'):
+ tmp[i] = 'stroke-width:' + width
+ flag_width = True
+ if tmp[i].startswith('stroke-dasharray:'):
+ tmp[i] = 'stroke-dasharray:' + dash
+ flag_dash = True
+ if not flag_col:
+ tmp.append('stroke:' + col)
+ if not flag_width:
+ tmp.append('stroke-width:' + width)
+ if not flag_dash:
+ tmp.append('stroke-dasharray:' + dash)
+ return ';'.join(tmp)
+
+def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
+ maxT = 12
+ minT = 2
+ grey = '#BEBEBE'
+ blue = '#0000FF'
+ red = '#E41A1C'
+ for el in core_map.iter():
+ el_id = str(el.get('id'))
+ if el_id.startswith('R_'):
+ tmp = d.get(el_id[2:])
+ if tmp != None:
+ p_val = tmp[0]
+ f_c = tmp[1]
+ if p_val < threshold_P_V:
+ if not isinstance(f_c, str):
+ if abs(f_c) < math.log(threshold_F_C, 2):
+ col = grey
+ width = str(minT)
+ else:
+ if f_c < 0:
+ col = blue
+ elif f_c > 0:
+ col = red
+ width = str(max((abs(f_c) * maxT) / max_F_C, minT))
+ else:
+ if f_c == '-INF':
+ col = blue
+ elif f_c == 'INF':
+ col = red
+ width = str(maxT)
+ dash = 'none'
+ else:
+ dash = '5,5'
+ col = grey
+ width = str(minT)
+ el.set('style', fix_style(el.get('style'), col, width, dash))
+ return core_map
+
+############################ make recon #######################################
+
+def check_and_doWord(l):
+ tmp = []
+ tmp_genes = []
+ count = 0
+ while l:
+ if count >= 0:
+ if l[0] == '(':
+ count += 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ')':
+ count -= 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ' ':
+ l.pop(0)
+ else:
+ word = []
+ while l:
+ if l[0] in [' ', '(', ')']:
+ break
+ else:
+ word.append(l[0])
+ l.pop(0)
+ word = ''.join(word)
+ tmp.append(word)
+ if not(word in ['or', 'and']):
+ tmp_genes.append(word)
+ else:
+ return False
+ if count == 0:
+ return (tmp, tmp_genes)
+ else:
+ return False
+
+def brackets_to_list(l):
+ tmp = []
+ while l:
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ return tmp
+
+def resolve_brackets(l):
+ tmp = []
+ while l[0] != ')':
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ l.pop(0)
+ return tmp
+
+def priorityAND(l):
+ tmp = []
+ flag = True
+ while l:
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ l = l[1:]
+ elif l[0] == 'or':
+ tmp.append(l[0])
+ flag = False
+ l = l[1:]
+ elif l[1] == 'or':
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ tmp.append(l[1])
+ flag = False
+ l = l[2:]
+ elif l[1] == 'and':
+ tmpAnd = []
+ if isinstance(l[0], list):
+ tmpAnd.append(priorityAND(l[0]))
+ else:
+ tmpAnd.append(l[0])
+ tmpAnd.append(l[1])
+ if isinstance(l[2], list):
+ tmpAnd.append(priorityAND(l[2]))
+ else:
+ tmpAnd.append(l[2])
+ l = l[3:]
+ while l:
+ if l[0] == 'and':
+ tmpAnd.append(l[0])
+ if isinstance(l[1], list):
+ tmpAnd.append(priorityAND(l[1]))
+ else:
+ tmpAnd.append(l[1])
+ l = l[2:]
+ elif l[0] == 'or':
+ flag = False
+ break
+ if flag == True: #se ci sono solo AND nella lista
+ tmp.extend(tmpAnd)
+ elif flag == False:
+ tmp.append(tmpAnd)
+ return tmp
+
+def checkRule(l):
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ if checkRule(l[0]) is False:
+ return False
+ elif len(l) > 2:
+ if checkRule2(l) is False:
+ return False
+ else:
+ return False
+ return True
+
+def checkRule2(l):
+ while l:
+ if len(l) == 1:
+ return False
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if checkRule(l[0]) is False:
+ return False
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[1] in ['and', 'or']:
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], list):
+ if checkRule(l[1]) is False:
+ return False
+ l = l[2:]
+ else:
+ return False
+ return True
+
+def do_rules(rules):
+ split_rules = []
+ err_rules = []
+ tmp_gene_in_rule = []
+ for i in range(len(rules)):
+ tmp = list(rules[i])
+ if tmp:
+ tmp, tmp_genes = check_and_doWord(tmp)
+ tmp_gene_in_rule.extend(tmp_genes)
+ if tmp is False:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ tmp = brackets_to_list(tmp)
+ if checkRule(tmp):
+ split_rules.append(priorityAND(tmp))
+ else:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ split_rules.append([])
+ if err_rules:
+ warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
+ return (split_rules, list(set(tmp_gene_in_rule)))
+
+def make_recon(data):
+ try:
+ import cobra as cb
+ import warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+ recon = cb.io.read_sbml_model(data)
+ react = recon.reactions
+ rules = [react[i].gene_reaction_rule for i in range(len(react))]
+ ids = [react[i].id for i in range(len(react))]
+ except cb.io.sbml3.CobraSBMLError:
+ try:
+ data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('')
+ if len(data.columns) < 2:
+ sys.exit('Execution aborted: wrong format of '+
+ 'custom datarules\n')
+ if not len(data.columns) == 2:
+ warning('Warning: more than 2 columns in custom datarules.\n' +
+ 'Extra columns have been disregarded\n')
+ ids = list(data.iloc[:, 0])
+ rules = list(data.iloc[:, 1])
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of custom datarules\n')
+ except pd.errors.ParserError:
+ sys.exit('Execution aborted: wrong format of custom datarules\n')
+ split_rules, tmp_genes = do_rules(rules)
+ gene_in_rule = {}
+ for i in tmp_genes:
+ gene_in_rule[i] = 'ok'
+ return (ids, split_rules, gene_in_rule)
+
+############################ gene #############################################
+
+def data_gene(gene, type_gene, name, gene_custom):
+ args = process_args(sys.argv)
+ for i in range(len(gene)):
+ tmp = gene.iloc[i, 0]
+ if tmp.startswith(' ') or tmp.endswith(' '):
+ gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
+ gene_dup = [item for item, count in
+ collections.Counter(gene[gene.columns[0]]).items() if count > 1]
+ pat_dup = [item for item, count in
+ collections.Counter(list(gene.columns)).items() if count > 1]
+ if gene_dup:
+ if gene_custom == None:
+ if args.rules_selector == 'HMRcore':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/HMRcore_genes.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/Recon_genes.p', 'rb'))
+ gene_in_rule = gene_in_rule.get(type_gene)
+ else:
+ gene_in_rule = gene_custom
+ tmp = []
+ for i in gene_dup:
+ if gene_in_rule.get(i) == 'ok':
+ tmp.append(i)
+ if tmp:
+ sys.exit('Execution aborted because gene ID '
+ +str(tmp)+' in '+name+' is duplicated\n')
+ if pat_dup:
+ warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name +
+ '\n')
+ return (gene.set_index(gene.columns[0])).to_dict()
+
+############################ resolve ##########################################
+
+def resolve(genes, rules, ids, resolve_none, name):
+ resolve_rules = {}
+ not_found = []
+ flag = False
+ for key, value in genes.items():
+ tmp_resolve = []
+ for i in range(len(rules)):
+ tmp = rules[i]
+ if tmp:
+ tmp, err = replace_gene_value(tmp, value)
+ if err:
+ not_found.extend(err)
+ ris = control(None, tmp, resolve_none)
+ if ris is False or ris == None:
+ tmp_resolve.append(None)
+ else:
+ tmp_resolve.append(ris)
+ flag = True
+ else:
+ tmp_resolve.append(None)
+ resolve_rules[key] = tmp_resolve
+ if flag is False:
+ warning('Warning: no computable score (due to missing gene values)' +
+ 'for class ' + name + ', the class has been disregarded\n')
+ return (None, None)
+ return (resolve_rules, list(set(not_found)))
+
+############################ split class ######################################
+
+def split_class(classes, resolve_rules):
+ class_pat = {}
+ for i in range(len(classes)):
+ classe = classes.iloc[i, 1]
+ if not pd.isnull(classe):
+ l = []
+ for j in range(i, len(classes)):
+ if classes.iloc[j, 1] == classe:
+ pat_id = classes.iloc[j, 0]
+ tmp = resolve_rules.get(pat_id, None)
+ if tmp != None:
+ l.append(tmp)
+ classes.iloc[j, 1] = None
+ if l:
+ class_pat[classe] = list(map(list, zip(*l)))
+ else:
+ warning('Warning: no sample found in class ' + classe +
+ ', the class has been disregarded\n')
+ return class_pat
+
+############################ map ##############################################
+
+def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C):
+ args = process_args(sys.argv)
+ if (not class_pat) or (len(class_pat.keys()) < 2):
+ sys.exit('Execution aborted: classes provided for comparisons are ' +
+ 'less than two\n')
+ for i, j in it.combinations(class_pat.keys(), 2):
+ tmp = {}
+ count = 0
+ max_F_C = 0
+ for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
+ try:
+ stat_D, p_value = st.ks_2samp(l1, l2)
+ avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
+ if not isinstance(avg, str):
+ if max_F_C < abs(avg):
+ max_F_C = abs(avg)
+ tmp[ids[count]] = [float(p_value), avg]
+ count += 1
+ except (TypeError, ZeroDivisionError):
+ count += 1
+ tab = 'table_out/' + i + '_vs_' + j + '.tsv'
+ tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
+ tmp_csv = tmp_csv.reset_index()
+ header = ['ids', 'P_Value', 'Average']
+ tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+ if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+ and args.yes_no == 'yes'):
+ fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+ file_svg = 'map_svg/' + i + '_vs_' + j + '.svg'
+ with open(file_svg, 'wb') as new_map:
+ new_map.write(ET.tostring(core_map, encoding='UTF-8',
+ method='xml'))
+ file_pdf = 'map_pdf/' + i + '_vs_' + j + '.pdf'
+ renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+ return None
+
+############################ MAIN #############################################
+
+def main():
+ args = process_args(sys.argv)
+ os.makedirs('table_out')
+ if args.rules_selector == 'HMRcore':
+ os.makedirs('map_svg')
+ os.makedirs('map_pdf')
+ recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
+ elif args.rules_selector == 'Custom':
+ ids, rules, gene_in_rule = make_recon(args.custom)
+ resolve_none = check_bool(args.none)
+ class_pat = {}
+ if args.option == 'datasets':
+ num = 1
+ #if len(args.names) != len(set(args.names)):
+ # sys.exit('Execution aborted: datasets name duplicated')
+ for i, j in zip(args.input_datas, args.names):
+ name = name_dataset(j, num)
+ dataset = read_dataset(i, name)
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+ if err != None and err:
+ warning('Warning: gene\n' + str(err) + '\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+ if resolve_rules != None:
+ class_pat[name] = list(map(list, zip(*resolve_rules.values())))
+ num += 1
+ elif args.option == 'dataset_class':
+ name = 'RNAseq'
+ dataset = read_dataset(args.input_data, name)
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+ classes = read_dataset(args.input_class, 'class')
+ if not len(classes.columns) == 2:
+ warning('Warning: more than 2 columns in class file. Extra' +
+ 'columns have been disregarded\n')
+ classes = classes.astype(str)
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+ if err != None and err:
+ warning('Warning: gene\n'+str(err)+'\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+ if resolve_rules != None:
+ class_pat = split_class(classes, resolve_rules)
+ if args.rules_selector == 'Custom':
+ if args.yes_no == 'yes':
+ os.makedirs('map_svg')
+ os.makedirs('map_pdf')
+ try:
+ core_map = ET.parse(args.custom_map)
+ except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
+ sys.exit('Execution aborted: custom map in wrong format')
+ elif args.yes_no == 'no':
+ core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
+ else:
+ core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
+ maps(core_map, class_pat, ids, args.pValue, args.fChange)
+ warning('Execution succeeded')
+ return None
+
+###############################################################################
+
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r 23ac9cf12788 Marea/marea.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/marea.xml Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,240 @@
+
+ for Galaxy
+
+ pandas
+ scipy
+ lxml
+ svglib
+ reportlab
+ cobrapy
+ python-libsbml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cond_rule['rules_selector'] == 'HMRcore') or ((cond_rule['rules_selector'] == 'Custom') and (cond_rule['cond_map']['yes_no'] == 'yes'))
+
+
+
+ (cond_rule['rules_selector'] == 'HMRcore') or ((cond_rule['rules_selector'] == 'Custom') and (cond_rule['cond_map']['yes_no'] == 'yes'))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 23ac9cf12788 Marea/marea_cluster.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/marea_cluster.py Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,608 @@
+
+from __future__ import division
+import os
+import sys
+import pandas as pd
+import collections
+import pickle as pk
+import argparse
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+
+########################## argparse ###########################################
+
+def process_args(args):
+ parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
+ description = 'process some value\'s' +
+ ' genes to create class.')
+ parser.add_argument('-rs', '--rules_selector',
+ type = str,
+ default = 'HMRcore',
+ choices = ['HMRcore', 'Recon', 'Custom'],
+ help = 'chose which type of dataset you want use')
+ parser.add_argument('-cr', '--custom',
+ type = str,
+ help='your dataset if you want custom rules')
+ parser.add_argument('-ch', '--cond_hier',
+ type = str,
+ default = 'no',
+ choices = ['no', 'yes'],
+ help = 'chose if you wanna hierical dendrogram')
+ parser.add_argument('-lk', '--k_min',
+ type = int,
+ help = 'min number of cluster')
+ parser.add_argument('-uk', '--k_max',
+ type = int,
+ help = 'max number of cluster')
+ parser.add_argument('-li', '--linkage',
+ type = str,
+ choices = ['single', 'complete', 'average'],
+ help='linkage hierarchical cluster')
+ parser.add_argument('-d', '--data',
+ type = str,
+ required = True,
+ help = 'input dataset')
+ parser.add_argument('-n', '--none',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'compute Nan values')
+ parser.add_argument('-td', '--tool_dir',
+ type = str,
+ required = True,
+ help = 'your tool directory')
+ parser.add_argument('-na', '--name',
+ type = str,
+ help = 'name of dataset')
+ parser.add_argument('-de', '--dendro',
+ help = "Dendrogram out")
+ parser.add_argument('-ol', '--out_log',
+ help = "Output log")
+ parser.add_argument('-el', '--elbow',
+ help = "Out elbow")
+ args = parser.parse_args()
+ return args
+
+########################### warning ###########################################
+
+def warning(s):
+ args = process_args(sys.argv)
+ with open(args.out_log, 'a') as log:
+ log.write(s)
+
+############################ dataset input ####################################
+
+def read_dataset(data, name):
+ try:
+ dataset = pd.read_csv(data, sep = '\t', header = 0)
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of '+name+'\n')
+ if len(dataset.columns) < 2:
+ sys.exit('Execution aborted: wrong format of '+name+'\n')
+ return dataset
+
+############################ dataset name #####################################
+
+def name_dataset(name_data, count):
+ if str(name_data) == 'Dataset':
+ return str(name_data) + '_' + str(count)
+ else:
+ return str(name_data)
+
+############################ load id e rules ##################################
+
+def load_id_rules(reactions):
+ ids, rules = [], []
+ for key, value in reactions.items():
+ ids.append(key)
+ rules.append(value)
+ return (ids, rules)
+
+############################ check_methods ####################################
+
+def gene_type(l, name):
+ if check_hgnc(l):
+ return 'hugo_id'
+ elif check_ensembl(l):
+ return 'ensembl_gene_id'
+ elif check_symbol(l):
+ return 'symbol'
+ elif check_entrez(l):
+ return 'entrez_id'
+ else:
+ sys.exit('Execution aborted:\n' +
+ 'gene ID type in ' + name + ' not supported. Supported ID' +
+ 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
+
+def check_hgnc(l):
+ if len(l) > 5:
+ if (l.upper()).startswith('HGNC:'):
+ return l[5:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_ensembl(l):
+ if len(l) == 15:
+ if (l.upper()).startswith('ENS'):
+ return l[4:].isdigit()
+ else:
+ return False
+ else:
+ return False
+
+def check_symbol(l):
+ if len(l) > 0:
+ if l[0].isalpha() and l[1:].isalnum():
+ return True
+ else:
+ return False
+ else:
+ return False
+
+def check_entrez(l):
+ if len(l) > 0:
+ return l.isdigit()
+ else:
+ return False
+
+def check_bool(b):
+ if b == 'true':
+ return True
+ elif b == 'false':
+ return False
+
+############################ make recon #######################################
+
+def check_and_doWord(l):
+ tmp = []
+ tmp_genes = []
+ count = 0
+ while l:
+ if count >= 0:
+ if l[0] == '(':
+ count += 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ')':
+ count -= 1
+ tmp.append(l[0])
+ l.pop(0)
+ elif l[0] == ' ':
+ l.pop(0)
+ else:
+ word = []
+ while l:
+ if l[0] in [' ', '(', ')']:
+ break
+ else:
+ word.append(l[0])
+ l.pop(0)
+ word = ''.join(word)
+ tmp.append(word)
+ if not(word in ['or', 'and']):
+ tmp_genes.append(word)
+ else:
+ return False
+ if count == 0:
+ return (tmp, tmp_genes)
+ else:
+ return False
+
+def brackets_to_list(l):
+ tmp = []
+ while l:
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ return tmp
+
+def resolve_brackets(l):
+ tmp = []
+ while l[0] != ')':
+ if l[0] == '(':
+ l.pop(0)
+ tmp.append(resolve_brackets(l))
+ else:
+ tmp.append(l[0])
+ l.pop(0)
+ l.pop(0)
+ return tmp
+
+def priorityAND(l):
+ tmp = []
+ flag = True
+ while l:
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ l = l[1:]
+ elif l[0] == 'or':
+ tmp.append(l[0])
+ flag = False
+ l = l[1:]
+ elif l[1] == 'or':
+ if isinstance(l[0], list):
+ tmp.append(priorityAND(l[0]))
+ else:
+ tmp.append(l[0])
+ tmp.append(l[1])
+ flag = False
+ l = l[2:]
+ elif l[1] == 'and':
+ tmpAnd = []
+ if isinstance(l[0], list):
+ tmpAnd.append(priorityAND(l[0]))
+ else:
+ tmpAnd.append(l[0])
+ tmpAnd.append(l[1])
+ if isinstance(l[2], list):
+ tmpAnd.append(priorityAND(l[2]))
+ else:
+ tmpAnd.append(l[2])
+ l = l[3:]
+ while l:
+ if l[0] == 'and':
+ tmpAnd.append(l[0])
+ if isinstance(l[1], list):
+ tmpAnd.append(priorityAND(l[1]))
+ else:
+ tmpAnd.append(l[1])
+ l = l[2:]
+ elif l[0] == 'or':
+ flag = False
+ break
+ if flag == True: #se ci sono solo AND nella lista
+ tmp.extend(tmpAnd)
+ elif flag == False:
+ tmp.append(tmpAnd)
+ return tmp
+
+def checkRule(l):
+ if len(l) == 1:
+ if isinstance(l[0], list):
+ if checkRule(l[0]) is False:
+ return False
+ elif len(l) > 2:
+ if checkRule2(l) is False:
+ return False
+ else:
+ return False
+ return True
+
+def checkRule2(l):
+ while l:
+ if len(l) == 1:
+ return False
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if checkRule(l[0]) is False:
+ return False
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[1] in ['and', 'or']:
+ if isinstance(l[2], list):
+ if checkRule(l[2]) is False:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], list):
+ if checkRule(l[1]) is False:
+ return False
+ l = l[2:]
+ else:
+ return False
+ return True
+
+def do_rules(rules):
+ split_rules = []
+ err_rules = []
+ tmp_gene_in_rule = []
+ for i in range(len(rules)):
+ tmp = list(rules[i])
+ if tmp:
+ tmp, tmp_genes = check_and_doWord(tmp)
+ tmp_gene_in_rule.extend(tmp_genes)
+ if tmp is False:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ tmp = brackets_to_list(tmp)
+ if checkRule(tmp):
+ split_rules.append(priorityAND(tmp))
+ else:
+ split_rules.append([])
+ err_rules.append(rules[i])
+ else:
+ split_rules.append([])
+ if err_rules:
+ warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
+ return (split_rules, list(set(tmp_gene_in_rule)))
+
+def make_recon(data):
+ try:
+ import cobra as cb
+ import warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+ recon = cb.io.read_sbml_model(data)
+ react = recon.reactions
+ rules = [react[i].gene_reaction_rule for i in range(len(react))]
+ ids = [react[i].id for i in range(len(react))]
+ except cb.io.sbml3.CobraSBMLError:
+ try:
+ data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('')
+ if len(data.columns) < 2:
+ sys.exit('Execution aborted: wrong format of ' +
+ 'custom GPR rules\n')
+ if not len(data.columns) == 2:
+ warning('WARNING: more than 2 columns in custom GPR rules.\n' +
+ 'Extra columns have been disregarded\n')
+ ids = list(data.iloc[:, 0])
+ rules = list(data.iloc[:, 1])
+ except pd.errors.EmptyDataError:
+ sys.exit('Execution aborted: wrong format of custom GPR rules\n')
+ except pd.errors.ParserError:
+ sys.exit('Execution aborted: wrong format of custom GPR rules\n')
+ split_rules, tmp_genes = do_rules(rules)
+ gene_in_rule = {}
+ for i in tmp_genes:
+ gene_in_rule[i] = 'ok'
+ return (ids, split_rules, gene_in_rule)
+
+############################ resolve_methods ##################################
+
+def replace_gene_value(l, d):
+ tmp = []
+ err = []
+ while l:
+ if isinstance(l[0], list):
+ tmp_rules, tmp_err = replace_gene_value(l[0], d)
+ tmp.append(tmp_rules)
+ err.extend(tmp_err)
+ else:
+ value = replace_gene(l[0],d)
+ tmp.append(value)
+ if value == None:
+ err.append(l[0])
+ l = l[1:]
+ return (tmp, err)
+
+def replace_gene(l, d):
+ if l =='and' or l == 'or':
+ return l
+ else:
+ value = d.get(l, None)
+ if not(value == None or isinstance(value, (int, float))):
+ sys.exit('Execution aborted: ' + value + ' value not valid\n')
+ return value
+
+def compute(val1, op, val2, cn):
+ if val1 != None and val2 != None:
+ if op == 'and':
+ return min(val1, val2)
+ else:
+ return val1 + val2
+ elif op == 'and':
+ if cn is True:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+ else:
+ return None
+ else:
+ if val1 != None:
+ return val1
+ elif val2 != None:
+ return val2
+ else:
+ return None
+
+def control(ris, l, cn):
+ if len(l) == 1:
+ if isinstance(l[0], (float, int)) or l[0] == None:
+ return l[0]
+ elif isinstance(l[0], list):
+ return control(None, l[0], cn)
+ else:
+ return False
+ elif len(l) > 2:
+ return control_list(ris, l, cn)
+ else:
+ return False
+
+def control_list(ris, l, cn):
+ while l:
+ if len(l) == 1:
+ return False
+ elif (isinstance(l[0], (float, int)) or
+ l[0] == None) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ ris = compute(l[0], l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[2], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = compute(l[0], l[1], tmp, cn)
+ else:
+ return False
+ l = l[3:]
+ elif l[0] in ['and', 'or']:
+ if isinstance(l[1], (float, int)) or l[1] == None:
+ ris = compute(ris, l[0], l[1], cn)
+ elif isinstance(l[1], list):
+ tmp = control(None,l[1], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = compute(ris, l[0], tmp, cn)
+ else:
+ return False
+ l = l[2:]
+ elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+ if isinstance(l[2], (float, int)) or l[2] == None:
+ tmp = control(None, l[0], cn)
+ if tmp is False:
+ return False
+ else:
+ ris = compute(tmp, l[1], l[2], cn)
+ elif isinstance(l[2], list):
+ tmp = control(None, l[0], cn)
+ tmp2 = control(None, l[2], cn)
+ if tmp is False or tmp2 is False:
+ return False
+ else:
+ ris = compute(tmp, l[1], tmp2, cn)
+ else:
+ return False
+ l = l[3:]
+ else:
+ return False
+ return ris
+
+############################ gene #############################################
+
+def data_gene(gene, type_gene, name, gene_custom):
+ args = process_args(sys.argv)
+ for i in range(len(gene)):
+ tmp = gene.iloc[i, 0]
+ if tmp.startswith(' ') or tmp.endswith(' '):
+ gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
+ gene_dup = [item for item, count in
+ collections.Counter(gene[gene.columns[0]]).items() if count > 1]
+ pat_dup = [item for item, count in
+ collections.Counter(list(gene.columns)).items() if count > 1]
+ if gene_dup:
+ if gene_custom == None:
+ if args.rules_selector == 'HMRcore':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/HMRcore_genes.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ gene_in_rule = pk.load(open(args.tool_dir +
+ '/local/Recon_genes.p', 'rb'))
+ gene_in_rule = gene_in_rule.get(type_gene)
+ else:
+ gene_in_rule = gene_custom
+ tmp = []
+ for i in gene_dup:
+ if gene_in_rule.get(i) == 'ok':
+ tmp.append(i)
+ if tmp:
+ sys.exit('Execution aborted because gene ID '
+ + str(tmp) + ' in ' + name + ' is duplicated\n')
+ if pat_dup:
+ sys.exit('Execution aborted: duplicated label\n'
+ + str(pat_dup) + 'in ' + name + '\n')
+ return (gene.set_index(gene.columns[0])).to_dict()
+
+############################ resolve ##########################################
+
+def resolve(genes, rules, ids, resolve_none, name):
+ resolve_rules = {}
+ not_found = []
+ flag = False
+ for key, value in genes.items():
+ tmp_resolve = []
+ for i in range(len(rules)):
+ tmp = rules[i]
+ if tmp:
+ tmp, err = replace_gene_value(tmp, value)
+ if err:
+ not_found.extend(err)
+ ris = control(None, tmp, resolve_none)
+ if ris is False or ris == None:
+ tmp_resolve.append(None)
+ else:
+ tmp_resolve.append(ris)
+ flag = True
+ else:
+ tmp_resolve.append(None)
+ resolve_rules[key] = tmp_resolve
+ if flag is False:
+ sys.exit('Execution aborted: no computable score' +
+ ' (due to missing gene values) for class '
+ + name + ', the class has been disregarded\n')
+ return (resolve_rules, list(set(not_found)))
+
+################################# clustering ##################################
+
+def f_cluster(resolve_rules):
+ os.makedirs('cluster_out')
+ args = process_args(sys.argv)
+ cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index')
+ for i in cluster_data.columns:
+ tmp = cluster_data[i][0]
+ if tmp == None:
+ cluster_data = cluster_data.drop(columns=[i])
+ distorsion = []
+ for i in range(args.k_min, args.k_max+1):
+ tmp_kmeans = KMeans(n_clusters = i,
+ n_init = 100,
+ max_iter = 300,
+ random_state = 0).fit(cluster_data)
+ distorsion.append(tmp_kmeans.inertia_)
+ predict = tmp_kmeans.predict(cluster_data)
+ predict = [x+1 for x in predict]
+ classe = (pd.DataFrame(zip(cluster_data.index, predict))).astype(str)
+ dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv'
+ classe.to_csv(dest, sep = '\t', index = False,
+ header = ['Patient_ID', 'Class'])
+ plt.figure(0)
+ plt.plot(range(args.k_min, args.k_max+1), distorsion, marker = 'o')
+ plt.xlabel('Number of cluster')
+ plt.ylabel('Distorsion')
+ plt.savefig(args.elbow, dpi = 240, format = 'pdf')
+ if args.cond_hier == 'yes':
+ import scipy.cluster.hierarchy as hier
+ lin = hier.linkage(cluster_data, args.linkage)
+ plt.figure(1)
+ plt.figure(figsize=(10, 5))
+ hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index)
+ plt.savefig(args.dendro, dpi = 480, format = 'pdf')
+ return None
+
+################################# main ########################################
+
+def main():
+ args = process_args(sys.argv)
+ if args.k_min > args.k_max:
+ sys.exit('Execution aborted: max cluster > min cluster')
+ if args.rules_selector == 'HMRcore':
+ recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
+ elif args.rules_selector == 'Recon':
+ recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
+ elif args.rules_selector == 'Custom':
+ ids, rules, gene_in_rule = make_recon(args.custom)
+ resolve_none = check_bool(args.none)
+ dataset = read_dataset(args.data, args.name)
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+ type_gene = gene_type(dataset.iloc[0, 0], args.name)
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, args.name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, args.name, gene_in_rule)
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name)
+ if err:
+ warning('WARNING: gene\n' + str(err) + '\nnot found in class '
+ + args.name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+ f_cluster(resolve_rules)
+ warning('Execution succeeded')
+ return None
+
+###############################################################################
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff -r 000000000000 -r 23ac9cf12788 Marea/marea_cluster.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/marea_cluster.xml Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,97 @@
+
+ of Reaction Activity Scores
+
+ pandas
+ scikit-learn
+ scipy
+ matplotlib
+ cobrapy
+ python-libsbml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cond_hier['hier'] == 'yes'
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+This tool expects input datasets consisting of tab-delimited columns.
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation > Convert delimiters to TAB*
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+