diff Marea/marea_cluster.py @ 0:23ac9cf12788 draft

Uploaded
author bimib
date Tue, 06 Nov 2018 03:16:21 -0500
parents
children 5721182715a7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/marea_cluster.py	Tue Nov 06 03:16:21 2018 -0500
@@ -0,0 +1,608 @@
+
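+# marea_cluster.py
+# Compute reaction scores from gene expression data using GPR rules and cluster
+# the samples with KMeans (and, optionally, hierarchical clustering).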
+from __future__ import division
+import os
+import sys
+import pandas as pd
+import collections
+import pickle as pk
+import argparse
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+
+########################## argparse ###########################################
+
+def process_args(args):
+    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
+                                     description = 'compute reaction scores from ' +
+                                     'gene expression values and cluster the samples.')
+    parser.add_argument('-rs', '--rules_selector', 
+                        type = str,
+                        default = 'HMRcore',
+                        choices = ['HMRcore', 'Recon', 'Custom'], 
+                        help = 'choose which type of rule set you want to use')
+    parser.add_argument('-cr', '--custom',
+                        type = str,
+                        help = 'your dataset of custom GPR rules')
+    parser.add_argument('-ch', '--cond_hier', 
+                        type = str,
+                        default = 'no',
+                        choices = ['no', 'yes'], 
+                        help = 'choose whether to compute a hierarchical clustering dendrogram')
+    parser.add_argument('-lk', '--k_min', 
+                        type = int,
+                        help = 'minimum number of clusters')
+    parser.add_argument('-uk', '--k_max', 
+                        type = int,
+                        help = 'maximum number of clusters')
+    parser.add_argument('-li', '--linkage', 
+                        type = str, 
+                        choices = ['single', 'complete', 'average'], 
+                        help = 'linkage method for hierarchical clustering')
+    parser.add_argument('-d', '--data',
+                        type = str,
+                        required = True,
+                        help = 'input dataset')
+    parser.add_argument('-n', '--none',
+                        type = str,
+                        default = 'true',
+                        choices = ['true', 'false'], 
+                        help = 'compute scores even when some gene values are NaN')
+    parser.add_argument('-td', '--tool_dir',
+                        type = str,
+                        required = True,
+                        help = 'your tool directory')
+    parser.add_argument('-na', '--name',
+                        type = str,
+                        help = 'name of dataset')
+    parser.add_argument('-de', '--dendro', 
+                        help = 'output dendrogram file')
+    parser.add_argument('-ol', '--out_log', 
+                        help = "Output log")
+    parser.add_argument('-el', '--elbow', 
+                        help = 'output elbow plot file')
+    args = parser.parse_args()
+    return args
+
+########################### warning ###########################################
+
+def warning(s):
+    args = process_args(sys.argv)
+    with open(args.out_log, 'a') as log:
+        log.write(s)
+
+############################ dataset input ####################################
+
+def read_dataset(data, name):
+    try:
+        dataset = pd.read_csv(data, sep = '\t', header = 0)
+    except pd.errors.EmptyDataError:
+        sys.exit('Execution aborted: wrong format of '+name+'\n')
+    if len(dataset.columns) < 2:
+        sys.exit('Execution aborted: wrong format of '+name+'\n')
+    return dataset
+
+############################ dataset name #####################################
+
+def name_dataset(name_data, count):
+    if str(name_data) == 'Dataset':
+        return str(name_data) + '_' + str(count)
+    else:
+        return str(name_data)
+    
+############################ load ids and rules ###############################
+
+def load_id_rules(reactions):
+    ids, rules = [], []
+    for key, value in reactions.items():
+        ids.append(key)
+        rules.append(value)
+    return (ids, rules)
+
+############################ check_methods ####################################
+
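+# Guess the gene ID nomenclature (HGNC ID, Ensembl ID, HUGO symbol or Entrez ID)
+# of a single identifier; aborts if none of the supported formats matches.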
+def gene_type(l, name):
+    if check_hgnc(l):
+        return 'hugo_id'
+    elif check_ensembl(l):
+        return 'ensembl_gene_id'
+    elif check_symbol(l):
+        return 'symbol'
+    elif check_entrez(l):
+        return 'entrez_id'
+    else:
+        sys.exit('Execution aborted:\n' +
+                 'gene ID type in ' + name + ' not supported. Supported ID ' +
+                 'types are: HUGO ID, Ensembl ID, HUGO symbol, Entrez ID\n')
+
+def check_hgnc(l):
+    if len(l) > 5:
+        if (l.upper()).startswith('HGNC:'):
+            return l[5:].isdigit()
+        else:
+            return False
+    else:
+        return False
+
+def check_ensembl(l): 
+    if len(l) == 15:
+        if (l.upper()).startswith('ENS'):
+            return l[4:].isdigit()
+        else:  
+            return False 
+    else: 
+        return False 
+
+def check_symbol(l):
+    if len(l) > 0:
+        if l[0].isalpha() and l[1:].isalnum():
+            return True
+        else:
+            return False
+    else:
+        return False
+
+def check_entrez(l): 
+    if len(l) > 0:
+        return l.isdigit()
+    else: 
+        return False
+
+def check_bool(b):
+    if b == 'true':
+        return True
+    elif b == 'false':
+        return False
+    
+############################ make recon #######################################
+
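+# Tokenize a GPR rule given as a list of characters: returns the token list and
+# the list of gene names found, or False if the parentheses are unbalanced.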
+def check_and_doWord(l):
+    tmp = []
+    tmp_genes = []
+    count = 0
+    while l:
+        if count >= 0:
+            if l[0] == '(':
+                count += 1
+                tmp.append(l[0])
+                l.pop(0)
+            elif l[0] == ')':
+                count -= 1
+                tmp.append(l[0])
+                l.pop(0)
+            elif l[0] == ' ':
+                l.pop(0)
+            else:
+                word = []
+                while l:
+                    if l[0] in [' ', '(', ')']:
+                        break
+                    else:
+                        word.append(l[0])
+                        l.pop(0)
+                word = ''.join(word)
+                tmp.append(word)
+                if not(word in ['or', 'and']):
+                    tmp_genes.append(word)
+        else:
+            return False
+    if count == 0:
+        return (tmp, tmp_genes)
+    else:
+        return False
+
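+# Turn the flat token list into a nested list, one sub-list per parenthesised group.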
+def brackets_to_list(l):
+    tmp = []
+    while l:
+        if l[0] == '(':
+            l.pop(0)
+            tmp.append(resolve_brackets(l))
+        else:
+            tmp.append(l[0])
+            l.pop(0)
+    return tmp
+
+def resolve_brackets(l):
+    tmp = []
+    while l[0] != ')':
+        if l[0] == '(':
+            l.pop(0)
+            tmp.append(resolve_brackets(l))
+        else:
+            tmp.append(l[0])
+            l.pop(0)
+    l.pop(0)
+    return tmp
+
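+# Group chains of 'and' operands into sub-lists so that 'and' binds more tightly
+# than 'or' when the rule is later evaluated.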
+def priorityAND(l):
+    tmp = []
+    flag = True
+    while l:
+        if len(l) == 1:
+            if isinstance(l[0], list):
+                tmp.append(priorityAND(l[0]))
+            else:
+                tmp.append(l[0])
+            l = l[1:]
+        elif l[0] == 'or':
+            tmp.append(l[0])
+            flag = False
+            l = l[1:]
+        elif l[1] == 'or':
+            if isinstance(l[0], list): 
+                tmp.append(priorityAND(l[0]))
+            else:
+                tmp.append(l[0])
+            tmp.append(l[1])
+            flag = False
+            l = l[2:]
+        elif l[1] == 'and':
+            tmpAnd = []
+            if isinstance(l[0], list): 
+                tmpAnd.append(priorityAND(l[0]))
+            else:
+                tmpAnd.append(l[0])
+            tmpAnd.append(l[1])
+            if isinstance(l[2], list): 
+                tmpAnd.append(priorityAND(l[2]))
+            else:
+                tmpAnd.append(l[2])
+            l = l[3:]
+            while l:
+                if l[0] == 'and':
+                    tmpAnd.append(l[0])
+                    if isinstance(l[1], list): 
+                        tmpAnd.append(priorityAND(l[1]))
+                    else:
+                        tmpAnd.append(l[1])
+                    l = l[2:]
+                elif l[0] == 'or':
+                    flag = False
+                    break
+            if flag == True: # the list contains only ANDs
+                tmp.extend(tmpAnd)
+            elif flag == False:
+                tmp.append(tmpAnd)
+    return tmp
+
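+# Check that a parsed rule is well formed: operands (genes or sub-rules)
+# alternating with 'and'/'or' operators.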
+def checkRule(l):
+    if len(l) == 1:
+        if isinstance(l[0], list):
+            if checkRule(l[0]) is False:
+                return False
+    elif len(l) > 2:
+        if checkRule2(l) is False:
+            return False
+    else:
+        return False
+    return True
+
+def checkRule2(l):
+    while l:
+        if len(l) == 1:
+            return False
+        elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+            if checkRule(l[0]) is False:
+                return False
+            if isinstance(l[2], list):
+                if checkRule(l[2]) is False:
+                    return False
+            l = l[3:]
+        elif l[1] in ['and', 'or']:
+            if isinstance(l[2], list):
+                if checkRule(l[2]) is False:
+                    return False
+            l = l[3:]
+        elif l[0] in ['and', 'or']:
+            if isinstance(l[1], list):
+                if checkRule(l[1]) is False:
+                    return False
+            l = l[2:]
+        else:
+            return False
+    return True
+
+def do_rules(rules):
+    split_rules = []
+    err_rules = []
+    tmp_gene_in_rule = []
+    for i in range(len(rules)):
+        tmp = list(rules[i])
+        if tmp:
+            tmp, tmp_genes = check_and_doWord(tmp)
+            tmp_gene_in_rule.extend(tmp_genes)
+            if tmp is False:
+                split_rules.append([])
+                err_rules.append(rules[i])
+            else:
+                tmp = brackets_to_list(tmp)
+                if checkRule(tmp):
+                    split_rules.append(priorityAND(tmp))
+                else:
+                    split_rules.append([])
+                    err_rules.append(rules[i])
+        else:
+            split_rules.append([])
+    if err_rules:
+        warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
+    return (split_rules, list(set(tmp_gene_in_rule)))
+
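+# Load reaction IDs and GPR rules either from an SBML model (via COBRApy) or, as
+# a fallback, from a two-column tab-separated file of custom rules, then parse
+# the rules and collect the genes they use.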
+def make_recon(data):
+    try:
+        import cobra as cb
+        import warnings
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore')
+            recon = cb.io.read_sbml_model(data)
+        react = recon.reactions
+        rules = [react[i].gene_reaction_rule for i in range(len(react))]
+        ids = [react[i].id for i in range(len(react))]
+    except cb.io.sbml3.CobraSBMLError:
+        try:
+            data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('')
+            if len(data.columns) < 2:
+                sys.exit('Execution aborted: wrong format of ' +
+                         'custom GPR rules\n')
+            if not len(data.columns) == 2:
+                warning('WARNING: more than 2 columns in custom GPR rules.\n' +
+                        'Extra columns have been disregarded\n')
+            ids = list(data.iloc[:, 0])
+            rules = list(data.iloc[:, 1])
+        except pd.errors.EmptyDataError:
+            sys.exit('Execution aborted: wrong format of custom GPR rules\n')
+        except pd.errors.ParserError:
+            sys.exit('Execution aborted: wrong format of custom GPR rules\n')  
+    split_rules, tmp_genes = do_rules(rules)
+    gene_in_rule = {}
+    for i in tmp_genes:
+        gene_in_rule[i] = 'ok'
+    return (ids, split_rules, gene_in_rule)
+
+############################ resolve_methods ##################################
+
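+# Replace every gene name in a parsed rule with its value from dictionary d;
+# genes without a numeric value become None and are collected as errors.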
+def replace_gene_value(l, d):
+    tmp = []
+    err = []
+    while l:
+        if isinstance(l[0], list):
+            tmp_rules, tmp_err = replace_gene_value(l[0], d)
+            tmp.append(tmp_rules)
+            err.extend(tmp_err)
+        else:
+            value = replace_gene(l[0],d)
+            tmp.append(value)
+            if value == None:
+                err.append(l[0])
+        l = l[1:]
+    return (tmp, err)
+
+def replace_gene(l, d):
+    if l == 'and' or l == 'or':
+        return l
+    else:
+        value = d.get(l, None)
+        if not(value == None or isinstance(value, (int, float))):
+            sys.exit('Execution aborted: ' + value + ' value not valid\n')
+        return value
+
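+# Combine two gene values: 'and' takes the minimum, 'or' takes the sum. When one
+# operand is missing, 'or' keeps the known value; 'and' does so only if cn is True.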
+def compute(val1, op, val2, cn):
+    if val1 != None and val2 != None:
+        if op == 'and':
+            return min(val1, val2)
+        else:
+            return val1 + val2
+    elif op == 'and':
+        if cn is True:
+            if val1 != None:
+                return val1
+            elif val2 != None:
+                return val2
+            else:
+                return None
+        else:
+            return None
+    else:
+        if val1 != None:
+            return val1
+        elif val2 != None:
+            return val2
+        else:
+            return None
+
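+# Evaluate a parsed rule whose gene names have already been replaced by values,
+# returning the resulting score (or None), or False if the rule is malformed.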
+def control(ris, l, cn):
+    if len(l) == 1:
+        if isinstance(l[0], (float, int)) or l[0] == None:
+            return l[0]
+        elif isinstance(l[0], list):
+            return control(None, l[0], cn)
+        else:
+            return False
+    elif len(l) > 2:
+        return control_list(ris, l, cn)
+    else:
+        return False
+
+def control_list(ris, l, cn):
+    while l:
+        if len(l) == 1:
+            return False
+        elif (isinstance(l[0], (float, int)) or
+              l[0] == None) and l[1] in ['and', 'or']:
+            if isinstance(l[2], (float, int)) or l[2] == None:
+                ris = compute(l[0], l[1], l[2], cn)            
+            elif isinstance(l[2], list):
+                tmp = control(None, l[2], cn)
+                if tmp is False:
+                    return False
+                else:
+                    ris = compute(l[0], l[1], tmp, cn)
+            else:
+                return False
+            l = l[3:]
+        elif l[0] in ['and', 'or']:
+            if isinstance(l[1], (float, int)) or l[1] == None:
+                ris = compute(ris, l[0], l[1], cn)
+            elif isinstance(l[1], list):
+                tmp = control(None,l[1], cn)
+                if tmp is False:
+                    return False
+                else:
+                    ris = compute(ris, l[0], tmp, cn)
+            else:
+                return False
+            l = l[2:]
+        elif isinstance(l[0], list) and l[1] in ['and', 'or']:
+            if isinstance(l[2], (float, int)) or l[2] == None:
+                tmp = control(None, l[0], cn)
+                if tmp is False:
+                    return False
+                else:
+                    ris = compute(tmp, l[1], l[2], cn)
+            elif isinstance(l[2], list):
+                tmp = control(None, l[0], cn)
+                tmp2 = control(None, l[2], cn)
+                if tmp is False or tmp2 is False:
+                    return False
+                else:
+                    ris = compute(tmp, l[1], tmp2, cn)
+            else:
+                return False
+            l = l[3:]
+        else:
+            return False
+    return ris
+
+############################ gene #############################################
+
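+# Clean up gene IDs, abort on duplicated genes that appear in the rules or on
+# duplicated sample labels, and return the dataset as a dict (sample -> gene -> value).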
+def data_gene(gene, type_gene, name, gene_custom):
+    args = process_args(sys.argv)
+    for i in range(len(gene)):
+        tmp = gene.iloc[i, 0]
+        if tmp.startswith(' ') or tmp.endswith(' '):
+            gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
+    gene_dup = [item for item, count in 
+               collections.Counter(gene[gene.columns[0]]).items() if count > 1]
+    pat_dup = [item for item, count in 
+               collections.Counter(list(gene.columns)).items() if count > 1]
+    if gene_dup:
+        if gene_custom == None:
+            if args.rules_selector == 'HMRcore':
+                gene_in_rule = pk.load(open(args.tool_dir +
+                                            '/local/HMRcore_genes.p', 'rb'))
+            elif args.rules_selector == 'Recon':
+                gene_in_rule = pk.load(open(args.tool_dir +
+                                            '/local/Recon_genes.p', 'rb'))
+            gene_in_rule = gene_in_rule.get(type_gene)
+        else:
+            gene_in_rule = gene_custom
+        tmp = []
+        for i in gene_dup:
+            if gene_in_rule.get(i) == 'ok':
+                tmp.append(i)
+        if tmp:
+            sys.exit('Execution aborted because gene ID '
+                     + str(tmp) + ' in ' + name + ' is duplicated\n')
+    if pat_dup:
+        sys.exit('Execution aborted: duplicated label\n'
+                 + str(pat_dup) + ' in ' + name + '\n')
+    return (gene.set_index(gene.columns[0])).to_dict()
+
+############################ resolve ##########################################
+
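+# Compute a score for every rule and every sample; genes missing from the dataset
+# are collected and reported, rules that cannot be evaluated yield None, and the
+# run is aborted if no rule at all can be computed for the class.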
+def resolve(genes, rules, ids, resolve_none, name):
+    resolve_rules = {}
+    not_found = []
+    flag = False
+    for key, value in genes.items():
+        tmp_resolve = []
+        for i in range(len(rules)):
+            tmp = rules[i]
+            if tmp:
+                tmp, err = replace_gene_value(tmp, value)
+                if err:
+                    not_found.extend(err)
+                ris = control(None, tmp, resolve_none)
+                if ris is False or ris == None:
+                    tmp_resolve.append(None)
+                else:
+                    tmp_resolve.append(ris)
+                    flag = True
+            else:
+                tmp_resolve.append(None)        
+        resolve_rules[key] = tmp_resolve
+    if flag is False:
+        sys.exit('Execution aborted: no computable score' +
+                 ' (due to missing gene values) for class '
+                 + name + ', the class has been disregarded\n')
+    return (resolve_rules, list(set(not_found)))
+
+################################# clustering ##################################
+
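+# Run KMeans for every k in [k_min, k_max], write one class assignment file per k,
+# save the elbow plot and, if requested, a hierarchical-clustering dendrogram.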
+def f_cluster(resolve_rules):
+    os.makedirs('cluster_out')
+    args = process_args(sys.argv)
+    cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index')
+    for i in cluster_data.columns:
+        tmp = cluster_data[i][0]
+        if tmp == None:
+            cluster_data = cluster_data.drop(columns=[i])
+    distortion = []
+    for i in range(args.k_min, args.k_max+1):
+        tmp_kmeans = KMeans(n_clusters = i,
+                            n_init = 100, 
+                            max_iter = 300,
+                            random_state = 0).fit(cluster_data)
+        distortion.append(tmp_kmeans.inertia_)
+        predict = tmp_kmeans.predict(cluster_data)
+        predict = [x+1 for x in predict]
+        # wrap zip in a list so the DataFrame constructor also works on Python 3
+        classe = (pd.DataFrame(list(zip(cluster_data.index, predict)))).astype(str)
+        dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv'
+        classe.to_csv(dest, sep = '\t', index = False,
+                      header = ['Patient_ID', 'Class'])
+    plt.figure(0)
+    plt.plot(range(args.k_min, args.k_max+1), distortion, marker = 'o')
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Distortion')
+    plt.savefig(args.elbow, dpi = 240, format = 'pdf')
+    if args.cond_hier == 'yes':
+        import scipy.cluster.hierarchy as hier
+        lin = hier.linkage(cluster_data, args.linkage)
+        plt.figure(1, figsize = (10, 5))
+        hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index)
+        plt.savefig(args.dendro, dpi = 480, format = 'pdf')
+    return None
+
+################################# main ########################################
+
+def main():
+    args = process_args(sys.argv)
+    if args.k_min > args.k_max:
+        sys.exit('Execution aborted: min number of clusters > max number of clusters\n')
+    if args.rules_selector == 'HMRcore':
+        recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
+    elif args.rules_selector == 'Recon':
+        recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
+    elif args.rules_selector == 'Custom':
+        ids, rules, gene_in_rule = make_recon(args.custom)
+    resolve_none = check_bool(args.none)
+    dataset = read_dataset(args.data, args.name)
+    dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+    type_gene = gene_type(dataset.iloc[0, 0], args.name)
+    if args.rules_selector != 'Custom':
+        genes = data_gene(dataset, type_gene, args.name, None)
+        ids, rules = load_id_rules(recon.get(type_gene))
+    elif args.rules_selector == 'Custom':
+        genes = data_gene(dataset, type_gene, args.name, gene_in_rule)
+    resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name)
+    if err:
+        warning('WARNING: gene(s)\n' + str(err) + '\nnot found in class '
+                + args.name + ', the expression level for these genes ' +
+                'will be considered NaN\n')
+    f_cluster(resolve_rules)
+    warning('Execution succeeded\n')
+    return None
+
+###############################################################################
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file