# HG changeset patch
# User bimib
# Date 1569924313 14400
# Node ID c71ac0bb12de72616aa3013b6ad6e47dc35261fd
# Parent d0e7f14b773f1e836b50110c300e57452329f93d
Uploaded
diff -r d0e7f14b773f -r c71ac0bb12de Marea/local/desktop.ini
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Marea/local/desktop.ini Tue Oct 01 06:05:13 2019 -0400
@@ -0,0 +1,6 @@
+[.ShellClassInfo]
+IconResource=C:\WINDOWS\System32\SHELL32.dll,4
+[ViewState]
+Mode=
+Vid=
+FolderType=Generic
diff -r d0e7f14b773f -r c71ac0bb12de Marea/marea.py
--- a/Marea/marea.py Tue Oct 01 06:03:12 2019 -0400
+++ b/Marea/marea.py Tue Oct 01 06:05:13 2019 -0400
@@ -1,4 +1,3 @@
-
from __future__ import division
import sys
import pandas as pd
@@ -6,6 +5,7 @@
import scipy.stats as st
import collections
import lxml.etree as ET
+import shutil
import pickle as pk
import math
import os
@@ -13,7 +13,7 @@
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF
-########################## argparse ###########################################
+########################## argparse ##########################################
def process_args(args):
parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
@@ -71,6 +71,21 @@
type = str,
choices = ['yes', 'no'],
help = 'if make or not custom map')
+ parser.add_argument('-gs', '--generate_svg',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate svg map')
+ parser.add_argument('-gp', '--generate_pdf',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate pdf map')
+ parser.add_argument('-gr', '--generate_ras',
+ type = str,
+ default = 'true',
+ choices = ['true', 'false'],
+ help = 'generate reaction activity score')
args = parser.parse_args()
return args
@@ -85,7 +100,7 @@
def read_dataset(data, name):
try:
- dataset = pd.read_csv(data, sep = '\t', header = 0)
+ dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python')
except pd.errors.EmptyDataError:
sys.exit('Execution aborted: wrong format of ' + name + '\n')
if len(dataset.columns) < 2:
@@ -536,7 +551,7 @@
ids = [react[i].id for i in range(len(react))]
except cb.io.sbml3.CobraSBMLError:
try:
- data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('')
+ data = (pd.read_csv(data, sep = '\t', dtype = str, engine='python')).fillna('')
if len(data.columns) < 2:
sys.exit('Execution aborted: wrong format of '+
'custom datarules\n')
@@ -641,9 +656,28 @@
', the class has been disregarded\n')
return class_pat
+############################ create_ras #######################################
+
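+# Write the Reaction Activity Score (RAS) of each reaction, as resolved from the
+# GPR rules, to a tab-separated file under ras/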
+def create_ras (resolve_rules, dataset_name):
+
+ if resolve_rules is None:
+ warning("Couldn't generate RAS for current dataset: " + dataset_name)
+ return
+
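+ # represent missing scores as the literal string 'None' in the output table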
+ for geni in resolve_rules.values():
+ for i, valori in enumerate(geni):
+ if valori == None:
+ geni[i] = 'None'
+
+ output_ras = pd.DataFrame.from_dict(resolve_rules)
+ output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
+
+ text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+ text_file.write(output_to_csv)
+ text_file.close()
+
############################ map ##############################################
-def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C):
+def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
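+ # create_svg and create_pdf control which rendered map formats are kept in result/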
args = process_args(sys.argv)
if (not class_pat) or (len(class_pat.keys()) < 2):
sys.exit('Execution aborted: classes provided for comparisons are ' +
@@ -663,52 +697,81 @@
count += 1
except (TypeError, ZeroDivisionError):
count += 1
- tab = 'table_out/' + i + '_vs_' + j + '.tsv'
+ tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
tmp_csv = tmp_csv.reset_index()
header = ['ids', 'P_Value', 'Average']
tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
- if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
- and args.yes_no == 'yes'):
- fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
- file_svg = 'map_svg/' + i + '_vs_' + j + '.svg'
- with open(file_svg, 'wb') as new_map:
- new_map.write(ET.tostring(core_map, encoding='UTF-8',
- method='xml'))
- file_pdf = 'map_pdf/' + i + '_vs_' + j + '.pdf'
- renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+
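+ # render the enriched map only if at least one graphic format was requested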
+ if create_svg or create_pdf:
+ if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+ and args.yes_no == 'yes'):
+ fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+ file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
+ with open(file_svg, 'wb') as new_map:
+ new_map.write(ET.tostring(core_map))
+
+
+ if create_pdf:
+ file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
+ renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+
+ if not create_svg:
+ # the svg file was only needed to generate the pdf;
+ # the user did not request it, so remove it
+ os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
+
return None
############################ MAIN #############################################
def main():
args = process_args(sys.argv)
- os.makedirs('table_out')
- if args.rules_selector == 'HMRcore':
- os.makedirs('map_svg')
- os.makedirs('map_pdf')
+
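+ # optional outputs: svg/pdf maps and per-dataset Reaction Activity Scores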
+ create_svg = check_bool(args.generate_svg)
+ create_pdf = check_bool(args.generate_pdf)
+ generate_ras = check_bool(args.generate_ras)
+
+ os.makedirs('result')
+
+ if generate_ras:
+ os.makedirs('ras')
+
+ if args.rules_selector == 'HMRcore':
recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
elif args.rules_selector == 'Recon':
recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
elif args.rules_selector == 'Custom':
ids, rules, gene_in_rule = make_recon(args.custom)
+
resolve_none = check_bool(args.none)
+
class_pat = {}
+
if args.option == 'datasets':
num = 1
- #if len(args.names) != len(set(args.names)):
- # sys.exit('Execution aborted: datasets name duplicated')
for i, j in zip(args.input_datas, args.names):
+
name = name_dataset(j, num)
dataset = read_dataset(i, name)
+
dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
- type_gene = gene_type(dataset.iloc[0, 0], name)
+
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+
if args.rules_selector != 'Custom':
genes = data_gene(dataset, type_gene, name, None)
ids, rules = load_id_rules(recon.get(type_gene))
elif args.rules_selector == 'Custom':
genes = data_gene(dataset, type_gene, name, gene_in_rule)
+
resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+ if generate_ras:
+ create_ras(resolve_rules, name)
+
+
if err != None and err:
warning('Warning: gene\n' + str(err) + '\nnot found in class '
+ name + ', the expression level for this gene ' +
@@ -738,10 +801,9 @@
'will be considered NaN\n')
if resolve_rules != None:
class_pat = split_class(classes, resolve_rules)
+
if args.rules_selector == 'Custom':
if args.yes_no == 'yes':
- os.makedirs('map_svg')
- os.makedirs('map_pdf')
try:
core_map = ET.parse(args.custom_map)
except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
@@ -750,8 +812,11 @@
core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
else:
core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
- maps(core_map, class_pat, ids, args.pValue, args.fChange)
- warning('Execution succeeded')
+
+ maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
+
+ print('Execution succeeded')
+
return None
###############################################################################
diff -r d0e7f14b773f -r c71ac0bb12de Marea/marea.xml
--- a/Marea/marea.xml Tue Oct 01 06:03:12 2019 -0400
+++ b/Marea/marea.xml Tue Oct 01 06:05:13 2019 -0400
@@ -1,200 +1,223 @@
-
- for Galaxy
-
- marea_macros.xml
-
-
- pandas
- scipy
- cobra
- lxml
- svglib
- reportlab
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- (cond_rule['rules_selector'] == 'HMRcore') or ((cond_rule['rules_selector'] == 'Custom') and (cond_rule['cond_map']['yes_no'] == 'yes'))
-
-
-
- (cond_rule['rules_selector'] == 'HMRcore') or ((cond_rule['rules_selector'] == 'Custom') and (cond_rule['cond_map']['yes_no'] == 'yes'))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ for Galaxy - 1.0.1
+
+ marea_macros.xml
+
+
+ pandas
+ scipy
+ cobra
+ lxml
+ svglib
+ reportlab
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ advanced['choice'] and advanced['generateRas']
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r d0e7f14b773f -r c71ac0bb12de Marea/marea_cluster.py
--- a/Marea/marea_cluster.py Tue Oct 01 06:03:12 2019 -0400
+++ b/Marea/marea_cluster.py Tue Oct 01 06:05:13 2019 -0400
@@ -1,67 +1,84 @@
-from __future__ import division
-import os
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 3 19:51:00 2019
+
+@author: Narger
+"""
+
import sys
-import pandas as pd
-import collections
-import pickle as pk
import argparse
-from sklearn.cluster import KMeans
-import matplotlib
-# Force matplotlib to not use any Xwindows backend.
-matplotlib.use('Agg')
+import os
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster
import matplotlib.pyplot as plt
+import scipy.cluster.hierarchy as shc
+import matplotlib.cm as cm
+import numpy as np
+import pandas as pd
-########################## argparse ###########################################
+################################# process args ###############################
def process_args(args):
parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
description = 'process some value\'s' +
' genes to create class.')
- parser.add_argument('-rs', '--rules_selector',
+
+ parser.add_argument('-ol', '--out_log',
+ help = "Output log")
+
+ parser.add_argument('-in', '--input',
type = str,
- default = 'HMRcore',
- choices = ['HMRcore', 'Recon', 'Custom'],
- help = 'chose which type of dataset you want use')
- parser.add_argument('-cr', '--custom',
+ help = 'input dataset')
+
+ parser.add_argument('-cy', '--cluster_type',
type = str,
- help='your dataset if you want custom rules')
- parser.add_argument('-ch', '--cond_hier',
- type = str,
- default = 'no',
- choices = ['no', 'yes'],
- help = 'chose if you wanna hierical dendrogram')
- parser.add_argument('-lk', '--k_min',
+ choices = ['kmeans', 'meanshift', 'dbscan', 'hierarchy'],
+ default = 'kmeans',
+ help = 'choose clustering algorithm')
+
+ parser.add_argument('-k1', '--k_min',
+ type = int,
+ default = 2,
+ help = 'choose the minimum number of clusters to be generated')
+
+ parser.add_argument('-k2', '--k_max',
type = int,
- help = 'min number of cluster')
- parser.add_argument('-uk', '--k_max',
- type = int,
- help = 'max number of cluster')
- parser.add_argument('-li', '--linkage',
- type = str,
- choices = ['single', 'complete', 'average'],
- help='linkage hierarchical cluster')
- parser.add_argument('-d', '--data',
+ default = 7,
+ help = 'choose the maximum number of clusters to be generated')
+
+ parser.add_argument('-el', '--elbow',
+ type = str,
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want to generate an elbow plot for kmeans')
+
+ parser.add_argument('-si', '--silhouette',
type = str,
- required = True,
- help = 'input dataset')
- parser.add_argument('-n', '--none',
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want silhouette plots')
+
+ parser.add_argument('-db', '--davies',
type = str,
- default = 'true',
- choices = ['true', 'false'],
- help = 'compute Nan values')
+ default = 'false',
+ choices = ['true', 'false'],
+ help = 'choose if you want Davies-Bouldin scores')
+
parser.add_argument('-td', '--tool_dir',
type = str,
required = True,
help = 'your tool directory')
- parser.add_argument('-na', '--name',
- type = str,
- help = 'name of dataset')
- parser.add_argument('-de', '--dendro',
- help = "Dendrogram out")
- parser.add_argument('-ol', '--out_log',
- help = "Output log")
- parser.add_argument('-el', '--elbow',
- help = "Out elbow")
+
+ parser.add_argument('-ms', '--min_samples',
+ type = int,
+ help = 'min samples for dbscan (optional)')
+
+ parser.add_argument('-ep', '--eps',
+ type = float,
+ help = 'eps for dbscan (optional)')
+
+
args = parser.parse_args()
return args
@@ -70,548 +87,331 @@
def warning(s):
args = process_args(sys.argv)
with open(args.out_log, 'a') as log:
- log.write(s)
-
-############################ dataset input ####################################
+ log.write(s + "\n\n")
+ print(s)
-def read_dataset(data, name):
+########################## read dataset ######################################
+
+def read_dataset(dataset):
try:
- dataset = pd.read_csv(data, sep = '\t', header = 0)
+ dataset = pd.read_csv(dataset, sep = '\t', header = 0)
except pd.errors.EmptyDataError:
- sys.exit('Execution aborted: wrong format of '+name+'\n')
+ sys.exit('Execution aborted: wrong format of dataset\n')
if len(dataset.columns) < 2:
- sys.exit('Execution aborted: wrong format of '+name+'\n')
+ sys.exit('Execution aborted: wrong format of dataset\n')
+ return dataset
+
+############################ rewrite_input ###################################
+
+def rewrite_input(dataset):
+ # rewrite the dataset as a dictionary of lists rather than a dictionary of
+ # dictionaries, turning the string 'None' into None and everything else into float
+
+ for key, val in dataset.items():
+ l = []
+ for i in val:
+ if i == 'None':
+ l.append(None)
+ else:
+ l.append(float(i))
+
+ dataset[key] = l
+
return dataset
-############################ dataset name #####################################
-
-def name_dataset(name_data, count):
- if str(name_data) == 'Dataset':
- return str(name_data) + '_' + str(count)
- else:
- return str(name_data)
+############################## write to csv ##################################
-############################ load id e rules ##################################
-
-def load_id_rules(reactions):
- ids, rules = [], []
- for key, value in reactions.items():
- ids.append(key)
- rules.append(value)
- return (ids, rules)
-
-############################ check_methods ####################################
-
-def gene_type(l, name):
- if check_hgnc(l):
- return 'hugo_id'
- elif check_ensembl(l):
- return 'ensembl_gene_id'
- elif check_symbol(l):
- return 'symbol'
- elif check_entrez(l):
- return 'entrez_id'
- else:
- sys.exit('Execution aborted:\n' +
- 'gene ID type in ' + name + ' not supported. Supported ID' +
- 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
-
-def check_hgnc(l):
- if len(l) > 5:
- if (l.upper()).startswith('HGNC:'):
- return l[5:].isdigit()
- else:
- return False
- else:
- return False
-
-def check_ensembl(l):
- if len(l) == 15:
- if (l.upper()).startswith('ENS'):
- return l[4:].isdigit()
- else:
- return False
- else:
- return False
-
-def check_symbol(l):
- if len(l) > 0:
- if l[0].isalpha() and l[1:].isalnum():
- return True
- else:
- return False
- else:
- return False
+def write_to_csv (dataset, labels, name):
+ list_labels = labels
+ list_values = dataset
-def check_entrez(l):
- if len(l) > 0:
- return l.isdigit()
- else:
- return False
-
-def check_bool(b):
- if b == 'true':
- return True
- elif b == 'false':
- return False
+ list_values = list_values.tolist()
+ d = {'Label' : list_labels, 'Value' : list_values}
-############################ make recon #######################################
-
-def check_and_doWord(l):
- tmp = []
- tmp_genes = []
- count = 0
- while l:
- if count >= 0:
- if l[0] == '(':
- count += 1
- tmp.append(l[0])
- l.pop(0)
- elif l[0] == ')':
- count -= 1
- tmp.append(l[0])
- l.pop(0)
- elif l[0] == ' ':
- l.pop(0)
- else:
- word = []
- while l:
- if l[0] in [' ', '(', ')']:
- break
- else:
- word.append(l[0])
- l.pop(0)
- word = ''.join(word)
- tmp.append(word)
- if not(word in ['or', 'and']):
- tmp_genes.append(word)
- else:
- return False
- if count == 0:
- return (tmp, tmp_genes)
- else:
- return False
-
-def brackets_to_list(l):
- tmp = []
- while l:
- if l[0] == '(':
- l.pop(0)
- tmp.append(resolve_brackets(l))
- else:
- tmp.append(l[0])
- l.pop(0)
- return tmp
+ df = pd.DataFrame(d, columns=['Value','Label'])
-def resolve_brackets(l):
- tmp = []
- while l[0] != ')':
- if l[0] == '(':
- l.pop(0)
- tmp.append(resolve_brackets(l))
- else:
- tmp.append(l[0])
- l.pop(0)
- l.pop(0)
- return tmp
+ # name already carries the full file name, including its extension
+ df.to_csv(name, sep = '\t', index = False,
+ header = ['Value', 'Label'])
+
+########################### find max in list #################################
+def max_index (lista):
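+ # return the index of the largest value; used to pick the best-scoring k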
+ best = -1
+ best_index = 0
+ for i in range(len(lista)):
+ if lista[i] > best:
+ best = lista [i]
+ best_index = i
+
+ return best_index
+
+################################ kmeans #####################################
+
+def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
+ if not os.path.exists('clustering/kmeans_output'):
+ os.makedirs('clustering/kmeans_output')
+
+
+ elbow = elbow == 'true'
+ silhouette = silhouette == 'true'
+ davies = davies == 'true'
+
-def priorityAND(l):
- tmp = []
- flag = True
- while l:
- if len(l) == 1:
- if isinstance(l[0], list):
- tmp.append(priorityAND(l[0]))
- else:
- tmp.append(l[0])
- l = l[1:]
- elif l[0] == 'or':
- tmp.append(l[0])
- flag = False
- l = l[1:]
- elif l[1] == 'or':
- if isinstance(l[0], list):
- tmp.append(priorityAND(l[0]))
- else:
- tmp.append(l[0])
- tmp.append(l[1])
- flag = False
- l = l[2:]
- elif l[1] == 'and':
- tmpAnd = []
- if isinstance(l[0], list):
- tmpAnd.append(priorityAND(l[0]))
- else:
- tmpAnd.append(l[0])
- tmpAnd.append(l[1])
- if isinstance(l[2], list):
- tmpAnd.append(priorityAND(l[2]))
- else:
- tmpAnd.append(l[2])
- l = l[3:]
- while l:
- if l[0] == 'and':
- tmpAnd.append(l[0])
- if isinstance(l[1], list):
- tmpAnd.append(priorityAND(l[1]))
- else:
- tmpAnd.append(l[1])
- l = l[2:]
- elif l[0] == 'or':
- flag = False
- break
- if flag == True: #se ci sono solo AND nella lista
- tmp.extend(tmpAnd)
- elif flag == False:
- tmp.append(tmpAnd)
- return tmp
+ range_n_clusters = [i for i in range(k_min, k_max+1)]
+ distortions = []
+ scores = []
+ all_labels = []
+
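+ # sweep k over [k_min, k_max], recording the labels, the inertia (for the
+ # elbow plot) and the mean silhouette score of every clustering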
+ for n_clusters in range_n_clusters:
+ clusterer = KMeans(n_clusters=n_clusters, random_state=10)
+ cluster_labels = clusterer.fit_predict(dataset)
+
+ all_labels.append(cluster_labels)
+ silhouette_avg = silhouette_score(dataset, cluster_labels)
+ scores.append(silhouette_avg)
+ distortions.append(clusterer.inertia_)
+
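+ # the k whose clustering maximises the mean silhouette score is tagged _BEST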
+ best = max_index(scores) + k_min
+
+ for i in range(len(all_labels)):
+ prefix = ''
+ if (i + k_min == best):
+ prefix = '_BEST'
+
+ write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+
+ if davies:
+ with np.errstate(divide='ignore', invalid='ignore'):
+ davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
+ warning("\nFor n_clusters = " + str(i + k_min) +
+ " The average davies bouldin score is: " + str(davies_bouldin))
+
+
+ if silhouette:
+ silhouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
+
+
+ if elbow:
+ elbow_plot(distortions, k_min,k_max)
-def checkRule(l):
- if len(l) == 1:
- if isinstance(l[0], list):
- if checkRule(l[0]) is False:
- return False
- elif len(l) > 2:
- if checkRule2(l) is False:
- return False
- else:
- return False
- return True
-
-def checkRule2(l):
- while l:
- if len(l) == 1:
- return False
- elif isinstance(l[0], list) and l[1] in ['and', 'or']:
- if checkRule(l[0]) is False:
- return False
- if isinstance(l[2], list):
- if checkRule(l[2]) is False:
- return False
- l = l[3:]
- elif l[1] in ['and', 'or']:
- if isinstance(l[2], list):
- if checkRule(l[2]) is False:
- return False
- l = l[3:]
- elif l[0] in ['and', 'or']:
- if isinstance(l[1], list):
- if checkRule(l[1]) is False:
- return False
- l = l[2:]
- else:
- return False
- return True
-
-def do_rules(rules):
- split_rules = []
- err_rules = []
- tmp_gene_in_rule = []
- for i in range(len(rules)):
- tmp = list(rules[i])
- if tmp:
- tmp, tmp_genes = check_and_doWord(tmp)
- tmp_gene_in_rule.extend(tmp_genes)
- if tmp is False:
- split_rules.append([])
- err_rules.append(rules[i])
- else:
- tmp = brackets_to_list(tmp)
- if checkRule(tmp):
- split_rules.append(priorityAND(tmp))
- else:
- split_rules.append([])
- err_rules.append(rules[i])
- else:
- split_rules.append([])
- if err_rules:
- warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
- return (split_rules, list(set(tmp_gene_in_rule)))
+
+
+
-def make_recon(data):
- try:
- import cobra as cb
- import warnings
- with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- recon = cb.io.read_sbml_model(data)
- react = recon.reactions
- rules = [react[i].gene_reaction_rule for i in range(len(react))]
- ids = [react[i].id for i in range(len(react))]
- except cb.io.sbml3.CobraSBMLError:
- try:
- data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('')
- if len(data.columns) < 2:
- sys.exit('Execution aborted: wrong format of ' +
- 'custom GPR rules\n')
- if not len(data.columns) == 2:
- warning('WARNING: more than 2 columns in custom GPR rules.\n' +
- 'Extra columns have been disregarded\n')
- ids = list(data.iloc[:, 0])
- rules = list(data.iloc[:, 1])
- except pd.errors.EmptyDataError:
- sys.exit('Execution aborted: wrong format of custom GPR rules\n')
- except pd.errors.ParserError:
- sys.exit('Execution aborted: wrong format of custom GPR rules\n')
- split_rules, tmp_genes = do_rules(rules)
- gene_in_rule = {}
- for i in tmp_genes:
- gene_in_rule[i] = 'ok'
- return (ids, split_rules, gene_in_rule)
-
-############################ resolve_methods ##################################
-
-def replace_gene_value(l, d):
- tmp = []
- err = []
- while l:
- if isinstance(l[0], list):
- tmp_rules, tmp_err = replace_gene_value(l[0], d)
- tmp.append(tmp_rules)
- err.extend(tmp_err)
- else:
- value = replace_gene(l[0],d)
- tmp.append(value)
- if value == None:
- err.append(l[0])
- l = l[1:]
- return (tmp, err)
-
-def replace_gene(l, d):
- if l =='and' or l == 'or':
- return l
+############################## elbow_plot ####################################
+
+def elbow_plot (distortions, k_min, k_max):
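+ # plot within-cluster inertia against k; the bend ("elbow") suggests a good k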
+ plt.figure(0)
+ plt.plot(range(k_min, k_max+1), distortions, marker = 'o')
+ plt.xlabel('Number of clusters')
+ plt.ylabel('Distortion')
+ s = 'clustering/kmeans_output/elbow_plot.png'
+ fig = plt.gcf()
+ fig.set_size_inches(18.5, 10.5, forward = True)
+ fig.savefig(s, dpi=100)
+
+
+############################## silhouette plot ###############################
+def silhouette_draw(dataset, labels, n_clusters, path):
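+ # per-clustering silhouette plot, adapted from the scikit-learn silhouette example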
+ silhouette_avg = silhouette_score(dataset, labels)
+ warning("For n_clusters = " + str(n_clusters) +
+ " The average silhouette_score is: " + str(silhouette_avg))
+
+ plt.close('all')
+ # Create a subplot with 1 row and 2 columns
+ fig, (ax1) = plt.subplots(1, 1)
+
+ fig.set_size_inches(18, 7)
+
+ # The 1st subplot is the silhouette plot
+ # The silhouette coefficient can range from -1 to 1
+ ax1.set_xlim([-1, 1])
+ # The (n_clusters+1)*10 is for inserting blank space between silhouette
+ # plots of individual clusters, to demarcate them clearly.
+ ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
+
+ # Compute the silhouette scores for each sample
+ sample_silhouette_values = silhouette_samples(dataset, labels)
+
+ y_lower = 10
+ for i in range(n_clusters):
+ # Aggregate the silhouette scores for samples belonging to
+ # cluster i, and sort them
+ ith_cluster_silhouette_values = \
+ sample_silhouette_values[labels == i]
+
+ ith_cluster_silhouette_values.sort()
+
+ size_cluster_i = ith_cluster_silhouette_values.shape[0]
+ y_upper = y_lower + size_cluster_i
+
+ color = cm.nipy_spectral(float(i) / n_clusters)
+ ax1.fill_betweenx(np.arange(y_lower, y_upper),
+ 0, ith_cluster_silhouette_values,
+ facecolor=color, edgecolor=color, alpha=0.7)
+
+ # Label the silhouette plots with their cluster numbers at the middle
+ ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+ # Compute the new y_lower for next plot
+ y_lower = y_upper + 10 # 10 for the 0 samples
+
+ ax1.set_title("The silhouette plot for the various clusters.")
+ ax1.set_xlabel("The silhouette coefficient values")
+ ax1.set_ylabel("Cluster label")
+
+ # The vertical line for average silhouette score of all the values
+ ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+ ax1.set_yticks([]) # Clear the yaxis labels / ticks
+ ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+
+ plt.suptitle(("Silhouette analysis for clustering on sample data "
+ "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
+
+
+ plt.savefig(path, bbox_inches='tight')
+
+######################## dbscan ##############################################
+
+def dbscan(dataset, eps, min_samples):
+ if not os.path.exists('clustering/dbscan_output'):
+ os.makedirs('clustering/dbscan_output')
+
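+ # fall back to scikit-learn's default parameters unless both eps and min_samples are given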
+ if eps is not None and min_samples is not None:
+ clusterer = DBSCAN(eps = eps, min_samples = min_samples)
else:
- value = d.get(l, None)
- if not(value == None or isinstance(value, (int, float))):
- sys.exit('Execution aborted: ' + value + ' value not valid\n')
- return value
-
-def compute(val1, op, val2, cn):
- if val1 != None and val2 != None:
- if op == 'and':
- return min(val1, val2)
- else:
- return val1 + val2
- elif op == 'and':
- if cn is True:
- if val1 != None:
- return val1
- elif val2 != None:
- return val2
- else:
- return None
- else:
- return None
- else:
- if val1 != None:
- return val1
- elif val2 != None:
- return val2
- else:
- return None
-
-def control(ris, l, cn):
- if len(l) == 1:
- if isinstance(l[0], (float, int)) or l[0] == None:
- return l[0]
- elif isinstance(l[0], list):
- return control(None, l[0], cn)
- else:
- return False
- elif len(l) > 2:
- return control_list(ris, l, cn)
- else:
- return False
+ clusterer = DBSCAN()
+
+ clustering = clusterer.fit(dataset)
+
+ core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
+ core_samples_mask[clustering.core_sample_indices_] = True
+ labels = clustering.labels_
-def control_list(ris, l, cn):
- while l:
- if len(l) == 1:
- return False
- elif (isinstance(l[0], (float, int)) or
- l[0] == None) and l[1] in ['and', 'or']:
- if isinstance(l[2], (float, int)) or l[2] == None:
- ris = compute(l[0], l[1], l[2], cn)
- elif isinstance(l[2], list):
- tmp = control(None, l[2], cn)
- if tmp is False:
- return False
- else:
- ris = compute(l[0], l[1], tmp, cn)
- else:
- return False
- l = l[3:]
- elif l[0] in ['and', 'or']:
- if isinstance(l[1], (float, int)) or l[1] == None:
- ris = compute(ris, l[0], l[1], cn)
- elif isinstance(l[1], list):
- tmp = control(None,l[1], cn)
- if tmp is False:
- return False
- else:
- ris = compute(ris, l[0], tmp, cn)
- else:
- return False
- l = l[2:]
- elif isinstance(l[0], list) and l[1] in ['and', 'or']:
- if isinstance(l[2], (float, int)) or l[2] == None:
- tmp = control(None, l[0], cn)
- if tmp is False:
- return False
- else:
- ris = compute(tmp, l[1], l[2], cn)
- elif isinstance(l[2], list):
- tmp = control(None, l[0], cn)
- tmp2 = control(None, l[2], cn)
- if tmp is False or tmp2 is False:
- return False
- else:
- ris = compute(tmp, l[1], tmp2, cn)
- else:
- return False
- l = l[3:]
- else:
- return False
- return ris
+ # Number of clusters in labels, ignoring noise if present.
+ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+
+ silhouette_avg = silhouette_score(dataset, labels)
+ warning("For n_clusters =" + str(n_clusters_) +
+ "The average silhouette_score is :" + str(silhouette_avg))
+
+ ## TODO: plots for DBSCAN (no centers) and HIERARCHICAL clustering
-############################ gene #############################################
+ # Black removed and is used for noise instead.
+ unique_labels = set(labels)
+ colors = [plt.cm.Spectral(each)
+ for each in np.linspace(0, 1, len(unique_labels))]
+ for k, col in zip(unique_labels, colors):
+ if k == -1:
+ # Black used for noise.
+ col = [0, 0, 0, 1]
+
+ class_member_mask = (labels == k)
+
+ xy = dataset[class_member_mask & core_samples_mask]
+ plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+ markeredgecolor='k', markersize=14)
+
+ xy = dataset[class_member_mask & ~core_samples_mask]
+ plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+ markeredgecolor='k', markersize=6)
-def data_gene(gene, type_gene, name, gene_custom):
- args = process_args(sys.argv)
- for i in range(len(gene)):
- tmp = gene.iloc[i, 0]
- if tmp.startswith(' ') or tmp.endswith(' '):
- gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
- gene_dup = [item for item, count in
- collections.Counter(gene[gene.columns[0]]).items() if count > 1]
- pat_dup = [item for item, count in
- collections.Counter(list(gene.columns)).items() if count > 1]
- if gene_dup:
- if gene_custom == None:
- if args.rules_selector == 'HMRcore':
- gene_in_rule = pk.load(open(args.tool_dir +
- '/local/HMRcore_genes.p', 'rb'))
- elif args.rules_selector == 'Recon':
- gene_in_rule = pk.load(open(args.tool_dir +
- '/local/Recon_genes.p', 'rb'))
- gene_in_rule = gene_in_rule.get(type_gene)
- else:
- gene_in_rule = gene_custom
- tmp = []
- for i in gene_dup:
- if gene_in_rule.get(i) == 'ok':
- tmp.append(i)
- if tmp:
- sys.exit('Execution aborted because gene ID '
- + str(tmp) + ' in ' + name + ' is duplicated\n')
- if pat_dup:
- sys.exit('Execution aborted: duplicated label\n'
- + str(pat_dup) + 'in ' + name + '\n')
- return (gene.set_index(gene.columns[0])).to_dict()
+ plt.title('Estimated number of clusters: %d' % n_clusters_)
+ s = 'clustering/dbscan_output/dbscan_plot.png'
+ fig = plt.gcf()
+ fig.set_size_inches(18.5, 10.5, forward = True)
+ fig.savefig(s, dpi=100)
+
+
+ write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv')
+
+########################## hierarchical ######################################
+
+def hierarchical_agglomerative(dataset, k_min, k_max):
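+ # draw a dendrogram of the whole dataset, then fit an agglomerative
+ # clustering for every k in [k_min, k_max]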
-############################ resolve ##########################################
+ if not os.path.exists('clustering/agglomerative_output'):
+ os.makedirs('clustering/agglomerative_output')
+
+ plt.figure(figsize=(10, 7))
+ plt.title("Customer Dendograms")
+ shc.dendrogram(shc.linkage(dataset, method='ward'))
+ fig = plt.gcf()
+ fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200)
+
+ range_n_clusters = [i for i in range(k_min, k_max+1)]
-def resolve(genes, rules, ids, resolve_none, name):
- resolve_rules = {}
- not_found = []
- flag = False
- for key, value in genes.items():
- tmp_resolve = []
- for i in range(len(rules)):
- tmp = rules[i]
- if tmp:
- tmp, err = replace_gene_value(tmp, value)
- if err:
- not_found.extend(err)
- ris = control(None, tmp, resolve_none)
- if ris is False or ris == None:
- tmp_resolve.append(None)
- else:
- tmp_resolve.append(ris)
- flag = True
- else:
- tmp_resolve.append(None)
- resolve_rules[key] = tmp_resolve
- if flag is False:
- sys.exit('Execution aborted: no computable score' +
- ' (due to missing gene values) for class '
- + name + ', the class has been disregarded\n')
- return (resolve_rules, list(set(not_found)))
-
-################################# clustering ##################################
+ for n_clusters in range_n_clusters:
+
+ cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
+ cluster.fit_predict(dataset)
+ cluster_labels = cluster.labels_
+
+ silhouette_avg = silhouette_score(dataset, cluster_labels)
+ warning("For n_clusters =", n_clusters,
+ "The average silhouette_score is :", silhouette_avg)
+
+ plt.clf()
+ plt.figure(figsize=(10, 7))
+ plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score")
+ plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow')
+ s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png'
+ fig = plt.gcf()
+ fig.set_size_inches(10, 7, forward = True)
+ fig.savefig(s, dpi=200)
+
+ write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+
+
-def f_cluster(resolve_rules):
- os.makedirs('cluster_out')
- args = process_args(sys.argv)
- k_min = args.k_min
- k_max = args.k_max
- if k_min > k_max:
- warning('k range boundaries inverted.\n')
- tmp = k_min
- k_min = k_max
- k_max = tmp
- else:
- warning('k range correct.\n')
- cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index')
- for i in cluster_data.columns:
- tmp = cluster_data[i][0]
- if tmp == None:
- cluster_data = cluster_data.drop(columns=[i])
- distorsion = []
- for i in range(k_min, k_max+1):
- tmp_kmeans = KMeans(n_clusters = i,
- n_init = 100,
- max_iter = 300,
- random_state = 0).fit(cluster_data)
- distorsion.append(tmp_kmeans.inertia_)
- predict = tmp_kmeans.predict(cluster_data)
- predict = [x+1 for x in predict]
- classe = (pd.DataFrame(list(zip(cluster_data.index, predict)))).astype(str)
- dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv'
- classe.to_csv(dest, sep = '\t', index = False,
- header = ['Patient_ID', 'Class'])
- plt.figure(0)
- plt.plot(range(k_min, k_max+1), distorsion, marker = 'o')
- plt.xlabel('Number of cluster')
- plt.ylabel('Distorsion')
- plt.savefig(args.elbow, dpi = 240, format = 'pdf')
- if args.cond_hier == 'yes':
- import scipy.cluster.hierarchy as hier
- lin = hier.linkage(cluster_data, args.linkage)
- plt.figure(1)
- plt.figure(figsize=(10, 5))
- hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index)
- plt.savefig(args.dendro, dpi = 480, format = 'pdf')
- return None
+
+############################# main ###########################################
-################################# main ########################################
def main():
+ if not os.path.exists('clustering'):
+ os.makedirs('clustering')
+
args = process_args(sys.argv)
- if args.rules_selector == 'HMRcore':
- recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
- elif args.rules_selector == 'Recon':
- recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
- elif args.rules_selector == 'Custom':
- ids, rules, gene_in_rule = make_recon(args.custom)
- resolve_none = check_bool(args.none)
- dataset = read_dataset(args.data, args.name)
- dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
- type_gene = gene_type(dataset.iloc[0, 0], args.name)
- if args.rules_selector != 'Custom':
- genes = data_gene(dataset, type_gene, args.name, None)
- ids, rules = load_id_rules(recon.get(type_gene))
- elif args.rules_selector == 'Custom':
- genes = data_gene(dataset, type_gene, args.name, gene_in_rule)
- resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name)
- if err:
- warning('WARNING: gene\n' + str(err) + '\nnot found in class '
- + args.name + ', the expression level for this gene ' +
- 'will be considered NaN\n')
- f_cluster(resolve_rules)
- warning('Execution succeeded')
- return None
-
-###############################################################################
+
+ #Data read
+
+ X = read_dataset(args.input)
+ X = pd.DataFrame.to_dict(X, orient='list')
+ X = rewrite_input(X)
+ X = pd.DataFrame.from_dict(X, orient = 'index')
+
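+ # drop every column whose first entry is None: its score could not be computed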
+ for i in X.columns:
+ tmp = X[i][0]
+ if tmp == None:
+ X = X.drop(columns=[i])
+
+ X = pd.DataFrame.to_numpy(X)
+
+
+ if args.cluster_type == 'kmeans':
+ kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
+
+ if args.cluster_type == 'dbscan':
+ dbscan(X, args.eps, args.min_samples)
+
+ if args.cluster_type == 'hierarchy':
+ hierarchical_agglomerative(X, args.k_min, args.k_max)
+
+##############################################################################
if __name__ == "__main__":
main()
diff -r d0e7f14b773f -r c71ac0bb12de Marea/marea_cluster.xml
--- a/Marea/marea_cluster.xml Tue Oct 01 06:03:12 2019 -0400
+++ b/Marea/marea_cluster.xml Tue Oct 01 06:05:13 2019 -0400
@@ -1,148 +1,92 @@
-
- of Reaction Activity Scores
-
- marea_macros.xml
-
-
- pandas
- scipy
- cobra
- scikit-learn
- matplotlib
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- cond_hier['hier'] == 'yes'
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ of Reaction Activity Scores - 1.0.1
+
+ marea_macros.xml
+
+
+ pandas
+ scipy
+ cobra
+ scikit-learn
+ matplotlib
+ numpy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/HMRcoreMap.svg
--- a/marea-1.0.1/local/HMRcoreMap.svg Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,7702 +0,0 @@
-
-
-
-
\ No newline at end of file
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/HMRcore_genes.p
Binary file marea-1.0.1/local/HMRcore_genes.p has changed
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/HMRcore_rules.p
Binary file marea-1.0.1/local/HMRcore_rules.p has changed
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/Recon_genes.p
Binary file marea-1.0.1/local/Recon_genes.p has changed
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/Recon_rules.p
Binary file marea-1.0.1/local/Recon_rules.p has changed
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/local/desktop.ini
--- a/marea-1.0.1/local/desktop.ini Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-[.ShellClassInfo]
-IconResource=C:\WINDOWS\System32\SHELL32.dll,4
-[ViewState]
-Mode=
-Vid=
-FolderType=Generic
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/marea.py
--- a/marea-1.0.1/marea.py Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,825 +0,0 @@
-from __future__ import division
-import sys
-import pandas as pd
-import itertools as it
-import scipy.stats as st
-import collections
-import lxml.etree as ET
-import shutil
-import pickle as pk
-import math
-import os
-import argparse
-from svglib.svglib import svg2rlg
-from reportlab.graphics import renderPDF
-
-########################## argparse ##########################################
-
-def process_args(args):
- parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
- description = 'process some value\'s'+
- ' genes to create a comparison\'s map.')
- parser.add_argument('-rs', '--rules_selector',
- type = str,
- default = 'HMRcore',
- choices = ['HMRcore', 'Recon', 'Custom'],
- help = 'chose which type of dataset you want use')
- parser.add_argument('-cr', '--custom',
- type = str,
- help='your dataset if you want custom rules')
- parser.add_argument('-na', '--names',
- type = str,
- nargs = '+',
- help = 'input names')
- parser.add_argument('-n', '--none',
- type = str,
- default = 'true',
- choices = ['true', 'false'],
- help = 'compute Nan values')
- parser.add_argument('-pv' ,'--pValue',
- type = float,
- default = 0.05,
- help = 'P-Value threshold (default: %(default)s)')
- parser.add_argument('-fc', '--fChange',
- type = float,
- default = 1.5,
- help = 'Fold-Change threshold (default: %(default)s)')
- parser.add_argument('-td', '--tool_dir',
- type = str,
- required = True,
- help = 'your tool directory')
- parser.add_argument('-op', '--option',
- type = str,
- choices = ['datasets', 'dataset_class'],
- help='dataset or dataset and class')
- parser.add_argument('-ol', '--out_log',
- help = "Output log")
- parser.add_argument('-ids', '--input_datas',
- type = str,
- nargs = '+',
- help = 'input datasets')
- parser.add_argument('-id', '--input_data',
- type = str,
- help = 'input dataset')
- parser.add_argument('-ic', '--input_class',
- type = str,
- help = 'sample group specification')
- parser.add_argument('-cm', '--custom_map',
- type = str,
- help = 'custom map')
- parser.add_argument('-yn', '--yes_no',
- type = str,
- choices = ['yes', 'no'],
- help = 'if make or not custom map')
- parser.add_argument('-gs', '--generate_svg',
- type = str,
- default = 'true',
- choices = ['true', 'false'],
- help = 'generate svg map')
- parser.add_argument('-gp', '--generate_pdf',
- type = str,
- default = 'true',
- choices = ['true', 'false'],
- help = 'generate pdf map')
- parser.add_argument('-gr', '--generate_ras',
- type = str,
- default = 'true',
- choices = ['true', 'false'],
- help = 'generate reaction activity score')
- args = parser.parse_args()
- return args
-
-########################### warning ###########################################
-
-def warning(s):
- args = process_args(sys.argv)
- with open(args.out_log, 'a') as log:
- log.write(s)
-
-############################ dataset input ####################################
-
-def read_dataset(data, name):
- try:
- dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python')
- except pd.errors.EmptyDataError:
- sys.exit('Execution aborted: wrong format of ' + name + '\n')
- if len(dataset.columns) < 2:
- sys.exit('Execution aborted: wrong format of ' + name + '\n')
- return dataset
-
-############################ dataset name #####################################
-
-def name_dataset(name_data, count):
- if str(name_data) == 'Dataset':
- return str(name_data) + '_' + str(count)
- else:
- return str(name_data)
-
-############################ load id e rules ##################################
-
-def load_id_rules(reactions):
- ids, rules = [], []
- for key, value in reactions.items():
- ids.append(key)
- rules.append(value)
- return (ids, rules)
-
-############################ check_methods ####################################
-
-def gene_type(l, name):
- if check_hgnc(l):
- return 'hugo_id'
- elif check_ensembl(l):
- return 'ensembl_gene_id'
- elif check_symbol(l):
- return 'symbol'
- elif check_entrez(l):
- return 'entrez_id'
- else:
- sys.exit('Execution aborted:\n' +
- 'gene ID type in ' + name + ' not supported. Supported ID'+
- 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
-
-def check_hgnc(l):
- if len(l) > 5:
- if (l.upper()).startswith('HGNC:'):
- return l[5:].isdigit()
- else:
- return False
- else:
- return False
-
-def check_ensembl(l):
- if len(l) == 15:
- if (l.upper()).startswith('ENS'):
- return l[4:].isdigit()
- else:
- return False
- else:
- return False
-
-def check_symbol(l):
- if len(l) > 0:
- if l[0].isalpha() and l[1:].isalnum():
- return True
- else:
- return False
- else:
- return False
-
-def check_entrez(l):
- if len(l) > 0:
- return l.isdigit()
- else:
- return False
-
-def check_bool(b):
- if b == 'true':
- return True
- elif b == 'false':
- return False
-
-############################ resolve_methods ##################################
-
-def replace_gene_value(l, d):
- tmp = []
- err = []
- while l:
- if isinstance(l[0], list):
- tmp_rules, tmp_err = replace_gene_value(l[0], d)
- tmp.append(tmp_rules)
- err.extend(tmp_err)
- else:
- value = replace_gene(l[0], d)
- tmp.append(value)
- if value == None:
- err.append(l[0])
- l = l[1:]
- return (tmp, err)
-
-def replace_gene(l, d):
- if l =='and' or l == 'or':
- return l
- else:
- value = d.get(l, None)
- if not(value == None or isinstance(value, (int, float))):
- sys.exit('Execution aborted: ' + value + ' value not valid\n')
- return value
-
-def computes(val1, op, val2, cn):
- if val1 != None and val2 != None:
- if op == 'and':
- return min(val1, val2)
- else:
- return val1 + val2
- elif op == 'and':
- if cn is True:
- if val1 != None:
- return val1
- elif val2 != None:
- return val2
- else:
- return None
- else:
- return None
- else:
- if val1 != None:
- return val1
- elif val2 != None:
- return val2
- else:
- return None
-
-def control(ris, l, cn):
- if len(l) == 1:
- if isinstance(l[0], (float, int)) or l[0] == None:
- return l[0]
- elif isinstance(l[0], list):
- return control(None, l[0], cn)
- else:
- return False
- elif len(l) > 2:
- return control_list(ris, l, cn)
- else:
- return False
-
-def control_list(ris, l, cn):
- while l:
- if len(l) == 1:
- return False
- elif (isinstance(l[0], (float, int)) or
- l[0] == None) and l[1] in ['and', 'or']:
- if isinstance(l[2], (float, int)) or l[2] == None:
- ris = computes(l[0], l[1], l[2], cn)
- elif isinstance(l[2], list):
- tmp = control(None, l[2], cn)
- if tmp is False:
- return False
- else:
- ris = computes(l[0], l[1], tmp, cn)
- else:
- return False
- l = l[3:]
- elif l[0] in ['and', 'or']:
- if isinstance(l[1], (float, int)) or l[1] == None:
- ris = computes(ris, l[0], l[1], cn)
- elif isinstance(l[1], list):
- tmp = control(None,l[1], cn)
- if tmp is False:
- return False
- else:
- ris = computes(ris, l[0], tmp, cn)
- else:
- return False
- l = l[2:]
- elif isinstance(l[0], list) and l[1] in ['and', 'or']:
- if isinstance(l[2], (float, int)) or l[2] == None:
- tmp = control(None, l[0], cn)
- if tmp is False:
- return False
- else:
- ris = computes(tmp, l[1], l[2], cn)
- elif isinstance(l[2], list):
- tmp = control(None, l[0], cn)
- tmp2 = control(None, l[2], cn)
- if tmp is False or tmp2 is False:
- return False
- else:
- ris = computes(tmp, l[1], tmp2, cn)
- else:
- return False
- l = l[3:]
- else:
- return False
- return ris
-
-############################ map_methods ######################################
-
-def fold_change(avg1, avg2):
- if avg1 == 0 and avg2 == 0:
- return 0
- elif avg1 == 0:
- return '-INF'
- elif avg2 == 0:
- return 'INF'
- else:
- return math.log(avg1 / avg2, 2)
-
-def fix_style(l, col, width, dash):
- tmp = l.split(';')
- flag_col = False
- flag_width = False
- flag_dash = False
- for i in range(len(tmp)):
- if tmp[i].startswith('stroke:'):
- tmp[i] = 'stroke:' + col
- flag_col = True
- if tmp[i].startswith('stroke-width:'):
- tmp[i] = 'stroke-width:' + width
- flag_width = True
- if tmp[i].startswith('stroke-dasharray:'):
- tmp[i] = 'stroke-dasharray:' + dash
- flag_dash = True
- if not flag_col:
- tmp.append('stroke:' + col)
- if not flag_width:
- tmp.append('stroke-width:' + width)
- if not flag_dash:
- tmp.append('stroke-dasharray:' + dash)
- return ';'.join(tmp)
-
-def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
- maxT = 12
- minT = 2
- grey = '#BEBEBE'
- blue = '#0000FF'
- red = '#E41A1C'
- for el in core_map.iter():
- el_id = str(el.get('id'))
- if el_id.startswith('R_'):
- tmp = d.get(el_id[2:])
- if tmp != None:
- p_val = tmp[0]
- f_c = tmp[1]
- if p_val < threshold_P_V:
- if not isinstance(f_c, str):
- if abs(f_c) < math.log(threshold_F_C, 2):
- col = grey
- width = str(minT)
- else:
- if f_c < 0:
- col = blue
- elif f_c > 0:
- col = red
- width = str(max((abs(f_c) * maxT) / max_F_C, minT))
- else:
- if f_c == '-INF':
- col = blue
- elif f_c == 'INF':
- col = red
- width = str(maxT)
- dash = 'none'
- else:
- dash = '5,5'
- col = grey
- width = str(minT)
- el.set('style', fix_style(el.get('style'), col, width, dash))
- return core_map
-
-############################ make recon #######################################
-
-def check_and_doWord(l):
- tmp = []
- tmp_genes = []
- count = 0
- while l:
- if count >= 0:
- if l[0] == '(':
- count += 1
- tmp.append(l[0])
- l.pop(0)
- elif l[0] == ')':
- count -= 1
- tmp.append(l[0])
- l.pop(0)
- elif l[0] == ' ':
- l.pop(0)
- else:
- word = []
- while l:
- if l[0] in [' ', '(', ')']:
- break
- else:
- word.append(l[0])
- l.pop(0)
- word = ''.join(word)
- tmp.append(word)
- if not(word in ['or', 'and']):
- tmp_genes.append(word)
- else:
- return False
- if count == 0:
- return (tmp, tmp_genes)
- else:
- return False
-
-def brackets_to_list(l):
- tmp = []
- while l:
- if l[0] == '(':
- l.pop(0)
- tmp.append(resolve_brackets(l))
- else:
- tmp.append(l[0])
- l.pop(0)
- return tmp
-
-def resolve_brackets(l):
- tmp = []
- while l[0] != ')':
- if l[0] == '(':
- l.pop(0)
- tmp.append(resolve_brackets(l))
- else:
- tmp.append(l[0])
- l.pop(0)
- l.pop(0)
- return tmp
-
-def priorityAND(l):
- tmp = []
- flag = True
- while l:
- if len(l) == 1:
- if isinstance(l[0], list):
- tmp.append(priorityAND(l[0]))
- else:
- tmp.append(l[0])
- l = l[1:]
- elif l[0] == 'or':
- tmp.append(l[0])
- flag = False
- l = l[1:]
- elif l[1] == 'or':
- if isinstance(l[0], list):
- tmp.append(priorityAND(l[0]))
- else:
- tmp.append(l[0])
- tmp.append(l[1])
- flag = False
- l = l[2:]
- elif l[1] == 'and':
- tmpAnd = []
- if isinstance(l[0], list):
- tmpAnd.append(priorityAND(l[0]))
- else:
- tmpAnd.append(l[0])
- tmpAnd.append(l[1])
- if isinstance(l[2], list):
- tmpAnd.append(priorityAND(l[2]))
- else:
- tmpAnd.append(l[2])
- l = l[3:]
- while l:
- if l[0] == 'and':
- tmpAnd.append(l[0])
- if isinstance(l[1], list):
- tmpAnd.append(priorityAND(l[1]))
- else:
- tmpAnd.append(l[1])
- l = l[2:]
- elif l[0] == 'or':
- flag = False
- break
- if flag == True: #when there are only AND in list
- tmp.extend(tmpAnd)
- elif flag == False:
- tmp.append(tmpAnd)
- return tmp
-
-def checkRule(l):
- if len(l) == 1:
- if isinstance(l[0], list):
- if checkRule(l[0]) is False:
- return False
- elif len(l) > 2:
- if checkRule2(l) is False:
- return False
- else:
- return False
- return True
-
-def checkRule2(l):
- while l:
- if len(l) == 1:
- return False
- elif isinstance(l[0], list) and l[1] in ['and', 'or']:
- if checkRule(l[0]) is False:
- return False
- if isinstance(l[2], list):
- if checkRule(l[2]) is False:
- return False
- l = l[3:]
- elif l[1] in ['and', 'or']:
- if isinstance(l[2], list):
- if checkRule(l[2]) is False:
- return False
- l = l[3:]
- elif l[0] in ['and', 'or']:
- if isinstance(l[1], list):
- if checkRule(l[1]) is False:
- return False
- l = l[2:]
- else:
- return False
- return True
-
-def do_rules(rules):
- split_rules = []
- err_rules = []
- tmp_gene_in_rule = []
- for i in range(len(rules)):
- tmp = list(rules[i])
- if tmp:
- tmp, tmp_genes = check_and_doWord(tmp)
- tmp_gene_in_rule.extend(tmp_genes)
- if tmp is False:
- split_rules.append([])
- err_rules.append(rules[i])
- else:
- tmp = brackets_to_list(tmp)
- if checkRule(tmp):
- split_rules.append(priorityAND(tmp))
- else:
- split_rules.append([])
- err_rules.append(rules[i])
- else:
- split_rules.append([])
- if err_rules:
- warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
- return (split_rules, list(set(tmp_gene_in_rule)))
-
-def make_recon(data):
- try:
- import cobra as cb
- import warnings
- with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- recon = cb.io.read_sbml_model(data)
- react = recon.reactions
- rules = [react[i].gene_reaction_rule for i in range(len(react))]
- ids = [react[i].id for i in range(len(react))]
- except cb.io.sbml3.CobraSBMLError:
- try:
- data = (pd.read_csv(data, sep = '\t', dtype = str, engine='python')).fillna('')
- if len(data.columns) < 2:
- sys.exit('Execution aborted: wrong format of '+
- 'custom datarules\n')
- if not len(data.columns) == 2:
- warning('Warning: more than 2 columns in custom datarules.\n' +
- 'Extra columns have been disregarded\n')
- ids = list(data.iloc[:, 0])
- rules = list(data.iloc[:, 1])
- except pd.errors.EmptyDataError:
- sys.exit('Execution aborted: wrong format of custom datarules\n')
- except pd.errors.ParserError:
- sys.exit('Execution aborted: wrong format of custom datarules\n')
- split_rules, tmp_genes = do_rules(rules)
- gene_in_rule = {}
- for i in tmp_genes:
- gene_in_rule[i] = 'ok'
- return (ids, split_rules, gene_in_rule)
-
-############################ gene #############################################
-
-def data_gene(gene, type_gene, name, gene_custom):
- args = process_args(sys.argv)
- for i in range(len(gene)):
- tmp = gene.iloc[i, 0]
- if tmp.startswith(' ') or tmp.endswith(' '):
- gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
- gene_dup = [item for item, count in
- collections.Counter(gene[gene.columns[0]]).items() if count > 1]
- pat_dup = [item for item, count in
- collections.Counter(list(gene.columns)).items() if count > 1]
- if gene_dup:
- if gene_custom == None:
- if args.rules_selector == 'HMRcore':
- gene_in_rule = pk.load(open(args.tool_dir +
- '/local/HMRcore_genes.p', 'rb'))
- elif args.rules_selector == 'Recon':
- gene_in_rule = pk.load(open(args.tool_dir +
- '/local/Recon_genes.p', 'rb'))
- gene_in_rule = gene_in_rule.get(type_gene)
- else:
- gene_in_rule = gene_custom
- tmp = []
- for i in gene_dup:
- if gene_in_rule.get(i) == 'ok':
- tmp.append(i)
- if tmp:
- sys.exit('Execution aborted because gene ID '
- +str(tmp)+' in '+name+' is duplicated\n')
- if pat_dup:
- warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name +
- '\n')
- return (gene.set_index(gene.columns[0])).to_dict()
-
-############################ resolve ##########################################
-
-def resolve(genes, rules, ids, resolve_none, name):
- resolve_rules = {}
- not_found = []
- flag = False
- for key, value in genes.items():
- tmp_resolve = []
- for i in range(len(rules)):
- tmp = rules[i]
- if tmp:
- tmp, err = replace_gene_value(tmp, value)
- if err:
- not_found.extend(err)
- ris = control(None, tmp, resolve_none)
- if ris is False or ris == None:
- tmp_resolve.append(None)
- else:
- tmp_resolve.append(ris)
- flag = True
- else:
- tmp_resolve.append(None)
- resolve_rules[key] = tmp_resolve
- if flag is False:
- warning('Warning: no computable score (due to missing gene values)' +
- 'for class ' + name + ', the class has been disregarded\n')
- return (None, None)
- return (resolve_rules, list(set(not_found)))
-
-############################ split class ######################################
-
-def split_class(classes, resolve_rules):
- class_pat = {}
- for i in range(len(classes)):
- classe = classes.iloc[i, 1]
- if not pd.isnull(classe):
- l = []
- for j in range(i, len(classes)):
- if classes.iloc[j, 1] == classe:
- pat_id = classes.iloc[j, 0]
- tmp = resolve_rules.get(pat_id, None)
- if tmp != None:
- l.append(tmp)
- classes.iloc[j, 1] = None
- if l:
- class_pat[classe] = list(map(list, zip(*l)))
- else:
- warning('Warning: no sample found in class ' + classe +
- ', the class has been disregarded\n')
- return class_pat
-
-############################ create_ras #######################################
-
-def create_ras (resolve_rules, dataset_name):
-
- if resolve_rules == None:
- warning("Couldn't generate RAS for current dataset: " + dataset_name)
-
- for geni in resolve_rules.values():
- for i, valori in enumerate(geni):
- if valori == None:
- geni[i] = 'None'
-
- output_ras = pd.DataFrame.from_dict(resolve_rules)
- output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
-
- text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
- text_file.write(output_to_csv)
- text_file.close()
-
-############################ map ##############################################
-
-def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
- args = process_args(sys.argv)
- if (not class_pat) or (len(class_pat.keys()) < 2):
- sys.exit('Execution aborted: classes provided for comparisons are ' +
- 'less than two\n')
- for i, j in it.combinations(class_pat.keys(), 2):
- tmp = {}
- count = 0
- max_F_C = 0
- for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
- try:
- stat_D, p_value = st.ks_2samp(l1, l2)
- avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
- if not isinstance(avg, str):
- if max_F_C < abs(avg):
- max_F_C = abs(avg)
- tmp[ids[count]] = [float(p_value), avg]
- count += 1
- except (TypeError, ZeroDivisionError):
- count += 1
- tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
- tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
- tmp_csv = tmp_csv.reset_index()
- header = ['ids', 'P_Value', 'Average']
- tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
-
- if create_svg or create_pdf:
- if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
- and args.yes_no == 'yes'):
- fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
- file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
- with open(file_svg, 'wb') as new_map:
- new_map.write(ET.tostring(core_map))
-
-
- if create_pdf:
- file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
- renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
-
- if not create_svg:
- #Ho utilizzato il file svg per generare il pdf,
- #ma l'utente non ne ha richiesto il ritorno, quindi
- #lo elimino
- os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
-
- return None
-
-############################ MAIN #############################################
-
-def main():
-    args = process_args(sys.argv)
-
-    create_svg = check_bool(args.generate_svg)
-    create_pdf = check_bool(args.generate_pdf)
-    generate_ras = check_bool(args.generate_ras)
-
-    if not os.path.exists('result'):
-        os.makedirs('result')
-
-    if generate_ras and not os.path.exists('ras'):
-        os.makedirs('ras')
-
-    if args.rules_selector == 'HMRcore':
-        recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
-    elif args.rules_selector == 'Recon':
-        recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
-    elif args.rules_selector == 'Custom':
-        ids, rules, gene_in_rule = make_recon(args.custom)
-
-    resolve_none = check_bool(args.none)
-
-    class_pat = {}
-
-    if args.option == 'datasets':
-        num = 1
-        for i, j in zip(args.input_datas, args.names):
-            name = name_dataset(j, num)
-            dataset = read_dataset(i, name)
-
-            dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-
-            type_gene = gene_type(dataset.iloc[0, 0], name)
-
-            if args.rules_selector != 'Custom':
-                genes = data_gene(dataset, type_gene, name, None)
-                ids, rules = load_id_rules(recon.get(type_gene))
-            elif args.rules_selector == 'Custom':
-                genes = data_gene(dataset, type_gene, name, gene_in_rule)
-
-            resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
-
-            if generate_ras:
-                create_ras(resolve_rules, name)
-
-            if err:
-                warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                        + name + ', the expression level for this gene ' +
-                        'will be considered NaN\n')
-            if resolve_rules is not None:
-                class_pat[name] = list(map(list, zip(*resolve_rules.values())))
-            num += 1
-    elif args.option == 'dataset_class':
-        name = 'RNAseq'
-        dataset = read_dataset(args.input_data, name)
-        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-        type_gene = gene_type(dataset.iloc[0, 0], name)
-        classes = read_dataset(args.input_class, 'class')
-        if not len(classes.columns) == 2:
-            warning('Warning: more than 2 columns in class file. Extra ' +
-                    'columns have been disregarded\n')
-        classes = classes.astype(str)
-        if args.rules_selector != 'Custom':
-            genes = data_gene(dataset, type_gene, name, None)
-            ids, rules = load_id_rules(recon.get(type_gene))
-        elif args.rules_selector == 'Custom':
-            genes = data_gene(dataset, type_gene, name, gene_in_rule)
-        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
-        if err:
-            warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                    + name + ', the expression level for this gene ' +
-                    'will be considered NaN\n')
-        if resolve_rules is not None:
-            class_pat = split_class(classes, resolve_rules)
-
-    if args.rules_selector == 'Custom':
-        if args.yes_no == 'yes':
-            try:
-                core_map = ET.parse(args.custom_map)
-            except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
-                sys.exit('Execution aborted: custom map in wrong format')
-        elif args.yes_no == 'no':
-            core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
-    else:
-        core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
-
-    maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
-
-    print('Execution succeeded')
-
-    return None
-
-###############################################################################
-
-if __name__ == "__main__":
-    main()
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/marea.xml
--- a/marea-1.0.1/marea.xml Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,223 +0,0 @@
-[Galaxy tool wrapper (XML) removed; its markup was lost in extraction.
-Surviving fragments: tool title fragment "for Galaxy - 1.0.1"; macro import
-marea_macros.xml; requirements: pandas, scipy, cobra, lxml, svglib,
-reportlab; an output filter "advanced['choice'] and advanced['generateRas']".]
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/marea_cluster.py
--- a/marea-1.0.1/marea_cluster.py Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,417 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 3 19:51:00 2019
-
-@author: Narger
-"""
-
-import sys
-import argparse
-import os
-from sklearn.datasets import make_blobs
-from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
-from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster
-import matplotlib.pyplot as plt
-import scipy.cluster.hierarchy as shc
-import matplotlib.cm as cm
-import numpy as np
-import pandas as pd
-
-################################# process args ###############################
-
-def process_args(args):
-    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
-                                     description = 'cluster samples by their ' +
-                                     'values to create classes.')
-
-    parser.add_argument('-ol', '--out_log',
-                        help = "Output log")
-
-    parser.add_argument('-in', '--input',
-                        type = str,
-                        help = 'input dataset')
-
-    parser.add_argument('-cy', '--cluster_type',
-                        type = str,
-                        # 'meanshift' is accepted here but has no handler in main()
-                        choices = ['kmeans', 'meanshift', 'dbscan', 'hierarchy'],
-                        default = 'kmeans',
-                        help = 'choose clustering algorithm')
-
-    parser.add_argument('-k1', '--k_min',
-                        type = int,
-                        default = 2,
-                        help = 'choose minimum cluster number to be generated')
-
-    parser.add_argument('-k2', '--k_max',
-                        type = int,
-                        default = 7,
-                        help = 'choose maximum cluster number to be generated')
-
-    parser.add_argument('-el', '--elbow',
-                        type = str,
-                        default = 'false',
-                        choices = ['true', 'false'],
-                        help = 'choose if you want to generate an elbow plot for kmeans')
-
-    parser.add_argument('-si', '--silhouette',
-                        type = str,
-                        default = 'false',
-                        choices = ['true', 'false'],
-                        help = 'choose if you want silhouette plots')
-
-    parser.add_argument('-db', '--davies',
-                        type = str,
-                        default = 'false',
-                        choices = ['true', 'false'],
-                        help = 'choose if you want Davies-Bouldin scores')
-
-    parser.add_argument('-td', '--tool_dir',
-                        type = str,
-                        required = True,
-                        help = 'your tool directory')
-
-    parser.add_argument('-ms', '--min_samples',
-                        type = int,
-                        help = 'min samples for dbscan (optional)')
-
-    parser.add_argument('-ep', '--eps',
-                        # eps is a distance threshold, so fractional values are allowed
-                        type = float,
-                        help = 'eps for dbscan (optional)')
-
-    args = parser.parse_args()
-    return args
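-# When --eps (and --min_samples) are omitted, dbscan() below falls back to
-# scikit-learn's defaults (eps=0.5, min_samples=5 in the 0.21-era releases
-# current when this tool was written).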
-
-########################### warning ##########################################
-
-def warning(s):
-    args = process_args(sys.argv)
-    with open(args.out_log, 'a') as log:
-        log.write(s + "\n\n")
-    print(s)
-
-########################## read dataset ######################################
-
-def read_dataset(dataset):
-    try:
-        dataset = pd.read_csv(dataset, sep = '\t', header = 0)
-    except pd.errors.EmptyDataError:
-        sys.exit('Execution aborted: wrong format of dataset\n')
-    if len(dataset.columns) < 2:
-        sys.exit('Execution aborted: wrong format of dataset\n')
-    return dataset
-
-############################ rewrite_input ###################################
-
-def rewrite_input(dataset):
-    # Rewrite the dataset as a dictionary of lists,
-    # not as a dictionary of dictionaries.
-    for key, val in dataset.items():
-        l = []
-        for i in val:
-            if i == 'None':
-                l.append(None)
-            else:
-                l.append(float(i))
-
-        dataset[key] = l
-
-    return dataset
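-# This mirrors create_ras in marea.py, which serializes missing reaction
-# activity scores as the literal string 'None'; here they become real None
-# values so they can be filtered out before clustering.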
-
-############################## write to csv ##################################
-
-def write_to_csv(dataset, labels, name):
-    list_labels = labels
-    list_values = dataset
-
-    list_values = list_values.tolist()
-    d = {'Label' : list_labels, 'Value' : list_values}
-
-    df = pd.DataFrame(d, columns=['Value', 'Label'])
-
-    # Callers already pass a full '.tsv' path; appending another extension
-    # here produced files named '*.tsv.tsv'.
-    dest = name
-    df.to_csv(dest, sep = '\t', index = False,
-              header = ['Value', 'Label'])
-
-####################### find the maximum in a list ###########################
-
-def max_index(lista):
-    best = -1
-    best_index = 0
-    for i in range(len(lista)):
-        if lista[i] > best:
-            best = lista[i]
-            best_index = i
-
-    return best_index
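-# For the non-empty score lists used here this is equivalent to numpy's
-# argmax; silhouette scores lie in [-1, 1], so the -1 sentinel is safe for
-# any score strictly greater than -1.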
-
-################################ kmeans #####################################
-
-def kmeans(k_min, k_max, dataset, elbow, silhouette, davies):
-    if not os.path.exists('clustering/kmeans_output'):
-        os.makedirs('clustering/kmeans_output')
-
-    elbow = (elbow == 'true')
-    silhouette = (silhouette == 'true')
-    davies = (davies == 'true')
-
-    range_n_clusters = [i for i in range(k_min, k_max + 1)]
-    distortions = []
-    scores = []
-    all_labels = []
-
-    for n_clusters in range_n_clusters:
-        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
-        cluster_labels = clusterer.fit_predict(dataset)
-
-        all_labels.append(cluster_labels)
-        silhouette_avg = silhouette_score(dataset, cluster_labels)
-        scores.append(silhouette_avg)
-        # fit_predict has already fitted the model, so its inertia can be
-        # reused directly instead of refitting.
-        distortions.append(clusterer.inertia_)
-
-    best = max_index(scores) + k_min
-
-    for i in range(len(all_labels)):
-        prefix = ''
-        if (i + k_min == best):
-            prefix = '_BEST'
-
-        write_to_csv(dataset, all_labels[i],
-                     'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
-
-        if davies:
-            with np.errstate(divide='ignore', invalid='ignore'):
-                davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
-            warning("\nFor n_clusters = " + str(i + k_min) +
-                    " the average Davies-Bouldin score is: " + str(davies_bouldin))
-
-        if silhouette:
-            silhouette_draw(dataset, all_labels[i], i + k_min,
-                            'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
-
-    if elbow:
-        elbow_plot(distortions, k_min, k_max)
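-# The best k is the one maximizing the average silhouette score over
-# [k_min, k_max]; its output file is tagged '_BEST'. random_state=10 makes
-# the k-means runs reproducible across executions.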
-
-############################## elbow_plot ####################################
-
-def elbow_plot(distortions, k_min, k_max):
-    plt.figure(0)
-    plt.plot(range(k_min, k_max + 1), distortions, marker = 'o')
-    plt.xlabel('Number of clusters')
-    plt.ylabel('Distortion')
-    s = 'clustering/kmeans_output/elbow_plot.png'
-    fig = plt.gcf()
-    fig.set_size_inches(18.5, 10.5, forward = True)
-    fig.savefig(s, dpi=100)
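-# Reading the elbow plot: distortion (the within-cluster sum of squares)
-# always decreases as k grows; a reasonable k sits at the bend where adding
-# further clusters stops paying for itself.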
-
-
-############################## silhouette plot ###############################
-def silhouette_draw(dataset, labels, n_clusters, path):
-    silhouette_avg = silhouette_score(dataset, labels)
-    warning("For n_clusters = " + str(n_clusters) +
-            " the average silhouette_score is: " + str(silhouette_avg))
-
-    plt.close('all')
-    # Create a figure with a single subplot for the silhouette plot
-    fig, ax1 = plt.subplots(1, 1)
-
-    fig.set_size_inches(18, 7)
-
-    # The silhouette coefficient ranges over [-1, 1]
-    ax1.set_xlim([-1, 1])
-    # The (n_clusters + 1) * 10 inserts blank space between the silhouette
-    # plots of individual clusters, to demarcate them clearly.
-    ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
-
-    # Compute the silhouette score of each sample
-    sample_silhouette_values = silhouette_samples(dataset, labels)
-
-    y_lower = 10
-    for i in range(n_clusters):
-        # Aggregate the silhouette scores of the samples belonging to
-        # cluster i, and sort them
-        ith_cluster_silhouette_values = \
-            sample_silhouette_values[labels == i]
-
-        ith_cluster_silhouette_values.sort()
-
-        size_cluster_i = ith_cluster_silhouette_values.shape[0]
-        y_upper = y_lower + size_cluster_i
-
-        color = cm.nipy_spectral(float(i) / n_clusters)
-        ax1.fill_betweenx(np.arange(y_lower, y_upper),
-                          0, ith_cluster_silhouette_values,
-                          facecolor=color, edgecolor=color, alpha=0.7)
-
-        # Label the silhouette plots with their cluster numbers at the middle
-        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
-
-        # Compute the new y_lower for the next plot
-        y_lower = y_upper + 10  # 10 for the blank space between clusters
-
-    ax1.set_title("The silhouette plot for the various clusters.")
-    ax1.set_xlabel("The silhouette coefficient values")
-    ax1.set_ylabel("Cluster label")
-
-    # Vertical line marking the average silhouette score of all the values
-    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
-
-    ax1.set_yticks([])  # Clear the y-axis labels / ticks
-    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
-
-    plt.suptitle(("Silhouette analysis for clustering on sample data "
-                  "with n_clusters = " + str(n_clusters) +
-                  "\nAverage silhouette_score = " + str(silhouette_avg)),
-                 fontsize=12, fontweight='bold')
-
-    plt.savefig(path, bbox_inches='tight')
-
-######################## dbscan ##############################################
-
-def dbscan(dataset, eps, min_samples):
-    if not os.path.exists('clustering/dbscan_output'):
-        os.makedirs('clustering/dbscan_output')
-
-    # Only forward the parameters the user actually supplied; passing
-    # min_samples=None to DBSCAN would fail.
-    if eps is not None and min_samples is not None:
-        clusterer = DBSCAN(eps = eps, min_samples = min_samples)
-    elif eps is not None:
-        clusterer = DBSCAN(eps = eps)
-    else:
-        clusterer = DBSCAN()
-
-    clustering = clusterer.fit(dataset)
-
-    core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
-    core_samples_mask[clustering.core_sample_indices_] = True
-    labels = clustering.labels_
-
-    # Number of clusters in labels, ignoring noise if present.
-    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
-
-    silhouette_avg = silhouette_score(dataset, labels)
-    warning("For n_clusters = " + str(n_clusters_) +
-            " the average silhouette_score is: " + str(silhouette_avg))
-
-    ## TODO: plots for DBSCAN (no centers) and HIERARCHICAL
-
-    # Black removed and is used for noise instead.
-    unique_labels = set(labels)
-    colors = [plt.cm.Spectral(each)
-              for each in np.linspace(0, 1, len(unique_labels))]
-    for k, col in zip(unique_labels, colors):
-        if k == -1:
-            # Black used for noise.
-            col = [0, 0, 0, 1]
-
-        class_member_mask = (labels == k)
-
-        # Core samples get large markers, border samples small ones;
-        # only the first two features are plotted.
-        xy = dataset[class_member_mask & core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=14)
-
-        xy = dataset[class_member_mask & ~core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=6)
-
-    plt.title('Estimated number of clusters: %d' % n_clusters_)
-    s = 'clustering/dbscan_output/dbscan_plot.png'
-    fig = plt.gcf()
-    fig.set_size_inches(18.5, 10.5, forward = True)
-    fig.savefig(s, dpi=100)
-
-    write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv')
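-# Caveat: silhouette_score above treats DBSCAN's noise label (-1) as if it
-# were a cluster of its own, which can bias the reported score when many
-# points are classified as noise.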
-
-########################## hierarchical ######################################
-
-def hierarchical_agglomerative(dataset, k_min, k_max):
-
-    if not os.path.exists('clustering/agglomerative_output'):
-        os.makedirs('clustering/agglomerative_output')
-
-    plt.figure(figsize=(10, 7))
-    plt.title("Dendrogram")
-    shc.dendrogram(shc.linkage(dataset, method='ward'))
-    fig = plt.gcf()
-    fig.savefig('clustering/agglomerative_output/dendrogram.png', dpi=200)
-
-    range_n_clusters = [i for i in range(k_min, k_max + 1)]
-
-    for n_clusters in range_n_clusters:
-        cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
-        cluster.fit_predict(dataset)
-        cluster_labels = cluster.labels_
-
-        silhouette_avg = silhouette_score(dataset, cluster_labels)
-        # warning() takes a single string, so the pieces are concatenated here
-        warning("For n_clusters = " + str(n_clusters) +
-                " the average silhouette_score is: " + str(silhouette_avg))
-
-        plt.clf()
-        plt.figure(figsize=(10, 7))
-        plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) +
-                  " clusters and " + str(silhouette_avg) + " silhouette score")
-        plt.scatter(dataset[:, 0], dataset[:, 1], c = cluster_labels, cmap='rainbow')
-        s = 'clustering/agglomerative_output/hierarchical_' + str(n_clusters) + '_clusters.png'
-        fig = plt.gcf()
-        fig.set_size_inches(10, 7, forward = True)
-        fig.savefig(s, dpi=200)
-
-        write_to_csv(dataset, cluster_labels,
-                     'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
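-# Ward linkage minimizes the within-cluster variance and, in scikit-learn,
-# is only defined for euclidean distances, hence affinity='euclidean' above.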
-
-
-
-
-############################# main ###########################################
-
-def main():
-    if not os.path.exists('clustering'):
-        os.makedirs('clustering')
-
-    args = process_args(sys.argv)
-
-    # Data read
-    X = read_dataset(args.input)
-    X = pd.DataFrame.to_dict(X, orient='list')
-    X = rewrite_input(X)
-    X = pd.DataFrame.from_dict(X, orient = 'index')
-
-    # Drop the columns (reactions) whose score is missing for the first sample
-    for i in X.columns:
-        tmp = X[i][0]
-        if tmp is None:
-            X = X.drop(columns=[i])
-
-    X = pd.DataFrame.to_numpy(X)
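-# After this step rows are samples and columns are the surviving reactions.
-# Only the first sample's value is inspected when dropping a column, so
-# scores missing only for later samples are not filtered here.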
-
-
-    if args.cluster_type == 'kmeans':
-        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
-
-    if args.cluster_type == 'dbscan':
-        dbscan(X, args.eps, args.min_samples)
-
-    if args.cluster_type == 'hierarchy':
-        hierarchical_agglomerative(X, args.k_min, args.k_max)
-
-##############################################################################
-
-if __name__ == "__main__":
-    main()
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/marea_cluster.xml
--- a/marea-1.0.1/marea_cluster.xml Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,92 +0,0 @@
-[Galaxy tool wrapper (XML) removed; its markup was lost in extraction.
-Surviving fragments: tool title fragment "of Reaction Activity Scores -
-1.0.1"; macro import marea_macros.xml; requirements: pandas, scipy, cobra,
-scikit-learn, matplotlib, numpy.]
diff -r d0e7f14b773f -r c71ac0bb12de marea-1.0.1/marea_macros.xml
--- a/marea-1.0.1/marea_macros.xml Tue Oct 01 06:03:12 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,92 +0,0 @@
-[Macro definitions (XML) removed; the markup was lost in extraction. The
-surviving help-text fragments follow.]
-
-+--------------------+-------------------------------+
-| id | rule (with entrez-id) |
-+====================+===============================+
-| SHMT1 | 155060 or 10357 |
-+--------------------+-------------------------------+
-| NIT2 | 155060 or 100134869 |
-+--------------------+-------------------------------+
-| GOT1_GOT2_GOT1L1_2 | 155060 and 100134869 or 10357 |
-+--------------------+-------------------------------+
-
-|
-
-+------------+------------+------------+------------+
-| Hugo_ID | TCGAA62670 | TCGAA62671 | TCGAA62672 |
-+============+============+============+============+
-| HGNC:24086 | 0.523167 | 0.371355 | 0.925661 |
-+------------+------------+------------+------------+
-| HGNC:24086 | 0.568765 | 0.765567 | 0.456789 |
-+------------+------------+------------+------------+
-| HGNC:9876 | 0.876545 | 0.768933 | 0.987654 |
-+------------+------------+------------+------------+
-| HGNC:9 | 0.456788 | 0.876543 | 0.876542 |
-+------------+------------+------------+------------+
-| HGNC:23 | 0.876543 | 0.786543 | 0.897654 |
-+------------+------------+------------+------------+
-
-|
-
-+-------------+------------+------------+------------+
-| Hugo_Symbol | TCGAA62670 | TCGAA62671 | TCGAA62672 |
-+=============+============+============+============+
-| A1BG | 0.523167 | 0.371355 | 0.925661 |
-+-------------+------------+------------+------------+
-| A1CF | 0.568765 | 0.765567 | 0.456789 |
-+-------------+------------+------------+------------+
-| A2M | 0.876545 | 0.768933 | 0.987654 |
-+-------------+------------+------------+------------+
-| A4GALT | 0.456788 | 0.876543 | 0.876542 |
-+-------------+------------+------------+------------+
-| M664Y65 | 0.876543 | 0.786543 | 0.897654 |
-+-------------+------------+------------+------------+
-
-|
-
-This tool is developed by the `BIMIB`_ at the `Department of Informatics, Systems and Communications`_ of `University of Milan - Bicocca`_.
-
-.. _BIMIB: http://sito di bio.org
-.. _Department of Informatics, Systems and Communications: http://www.disco.unimib.it/go/Home/English
-.. _University of Milan - Bicocca: https://www.unimib.it/
-
-@online{lh32017,
- author = {Alex Graudenzi and Davide Maspero and Claudio Isella and Marzia Di Filippo and Giancarlo Mauri and Enzo Medico and Marco Antoniotti and Chiara Damiani},
- year = {2018},
- title = {MaREA: Metabolic feature extraction, enrichment and visualization of RNAseq},
- publisher = {bioRxiv},
- journal = {bioRxiv},
- url = {https://www.biorxiv.org/content/early/2018/01/16/248724},
-}
-