# HG changeset patch
# User bimib
# Date 1571043668 14400
# Node ID e6831924df01142b148c0f0f1f32d82aedb37953
# Parent 8c480c977a12d5d4e13888a29bde060b755a83a0
small fixes (elbow plot and output management)
diff -r 8c480c977a12 -r e6831924df01 Marea/marea.py
--- a/Marea/marea.py Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea.py Mon Oct 14 05:01:08 2019 -0400
@@ -50,7 +50,7 @@
help = 'your tool directory')
parser.add_argument('-op', '--option',
type = str,
- choices = ['datasets', 'dataset_class'],
+ choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
help='dataset or dataset and class')
parser.add_argument('-ol', '--out_log',
help = "Output log")
@@ -86,6 +86,10 @@
default = 'true',
choices = ['true', 'false'],
help = 'generate reaction activity score')
+ parser.add_argument('-sr', '--single_ras_file',
+ type = str,
+ help = 'file that will contain ras')
+
args = parser.parse_args()
return args
@@ -658,7 +662,7 @@
############################ create_ras #######################################
-def create_ras (resolve_rules, dataset_name):
+def create_ras (resolve_rules, dataset_name, single_ras):
if resolve_rules == None:
warning("Couldn't generate RAS for current dataset: " + dataset_name)
@@ -670,8 +674,13 @@
output_ras = pd.DataFrame.from_dict(resolve_rules)
output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
-
- text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+
+ if (single_ras):
+ args = process_args(sys.argv)
+ text_file = open(args.single_ras_file, "w")
+ else:
+ text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+
text_file.write(output_to_csv)
text_file.close()
@@ -749,7 +758,34 @@
class_pat = {}
- if args.option == 'datasets':
+ if args.option == 'datasets_rasonly':
+ name = "RAS Dataset"
+ dataset = read_dataset(args.input_datas[0],"dataset")
+
+ dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
+
+ type_gene = gene_type(dataset.iloc[0, 0], name)
+
+ if args.rules_selector != 'Custom':
+ genes = data_gene(dataset, type_gene, name, None)
+ ids, rules = load_id_rules(recon.get(type_gene))
+ elif args.rules_selector == 'Custom':
+ genes = data_gene(dataset, type_gene, name, gene_in_rule)
+
+ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+ create_ras(resolve_rules, name, True)
+
+ if err != None and err:
+ warning('Warning: gene\n' + str(err) + '\nnot found in class '
+ + name + ', the expression level for this gene ' +
+ 'will be considered NaN\n')
+
+ print('execution succeded')
+ return None
+
+
+ elif args.option == 'datasets':
num = 1
for i, j in zip(args.input_datas, args.names):
@@ -769,8 +805,7 @@
resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
if generate_ras:
- create_ras(resolve_rules, name)
-
+ create_ras(resolve_rules, name, False)
if err != None and err:
warning('Warning: gene\n' + str(err) + '\nnot found in class '
@@ -801,7 +836,8 @@
'will be considered NaN\n')
if resolve_rules != None:
class_pat = split_class(classes, resolve_rules)
-
+
+
if args.rules_selector == 'Custom':
if args.yes_no == 'yes':
try:
diff -r 8c480c977a12 -r e6831924df01 Marea/marea.xml
--- a/Marea/marea.xml Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea.xml Mon Oct 14 05:01:08 2019 -0400
@@ -54,6 +54,10 @@
--input_data ${input_data}
--input_class ${input_class}
#end if
+ #if $cond.type_selector == 'datasets_rasonly':
+ --input_datas ${input_Datasets}
+ --single_ras_file $ras_single
+ #end if
]]>
@@ -83,6 +87,7 @@
+
@@ -90,12 +95,18 @@
+
+
+
+
+
+
@@ -116,13 +127,18 @@
+
+ cond['type_selector'] == "datasets_rasonly"
+
+ cond['type_selector'] == "datasets" or cond['type_selector'] == "dataset_class"
-
+
advanced['choice'] and advanced['generateRas']
+
diff -r 8c480c977a12 -r e6831924df01 Marea/marea_cluster.py
--- a/Marea/marea_cluster.py Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea_cluster.py Mon Oct 14 05:01:08 2019 -0400
@@ -78,6 +78,11 @@
parser.add_argument('-ep', '--eps',
type = int,
help = 'eps for dbscan (optional)')
+
+ parser.add_argument('-bc', '--best_cluster',
+ type = str,
+ help = 'output of best cluster tsv')
+
args = parser.parse_args()
@@ -131,20 +136,7 @@
dest = name
classe.to_csv(dest, sep = '\t', index = False,
header = ['Patient_ID', 'Class'])
-
-
- #list_labels = labels
- #list_values = dataset
-
- #list_values = list_values.tolist()
- #d = {'Label' : list_labels, 'Value' : list_values}
-
- #df = pd.DataFrame(d, columns=['Value','Label'])
-
- #dest = name + '.tsv'
- #df.to_csv(dest, sep = '\t', index = False,
- # header = ['Value', 'Label'])
-
+
########################### trova il massimo in lista ########################
def max_index (lista):
best = -1
@@ -158,7 +150,7 @@
################################ kmeans #####################################
-def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
+def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster):
if not os.path.exists('clustering'):
os.makedirs('clustering')
@@ -189,7 +181,10 @@
cluster_labels = clusterer.fit_predict(dataset)
all_labels.append(cluster_labels)
- silhouette_avg = silhouette_score(dataset, cluster_labels)
+ if n_clusters == 1:
+ silhouette_avg = 0
+ else:
+ silhouette_avg = silhouette_score(dataset, cluster_labels)
scores.append(silhouette_avg)
distortions.append(clusterer.fit(dataset).inertia_)
@@ -201,6 +196,14 @@
prefix = '_BEST'
write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+
+
+ if (prefix == '_BEST'):
+ labels = all_labels[i]
+ predict = [x+1 for x in labels]
+ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+ classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+
if davies:
with np.errstate(divide='ignore', invalid='ignore'):
@@ -235,6 +238,9 @@
############################## silhouette plot ###############################
def silihouette_draw(dataset, labels, n_clusters, path):
+ if n_clusters == 1:
+ return None
+
silhouette_avg = silhouette_score(dataset, labels)
warning("For n_clusters = " + str(n_clusters) +
" The average silhouette_score is: " + str(silhouette_avg))
@@ -375,7 +381,7 @@
if args.cluster_type == 'kmeans':
- kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
+ kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
if args.cluster_type == 'dbscan':
dbscan(X, args.eps, args.min_samples)
@@ -386,4 +392,4 @@
##############################################################################
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff -r 8c480c977a12 -r e6831924df01 Marea/marea_cluster.xml
--- a/Marea/marea_cluster.xml Mon Oct 07 13:48:01 2019 -0400
+++ b/Marea/marea_cluster.xml Mon Oct 14 05:01:08 2019 -0400
@@ -17,6 +17,7 @@
--input $input
--tool_dir $__tool_directory__
--out_log $log
+ --best_cluster $best_cluster
--cluster_type ${data.clust_type}
#if $data.clust_type == 'kmeans':
--k_min ${data.k_min}
@@ -46,7 +47,7 @@
-
+
@@ -74,7 +75,8 @@
-
+
+