# HG changeset patch # User bimib # Date 1571043668 14400 # Node ID e6831924df01142b148c0f0f1f32d82aedb37953 # Parent 8c480c977a12d5d4e13888a29bde060b755a83a0 small fixes (elbow plot and output managment) diff -r 8c480c977a12 -r e6831924df01 Marea/marea.py --- a/Marea/marea.py Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea.py Mon Oct 14 05:01:08 2019 -0400 @@ -50,7 +50,7 @@ help = 'your tool directory') parser.add_argument('-op', '--option', type = str, - choices = ['datasets', 'dataset_class'], + choices = ['datasets', 'dataset_class', 'datasets_rasonly'], help='dataset or dataset and class') parser.add_argument('-ol', '--out_log', help = "Output log") @@ -86,6 +86,10 @@ default = 'true', choices = ['true', 'false'], help = 'generate reaction activity score') + parser.add_argument('-sr', '--single_ras_file', + type = str, + help = 'file that will contain ras') + args = parser.parse_args() return args @@ -658,7 +662,7 @@ ############################ create_ras ####################################### -def create_ras (resolve_rules, dataset_name): +def create_ras (resolve_rules, dataset_name, single_ras): if resolve_rules == None: warning("Couldn't generate RAS for current dataset: " + dataset_name) @@ -670,8 +674,13 @@ output_ras = pd.DataFrame.from_dict(resolve_rules) output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) - - text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") + + if (single_ras): + args = process_args(sys.argv) + text_file = open(args.single_ras_file, "w") + else: + text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") + text_file.write(output_to_csv) text_file.close() @@ -749,7 +758,34 @@ class_pat = {} - if args.option == 'datasets': + if args.option == 'datasets_rasonly': + name = "RAS Dataset" + dataset = read_dataset(args.input_datas[0],"dataset") + + dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) + + type_gene = gene_type(dataset.iloc[0, 0], name) + + if args.rules_selector != 'Custom': + genes = data_gene(dataset, type_gene, name, None) + ids, rules = load_id_rules(recon.get(type_gene)) + elif args.rules_selector == 'Custom': + genes = data_gene(dataset, type_gene, name, gene_in_rule) + + resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) + + create_ras(resolve_rules, name, True) + + if err != None and err: + warning('Warning: gene\n' + str(err) + '\nnot found in class ' + + name + ', the expression level for this gene ' + + 'will be considered NaN\n') + + print('execution succeded') + return None + + + elif args.option == 'datasets': num = 1 for i, j in zip(args.input_datas, args.names): @@ -769,8 +805,7 @@ resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) if generate_ras: - create_ras(resolve_rules, name) - + create_ras(resolve_rules, name, False) if err != None and err: warning('Warning: gene\n' + str(err) + '\nnot found in class ' @@ -801,7 +836,8 @@ 'will be considered NaN\n') if resolve_rules != None: class_pat = split_class(classes, resolve_rules) - + + if args.rules_selector == 'Custom': if args.yes_no == 'yes': try: diff -r 8c480c977a12 -r e6831924df01 Marea/marea.xml --- a/Marea/marea.xml Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea.xml Mon Oct 14 05:01:08 2019 -0400 @@ -54,6 +54,10 @@ --input_data ${input_data} --input_class ${input_class} #end if + #if $cond.type_selector == 'datasets_rasonly': + --input_datas ${input_Datasets} + --single_ras_file $ras_single + #end if ]]> @@ -83,6 +87,7 @@ + @@ -90,12 +95,18 @@ + + + + + + @@ -116,13 +127,18 @@ + + cond['type_selector'] == "datasets_rasonly" + + cond['type_selector'] == "datasets" or cond['type_selector'] == "dataset_class" - + advanced['choice'] and advanced['generateRas'] + diff -r 8c480c977a12 -r e6831924df01 Marea/marea_cluster.py --- a/Marea/marea_cluster.py Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea_cluster.py Mon Oct 14 05:01:08 2019 -0400 @@ -78,6 +78,11 @@ parser.add_argument('-ep', '--eps', type = int, help = 'eps for dbscan (optional)') + + parser.add_argument('-bc', '--best_cluster', + type = str, + help = 'output of best cluster tsv') + args = parser.parse_args() @@ -131,20 +136,7 @@ dest = name classe.to_csv(dest, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - - #list_labels = labels - #list_values = dataset - - #list_values = list_values.tolist() - #d = {'Label' : list_labels, 'Value' : list_values} - - #df = pd.DataFrame(d, columns=['Value','Label']) - - #dest = name + '.tsv' - #df.to_csv(dest, sep = '\t', index = False, - # header = ['Value', 'Label']) - + ########################### trova il massimo in lista ######################## def max_index (lista): best = -1 @@ -158,7 +150,7 @@ ################################ kmeans ##################################### -def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): +def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster): if not os.path.exists('clustering'): os.makedirs('clustering') @@ -189,7 +181,10 @@ cluster_labels = clusterer.fit_predict(dataset) all_labels.append(cluster_labels) - silhouette_avg = silhouette_score(dataset, cluster_labels) + if n_clusters == 1: + silhouette_avg = 0 + else: + silhouette_avg = silhouette_score(dataset, cluster_labels) scores.append(silhouette_avg) distortions.append(clusterer.fit(dataset).inertia_) @@ -201,6 +196,14 @@ prefix = '_BEST' write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') + + + if (prefix == '_BEST'): + labels = all_labels[i] + predict = [x+1 for x in labels] + classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) + classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) + if davies: with np.errstate(divide='ignore', invalid='ignore'): @@ -235,6 +238,9 @@ ############################## silhouette plot ############################### def silihouette_draw(dataset, labels, n_clusters, path): + if n_clusters == 1: + return None + silhouette_avg = silhouette_score(dataset, labels) warning("For n_clusters = " + str(n_clusters) + " The average silhouette_score is: " + str(silhouette_avg)) @@ -375,7 +381,7 @@ if args.cluster_type == 'kmeans': - kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) + kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster) if args.cluster_type == 'dbscan': dbscan(X, args.eps, args.min_samples) @@ -386,4 +392,4 @@ ############################################################################## if __name__ == "__main__": - main() \ No newline at end of file + main() diff -r 8c480c977a12 -r e6831924df01 Marea/marea_cluster.xml --- a/Marea/marea_cluster.xml Mon Oct 07 13:48:01 2019 -0400 +++ b/Marea/marea_cluster.xml Mon Oct 14 05:01:08 2019 -0400 @@ -17,6 +17,7 @@ --input $input --tool_dir $__tool_directory__ --out_log $log + --best_cluster $best_cluster --cluster_type ${data.clust_type} #if $data.clust_type == 'kmeans': --k_min ${data.k_min} @@ -46,7 +47,7 @@ - + @@ -74,7 +75,8 @@ - + +