diff Marea/ras_generator.py @ 47:3af9d394367c draft

Uploaded
author bimib
date Wed, 19 Feb 2020 10:44:52 -0500
parents 5d5d01ef1d68
children 3b0e71e28c0b
line wrap: on
line diff
--- a/Marea/ras_generator.py	Wed Jan 22 11:50:54 2020 -0500
+++ b/Marea/ras_generator.py	Wed Feb 19 10:44:52 2020 -0500
@@ -1,16 +1,10 @@
 from __future__ import division
 import sys
 import pandas as pd
-import itertools as it
-import scipy.stats as st
 import collections
-import lxml.etree as ET
 import pickle as pk
 import math
-import os
 import argparse
-from svglib.svglib import svg2rlg
-from reportlab.graphics import renderPDF
 
 ########################## argparse ##########################################
 
@@ -26,69 +20,25 @@
     parser.add_argument('-cr', '--custom',
                         type = str,
                         help='your dataset if you want custom rules')
-    parser.add_argument('-na', '--names', 
-                        type = str,
-                        nargs = '+', 
-                        help = 'input names')
     parser.add_argument('-n', '--none',
                         type = str,
                         default = 'true',
                         choices = ['true', 'false'], 
                         help = 'compute Nan values')
-    parser.add_argument('-pv' ,'--pValue', 
-                        type = float, 
-                        default = 0.05, 
-                        help = 'P-Value threshold (default: %(default)s)')
-    parser.add_argument('-fc', '--fChange', 
-                        type = float, 
-                        default = 1.5, 
-                        help = 'Fold-Change threshold (default: %(default)s)')
     parser.add_argument('-td', '--tool_dir',
                         type = str,
                         required = True,
                         help = 'your tool directory')
-    parser.add_argument('-op', '--option', 
-                        type = str, 
-                        choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
-                        help='dataset or dataset and class')
     parser.add_argument('-ol', '--out_log', 
                         help = "Output log")    
-    parser.add_argument('-ids', '--input_datas', 
-                        type = str,
-                        nargs = '+', 
-                        help = 'input datasets')
-    parser.add_argument('-id', '--input_data',
+    parser.add_argument('-id', '--input',
                         type = str,
                         help = 'input dataset')
-    parser.add_argument('-ic', '--input_class', 
-                        type = str, 
-                        help = 'sample group specification')
-    parser.add_argument('-cm', '--custom_map', 
-                        type = str, 
-                        help = 'custom map')
-    parser.add_argument('-yn', '--yes_no', 
-                        type = str,
-                        choices = ['yes', 'no'],
-                        help = 'if make or not custom map')
-    parser.add_argument('-gs', '--generate_svg',
+    parser.add_argument('-ra', '--ras_output',
                         type = str,
-                        default = 'true',
-                        choices = ['true', 'false'], 
-                        help = 'generate svg map')
-    parser.add_argument('-gp', '--generate_pdf',
-                        type = str,
-                        default = 'true',
-                        choices = ['true', 'false'], 
-                        help = 'generate pdf map')
-    parser.add_argument('-gr', '--generate_ras',
-                        type = str,
-                        default = 'true',
-                        choices = ['true', 'false'],
-                        help = 'generate reaction activity score')
-    parser.add_argument('-sr', '--single_ras_file',  
-                         type = str,              
-                         help = 'file that will contain ras')
-    					
+                        required = True,
+                        help = 'ras output')
+    
     args = parser.parse_args()
     return args
 
@@ -297,79 +247,6 @@
             return False
     return ris
 
-############################ map_methods ######################################
-
-def fold_change(avg1, avg2):
-    if avg1 == 0 and avg2 == 0:
-        return 0
-    elif avg1 == 0:
-        return '-INF'
-    elif avg2 == 0:
-        return 'INF'
-    else:
-        return math.log(avg1 / avg2, 2)
-    
-def fix_style(l, col, width, dash):
-    tmp = l.split(';')
-    flag_col = False
-    flag_width = False
-    flag_dash = False
-    for i in range(len(tmp)):
-        if tmp[i].startswith('stroke:'):
-            tmp[i] = 'stroke:' + col
-            flag_col = True
-        if tmp[i].startswith('stroke-width:'):
-            tmp[i] = 'stroke-width:' + width
-            flag_width = True
-        if tmp[i].startswith('stroke-dasharray:'):
-            tmp[i] = 'stroke-dasharray:' + dash
-            flag_dash = True
-    if not flag_col:
-        tmp.append('stroke:' + col)
-    if not flag_width:
-        tmp.append('stroke-width:' + width)
-    if not flag_dash:
-        tmp.append('stroke-dasharray:' + dash)
-    return ';'.join(tmp)
-
-def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
-    maxT = 12
-    minT = 2
-    grey = '#BEBEBE'
-    blue = '#0000FF'
-    red = '#E41A1C'
-    for el in core_map.iter():
-        el_id = str(el.get('id'))
-        if el_id.startswith('R_'):
-            tmp = d.get(el_id[2:])
-            if tmp != None:
-                p_val = tmp[0]
-                f_c = tmp[1]
-                if p_val < threshold_P_V:
-                    if not isinstance(f_c, str):
-                        if abs(f_c) < math.log(threshold_F_C, 2):
-                            col = grey
-                            width = str(minT)
-                        else:
-                            if f_c < 0:
-                                col = blue
-                            elif f_c > 0:
-                                col = red
-                            width = str(max((abs(f_c) * maxT) / max_F_C, minT))
-                    else:
-                        if f_c == '-INF':
-                            col = blue
-                        elif f_c == 'INF':
-                            col = red
-                        width = str(maxT)
-                    dash = 'none'
-                else:
-                    dash = '5,5'
-                    col = grey
-                    width = str(minT)
-                el.set('style', fix_style(el.get('style'), col, width, dash))
-    return core_map
-
 ############################ make recon #######################################
 
 def check_and_doWord(l):
@@ -615,7 +492,6 @@
 
 def resolve(genes, rules, ids, resolve_none, name):
     resolve_rules = {}
-    names_array = []
     not_found = []
     flag = False
     for key, value in genes.items():
@@ -652,7 +528,6 @@
             for j in range(i, len(classes)):
                 if classes.iloc[j, 1] == classe:
                     pat_id = classes.iloc[j, 0]
-                    tmp = resolve_rules.get(pat_id, None)
                     if tmp != None:
                         l.append(tmp)
                     classes.iloc[j, 1] = None
@@ -665,7 +540,7 @@
 
 ############################ create_ras #######################################
 
-def create_ras (resolve_rules, dataset_name, single_ras, rules, ids):
+def create_ras (resolve_rules, dataset_name, rules, ids, file):
 
     if resolve_rules == None:
         warning("Couldn't generate RAS for current dataset: " + dataset_name)
@@ -680,78 +555,16 @@
     output_ras.insert(0, 'Reactions', ids)
     output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
     
-    if (single_ras):
-        args = process_args(sys.argv)
-        text_file = open(args.single_ras_file, "w")
-    else:
-        text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
+    text_file = open(file, "w")
     
     text_file.write(output_to_csv)
     text_file.close()
 
-############################ map ##############################################
-
-def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
-    args = process_args(sys.argv)
-    if (not class_pat) or (len(class_pat.keys()) < 2):
-        sys.exit('Execution aborted: classes provided for comparisons are ' +
-                 'less than two\n')
-    for i, j in it.combinations(class_pat.keys(), 2):
-        tmp = {}
-        count = 0
-        max_F_C = 0
-        for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
-            try:
-               stat_D, p_value = st.ks_2samp(l1, l2)
-               avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
-               if not isinstance(avg, str):
-                   if max_F_C < abs(avg):
-                       max_F_C = abs(avg)
-               tmp[ids[count]] = [float(p_value), avg]
-               count += 1
-            except (TypeError, ZeroDivisionError):
-               count += 1
-        tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
-        tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
-        tmp_csv = tmp_csv.reset_index()
-        header = ['ids', 'P_Value', 'Log2(fold change)']
-        tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
-        
-        if create_svg or create_pdf:
-            if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
-                                                    and args.yes_no == 'yes'):
-                fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
-                file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
-                with open(file_svg, 'wb') as new_map:
-                    new_map.write(ET.tostring(core_map))
-                    
-                
-                if create_pdf:
-                    file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
-                    renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
-                
-                if not create_svg:
-                    #Ho utilizzato il file svg per generare il pdf, 
-                    #ma l'utente non ne ha richiesto il ritorno, quindi
-                    #lo elimino
-                    os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
-                    
-    return None
-
 ############################ MAIN #############################################
 
 def main():
     args = process_args(sys.argv)
-    
-    create_svg = check_bool(args.generate_svg)
-    create_pdf = check_bool(args.generate_pdf)
-    generate_ras = check_bool(args.generate_ras)
 
-    os.makedirs('result')
-
-    if generate_ras:
-        os.makedirs('ras')
-    
     if args.rules_selector == 'HMRcore':        
         recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
     elif args.rules_selector == 'Recon':
@@ -761,104 +574,30 @@
         
     resolve_none = check_bool(args.none)
     
-    class_pat = {}
     
-    if args.option == 'datasets_rasonly':
-        name = "RAS Dataset"
-        dataset = read_dataset(args.input_datas[0],"dataset")
-
-        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-
-        type_gene = gene_type(dataset.iloc[0, 0], name) 
-            
-        if args.rules_selector != 'Custom':
-            genes = data_gene(dataset, type_gene, name, None)
-            ids, rules = load_id_rules(recon.get(type_gene))
-        elif args.rules_selector == 'Custom':
-            genes = data_gene(dataset, type_gene, name, gene_in_rule)
-        
-        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+    name = "RAS Dataset"
+    dataset = read_dataset(args.input, "dataset")
 
-        create_ras(resolve_rules, name, True, rules, ids)
-          
-        if err != None and err:
-            warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                + name + ', the expression level for this gene ' +
-                'will be considered NaN\n')
-        
-        print('execution succeded')
-        return None
-    
-    
-    elif args.option == 'datasets':
-        num = 1
-        for i, j in zip(args.input_datas, args.names):
-
-            name = name_dataset(j, num)
-            dataset = read_dataset(i, name)
-
-            dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-
-            type_gene = gene_type(dataset.iloc[0, 0], name) 
-            
-            if args.rules_selector != 'Custom':
-                genes = data_gene(dataset, type_gene, name, None)
-                ids, rules = load_id_rules(recon.get(type_gene))
-            elif args.rules_selector == 'Custom':
-                genes = data_gene(dataset, type_gene, name, gene_in_rule)
-                
-                
-            resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+    dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
 
-            if generate_ras:
-                create_ras(resolve_rules, name, False, rules, ids)
-            
-            if err != None and err:
-                warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                    + name + ', the expression level for this gene ' +
-                    'will be considered NaN\n')
-            if resolve_rules != None:
-                class_pat[name] = list(map(list, zip(*resolve_rules.values())))
-            num += 1
-    elif args.option == 'dataset_class':
-        name = 'RNAseq'
-        dataset = read_dataset(args.input_data, name)
-        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-        type_gene = gene_type(dataset.iloc[0, 0], name)
-        classes = read_dataset(args.input_class, 'class')
-        if not len(classes.columns) == 2:
-            warning('Warning: more than 2 columns in class file. Extra' +
-                    'columns have been disregarded\n')
-        classes = classes.astype(str)
-        if args.rules_selector != 'Custom':
-            genes = data_gene(dataset, type_gene, name, None)
-            ids, rules = load_id_rules(recon.get(type_gene))
-        elif args.rules_selector == 'Custom':
-            genes = data_gene(dataset, type_gene, name, gene_in_rule)
-        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
-        if err != None and err:
-            warning('Warning: gene\n'+str(err)+'\nnot found in class '
-                    + name + ', the expression level for this gene ' +
-                    'will be considered NaN\n')
-        if resolve_rules != None:
-            class_pat = split_class(classes, resolve_rules)
-            if generate_ras:
-                create_ras(resolve_rules, name, False, rules, ids)
+    type_gene = gene_type(dataset.iloc[0, 0], name) 
+        
+    if args.rules_selector != 'Custom':
+        genes = data_gene(dataset, type_gene, name, None)
+        ids, rules = load_id_rules(recon.get(type_gene))
+    elif args.rules_selector == 'Custom':
+        genes = data_gene(dataset, type_gene, name, gene_in_rule)
     
-    	
-    if args.rules_selector == 'Custom':
-        if args.yes_no == 'yes':
-            try:
-                core_map = ET.parse(args.custom_map)
-            except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
-                sys.exit('Execution aborted: custom map in wrong format')
-        elif args.yes_no == 'no':
-            core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg')
-    else:       
-        core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
-        
-    maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
-        
+    resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+
+    create_ras(resolve_rules, name, rules, ids, args.ras_output)
+      
+    if err != None and err:
+        warning('Warning: gene\n' + str(err) + '\nnot found in class '
+            + name + ', the expression level for this gene ' +
+            'will be considered NaN\n')
+
+    
     print('Execution succeded')
 
     return None