diff Marea/marea.py @ 47:3af9d394367c draft

Uploaded
author bimib
date Wed, 19 Feb 2020 10:44:52 -0500
parents 7b1971251c63
children e4235b5231e4
line wrap: on
line diff
--- a/Marea/marea.py	Wed Jan 22 11:50:54 2020 -0500
+++ b/Marea/marea.py	Wed Feb 19 10:44:52 2020 -0500
@@ -26,10 +26,6 @@
     parser.add_argument('-cr', '--custom',
                         type = str,
                         help='your dataset if you want custom rules')
-    parser.add_argument('-na', '--names', 
-                        type = str,
-                        nargs = '+', 
-                        help = 'input names')
     parser.add_argument('-n', '--none',
                         type = str,
                         default = 'true',
@@ -37,7 +33,7 @@
                         help = 'compute Nan values')
     parser.add_argument('-pv' ,'--pValue', 
                         type = float, 
-                        default = 0.05, 
+                        default = 0.1, 
                         help = 'P-Value threshold (default: %(default)s)')
     parser.add_argument('-fc', '--fChange', 
                         type = float, 
@@ -49,14 +45,10 @@
                         help = 'your tool directory')
     parser.add_argument('-op', '--option', 
                         type = str, 
-                        choices = ['datasets', 'dataset_class', 'datasets_rasonly'],
+                        choices = ['datasets', 'dataset_class'],
                         help='dataset or dataset and class')
     parser.add_argument('-ol', '--out_log', 
                         help = "Output log")    
-    parser.add_argument('-ids', '--input_datas', 
-                        type = str,
-                        nargs = '+', 
-                        help = 'input datasets')
     parser.add_argument('-id', '--input_data',
                         type = str,
                         help = 'input dataset')
@@ -80,15 +72,21 @@
                         default = 'true',
                         choices = ['true', 'false'], 
                         help = 'generate pdf map')
-    parser.add_argument('-gr', '--generate_ras',
+    parser.add_argument('-on', '--control',
+                        type = str)
+    parser.add_argument('-co', '--comparison',
+                        type = str, 
+                        default = '1vs1',
+                        choices = ['manyvsmany', 'onevsrest', 'onevsmany'])
+    parser.add_argument('-ids', '--input_datas', 
                         type = str,
-                        default = 'true',
-                        choices = ['true', 'false'],
-                        help = 'generate reaction activity score')
-    parser.add_argument('-sr', '--single_ras_file',  
-                         type = str,              
-                         help = 'file that will contain ras')
-    					
+                        nargs = '+', 
+                        help = 'input datasets')
+    parser.add_argument('-na', '--names', 
+                        type = str,
+                        nargs = '+', 
+                        help = 'input names')
+                                    
     args = parser.parse_args()
     return args
 
@@ -615,7 +613,6 @@
 
 def resolve(genes, rules, ids, resolve_none, name):
     resolve_rules = {}
-    names_array = []
     not_found = []
     flag = False
     for key, value in genes.items():
@@ -663,79 +660,157 @@
                         ', the class has been disregarded\n')
     return class_pat
 
-############################ create_ras #######################################
-
-def create_ras (resolve_rules, dataset_name, single_ras, rules, ids):
-
-    if resolve_rules == None:
-        warning("Couldn't generate RAS for current dataset: " + dataset_name)
-
-    for geni in resolve_rules.values():
-        for i, valori in enumerate(geni):
-            if valori == None:
-                geni[i] = 'None'
-                
-    output_ras = pd.DataFrame.from_dict(resolve_rules)
-    
-    output_ras.insert(0, 'Reactions', ids)
-    output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
-    
-    if (single_ras):
-        args = process_args(sys.argv)
-        text_file = open(args.single_ras_file, "w")
-    else:
-        text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w")
-    
-    text_file.write(output_to_csv)
-    text_file.close()
-
 ############################ map ##############################################
 
-def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf):
+def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf, comparison, control):
     args = process_args(sys.argv)
     if (not class_pat) or (len(class_pat.keys()) < 2):
         sys.exit('Execution aborted: classes provided for comparisons are ' +
                  'less than two\n')
-    for i, j in it.combinations(class_pat.keys(), 2):
-        tmp = {}
-        count = 0
-        max_F_C = 0
-        for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
-            try:
-               stat_D, p_value = st.ks_2samp(l1, l2)
-               avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
-               if not isinstance(avg, str):
-                   if max_F_C < abs(avg):
-                       max_F_C = abs(avg)
-               tmp[ids[count]] = [float(p_value), avg]
-               count += 1
-            except (TypeError, ZeroDivisionError):
-               count += 1
-        tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
-        tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
-        tmp_csv = tmp_csv.reset_index()
-        header = ['ids', 'P_Value', 'Log2(fold change)']
-        tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+
+    if comparison == "manyvsmany":
+        for i, j in it.combinations(class_pat.keys(), 2):
+            
+            tmp = {}
+            count = 0
+            max_F_C = 0
+            for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
+                try:
+                    stat_D, p_value = st.ks_2samp(l1, l2)
+                    #sum(l1) da errore secondo me perchè ha null
+                    avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
+                    if not isinstance(avg, str):
+                        if max_F_C < abs(avg):
+                            max_F_C = abs(avg)
+                    tmp[ids[count]] = [float(p_value), avg]
+                    count += 1
+                except (TypeError, ZeroDivisionError):
+                    count += 1
+            tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
+            tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
+            tmp_csv = tmp_csv.reset_index()
+            header = ['ids', 'P_Value', 'Log2(fold change)']
+            tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+            
+            if create_svg or create_pdf:
+                if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+                                                        and args.yes_no == 'yes'):
+                    fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+                    file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
+                    with open(file_svg, 'wb') as new_map:
+                        new_map.write(ET.tostring(core_map))
+                        
+                    
+                    if create_pdf:
+                        file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
+                        renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+                    
+                    if not create_svg:
+                        #Ho utilizzato il file svg per generare il pdf, 
+                        #ma l'utente non ne ha richiesto il ritorno, quindi
+                        #lo elimino
+                        
+                        os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
+    elif comparison == "onevsrest":
+        for single_cluster in class_pat.keys():
+            t = []
+            for k in class_pat.keys():
+                if k != single_cluster:
+                   t.append(class_pat.get(k))
+            rest = []
+            for i in t:
+                rest = rest + i
+            
+            tmp = {}
+            count = 0
+            max_F_C = 0
+
+            for l1, l2 in zip(rest, class_pat.get(single_cluster)):
+                try:
+                    stat_D, p_value = st.ks_2samp(l1, l2)
+                    avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
+                    if not isinstance(avg, str):
+                        if max_F_C < abs(avg):
+                            max_F_C = abs(avg)
+                    tmp[ids[count]] = [float(p_value), avg]
+                    count += 1
+                except (TypeError, ZeroDivisionError):
+                    count += 1
+            tab = 'result/' + single_cluster + '_vs_rest (Tabular Result).tsv'
+            tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
+            tmp_csv = tmp_csv.reset_index()
+            header = ['ids', 'P_Value', 'Log2(fold change)']
+            tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+            
+            if create_svg or create_pdf:
+                if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+                                                        and args.yes_no == 'yes'):
+                    fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+                    file_svg = 'result/' + single_cluster + '_vs_ rest (SVG Map).svg'
+                    with open(file_svg, 'wb') as new_map:
+                        new_map.write(ET.tostring(core_map))
+                        
+                    
+                    if create_pdf:
+                        file_pdf = 'result/' + single_cluster + '_vs_ rest (PDF Map).pdf'
+                        renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+                    
+                    if not create_svg:
+                        os.remove('result/' + single_cluster + '_vs_ rest (SVG Map).svg') 
+                        
+    elif comparison == "onevsmany":
+        for i, j in it.combinations(class_pat.keys(), 2):
+
+            if i != control and j != control:
+                print(str(control) + " " + str(i) + " " + str(j))
+                #Se è un confronto fra elementi diversi dal controllo, skippo
+                continue
+            
+            print('vado')
+            tmp = {}
+            count = 0
+            max_F_C = 0
+            for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
+                try:
+                    stat_D, p_value = st.ks_2samp(l1, l2)
+                    #sum(l1) da errore secondo me perchè ha null
+                    avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
+                    if not isinstance(avg, str):
+                        if max_F_C < abs(avg):
+                            max_F_C = abs(avg)
+                    tmp[ids[count]] = [float(p_value), avg]
+                    count += 1
+                except (TypeError, ZeroDivisionError):
+                    count += 1
+            tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
+            tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
+            tmp_csv = tmp_csv.reset_index()
+            header = ['ids', 'P_Value', 'Log2(fold change)']
+            tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
+            
+            if create_svg or create_pdf:
+                if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
+                                                        and args.yes_no == 'yes'):
+                    fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
+                    file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
+                    with open(file_svg, 'wb') as new_map:
+                        new_map.write(ET.tostring(core_map))
+                        
+                    
+                    if create_pdf:
+                        file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
+                        renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
+                    
+                    if not create_svg:
+                        #Ho utilizzato il file svg per generare il pdf, 
+                        #ma l'utente non ne ha richiesto il ritorno, quindi
+                        #lo elimino
+                        
+                        os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
         
-        if create_svg or create_pdf:
-            if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom'
-                                                    and args.yes_no == 'yes'):
-                fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
-                file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
-                with open(file_svg, 'wb') as new_map:
-                    new_map.write(ET.tostring(core_map))
-                    
-                
-                if create_pdf:
-                    file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
-                    renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
-                
-                if not create_svg:
-                    #Ho utilizzato il file svg per generare il pdf, 
-                    #ma l'utente non ne ha richiesto il ritorno, quindi
-                    #lo elimino
-                    os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
-                    
+        
+        
+
     return None
 
 ############################ MAIN #############################################
@@ -745,12 +820,9 @@
     
     create_svg = check_bool(args.generate_svg)
     create_pdf = check_bool(args.generate_pdf)
-    generate_ras = check_bool(args.generate_ras)
 
-    os.makedirs('result')
-
-    if generate_ras:
-        os.makedirs('ras')
+    if os.path.isdir('result') == False:
+        os.makedirs('result')
     
     if args.rules_selector == 'HMRcore':        
         recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb'))
@@ -758,93 +830,60 @@
         recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb'))
     elif args.rules_selector == 'Custom':
         ids, rules, gene_in_rule = make_recon(args.custom)
-        
-    resolve_none = check_bool(args.none)
-    
+
     class_pat = {}
     
-    if args.option == 'datasets_rasonly':
-        name = "RAS Dataset"
-        dataset = read_dataset(args.input_datas[0],"dataset")
-
-        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-
-        type_gene = gene_type(dataset.iloc[0, 0], name) 
-            
-        if args.rules_selector != 'Custom':
-            genes = data_gene(dataset, type_gene, name, None)
-            ids, rules = load_id_rules(recon.get(type_gene))
-        elif args.rules_selector == 'Custom':
-            genes = data_gene(dataset, type_gene, name, gene_in_rule)
-        
-        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
-
-        create_ras(resolve_rules, name, True, rules, ids)
-          
-        if err != None and err:
-            warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                + name + ', the expression level for this gene ' +
-                'will be considered NaN\n')
-        
-        print('execution succeded')
-        return None
-    
-    
-    elif args.option == 'datasets':
+    if args.option == 'datasets':
         num = 1
         for i, j in zip(args.input_datas, args.names):
-
             name = name_dataset(j, num)
-            dataset = read_dataset(i, name)
-
-            dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-
-            type_gene = gene_type(dataset.iloc[0, 0], name) 
+            resolve_rules = read_dataset(i, name)
+            
+            resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
             
-            if args.rules_selector != 'Custom':
-                genes = data_gene(dataset, type_gene, name, None)
-                ids, rules = load_id_rules(recon.get(type_gene))
-            elif args.rules_selector == 'Custom':
-                genes = data_gene(dataset, type_gene, name, gene_in_rule)
-                
-                
-            resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
+            ids =  pd.Series.tolist(resolve_rules.iloc[:, 0])
 
-            if generate_ras:
-                create_ras(resolve_rules, name, False, rules, ids)
+            resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
+            resolve_rules = resolve_rules.replace({'None': None})
+            resolve_rules = resolve_rules.to_dict('list')
             
-            if err != None and err:
-                warning('Warning: gene\n' + str(err) + '\nnot found in class '
-                    + name + ', the expression level for this gene ' +
-                    'will be considered NaN\n')
+            #Converto i valori da str a float
+            to_float = lambda x: float(x) if (x != None) else None
+            
+            resolve_rules_float = {}
+           
+            for k in resolve_rules:
+                resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
+            
             if resolve_rules != None:
-                class_pat[name] = list(map(list, zip(*resolve_rules.values())))
+                class_pat[name] = list(map(list, zip(*resolve_rules_float.values())))
+        
             num += 1
-    elif args.option == 'dataset_class':
-        name = 'RNAseq'
-        dataset = read_dataset(args.input_data, name)
-        dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
-        type_gene = gene_type(dataset.iloc[0, 0], name)
+            
+    if args.option == 'dataset_class':
+        name = 'RAS'
+        resolve_rules = read_dataset(args.input_data, name)
+        resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
+            
+        ids =  pd.Series.tolist(resolve_rules.iloc[:, 0])
+
+        resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
+        resolve_rules = resolve_rules.replace({'None': None})
+        resolve_rules = resolve_rules.to_dict('list')
+            
+        #Converto i valori da str a float
+        to_float = lambda x: float(x) if (x != None) else None
+            
+        resolve_rules_float = {}
+           
+        for k in resolve_rules:
+            resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
+
         classes = read_dataset(args.input_class, 'class')
-        if not len(classes.columns) == 2:
-            warning('Warning: more than 2 columns in class file. Extra' +
-                    'columns have been disregarded\n')
         classes = classes.astype(str)
-        if args.rules_selector != 'Custom':
-            genes = data_gene(dataset, type_gene, name, None)
-            ids, rules = load_id_rules(recon.get(type_gene))
-        elif args.rules_selector == 'Custom':
-            genes = data_gene(dataset, type_gene, name, gene_in_rule)
-        resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)
-        if err != None and err:
-            warning('Warning: gene\n'+str(err)+'\nnot found in class '
-                    + name + ', the expression level for this gene ' +
-                    'will be considered NaN\n')
-        if resolve_rules != None:
-            class_pat = split_class(classes, resolve_rules)
-            if generate_ras:
-                create_ras(resolve_rules, name, False, rules, ids)
-    
+        
+        if resolve_rules_float != None:
+            class_pat = split_class(classes, resolve_rules_float)
     	
     if args.rules_selector == 'Custom':
         if args.yes_no == 'yes':
@@ -857,11 +896,12 @@
     else:       
         core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
         
-    maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf)
+    maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf, args.comparison, args.control)
         
     print('Execution succeded')
+    
+    return None
 
-    return None
 
 ###############################################################################