comparison COBRAxy/marea.py @ 299:8ae278b4a5d5 draft

Uploaded
author francesco_lapi
date Fri, 16 May 2025 15:47:05 +0000
parents 1402c2beb8f2
children 24783af8a2f9
comparison
equal deleted inserted replaced
298:bc8cbaacafd0 299:8ae278b4a5d5
390 390
391 class ArrowColor(Enum): 391 class ArrowColor(Enum):
392 """ 392 """
393 Encodes possible arrow colors based on their meaning in the enrichment process. 393 Encodes possible arrow colors based on their meaning in the enrichment process.
394 """ 394 """
395 Invalid = "#BEBEBE" # gray, fold-change under threshold 395 Invalid = "#BEBEBE" # gray, fold-change under threshold or not significant p-value
396 Transparent = "#ffffff00" # transparent, to make some arrow segments disappear
396 UpRegulated = "#ecac68" # orange, up-regulated reaction 397 UpRegulated = "#ecac68" # orange, up-regulated reaction
397 DownRegulated = "#6495ed" # lightblue, down-regulated reaction 398 DownRegulated = "#6495ed" # lightblue, down-regulated reaction
398 399
399 UpRegulatedInv = "#FF0000" 400 UpRegulatedInv = "#FF0000"
400 # ^^^ bright red, up-regulated net value for a reversible reaction with 401 # ^^^ bright red, up-regulated net value for a reversible reaction with
477 if downSizedForTips: width *= 0.8 478 if downSizedForTips: width *= 0.8
478 return f";stroke:{self.col};stroke-width:{width};stroke-dasharray:{'5,5' if self.dash else 'none'}" 479 return f";stroke:{self.col};stroke-width:{width};stroke-dasharray:{'5,5' if self.dash else 'none'}"
479 480
480 # vvv These constants could be inside the class itself as static properties, but python 481 # vvv These constants could be inside the class itself as static properties, but python
481 # was built by brainless organisms so here we are! 482 # was built by brainless organisms so here we are!
482 INVALID_ARROW = Arrow(Arrow.MIN_W, ArrowColor.Invalid) 483 INVALID_ARROW = Arrow(Arrow.MIN_W, ArrowColor.Invalid)
483 INSIGNIFICANT_ARROW = Arrow(Arrow.MIN_W, ArrowColor.Invalid, isDashed = True) 484 INSIGNIFICANT_ARROW = Arrow(Arrow.MIN_W, ArrowColor.Invalid, isDashed = True)
485 TRANSPARENT_ARROW = Arrow(Arrow.MIN_W, ArrowColor.Transparent) # Who cares how big it is if it's transparent
484 486
485 # TODO: A more general version of this can be used for RAS as well, we don't need "fix map" or whatever 487 # TODO: A more general version of this can be used for RAS as well, we don't need "fix map" or whatever
486 def applyRpsEnrichmentToMap(rpsEnrichmentRes :Dict[str, Union[Tuple[float, FoldChange], Tuple[float, FoldChange, float, float]]], metabMap :ET.ElementTree, maxNumericZScore :float) -> None: 488 def applyRpsEnrichmentToMap(rpsEnrichmentRes :Dict[str, Union[Tuple[float, FoldChange], Tuple[float, FoldChange, float, float]]], metabMap :ET.ElementTree, maxNumericZScore :float) -> None:
487 """ 489 """
488 Applies RPS enrichment results to the provided metabolic map. 490 Applies RPS enrichment results to the provided metabolic map.
708 match ARGS.test: 710 match ARGS.test:
709 case "ks": 711 case "ks":
710 # Perform Kolmogorov-Smirnov test 712 # Perform Kolmogorov-Smirnov test
711 _, p_value = st.ks_2samp(dataset1Data, dataset2Data) 713 _, p_value = st.ks_2samp(dataset1Data, dataset2Data)
712 case "ttest_p": 714 case "ttest_p":
715 # Datasets should have same size
716 if len(dataset1Data) != len(dataset2Data):
717 raise ValueError("Datasets must have the same size for paired t-test.")
713 # Perform t-test for paired samples 718 # Perform t-test for paired samples
714 _, p_value = st.ttest_rel(dataset1Data, dataset2Data) 719 _, p_value = st.ttest_rel(dataset1Data, dataset2Data)
715 case "ttest_ind": 720 case "ttest_ind":
716 # Perform t-test for independent samples 721 # Perform t-test for independent samples
717 _, p_value = st.ttest_ind(dataset1Data, dataset2Data) 722 _, p_value = st.ttest_ind(dataset1Data, dataset2Data)
718 case "wilcoxon": 723 case "wilcoxon":
724 # Datasets should have same size
725 if len(dataset1Data) != len(dataset2Data):
726 raise ValueError("Datasets must have the same size for Wilcoxon signed-rank test.")
719 # Perform Wilcoxon signed-rank test 727 # Perform Wilcoxon signed-rank test
720 _, p_value = st.wilcoxon(dataset1Data, dataset2Data) 728 _, p_value = st.wilcoxon(dataset1Data, dataset2Data)
721 case "mw": 729 case "mw":
722 # Perform Mann-Whitney U test 730 # Perform Mann-Whitney U test
723 _, p_value = st.mannwhitneyu(dataset1Data, dataset2Data) 731 _, p_value = st.mannwhitneyu(dataset1Data, dataset2Data)
734 # Calculate Z-score 742 # Calculate Z-score
735 z_score = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2)) 743 z_score = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2))
736 744
737 return p_value, z_score 745 return p_value, z_score
738 746
739 def compareDatasetPair(dataset1Data :List[List[float]], dataset2Data :List[List[float]], ids :List[str]) -> Tuple[Dict[str, List[Union[float, FoldChange]]], float]: 747 # TODO: the net RPS computation should be done in the RPS module
748 def compareDatasetPair(dataset1Data :List[List[float]], dataset2Data :List[List[float]], ids :List[str]) -> Tuple[Dict[str, List[Union[float, FoldChange]]], float, Dict[str, Tuple[np.ndarray, np.ndarray]]]:
740 #TODO: the following code still suffers from "dumbvarnames-osis" 749 #TODO: the following code still suffers from "dumbvarnames-osis"
750 netRPS :Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
741 comparisonResult :Dict[str, List[Union[float, FoldChange]]] = {} 751 comparisonResult :Dict[str, List[Union[float, FoldChange]]] = {}
742 count = 0 752 count = 0
743 max_z_score = 0 753 max_z_score = 0
744 754
745 for l1, l2 in zip(dataset1Data, dataset2Data): 755 for l1, l2 in zip(dataset1Data, dataset2Data):
754 try: position = ids.index(reactId[:-1] + ('B' if reactDir is ReactionDirection.Direct else 'F')) 764 try: position = ids.index(reactId[:-1] + ('B' if reactDir is ReactionDirection.Direct else 'F'))
755 except ValueError: continue # we look for the complementary id, if not found we skip 765 except ValueError: continue # we look for the complementary id, if not found we skip
756 766
757 nets1 = np.subtract(l1, dataset1Data[position]) 767 nets1 = np.subtract(l1, dataset1Data[position])
758 nets2 = np.subtract(l2, dataset2Data[position]) 768 nets2 = np.subtract(l2, dataset2Data[position])
769 netRPS[reactId] = (nets1, nets2)
759 770
760 p_value, z_score = computePValue(nets1, nets2) 771 p_value, z_score = computePValue(nets1, nets2)
761 avg1 = sum(nets1) / len(nets1) 772 avg1 = sum(nets1) / len(nets1)
762 avg2 = sum(nets2) / len(nets2) 773 avg2 = sum(nets2) / len(nets2)
763 net = fold_change(avg1, avg2) 774 net = fold_change(avg1, avg2)
785 # Retrieve the p-values from the comparisonResult dictionary 796 # Retrieve the p-values from the comparisonResult dictionary
786 reactIds = list(comparisonResult.keys()) 797 reactIds = list(comparisonResult.keys())
787 pValues = [comparisonResult[reactId][0] for reactId in reactIds] 798 pValues = [comparisonResult[reactId][0] for reactId in reactIds]
788 799
789 # Apply the Benjamini-Hochberg correction and update 800 # Apply the Benjamini-Hochberg correction and update
790 adjustedPValues = st.multipletests(pValues, method="fdr_bh")[1] 801 adjustedPValues = st.false_discovery_control(pValues)[1]
791 for i, reactId in enumerate(reactIds): 802 for i, reactId in enumerate(reactIds):
792 comparisonResult[reactId][0] = adjustedPValues[i] 803 comparisonResult[reactId][0] = adjustedPValues[i]
793 804
794 return comparisonResult, max_z_score 805 return comparisonResult, max_z_score, netRPS
795 806
796 def computeEnrichment(class_pat: Dict[str, List[List[float]]], ids: List[str], *, fromRAS=True) -> List[Tuple[str, str, dict, float]]: 807 def computeEnrichment(class_pat: Dict[str, List[List[float]]], ids: List[str], *, fromRAS=True) -> Tuple[List[Tuple[str, str, dict, float]], dict]:
797 """ 808 """
798 Compares clustered data based on a given comparison mode and applies enrichment-based styling on the 809 Compares clustered data based on a given comparison mode and applies enrichment-based styling on the
799 provided metabolic map. 810 provided metabolic map.
800 811
801 Args: 812 Args:
802 class_pat : the clustered data. 813 class_pat : the clustered data.
803 ids : ids for data association. 814 ids : ids for data association.
804 fromRAS : whether the data to enrich consists of RAS scores. 815 fromRAS : whether the data to enrich consists of RAS scores.
805 816
806 Returns: 817 Returns:
807 List[Tuple[str, str, dict, float]]: List of tuples with pairs of dataset names, comparison dictionary, and max z-score. 818 tuple: A tuple containing:
808 819 - List[Tuple[str, str, dict, float]]: List of tuples with pairs of dataset names, comparison dictionary and max z-score.
820 - dict : net RPS values for each dataset's reactions
821
809 Raises: 822 Raises:
810 sys.exit : if there are less than 2 classes for comparison 823 sys.exit : if there are less than 2 classes for comparison
811 """ 824 """
812 class_pat = {k.strip(): v for k, v in class_pat.items()} 825 class_pat = {k.strip(): v for k, v in class_pat.items()}
813 if (not class_pat) or (len(class_pat.keys()) < 2): 826 if (not class_pat) or (len(class_pat.keys()) < 2):
814 sys.exit('Execution aborted: classes provided for comparisons are less than two\n') 827 sys.exit('Execution aborted: classes provided for comparisons are less than two\n')
815 828
829 # { datasetName : { reactId : netRPS, ... }, ... }
830 netRPSResults :Dict[str, Dict[str, np.ndarray]] = {}
816 enrichment_results = [] 831 enrichment_results = []
817 832
818 if ARGS.comparison == "manyvsmany": 833 if ARGS.comparison == "manyvsmany":
819 for i, j in it.combinations(class_pat.keys(), 2): 834 for i, j in it.combinations(class_pat.keys(), 2):
820 comparisonDict, max_z_score = compareDatasetPair(class_pat.get(i), class_pat.get(j), ids) 835 comparisonDict, max_z_score, netRPS = compareDatasetPair(class_pat.get(i), class_pat.get(j), ids)
821 enrichment_results.append((i, j, comparisonDict, max_z_score)) 836 enrichment_results.append((i, j, comparisonDict, max_z_score))
837 netRPSResults[i] = { reactId : net[0] for reactId, net in netRPS.items() }
838 netRPSResults[j] = { reactId : net[1] for reactId, net in netRPS.items() }
822 839
823 elif ARGS.comparison == "onevsrest": 840 elif ARGS.comparison == "onevsrest":
824 for single_cluster in class_pat.keys(): 841 for single_cluster in class_pat.keys():
825 rest = [item for k, v in class_pat.items() if k != single_cluster for item in v] 842 rest = [item for k, v in class_pat.items() if k != single_cluster for item in v]
826 comparisonDict, max_z_score = compareDatasetPair(class_pat.get(single_cluster), rest, ids) 843 comparisonDict, max_z_score, netRPS = compareDatasetPair(class_pat.get(single_cluster), rest, ids)
827 enrichment_results.append((single_cluster, "rest", comparisonDict, max_z_score)) 844 enrichment_results.append((single_cluster, "rest", comparisonDict, max_z_score))
845 netRPSResults[single_cluster] = { reactId : net[0] for reactId, net in netRPS.items() }
846 netRPSResults["rest"] = { reactId : net[1] for reactId, net in netRPS.items() }
828 847
829 elif ARGS.comparison == "onevsmany": 848 elif ARGS.comparison == "onevsmany":
830 controlItems = class_pat.get(ARGS.control) 849 controlItems = class_pat.get(ARGS.control)
831 for otherDataset in class_pat.keys(): 850 for otherDataset in class_pat.keys():
832 if otherDataset == ARGS.control: 851 if otherDataset == ARGS.control:
833 continue 852 continue
834 comparisonDict, max_z_score = compareDatasetPair(controlItems, class_pat.get(otherDataset), ids) 853
854 comparisonDict, max_z_score, netRPS = compareDatasetPair(controlItems, class_pat.get(otherDataset), ids)
835 enrichment_results.append((ARGS.control, otherDataset, comparisonDict, max_z_score)) 855 enrichment_results.append((ARGS.control, otherDataset, comparisonDict, max_z_score))
836 856 netRPSResults[ARGS.control] = { reactId : net[0] for reactId, net in netRPS.items() }
837 return enrichment_results 857 netRPSResults[otherDataset] = { reactId : net[1] for reactId, net in netRPS.items() }
858
859 return enrichment_results, netRPSResults
838 860
839 def createOutputMaps(dataset1Name: str, dataset2Name: str, core_map: ET.ElementTree) -> None: 861 def createOutputMaps(dataset1Name: str, dataset2Name: str, core_map: ET.ElementTree) -> None:
840 svgFilePath = buildOutputPath(dataset1Name, dataset2Name, details="SVG Map", ext=utils.FileFormat.SVG) 862 svgFilePath = buildOutputPath(dataset1Name, dataset2Name, details="SVG Map", ext=utils.FileFormat.SVG)
841 utils.writeSvg(svgFilePath, core_map) 863 utils.writeSvg(svgFilePath, core_map)
842 864
855 877
856 if not ARGS.generate_svg: # This argument is useless, who cares if the user wants the svg or not 878 if not ARGS.generate_svg: # This argument is useless, who cares if the user wants the svg or not
857 os.remove(svgFilePath.show()) 879 os.remove(svgFilePath.show())
858 880
859 ClassPat = Dict[str, List[List[float]]] 881 ClassPat = Dict[str, List[List[float]]]
860 def getClassesAndIdsFromDatasets(datasetsPaths :List[str], datasetPath :str, classPath :str, names :List[str]) -> Tuple[List[str], ClassPat]: 882 def getClassesAndIdsFromDatasets(datasetsPaths :List[str], datasetPath :str, classPath :str, names :List[str]) -> Tuple[List[str], ClassPat, Dict[str, List[str]]]:
861 # TODO: I suggest creating dicts with ids as keys instead of keeping class_pat and ids separate, 883 # TODO: I suggest creating dicts with ids as keys instead of keeping class_pat and ids separate,
862 # for the sake of everyone's sanity. 884 # for the sake of everyone's sanity.
885 columnNames :Dict[str, List[str]] = {} # { datasetName : [ columnName, ... ], ... }
863 class_pat :ClassPat = {} 886 class_pat :ClassPat = {}
864 if ARGS.option == 'datasets': 887 if ARGS.option == 'datasets':
865 num = 1 888 num = 1
866 for path, name in zip(datasetsPaths, names): 889 for path, name in zip(datasetsPaths, names):
867 name = str(name) 890 name = str(name)
868 if name == 'Dataset': 891 if name == 'Dataset':
869 name += '_' + str(num) 892 name += '_' + str(num)
870 893
871 values, ids = getDatasetValues(path, name) 894 values, ids = getDatasetValues(path, name)
872 if values != None: 895 if values != None:
873 class_pat[name] = list(map(list, zip(*values.values()))) 896 class_pat[name] = list(map(list, zip(*values.values()))) # TODO: ???
874 # TODO: ??? 897 columnNames[name] = list(values.keys())
875 898
876 num += 1 899 num += 1
877 900
878 elif ARGS.option == "dataset_class": 901 elif ARGS.option == "dataset_class":
879 classes = read_dataset(classPath, "class") 902 classes = read_dataset(classPath, "class")
880 classes = classes.astype(str) 903 classes = classes.astype(str)
881 904
882 values, ids = getDatasetValues(datasetPath, "Dataset Class (not actual name)") 905 values, ids = getDatasetValues(datasetPath, "Dataset Class (not actual name)")
883 if values != None: class_pat = split_class(classes, values) 906 if values != None:
884 907 # TODO: add the columnNames thing, I didn't because I don't understand the whole "dataset classes" thing
885 return ids, class_pat 908 class_pat = split_class(classes, values)
909
910 return ids, class_pat, columnNames
886 #^^^ TODO: this could be a match statement over an enum, make it happen future marea dev with python 3.12! (it's why I kept the ifs) 911 #^^^ TODO: this could be a match statement over an enum, make it happen future marea dev with python 3.12! (it's why I kept the ifs)
887 912
888 #TODO: create these damn args as FilePath objects 913 #TODO: create these damn args as FilePath objects
889 def getDatasetValues(datasetPath :str, datasetName :str) -> Tuple[ClassPat, List[str]]: 914 def getDatasetValues(datasetPath :str, datasetName :str) -> Tuple[ClassPat, List[str]]:
890 """ 915 """
930 # Prepare enrichment results containers 955 # Prepare enrichment results containers
931 ras_results = [] 956 ras_results = []
932 rps_results = [] 957 rps_results = []
933 958
934 # Compute RAS enrichment if requested 959 # Compute RAS enrichment if requested
935 if ARGS.using_RAS: 960 if ARGS.using_RAS: # vvv columnNames only matter with RPS data
936 ids_ras, class_pat_ras = getClassesAndIdsFromDatasets( 961 ids_ras, class_pat_ras, _ = getClassesAndIdsFromDatasets(
937 ARGS.input_datas, ARGS.input_data, ARGS.input_class, ARGS.names) 962 ARGS.input_datas, ARGS.input_data, ARGS.input_class, ARGS.names)
938 ras_results = computeEnrichment(class_pat_ras, ids_ras, fromRAS=True) 963 ras_results, _ = computeEnrichment(class_pat_ras, ids_ras, fromRAS=True)
964 # ^^^ netRPS only matter with RPS data
939 965
940 # Compute RPS enrichment if requested 966 # Compute RPS enrichment if requested
941 if ARGS.using_RPS: 967 if ARGS.using_RPS:
942 ids_rps, class_pat_rps = getClassesAndIdsFromDatasets( 968 ids_rps, class_pat_rps, columnNames = getClassesAndIdsFromDatasets(
943 ARGS.input_datas_rps, ARGS.input_data_rps, ARGS.input_class_rps, ARGS.names_rps) 969 ARGS.input_datas_rps, ARGS.input_data_rps, ARGS.input_class_rps, ARGS.names_rps)
944 rps_results = computeEnrichment(class_pat_rps, ids_rps, fromRAS=False) 970
971 rps_results, netRPS = computeEnrichment(class_pat_rps, ids_rps, fromRAS=False)
945 972
946 # Organize by comparison pairs 973 # Organize by comparison pairs
947 comparisons: Dict[Tuple[str, str], Dict[str, Tuple]] = {} 974 comparisons: Dict[Tuple[str, str], Dict[str, Tuple]] = {}
948 for i, j, comparison_data, max_z_score in ras_results: 975 for i, j, comparison_data, max_z_score in ras_results:
949 comparisons[(i, j)] = {'ras': (comparison_data, max_z_score), 'rps': None} 976 comparisons[(i, j)] = {'ras': (comparison_data, max_z_score), 'rps': None}
950 for i, j, comparison_data, max_z_score in rps_results: 977
978 for i, j, comparison_data, max_z_score, in rps_results:
951 comparisons.setdefault((i, j), {}).update({'rps': (comparison_data, max_z_score)}) 979 comparisons.setdefault((i, j), {}).update({'rps': (comparison_data, max_z_score)})
952 980
953 # For each comparison, create a styled map with RAS bodies and RPS heads 981 # For each comparison, create a styled map with RAS bodies and RPS heads
954 for (i, j), res in comparisons.items(): 982 for (i, j), res in comparisons.items():
955 map_copy = copy.deepcopy(core_map) 983 map_copy = copy.deepcopy(core_map)
965 # applyRpsEnrichmentToMap styles only heads unless only RPS are used 993 # applyRpsEnrichmentToMap styles only heads unless only RPS are used
966 temp_thingsInCommon(tmp_rps, map_copy, max_z_rps, i, j, ras_enrichment=False) 994 temp_thingsInCommon(tmp_rps, map_copy, max_z_rps, i, j, ras_enrichment=False)
967 995
968 # Output both SVG and PDF/PNG as configured 996 # Output both SVG and PDF/PNG as configured
969 createOutputMaps(i, j, map_copy) 997 createOutputMaps(i, j, map_copy)
998
999 # Add net RPS output file
1000 if ARGS.net or not ARGS.using_RAS:
1001 for datasetName, rows in netRPS.items():
1002 writeToCsv(
1003 [[reactId, *netValues] for reactId, netValues in rows.items()],
1004 # vvv In weird comparison modes the dataset names are not recorded properly..
1005 columnNames.get(datasetName, ["Reactions", "other cool column names.."]),
1006 utils.FilePath(
1007 datasetName, ext = utils.FileFormat.CSV, prefix = ARGS.output_path))
1008
970 print('Execution succeeded') 1009 print('Execution succeeded')
971 ############################################################################### 1010 ###############################################################################
972 if __name__ == "__main__": 1011 if __name__ == "__main__":
973 main() 1012 main()