comparison COBRAxy/utils/general_utils.py @ 408:f413b78d61bf draft

Uploaded
author francesco_lapi
date Mon, 08 Sep 2025 17:12:35 +0000
parents a0b53ccc73a8
children 71850bdf9e1e
comparison
equal deleted inserted replaced
407:6619f237aebe 408:f413b78d61bf
5 import pickle 5 import pickle
6 import lxml.etree as ET 6 import lxml.etree as ET
7 7
8 from enum import Enum 8 from enum import Enum
9 from itertools import count 9 from itertools import count
10 from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union 10 from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union, Set, Tuple
11 11
12 import pandas as pd 12 import pandas as pd
13 import cobra 13 import cobra
14 from cobra import Model, Reaction, Metabolite
14 15
15 import zipfile 16 import zipfile
16 import gzip 17 import gzip
17 import bz2 18 import bz2
18 from io import StringIO 19 from io import StringIO
712 print("No annotation in gene dict!") 713 print("No annotation in gene dict!")
713 return -1 714 return -1
714 rename_genes(model2,dict_genes) 715 rename_genes(model2,dict_genes)
715 716
716 return model2 717 return model2
718
719
720 def build_cobra_model_from_csv(csv_path: str, model_id: str = "ENGRO2_custom") -> cobra.Model:
721 """
722 Costruisce un modello COBRApy a partire da un file CSV con i dati delle reazioni.
723
724 Args:
725 csv_path: Path al file CSV (separato da tab)
726 model_id: ID del modello da creare
727
728 Returns:
729 cobra.Model: Il modello COBRApy costruito
730 """
731
732 # Leggi i dati dal CSV
733 df = pd.read_csv(csv_path, sep='\t')
734
735 # Crea il modello vuoto
736 model = Model(model_id)
737
738 # Dict per tenere traccia di metaboliti e compartimenti
739 metabolites_dict = {}
740 compartments_dict = {}
741
742 print(f"Costruendo modello da {len(df)} reazioni...")
743
744 # Prima passata: estrai metaboliti e compartimenti dalle formule delle reazioni
745 for idx, row in df.iterrows():
746 reaction_formula = str(row['Reaction']).strip()
747 if not reaction_formula or reaction_formula == 'nan':
748 continue
749
750 # Estrai metaboliti dalla formula della reazione
751 metabolites = extract_metabolites_from_reaction(reaction_formula)
752
753 for met_id in metabolites:
754 compartment = extract_compartment_from_metabolite(met_id)
755
756 # Aggiungi compartimento se non esiste
757 if compartment not in compartments_dict:
758 compartments_dict[compartment] = compartment
759
760 # Aggiungi metabolita se non esiste
761 if met_id not in metabolites_dict:
762 metabolites_dict[met_id] = Metabolite(
763 id=met_id,
764 compartment=compartment,
765 name=met_id.replace(f"_{compartment}", "").replace("__", "_")
766 )
767
768 # Aggiungi compartimenti al modello
769 model.compartments = compartments_dict
770
771 # Aggiungi metaboliti al modello
772 model.add_metabolites(list(metabolites_dict.values()))
773
774 print(f"Aggiunti {len(metabolites_dict)} metaboliti e {len(compartments_dict)} compartimenti")
775
776 # Seconda passata: aggiungi le reazioni
777 reactions_added = 0
778 reactions_skipped = 0
779
780 for idx, row in df.iterrows():
781 try:
782 reaction_id = str(row['ReactionID']).strip()
783 reaction_formula = str(row['Reaction']).strip()
784
785 # Salta reazioni senza formula
786 if not reaction_formula or reaction_formula == 'nan':
787 reactions_skipped += 1
788 continue
789
790 # Crea la reazione
791 reaction = Reaction(reaction_id)
792 reaction.name = reaction_id
793
794 # Imposta bounds
795 reaction.lower_bound = float(row['lower_bound']) if pd.notna(row['lower_bound']) else -1000.0
796 reaction.upper_bound = float(row['upper_bound']) if pd.notna(row['upper_bound']) else 1000.0
797
798 # Aggiungi gene rule se presente
799 if pd.notna(row['Rule']) and str(row['Rule']).strip():
800 reaction.gene_reaction_rule = str(row['Rule']).strip()
801
802 # Parse della formula della reazione
803 try:
804 parse_reaction_formula(reaction, reaction_formula, metabolites_dict)
805 except Exception as e:
806 print(f"Errore nel parsing della reazione {reaction_id}: {e}")
807 reactions_skipped += 1
808 continue
809
810 # Aggiungi la reazione al modello
811 model.add_reactions([reaction])
812 reactions_added += 1
813
814 except Exception as e:
815 print(f"Errore nell'aggiungere la reazione {reaction_id}: {e}")
816 reactions_skipped += 1
817 continue
818
819 print(f"Aggiunte {reactions_added} reazioni, saltate {reactions_skipped} reazioni")
820
821 # Imposta l'obiettivo di biomassa
822 set_biomass_objective(model)
823
824 # Imposta il medium
825 set_medium_from_data(model, df)
826
827 print(f"Modello completato: {len(model.reactions)} reazioni, {len(model.metabolites)} metaboliti")
828
829 return model
830
831
832 # Estrae tutti gli ID metaboliti nella formula (gestisce prefissi numerici + underscore)
833 def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]:
834 """
835 Estrae gli ID dei metaboliti da una formula di reazione.
836 Pattern robusto: cattura token che terminano con _<compartimento> (es. _c, _m, _e)
837 e permette che comincino con cifre o underscore.
838 """
839 metabolites = set()
840 # coefficiente opzionale seguito da un token che termina con _<letters>
841 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)'
842 matches = re.findall(pattern, reaction_formula)
843 metabolites.update(matches)
844 return metabolites
845
846
847 def extract_compartment_from_metabolite(metabolite_id: str) -> str:
848 """
849 Estrae il compartimento dall'ID del metabolita.
850 """
851 # Il compartimento รจ solitamente l'ultima lettera dopo l'underscore
852 if '_' in metabolite_id:
853 return metabolite_id.split('_')[-1]
854 return 'c' # default cytoplasm
855
856
857 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]):
858 """
859 Parsa una formula di reazione e imposta i metaboliti con i loro coefficienti.
860 """
861
862 if reaction.id == 'EX_thbpt_e':
863 print(reaction.id)
864 print(formula)
865 # Dividi in parte sinistra e destra
866 if '<=>' in formula:
867 left, right = formula.split('<=>')
868 reversible = True
869 elif '<--' in formula:
870 left, right = formula.split('<--')
871 reversible = False
872 left, right = left, right
873 elif '-->' in formula:
874 left, right = formula.split('-->')
875 reversible = False
876 elif '<-' in formula:
877 left, right = formula.split('<-')
878 reversible = False
879 left, right = left, right
880 else:
881 raise ValueError(f"Formato reazione non riconosciuto: {formula}")
882
883 # Parse dei metaboliti e coefficienti
884 reactants = parse_metabolites_side(left.strip())
885 products = parse_metabolites_side(right.strip())
886
887 # Aggiungi metaboliti alla reazione
888 metabolites_to_add = {}
889
890 # Reagenti (coefficienti negativi)
891 for met_id, coeff in reactants.items():
892 if met_id in metabolites_dict:
893 metabolites_to_add[metabolites_dict[met_id]] = -coeff
894
895 # Prodotti (coefficienti positivi)
896 for met_id, coeff in products.items():
897 if met_id in metabolites_dict:
898 metabolites_to_add[metabolites_dict[met_id]] = coeff
899
900 reaction.add_metabolites(metabolites_to_add)
901
902
903 def parse_metabolites_side(side_str: str) -> Dict[str, float]:
904 """
905 Parsa un lato della reazione per estrarre metaboliti e coefficienti.
906 """
907 metabolites = {}
908 if not side_str or side_str.strip() == '':
909 return metabolites
910
911 terms = side_str.split('+')
912 for term in terms:
913 term = term.strip()
914 if not term:
915 continue
916
917 # pattern allineato: coefficiente opzionale + id che termina con _<compartimento>
918 match = re.match(r'(?:(\d+\.?\d*)\s+)?([A-Za-z0-9_]+_[a-z]+)', term)
919 if match:
920 coeff_str, met_id = match.groups()
921 coeff = float(coeff_str) if coeff_str else 1.0
922 metabolites[met_id] = coeff
923
924 return metabolites
925
926
927
928 def set_biomass_objective(model: Model):
929 """
930 Imposta la reazione di biomassa come obiettivo.
931 """
932 biomass_reactions = [r for r in model.reactions if 'biomass' in r.id.lower()]
933
934 if biomass_reactions:
935 model.objective = biomass_reactions[0].id
936 print(f"Obiettivo impostato su: {biomass_reactions[0].id}")
937 else:
938 print("Nessuna reazione di biomassa trovata")
939
940
941 def set_medium_from_data(model: Model, df: pd.DataFrame):
942 """
943 Imposta il medium basato sulla colonna InMedium.
944 """
945 medium_reactions = df[df['InMedium'] == True]['ReactionID'].tolist()
946
947 medium_dict = {}
948 for rxn_id in medium_reactions:
949 if rxn_id in [r.id for r in model.reactions]:
950 reaction = model.reactions.get_by_id(rxn_id)
951 if reaction.lower_bound < 0: # Solo reazioni di uptake
952 medium_dict[rxn_id] = abs(reaction.lower_bound)
953
954 if medium_dict:
955 model.medium = medium_dict
956 print(f"Medium impostato con {len(medium_dict)} componenti")
957
958
959 def validate_model(model: Model) -> Dict[str, any]:
960 """
961 Valida il modello e fornisce statistiche di base.
962 """
963 validation = {
964 'num_reactions': len(model.reactions),
965 'num_metabolites': len(model.metabolites),
966 'num_genes': len(model.genes),
967 'num_compartments': len(model.compartments),
968 'objective': str(model.objective),
969 'medium_size': len(model.medium),
970 'reversible_reactions': len([r for r in model.reactions if r.reversibility]),
971 'exchange_reactions': len([r for r in model.reactions if r.id.startswith('EX_')]),
972 }
973
974 try:
975 # Test di crescita
976 solution = model.optimize()
977 validation['growth_rate'] = solution.objective_value
978 validation['status'] = solution.status
979 except Exception as e:
980 validation['growth_rate'] = None
981 validation['status'] = f"Error: {e}"
982
983 return validation