
Main functions

The WILDkCAT package is organized into modules:

  1. Extraction: extraction of kcat values from the provided model.

  2. Retrieval: retrieval of kcat values using curated databases (BRENDA and SABIO-RK).

  3. Prediction: prediction of missing and low-confidence kcat values using the ML-based CataPro model.

  4. Summary: generation of an HTML report summarizing the percentage and quality of the kcat values identified for the model, along with their data sources.
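
A typical workflow chains these four modules in order. The sketch below is a minimal example; the functions are imported from the module paths documented on this page, and the model path, output folder, organism, thresholds, and CataPro output path are placeholders to adapt to your own setup.

from wildkcat.processing.extract_kcat import run_extraction
from wildkcat.processing.retrieve_kcat import run_retrieval
from wildkcat.processing.predict_kcat import run_prediction_part1, run_prediction_part2
from wildkcat.processing.summary import generate_summary_report

model_path = "models/my_model.xml"   # placeholder: JSON, MATLAB, or SBML model
output_folder = "results/my_model"   # all intermediate and final files are written here

# 1. Extract kcat-related data from the model -> kcat.tsv
run_extraction(model_path, output_folder)

# 2. Retrieve kcat values from BRENDA and SABIO-RK -> kcat_retrieved.tsv
#    (BRENDA queries require the BRENDA_EMAIL and BRENDA_PASSWORD environment variables)
run_retrieval(output_folder,
              organism="Escherichia coli",
              temperature_range=(30, 40),
              pH_range=(6.5, 7.5),
              database="both")

# 3a. Build the CataPro input files for missing or low-confidence entries
run_prediction_part1(output_folder, limit_matching_score=5)  # placeholder threshold

# ... run CataPro externally on machine_learning/catapro_input.csv ...

# 3b. Integrate the CataPro predictions -> kcat_full.tsv
run_prediction_part2(output_folder,
                     catapro_predictions_path="catapro_predictions.csv",  # placeholder path
                     limit_matching_score=5)

# 4. Generate the final HTML summary report
generate_summary_report(model_path, output_folder)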


Extraction

wildkcat.processing.extract_kcat.run_extraction(model_path, output_folder, report=True)

Extracts kcat-related data from a metabolic model and generates output files and an optional HTML report.

Parameters:

  model_path (str): Path to the metabolic model file (JSON, MATLAB, or SBML format). Required.
  output_folder (str): Path to the output folder where all the results will be saved. Required.
  report (bool, optional): Whether to generate an HTML report. Default: True.

Source code in wildkcat/processing/extract_kcat.py
def run_extraction(model_path: str, 
                   output_folder: str, 
                   report: bool = True) -> None:
    """
    Extracts kcat-related data from a metabolic model and generates output files and an optional HTML report.

    Parameters:
        model_path (str): Path to the metabolic model file (JSON, MATLAB, or SBML format).
        output_folder (str): Path to the output folder where all the results will be saved.
        report (bool, optional): Whether to generate an HTML report (default: True).
    """
    # Initialize logging
    os.makedirs("logs", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"logs/extract_{timestamp}.log"
    logging.getLogger().addFilter(DedupFilter())
    logging.basicConfig(filename=filename, encoding='utf-8', level=logging.INFO)

    # Run extraction
    model = read_model(model_path)
    df, report_statistics = create_kcat_output(model)

    # Save output
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "kcat.tsv")
    df.to_csv(output_path, sep='\t', index=False)
    logging.info(f"Output saved to '{output_path}'")
    if report:
        report_extraction(model, df, report_statistics, output_folder)

Retrieval

wildkcat.processing.retrieve_kcat.run_retrieval(output_folder, organism, temperature_range, pH_range, database='both', report=True)

Retrieves the closest kcat values from the specified databases for entries in a kcat file, applies filtering criteria, and saves the results to an output file.

Parameters:

  output_folder (str): Path to the output folder where the results will be saved. Required.
  organism (str): Organism name. Required.
  temperature_range (tuple): Acceptable temperature range for filtering (min, max). Required.
  pH_range (tuple): Acceptable pH range for filtering (min, max). Required.
  database (str, optional): Which database(s) to query for kcat values: 'both' (default), 'brenda', or 'sabio_rk'.
  report (bool, optional): Whether to generate an HTML report using the retrieved data. Default: True.

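For example, a minimal sketch of a retrieval call (the values are illustrative; BRENDA queries additionally require the BRENDA_EMAIL and BRENDA_PASSWORD environment variables described in the API section below):

import os
from wildkcat.processing.retrieve_kcat import run_retrieval

# Credentials for the BRENDA SOAP API are read from the environment
os.environ["BRENDA_EMAIL"] = "you@example.org"
os.environ["BRENDA_PASSWORD"] = "your-password"

run_retrieval(
    output_folder="results/my_model",      # must already contain kcat.tsv from run_extraction()
    organism="Saccharomyces cerevisiae",
    temperature_range=(28, 32),             # acceptable experimental temperature range
    pH_range=(6.0, 7.5),
    database="both",                        # 'both', 'brenda', or 'sabio_rk'
    report=True,
)
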
Source code in wildkcat/processing/retrieve_kcat.py
def run_retrieval(output_folder: str,
                  organism: str,
                  temperature_range: tuple,
                  pH_range: tuple,
                  database: str = 'both',
                  report: bool = True) -> None:
    """
    Retrieves the closest kcat values from the specified databases for entries in a kcat file, applies filtering criteria,
    and saves the results to an output file.

    Parameters:
        output_folder (str): Path to the output folder where the results will be saved.
        organism (str): Organism name.
        temperature_range (tuple): Acceptable temperature range for filtering (min, max).
        pH_range (tuple): Acceptable pH range for filtering (min, max).
        database (str, optional): Specifies which database(s) to query for kcat values. 
            Options are 'both' (default), 'brenda', or 'sabio_rk'.
        report (bool, optional): Whether to generate an HTML report using the retrieved data (default: True).        
    """
    # Initialize logging
    os.makedirs("logs", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"logs/retrieval_{timestamp}.log"
    logging.getLogger().addFilter(DedupFilter())
    logging.basicConfig(filename=filename, encoding='utf-8', level=logging.INFO)


    # Create a dict with the general criteria
    general_criteria = {
        "Organism": organism,
        "Temperature": temperature_range,
        "pH": pH_range
    }

    # Read the kcat file
    if not os.path.exists(output_folder):
        raise FileNotFoundError(f"The specified output folder '{output_folder}' does not exist.")

    kcat_file_path = os.path.join(output_folder, "kcat.tsv")
    if not os.path.isfile(kcat_file_path):
        raise FileNotFoundError(f"The specified file '{kcat_file_path}' does not exist in the output folder. Please run the function 'run_extraction()' first.")

    kcat_df = pd.read_csv(kcat_file_path, sep='\t')

    # Initialize new columns
    kcat_df['kcat'] = None
    kcat_df['matching_score'] = None

    # Add columns for the retrieved kcat metadata
    kcat_df['kcat_substrate'] = None
    kcat_df['kcat_organism'] = None
    kcat_df['kcat_enzyme'] = None
    kcat_df['kcat_temperature'] = None
    kcat_df['kcat_ph'] = None
    kcat_df['kcat_variant'] = None
    kcat_df['kcat_db'] = None

    # Retrieve kcat values from databases
    request_count = 0
    for row in tqdm(kcat_df.itertuples(), total=len(kcat_df), desc="Retrieving kcat values"):
        kcat_dict = row._asdict()

        # Extract kcat and matching score
        best_match, matching_score = extract_kcat(kcat_dict, general_criteria, database=database)
        kcat_df.loc[row.Index, 'matching_score'] = matching_score

        request_count += 1
        if request_count % 300 == 0:
            time.sleep(10)

        if best_match is not None:
            # Assign results to the main dataframe
            kcat_df.loc[row.Index, 'kcat'] = best_match['adj_kcat']
            kcat_df.loc[row.Index, 'kcat_substrate'] = best_match['Substrate']
            kcat_df.loc[row.Index, 'kcat_organism'] = best_match['Organism']
            kcat_df.loc[row.Index, 'kcat_enzyme'] = best_match['UniProtKB_AC']
            kcat_df.loc[row.Index, 'kcat_temperature'] = best_match['adj_temp']
            kcat_df.loc[row.Index, 'kcat_ph'] = best_match['pH']
            kcat_df.loc[row.Index, 'kcat_variant'] = best_match['EnzymeVariant']
            kcat_df.loc[row.Index, 'kcat_db'] = best_match['db']
            if best_match.get('id_perc') != -1:
                kcat_df.loc[row.Index, 'kcat_id_percent'] = best_match['id_perc']
            if best_match.get('organism_score') != np.inf:
                kcat_df.loc[row.Index, 'kcat_organism_score'] = best_match['organism_score']

    output_path = os.path.join(output_folder, "kcat_retrieved.tsv")
    kcat_df.to_csv(output_path, sep='\t', index=False)
    logging.info(f"Output saved to '{output_path}'")

    if report:
        report_retrieval(kcat_df, output_folder)

Prediction

wildkcat.processing.predict_kcat.run_prediction_part1(output_folder, limit_matching_score, report=True)

Processes kcat data file to generate input files for CataPro prediction. Optionally, it can produce a summary report of the processed data.

Parameters:

  output_folder (str): Path to the output folder where the results will be saved. Required.
  limit_matching_score (int): Threshold for filtering entries based on matching score. Required.
  report (bool, optional): Whether to generate a report using the retrieved data. Default: True.

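A minimal sketch of this step (the matching-score threshold is a placeholder; entries with no retrieved value, or with a matching score at or above the threshold, are sent to CataPro):

from wildkcat.processing.predict_kcat import run_prediction_part1

# Writes machine_learning/catapro_input.csv and the substrate-to-SMILES mapping
# inside the output folder; CataPro itself is then run outside of WILDkCAT.
run_prediction_part1(output_folder="results/my_model",
                     limit_matching_score=5,   # placeholder threshold
                     report=True)
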
Source code in wildkcat/processing/predict_kcat.py
def run_prediction_part1(output_folder: str,
                         limit_matching_score: int, 
                         report: bool = True) -> None:
    """
    Processes kcat data file to generate input files for CataPro prediction.
    Optionally, it can produce a summary report of the processed data.

    Parameters:
        output_folder (str): Path to the output folder where the results will be saved.
        limit_matching_score (int): Threshold for filtering entries based on matching score.
        report (bool, optional): Whether to generate a report using the retrieved data (default: True). 
    """
    # Initialize logging
    os.makedirs("logs", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"logs/prediction1_{timestamp}.log"
    logging.getLogger().addFilter(DedupFilter())
    logging.basicConfig(filename=filename, encoding='utf-8', level=logging.INFO)

    # Run prediction part 1
    # Read the kcat file
    if not os.path.exists(output_folder):
        raise FileNotFoundError(f"The specified output folder '{output_folder}' does not exist.")

    kcat_file_path = os.path.join(output_folder, "kcat_retrieved.tsv")
    if not os.path.isfile(kcat_file_path):
        raise FileNotFoundError(f"The specified file '{kcat_file_path}' does not exist in the output folder. Please run the function 'run_retrieval()' first.")

    kcat_df = pd.read_csv(kcat_file_path, sep='\t')

    # Keep rows with no matching score or a matching score at or above the limit
    kcat_df = kcat_df[(kcat_df['matching_score'] >= limit_matching_score) | (kcat_df['matching_score'].isnull())]
    # Drop rows with no UniProt ID or no substrates_kegg
    before_duplicates_filter = len(kcat_df) - 1 
    kcat_df = kcat_df[kcat_df['uniprot'].notnull() & kcat_df['substrates_kegg'].notnull()]
    nb_missing_enzymes = before_duplicates_filter - len(kcat_df)

    # Generate CataPro input file
    catapro_input_df, substrates_to_smiles_df, report_statistics = create_catapro_input_file(kcat_df)

    # Save the CataPro input file and substrates to SMILES mapping
    os.makedirs(os.path.join(output_folder, "machine_learning"), exist_ok=True)
    output_path = os.path.join(output_folder, "machine_learning/catapro_input.csv")
    catapro_input_df.to_csv(output_path, sep=',', index=True)
    substrates_to_smiles_df.to_csv(output_path.replace('.csv', '_substrates_to_smiles.tsv'), sep='\t', index=False)
    logging.info(f"Output saved to '{output_path}'")

    # Add statistics 
    report_statistics["missing_enzymes"] = nb_missing_enzymes

    if report:
        report_prediction_input(catapro_input_df, report_statistics, output_folder)

wildkcat.processing.predict_kcat.run_prediction_part2(output_folder, catapro_predictions_path, limit_matching_score)

Runs the second part of the kcat prediction pipeline by integrating CataPro predictions, mapping substrates to SMILES, formatting the output, and optionally generating a report.

Parameters:

  output_folder (str): Path to the output folder where the results will be saved. Required.
  catapro_predictions_path (str): Path to the CataPro predictions CSV file. Required.
  limit_matching_score (float): Threshold for taking predictions over retrieved values. Required.

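Once CataPro has been run on the generated input file, its predictions are merged back into the retrieved data. A minimal sketch (the predictions path is a placeholder for wherever CataPro wrote its output):

from wildkcat.processing.predict_kcat import run_prediction_part2

run_prediction_part2(output_folder="results/my_model",
                     catapro_predictions_path="results/my_model/machine_learning/catapro_predictions.csv",  # placeholder
                     limit_matching_score=5)   # predictions are used where the retrieved score is at or above this value

# The combined result is written to results/my_model/kcat_full.tsv
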
Source code in wildkcat/processing/predict_kcat.py
def run_prediction_part2(output_folder: str,
                         catapro_predictions_path: str,
                         limit_matching_score: int) -> None:
    """
    Runs the second part of the kcat prediction pipeline by integrating Catapro predictions,
    mapping substrates to SMILES, formatting the output, and optionally generating a report.

    Parameters:
        output_folder (str): Path to the output folder where the results will be saved.
        catapro_predictions_path (str): Path to the CataPro predictions CSV file.
        limit_matching_score (float): Threshold for taking predictions over retrieved values.
    """ 
    # Initialize logging
    os.makedirs("logs", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"logs/prediction2_{timestamp}.log"
    logging.getLogger().addFilter(DedupFilter())
    logging.basicConfig(filename=filename, encoding='utf-8', level=logging.INFO)

    # Run prediction part 2
    # Read the kcat file
    if not os.path.exists(output_folder):
        raise FileNotFoundError(f"The specified output folder '{output_folder}' does not exist.")
    kcat_file_path = os.path.join(output_folder, "kcat_retrieved.tsv")
    if not os.path.isfile(kcat_file_path):
        raise FileNotFoundError(f"The specified file '{kcat_file_path}' does not exist in the output folder. Please run the function 'run_extraction()' first.")
    kcat_df = pd.read_csv(kcat_file_path, sep='\t')
    substrates_to_smiles_path = os.path.join(output_folder, "machine_learning/catapro_input_substrates_to_smiles.tsv")
    substrates_to_smiles = pd.read_csv(substrates_to_smiles_path, sep='\t')
    catapro_predictions_df = pd.read_csv(catapro_predictions_path, sep=',')
    kcat_df = integrate_catapro_predictions(kcat_df, 
                                            substrates_to_smiles,
                                            catapro_predictions_df
                                            )

    # Save the output as a TSV file
    kcat_df = format_output(kcat_df, limit_matching_score)
    output_path = os.path.join(output_folder, "kcat_full.tsv")
    kcat_df.to_csv(output_path, sep='\t', index=False)
    logging.info(f"Output saved to '{output_path}'")

Summary report

wildkcat.processing.summary.generate_summary_report(model_path, output_folder)

Generate an HTML report summarizing the kcat extraction, retrieval, and prediction for a given model.

Parameters:

  model_path (str): Path to the metabolic model file (JSON, MATLAB, or SBML format). Required.
  output_folder (str): Path to the output folder where the kcat file is located. Required.

Source code in wildkcat/processing/summary.py
def generate_summary_report(model_path: str,
                            output_folder: str) -> None:
    """
    Generate an HTML report summarizing the kcat extraction, retrieval, and prediction for a given model.

    Parameters:
        model_path (str): Path to the metabolic model file (JSON, MATLAB, or SBML format).
        output_folder (str): Path to the output folder where the kcat file is located.
    """
    # Read the kcat file
    if not os.path.exists(output_folder):
        raise FileNotFoundError(f"The specified output folder '{output_folder}' does not exist.")

    kcat_file_path = os.path.join(output_folder, "kcat_full.tsv")
    if not os.path.isfile(kcat_file_path):
        raise FileNotFoundError(f"The specified file '{kcat_file_path}' does not exist in the output folder. Please run the full pipeline.")
    kcat_df = pd.read_csv(kcat_file_path, sep='\t')
    model = read_model(model_path)
    report_final(model, kcat_df, output_folder)

Matching process and scoring

The matching process is designed to select the most appropriate kcat value when multiple candidates are available.
Each candidate is first assigned a score based on several criteria, such as:

  • kcat specific criteria:
    • Substrate
    • Catalytic enzyme(s)
  • General criteria:
    • Organism
    • Temperature
    • pH

If two or more candidates receive the same score, tie-breaking rules are applied in the following order:

  1. Enzyme sequence identity – the value associated with the most similar protein sequence is preferred.
  2. Organism proximity – preference is given to kcat values measured in organisms closest to the target species.
  3. Minimal kcat value – if ambiguity remains, the smallest kcat value is chosen.
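
As an illustration of how a candidate's score is assembled, the sketch below mirrors the penalty values used by the check_* helpers documented further down (lower is better); the specific candidate is invented purely for illustration.

# Hypothetical candidate: different enzyme but same organism, matching substrate,
# unknown variant, pH in range, temperature missing.
enzyme_penalty      = 2   # check_catalytic_enzyme: candidate enzyme not among the model's catalytic enzymes
organism_penalty    = 0   # check_organism: evaluated only because the enzyme did not match
variant_penalty     = 1   # check_variant: variant is unknown (not explicitly 'wildtype')
ph_penalty          = 0   # check_pH: candidate pH falls inside the requested range
substrate_penalty   = 0   # check_substrate: substrates intersect with the model reaction
temperature_penalty = 1   # check_temperature: no temperature reported and Arrhenius not applicable

score = (enzyme_penalty + organism_penalty + variant_penalty
         + ph_penalty + substrate_penalty + temperature_penalty)
print(score)  # 4 -- candidates are compared on this total, and the lowest score wins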

wildkcat.utils.matching

check_catalytic_enzyme(candidate, kcat_dict)

Checks whether the enzyme in a candidate entry matches the model's enzyme. Identifies the catalytic enzyme using UniProt API.

Source code in wildkcat/utils/matching.py
def check_catalytic_enzyme(candidate, kcat_dict): 
    """
    Checks whether the enzyme in a candidate entry matches the model's enzyme.
    Identifies the catalytic enzyme using UniProt API.
    """
    if pd.notna(kcat_dict['catalytic_enzyme']):
        catalytic_enzymes = str(kcat_dict['catalytic_enzyme']).split(";")
        if candidate["UniProtKB_AC"] in catalytic_enzymes:
            return 0
    return 2

check_organism(candidate, general_criteria)

Checks whether the organism in a candidate entry matches the expected organism.

Source code in wildkcat/utils/matching.py
def check_organism(candidate, general_criteria): 
    """
    Checks whether the organism in a candidate entry matches the expected organism.
    """
    if candidate["Organism"] == general_criteria["Organism"]:
        return 0
    return 2

check_pH(candidate, general_criteria)

Checks whether the pH in a candidate entry matches the expected pH.

Source code in wildkcat/utils/matching.py
def check_pH(candidate, general_criteria):
    """
    Checks whether the pH in a candidate entry matches the expected pH.
    """
    ph_min, ph_max = general_criteria["pH"]
    candidate_ph = candidate.get("pH", None)
    if ph_min <= candidate_ph <= ph_max:
        return 0
    elif pd.isna(candidate_ph):
        return 1
    else:  # Out of range
        return 2

check_substrate(entry, kcat_dict=None, candidate=None)

Checks whether the substrate in a candidate entry matches the model's substrates.

Source code in wildkcat/utils/matching.py
def check_substrate(entry, kcat_dict=None, candidate=None):
    """
    Checks whether the substrate in a candidate entry matches the model's substrates.
    """
    api = entry.get("db", candidate.get("db") if candidate else None)

    # Normalise the fields
    entry_subs = entry.get("Substrate", "")
    entry_prods = entry.get("Product", "")
    entry_kegg = entry.get("KeggReactionID")

    cand_subs = candidate.get("Substrate", "") if candidate else ""
    cand_prods = candidate.get("Product", "") if candidate else ""
    cand_kegg = candidate.get("KeggReactionID") if candidate else ""

    model_subs = (kcat_dict or {}).get("substrates_name", "")
    model_prods = (kcat_dict or {}).get("products_name", "")
    model_kegg = (kcat_dict or {}).get("rxn_kegg")

    # --- same logic as before ---
    if api == "sabio_rk":

        entry_kegg = None if pd.isna(entry_kegg) else entry_kegg
        model_kegg = None if pd.isna(model_kegg) else model_kegg
        cand_kegg  = None if pd.isna(cand_kegg) else cand_kegg

        if model_kegg and entry_kegg and _norm_name(model_kegg) == _norm_name(entry_kegg):
            if _any_intersection(entry_subs, model_subs) or _any_intersection(entry_prods, model_prods):
                return 0
        if cand_kegg and entry_kegg and _norm_name(cand_kegg) == _norm_name(entry_kegg):
            if _any_intersection(entry_subs, cand_subs) or _any_intersection(entry_prods, cand_prods):
                return 0
        base_subs = model_subs or cand_subs
        if _any_intersection(entry_subs, base_subs):
            return 0
        return 3

    elif api == "brenda":
        base_subs = model_subs or cand_subs
        if _any_intersection(entry_subs, base_subs):
            return 0
        return 3

    return 3

check_temperature(candidate, general_criteria, api_output, min_r2=0.8, expected_range=(50000, 150000))

Checks whether the temperature in a candidate entry matches the expected temperature. If the temperature is not within the specified range, checks whether the Arrhenius equation can be applied.

Source code in wildkcat/utils/matching.py
def check_temperature(candidate, general_criteria, api_output, min_r2=0.8, expected_range=(50000, 150000)): 
    """
    Checks whether the temperature in a candidate entry matches the expected temperature.
    If the temperature is not within the specified range, checks whether the Arrhenius equation can be applied.
    """

    temp_min, temp_max = general_criteria["Temperature"]
    candidate_temp = candidate.get("Temperature")

    if temp_min <= candidate_temp <= temp_max:
        return 0, False

    # Try to correct the kcat value using the Arrhenius equation
    ph_min, ph_max = general_criteria["pH"]

    # Base filters
    filters = (
        api_output["pH"].between(ph_min, ph_max)
        & (api_output["UniProtKB_AC"] == candidate["UniProtKB_AC"])
        & api_output["Temperature"].notna()
        & api_output["value"].notna()
    )

    valid_idx = api_output.apply(
        lambda row: check_substrate(row.to_dict(), None, candidate) == 0,
        axis=1
        )

    filters = filters & valid_idx

    temps_dispo = api_output.loc[filters, "Temperature"].nunique()
    api_filtered = api_output.loc[filters, ["Temperature", "value"]].copy()

    # Convert temperatures to Kelvin
    api_filtered["Temperature"] = api_filtered["Temperature"] + 273.15

    if temps_dispo >= 2:
        ea, r2 = calculate_ea(api_filtered)
        if r2 >= min_r2 and ea > 0:
            if not (expected_range[0] <= ea <= expected_range[1]):
                logging.warning(f"{candidate.get('ECNumber')}: Estimated Ea ({ea:.0f} J/mol) is outside the expected range {expected_range} J/mol.")
            # Go Arrhenius
            return 0, True

    if pd.isna(candidate_temp):
        return 1, False

    else:
        return 2, False

check_variant(candidate)

Checks whether the enzyme variant in a candidate entry is wildtype or unknown.

Source code in wildkcat/utils/matching.py
def check_variant(candidate):
    """
    Checks whether the enzyme variant in a candidate entry is wildtype or unknown.
    """
    if candidate["EnzymeVariant"] == "wildtype":
        return 0
    else:  # Unknown
        return 1

compute_score(kcat_dict, candidate, general_criteria, api_output)

Compute a score for the candidate based on the Kcat dictionary and general criteria.

Source code in wildkcat/utils/matching.py
def compute_score(kcat_dict, candidate, general_criteria, api_output):
    """
    Compute a score for the candidate based on the Kcat dictionary and general criteria.
    """
    score = 0
    # Check catalytic enzyme
    score += check_catalytic_enzyme(candidate, kcat_dict)
    # Check organism
    if score != 0: 
        score += check_organism(candidate, general_criteria)
    # Check variant
    score += check_variant(candidate) 
    # Check pH
    score += check_pH(candidate, general_criteria)
    # Check substrate 
    score += check_substrate(candidate, kcat_dict)
    # Check temperature 
    temperature_penalty, arrhenius = check_temperature(candidate, general_criteria, api_output) 
    score += temperature_penalty
    return score, arrhenius

find_best_match(kcat_dict, api_output, general_criteria)

Finds the best matching enzyme entry from the provided API output based on:
  • Kcat specific criteria:
    • Substrate
    • Catalytic enzyme(s)
  • General criteria:
    • Organism
    • Temperature
    • pH

This function filters out mutant enzyme variants, orders the remaining entries based on enzyme and organism similarity, and iteratively computes a score for each candidate to identify the best match. If a candidate requires an Arrhenius adjustment, the kcat value is recalculated accordingly.

Parameters:

  kcat_dict (dict): Dictionary containing enzyme information. Required.
  api_output (pd.DataFrame): DataFrame containing kcat entries and metadata from an API. Required.
  general_criteria (dict): Dictionary specifying matching criteria. Required.

Returns:

  tuple (Tuple[float, Optional[Dict[str, Any]]]):
    best_score (float): The lowest score found, representing the best match.
    best_candidate (dict or None): Dictionary of the best matching candidate's data, or None if no match is found.

Source code in wildkcat/utils/matching.py
def find_best_match(kcat_dict, api_output, general_criteria) -> Tuple[float, Optional[Dict[str, Any]]]:
    """
    Finds the best matching enzyme entry from the provided API output based on: 
        - Kcat specific criteria: 
            * Substrate 
            * Catalytic enzyme(s)
        - General criteria : 
            * Organism
            * Temperature
            * pH

    This function filters out mutant enzyme variants, orders the remaining entries based on enzyme and organism similarity,
    and iteratively computes a score for each candidate to identify the best match. If a candidate requires an Arrhenius
    adjustment, the kcat value is recalculated accordingly.

    Parameters:
        kcat_dict (dict): Dictionary containing enzyme information.
        api_output (pd.DataFrame): DataFrame containing kcat entries and metadata from an API.
        general_criteria (dict): Dictionary specifying matching criteria.

    Returns:
        tuple:
            best_score (float): The lowest score found, representing the best match.
            best_candidate (dict or None): Dictionary of the best matching candidate's data, or None if no match is found.
    """

    # 1. Remove mutant enzymes
    api_output = api_output[api_output["EnzymeVariant"].isin(['wildtype', None])]
    if api_output.empty:
        return 14, None

    # 2. Compute score and adjust kcat if needed
    scores = []
    adjusted_kcats, adjusted_temps = [], []

    for _, row in api_output.iterrows():
        candidate_dict = row.to_dict()
        score, arrhenius = compute_score(kcat_dict, candidate_dict, general_criteria, api_output)
        if arrhenius:
            kcat = arrhenius_equation(candidate_dict, api_output, general_criteria)
            candidate_dict['value'] = kcat
            candidate_dict['Temperature'] = np.mean(general_criteria["Temperature"])
        scores.append(score)
        adjusted_kcats.append(candidate_dict.get('value', row['value']))
        adjusted_temps.append(candidate_dict.get('Temperature', row['Temperature']))

    api_output = api_output.copy()
    api_output['score'] = scores
    api_output['adj_kcat'] = adjusted_kcats
    api_output['adj_temp'] = adjusted_temps

    api_output["score"] = pd.to_numeric(api_output["score"], errors="coerce").fillna(13)
    api_output["adj_kcat"] = pd.to_numeric(api_output["adj_kcat"], errors="coerce")

    # Initialize columns for tie-breaking
    api_output['id_perc'] = -1
    api_output['organism_score'] = np.inf

    # 3. Keep only best-score candidates
    min_score = api_output['score'].min()
    tied = api_output[api_output['score'] == min_score]

    # 4. Tie-breaking
    if len(tied) > 1:
        # Tie-break with enzyme identity
        tied = closest_enz(kcat_dict, tied)
        if not tied['id_perc'].isna().all():
            max_id = tied['id_perc'].max()
            tied = tied[tied['id_perc'] == max_id]

    if len(tied) > 1:
        # Tie-break with taxonomy
        tied = closest_taxonomy(general_criteria, tied)
        if not tied['organism_score'].isna().all():
            min_tax = tied['organism_score'].min()
            tied = tied[tied['organism_score'] == min_tax]

    if len(tied) > 1:
        # Tie-break with lowest kcat
        min_kcat = tied['adj_kcat'].min()
        tied = tied[tied['adj_kcat'] == min_kcat]

    # 5. Select best candidate
    best_candidate = tied.iloc[0].to_dict()
    best_candidate['catalytic_enzyme'] = kcat_dict.get('catalytic_enzyme')
    best_score = best_candidate['score']

    return best_score, best_candidate

Find closest enzyme and organism

wildkcat.utils.organism

closest_enz(kcat_dict, api_output)

Retrieves and ranks the enzyme sequences closest to the target enzyme's sequence, based on the percentage of identity. If the reference UniProt ID is missing, invalid, or the sequence cannot be retrieved, the function returns the input DataFrame with "id_perc" set to None.

Parameters:

  kcat_dict (dict): Dictionary containing at least the key 'catalytic_enzyme' with the reference UniProt ID.
  api_output (pd.DataFrame): DataFrame containing a column "UniProtKB_AC" with UniProt IDs to compare against.

Returns:

  pd.DataFrame: A copy of api_output with an added "id_perc" column (identity percentage).

Source code in wildkcat/utils/organism.py
def closest_enz(kcat_dict, api_output) -> pd.DataFrame:
    """
    Retrieves and ranks the enzyme sequences closest to the target enzyme's sequence, based on the percentage of identity.
    If the reference UniProt ID is missing, invalid, or the sequence cannot be retrieved, the function returns the input DataFrame with "id_perc" set to None.

    Parameters:    
        kcat_dict (dict): Dictionary containing at least the key 'catalytic_enzyme' with the reference UniProt ID.
        api_output (pd.DataFrame): DataFrame containing a column "UniProtKB_AC" with UniProt IDs to compare against.

    Returns:
        pd.DataFrame: A copy of `api_output` with an added "id_perc" column (identity percentage). 
    """

    def _calculate_identity(seq_ref, seq_db):
        """
        Returns the percentage of identical characters between two sequences.
        Adapted from https://gist.github.com/JoaoRodrigues/8c2f7d2fc5ae38fc9cb2 

        Parameters: 
            seq_ref (str): The reference sequence.
            seq_db (str): The sequence to compare against.

        Returns: 
            float: The percentage of identical characters between the two sequences.
        """
        matches = [a == b for a, b in zip(seq_ref, seq_db)]
        return (100 * sum(matches)) / len(seq_ref)

    ref_uniprot_id = kcat_dict.get('catalytic_enzyme')
    if pd.isna(ref_uniprot_id) or (";" in str(ref_uniprot_id)):
        api_output = api_output.copy()
        api_output["id_perc"] = None
        return api_output

    ref_seq = convert_uniprot_to_sequence(ref_uniprot_id)
    if ref_seq is None:
        api_output = api_output.copy()
        api_output["id_perc"] = None
        return api_output

    aligner = Align.PairwiseAligner()
    identity_scores = []

    for uniprot_id in api_output["UniProtKB_AC"]:
        if pd.isna(uniprot_id):
            identity_scores.append(None)
            continue
        seq = convert_uniprot_to_sequence(uniprot_id)
        if seq is None:
            identity_scores.append(None)
            continue
        elif len(seq) == 0:
            identity_scores.append(0)
            continue

        alignments = aligner.align(ref_seq, seq)
        aligned_ref, aligned_db = alignments[0]
        id_score = _calculate_identity(aligned_ref, aligned_db)
        identity_scores.append(id_score)

    api_output = api_output.copy()
    api_output["id_perc"] = identity_scores

    return api_output

closest_taxonomy(general_criteria, api_output)

Retrieves and ranks organisms based on their taxonomic similarity to the reference organism.

Parameters:

  general_criteria (dict): Dictionary containing at least the key 'Organism' with the reference organism.
  api_output (pd.DataFrame): DataFrame containing a column "Organism".

Returns:

  pd.DataFrame: A copy of api_output with an added "organism_score" column.

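A minimal usage sketch (NCBI Entrez lookups require the ENTREZ_EMAIL environment variable; the organisms below are illustrative):

import os
import pandas as pd
from wildkcat.utils.organism import closest_taxonomy

os.environ["ENTREZ_EMAIL"] = "you@example.org"   # used by Bio.Entrez for the taxonomy queries

general_criteria = {"Organism": "Escherichia coli"}
candidates = pd.DataFrame({"Organism": ["Escherichia coli", "Salmonella enterica", "Homo sapiens"]})

ranked = closest_taxonomy(general_criteria, candidates)
# 'organism_score' is 0 for the same species and grows with taxonomic distance
print(ranked.sort_values("organism_score"))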

Source code in wildkcat/utils/organism.py
def closest_taxonomy(general_criteria, api_output) -> pd.DataFrame: 
    """
    Retrieves and ranks organisms based on their taxonomic similarity to the reference organism.

    Parameters:    
        general_criteria (dict): Dictionary containing at least the key 'Organism' with the reference organism.
        api_output (pd.DataFrame): DataFrame containing a column "Organism". 

    Returns:
        pd.DataFrame: A copy of `api_output` with an added "organism_score" column.
    """
    @lru_cache(maxsize=None)
    def _fetch_taxonomy(species_name): 
        """
        Fetches the taxonomic lineage for a given species name using NCBI Entrez.

        Parameters:
            species_name (str): The name of the species.

        Returns: 
            list: A list of scientific names representing the taxonomic lineage.
        """
        Entrez.email = os.getenv("ENTREZ_EMAIL")
        handle = Entrez.esearch(db="taxonomy", term=species_name)
        record = Entrez.read(handle)
        if not record["IdList"]:
            return []
        tax_id = record["IdList"][0]

        handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
        records = Entrez.read(handle)
        if not records:
            return []

        lineage = [taxon["ScientificName"] for taxon in records[0]["LineageEx"]]
        lineage.append(records[0]["ScientificName"])  # include the species itself
        return lineage

    @lru_cache(maxsize=None)
    def _calculate_taxonomy_score(ref_organism, target_organism): 
        """
        Calculate a taxonomy distance score between reference and target organisms.

        Parameters: 
            ref_organism (str): The reference organism's name.
            target_organism (str): The target organism's name.

        Returns:
            int: distance between reference and target organisms (0 = identical species, higher = more distant).
        """
        ref_lineage = _fetch_taxonomy(ref_organism)
        target_lineage = _fetch_taxonomy(target_organism)

        if not target_lineage: # If target organism is not found
            return len(ref_lineage) + 1  # Penalize missing taxonomy

        similarity = 0

        for taxon in target_lineage: 
            if taxon in ref_lineage:
                similarity += 1
            else:
                break
        return len(ref_lineage) - similarity


    ref_organism = general_criteria['Organism']
    api_output = api_output.copy()
    api_output["organism_score"] = [
        _calculate_taxonomy_score(ref_organism, target) 
        for target in api_output["Organism"]
    ]
    return api_output

Correct the kcat value using the Arrhenius equation

wildkcat.utils.temperature

arrhenius_equation(candidate, api_output, general_criteria)

Estimates the kcat value at a target temperature using the Arrhenius equation, based on available experimental data.

Parameters:

  candidate (dict): Information about the enzyme candidate.
  api_output (pd.DataFrame): DataFrame containing experimental kcat values.
  general_criteria (dict): Dictionary specifying selection criteria, including 'Temperature' and 'pH'.

Returns:

  float: Estimated kcat value at the objective temperature, calculated using the Arrhenius equation.

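To illustrate the underlying arithmetic, here is a small standalone sketch of the same two steps used by these utilities (fit an activation energy from ln(kcat) versus 1/T, then extrapolate a reference kcat to the target temperature); the numbers are invented purely for illustration.

import numpy as np

R = 8.314  # gas constant, J/(mol*K)

# Made-up measurements: kcat (1/s) at 25 °C and 35 °C, converted to Kelvin
temps_K = np.array([25.0, 35.0]) + 273.15
kcats = np.array([10.0, 25.0])

# Arrhenius fit: ln(kcat) = -Ea/R * (1/T) + ln(A)
slope, intercept = np.polyfit(1 / temps_K, np.log(kcats), 1)
ea = -slope * R  # activation energy in J/mol

# Extrapolate the 25 °C measurement to a 30 °C target temperature
t_ref, kcat_ref = temps_K[0], kcats[0]
t_obj = 30.0 + 273.15
kcat_obj = kcat_ref * np.exp(ea / R * (1 / t_ref - 1 / t_obj))
print(f"Ea = {ea:.0f} J/mol, kcat at 30 °C ≈ {kcat_obj:.1f} 1/s")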

Source code in wildkcat/utils/temperature.py
def arrhenius_equation(candidate, api_output, general_criteria) -> float:
    """
    Estimates the kcat value at a target temperature using the Arrhenius equation, based on available experimental data.

    Parameters:
        candidate (dict): Information about the enzyme candidate.
        api_output (pd.DataFrame): DataFrame containing experimental kcat values.
        general_criteria (dict): Dictionary specifying selection criteria, including 'Temperature' and 'pH'.

    Returns:
        float: Estimated kcat value at the objective temperature, calculated using the Arrhenius equation.
    """

    def calculate_kcat(temp_obj, ea, kcat_ref, temp_ref): 
        """
        Calculates the catalytic rate constant (kcat) at a given temperature using the Arrhenius equation.

        Parameters: 
            temp_obj (float): The target temperature (in Kelvin) at which to calculate kcat.
            ea (float): The activation energy estimated with calculate_ea().
            kcat_ref (float): The reference kcat value measured at temp_ref.
            temp_ref (float): The reference temperature (in Kelvin) at which kcat_ref was measured.

        Returns: 
            float: The calculated kcat value at temp_obj.
        """
        r = 8.314
        kcat_obj = kcat_ref * np.exp(ea / r * (1/temp_ref - 1/temp_obj))
        return kcat_obj

    # Objective temperature
    obj_temp = np.mean(general_criteria["Temperature"]) + 273.15

    # Format the api_output DataFrame
    ph_min, ph_max = general_criteria["pH"]
    filters = (
        (api_output["UniProtKB_AC"] == candidate["UniProtKB_AC"]) &
        api_output["Temperature"].notna() &
        api_output["value"].notna() &
        api_output["pH"].between(ph_min, ph_max)
    )
    api_filtered = api_output.loc[filters, ["Temperature", "value"]].copy()

    # Convert temperatures to Kelvin
    api_filtered["Temperature"] = api_filtered["Temperature"] + 273.15

    # Estimate the activation energy (Ea)
    ea, _ = calculate_ea(api_filtered)

    # Select one kcat for the ref
    kcat_ref = float(api_filtered['value'].iloc[0])
    temp_ref = float(api_filtered['Temperature'].iloc[0])

    kcat = calculate_kcat(obj_temp, ea, kcat_ref, temp_ref)
    return kcat

calculate_ea(df)

Estimate the activation energy (Ea) using the Arrhenius equation from kcat values at different temperatures.

Parameters:

  df (pd.DataFrame): DataFrame with at least 'Temperature' (in Kelvin) and 'value' (kcat) columns.

Returns:

  tuple (float, float): Estimated activation energy (Ea) in J/mol and the R² of the Arrhenius fit.

Source code in wildkcat/utils/temperature.py
def calculate_ea(df) -> tuple[float, float]:
    """
    Estimate the activation energy (Ea) using the Arrhenius equation from kcat values at different temperatures.

    Parameters:
        df (pd.DataFrame): DataFrame with at least 'Temperature' (in Kelvin) and 'value' (kcat) columns.

    Returns:
        tuple[float, float]: Estimated activation energy (Ea) in J/mol and the R² of the Arrhenius fit.
    """

    r = 8.314  # Gas constant in J/(mol*K)

    # Filter out rows with missing values
    valid = df[['Temperature', 'value']].dropna()

    temps_K = valid['Temperature'].values
    kcats = pd.to_numeric(valid['value'], errors='coerce').values

    x = 1 / temps_K
    y = np.log(kcats)
    slope, intercept = np.polyfit(x, y, 1)

    # R2 
    y_pred = slope * x + intercept
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    r2 = 1 - ss_res / ss_tot if ss_tot != 0 else np.nan

    # Activation energy 
    ea = float(-slope * r)

    return ea, r2 

API

wildkcat.api.brenda_api

create_brenda_client(wsdl_url='https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl')

Creates and configures a persistent SOAP client for the BRENDA API.

Parameters:

  wsdl_url (str): URL to the BRENDA WSDL file. Default: 'https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl'.

Returns:

  zeep.Client: Configured SOAP client.

Source code in wildkcat/api/brenda_api.py
def create_brenda_client(wsdl_url: str = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl") -> Client:
    """
    Creates and configures a persistent SOAP client for the BRENDA API.

    Parameters:
        wsdl_url (str): URL to the BRENDA WSDL file.

    Returns:
        zeep.Client: Configured SOAP client.
    """
    # Configure retry logic for network resilience
    session = Session()
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Set a custom User-Agent (some servers block default Python UA)
    session.headers.update({"User-Agent": "BRENDA-Client"})

    # Create zeep transport and settings
    transport = Transport(session=session, cache=InMemoryCache())
    settings = Settings(strict=False, xml_huge_tree=True) 

    return Client(wsdl_url, settings=settings, transport=transport)

get_brenda_credentials()

Retrieves and hashes BRENDA API credentials from environment variables.

Returns:

  tuple[str, str]: (email, hashed_password)

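For example (a sketch; the e-mail and password below are placeholders for a registered BRENDA account, and the EC number is illustrative):

import os
from wildkcat.api.brenda_api import get_brenda_credentials, get_turnover_number_brenda

os.environ["BRENDA_EMAIL"] = "you@example.org"    # your BRENDA account e-mail
os.environ["BRENDA_PASSWORD"] = "your-password"   # hashed with SHA-256 before being sent

email, hashed_password = get_brenda_credentials()

# Query all turnover-number entries for EC 1.1.1.1 (alcohol dehydrogenase)
kcat_entries = get_turnover_number_brenda("1.1.1.1")
print(kcat_entries[["Substrate", "Organism", "value", "Temperature", "pH", "EnzymeVariant"]].head())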

Source code in wildkcat/api/brenda_api.py
def get_brenda_credentials() -> tuple[str, str]:
    """
    Retrieves and hashes BRENDA API credentials from environment variables.

    Returns:
        tuple[str, str]: (email, hashed_password)
    """
    email = os.getenv("BRENDA_EMAIL")
    password = os.getenv("BRENDA_PASSWORD")

    if not email or not password:
        raise ValueError("BRENDA_EMAIL and BRENDA_PASSWORD environment variables must be set.")

    hashed_password = hashlib.sha256(password.encode("utf-8")).hexdigest()
    return email, hashed_password

get_cofactor(ec_number) cached

Queries the BRENDA SOAP API to retrieve cofactor information for a given Enzyme Commission (EC) number.

Parameters:

  ec_number (str): EC number (e.g., '1.1.1.1'). Required.

Returns:

  list: A list of cofactor names for the EC number (empty if none are found).

Source code in wildkcat/api/brenda_api.py
@lru_cache(maxsize=None)
def get_cofactor(ec_number) -> list:
    """
    Queries the BRENDA SOAP API to retrieve cofactor information for a given Enzyme Commission (EC) number.

    Parameters:
        ec_number (str): EC number (e.g., '1.1.1.1').

    Returns:
        list: A list of cofactor names for the EC number (empty if none are found).
    """
    # Call the SOAP API
    email, hashed_password = get_brenda_credentials()
    client = create_brenda_client()

    parameters_cofactor = [
        email,
        hashed_password,
        f'ecNumber*{ec_number}',
        "cofactor*", 
        "commentary*", 
        "organism*", 
        "ligandStructureId*", 
        "literature*"
    ]

    result_cofactor = client.service.getCofactor(*parameters_cofactor)
    data = serialize_object(result_cofactor)
    df = pd.DataFrame(data)
    if df.empty:
        return []
    cofactor = df['cofactor'].unique().tolist()
    return cofactor

get_turnover_number_brenda(ec_number) cached

Queries the BRENDA SOAP API to retrieve turnover number values for an Enzyme Commission (EC) number.

Parameters:

  ec_number (str): EC number (e.g., '1.1.1.1'). Required.

Returns:

  pd.DataFrame: A DataFrame containing turnover number entries.

Source code in wildkcat/api/brenda_api.py
@lru_cache(maxsize=None)
def get_turnover_number_brenda(ec_number) -> pd.DataFrame:
    """
    Queries the BRENDA SOAP API to retrieve turnover number values for an Enzyme Commission (EC) number.

    Parameters:
        ec_number (str): EC number (e.g., '1.1.1.1').

    Returns:
        pd.DataFrame: A DataFrame containing turnover number entries.
    """

    email, hashed_password = get_brenda_credentials()
    client = create_brenda_client()

    # Define the parameters for the SOAP request

    parameters_kcat = [
        email,
        hashed_password,
        f'ecNumber*{ec_number}',
        "turnoverNumber*", 
        "turnoverNumberMaximum*", 
        "substrate*", 
        "commentary*", 
        "organism*", 
        "ligandStructureId*", 
        "literature*"
    ]

    parameters_org = [
        email,
        hashed_password,
        f'ecNumber*{ec_number}',
        "organism*",
        "sequenceCode*", 
        "commentary*", 
        "literature*",
        "textmining*"
    ]

    # print(client.service.__getattr__('getTurnoverNumber').__doc__)
    # print(client.service.__getattr__('getOrganism').__doc__)

    result_kcat = client.service.getTurnoverNumber(*parameters_kcat)
    result_organism = client.service.getOrganism(*parameters_org)

    # Format the response into a DataFrame
    data = serialize_object(result_kcat)
    data_organism = serialize_object(result_organism)

    if not data:
        logging.warning('%s: No data found for the query in BRENDA.' % f"{ec_number}")
        return pd.DataFrame()

    # Remove None values (-999)
    data = [entry for entry in data if entry.get('turnoverNumber') is not None and entry.get('turnoverNumber') != '-999']
    if data == []:
        logging.warning('%s: No valid data found for the query in BRENDA.' % f"{ec_number}")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    df_org = pd.DataFrame(data_organism)

    # Format the organism response
    df_org.drop(columns=['commentary', 'textmining'], inplace=True, errors='ignore')

    # Merge on the literature column
    df_org['literature'] = df_org['literature'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
    df['literature'] = df['literature'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
    df = pd.merge(df, df_org, on=['literature', 'organism'], how='left')
    df.drop_duplicates(inplace=True)

    # Rename columns for consistency with other APIs
    df.rename(columns={
        'turnoverNumber': 'value',
        'sequenceCode' : 'UniProtKB_AC',
        'substrate': 'Substrate',
        'organism': 'Organism',
        'ecNumber': 'ECNumber'}, inplace=True) 

    # Extract pH from commentary
    df["pH"] = df["commentary"].str.extract(r"pH\s*([\d\.]+)")
    # Extract temperature from commentary
    df["Temperature"] = df["commentary"].str.extract(r"([\d\.]+)\?C")
    # Convert Temperature and pH to numeric, coercing errors to NaN
    df['Temperature'] = pd.to_numeric(df['Temperature'], errors='coerce')
    df['pH'] = pd.to_numeric(df['pH'], errors='coerce')
    # Extract enzyme variant from commentary
    df["EnzymeVariant"] = df["commentary"].apply(get_variant)
    # Drop unnecessary columns
    df.drop(columns=["literature", "turnoverNumberMaximum", "parameter.endValue", "commentary", "ligandStructureId"], inplace=True, errors='ignore')

    # Remove the cofactor from the output 
    cofactor = get_cofactor(ec_number)
    # Drop rows where the substrate is one of the cofactors
    df = df[~df['Substrate'].isin(cofactor)]   
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # Add a column for the db 
    df['db'] = 'brenda' 
    return df

get_variant(text)

Extracts the enzyme variant information from the commentary text.

Parameters:

  text (str): Commentary text from the BRENDA API response. Required.

Returns:

  str or None: The extracted enzyme variant information: 'wildtype', 'mutant', or None if not found.

Source code in wildkcat/api/brenda_api.py
def get_variant(text) -> str | None:
    """
    Extracts the enzyme variant information from the commentary text.

    Parameters:
        text (str): Commentary text from BRENDA API response.

    Returns:
        str: The extracted enzyme variant information: wildtype, mutant, or None if not found.
    """
    if text is None or pd.isna(text):
        return None
    text = text.lower()
    if "wild" in text:  # wild-type, wildtype or wild type
        return "wildtype"
    elif any(word in text for word in ["mutant", "mutated", "mutation"]):
        return "mutant"
    return None

wildkcat.api.sabio_rk_api

get_turnover_number_sabio(ec_number) cached

Retrieve turnover number (kcat) data from SABIO-RK for a given EC number.

Parameters:

  ec_number (str): Enzyme Commission number. Required.

Returns:

  pd.DataFrame: DataFrame containing SABIO-RK entries for kcat.

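A minimal usage sketch (no credentials are needed for SABIO-RK; the EC number is illustrative):

from wildkcat.api.sabio_rk_api import get_turnover_number_sabio

# Retrieve all kcat entries recorded for EC 2.7.1.1 (hexokinase)
df = get_turnover_number_sabio("2.7.1.1")
if df.empty:
    print("No kcat entries found in SABIO-RK for this EC number.")
else:
    print(df[["Substrate", "Organism", "value", "unit", "Temperature", "pH"]].head())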

Source code in wildkcat/api/sabio_rk_api.py
@lru_cache(maxsize=None)
def get_turnover_number_sabio(ec_number) -> pd.DataFrame:
    """
    Retrieve turnover number (kcat) data from SABIO-RK for a given EC number.

    Parameters:
        ec_number (str): Enzyme Commission number.

    Returns:
        pd.DataFrame: DataFrame containing SABIO-RK entries for kcat.
    """
    base_url = 'https://sabiork.h-its.org/sabioRestWebServices/searchKineticLaws/entryIDs'
    parameters = 'https://sabiork.h-its.org/entry/exportToExcelCustomizable'
    entryIDs = []

    # -- Retrieve entryIDs --
    query = {'format': 'txt', 'q': f'Parametertype:"kcat" AND ECNumber:"{ec_number}"'}

    # Make GET request
    request = requests.get(base_url, params=query)
    request.raise_for_status()
    if request.text == "no data found":
        logging.warning('%s: No data found for the query in SABIO-RK.' % f"{ec_number}")
        return pd.DataFrame()  # Return empty DataFrame if no data found

    entryIDs = [int(x) for x in request.text.strip().split('\n')]
    # Retrieve the information matching the entryIDs
    data_field = {'entryIDs[]': entryIDs}
    # Possible fields to retrieve:
    # EntryID, Reaction, Buffer, ECNumber, CellularLocation, UniProtKB_AC, Tissue, Enzyme Variant, Enzymename, Organism
    # Temperature, pH, Activator, Cofactor, Inhibitor, KeggReactionID, KineticMechanismType, Other Modifier, Parameter,
    # Pathway, Product, PubMedID, Publication, Rate Equation, SabioReactionID, Substrate
    query = {'format':'tsv', 'fields[]':['EntryID', 'ECNumber', 'KeggReactionID', 'Reaction', 'Substrate', 'Product', 
                                         'UniProtKB_AC', 'Organism', 'Enzyme Variant', 'Temperature', 'pH', 
                                         'Parameter']}

    # Make POST request
    request = requests.post(parameters, params=query, data=data_field)
    request.raise_for_status()

    # Format the response into a DataFrame
    df = pd.read_csv(StringIO(request.text), sep='\t')
    df = df[df['parameter.name'].str.lower() == 'kcat'].reset_index(drop=True) # Keep only kcat parameters
    # Convert Temperature and pH to numeric, coercing errors to NaN
    df['Temperature'] = pd.to_numeric(df['Temperature'], errors='coerce')
    df['pH'] = pd.to_numeric(df['pH'], errors='coerce')
    # Drop unnecessary columns
    df.drop(columns=['EntryID', 'parameter.name', 'parameter.type', 'parameter.associatedSpecies', 
                     'parameter.endValue', 'parameter.standardDeviation'], inplace=True, errors='ignore')
    # Drop duplicates based on normalized Substrate and Product sets
    df["Substrate_set"] = df["Substrate"].fillna("").str.split(";").apply(lambda x: tuple(sorted(s.strip() for s in x if s.strip())))
    df["Product_set"] = df["Product"].fillna("").str.split(";").apply(lambda x: tuple(sorted(s.strip() for s in x if s.strip())))
    dedup_cols = [col for col in df.columns if col not in ["Substrate", "Product"]]
    df = df.drop_duplicates(subset=dedup_cols + ["Substrate_set", "Product_set"], keep="first")
    df = df.drop(columns=["Substrate_set", "Product_set"])
    # Rename columns for consistency
    df.rename(columns={
        'ECNumber': 'ECNumber',
        'KeggReactionID': 'KeggReactionID',
        'Reaction': 'Reaction',
        'Substrate': 'Substrate',
        'Product': 'Product',
        'UniProtKB_AC': 'UniProtKB_AC',
        'Organism': 'Organism',
        'Enzyme Variant': 'EnzymeVariant',
        'Temperature': 'Temperature',
        'pH': 'pH',
        'parameter.startValue': 'value',
        'parameter.unit': 'unit'
    }, inplace=True)
    # Add a column for the db
    df['db'] = 'sabio_rk'
    return df

wildkcat.api.uniprot_api

catalytic_activity(uniprot_id) cached

Retrieves the EC (Enzyme Commission) numbers associated with the catalytic activity of a given UniProt ID.

Parameters:

  uniprot_id (str): The UniProt identifier for the protein of interest. Required.

Returns:

  list[str] or None: A list of EC numbers if found, otherwise None.

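For example (a sketch; "P12345" is an arbitrary accession used only for illustration):

from wildkcat.api.uniprot_api import catalytic_activity, convert_uniprot_to_sequence

accession = "P12345"  # placeholder: replace with a UniProt accession of interest

ec_numbers = catalytic_activity(accession)          # list of EC numbers, or None if none are annotated
sequence = convert_uniprot_to_sequence(accession)   # amino-acid sequence, or None if not found

print(ec_numbers)
print(sequence[:60] if sequence else "no sequence retrieved")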

Source code in wildkcat/api/uniprot_api.py
@lru_cache(maxsize=None)
def catalytic_activity(uniprot_id) -> list[str] | None:
    """
    Retrieves the EC (Enzyme Commission) numbers associated with the catalytic activity of a given UniProt ID.

    Parameters:
        uniprot_id (str): The UniProt identifier for the protein of interest.

    Returns:
        list[str] or None: A list of EC numbers if found, otherwise None.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}?fields=cc_catalytic_activity"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        ec_numbers = []
        for comment in data.get('comments', []):
            if comment.get('commentType') == 'CATALYTIC ACTIVITY':
                reaction = comment.get('reaction', {})
                ec_number = reaction.get('ecNumber')
                if ec_number:
                    ec_numbers.append(ec_number)
        if len(ec_numbers) != 0:
            return ec_numbers
    else:
        # logging.warning(f"No catalytic activity found for UniProt ID {uniprot_id}")
        return None

convert_uniprot_to_sequence(uniprot_id) cached

Convert a UniProt accession ID to its corresponding amino acid sequence.

Parameters:

  uniprot_id (str): The UniProt accession ID. Required.

Returns:

  str or None: The amino acid sequence, or None if not found.

Source code in wildkcat/api/uniprot_api.py
@lru_cache(maxsize=None)
def convert_uniprot_to_sequence(uniprot_id) -> str | None:
    """
    Convert a UniProt accession ID to its corresponding amino acid sequence.

    Parameters:
        uniprot_id (str): The UniProt accession ID.

    Returns:
        str: The amino acid sequence, or None if not found.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)

    if response.status_code == 200:
        fasta = response.text
        lines = fasta.splitlines()
        sequence = ''.join(lines[1:])  # Skip the header
        return sequence
    else:
        # logging.warning(f"Failed to retrieve sequence for UniProt ID {uniprot_id}")
        return None
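Usage sketch (placeholder accession; the call hits the UniProt REST API and the result is cached per ID):

from wildkcat.api.uniprot_api import convert_uniprot_to_sequence

sequence = convert_uniprot_to_sequence("P12345")  # placeholder UniProt accession
if sequence is not None:
    print(f"Retrieved a sequence of {len(sequence)} residues")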

identify_catalytic_enzyme(lst_uniprot_ids, ec)

Identifies the catalytic enzyme from a list of UniProt IDs for a given EC number.

Parameters:

Name Type Description Default
lst_uniprot_ids str

A semicolon-separated string of UniProt IDs representing enzyme candidates.

required
ec str

The Enzyme Commission (EC) number to match against the catalytic activity.

required

Returns:

Type Description
str | None

str or None: The UniProt ID of the catalytic enzyme if exactly one match is found; a semicolon-joined string of IDs if several candidates match; None if no match is found.

Source code in wildkcat/api/uniprot_api.py
def identify_catalytic_enzyme(lst_uniprot_ids, ec) -> str | None:
    """
    Identifies the catalytic enzyme from a list of UniProt IDs for a given EC number.

    Parameters:
        lst_uniprot_ids (str): A semicolon-separated string of UniProt IDs representing enzyme candidates.
        ec (str): The Enzyme Commission (EC) number to match against the catalytic activity.

    Returns:
        str or None: The UniProt ID of the catalytic enzyme if exactly one match is found;
                     a semicolon-joined string of IDs if several candidates match; None if no match is found.
    """ 
    enzymes_model = lst_uniprot_ids.split(';')
    catalytic_enzyme = []
    for enzyme in enzymes_model:
        ec_numbers = catalytic_activity(enzyme)
        if ec_numbers and ec in ec_numbers:
            catalytic_enzyme.append(enzyme)
    if catalytic_enzyme == []:
        logging.warning(f"{ec}: No catalytic enzyme found for the complex {lst_uniprot_ids}.")
        catalytic_enzyme = None 
    elif len(catalytic_enzyme) > 1:
        logging.warning(f"{ec}: Multiple catalytic enzymes found for the complex {lst_uniprot_ids}.")
        catalytic_enzyme = ';'.join(catalytic_enzyme)
    else:
        catalytic_enzyme = catalytic_enzyme[0]
    return catalytic_enzyme
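Usage sketch for a hypothetical two-subunit complex (placeholder UniProt IDs and EC number):

from wildkcat.api.uniprot_api import identify_catalytic_enzyme

hit = identify_catalytic_enzyme("P11111;P22222", "2.7.1.1")
# hit is a single UniProt ID, a ';'-joined string if several subunits match the EC number,
# or None if no subunit carries the matching catalytic activity.
print(hit)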

Machine Learning preprocessing

wildkcat.machine_learning.catapro

convert_cid_to_smiles(cid)

Converts a PubChem Compound ID (CID) to its corresponding SMILES representation.

Parameters:

Name Type Description Default
cid str

PubChem Compound ID.

required

Returns:

Type Description
list | None

list or None: A list of SMILES strings if found, otherwise None.

Source code in wildkcat/machine_learning/catapro.py
def convert_cid_to_smiles(cid) -> list | None:    
    """
    Converts a PubChem Compound ID (CID) to its corresponding SMILES representation.

    Parameters:
        cid (str): PubChem Compound ID.

    Returns:
       list or None: A list of SMILES strings if found, otherwise None.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/smiles/txt"
    try:
        safe_get_with_retry = retry_api(max_retries=1, backoff_factor=2)(safe_requests_get)
        response = safe_get_with_retry(url)

        if response is None:
            return None

        response.raise_for_status()
        smiles = response.text.strip().split('\n')
        return smiles
    except Exception:
        # Any request or parsing error means no SMILES could be retrieved
        return None

convert_kegg_compound_to_sid(kegg_compound_id)

Convert the KEGG compound ID to the PubChem Substance ID (SID).

Parameters:

Name Type Description Default
kegg_compound_id str

KEGG compound ID.

required

Returns:

Name Type Description
str str | None

The PubChem SID if found, otherwise None.

Source code in wildkcat/machine_learning/catapro.py
def convert_kegg_compound_to_sid(kegg_compound_id) -> str | None:
    """
    Convert the KEGG compound ID to the PubChem Substance ID (SID).

    Parameters:
        kegg_compound_id (str): KEGG compound ID.

    Returns:
        str: The PubChem SID if found, otherwise None.
    """
    url = f"https://rest.kegg.jp/conv/pubchem/compound:{kegg_compound_id}"
    safe_get_with_retry = retry_api(max_retries=2, backoff_factor=2)(safe_requests_get)
    response = safe_get_with_retry(url)

    if response is None:
        return None

    if response.status_code != 200:
        return None

    match = re.search(r'pubchem:\s*(\d+)', response.text)
    sid = match.group(1) if match else None
    return sid

convert_kegg_to_smiles(kegg_compound_id) cached

Convert a KEGG compound ID to its SMILES representation (via the PubChem SID and CID).

Parameters:

Name Type Description Default
kegg_compound_id str

KEGG compound ID.

required

Returns:

Type Description
list | None

list or None: A list of SMILES strings if found, otherwise None.

Source code in wildkcat/machine_learning/catapro.py
@lru_cache(maxsize=None)
def convert_kegg_to_smiles(kegg_compound_id) -> list | None:
    """
    Convert a KEGG compound ID to its SMILES representation (via the PubChem SID and CID).

    Parameters:
        kegg_compound_id (str): KEGG compound ID.

    Returns:
        list or None: A list of SMILES strings if found, otherwise None.
    """
    sid = convert_kegg_compound_to_sid(kegg_compound_id)
    if sid is None:
        logging.warning('%s: Failed to retrieve SID for KEGG compound ID' % (kegg_compound_id))
        return None
    cid = convert_sid_to_cid(sid)
    if cid is None:
        logging.warning('%s: Failed to retrieve CID for KEGG compound ID' % (kegg_compound_id))
        return None
    smiles = convert_cid_to_smiles(cid)
    if smiles is None:
        logging.warning('%s: Failed to retrieve SMILES for KEGG compound ID' % (kegg_compound_id))
        return None
    return smiles
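This function chains the three converters documented in this module (KEGG compound -> PubChem SID -> CID -> SMILES). A hedged usage sketch; C00031 is the KEGG ID for D-glucose, and each step performs a live API call:

from wildkcat.machine_learning.catapro import (
    convert_kegg_compound_to_sid,
    convert_sid_to_cid,
    convert_cid_to_smiles,
    convert_kegg_to_smiles,
)

# One call that runs the whole chain (cached per KEGG compound ID).
print(convert_kegg_to_smiles("C00031"))

# Or step by step, stopping as soon as a hop fails.
sid = convert_kegg_compound_to_sid("C00031")
cid = convert_sid_to_cid(sid) if sid else None
print(convert_cid_to_smiles(cid) if cid else None)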

convert_sid_to_cid(sid)

Converts a PubChem Substance ID (SID) to the corresponding Compound ID (CID).

Parameters:

Name Type Description Default
sid str

PubChem Substance ID.

required

Returns:

Type Description
int | None

int or None: The corresponding PubChem Compound ID (CID), or None if not found.

Source code in wildkcat/machine_learning/catapro.py
def convert_sid_to_cid(sid) -> int | None:
    """
    Converts a PubChem Substance ID (SID) to the corresponding Compound ID (CID).

    Parameters:
        sid (str): PubChem Substance ID.

    Returns:
        int or None: The corresponding PubChem Compound ID (CID), or None if not found.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/{sid}/cids/JSON"
    safe_get_with_retry = retry_api(max_retries=1, backoff_factor=2)(safe_requests_get)
    response = safe_get_with_retry(url)

    if response is None:
        return None

    cid = None
    if response.status_code == 200:
        try:
            cid = response.json()['InformationList']['Information'][0]['CID'][0]
        except (KeyError, IndexError):
            cid = None
    return cid

create_catapro_input_file(kcat_df)

Generate CataPro input file and a mapping of substrate KEGG IDs to SMILES.

Parameters:

Name Type Description Default
kcat_df DataFrame

Input DataFrame containing kcat information.

required

Returns:

Name Type Description
catapro_input_df DataFrame

DataFrame for CataPro input.

substrates_to_smiles_df DataFrame

Two-column DataFrame mapping KEGG compound IDs to SMILES.

report_statistics dict

Counters summarizing reaction coverage and skipped entries.

Source code in wildkcat/machine_learning/catapro.py
def create_catapro_input_file(kcat_df):
    """
    Generate CataPro input file and a mapping of substrate KEGG IDs to SMILES.

    Parameters: 
        kcat_df (pd.DataFrame): Input DataFrame containing kcat information.

    Returns:
        catapro_input_df (pd.DataFrame): DataFrame for CataPro input.
        substrates_to_smiles_df (pd.DataFrame): Two-column DataFrame mapping KEGG compound IDs to SMILES.
        report_statistics (dict): Counters summarizing reaction coverage and skipped entries.
    """
    catapro_input = []
    substrates_to_smiles = {}

    counter_no_catalytic, counter_kegg_no_matching, counter_rxn_covered, counter_cofactor = 0, 0, 0, 0
    for _, row in tqdm(kcat_df.iterrows(), total=len(kcat_df), desc="Generating CataPro input"):
        uniprot = row['uniprot']
        ec_code = row['ec_code']

        if len(uniprot.split(';')) > 1:       
            catalytic_enzyme = identify_catalytic_enzyme(uniprot, ec_code)
            if catalytic_enzyme is None or (";" in str(catalytic_enzyme)):
                counter_no_catalytic += 1
                continue
            else: 
                uniprot = catalytic_enzyme

        # If the number of KEGG Compound IDs is not matching the number of names, continue 
        if len([s for s in row['substrates_kegg'].split(';') if s]) != len(row['substrates_name'].split(';')):
            logging.warning(f"Number of KEGG compounds IDs does not match number of names for {ec_code}: {uniprot}.")
            counter_kegg_no_matching += 1
            continue

        sequence = convert_uniprot_to_sequence(uniprot) 
        if sequence is None:
            continue

        smiles_list = []
        names = row['substrates_name'].split(';')
        kegg_ids = row['substrates_kegg'].split(';')

        # Get the cofactor for the EC code
        cofactor = get_cofactor(ec_code) 

        for name, kegg_compound_id in zip(names, kegg_ids):
            if name.lower() in [c.lower() for c in cofactor]:  # TODO: Should we add a warning if no cofactor is found for a reaction? 
                counter_cofactor += 1
                continue
            smiles = convert_kegg_to_smiles(kegg_compound_id)
            if smiles is not None:
                smiles_str = smiles[0]  # TODO: If multiple SMILES, take the first one ? 
                smiles_list.append(smiles_str)
                substrates_to_smiles[kegg_compound_id] = smiles_str

        if len(smiles_list) > 0:
            for smiles in smiles_list:
                catapro_input.append({
                    "Enzyme_id": uniprot,
                    "type": "wild",
                    "sequence": sequence,
                    "smiles": smiles
                })

        counter_rxn_covered += 1

    # Generate CataPro input file
    catapro_input_df = pd.DataFrame(catapro_input)
    # Remove duplicates
    before_duplicates_filter = len(catapro_input_df)
    catapro_input_df = catapro_input_df.drop_duplicates().reset_index(drop=True)
    nb_lines_dropped = before_duplicates_filter - len(catapro_input_df)
    # Convert the KEGG ID -> SMILES mapping into a two-column DataFrame
    substrates_to_smiles_df = pd.DataFrame(list(substrates_to_smiles.items()), columns=['kegg_id', 'smiles'])

    report_statistics = {
        "rxn_covered": counter_rxn_covered,
        "cofactor_identified": counter_cofactor,
        "no_catalytic": counter_no_catalytic,
        "kegg_no_matching": counter_kegg_no_matching,
        "duplicates_enzyme_substrates": nb_lines_dropped,
    }

    return catapro_input_df, substrates_to_smiles_df, report_statistics
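A minimal sketch of building the CataPro input from a one-row toy table; the column names (uniprot, ec_code, substrates_kegg, substrates_name) follow the code above, and the identifiers are placeholders:

import pandas as pd
from wildkcat.machine_learning.catapro import create_catapro_input_file

kcat_df = pd.DataFrame([{
    "uniprot": "P12345",          # placeholder enzyme accession
    "ec_code": "2.7.1.1",
    "substrates_kegg": "C00031",
    "substrates_name": "D-Glucose",
}])

catapro_input_df, substrates_to_smiles_df, stats = create_catapro_input_file(kcat_df)
print(stats)
# Save catapro_input_df wherever your CataPro setup expects its input file.
catapro_input_df.to_csv("catapro_input.csv", index=False)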

integrate_catapro_predictions(kcat_df, substrates_to_smiles, catapro_predictions_df)

Integrates Catapro predictions into a kcat file. If multiple values are provided for a single combination of EC, Enzyme, and Substrate, the minimum value is taken.

Parameters:

Name Type Description Default
kcat_df DataFrame

Input DataFrame containing kcat information.

required
substrates_to_smiles DataFrame

DataFrame mapping KEGG ID <-> SMILES.

required
catapro_predictions_df DataFrame

DataFrame containing Catapro model predictions

required

Returns:

Type Description
DataFrame

pd.DataFrame: The input kcat_df with an additional column 'catapro_predicted_kcat_s' containing the integrated Catapro predicted kcat(s^-1) values.

Source code in wildkcat/machine_learning/catapro.py
def integrate_catapro_predictions(kcat_df, substrates_to_smiles, catapro_predictions_df) -> pd.DataFrame:
    """
    Integrates Catapro predictions into a kcat file.
    If multiple values are provided for a single combination of EC, Enzyme, and Substrate, the minimum value is taken.

    Parameters:
        kcat_df (pd.DataFrame): Input DataFrame containing kcat information.
        substrates_to_smiles (pd.DataFrame): DataFrame mapping KEGG ID <-> SMILES.
        catapro_predictions_df (pd.DataFrame): DataFrame containing Catapro model predictions

    Returns:
        pd.DataFrame: The input kcat_df with an additional column 'catapro_predicted_kcat_s' containing
            the integrated Catapro predicted kcat(s^-1) values.
    """
    # Convert pred_log10[kcat(s^-1)] to kcat(s^-1)
    catapro_predictions_df['kcat_s'] = 10 ** catapro_predictions_df['pred_log10[kcat(s^-1)]']
    catapro_predictions_df['uniprot'] = catapro_predictions_df['fasta_id'].str.replace('_wild', '', regex=False) # Extract UniProt ID

    # Match the SMILES to KEGG IDs using substrates_to_smiles
    # If multiple KEGG IDs are found for a single SMILES, they are concatenated
    smiles_to_kegg = (
        substrates_to_smiles.groupby('smiles')['kegg_id']
        .apply(lambda x: ';'.join(sorted(set(x))))
    )
    catapro_predictions_df['substrates_kegg'] = catapro_predictions_df['smiles'].map(smiles_to_kegg)

    catapro_map = catapro_predictions_df.set_index(['uniprot', 'substrates_kegg'])['kcat_s'].to_dict()

    def get_min_pred_kcat(row):
        uniprot = row['uniprot']
        kegg_ids = str(row['substrates_kegg']).split(';')
        kcat_values = [
            catapro_map.get((uniprot, kegg_id))
            for kegg_id in kegg_ids
            if (uniprot, kegg_id) in catapro_map
        ]
        return min(kcat_values) if kcat_values else None  # If multiple substrates, take the minimum kcat value

    kcat_df['catapro_predicted_kcat_s'] = kcat_df.apply(get_min_pred_kcat, axis=1)
    return kcat_df
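A hedged sketch of merging predictions back into a kcat table; the prediction columns (fasta_id, smiles, pred_log10[kcat(s^-1)]) mirror what the code above reads, and every value below is fabricated purely for illustration:

import pandas as pd
from wildkcat.machine_learning.catapro import integrate_catapro_predictions

kcat_df = pd.DataFrame([{"uniprot": "P12345", "substrates_kegg": "C00031"}])
substrates_to_smiles = pd.DataFrame(
    [{"kegg_id": "C00031", "smiles": "OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O"}]
)
catapro_predictions_df = pd.DataFrame([{
    "fasta_id": "P12345_wild",
    "smiles": "OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O",
    "pred_log10[kcat(s^-1)]": 1.2,   # fabricated prediction
}])

out = integrate_catapro_predictions(kcat_df, substrates_to_smiles, catapro_predictions_df)
print(out["catapro_predicted_kcat_s"].iloc[0])  # 10 ** 1.2, about 15.85 s^-1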

Generate reports

wildkcat.utils.generate_reports

report_extraction(model, df, report_statistics, output_folder, shader=False)

Generates a detailed HTML report summarizing kcat extraction results from a metabolic model.

Parameters:

Name Type Description Default
model Model

The metabolic model object containing reactions, metabolites, and genes.

required
df DataFrame

DataFrame containing data from the run_extraction function.

required
report_statistics dict

Dictionary with statistics about EC code assignment and extraction issues.

required
output_folder str

Path to the output folder where the report will be saved.

required
shader bool

If True, includes a shader canvas background in the report. Default is False.

False

Returns:

Name Type Description
None None

The function saves the generated HTML report to 'reports/extract_report.html'.

Source code in wildkcat/utils/generate_reports.py
def report_extraction(model, df, report_statistics, output_folder, shader=False) -> None:
    """
    Generates a detailed HTML report summarizing kcat extraction results from a metabolic model.

    Parameters: 
        model (cobra.Model): The metabolic model object containing reactions, metabolites, and genes.
        df (pandas.DataFrame): DataFrame containing data from the run_extraction function.
        report_statistics (dict): Dictionary with statistics about EC code assignment and extraction issues.
        output_folder (str): Path to the output folder where the report will be saved.
        shader (bool, optional): If True, includes a shader canvas background in the report. Default is False.

    Returns: 
        None: The function saves the generated HTML report to 'reports/extract_report.html'.
    """
    # Model statistics
    nb_model_reactions = len(model.reactions)
    nb_model_metabolites = len(model.metabolites)
    nb_model_genes = len(model.genes)
    unique_ec_codes = []
    for rxn in model.reactions:
        ec_code = rxn.annotation.get('ec-code')
        if ec_code:
            if isinstance(ec_code, str):
                ec_code = [ec_code.strip()]
            elif isinstance(ec_code, list):
                ec_code = [x.strip() for x in ec_code if x.strip()]
            else:
                ec_code = []
            unique_ec_codes.extend(ec_code)
    nb_model_ec_codes = len(set(unique_ec_codes))

    # Kcat statistics
    nb_reactions = df['rxn'].nunique()
    nb_ec_codes = df['ec_code'].nunique()

    nb_ec_codes_transferred = report_statistics.get('transferred_ec_codes', 0)
    nb_ec_codes_incomplete = report_statistics.get('incomplete_ec_codes', 0)
    nb_reactions_dropped = report_statistics.get('nb_of_reactions_due_to_unconsistent_ec', 0)
    nb_lines_dropped = report_statistics.get('nb_of_lines_dropped_due_to_unconsistent_ec', 0)

    rxn_coverage = 100.0 * nb_reactions / nb_model_reactions if nb_model_reactions else 0

    percent_ec_retrieved = 100.0 * nb_ec_codes / nb_model_ec_codes if nb_model_ec_codes else 0

    # Pie Chart
    pie_data = {
        "Retrieved": nb_ec_codes,
        "Transferred": nb_ec_codes_transferred,
        "Incomplete": nb_ec_codes_incomplete,
    }

    pie_data = {k: v for k, v in pie_data.items() if v > 0}

    fig = px.pie(
        names=list(pie_data.keys()),
        values=list(pie_data.values()),
        color_discrete_sequence=["#55bb55", "#ee9944", "#cc4455"]
    )
    fig.update_traces(textinfo="percent+label", textfont_size=16)
    fig.update_layout(
        title="",
        title_font=dict(size=30, color="black"),
        showlegend=True
    )

    pie_chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")

    # Time
    generated_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Html report
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Extract kcat Report</title>
        {report_style()}
    </head>
    <body>
        <header>
            <canvas id="shader-canvas"></canvas>
            <div class="overlay">
                <h1>Extract k<sub>cat</sub> Report</h1>
                <p>Generated on {generated_time}</p>
            </div>
        </header>

        <div class="container">
            <!-- Model Overview -->
            <div class="card">
                <h2>Model Overview</h2>
                <div class="stats-grid">
                    <div class="stat-box">
                        <h3>{model.id}</h3>
                        <p>Model ID</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_reactions}</h3>
                        <p>Reactions</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_metabolites}</h3>
                        <p>Metabolites</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_genes}</h3>
                        <p>Genes</p>
                    </div>
                </div>
            </div>

            <!-- kcat Extraction Table -->
            <div class="card">
                <h2>k<sub>cat</sub> Extraction Statistics</h2>
                <table>
                    <tr>
                        <th>Metric</th>
                        <th>Value</th>
                        <th>Visualization</th>
                    </tr>
                    <tr>
                        <td>Reactions with EC info</td>
                        <td>{nb_reactions} ({rxn_coverage:.1f}%)</td>
                        <td>
                            <div class="progress">
                                <div class="progress-bar-table" style="width:{rxn_coverage}%;"></div>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td>EC codes retrieved in KEGG</td>
                        <td>{nb_ec_codes} ({percent_ec_retrieved:.1f}%)</td>
                        <td>
                            <div class="progress">
                                <div class="progress-bar-table" style="width:{percent_ec_retrieved}%;"></div>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td>Total rows in output (Rxn - EC - Enzyme - Substrate)</td>
                        <td>{len(df) - 1}</td>
                        <td>-</td>
                    </tr>
                </table>
            </div>

            <!-- EC Issues Table -->
            <div class="card">
                <h2>Issues in EC Assignment</h2>
                <table>
                    <tr>
                        <th>Cases</th>
                        <th>Count</th>
                    </tr>
                    <tr>
                        <td>Transferred EC codes</td>
                        <td>{nb_ec_codes_transferred}</td>
                    </tr>
                    <tr>
                        <td>Incomplete EC codes</td>
                        <td>{nb_ec_codes_incomplete}</td>
                    </tr>
                    <tr>
                        <td>Number of reactions dropped due to inconsistent EC codes</td>
                        <td>{nb_reactions_dropped}</td>
                    </tr>
                    <tr>
                        <td>Number of k<sub>cat</sub> values dropped due to inconsistent EC codes</td>
                        <td>{nb_lines_dropped}</td>
                    </tr>
                </table>
            </div>

            <!-- Pie Chart Section -->
            <div class="card">
                <h2>EC Distribution</h2>
                {pie_chart_html}
            </div>
        </div>

        <footer>WILDkCAT</footer>
    """
    if shader:
        html += report_shader()
    else: 
        html += report_simple()
    html += """
    </body>
    </html>
    """

    # Save report
    os.makedirs(os.path.join(output_folder, "reports"), exist_ok=True)
    report_path = os.path.join(output_folder, "reports/extract_report.html")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(html)
    logging.info(f"HTML report saved to '{report_path}'")

report_final(model, final_df, output_folder, shader=False)

Generate a full HTML report summarizing retrieval results, including kcat distributions and coverage.

Parameters:

Name Type Description Default
model Model

The metabolic model object containing reactions, metabolites, and genes.

required
final_df DataFrame

DataFrame containing the final kcat assignments from the run_prediction_part2 function.

required
output_folder str

Path to the output folder where the report will be saved.

required
shader bool

If True, includes a shader canvas background in the report. Default is False.

False

Returns:

Name Type Description
report_path str

Path to the generated HTML report ('reports/general_report.html').

Source code in wildkcat/utils/generate_reports.py
def report_final(model, final_df, output_folder, shader=False) -> str:
    """
    Generate a full HTML report summarizing retrieval results, including kcat distributions and coverage.

    Parameters:
        model (cobra.Model): The metabolic model object containing reactions, metabolites, and genes.
        final_df (pd.DataFrame): DataFrame containing the final kcat assignments from the run_prediction_part2 function.
        output_folder (str): Path to the output folder where the report will be saved.
        shader (bool, optional): If True, includes a shader canvas background in the report. Default is False.

    Returns:
        str: Path to the generated HTML report ('reports/general_report.html').
    """
    # Model information 
    nb_model_reactions = len(model.reactions)
    nb_model_metabolites = len(model.metabolites)
    nb_model_genes = len(model.genes)


    df = final_df.copy()
    df["db"] = df["db"].fillna("Unknown")
    generated_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Utility to convert matplotlib figures to base64 <img>
    def fig_to_base64(fig):
        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
        buf.seek(0)
        encoded = base64.b64encode(buf.read()).decode("utf-8")
        plt.close(fig)
        return f'<div class="plot-container"><img src="data:image/png;base64,{encoded}"></div>'

    # Distribution plots
    def plot_kcat_distribution_stacked(column_name, title, source):
        # Ensure numeric kcat
        df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

        # Drop NaNs for both columns
        valid_df = df.dropna(subset=[column_name, source])
        kcat_values = valid_df[column_name]

        total = len(df)
        matched = len(kcat_values)
        match_percent = matched / total * 100 if total else 0

        if not kcat_values.empty:
            # Define log bins
            min_exp = int(np.floor(np.log10(max(1e-6, kcat_values.min()))))
            max_exp = int(np.ceil(np.log10(kcat_values.max())))
            bins = np.logspace(min_exp, max_exp, num=40)

            # Prepare data for stacked histogram
            sources = valid_df[source].unique()
            grouped_values = [valid_df.loc[valid_df[source] == src, column_name] for src in sources]

            # Colors from seaborn palette
            # Fixed color mapping
            color_map = {
                "brenda": "#55bb55",   # green
                "sabio_rk": "#2277cc", # blue
                "catapro": "#eedd00",  # yellow
                "Unknown": "#dddddd"   # gray
            }
            colors = [color_map.get(src, "#999999") for src in sources]  # fallback gray

            # Plot
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.hist(grouped_values, bins=bins, stacked=True,
                    color=colors, label=sources, edgecolor="white", linewidth=0.7)

            ax.set_xscale("log")
            ax.set_xlim([10**min_exp / 1.5, 10**max_exp * 1.5])
            ax.xaxis.set_major_formatter(LogFormatter(10))

            ax.set_xlabel("kcat (s⁻¹)", fontsize=12)
            ax.set_ylabel("Count", fontsize=12)
            ax.set_title(f"{title} (n={matched}, {match_percent:.1f}%)", fontsize=13)
            ax.legend(title="Source")

            return fig_to_base64(fig)

        return "<p>No valid values available for plotting.</p>"

    img_final = plot_kcat_distribution_stacked(
        'kcat', "kcat distribution", "db"
    )

    # Coverage
    db_counts = df["db"].fillna("Unknown").value_counts()
    total_db = db_counts.sum()

    colors = {
        "brenda": "#55bb55",      # green
        "sabio_rk": "#2277cc",    # blue
        "catapro": "#eedd00",     # yellow
        "Unknown": "#ddd"         # gray
    }

    db_colors = {db: colors.get(db, "#ddd") for db in db_counts.index}

    progress_segments = ""
    legend_items = ""
    for db, count in db_counts.items():
        percent = count / total_db * 100
        progress_segments += f"""
            <div class="progress-segment" style="width:{percent:.1f}%; background-color:{db_colors[db]};"
                title="{db.capitalize()}: {percent:.1f}%"></div>
        """
        legend_items += f"""
            <span style="display:flex; align-items:center; margin-right:15px; margin-bottom:5px;">
                <span style="display:flex; align-items:center; width:16px; height:16px; 
                            background:{db_colors[db]}; border:1px solid #000; margin-right:5px;"></span>
                {db.capitalize()} ({percent:.1f}%)
            </span>
        """

    progress_bar = f"""
        <div class="progress-multi" style="height: 18px; margin-bottom:18px; display:flex;">
            {progress_segments}
        </div>
        <div style="margin-top:10px; display:flex; justify-content:center; flex-wrap: wrap;">{legend_items}</div>
    """

    # Statistics 
    grouped = df.groupby("rxn")
    rxns_with_kcat = grouped["kcat"].apply(lambda x: x.notna().any())
    nb_rxn = grouped.ngroups
    nb_rxn_with_kcat = rxns_with_kcat.sum()
    coverage = nb_rxn_with_kcat / nb_rxn
    coverage_total = nb_rxn_with_kcat / nb_model_reactions

    kcat_values = df["kcat"].dropna()
    total = len(df)
    matched = len(kcat_values)
    match_percent = matched / total

    # HTML
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>WILDkCAT Report</title>
        {report_style()}
    </head>
    <body>
        <header>
            <canvas id="shader-canvas"></canvas>
            <div class="overlay">
                <h1>WILDkCAT Report</h1>
                <p>Generated on {generated_time}</p>
            </div>
        </header>

        <div class="container">
            <div class="card">
                <h2>Introduction</h2>
                <p style="margin-bottom:20px; font-size:14px; color:#555; text-align: justify;">
                    This report provides a summary of the performance of k<sub>cat</sub> value extraction, retrieval, and prediction for the specified metabolic model. 
                    It presents statistics on k<sub>cat</sub> values successfully retrieved, whether experimental or predicted.
                </p>
                <p style="margin-bottom:20px; font-size:14px; color:#555; text-align: justify;">
                    The output file, containing the full list of k<sub>cat</sub> values associated with each reaction, is available as a tab-separated file (TSV) at the default output path: <code>output/model_name_kcat_full</code>.
                </p>
            </div>

            <div class="card">
                <h2>Model Overview</h2>
                <div class="stats-grid">
                    <div class="stat-box">
                        <h3>{model.id}</h3>
                        <p>Model ID</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_reactions}</h3>
                        <p>Reactions</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_metabolites}</h3>
                        <p>Metabolites</p>
                    </div>
                    <div class="stat-box">
                        <h3>{nb_model_genes}</h3>
                        <p>Genes</p>
                    </div>
                </div>
            </div>

            <div class="card" style="padding:20px; margin-bottom:20px;">
                <h2 style="margin-bottom:10px;">Coverage</h2>

                <!-- Explanation -->
                <p style="margin-bottom:20px; font-size:14px; color:#555; text-align: justify;">
                    The coverage section reports the number of k<sub>cat</sub> values retrieved for the model and the number of reactions that have at least one 
                    associated k<sub>cat</sub> value, whether experimental or predicted. This provides a measure of how extensively the model’s reactions are 
                    annotated with kinetic data.
                </p>
                <p style="margin-bottom:20px; font-size:14px; color:#555; text-align: justify;">
                    Higher coverage indicates that a larger fraction of reactions are constrained by k<sub>cat</sub> values, 
                    improving the accuracy and reliability of enzyme-constrained simulations.
                </p>

                <!-- Global coverage progress bar -->
                {progress_bar}        

                <!-- Detailed stats -->
                <table class="table" style="width:100%; border-spacing:0; border-collapse: collapse;">
                    <tbody>
                        <tr>
                            <td style="padding:8px 12px;">Reactions with EC information with at least one kcat values</td>
                            <td style="padding:8px 12px;">{nb_rxn_with_kcat} ({coverage:.1%})</td>
                            <td style="width:40%;">
                                <div class="progress" style="height:18px;">
                                    <div class="progress-bar-table" 
                                        style="width:{coverage:.1%}; background-color:#4caf50;">
                                    </div>
                                </div>
                            </td>
                        </tr>
                        <tr>
                            <td style="padding:8px 12px;">Reactions in the model with at least one kcat values</td>
                            <td style="padding:8px 12px;">{nb_rxn_with_kcat} ({coverage_total:.1%})</td>
                            <td style="width:40%;">
                                <div class="progress" style="height:18px;">
                                    <div class="progress-bar-table" 
                                        style="width:{coverage_total:.1%}; background-color:#4caf50;">
                                    </div>
                                </div>
                            </td>
                        </tr>
                        <tr>
                            <td style="padding:8px 12px;">k<sub>cat</sub> values retrieved </td>
                            <td style="padding:8px 12px;">{matched} ({match_percent:.1%})</td>
                            <td style="width:40%;">
                                <div class="progress" style="height:18px;">
                                    <div class="progress-bar-table" 
                                        style="width:{match_percent:.1%}; background-color:#4caf50;">
                                    </div>
                                </div>
                            </td>
                        </tr>
                    </tbody>
                </table>
            </div>

            <div class="card">
                <h2>k<sub>cat</sub> Distribution</h2>
                <div class="img-section">
                    {img_final}
                </div>
            </div>
        </div>

        <footer>WILDkCAT</footer>
    """
    if shader:
        html += report_shader()
    else: 
        html += report_simple()
    html += """
    </body>
    </html>
    """

    os.makedirs(os.path.join(output_folder, "reports"), exist_ok=True)
    report_path = os.path.join(output_folder, "reports/general_report.html")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(html)

    logging.info(f"HTML report saved to '{report_path}'")
    return report_path
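A hedged sketch of producing the final report directly; the paths are placeholders, and final_df must carry at least the rxn, kcat, and db columns used above:

import cobra
import pandas as pd
from wildkcat.utils.generate_reports import report_final

model = cobra.io.read_sbml_model("model.xml")               # placeholder model path
final_df = pd.read_csv("output/kcat_full.tsv", sep="\t")    # placeholder results path
report_path = report_final(model, final_df, "output")
print(report_path)  # output/reports/general_report.html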

report_prediction_input(catapro_df, report_statistics, output_folder, shader=False)

Generate a detailed HTML report summarizing the kcat prediction input statistics.

Parameters:

Name Type Description Default
catapro_df DataFrame

DataFrame containing the CataPro input data.

required
report_statistics dict

Dictionary with statistics about the prediction input.

required
output_folder str

Path to the output folder where the report will be saved.

required
shader bool

If True, includes a shader canvas background in the report. Default is False.

False

Returns:

Name Type Description
None None

The function saves the generated HTML report to 'reports/predict_report.html'.

Source code in wildkcat/utils/generate_reports.py
def report_prediction_input(catapro_df, report_statistics, output_folder, shader=False) -> None: 
    """
    Generate a detailed HTML report summarizing the kcat prediction input statistics.

    Parameters:
        catapro_df (pd.DataFrame): DataFrame containing the CataPro input data.
        report_statistics (dict): Dictionary with statistics about the prediction input.
        output_folder (str): Path to the output folder where the report will be saved.
        shader (bool, optional): If True, includes a shader canvas background in the report. Default is False.

    Returns:
        None: The function saves the generated HTML report to 'reports/predict_report.html'.
    """

    # TODO: Show the number of rows without any enzymes

    # CataPro Statistics 
    total_catapro_entries = len(catapro_df) - 1

    # Report Statistics
    rxn_covered = report_statistics['rxn_covered']
    cofactors_covered = report_statistics['cofactor_identified']
    no_catalytic = report_statistics['no_catalytic']
    kegg_missing = report_statistics['kegg_no_matching']
    duplicates = report_statistics['duplicates_enzyme_substrates']
    missing_enzyme = report_statistics['missing_enzymes']

    total_rxn = rxn_covered + no_catalytic + kegg_missing + missing_enzyme
    rxn_coverage = (rxn_covered / total_rxn * 100) if total_rxn > 0 else 0

    # Time
    generated_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Html report
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Predict kcat Report</title>
        {report_style()}
    </head>
    <body>
        <header>
            <canvas id="shader-canvas"></canvas>
            <div class="overlay">
                <h1>Predict k<sub>cat</sub> Report</h1>
                <p>Generated on {generated_time}</p>
            </div>
        </header>

        <div class="container">
            <!-- CataPro Overview -->
            <div class="card">
                <h2>Overview</h2>
                <div class="stats-grid">
                    <div class="stat-box">
                        <h3>{total_rxn}</h3>
                        <p>Total k<sub>cat</sub> values</p>
                    </div>
                    <div class="stat-box">
                        <h3>{rxn_covered}</h3>
                        <p>k<sub>cat</sub> to be predicted ({rxn_coverage:.2f}%)</p>
                    </div>
                </div>
            </div>

            <!-- Prediction kcat Table -->
            <div class="card">
                <h2>k<sub>cat</sub> Prediction Statistics</h2>
                <table>
                    <tr>
                        <th>Metric</th>
                        <th>Value</th>
                    </tr>
                    <tr>
                        <td>Total entries in the CataPro input file</td>
                        <td>{total_catapro_entries}</td>
                    </tr>
                    <tr>
                        <td>Number of cofactors identified</td>
                        <td>{cofactors_covered}</td>
                    </tr>
                </table>
            </div>

            <div class="card">
                <h2>Issues in k<sub>cat</sub> Predictions</h2>
                <table>
                    <tr>
                        <th>Metric</th>
                        <th>Value</th>
                    </tr>
                    <tr>
                        <td>Entries with no catalytic enzyme identified</td>
                        <td>{no_catalytic}</td>
                    </tr>
                    <tr>
                        <td>Entries with missing KEGG IDs</td>
                        <td>{kegg_missing}</td>
                    </tr>
                    <tr>
                        <td>Entries with missing enzyme information</td>
                        <td>{missing_enzyme}</td>
                    </tr>
                </table>
            </div>

            <div class="card">
                <h2>Duplicates</h2>
                <table>
                    <tr>
                        <th>Metric</th>
                        <th>Value</th>
                    </tr>
                    <tr>
                        <td>Number of duplicates</td>
                        <td>{duplicates}</td>
                    </tr>
                </table>
                <p>
                    Duplicates occur when multiple reactions share the same enzyme-substrate combination. 
                    A high number of duplicates may result from multiple enzyme complexes sharing the same catalytic enzyme.
                </p>
            </div>

            <!-- Prediction Instructions -->
            <div class="card">
                <h2>Running k<sub>cat</sub> Predictions with CataPro</h2>
                <p>
                    This report provides the input needed to run the CataPro machine learning model 
                    (<a href="https://github.com/zchwang/CataPro" target="_blank">CataPro repository</a>). 
                    Follow the instructions in the repository to set up the environment and generate k<sub>cat</sub> predictions.
                </p>
            </div>

        </div>

        <footer>WILDkCAT</footer>
    """
    if shader:
        html += report_shader()
    else: 
        html += report_simple()
    html += """
    </body>
    </html>
    """

    # Save report
    os.makedirs(os.path.join(output_folder, "reports"), exist_ok=True)
    report_path = os.path.join(output_folder, "reports/predict_report.html")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(html)
    logging.info(f"HTML report saved to '{report_path}'")

report_retrieval(df, output_folder, shader=False)

Generate a styled HTML report summarizing the kcat matching results, including the kcat value distribution and the matching score distribution.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing data from the run_retrieval function.

required
output_folder str

Path to the output folder where the report will be saved.

required
shader bool

If True, includes a shader canvas background in the report. Default is False.

False

Returns:

Name Type Description
None None

The function saves the generated HTML report to 'reports/retrieve_report.html'.

Source code in wildkcat/utils/generate_reports.py
def report_retrieval(df, output_folder, shader=False) -> None:
    """
    Generate a styled HTML report summarizing the kcat matching results,
    including the kcat value distribution and the matching score distribution.

    Parameters:
        df (pd.DataFrame): DataFrame containing data from the run_retrieval function.
        output_folder (str): Path to the output folder where the report will be saved.
        shader (bool, optional): If True, includes a shader canvas background in the report. Default is False.

    Returns:
        None: The function saves the generated HTML report to 'reports/retrieve_report.html'.
    """
    # Ensure numeric kcat values to avoid TypeError on comparisons
    kcat_values = pd.to_numeric(df['kcat'], errors='coerce').dropna()

    # Only use scores present in the data
    present_scores = sorted(df['matching_score'].dropna().unique())
    score_counts = df['matching_score'].value_counts().reindex(present_scores, fill_value=0)
    total = len(df) - 1
    matched = len(kcat_values)
    match_percent = matched / total * 100 if total else 0
    score_percent = (score_counts / total * 100).round(2) if total else pd.Series(0, index=present_scores)

    # Distinct colors for each score (up to 12, then cycle)
    # Gradient colors from green (best score) to red (worst score)
    distinct_colors = [
        "#27ae60",
        "#43b76e",
        "#60c07c",
        "#7cc98a",
        "#98d298",
        "#b5dbb6",
        "#d1e4c4",
        "#f1e9b6",
        "#f7d97c",
        "#f9c74f",
        "#f8961e",
        "#f3722c",
        "#e67e22",
        "#e74c3c",
        "#c0392b",
        "#a93226",
        "#922b21",
        "#7b241c"
    ]

    def score_color(score):
        idx = present_scores.index(score)
        return distinct_colors[idx % len(distinct_colors)]

    generated_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Histogram with stacked bars for scores
    kcat_hist_base64 = ""
    if not kcat_values.empty:
        min_exp = int(np.floor(np.log10(max(1e-6, kcat_values.min()))))
        max_exp = int(np.ceil(np.log10(kcat_values.max())))
        bins = np.logspace(min_exp, max_exp, num=40)

        fig, ax = plt.subplots(figsize=(10, 6))

        # Stacked histogram by score

        hist_data = [pd.to_numeric(df[df['matching_score'] == score]['kcat'], errors='coerce').dropna() for score in present_scores]
        ax.hist(hist_data, bins=bins, stacked=True, 
                color=[score_color(s) for s in present_scores], label=[f"Score {s}" for s in present_scores], edgecolor='white')

        ax.set_xscale('log')
        ax.set_xlim([10**min_exp / 1.5, 10**max_exp * 1.5])
        ax.xaxis.set_major_formatter(LogFormatter(10))

        ax.set_xlabel("kcat (s⁻¹)", fontsize=12)
        ax.set_ylabel("Count", fontsize=12)
        ax.set_title(f"", fontsize=13)
        ax.legend(title="Matching Score", fontsize=12)

        plt.tight_layout()
        buf = io.BytesIO()
        plt.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        kcat_hist_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')

    # HTML start
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Retrieve kcat Report</title>
        {report_style()}
    </head>
    <body>
        <header>
            <canvas id="shader-canvas"></canvas>
            <div class="overlay">
                <h1>Retrieve k<sub>cat</sub> Report</h1>
                <p>Generated on {generated_time}</p>
            </div>
        </header>

        <div class="container">
            <div class="card">
                <h2>Overview</h2>
                <div class="stats-grid">
                    <div class="stat-box">
                        <h3>{total}</h3>
                        <p>Total Entries</p>
                    </div>
                    <div class="stat-box">
                        <h3>{matched}</h3>
                        <p>Matched k<sub>cat</sub> ({match_percent:.2f}%)</p>
                    </div>
                </div>
            </div>

            <div class="card">
                <h2>Matching Score Distribution</h2>
                <div class="progress-stacked">
    """

    # Add progress bars only for present scores
    for score in present_scores:
        percent = score_percent.get(score, 0)
        if percent > 0:
            html += f'<div class="progress-bar" style="width:{percent}%;background:{score_color(score)};" title="Score {score}: {percent:.2f}%"></div>'

    html += """
            </div>
            <div class="legend">
    """

    # Add legend only for present scores
    for score in present_scores:
        html += f'<div class="legend-item"><div class="legend-color" style="background:{score_color(score)};"></div> Score {score}</div>'

    html += """
            </div>
            <table>
                <tr>
                    <th>Score</th>
                    <th>Count</th>
                    <th>Percent</th>
                </tr>
    """

    # Table rows only for present scores
    for score in present_scores:
        html += f'<tr><td>{score}</td><td>{score_counts[score]}</td><td>{score_percent[score]:.2f}%</td></tr>'

    html += """
            </table>
        </div>
    """

    # Histogram section (stacked by score)
    html += """
        <div class="card">
            <h2>Distribution of k<sub>cat</sub> values (Stacked by Matching Score)</h2>
            <div class="img-section">
    """
    if kcat_hist_base64:
        html += f'<img src="data:image/png;base64,{kcat_hist_base64}" alt="k<sub>cat</sub> Distribution">'
    html += """
            </div>
        </div>
    """

    # Metadata section
    html += f"""
            <div class="card">
                <h2>Matching Score</h2>
                <p>
                    The matching score evaluates how well a candidate k<sub>cat</sub> entry fits the query enzyme and conditions. 
                    A lower score indicates a better match (0 = best possible, 15 = no match).
                </p>
                <h3>Scoring process:</h3>
                <ul>
                    <li><b>Catalytic enzyme:</b> Check if the reported enzyme matches the expected catalytic enzyme(s).</li>
                    <li><b>Organism:</b> Penalize mismatches between the source organism and the target organism.</li>
                    <li><b>Enzyme variant:</b> Exclude or penalize mutant/engineered variants (wildtype preferred).</li>
                    <li><b>pH:</b> Check whether the reported pH is consistent with the desired experimental range.</li>
                    <li><b>Substrate:</b> Verify substrate compatibility with the catalytic reaction.</li>
                    <li><b>Temperature:</b> Penalize deviations from the target temperature; 
                        if possible, adjust kcat values using the Arrhenius equation.</li>
                </ul>

                <h3>Score breakdown (default penalties):</h3>
                <table border="1" cellpadding="6" cellspacing="0" style="border-collapse: collapse; text-align: left;">
                    <tr>
                        <th>Criterion</th>
                        <th>Penalty</th>
                    </tr>
                    <tr>
                        <td>Substrate mismatch</td>
                        <td>+3</td>
                    </tr>
                    <tr>
                        <td>Catalytic enzyme mismatch</td>
                        <td>+2</td>
                    </tr>
                    <tr>
                        <td>Organism mismatch</td>
                        <td>+2</td>
                    </tr>
                    <tr>
                        <td>pH unknown</td>
                        <td>+1</td>
                    </tr>
                    <tr>
                        <td>pH out of range</td>
                        <td>+2</td>
                    </tr>
                    <tr>
                        <td>Temperature unknown</td>
                        <td>+1</td>
                    </tr>
                    <tr>
                        <td>Temperature out of range</td>
                        <td>+2</td>
                    </tr>
                    <tr>
                        <td>Enzyme variant unknown</td>
                        <td>+1</td>
                    </tr>
                </table>

                <p>
                    Candidates are then ranked by:
                    <ol>
                        <li>Lowest total score</li>
                        <li>Highest sequence identity percentage to the target enzyme</li>
                        <li>Adjusted k<sub>cat</sub> value (favoring the smallest value by default)</li>
                    </ol>
                </p>
                <p>
                    The best candidate is the one with the lowest score after these checks. 
                    If multiple candidates tie on score, sequence identity and k<sub>cat</sub> values break the tie.
                </p>
            </div>
        </div>

        <footer>WILDkCAT</footer>
    """
    if shader: 
        html += report_shader()
    else: 
        html += report_simple()
    html += """
    </body>
    </html>
    """

    # Save HTML
    os.makedirs(os.path.join(output_folder, "reports"), exist_ok=True)
    report_path = os.path.join(output_folder, "reports/retrieve_report.html")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(html)

    logging.info(f"HTML report saved to '{report_path}'")

report_shader()

Return HTML and GLSL shader code for report background. Adapted from localthunk (https://localthunk.com)

Source code in wildkcat/utils/generate_reports.py
def report_shader(): 
    """Return HTML and GLSL shader code for report background. Adapted from localthunk (https://localthunk.com)"""
    return """
    <!-- Background adapted from original work by localthunk (https://localthunk.com) -->
    <script id="fragShader" type="x-shader/x-fragment">
    precision highp float;
    uniform vec2 iResolution;
    uniform float iTime;
    #define SPIN_ROTATION -1.0
    #define SPIN_SPEED 3.5
    #define OFFSET vec2(0.0)
    #define COLOUR_1 vec4(0.2, 0.4, 0.7, 1.0)
    #define COLOUR_2 vec4(0.6, 0.75, 0.9, 1.0)
    #define COLOUR_3 vec4(0.2, 0.2, 0.25, 1.0)
    #define CONTRAST 3.5
    #define LIGHTING 0.4
    #define SPIN_AMOUNT 0.25
    #define PIXEL_FILTER 745.0
    #define SPIN_EASE 1.0
    #define PI 3.14159265359
    #define IS_ROTATE false
    vec4 effect(vec2 screenSize, vec2 screen_coords) {
        float pixel_size = length(screenSize.xy) / PIXEL_FILTER;
        vec2 uv = (floor(screen_coords.xy*(1./pixel_size))*pixel_size - 0.5*screenSize.xy)/length(screenSize.xy) - OFFSET;
        float uv_len = length(uv);
        float speed = (SPIN_ROTATION*SPIN_EASE*0.2);
        if(IS_ROTATE) {
        speed = iTime * speed;
        }
        speed += 302.2;
        float new_pixel_angle = atan(uv.y, uv.x) + speed - SPIN_EASE*20.*(1.*SPIN_AMOUNT*uv_len + (1. - 1.*SPIN_AMOUNT));
        vec2 mid = (screenSize.xy/length(screenSize.xy))/2.;
        uv = (vec2((uv_len * cos(new_pixel_angle) + mid.x), (uv_len * sin(new_pixel_angle) + mid.y)) - mid);
        uv *= 30.;
        speed = iTime*(SPIN_SPEED);
        vec2 uv2 = vec2(uv.x+uv.y);
        for(int i=0; i < 5; i++) {
            uv2 += sin(max(uv.x, uv.y)) + uv;
            uv  += 0.5*vec2(cos(5.1123314 + 0.353*uv2.y + speed*0.131121),sin(uv2.x - 0.113*speed));
            uv  -= 1.0*cos(uv.x + uv.y) - 1.0*sin(uv.x*0.711 - uv.y);
        }
        float contrast_mod = (0.25*CONTRAST + 0.5*SPIN_AMOUNT + 1.2);
        float paint_res = min(2., max(0.,length(uv)*(0.035)*contrast_mod));
        float c1p = max(0.,1. - contrast_mod*abs(1.-paint_res));
        float c2p = max(0.,1. - contrast_mod*abs(paint_res));
        float c3p = 1. - min(1., c1p + c2p);
        float light = (LIGTHING - 0.2)*max(c1p*5. - 4., 0.) + LIGTHING*max(c2p*5. - 4., 0.);
        return (0.3/CONTRAST)*COLOUR_1 + (1. - 0.3/CONTRAST)*(COLOUR_1*c1p + COLOUR_2*c2p + vec4(c3p*COLOUR_3.rgb, c3p*COLOUR_1.a)) + light;
    }
    void mainImage(out vec4 fragColor, in vec2 fragCoord) {
        vec2 uv = fragCoord/iResolution.xy;
        fragColor = effect(iResolution.xy, uv * iResolution.xy);
    }
    void main() { mainImage(gl_FragColor, gl_FragCoord.xy); }
    </script>
    <script>
    const canvas = document.getElementById("shader-canvas");
    const gl = canvas.getContext("webgl");
    function resize() {
        canvas.width = canvas.clientWidth * window.devicePixelRatio;
        canvas.height = canvas.clientHeight * window.devicePixelRatio;
        gl.viewport(0, 0, canvas.width, canvas.height);
    }
    window.addEventListener("resize", resize);
    resize();
    const vertexSrc = `
    attribute vec2 position;
    void main() {
        gl_Position = vec4(position, 0.0, 1.0);
    }
    `;
    const fragSrc = document.getElementById("fragShader").text;
    function compileShader(src, type) {
        const shader = gl.createShader(type);
        gl.shaderSource(shader, src);
        gl.compileShader(shader);
        if (!gl.getShaderParameter(shader, gl.COMPILE_STATUS)) {
            console.error(gl.getShaderInfoLog(shader));
        }
        return shader;
    }
    const vertexShader = compileShader(vertexSrc, gl.VERTEX_SHADER);
    const fragmentShader = compileShader(fragSrc, gl.FRAGMENT_SHADER);
    const program = gl.createProgram();
    gl.attachShader(program, vertexShader);
    gl.attachShader(program, fragmentShader);
    gl.linkProgram(program);
    gl.useProgram(program);
    const positionBuffer = gl.createBuffer();
    gl.bindBuffer(gl.ARRAY_BUFFER, positionBuffer);
    gl.bufferData(gl.ARRAY_BUFFER, new Float32Array([
    -1, -1, 1, -1, -1, 1,
    -1, 1, 1, -1, 1, 1
    ]), gl.STATIC_DRAW);
    const positionLoc = gl.getAttribLocation(program, "position");
    gl.enableVertexAttribArray(positionLoc);
    gl.vertexAttribPointer(positionLoc, 2, gl.FLOAT, false, 0, 0);
    const iResolutionLoc = gl.getUniformLocation(program, "iResolution");
    const iTimeLoc = gl.getUniformLocation(program, "iTime");
    function render(time) {
        resize();
        gl.uniform2f(iResolutionLoc, canvas.width, canvas.height);
        gl.uniform1f(iTimeLoc, time * 0.001);
        gl.drawArrays(gl.TRIANGLES, 0, 6);
        requestAnimationFrame(render);
    }
    requestAnimationFrame(render);
    </script>
    """

report_simple()

Return HTML code for report background.

Source code in wildkcat/utils/generate_reports.py
def report_simple():
    """Return HTML code for report background."""
    return """
    <style>
        header {
            background-color: #2980b9; /* simple blue background */
            margin: 0;
            padding: 0;
        }
    </style>
    """

report_style()

Return CSS stylesheet for report styling.

Source code in wildkcat/utils/generate_reports.py
def report_style():
    """Return CSS script for report style."""
    return """
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background-color: #f4f6f9;
            margin: 0;
            padding: 0;
            color: #333;
        }
        header {
            position: relative;
            width: 100%;
            height: 150px;
            overflow: hidden;
            display: flex;
            align-items: center;
            justify-content: center;
            color: #fff;
            text-align: center;
        }
        header canvas {
            position: absolute;
            top: 0; left: 0;
            width: 100%;
            height: 100%;
            z-index: 0;
        }
        header::before {
            content: "";
            position: absolute;
            top: 0; left: 0; right: 0; bottom: 0;
            background: linear-gradient(
                rgba(0,0,0,0.5),
                rgba(0,0,0,0.3)
            );
            z-index: 1;
        }
        header .overlay {
            position: relative;
            z-index: 2;
            padding: 10px 20px;
            border-radius: 8px;
        }
        header h1 {
            margin: 0;
            font-size: 2.5rem;
            font-weight: bold;
            text-shadow: 0 2px 6px rgba(0,0,0,0.6);
        }
        header p {
            margin: 8px 0 0;
            font-size: 1.1rem;
            text-shadow: 0 1px 4px rgba(0,0,0,0.6);
        }
        .container {
            max-width: 1100px;
            margin: 30px auto;
            padding: 20px;
        }
        .card {
            background: #fff;
            border-radius: 12px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.05);
        }
        .card h2 {
            margin-top: 0;
            color: #2980b9;
            border-bottom: 2px solid #e6e6e6;
            padding-bottom: 10px;
            font-size: 1.5rem;
        }
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
            margin-top: 15px;
        }
        .stat-box {
            background: #f9fafc;
            border-radius: 8px;
            padding: 15px;
            text-align: center;
            border: 1px solid #e2e2e2;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
            font-size: 0.95rem;
        }
        table th, table td {
            border: 1px solid #ddd;
            padding: 10px;
            text-align: left;
        }
        table th {
            background-color: #2980b9;
            color: #fff;
        }
        table tr:nth-child(even) {
            background-color: #f2f2f2;
        }
        .progress {
            background-color: #ddd;
            border-radius: 10px;
            overflow: hidden;
            height: 18px;
            width: 100%;
            margin-top: 5px;
        }
        .progress-stacked {
            display: flex;
            height: 18px;
            border-radius: 10px;
            overflow: hidden;
            background-color: #ddd;
            font-size: 0.75rem;
            line-height: 18px;
            color: white;
            text-shadow: 0 1px 1px rgba(0,0,0,0.2);
            margin-bottom: 10px;
        }
        .progress-bar {
            display: flex;
            align-items: center;
            justify-content: center;
            height: 100%;
            white-space: nowrap;
            overflow: hidden;
        }
        .progress-bar-table {
            background-color: #27ae60;
            height: 100%;
            text-align: right;
            padding-right: 5px;
            color: white;
            font-size: 0.8rem;
            line-height: 18px;
        }
        .progress-multi {
            display: flex;
            width: 100%;
            height: 25px;
            border-radius: 12px;
            overflow: hidden;
            border: 1px solid #ccc;
        }
        .progress-segment {
            height: 100%;
        }
        .legend {
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            font-size: 0.85rem;
            margin-top: 5px;
        }
        .legend-item {
            display: flex;
            align-items: center;
            gap: 5px;
        }
        .legend-color {
            width: 14px;
            height: 14px;
            border-radius: 3px;
            border: 1px solid #aaa;
        }
        .img-section {
            display: flex;
            flex-wrap: wrap;
            gap: 30px;
            justify-content: center;
            align-items: flex-start;
            margin-top: 20px;
        }
        footer {
            text-align: center;
            font-size: 0.9rem;
            color: #777;
            padding: 15px;
            margin-top: 20px;
            border-top: 1px solid #ddd;
        }
    </style>
    """