# Source code for recipes.cpc2.baseline.predict
"""Make intelligibility predictions from HASPI scores."""
from __future__ import annotations
import json
import logging
from pathlib import Path
import hydra
import numpy as np
import pandas as pd
from omegaconf import DictConfig
from scipy.optimize import curve_fit
from clarity.utils.file_io import read_jsonl
log = logging.getLogger(__name__)
class LogisticModel:
    """Class to represent a logistic mapping.

    Fits a logistic mapping from input values x to output values y of the
    form y = 100 / (1 + exp(-k * (x - x_0))), so outputs lie in (0, 100).
    """

    # Fitted parameters [x_0, k]; remains None until fit() has been called.
    params: np.ndarray | None = None

    def _logistic_mapping(self, x, x_0, k):
        """Logistic function.

        Args:
            x: The input value(s).
            x_0: Logistic parameter: the x value of the logistic's midpoint.
            k: Logistic parameter: the growth rate of the curve.

        Returns:
            The output of the logistic function.
        """
        return 100.0 / (1 + np.exp(-k * (x - x_0)))

    def fit(self, x, y):
        """Fit a mapping from x values to y values.

        Args:
            x: Input values.
            y: Target values (expected in the range [0, 100]).
        """
        initial_guess = [0.5, 1.0]  # Initial guess for [x_0, k]
        # curve_fit returns (popt, pcov); only the parameters are kept.
        self.params, *_pcov = curve_fit(
            self._logistic_mapping, x, y, p0=initial_guess
        )

    def predict(self, x):
        """Predict y values given x.

        Args:
            x: Input values.

        Returns:
            The predicted y values.

        Raises:
            TypeError: If the predict() method is called before fit().
        """
        if self.params is None:
            raise TypeError(
                "params is None. Logistic fit() must be called before predict()."
            )
        return self._logistic_mapping(x, self.params[0], self.params[1])
def make_disjoint_train_set(
    full_df: pd.DataFrame, test_df: pd.DataFrame
) -> pd.DataFrame:
    """Make a disjoint train set for given test samples.

    Drops every row of full_df that shares a signal, system or listener
    with any row of test_df, so the returned training data is fully
    disjoint from the test samples.
    """
    # Successively filter out rows sharing any of the three key columns.
    disjoint_df = full_df
    for column in ("signal", "system", "listener"):
        disjoint_df = disjoint_df[~disjoint_df[column].isin(test_df[column])]
    assert not set(disjoint_df.signal).intersection(set(test_df.signal))
    return disjoint_df
# pylint: disable = no-value-for-parameter
@hydra.main(config_path=".", config_name="config", version_base=None)
def predict(cfg: DictConfig):
    """Predict intelligibility from HASPI scores.

    Reads the dataset metadata and precomputed HASPI scores, then for each
    record fits a logistic HASPI-to-correctness mapping on a disjoint
    training set and appends the prediction to <dataset>.predict.csv.
    """
    # Load the intelligibility dataset records
    dataset_filename = Path(cfg.path.metadata_dir) / f"{cfg.dataset}.json"
    with dataset_filename.open("r", encoding="utf-8") as fp:
        records = json.load(fp)

    # Load the HASPI scores and attach each one to its matching record
    haspi_score = read_jsonl(f"{cfg.dataset}.haspi.jsonl")
    haspi_score_index = {record["signal"]: record["haspi"] for record in haspi_score}
    for record in records:
        record["haspi_score"] = haspi_score_index[record["signal"]]
    records_df = pd.DataFrame(records)

    # Make a prediction for each item in the data
    for i, _record in records_df.iterrows():
        test_df = records_df.iloc[[i]].copy()

        # The prediction is made using a logistic mapping from HASPI scores to
        # intelligibility. It is important that this mapping is trained using a
        # disjoint set of data, i.e. we define a training data set that does not
        # contain systems, listeners or signals that appear in the test data sample.
        train_df = make_disjoint_train_set(records_df, test_df)
        model = LogisticModel()
        model.fit(train_df.haspi_score, train_df.correctness)

        # Predict correctness from haspi scores using the fitted model
        test_df["predicted_correctness"] = model.predict(test_df.haspi_score)

        # Write the header on the first row, then append subsequent rows
        if i == 0:
            header, mode = ["signal_ID", "intelligibility_score"], "w"
        else:
            header, mode = False, "a"
        test_df[["signal", "predicted_correctness"]].to_csv(
            f"{cfg.dataset}.predict.csv",
            index=False,
            header=header,
            mode=mode,
        )


if __name__ == "__main__":
    predict()