import json
import logging
from pathlib import Path
import hydra
import numpy as np
import pandas as pd
from omegaconf import DictConfig
from scipy.optimize import curve_fit
from scipy.stats import kendalltau, pearsonr
logger = logging.getLogger(__name__)
def rmse_score(x, y):
    return np.sqrt(np.mean((x - y) ** 2))

def ncc_score(x, y):
    return pearsonr(x, y)[0]

def kt_score(x, y):
    return kendalltau(x, y)[0]

def std_err(x, y):
    return np.std(x - y) / np.sqrt(len(x))

class Model:
    """Class to represent the mapping from MBSTOI scores to intelligibility scores.

    The mapping uses a simple logistic function scaled between 0 and 100.
    The mapping parameters must first be fitted on (MBSTOI, intelligibility score)
    pairs using fit(). Once the fit has been made, predictions can be made by
    calling predict().
    """

    params = None  # The fitted logistic parameters (x0, k)

    def _logistic_mapping(self, x, x0, k):
        """Logistic function.

        x0 - x value of the logistic's midpoint
        k - the logistic growth rate or steepness of the curve
        """
        L = 100  # correctness can't be over 100
        return L / (1 + np.exp(-k * (x - x0)))

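    # Illustrative check of _logistic_mapping (made-up parameter values, not fitted
    # ones): with x0 = 0.5 and k = 10, an MBSTOI score of 0.5 maps to 50 and a score
    # of 0.7 maps to roughly 88.
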
    def fit(self, pred, intel):
        """Fit a mapping between MBSTOI scores and intelligibility scores."""
        initial_guess = [0.5, 1.0]  # Initial guess for parameter values
        self.params, *_remaining_returns = curve_fit(
            self._logistic_mapping, pred, intel, initial_guess
        )

    def predict(self, x):
        """Predict intelligibility scores from MBSTOI scores."""
        # Note, fit() must be called before predictions can be made
        assert self.params is not None
        return self._logistic_mapping(x, self.params[0], self.params[1])

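# Illustrative usage sketch of Model (the MBSTOI/correctness values below are made
# up and only show the fit/predict flow):
#
#     model = Model()
#     model.fit(np.array([0.2, 0.4, 0.6, 0.8]), np.array([10.0, 45.0, 80.0, 95.0]))
#     model.predict(np.array([0.5]))  # -> predicted correctness in [0, 100]
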
def compute_scores(predictions, labels):
    """Compute RMSE, standard error, Pearson correlation (NCC), and Kendall's tau."""
    return {
        "RMSE": rmse_score(predictions, labels),
        "Std": std_err(predictions, labels),
        "NCC": ncc_score(predictions, labels),
        "KT": kt_score(predictions, labels),
    }

def read_data(pred_csv: Path, label_json: Path):
df_pred = pd.read_csv(pred_csv).rename(
columns={"signal_ID": "signal", "intelligibility_score": "prediction"}
)
df_label = pd.read_json(label_json).rename( # pylint: disable=no-member
columns={"correctness": "label"}
)
data = df_pred.merge(df_label[["signal", "label"]])
data["prediction"] = data["prediction"].apply(lambda x: x * 100)
return data
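# Expected inputs to read_data (illustrative, inferred from the column renames above):
# the prediction CSV has "signal_ID" and "intelligibility_score" columns, and the
# label JSON provides "signal" and "correctness" fields for the same signals.
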
@hydra.main(config_path=".", config_name="config", version_base=None)
def run(cfg: DictConfig) -> None:
logger.info("Run evaluation on the closed set.")
data_tr = read_data(
pred_csv=Path(cfg.train_path.exp_folder) / "sii.csv",
label_json=Path(cfg.train_path.scenes_file),
)
data_eval = read_data(
pred_csv=Path(cfg.test_path.exp_folder) / "sii.csv",
label_json=Path(cfg.test_path.scenes_file),
)
logger.info("Apply logistic fitting.")
model = Model()
model.fit(data_tr["prediction"].to_numpy(), data_tr["label"].to_numpy())
fit_pred = model.predict(data_eval["prediction"].to_numpy())
closed_set_scores = compute_scores(fit_pred, data_eval["label"].to_numpy())
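    # The open-set evaluation below repeats the same fit/predict procedure on the
    # independent train/test split configured under train_indep_path/test_indep_path.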
logger.info("Run evaluation on the open set.")
data_tr = read_data(
pred_csv=Path(cfg.train_indep_path.exp_folder) / "sii.csv",
label_json=Path(cfg.train_indep_path.scenes_file),
)
data_eval = read_data(
pred_csv=Path(cfg.test_indep_path.exp_folder) / "sii.csv",
label_json=Path(cfg.test_indep_path.scenes_file),
)
logger.info("Apply logistic fitting.")
model = Model()
model.fit(data_tr["prediction"].to_numpy(), data_tr["label"].to_numpy())
fit_pred = model.predict(data_eval["prediction"].to_numpy())
open_set_scores = compute_scores(fit_pred, data_eval["label"].to_numpy())
with open("results.json", "w", encoding="utf-8") as fp:
json.dump(
{
"closed_set scores:": closed_set_scores,
"open_set scores:": open_set_scores,
},
fp,
)
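# The resulting results.json maps each evaluation set to its metric dictionary,
# e.g. (placeholder values):
#     {"closed_set_scores": {"RMSE": ..., "Std": ..., "NCC": ..., "KT": ...},
#      "open_set_scores": {"RMSE": ..., "Std": ..., "NCC": ..., "KT": ...}}
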
# pylint: disable=no-value-for-parameter
if __name__ == "__main__":
run()