Source code for clarity.evaluator.hasqi.hasqi

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Final

import numpy as np

from clarity.evaluator.haspi import eb
from clarity.utils.audiogram import Audiogram, Listener

if TYPE_CHECKING:
    from numpy import ndarray


# HASQI assumes the following audiogram frequencies:
HASQI_AUDIOGRAM_FREQUENCIES: Final = np.array([250, 500, 1000, 2000, 4000, 6000])
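

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): audiograms measured at
# other frequencies are interpolated onto the grid above before scoring, using
# the same has_frequencies()/resample() calls that hasqi_v2 makes below.  The
# Audiogram(levels=..., frequencies=...) construction is an assumption about
# the clarity.utils.audiogram API, shown for illustration only.
def _example_resample_audiogram() -> Audiogram:
    """Hypothetical helper mapping an audiogram onto the HASQI frequency grid."""
    # Audiogram missing measurements at 2000 and 4000 Hz (hypothetical data)
    audiogram = Audiogram(
        levels=np.array([10, 20, 35, 50, 60, 65]),
        frequencies=np.array([250, 500, 1000, 3000, 6000, 8000]),
    )
    if not audiogram.has_frequencies(HASQI_AUDIOGRAM_FREQUENCIES):
        audiogram = audiogram.resample(HASQI_AUDIOGRAM_FREQUENCIES)
    return audiogram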


def hasqi_v2(
    reference: ndarray,
    reference_sample_rate: float,
    processed: ndarray,
    processed_sample_rate: float,
    audiogram: Audiogram,
    equalisation: int = 1,
    level1: float = 65.0,
    silence_threshold: float = 2.5,
    add_noise: float = 0.0,
    segment_covariance: int = 16,
) -> tuple[float, float, float, list[float]]:
    """
    Compute the HASQI version 2 quality index using the auditory model,
    followed by the envelope cepstral correlation and the average short-time
    coherence of the basilar membrane (BM) vibration signals.

    The reference signal presentation level for NH listeners is assumed to be
    65 dB SPL. The same model is used for both normal and impaired hearing.

    Arguments:
        reference (np.ndarray): Clean input reference speech signal with no
            noise or distortion. If a hearing loss is specified, NAL-R
            equalization is optional.
        reference_sample_rate (float): Sampling rate in Hz for the reference signal.
        processed (np.ndarray): Output signal with noise, distortion, HA gain,
            and/or processing.
        processed_sample_rate (float): Sampling rate in Hz for the processed signal.
        audiogram (Audiogram): Listener audiogram. Hearing loss is taken at the
            6 audiometric frequencies [250, 500, 1000, 2000, 4000, 6000] Hz;
            audiograms measured at other frequencies are interpolated.
        equalisation (int): Mode to use when equalising the reference signal:
            1 = no EQ has been provided, the function will add NAL-R
            2 = NAL-R EQ has already been added to the reference signal
        level1 (float): Optional input specifying the level in dB SPL that
            corresponds to a signal RMS = 1. Default is 65 dB SPL if the
            argument is not provided.
        silence_threshold (float): Silence threshold sum across bands, dB above
            audio threshold. Default: 2.5
        add_noise (float): Additive noise in dB SL added to the conditional
            cross-covariance. Default is 0.0
        segment_covariance (int): Segment size for the covariance calculation.
            Default is 16

    Returns:
        tuple(Combined, Nonlin, Linear, raw)
            Combined: Quality estimate, the product of the nonlinear and linear terms
            Nonlin: Nonlinear quality component = (cepstral corr)^2 x seg BM coherence
            Linear: Linear quality component = std of spectrum and spectrum slope
            raw: Vector of raw values = [CepCorr, BMsync5, Dloud, Dslope]

    James M. Kates, 5 August 2013.
    Translated from MATLAB to Python by Gerardo Roa Dabike, October 2022.
""" if not audiogram.has_frequencies(HASQI_AUDIOGRAM_FREQUENCIES): logging.warning( "Audiogram does not have all HASQI frequency measurements" "Measurements will be interpolated" ) audiogram = audiogram.resample(HASQI_AUDIOGRAM_FREQUENCIES) # Auditory model for quality # Reference is no processing or NAL-R, impaired hearing ( reference_db, reference_basilar_membrane, processed_db, processed_basilar_membrane, reference_sl, processed_sl, sample_rate, ) = eb.ear_model( reference, reference_sample_rate, processed, processed_sample_rate, audiogram.levels, equalisation, level1, ) # Envelope and long-term average spectral features # Smooth the envelope outputs: 125 Hz sub-sampling rate reference_smooth = eb.env_smooth(reference_db, segment_covariance, sample_rate) processed_smooth = eb.env_smooth(processed_db, segment_covariance, sample_rate) # Mel cepstrum correlation using smoothed envelopes ( average_cepstral_correlation, _individual_cepstral_correlations, ) = eb.mel_cepstrum_correlation( reference_smooth, processed_smooth, silence_threshold, add_noise ) # Linear changes in the log-term spectra # dloud vector: [sum abs diff, std dev diff, max diff] spectra # dnorm vector: [sum abs diff, std dev diff, max diff] norm spectra # dslope vector: [sum abs diff, std dev diff, max diff] slope d_loud_stats, _d_norm_stats, d_slope_stats = eb.spectrum_diff( reference_sl, processed_sl ) # Temporal fine structure correlation measurements # Compute the time-frequency segment covariances ( signal_cross_covariance, reference_mean_square, _processed_mean_square, ) = eb.bm_covary( reference_basilar_membrane, processed_basilar_membrane, segment_covariance, sample_rate, ) # Average signal segment cross-covariance # avecov=weighted ave of cross-covariances, using only data above threshold # syncov=ave cross-covariance with added IHC loss of synchronization at HF silence_threshold = 2.5 # Threshold in dB SL for including time-freq tile _, ihc_sync_covariance = eb.ave_covary2( signal_cross_covariance, reference_mean_square, silence_threshold ) basilar_membrane_sync5 = ihc_sync_covariance[ 4 ] # Ave segment coherence with IHC loss of sync # Extract and normalize the spectral features # Dloud:std d_loud = d_loud_stats[1] / 2.5 # Loudness difference std d_loud = 1.0 - d_loud # 1=perfect, 0=bad d_loud = max(min(d_loud, 1.0), 0.0) # Dslope:std d_slope = d_slope_stats[1] # Slope difference std d_slope = 1.0 - d_slope d_slope = max(min(d_slope, 1.0), 0.0) # Construct the models # Nonlinear model non_linear = ( average_cepstral_correlation**2 ) * basilar_membrane_sync5 # Combined envelope and temporal fine structure # Linear model linear = 0.579 * d_loud + 0.421 * d_slope # Linear fit # Combined model combined = non_linear * linear # Product of nonlinear x linear # Raw data raw = [average_cepstral_correlation, basilar_membrane_sync5, d_loud, d_slope] return combined, non_linear, linear, raw
def hasqi_v2_better_ear(
    reference_left: ndarray,
    reference_right: ndarray,
    processed_left: ndarray,
    processed_right: ndarray,
    sample_rate: float,
    listener: Listener,
    level: float = 100.0,
) -> float:
    """Better ear HASQI.

    Calculates HASQI for the left and right ears and returns the better score.

    Args:
        reference_left (np.ndarray): left channel of the reference signal
        reference_right (np.ndarray): right channel of the reference signal
        processed_left (np.ndarray): left channel of the processed signal
        processed_right (np.ndarray): right channel of the processed signal
        sample_rate (float): sampling rate for all four signals
        listener (Listener): listener with left and right ear audiograms
        level (float): level in dB SPL corresponding to RMS=1

    Returns:
        float: beHASQI score

    Gerardo Roa Dabike, November 2022
    """
    score_left, _, _, _ = hasqi_v2(
        reference_left,
        sample_rate,
        processed_left,
        sample_rate,
        listener.audiogram_left,
        equalisation=1,
        level1=level,
    )
    score_right, _, _, _ = hasqi_v2(
        reference_right,
        sample_rate,
        processed_right,
        sample_rate,
        listener.audiogram_right,
        equalisation=1,
        level1=level,
    )
    return max(score_left, score_right)
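

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): better-ear scoring with a
# Listener holding one audiogram per ear.  The Listener(audiogram_left=...,
# audiogram_right=...) construction is an assumption about the
# clarity.utils.audiogram API; the signals are placeholder noise.
if __name__ == "__main__":
    fs = 24000
    rng = np.random.default_rng(0)
    left = rng.standard_normal(2 * fs)
    right = rng.standard_normal(2 * fs)
    audiogram = Audiogram(
        levels=np.array([30, 35, 40, 45, 55, 60]),
        frequencies=HASQI_AUDIOGRAM_FREQUENCIES,
    )
    listener = Listener(audiogram_left=audiogram, audiogram_right=audiogram)
    score = hasqi_v2_better_ear(
        left, right, left, right, sample_rate=fs, listener=listener, level=100.0
    )
    print(f"Better-ear HASQI: {score:.3f}")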