Source code for clarity.evaluator.haspi.eb

"""Module for HASPI, HASQI, HAAQI EBs"""

from __future__ import annotations

# pylint: disable=import-error
import logging
from typing import TYPE_CHECKING

import numpy as np
from numba import njit  # type: ignore # <-- silence mypy no attribute error
from scipy.signal import (
    butter,
    cheby2,
    convolve,
    correlate,
    firwin,
    group_delay,
    lfilter,
    resample_poly,
)

from clarity.enhancer.nalr import NALR
from clarity.utils.audiogram import Audiogram

if TYPE_CHECKING:
    from numpy import ndarray

logger = logging.getLogger(__name__)



[docs]
def ear_model(
    reference: ndarray,
    reference_freq: float,
    processed: ndarray,
    processed_freq: float,
    hearing_loss: ndarray,
    itype: int,
    level1: float,
    nchan: int = 32,
    m_delay: int = 1,
    shift: float | None = None,
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, ndarray, float]:
    """
    Function that implements a cochlear model that includes the middle ear,
    auditory filter bank, Outer Hair Cell (OHC) dynamic-range compression,
    and Inner Hair Cell (IHC) attenuation.

    The inputs are the reference and processed signals that are to be
    compared. The reference is at the reference intensity (e.g. 65 dB SPL
    or with NAL-R amplification) and has no other processing. The processed
    signal is the hearing-aid output, and is assumed to have the same or
    greater group delay compared to the reference.

    The function outputs the envelopes of the signals after OHC compression
    and IHC loss attenuation.

    Arguments:
        reference (np.ndarray): reference signal: should be adjusted to 65 dB SPL
            (itype=0 or 1) or to 65 dB SPL plus NAL-R gain (itype=2)
        reference_freq (int): sampling rate for the reference signal, Hz
        processed (np.ndarray): processed signal (e.g. hearing-aid output) includes
            HA gain
        processed_freq (int): sampling rate for the processed signal, Hz
        hearing_loss (np.ndarray): audiogram giving the hearing loss in dB at 6
            audiometric frequencies: [250, 500, 1000, 2000, 4000, 6000] Hz
        itype (int): purpose for the calculation:
             0=intelligibility: reference is normal hearing and must not
               include NAL-R EQ
             1=quality: reference does not include NAL-R EQ
             2=quality: reference already has NAL-R EQ applied
        level1:   level calibration: signal RMS=1 corresponds to Level1 dB SPL
        nchan (int): auditory frequency bands
        m_delay (int): Compensate for the gammatone group delay.
        shift (float): Basal shift of the basilar membrane length

    Returns:
        reference_db (): envelope for the reference in each band
        reference_basilar_membrane (): BM motion for the reference in each band
        processed_db (): envelope for the processed signal in each band
        processed_basilar_membrane (): BM motion for the processed signal in each band
        reference_sl (): compressed RMS average reference in each band converted
            to dB SL
        processed_sl (): compressed RMS average output in each band converted to dB SL
        freq_sample (): sampling rate in Hz for the model outputs

    Updates:
    James M. Kates, 27 October 2011.
    Basilar Membrane added 30 Dec 2011.
    Revised 19 June 2012.
    Remove match of reference RMS level to processed 29 August 2012.
    IHC adaptation added 1 October 2012.
    Basilar Membrane envelope converted to dB SL, 2 Oct 2012.
    Filterbank group delay corrected, 14 Dec 2012.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    Updated by Gerardo Roa Dabike, September 2022.
    """

    # Processing parameters
    # OHC and IHC parameters for the hearing loss
    # Auditory filter center frequencies span 80 to 8000 Hz.
    _center_freq = center_frequency(nchan)  # Center frequencies on an ERB scale

    # Cochlear model parameters for the processed signal
    (
        attn_ohc_y,
        bandwidth_min_y,
        low_knee_y,
        compression_ratio_y,
        attn_ihc_y,
    ) = loss_parameters(hearing_loss, _center_freq)

    # The cochlear model parameters for the reference are the same as for the hearing
    # loss if calculating quality, but are for normal hearing if calculating
    # intelligibility.
    if itype == 0:
        hearing_loss_x = np.zeros(len(hearing_loss))
    else:
        hearing_loss_x = hearing_loss
    [
        attn_ohc_x,
        bandwidth_min_x,
        low_knee_x,
        compression_ratio_x,
        attn_ihc_x,
    ] = loss_parameters(hearing_loss_x, _center_freq)

    # Compute center frequencies for the control
    _center_freq_control = center_frequency(nchan, shift)
    # Maximum BW for the control
    _, bandwidth_1, _, _, _ = loss_parameters(np.full(6, 100), _center_freq_control)

    # Input signal adjustments
    # Convert the signals to 24 kHz sampling rate.
    # Using 24 kHz guarantees that all of the cochlear filters have the same shape
    # independent of the incoming signal sampling rates
    reference_24hz, _ = resample_24khz(reference, reference_freq)
    processed_24hz, freq_sample = resample_24khz(processed, processed_freq)

    # Check file sizes
    min_signal_length = min(len(reference_24hz), len(processed_24hz))
    reference_24hz = reference_24hz[:min_signal_length]
    processed_24hz = processed_24hz[:min_signal_length]

    # Bulk broadband signal alignment
    reference_24hz, processed_24hz = input_align(reference_24hz, processed_24hz)
    nsamp = len(reference_24hz)

    # For HASQI, here add NAL-R equalization if the quality reference doesn't
    # already have it.
    if itype == 1:
        nfir = 140  # Length in samples of the FIR NAL-R EQ filter (24-kHz rate)
        enhancer = NALR(nfir, freq_sample)
        audiogram = Audiogram(
            levels=hearing_loss,
            frequencies=np.array([250, 500, 1000, 2000, 4000, 6000]),
        )
        nalr_fir, _ = enhancer.build(audiogram)
        reference_24hz = enhancer.apply(nalr_fir, reference_24hz)
        reference_24hz = reference_24hz[nfir : nfir + nsamp]

    # Cochlear model
    # Middle ear
    reference_mid = middle_ear(reference_24hz, freq_sample)
    processed_mid = middle_ear(processed_24hz, freq_sample)

    # Initialize storage
    # Reference and processed envelopes and BM motion
    reference_db = np.zeros((nchan, nsamp))
    processed_db = np.zeros((nchan, nsamp))

    # Reference and processed average spectral values
    reference_average = np.zeros(nchan)
    processed_average = np.zeros(nchan)
    reference_control_average = np.zeros(nchan)
    processed_control_average = np.zeros(nchan)

    # Filter bandwidths adjusted for intensity
    reference_bandwidth = np.zeros(nchan)
    processed_bandwidth = np.zeros(nchan)

    reference_b = np.zeros((nchan, nsamp))
    processed_b = np.zeros((nchan, nsamp))

    # Loop over each filter in the auditory filter bank
    for n in range(nchan):
        # Control signal envelopes for the reference and processed signals
        reference_control, _, processed_control, _ = gammatone_basilar_membrane(
            reference_mid,
            bandwidth_1[n],
            processed_mid,
            bandwidth_1[n],
            freq_sample,
            _center_freq_control[n],
        )

        # Adjust the auditory filter bandwidths for the average signal level
        reference_bandwidth[n] = bandwidth_adjust(
            reference_control, bandwidth_min_x[n], bandwidth_1[n], level1
        )
        processed_bandwidth[n] = bandwidth_adjust(
            processed_control, bandwidth_min_y[n], bandwidth_1[n], level1
        )

        # Envelopes and BM motion of the reference and processed signals
        xenv, xbm, yenv, ybm = gammatone_basilar_membrane(
            reference_mid,
            reference_bandwidth[n],
            processed_mid,
            processed_bandwidth[n],
            freq_sample,
            _center_freq[n],
        )

        # RMS levels of the ref and output envelopes for linear metric
        reference_average[n] = np.sqrt(np.mean(xenv**2))
        processed_average[n] = np.sqrt(np.mean(yenv**2))
        reference_control_average[n] = np.sqrt(np.mean(reference_control**2))
        processed_control_average[n] = np.sqrt(np.mean(processed_control**2))

        # Cochlear compression for the signal envelopes and BM motion
        reference_cochlear_compression, reference_b[n] = env_compress_basilar_membrane(
            xenv,
            xbm,
            reference_control,
            attn_ohc_x[n],
            low_knee_x[n],
            compression_ratio_x[n],
            freq_sample,
            level1,
        )
        processed_cochlear_compression, processed_b[n] = env_compress_basilar_membrane(
            yenv,
            ybm,
            processed_control,
            attn_ohc_y[n],
            low_knee_y[n],
            compression_ratio_y[n],
            freq_sample,
            level1,
        )

        # Correct for the delay between the reference and output
        processed_cochlear_compression = envelope_align(
            reference_cochlear_compression, processed_cochlear_compression
        )  # Align processed envelope to reference
        processed_b[n] = envelope_align(
            reference_b[n], processed_b[n]
        )  # Align processed BM motion to reference

        # Convert the compressed envelopes and BM vibration envelopes to dB SPL
        reference_cochlear_compression, reference_b[n] = envelope_sl(
            reference_cochlear_compression, reference_b[n], attn_ihc_x[n], level1
        )
        processed_cochlear_compression, processed_b[n] = envelope_sl(
            processed_cochlear_compression, processed_b[n], attn_ihc_y[n], level1
        )

        # Apply the IHC rapid and short-term adaptation
        delta = 2  # Amount of overshoot
        reference_db[n], reference_b[n] = inner_hair_cell_adaptation(
            reference_cochlear_compression, reference_b[n], delta, freq_sample
        )
        processed_db[n], processed_b[n] = inner_hair_cell_adaptation(
            processed_cochlear_compression, processed_b[n], delta, freq_sample
        )

    # Additive noise level to give the auditory threshold
    ihc_threshold = -10  # Additive noise level, dB re: auditory threshold
    reference_basilar_membrane = basilar_membrane_add_noise(
        reference_b, ihc_threshold, level1
    )
    processed_basilar_membrane = basilar_membrane_add_noise(
        processed_b, ihc_threshold, level1
    )

    # Correct for the gammatone filterbank interchannel group delay.
    if m_delay > 0:
        reference_db = group_delay_compensate(
            reference_db, reference_bandwidth, _center_freq, freq_sample
        )
        processed_db = group_delay_compensate(
            processed_db, reference_bandwidth, _center_freq, freq_sample
        )
        reference_basilar_membrane = group_delay_compensate(
            reference_basilar_membrane, reference_bandwidth, _center_freq, freq_sample
        )
        processed_basilar_membrane = group_delay_compensate(
            processed_basilar_membrane, reference_bandwidth, _center_freq, freq_sample
        )

    # Convert average gammatone outputs to dB SPL
    reference_sl = convert_rms_to_sl(
        reference_average,
        reference_control_average,
        attn_ohc_x,
        low_knee_x,
        compression_ratio_x,
        attn_ihc_x,
        level1,
    )
    processed_sl = convert_rms_to_sl(
        processed_average,
        processed_control_average,
        attn_ohc_y,
        low_knee_y,
        compression_ratio_y,
        attn_ihc_y,
        level1,
    )

    return (
        reference_db,
        reference_basilar_membrane,
        processed_db,
        processed_basilar_membrane,
        reference_sl,
        processed_sl,
        freq_sample,
    )




[docs]
def center_frequency(
    nchan: int,
    shift: float | None = None,
    low_freq: int = 80,
    high_freq: int = 8000,
    ear_q: float = 9.26449,
    min_bw: float = 24.7,
) -> ndarray:
    """
    Compute the Equivalent Rectangular Bandwidth_[1] frequency spacing for the
    gammatone filter bank. The equation comes from Malcolm Slaney[2].

    Arguments:
        nchan (int): number of filters in the filter bank
        low_freq (int): Low Frequency level.
        high_freq (int): High Frequency level.
        shift (): optional frequency shift of the filter bank specified as a fractional
            shift in distance along the BM. A positive shift is an increase in frequency
            (basal shift), and negative is a decrease in frequency (apical shift). The
            total length of the BM is normalized to 1. The frequency-to-distance map is
            from D.D. Greenwood[3].
        ear_q (float):
        min_bw (float):

    Returns:


    References:
    .. [1] Moore BCJ, Glasberg BR (1983) Suggested formulae for calculating
           auditory-filter bandwidths and excitation patterns. J Acoustical
           Soc America 74:750-753. Available at
           <https://doi.org/10.1121/1.389861>
    .. [2] Slaney M (1993) An Efficient Implemtnation of the Patterson-
           Holdsworth Auditory Filter Bank. Available at:
           <https://asset-pdf.scinapse.io/prod/396690109/396690109.pdf>.
    .. [3] Greenwood DD (1990) A cochlear frequency-position function for
           several species--29 years later. J Acoust Soc Am 87(6):2592-
           2605. Available at
           <https://doi.o10.1121/1.399052>

    Updates:
    James M. Kates, 25 January 2007.
    Frequency shift added 22 August 2008.
    Lower and upper frequencies fixed at 80 and 8000 Hz, 19 June 2012.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    if shift is not None:
        k = 1
        A = 165.4  # pylint: disable=invalid-name
        a = 2.1  # shift specified as a fraction of the total length
        # Locations of the low and high frequencies on the BM between 0 and 1
        x_low = (1 / a) * np.log10(k + (low_freq / A))
        x_high = (1 / a) * np.log10(k + (high_freq / A))
        # Shift the locations
        x_low = x_low * (1 + shift)
        x_high = x_high * (1 + shift)
        # Compute the new frequency range
        low_freq = A * (10 ** (a * x_low) - k)
        high_freq = A * (10 ** (a * x_high) - k)

    # All of the following expressions are derived in Apple TR #35,
    # "An Efficient Implementation of the Patterson-Holdsworth Cochlear
    # Filter Bank" by Malcolm Slaney.
    # https://engineering.purdue.edu/~malcolm/apple/tr35/PattersonsEar.pdf
    _center_freq = -(ear_q * min_bw) + np.exp(
        np.arange(1, nchan)
        * (-np.log(high_freq + ear_q * min_bw) + np.log(low_freq + ear_q * min_bw))
        / (nchan - 1)
    ) * (high_freq + ear_q * min_bw)
    _center_freq = np.insert(
        _center_freq, 0, high_freq
    )  # Last center frequency is set to highFreq
    _center_freq = np.flip(_center_freq)
    return _center_freq




[docs]
def loss_parameters(
    hearing_loss: ndarray,
    center_freq: ndarray,
    audiometric_freq: ndarray | None = None,
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
    """
    Apportion the hearing loss to the outer hair cells (OHC) and the inner
    hair cells (IHC) and to increase the bandwidth of the cochlear filters
    in proportion to the OHC fraction of the total loss.

    Arguments:
        hearing_loss (np.ndarray): hearing loss at the 6 audiometric frequencies
        center_freq (np.ndarray): array containing the center frequencies of the
            gammatone filters arranged from low to high
        audiometric_freq (list):

    Returns:
        attenuated_ohc (): attenuation in dB for the OHC gammatone filters
        bandwidth (): OHC filter bandwidth expressed in terms of normal
        low_knee (): Lower kneepoint for the low-level linear amplification
        compression_ratio (): Ranges from 1.4:1 at 150 Hz to 3.5:1 at 8 kHz for normal
            hearing. Reduced in proportion to the OHC loss to 1:1.
        attenuated_ihc ():	attenuation in dB for the input to the IHC synapse

    Updates:
    James M. Kates, 25 January 2007.
    Version for loss in dB and match of OHC loss to CR, 9 March 2007.
    Low-frequency extent changed to 80 Hz, 27 Oct 2011.
    Lower kneepoint set to 30 dB, 19 June 2012.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Audiometric frequencies in Hz
    if audiometric_freq is None:
        audiometric_freq = np.array([250, 500, 1000, 2000, 4000, 6000])

    # Interpolation to give the loss at the gammatone center frequencies
    # Use linear interpolation in dB. The interpolation assumes that
    # cfreq[1] < aud[1] and cfreq[nfilt] > aud[6]
    nfilt = len(center_freq)
    f_v = np.insert(
        audiometric_freq, [0, len(audiometric_freq)], [center_freq[0], center_freq[-1]]
    )

    # Interpolated gain in dB
    loss = np.interp(
        center_freq,
        f_v,
        np.insert(
            hearing_loss, [0, len(hearing_loss)], [hearing_loss[0], hearing_loss[-1]]
        ),
    )
    loss = np.maximum(loss, 0)
    # Make sure there are no negative losses

    # Compression ratio changes linearly with ERB rate from 1.25:1 in the 80-Hz
    # frequency band to 3.5:1 in the 8-kHz frequency band
    compression_ratio = 1.25 + 2.25 * np.arange(nfilt) / (nfilt - 1)

    # Maximum OHC sensitivity loss depends on the compression ratio. The compression
    # I/O curves assume linear below 30 and above 100 dB SPL in normal ears.
    max_ohc = 70 * (
        1 - (1 / compression_ratio)
    )  # HC loss that results in 1:1 compression
    theoretical_ohc = 1.25 * max_ohc  # Loss threshold for adjusting the OHC parameters

    # Apportion the loss in dB to the outer and inner hair cells based on the data of
    # Moore et al (1999), JASA 106, 2761-2778.

    # Reduce the CR towards 1:1 in proportion to the OHC loss.
    attenuated_ohc = 0.8 * np.copy(loss)
    attenuated_ihc = 0.2 * np.copy(loss)

    attenuated_ohc[loss >= theoretical_ohc] = (
        0.8 * theoretical_ohc[loss >= theoretical_ohc]
    )
    attenuated_ihc[loss >= theoretical_ohc] = 0.2 * theoretical_ohc[
        loss >= theoretical_ohc
    ] + (loss[loss >= theoretical_ohc] - theoretical_ohc[loss >= theoretical_ohc])

    # Adjust the OHC bandwidth in proportion to the OHC loss
    bandwidth = np.ones(nfilt)
    bandwidth = bandwidth + (attenuated_ohc / 50.0) + 2.0 * (attenuated_ohc / 50.0) ** 6

    # Compute the compression lower kneepoint and compression ratio
    low_knee = attenuated_ohc + 30
    upamp = 30 + (70 / compression_ratio)  # Output level for an input of 100 dB SPL

    compression_ratio = (100 - low_knee) / (
        upamp + attenuated_ohc - low_knee
    )  # OHC loss Compression ratio

    return attenuated_ohc, bandwidth, low_knee, compression_ratio, attenuated_ihc




[docs]
def resample_24khz(
    reference_signal: ndarray, reference_freq: float, freq_sample_hz: float = 24000.0
) -> tuple[ndarray, float]:
    """
    Resample the input signal at 24 kHz. The input sampling rate is
    rounded to the nearest kHz to compute the sampling rate conversion
    ratio.

    Arguments:
    reference_signal (np.ndarray): input signal
    reference_freq (int): sampling rate for the input in Hz
    freq_sample_hz (int): Frequency sample in Hz

    Returns:
    reference_signal_24         signal resampled at kHz (default 24Khz)
    freq_sample_hz     output sampling rate in Hz

    Updates
    James M. Kates, 20 June 2011.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # Sampling rate information
    sample_rate_target_khz = np.round(
        freq_sample_hz / 1000
    )  # output rate to nearest kHz
    reference_freq_khz = np.round(reference_freq / 1000)

    # Resample the signal
    if reference_freq_khz == sample_rate_target_khz:
        # No resampling performed if the rates match
        return reference_signal, freq_sample_hz

    if reference_freq_khz < sample_rate_target_khz:
        # Resample for the input rate lower than the output
        resample_signal = resample_poly(
            reference_signal, sample_rate_target_khz, reference_freq_khz
        )

        # Match the RMS level of the resampled signal to that of the input
        reference_rms = np.sqrt(np.mean(reference_signal**2))
        resample_rms = np.sqrt(np.mean(resample_signal**2))
        resample_signal = (reference_rms / resample_rms) * resample_signal

        return resample_signal, freq_sample_hz

    # Resample for the input rate higher than the output
    resample_signal = resample_poly(
        reference_signal, sample_rate_target_khz, reference_freq_khz
    )

    # Reduce the input signal bandwidth to 21 kHz (-10.5 to +10.5 kHz)
    # The power equalization is designed to match the signal intensities
    # over the frequency range spanned by the gammatone filter bank.
    # Chebyshev Type 2 LP
    order = 7
    attenuation = 30  # sidelobe attenuation in dB
    reference_freq_cut = 21 / reference_freq_khz
    reference_b, reference_a = cheby2(order, attenuation, reference_freq_cut)
    reference_filter = lfilter(reference_b, reference_a, reference_signal, axis=0)

    # Reduce the resampled signal bandwidth to 21 kHz (-10.5 to +10.5 kHz)
    resample_rate_cut = 21 / sample_rate_target_khz
    target_b, target_a = cheby2(order, attenuation, resample_rate_cut)
    target_filter = lfilter(target_b, target_a, resample_signal, axis=0)

    # Compute the input and output RMS levels within the 21 kHz bandwidth and
    # match the output to the input
    reference_rms = np.sqrt(np.mean(reference_filter**2))
    resample_rms = np.sqrt(np.mean(target_filter**2))
    resample_signal = (reference_rms / resample_rms) * resample_signal

    return resample_signal, freq_sample_hz




[docs]
def input_align(reference: ndarray, processed: ndarray) -> tuple[ndarray, ndarray]:
    """
    Approximate temporal alignment of the reference and processed output
    signals. Leading and trailing zeros are then pruned.

    The function assumes that the two sequences have the same sampling rate:
    call eb_Resamp24kHz for each sequence first, then call this function to
    align the signals.

    Arguments:
    reference (np.ndarray): input reference sequence
    processed (np.ndarray): hearing-aid output sequence

    Returns:
    reference (np.ndarray): pruned and shifted reference
    processed (np.ndarray): pruned and shifted hearing-aid output

    Updates:
    James M. Kates, 12 July 2011.
    Match the length of the processed output to the reference for the
    purposes of computing the cross-covariance
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # Match the length of the processed output to the reference for the purposes
    # of computing the cross-covariance
    reference_n = len(reference)
    processed_n = len(processed)
    min_sample_length = min(reference_n, processed_n)

    # Determine the delay of the output relative to the reference
    reference_processed_correlation = correlate(
        reference[:min_sample_length] - np.mean(reference[:min_sample_length]),
        processed[:min_sample_length] - np.mean(processed[:min_sample_length]),
        "full",
    )  # Matlab code uses xcov thus the subtraction of mean
    index = np.argmax(np.abs(reference_processed_correlation))
    delay = min_sample_length - index - 1

    # Back up 2 msec to allow for dispersion
    fsamp = 24000.0  # Cochlear model input sampling rate in Hz
    delay = np.rint(delay - 2 * fsamp / 1000.0).astype(int)  # Back up 2 ms

    # Align the output with the reference allowing for the dispersion
    if delay > 0:
        # Output delayed relative to the reference
        processed = np.concatenate((processed[delay:processed_n], np.zeros(delay)))
    else:
        # Output advanced relative to the reference
        processed = np.concatenate((np.zeros(-delay), processed[: processed_n + delay]))

    # Find the start and end of the noiseless reference sequence
    reference_abs = np.abs(reference)
    reference_max = np.max(reference_abs)
    reference_threshold = 0.001 * reference_max  # Zero detection threshold

    above_threshold = np.where(reference_abs > reference_threshold)[0]
    reference_n_above_threshold = above_threshold[0]
    reference_n_below_threshold = above_threshold[-1]

    # Prune the sequences to remove the leading and trailing zeros
    reference_n_below_threshold = min(reference_n_below_threshold, processed_n)

    return (
        reference[reference_n_above_threshold : reference_n_below_threshold + 1],
        processed[reference_n_above_threshold : reference_n_below_threshold + 1],
    )




[docs]
def middle_ear(reference: ndarray, freq_sample: float) -> ndarray:
    """
    Design the middle ear filters and process the input through the
    cascade of filters. The middle ear model is a 2-pole HP filter
    at 350 Hz in series with a 1-pole LP filter at 5000 Hz. The
    result is a rough approximation to the equal-loudness contour
    at threshold.

    Arguments:
    reference (np.ndarray):	input signal
    freq_sample (float): sampling rate in Hz

    Returns:
    xout (): filtered output

    Updates:
    James M. Kates, 18 January 2007.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # Design the 1-pole Butterworth LP using the bilinear transformation
    butterworth_low_pass, low_pass = butter(1, 5000 / (0.5 * freq_sample))

    # LP filter the input
    y = lfilter(butterworth_low_pass, low_pass, reference)

    # Design the 2-pole Butterworth HP using the bilinear transformation
    butterworth_high_pass, high_pass = butter(2, 350 / (0.5 * freq_sample), "high")

    # HP filter the signal
    return lfilter(butterworth_high_pass, high_pass, y)




[docs]
def gammatone_basilar_membrane(
    reference: ndarray,
    reference_bandwidth: float,
    processed: ndarray,
    processed_bandwidth: float,
    freq_sample: float,
    center_freq: float,
    ear_q: float = 9.26449,
    min_bandwidth: float = 24.7,
) -> tuple[ndarray, ndarray, ndarray, ndarray]:
    """
    4th-order gammatone auditory filter. This implementation is based on the c program
    published on-line by Ning Ma, U. Sheffield, UK[1]_ that gives an implementation of
    the Martin Cooke filters[2]_: an impulse-invariant transformation of the gammatone
    filter. The signal is demodulated down to baseband using a complex exponential,
    and then passed through a cascade of four one-pole low-pass filters.

    This version filters two signals that have the same sampling rate and the same
    gammatone filter center frequencies. The lengths of the two signals should match;
    if they don't, the signals are truncated to the shorter of the two lengths.

    Arguments:
        reference (): first sequence to be filtered
        reference_bandwidth: bandwidth for x relative to that of a normal ear
        processed (): second sequence to be filtered
        processed_bandwidth (): bandwidth for x relative to that of a normal ear
        freq_sample (): sampling rate in Hz
        center_frequency (int): filter center frequency in Hz
        ear_q: (float): ???
        min_bandwidth (float): ???

    Returns:
        reference_envelope (): filter envelope output (modulated down to baseband)
            1st signal
        reference_basilar_membrane (): Basilar Membrane for the first signal
        processed_envelope (): filter envelope output (modulated down to baseband)
            2nd signal
        processed_basilar_membrane (): Basilar Membrane for the second signal

    References:
    .. [1] Ma N, Green P, Barker J, Coy A (2007) Exploiting correlogram
           structure for robust speech recognition with multiple speech
           sources. Speech Communication, 49 (12): 874-891. Available at
           <https://doi.org/10.1016/j.specom.2007.05.003>
           <https://staffwww.dcs.shef.ac.uk/people/N.Ma/resources/gammatone/>
    .. [2] Cooke, M. (1993) Modelling auditory processing and organisation.
           Cambridge University Press

    Updates:
    James M. Kates, 8 Jan 2007.
    Vectorized version for efficient MATLAB execution, 4 February 2007.
    Cosine and sine generation, 29 June 2011.
    Output sine and cosine sequences, 19 June 2012.
    Cosine/sine loop speed increased, 9 August 2013.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Filter Equivalent Rectangular Bandwidth from Moore and Glasberg (1983)
    # doi: 10.1121/1.389861
    erb = min_bandwidth + (center_freq / ear_q)

    # Check the lengths of the two signals and trim to shortest
    min_sample = min(len(reference), len(processed))
    x = reference[:min_sample]
    y = processed[:min_sample]

    # Filter the first signal
    # Initialize the filter coefficients
    tpt = 2 * np.pi / freq_sample
    tpt_bw = reference_bandwidth * tpt * erb * 1.019
    a = np.exp(-tpt_bw)
    a_1 = 4.0 * a
    a_2 = -6.0 * a * a
    a_3 = 4.0 * a * a * a
    a_4 = -a * a * a * a
    a_5 = 4.0 * a * a
    gain = 2.0 * (1 - a_1 - a_2 - a_3 - a_4) / (1 + a_1 + a_5)

    # Initialize the complex demodulation
    npts = len(x)
    sincf, coscf = gammatone_bandwidth_demodulation(
        npts, tpt, center_freq, np.zeros(npts), np.zeros(npts)
    )

    # Filter the real and imaginary parts of the signal
    ureal = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], x * coscf)
    uimag = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], x * sincf)
    assert isinstance(ureal, np.ndarray)  # lfilter can return different types
    assert isinstance(uimag, np.ndarray)

    # Extract the BM velocity and the envelope
    reference_basilar_membrane = gain * (ureal * coscf + uimag * sincf)
    reference_envelope = gain * np.sqrt(ureal * ureal + uimag * uimag)

    # Filter the second signal using the existing cosine and sine sequences
    tpt_bw = processed_bandwidth * tpt * erb * 1.019
    a = np.exp(-tpt_bw)
    a_1 = 4.0 * a
    a_2 = -6.0 * a * a
    a_3 = 4.0 * a * a * a
    a_4 = -a * a * a * a
    a_5 = 4.0 * a * a
    gain = 2.0 * (1 - a_1 - a_2 - a_3 - a_4) / (1 + a_1 + a_5)

    # Filter the real and imaginary parts of the signal
    ureal = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], y * coscf)
    uimag = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], y * sincf)
    assert isinstance(ureal, np.ndarray)
    assert isinstance(uimag, np.ndarray)

    # Extract the BM velocity and the envelope
    processed_basilar_membrane = gain * (ureal * coscf + uimag * sincf)
    processed_envelope = gain * np.sqrt(ureal * ureal + uimag * uimag)

    return (
        reference_envelope,
        reference_basilar_membrane,
        processed_envelope,
        processed_basilar_membrane,
    )




[docs]
@njit
def gammatone_bandwidth_demodulation(
    npts: int,
    tpt: float,
    center_freq: float,
    center_freq_cos: np.ndarray,
    center_freq_sin: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Create the carriers for demodulaton, using the 2d Rotation method from
      https://ccrma.stanford.edu/~jos/pasp/Digital_Sinusoid_Generators.html
    to generate the sin and cos components.  More efficient, perhaps, than
    calculating the sin and cos at each point in time.

    Arguments:
        npts (): How many points are needed.
        tpt (): Phase change (2pi/T) due to each sample time.
        center_freq (): The carrier frequency
        center_freq_cos (): Array to overwrite for the output.
        center_freq_sin (): Array to overwrite for the output.

    Returns:
        sincf (): Samples of the carrier frequency in sin phase.
        coscf (): Samples of the carrier frequency in cos phase.
    """
    cos_n = np.cos(tpt * center_freq)
    sin_n = np.sin(tpt * center_freq)
    cold = 1.0
    sold = 0.0
    center_freq_cos[0] = cold
    center_freq_sin[0] = sold
    for n in range(1, npts):
        arg = cold * cos_n + sold * sin_n
        sold = sold * cos_n - cold * sin_n
        cold = arg
        center_freq_cos[n] = cold
        center_freq_sin[n] = sold

    return center_freq_sin, center_freq_cos




[docs]
def bandwidth_adjust(
    control: ndarray,
    bandwidth_min: float,
    bandwidth_max: float,
    level1: float,
) -> float:
    """
    Compute the increase in auditory filter bandwidth in response to high signal
    levels.  The RMS of the control signal, a scalar, is used to set the
    bandwidth for the entire signal.

    Arguments:
        control (): envelope output in the control filter band
        bandwidth_min (): auditory filter bandwidth computed for the loss (or NH)
        bandwidth_max (): auditory filter bandwidth at maximum OHC damage
        level1 ():     RMS=1 corresponds to Level1 dB SPL

    Returns:
        bandwidth (): filter bandwidth increased for high signal levels

    Updates:
    James M. Kates, 21 June 2011.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # Compute the control signal level
    control_rms = np.sqrt(np.mean(control**2))
    control_db = 20 * np.log10(control_rms) + level1

    # Adjust the auditory filter bandwidth
    if control_db < 50:
        # No BW adjustment for a signal below 50 dB SPL
        return bandwidth_min
    if control_db > 100:
        # Maximum BW if signal is above 100 dB SPL
        return bandwidth_max
    return bandwidth_min + ((control_db - 50) / 50) * (bandwidth_max - bandwidth_min)




[docs]
def env_compress_basilar_membrane(
    envsig: ndarray,
    bm: ndarray,  # pylint: disable=invalid-name
    control: ndarray,
    attn_ohc: float,
    threshold_low: float,
    compression_ratio: float,
    fsamp: float,
    level1: float,
    small: float = 1e-30,
    threshold_high: int = 100,
) -> tuple[ndarray, ndarray]:
    """
    Compute the cochlear compression in one auditory filter band. The gain is linear
    below the lower threshold, compressive with a compression ratio of CR:1 between the
    lower and upper thresholds, and reverts to linear above the upper threshold. The
    compressor assumes that auditory threshold is 0 dB SPL.

    Arguments:
        envsig (): analytic signal envelope (magnitude) returned by the
                gammatone filter bank
        bm (): BM motion output by the filter bank
        control (): analytic control envelope returned by the wide control
                path filter bank
        attn_ohc (): OHC attenuation at the input to the compressor
        threshold_Low (): kneepoint for the low-level linear amplification
        compression_ratio (): compression ratio
        fsamp (): sampling rate in Hz
        level1 (): dB reference level: a signal having an RMS value of 1 is
                assigned to Level1 dB SPL.
        small (): ???
        threshold_high: kneepoint for the high-level linear amplification

    Returns:
        compressed_signal (): compressed version of the signal envelope
        compressed_basilar_membrane (): compressed version of the BM motion

    Updates:
    James M. Kates, 19 January 2007.
    LP filter added 15 Feb 2007 (Ref: Zhang et al., 2001)
    Version to compress the envelope, 20 Feb 2007.
    Change in the OHC I/O function, 9 March 2007.
    Two-tone suppression added 22 August 2008.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Convert the control envelope to dB SPL
    logenv = np.maximum(control, small)
    logenv = level1 + 20 * np.log10(logenv)
    logenv = np.minimum(
        logenv, threshold_high
    )  # Clip signal levels above the upper threshold
    logenv = np.maximum(logenv, threshold_low)  # Clip signal at the lower threshold

    # Compute the compression gain in dB
    gain = -attn_ohc - (logenv - threshold_low) * (1 - (1 / compression_ratio))

    # Convert the gain to linear and apply a LP filter to give a 0.2 ms delay
    gain = 10 ** (gain / 20)
    flp = 800
    b, a = butter(1, flp / (0.5 * fsamp))
    gain = lfilter(b, a, gain)

    # Apply the gain to the signals
    compressed_signal = gain * envsig
    compressed_basilar_membrane = gain * bm

    return compressed_signal, compressed_basilar_membrane




[docs]
def envelope_align(
    reference: ndarray, output: ndarray, freq_sample: int = 24000, corr_range: int = 100
) -> ndarray:
    """
    Align the envelope of the processed signal to that of the reference signal.

    Arguments:
        reference (): envelope or BM motion of the reference signal
        output (): envelope or BM motion of the output signal
        freq_sample (int): Frequency sample rate in Hz
        corr_range (int): range in msec for the correlation

    Returns:
        y (): shifted output envelope to match the input

    Updates:
    James M. Kates, 28 October 2011.
    Absolute value of the cross-correlation peak removed, 22 June 2012.
    Cross-correlation range reduced, 13 August 2013.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # The MATLAB code limits the range of lags to search (to 100 ms) to save computation
    # time - no such option exists in numpy, but the code below limits the delay to the
    # same range as in Matlab, for consistent results
    lags = np.rint(0.001 * corr_range * freq_sample).astype(int)  # Range in samples
    npts = len(reference)
    lags = min(lags, npts)

    ref_out_correlation = correlate(reference, output, "full")
    location = np.argmax(
        ref_out_correlation[npts - lags : npts + lags]
    )  # Limit the range in which
    delay = lags - location - 1

    # Time shift the output sequence
    if delay > 0:
        # Output delayed relative to the reference
        return np.concatenate((output[delay:npts], np.zeros(delay)))
    return np.concatenate((np.zeros(-delay), output[: npts + delay]))




[docs]
def envelope_sl(
    reference: ndarray,
    basilar_membrane: ndarray,
    attenuated_ihc: float,
    level1: float,
    small: float = 1e-30,
) -> tuple[ndarray, ndarray]:
    """
    Convert the compressed envelope returned by cochlear_envcomp to dB SL.

    Arguments:
        reference (): linear envelope after compression
        basilar_membrane (): linear Basilar Membrane vibration after compression
        attenuated_ihc (): IHC attenuation at the input to the synapse
        level1 (): level in dB SPL corresponding to 1 RMS
        small (float): ???

    Returns:
        _reference (): reference envelope in dB SL
        _basilar_membrane (): Basilar Membrane vibration with envelope converted to
            dB SL

    Updates:
    James M. Kates, 20 Feb 07.
    IHC attenuation added 9 March 2007.
    Basilar membrane vibration conversion added 2 October 2012.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Convert the envelope to dB SL
    _reference = level1 - attenuated_ihc + 20 * np.log10(reference + small)
    _reference = np.maximum(_reference, 0)

    # Convert the linear BM motion to have a dB SL envelope
    gain = (_reference + small) / (reference + small)
    _basilar_membrane = gain * basilar_membrane

    return _reference, _basilar_membrane




[docs]
@njit
def inner_hair_cell_adaptation(
    reference_db, reference_basilar_membrane, delta, freq_sample
):
    """
    Provide inner hair cell (IHC) adaptation. The adaptation is based on an
    equivalent RC circuit model, and the derivatives are mapped into
    1st-order backward differences. Rapid and short-term adaptation are
    provided. The input is the signal envelope in dB SL, with IHC attenuation
    already applied to the envelope. The outputs are the envelope in dB SL
    with adaptation providing overshoot of the long-term output level, and
    the BM motion is multiplied by a gain vs. time function that reproduces
    the adaptation. IHC attenuation and additive noise for the equivalent
    auditory threshold are provided by a subsequent call to eb_BMatten.

    Arguments:
        reference_db (np.ndarray): signal envelope in one frequency band in dB SL
             contains OHC compression and IHC attenuation
        reference_basilar_membrane (): basilar membrane vibration with OHC compression
            but no IHC attenuation
        delta (): overshoot factor = delta x steady-state
        freq_sample (int): sampling rate in Hz

    Returns:
        output_db (): envelope in dB SL with IHC adaptation
        output_basilar_membrane (): Basilar Membrane multiplied by the IHC adaptation
            gain function

    Updates:
    James M. Kates, 1 October 2012.
    Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Test the amount of overshoot
    dsmall = 1.0001
    delta = max(delta, dsmall)

    # Initialize adaptation time constants
    tau1 = 2  # Rapid adaptation in msec
    tau2 = 60  # Short-term adaptation in msec
    tau1 = 0.001 * tau1  # Convert to seconds
    tau2 = 0.001 * tau2

    # Equivalent circuit parameters
    freq_sample_inverse = 1 / freq_sample
    r_1 = 1 / delta
    r_2 = 0.5 * (1 - r_1)
    r_3 = r_2
    c_1 = tau1 * (r_1 + r_2) / (r_1 * r_2)
    c_2 = tau2 / ((r_1 + r_2) * r_3)

    # Intermediate values used for the voltage update matrix inversion
    a11 = r_1 + r_2 + r_1 * r_2 * (c_1 / freq_sample_inverse)
    a12 = -r_1
    a21 = -r_3
    a22 = r_2 + r_3 + r_2 * r_3 * (c_2 / freq_sample_inverse)
    denom = 1 / ((a11 * a22) - (a21 * a12))

    # Additional intermediate values
    r_1_inv = 1 / r_1
    product_r1_r2_c1 = r_1 * r_2 * (c_1 / freq_sample_inverse)
    product_r2_r3_c2 = r_2 * r_3 * (c_2 / freq_sample_inverse)

    # Initialize the outputs and state of the equivalent circuit
    nsamp = len(reference_db)
    gain = np.ones_like(
        reference_db
    )  # Gain vector to apply to the BM motion, default is 1
    output_db = np.zeros_like(reference_db)
    v_1 = 0
    v_2 = 0
    small = 1e-30

    # Loop to process the envelope signal
    # The gain asymptote is 1 for an input envelope of 0 dB SPL
    for n in range(nsamp):
        v_0 = reference_db[n]
        b_1 = v_0 * r_2 + product_r1_r2_c1 * v_1
        b_2 = product_r2_r3_c2 * v_2
        v_1 = denom * (a22 * b_1 - a12 * b_2)
        v_2 = denom * (-a21 * b_1 + a11 * b_2)
        out = (v_0 - v_1) * r_1_inv
        output_db[n] = out

    output_db = np.maximum(output_db, 0)
    gain = (output_db + small) / (reference_db + small)

    output_basilar_membrane = gain * reference_basilar_membrane

    return output_db, output_basilar_membrane




[docs]
def basilar_membrane_add_noise(
    reference: ndarray, threshold: int, level1: float
) -> ndarray:
    """
    Apply the IHC attenuation to the BM motion and to add a low-level Gaussian noise to
    give the auditory threshold.

    Arguments:
        reference (): BM motion to be attenuated
        threshold (): additive noise level in dB re:auditory threshold
        level1 (): an input having RMS=1 corresponds to Level1 dB SPL

    Returns:
        Attenuated signal with threshold noise added

    Updates:
        James M. Kates, 19 June 2012.
        Just additive noise, 2 Oct 2012.
        Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    gain = 10 ** ((threshold - level1) / 20)  # Linear gain for the noise

    # rng = np.random.default_rng()
    noise = gain * np.random.standard_normal(
        reference.shape
    )  # Gaussian RMS=1, then attenuated
    return reference + noise




[docs]
def group_delay_compensate(
    reference: ndarray,
    bandwidths: ndarray,
    center_freq: ndarray,
    freq_sample: float,
    ear_q: float = 9.26449,
    min_bandwidth: float = 24.7,
) -> ndarray:
    """
    Compensate for the group delay of the gammatone filter bank. The group
    delay is computed for each filter at its center frequency. The firing
    rate output of the IHC model is then adjusted so that all outputs have
    the same group delay.

    Arguments:
        xenv (np.ndarray): matrix of signal envelopes or BM motion
        bandwidths (): gammatone filter bandwidths adjusted for loss
        center_freq (): center frequencies of the bands
        freq_sample (): sampling rate for the input signal in Hz (e.g. 24,000 Hz)
        ear_q (float):
        min_bandwidth (float) :

    Returns:
        processed (): envelopes or BM motion compensated for the group delay.

    Updates:
        James M. Kates, 28 October 2011.
        Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """
    # Processing parameters
    nchan = len(bandwidths)

    # Filter ERB from Moore and Glasberg (1983)
    erb = min_bandwidth + (center_freq / ear_q)

    # Initialize the gammatone filter coefficients
    tpt = 2 * np.pi / freq_sample
    tpt_bandwidth = tpt * 1.019 * bandwidths * erb
    a = np.exp(-tpt_bandwidth)
    a_1 = 4.0 * a
    a_2 = -6.0 * a * a
    a_3 = 4.0 * a * a * a
    a_4 = -a * a * a * a
    a_5 = 4.0 * a * a

    # Compute the group delay in samples at fsamp for each filter
    _group_delay = np.zeros(nchan)
    for n in range(nchan):
        _, _group_delay[n] = group_delay(
            ([1, a_1[n], a_5[n]], [1, -a_1[n], -a_2[n], -a_3[n], -a_4[n]]), 1
        )
    _group_delay = np.rint(_group_delay).astype(int)  # convert to integer samples

    # Compute the delay correlation
    group_delay_min = np.min(_group_delay)
    _group_delay = (
        _group_delay - group_delay_min
    )  # Remove the minimum delay from all the over values
    group_delay_max = np.max(_group_delay)
    correct = (
        group_delay_max - _group_delay
    )  # Samples delay needed to add to give alignment

    # Add delay correction to each frequency band
    processed = np.zeros(reference.shape)
    for n in range(nchan):
        ref = reference[n]
        npts = len(ref)
        processed[n] = np.concatenate((np.zeros(correct[n]), ref[: npts - correct[n]]))

    return processed




[docs]
def convert_rms_to_sl(
    reference: ndarray,
    control: ndarray,
    attenuated_ohc: ndarray | float,
    threshold_low: ndarray | int,
    compression_ratio: ndarray | int,
    attenuated_ihc: ndarray | float,
    level1: float,
    threshold_high: int = 100,
    small: float = 1e-30,
) -> ndarray:
    """
    Covert the Root Mean Square average output of the gammatone filter bank
    into dB SL. The gain is linear below the lower threshold, compressive
    with a compression ratio of CR:1 between the lower and upper thresholds,
    and reverts to linear above the upper threshold. The compressor
    assumes that auditory threshold is 0 dB SPL.

    Arguments:
        reference (): analytic signal envelope (magnitude) returned by the
        gammatone filter bank, RMS average level
        control (): control signal envelope
        attenuated_ohc (): OHC attenuation at the input to the compressor
        threshold_low (): kneepoint for the low-level linear amplification
        compression_ratio (): compression ratio
        attenuated_ihc (): IHC attenuation at the input to the synapse
        level1 (): dB reference level: a signal having an RMS value of 1 is
                assigned to Level1 dB SPL.
        threshold_high (int):
        small (float):

    Returns:
        reference_db (): compressed output in dB above the impaired threshold

    Updates:
        James M. Kates, 6 August 2007.
        Version for two-tone suppression, 29 August 2008.
        Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
    """

    # Initialize the compression parameters
    threshold_high = 100  # Upper compression threshold

    # Convert the control to dB SPL
    small = 1e-30
    control_db_spl = np.maximum(control, small)
    control_db_spl = level1 + 20 * np.log10(control_db_spl)
    control_db_spl = np.minimum(control_db_spl, threshold_high)
    control_db_spl = np.maximum(control_db_spl, threshold_low)

    # Compute compression gain in dB
    gain = -attenuated_ohc - (control_db_spl - threshold_low) * (
        1 - (1 / compression_ratio)
    )

    # Convert the signal envelope to dB SPL
    control_db_spl = np.maximum(reference, small)
    control_db_spl = level1 + 20 * np.log10(control_db_spl)
    control_db_spl = np.maximum(control_db_spl, 0)
    reference_db = control_db_spl + gain - attenuated_ihc
    reference_db = np.maximum(reference_db, 0)

    return reference_db




[docs]
def env_smooth(envelopes: np.ndarray, segment_size: int, sample_rate: float) -> ndarray:
    """
    Function to smooth the envelope returned by the cochlear model. The
    envelope is divided into segments having a 50% overlap. Each segment is
    windowed, summed, and divided by the window sum to produce the average.
    A raised cosine window is used. The envelope sub-sampling frequency is
    2*(1000/segsize).

    Arguments:
        envelopes (np.ndarray): matrix of envelopes in each of the auditory bands
        segment_size: averaging segment size in msec
        freq_sample (int): input envelope sampling rate in Hz

    Returns:
        smooth: matrix of subsampled windowed averages in each band

    Updates:
        James M. Kates, 26 January 2007.
        Final half segment added 27 August 2012.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Compute the window
    # Segment size in samples
    n_samples = int(np.around(segment_size * (0.001 * sample_rate)))
    n_samples += n_samples % 2

    window = np.hanning(n_samples)  # Raised cosine von Hann window
    wsum = np.sum(window)  # Sum for normalization

    #  The first segment has a half window
    nhalf = int(n_samples / 2)
    halfwindow = window[nhalf:n_samples]
    halfsum = np.sum(halfwindow)

    # Number of segments and assign the matrix storage
    n_channels = np.size(envelopes, 0)
    npts = np.size(envelopes, 1)
    nseg = int(
        1 + np.floor(npts / n_samples) + np.floor((npts - n_samples / 2) / n_samples)
    )
    smooth = np.zeros((n_channels, nseg))

    #  Loop to compute the envelope in each frequency band
    for k in range(n_channels):
        # Extract the envelope in the frequency band
        r = envelopes[k, :]  # pylint: disable=invalid-name

        # The first (half) windowed segment
        nstart = 0
        smooth[k, 0] = np.sum(r[nstart:nhalf] * halfwindow.conj().transpose()) / halfsum

        # Loop over the remaining full segments, 50% overlap
        for n in range(1, nseg - 1):
            nstart = int(nstart + nhalf)
            nstop = int(nstart + n_samples)
            smooth[k, n] = sum(r[nstart:nstop] * window.conj().transpose()) / wsum

        # The last (half) windowed segment
        nstart = nstart + nhalf
        nstop = nstart + nhalf
        smooth[k, nseg - 1] = (
            np.sum(r[nstart:nstop] * window[:nhalf].conj().transpose()) / halfsum
        )

    return smooth




[docs]
def mel_cepstrum_correlation(
    reference: ndarray,
    distorted: ndarray,
    threshold: float,
    addnoise: float,
) -> tuple[float, ndarray]:
    """
    Compute the cross-correlations between the input signal time-frequency
    envelope and the distortion time-frequency envelope.

    For each time interval, the log spectrum is fitted with a set of
    half-cosine basis functions. The spectrum weighted by the basis
    functions corresponds to Mel Cepstral Coefficients computed in the
    frequency domain. The amplitude-normalized cross-covariance between
    the time-varying basis functions for the input and output signals is
    then computed.

    Arguments:
        reference (): subsampled input signal envelope in dB SL in each critical band
        distorted (): subsampled distorted output signal envelope
        threshold (): threshold in dB SPL to include segment in calculation
        addnoise (): additive Gaussian noise to ensure 0 cross-corr at low levels

    Returns:
        average_cepstral_correlation : average cepstral correlation 2-6, input vs output
        individual_cepstral_correlations : individual cepstral correlations,
            input vs output

    Updates:
        James M. Kates, 24 October 2006.
        Difference signal removed for cochlear model, 31 January 2007.
        Absolute value added 13 May 2011.
        Changed to loudness criterion for silence threshsold, 28 August 2012.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Processing parameters
    nbands = reference.shape[0]

    # Mel cepstrum basis functions (mel cepstrum because of auditory bands)
    nbasis = 6  # Number of cepstral coefficients to be used
    freq = np.arange(nbasis)
    k = np.arange(nbands)
    basis = np.cos(np.outer(k, freq) * np.pi / float(nbands - 1))
    mel_cepstral = basis / np.linalg.norm(basis, axis=0)

    # Find the segments that lie sufficiently above the quiescent rate
    reference_linear = 10 ** (
        reference / 20
    )  # Convert envelope dB to linear (specific loudness)
    xsum = np.sum(reference_linear, 0) / nbands  # Proportional to loudness in sones
    xsum = 20 * np.log10(xsum)  # Convert back to dB (loudness in phons)
    index = np.where(xsum > threshold)[0]  # Identify those segments above threshold
    nsamp = index.shape[0]  # Number of segments above threshold

    # Exit if not enough segments above zero
    average_cepstral_correlation = 0.0
    individual_cepstral_correlations = np.zeros(nbasis)
    if nsamp <= 1:
        logger.warning(
            "Function MelCepstrumCorrelation: Signal below threshold, outputs set to 0."
        )
        return average_cepstral_correlation, individual_cepstral_correlations

    # Remove the silent intervals
    ref = reference[:, index]
    proc = distorted[:, index]

    # Add the low-level noise to the envelopes
    ref = ref + addnoise * np.random.standard_normal(ref.shape)
    proc = proc + addnoise * np.random.standard_normal(proc.shape)

    # Compute the mel cepstrum coefficients using only those segments
    # above threshold

    reference_cep = np.dot(mel_cepstral.T, ref)
    processed_cep = np.dot(mel_cepstral.T, proc)

    # Remove the average value from the cepstral coefficients. The
    # cross-correlation thus becomes a cross-covariance, and there
    # is no effect of the absolute signal level in dB.
    reference_cep -= np.mean(reference_cep, axis=1, keepdims=True)
    processed_cep -= np.mean(processed_cep, axis=1, keepdims=True)

    # Normalized cross-correlations between the time-varying cepstral coeff
    # individual_cepstral_correlations = np.zeros(nbasis)  # Input vs output
    small = 1.0e-30
    xsum = np.sum(reference_cep**2, axis=1)
    ysum = np.sum(processed_cep**2, axis=1)
    mask = (xsum < small) | (ysum < small)
    individual_cepstral_correlations = np.zeros(nbasis)
    individual_cepstral_correlations[~mask] = np.abs(
        np.sum(reference_cep[~mask] * processed_cep[~mask], axis=1)
        / np.sqrt(xsum[~mask] * ysum[~mask])
    )

    # Figure of merit is the average of the cepstral correlations, ignoring
    # the first (average spectrum level).
    average_cepstral_correlation = np.sum(
        individual_cepstral_correlations[1:nbasis]
    ) / (nbasis - 1)
    return average_cepstral_correlation, individual_cepstral_correlations




[docs]
def melcor9(
    reference: ndarray,
    distorted: ndarray,
    threshold: float,
    add_noise: float,
    segment_size: int,
    n_cepstral_coef: int = 6,
) -> tuple[float, float, float, ndarray]:
    """
    Compute the cross-correlations between the input signal
    time-frequency envelope and the distortion time-frequency envelope. For
    each time interval, the log spectrum is fitted with a set of half-cosine
    basis functions. The spectrum weighted by the basis functions corresponds
    to mel cepstral coefficients computed in the frequency domain. The
    amplitude-normalized cross-covariance between the time-varying basis
    functions for the input and output signals is then computed for each of
    the 8 modulation frequencies.

    Arguments:
        reference (): subsampled input signal envelope in dB SL in each critical band
        distorted (): subsampled distorted output signal envelope
        threshold (): threshold in dB SPL to include segment in calculation
        add_noise (): additive Gaussian noise to ensure 0 cross-corr at low levels
        segment_size (): segment size in ms used for the envelope LP filter (8 msec)
        n_cepstral_coef (int): Number of cepstral coefficients

    Returns:
        mel_cepstral_average (): average of the modulation correlations across analysis
            frequency bands and modulation frequency bands, basis functions 2 -6
        mel_cepstral_low (): average over the four lower mod freq bands, 0 - 20 Hz
        mel_cepstral_high (): average over the four higher mod freq bands, 20 - 125 Hz
        mel_cepstral_modulation (): vector of cross-correlations by modulation
            frequency, averaged over analysis frequency band

    Updates:
        James M. Kates, 24 October 2006.
        Difference signal removed for cochlear model, 31 January 2007.
        Absolute value added 13 May 2011.
        Changed to loudness criterion for silence threshold, 28 August 2012.
        Version using envelope modulation filters, 15 July 2014.
        Modulation frequency vector output added 27 August 2014.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Processing parameters
    nbands = reference.shape[0]

    # Mel cepstrum basis functions (mel cepstrum because of auditory bands)
    freq = np.arange(n_cepstral_coef)
    k = np.arange(nbands)
    basis = np.cos(np.outer(k, freq) * np.pi / (nbands - 1))
    cepm = basis / np.linalg.norm(basis, axis=0, keepdims=True)

    # Find the segments that lie sufficiently above the quiescent rate
    # Convert envelope dB to linear (specific loudness)
    reference_linear = 10 ** (reference / 20)

    # Proportional to loudness in sones
    reference_sum = np.sum(reference_linear, 0) / nbands

    # Convert back to dB (loudness in phons)
    reference_sum = 20 * np.log10(reference_sum)

    # Identify those segments above threshold
    index = np.where(reference_sum > threshold)[0]

    segments_above_threshold = index.shape[0]  # Number of segments above threshold

    # Modulation filter bands, segment size is 8 msec
    edge = [4.0, 8.0, 12.5, 20.0, 32.0, 50.0, 80.0]  # 8 bands covering 0 to 125 Hz
    n_modulation_filter_bands = 1 + len(edge)  # Number of modulation filter bands

    # Exit if not enough segments above zero
    mel_cepstral_average = 0.0
    mel_cepstral_low = 0.0
    mel_cepstral_high = 0.0
    mel_cepstral_modulation = np.zeros(n_modulation_filter_bands)
    if segments_above_threshold <= 1:
        logger.warning("Function melcor9: Signal below threshold, outputs set to 0.")
        return (
            mel_cepstral_average,
            mel_cepstral_low,
            mel_cepstral_high,
            mel_cepstral_modulation,
        )

    # Remove the silent intervals
    _reference = reference[:, index]
    _distorted = distorted[:, index]

    # Add the low-level noise to the envelopes
    _reference += add_noise * np.random.standard_normal(_reference.shape)
    _distorted += add_noise * np.random.standard_normal(_distorted.shape)

    # Compute the mel cepstrum coefficients using only those segments
    # above threshold
    reference_cep = np.dot(cepm.T, _reference[:, :segments_above_threshold])
    distorted_cep = np.dot(cepm.T, _distorted[:, :segments_above_threshold])

    reference_cep -= np.mean(reference_cep, axis=1, keepdims=True)
    distorted_cep -= np.mean(distorted_cep, axis=1, keepdims=True)

    # Envelope sampling parameters
    sampling_freq = 1000.0 / (0.5 * segment_size)  # Envelope sampling frequency in Hz
    nyquist_freq = 0.5 * sampling_freq  # Envelope Nyquist frequency

    # Design the linear-phase envelope modulation filters
    n_fir = np.around(
        128 * (nyquist_freq / 125)
    )  # Adjust filter length to sampling rate
    n_fir = int(2 * np.floor(n_fir / 2))  # Force an even filter length
    b = np.zeros((n_modulation_filter_bands, n_fir + 1))

    # LP filter 0-4 Hz
    b[0, :] = firwin(
        n_fir + 1, edge[0] / nyquist_freq, window="hann", pass_zero="lowpass"
    )
    # HP 80-125 Hz
    b[n_modulation_filter_bands - 1, :] = firwin(
        n_fir + 1,
        edge[n_modulation_filter_bands - 2] / nyquist_freq,
        window="hann",
        pass_zero="highpass",
    )
    # Bandpass filter
    for m in range(1, n_modulation_filter_bands - 1):
        b[m, :] = firwin(
            n_fir + 1,
            [edge[m - 1] / nyquist_freq, edge[m] / nyquist_freq],
            window="hann",
            pass_zero="bandpass",
        )

    mel_cepstral_cross_covar = melcor9_crosscovmatrix(
        b,
        n_modulation_filter_bands,
        n_cepstral_coef,
        segments_above_threshold,
        n_fir,
        reference_cep,
        distorted_cep,
    )

    mel_cepstral_average = np.sum(mel_cepstral_cross_covar[:, 1:], axis=(0, 1))
    mel_cepstral_average /= n_modulation_filter_bands * (n_cepstral_coef - 1)

    mel_cepstral_low = np.sum(mel_cepstral_cross_covar[:4, 1:])
    mel_cepstral_low /= 4 * (n_cepstral_coef - 1)

    mel_cepstral_high = np.sum(mel_cepstral_cross_covar[4:8, 1:])
    mel_cepstral_high /= 4 * (n_cepstral_coef - 1)

    mel_cepstral_modulation = np.mean(mel_cepstral_cross_covar[:, 1:], axis=1)

    return (
        mel_cepstral_average,
        mel_cepstral_low,
        mel_cepstral_high,
        mel_cepstral_modulation,
    )




[docs]
def melcor9_crosscovmatrix(
    b: ndarray,
    nmod: int,
    nbasis: int,
    nsamp: int,
    nfir: int,
    reference_cep: ndarray,
    processed_cep: ndarray,
) -> ndarray:
    """Compute the cross-covariance matrix.

    Arguments:
        b (): ???
        nmod (): ???
        nbasis (): ???
        nsamp (): ???
        nfir (): ???
        xcep (): ???
        ycep (): ???

    Returns:
        cross_covariance_matrix ():
    """
    small = 1.0e-30
    nfir2 = nfir / 2
    # Convolve the input and output envelopes with the modulation filters
    reference = np.zeros((nmod, nbasis, nsamp))
    processed = np.zeros((nmod, nbasis, nsamp))
    for m in range(nmod):
        for j in range(nbasis):
            # Convolve and remove transients
            c = convolve(b[m], reference_cep[j, :], mode="full")
            reference[m, j, :] = c[int(nfir2) : int(nfir2 + nsamp)]
            c = convolve(b[m], processed_cep[j, :], mode="full")
            processed[m, j, :] = c[int(nfir2) : int(nfir2 + nsamp)]

    # Compute the cross-covariance matrix
    cross_covariance_matrix = np.zeros((nmod, nbasis))

    for m in range(nmod):
        # Input freq band j, modulation freq m
        x_j = reference[m]
        x_j -= np.mean(x_j, axis=1, keepdims=True)
        reference_sum = np.sum(x_j**2, axis=1)

        # Processed signal band
        y_j = processed[m]
        y_j -= np.mean(y_j, axis=1, keepdims=True)
        processed_sum = np.sum(y_j**2, axis=1)

        xy = np.sum(x_j * y_j, axis=1)
        mask = (reference_sum < small) | (processed_sum < small)
        cross_covariance_matrix[m, ~mask] = np.abs(xy[~mask]) / np.sqrt(
            reference_sum[~mask] * processed_sum[~mask]
        )

    return cross_covariance_matrix




[docs]
def spectrum_diff(
    reference_sl: ndarray, processed_sl: ndarray
) -> tuple[ndarray, ndarray, ndarray]:
    """
    Compute changes in the long-term spectrum and spectral slope.

    The metric is based on the spectral distortion metric of Moore and Tan[1]_
    (JAES, Vol 52, pp 900-914). The log envelopes in dB SL are converted to
    linear to approximate specific loudness. The outputs are the sum of the
    absolute differences, the standard deviation of the differences, and the
    maximum absolute difference. The same three outputs are provided for the
    normalized spectral difference and for the slope. The output is
    calibrated so that a processed signal having 0 amplitude produces a
    value of 1 for the spectrum difference.

    Abs diff: weight all deviations uniformly
    Std diff: weight larger deviations more than smaller deviations
    Max diff: only weight the largest deviation

    Arguments:
        reference_sl (np.ndarray): reference signal spectrum in dB SL
        processed_sl (np.ndarray): degraded signal spectrum in dB SL

    Returns:
        dloud (np.array) : [sum abs diff, std dev diff, max diff] spectra
        dnorm (np.array) : [sum abs diff, std dev diff, max diff] norm spectra
        dslope (np.array) : [sum abs diff, std dev diff, max diff] slope

    References:
    .. [1] Moore BCJ, Tan, CT (2004) Development and Validation of a Method
           for Predicting the Perceived Naturalness of Sounds Subjected to
           Spectral Distortion J Audio Eng Soc 52(9):900-914. Available at.
           <http://www.aes.org/e-lib/browse.cfm?elib=13018>.

    Updates:
        James M. Kates, 28 June 2012.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Convert the dB SL to linear magnitude values. Because of the auditory
    # filter bank, the OHC compression, and auditory threshold, the linear
    # values are closely related to specific loudness.
    nbands = reference_sl.shape[0]
    reference_linear_magnitude = 10 ** (reference_sl / 20)
    processed_linear_magnitude = 10 ** (processed_sl / 20)

    # Normalize the level of the reference and degraded signals to have the
    # same loudness. Thus overall level is ignored while differences in
    # spectral shape are measured.
    reference_sum = np.sum(reference_linear_magnitude)
    reference_linear_magnitude /= (
        reference_sum  # Loudness sum = 1 (arbitrary amplitude, proportional to sones)
    )
    processed_sum = np.sum(processed_linear_magnitude)
    processed_linear_magnitude /= processed_sum

    # Compute the spectrum difference
    dloud = np.zeros(3)
    diff_spectrum = (
        reference_linear_magnitude - processed_linear_magnitude
    )  # Difference in specific loudness in each band
    dloud[0] = np.sum(np.abs(diff_spectrum))
    dloud[1] = nbands * np.std(diff_spectrum)  # Biased std: second moment
    dloud[2] = np.max(np.abs(diff_spectrum))

    # Compute the normalized spectrum difference
    dnorm = np.zeros(3)
    diff_normalised_spectrum = (
        reference_linear_magnitude - processed_linear_magnitude
    ) / (
        reference_linear_magnitude + processed_linear_magnitude
    )  # Relative difference in specific loudness
    dnorm[0] = np.sum(np.abs(diff_normalised_spectrum))
    dnorm[1] = nbands * np.std(diff_normalised_spectrum)
    dnorm[2] = np.max(np.abs(diff_normalised_spectrum))

    # Compute the slope difference
    dslope = np.zeros(3)
    reference_slope = (
        reference_linear_magnitude[1:nbands]
        - reference_linear_magnitude[0 : nbands - 1]
    )
    processed_slope = (
        processed_linear_magnitude[1:nbands]
        - processed_linear_magnitude[0 : nbands - 1]
    )
    diff_slope = reference_slope - processed_slope  # Slope difference
    dslope[0] = np.sum(np.abs(diff_slope))
    dslope[1] = nbands * np.std(diff_slope)
    dslope[2] = np.max(np.abs(diff_slope))

    return dloud, dnorm, dslope




[docs]
def bm_covary(
    reference_basilar_membrane: ndarray,
    processed_basilar_membrane: ndarray,
    segment_size: int,
    sample_rate: float,
) -> tuple[ndarray, ndarray, ndarray]:
    """
    Compute the cross-covariance (normalized cross-correlation) between  the reference
    and processed signals in each auditory band. The signals are divided into segments
    having 50% overlap.

    Arguments:
        reference_basilar_membrane (): Basilar Membrane movement, reference signal
        processed_basilar_membrane (): Basilar Membrane movement, processed signal
        segment_size (): signal segment size, msec
        freq_sample (int): sampling rate in Hz

    Returns:
        signal_cross_covariance (np.array) : [nchan,nseg] of cross-covariance values
        reference_mean_square (np.array) : [nchan,nseg] of MS input signal energy values
        processed_mean_square (np.array) : [nchan,nseg] of MS processed signal energy
            values

    Updates:
        James M. Kates, 28 August 2012.
        Output amplitude adjustment added, 30 october 2012.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Initialize parameters
    small = 1.0e-30

    # Lag for computing the cross-covariance
    lagsize = 1.0  # Lag (+/-) in msec
    maxlag = np.around(lagsize * (0.001 * sample_rate))  # Lag in samples

    # Compute the segment size in samples
    nwin = int(np.around(segment_size * (0.001 * sample_rate)))

    nwin += nwin % 2 == 1  # Force window length to be even
    window = np.hanning(nwin).conj().transpose()  # Raised cosine von Hann window

    # compute inverted Window autocorrelation
    win_corr = correlate(window, window, "full")
    start_sample = int(len(window) - 1 - maxlag)
    end_sample = int(maxlag + len(window))
    if start_sample < 0:
        raise ValueError("segment size too small")
    win_corr = 1 / win_corr[start_sample:end_sample]
    win_sum2 = 1.0 / np.sum(window**2)  # Window power, inverted

    # The first segment has a half window
    nhalf = int(nwin / 2)
    half_window = window[nhalf:nwin]
    half_corr = correlate(half_window, half_window, "full")
    start_sample = int(len(half_window) - 1 - maxlag)
    end_sample = int(maxlag + len(half_window))
    if start_sample < 0:
        raise ValueError("segment size too small")
    half_corr = 1 / half_corr[start_sample:end_sample]
    halfsum2 = 1.0 / np.sum(half_window**2)  # MS sum normalization, first segment

    # Number of segments
    nchan = reference_basilar_membrane.shape[0]
    npts = reference_basilar_membrane.shape[1]
    nseg = int(1 + np.floor(npts / nwin) + np.floor((npts - nwin / 2) / nwin))

    reference_mean_square = np.zeros((nchan, nseg))
    processed_mean_square = np.zeros((nchan, nseg))
    signal_cross_covariance = np.zeros((nchan, nseg))

    # Loop to compute the signal mean-squared level in each band for each
    # segment and to compute the cross-corvariances.
    for k in range(nchan):
        # Extract the BM motion in the frequency band
        x = reference_basilar_membrane[k, :]
        y = processed_basilar_membrane[k, :]

        # The first (half) windowed segment
        nstart = 0
        reference_seg = x[nstart:nhalf] * half_window  # Window the reference
        processed_seg = y[nstart:nhalf] * half_window  # Window the processed signal
        reference_seg = reference_seg - np.mean(reference_seg)  # Make 0-mean
        processed_seg = processed_seg - np.mean(processed_seg)

        # Normalize signal MS value by the window
        ref_mean_square = np.sum(reference_seg**2) * halfsum2

        proc_mean_squared = np.sum(processed_seg**2) * halfsum2
        correlation = correlate(reference_seg, processed_seg, "full")
        correlation = correlation[
            int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
        ]

        unbiased_cross_correlation = np.max(np.abs(correlation * half_corr))
        if (ref_mean_square > small) and (proc_mean_squared > small):
            # Normalize cross-covariance
            signal_cross_covariance[k, 0] = unbiased_cross_correlation / np.sqrt(
                ref_mean_square * proc_mean_squared
            )
        else:
            signal_cross_covariance[k, 0] = 0.0

        # Save the reference MS level
        reference_mean_square[k, 0] = ref_mean_square
        processed_mean_square[k, 0] = proc_mean_squared

        # Loop over the remaining full segments, 50% overlap
        for n in range(1, nseg - 1):
            nstart = nstart + nhalf
            nstop = nstart + nwin
            reference_seg = x[nstart:nstop] * window  # Window the reference
            processed_seg = y[nstart:nstop] * window  # Window the processed signal
            reference_seg = reference_seg - np.mean(reference_seg)  # Make 0-mean
            processed_seg = processed_seg - np.mean(processed_seg)

            # Normalize signal MS value by the window
            ref_mean_square = np.sum(reference_seg**2) * win_sum2
            proc_mean_squared = np.sum(processed_seg**2) * win_sum2
            correlation = correlate(reference_seg, processed_seg, "full")
            correlation = correlation[
                int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
            ]

            unbiased_cross_correlation = np.max(np.abs(correlation * win_corr))
            if (ref_mean_square > small) and (proc_mean_squared > small):
                # Normalize cross-covariance
                signal_cross_covariance[k, n] = unbiased_cross_correlation / np.sqrt(
                    ref_mean_square * proc_mean_squared
                )
            else:
                signal_cross_covariance[k, n] = 0.0

            reference_mean_square[k, n] = ref_mean_square
            processed_mean_square[k, n] = proc_mean_squared

        # The last (half) windowed segment
        nstart = nstart + nhalf
        nstop = nstart + nhalf
        reference_seg = x[nstart:nstop] * window[0:nhalf]  # Window the reference
        processed_seg = y[nstart:nstop] * window[0:nhalf]  # Window the processed signal
        reference_seg = reference_seg - np.mean(reference_seg)  # Make 0-mean
        processed_seg = processed_seg - np.mean(processed_seg)
        # Normalize signal MS value by the window
        ref_mean_square = np.sum(reference_seg**2) * halfsum2
        proc_mean_squared = np.sum(processed_seg**2) * halfsum2

        correlation = correlate(reference_seg, processed_seg, "full")
        correlation = correlation[
            int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
        ]

        unbiased_cross_correlation = np.max(np.abs(correlation * half_corr))
        if (ref_mean_square > small) and (proc_mean_squared > small):
            # Normalized cross-covariance
            signal_cross_covariance[k, nseg - 1] = unbiased_cross_correlation / np.sqrt(
                ref_mean_square * proc_mean_squared
            )
        else:
            signal_cross_covariance[k, nseg - 1] = 0.0

        # Save the reference and processed MS level
        reference_mean_square[k, nseg - 1] = ref_mean_square
        processed_mean_square[k, nseg - 1] = proc_mean_squared

    # Limit the cross-covariance to lie between 0 and 1
    signal_cross_covariance = np.clip(signal_cross_covariance, 0, 1)

    # Adjust the BM magnitude to correspond to the envelope in dB SL
    reference_mean_square *= 2.0
    processed_mean_square *= 2.0

    return signal_cross_covariance, reference_mean_square, processed_mean_square




[docs]
def ave_covary2(
    signal_cross_covariance: np.ndarray,
    reference_signal_mean_square: np.ndarray,
    threshold_db: float,
    lp_filter_order: ndarray | None = None,
    freq_cutoff: ndarray | None = None,
) -> tuple[float, ndarray]:
    """
    Compute the average cross-covariance between the reference and processed
    signals in each auditory band.

    The silent time-frequency tiles are removed from consideration. The
    cross-covariance is computed for each segment in each frequency band. The
    values are weighted by 1 for inclusion or 0 if the tile is below
    threshold. The sum of the covariance values across time and frequency are
    then divided by the total number of tiles above threshold. The calculation
    is a modification of Tan et al.[1]_ . The cross-covariance is also output
    with a frequency weighting that reflects the loss of IHC synchronization at high
    frequencies Johnson[2]_.

    Arguments:
        signal_cross_covariance (np.array): [nchan,nseg] of cross-covariance values
        reference_signal_mean_square (np.array): [nchan,nseg] of reference signal MS
            values
        threshold_db (): threshold in dB SL to include segment ave over freq in
            average
        lp_filter (list): LP filter order
        freq_cutoff (list): Cutoff frequencies in Hz

    Returns:
        average_covariance (): cross-covariance in segments averaged over time and
            frequency
        ihc_sync_covariance (): cross-covariance array, 6 different weightings for loss
            of IHC synchronization at high frequencies:
              LP Filter Order     Cutoff Freq, kHz
                1              1.5
                3              2.0
                5              2.5, 3.0, 3.5, 4.0

    References:

    .. [1] Tan CT, Moore, BCJ, Zacharov N, Mattila VV (2004) Predicting the Perceived
           Quality of Nonlinearly Distorted Music and Speech Signals. J Audio Eng Soc
           52(9):900-914. Available at.
           <http://www.aes.org/e-lib/browse.cfm?elib=13013>.

    .. [2] Johnson DH (1980) The relationship between spike rate and synchrony in
           responses of auditory‐nerve fibers to single tones J Acoustical Soc of Am
           68:1115 Available at.
           <https://doi.org/10.1121/1.384982>

    Updates:
        James M. Kates, 28 August 2012.
        Adjusted for BM vibration in dB SL, 30 October 2012.
        Threshold for including time-freq tile modified, 30 January 2013.
        Version for different sync loss, 15 February 2013.
        Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
    """

    # Array dimensions
    n_channels = signal_cross_covariance.shape[0]

    # Initialize the LP filter for loss of IHC synchronization
    # Center frequencies in Hz on an ERB scale
    _center_freq = center_frequency(n_channels)
    # Default LP filter order
    if lp_filter_order is None:
        lp_filter_order = np.array([1, 3, 5, 5, 5, 5])
    # Default cutoff frequencies in Hz
    if freq_cutoff is None:
        freq_cutoff = 1000 * np.array([1.5, 2.0, 2.5, 3.0, 3.5, 4.0])

    fc2p = (
        np.atleast_2d(freq_cutoff ** (2 * lp_filter_order)).repeat(n_channels, axis=0).T
    )
    freq2p = _center_freq ** (
        2 * np.atleast_2d(lp_filter_order).repeat(n_channels, axis=0).T
    )
    fsync = np.sqrt(fc2p / (fc2p + freq2p))

    # Find the segments that lie sufficiently above the threshold.
    # Convert squared amplitude to dB envelope
    signal_rms = np.sqrt(reference_signal_mean_square)
    # Linear amplitude (specific loudness)
    signal_linear_amplitude = 10 ** (signal_rms / 20)
    # Intensity averaged over frequency bands
    reference_mean = np.sum(signal_linear_amplitude, 0) / n_channels
    # Convert back to dB (loudness in phons)
    reference_mean = 20 * np.log10(reference_mean)
    # Identify those segments above threshold
    index = np.argwhere(reference_mean > threshold_db).T
    if index.size != 1:
        index = index.squeeze()
    nseg = index.shape[0]  # Number of segments above threshold

    # Exit if not enough segments above zero
    if nseg <= 1:
        logger.warning(
            "Function AveCovary2: Ave signal below threshold, outputs set to 0."
        )
        average_covariance = 0
        # syncov = 0
        ihc_sync_covariance = np.zeros(6)
        return average_covariance, ihc_sync_covariance

    # Remove the silent segments
    signal_cross_covariance = signal_cross_covariance[:, index]
    signal_rms = signal_rms[:, index]

    # Compute the time-frequency weights. The weight=1 if a segment in a
    # frequency band is above threshold, and weight=0 if below threshold.
    weight = np.zeros((n_channels, nseg))  # No IHC synchronization roll-off
    weight[signal_rms > threshold_db] = 1

    # The wsync tensor should be constructed as follows:
    #
    # wsync = np.zeros((6, n_channels, nseg))
    # for k in range(n_channels):
    #    for n in range(nseg):
    #        # Thresh in dB SL for including time-freq tile
    #        if signal_rms[k, n] > threshold_db:
    #            wsync[:, k, n] = fsync[:, k]
    #
    # This can be written is an efficient vectorsized form as follows:
    wsync = np.zeros((6, n_channels, nseg))
    mask = signal_rms > threshold_db
    fsync3d = np.repeat(fsync[..., None], nseg, axis=2)
    wsync[:, mask] = fsync3d[:, mask]

    # Sum the weighted covariance values
    # Sum of weighted time-freq tiles
    csum = np.sum(np.sum(weight * signal_cross_covariance))

    wsum = np.sum(np.sum(weight))  # Total number of tiles above threshold

    tiles_above_threshold = np.zeros(6)

    # Sum of weighted time-freq tiles
    sum_weighted_time_freq = np.sum(wsync * signal_cross_covariance, axis=(1, 2))

    tiles_above_threshold = np.sum(wsync, axis=(1, 2))

    # Exit if not enough segments above zero
    if wsum < 1:
        average_covariance = 0
        logger.warning(
            "Function AveCovary2: Signal tiles below threshold, outputs set to 0."
        )
    else:
        average_covariance = csum / wsum
    ihc_sync_covariance = sum_weighted_time_freq / tiles_above_threshold

    return average_covariance, ihc_sync_covariance
Source code for clarity.evaluator.haspi.eb

Project name not set

Navigation

Related Topics