"""Module for HASPI, HASQI, HAAQI EBs"""
from __future__ import annotations
# pylint: disable=import-error
import logging
from typing import TYPE_CHECKING
import numpy as np
from numba import njit # type: ignore # <-- silence mypy no attribute error
from scipy.signal import (
butter,
cheby2,
convolve,
correlate,
firwin,
group_delay,
lfilter,
resample_poly,
)
from clarity.enhancer.nalr import NALR
from clarity.utils.audiogram import Audiogram
if TYPE_CHECKING:
from numpy import ndarray
logger = logging.getLogger(__name__)
[docs]
def ear_model(
reference: ndarray,
reference_freq: float,
processed: ndarray,
processed_freq: float,
hearing_loss: ndarray,
itype: int,
level1: float,
nchan: int = 32,
m_delay: int = 1,
shift: float | None = None,
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, ndarray, float]:
"""
Function that implements a cochlear model that includes the middle ear,
auditory filter bank, Outer Hair Cell (OHC) dynamic-range compression,
and Inner Hair Cell (IHC) attenuation.
The inputs are the reference and processed signals that are to be
compared. The reference is at the reference intensity (e.g. 65 dB SPL
or with NAL-R amplification) and has no other processing. The processed
signal is the hearing-aid output, and is assumed to have the same or
greater group delay compared to the reference.
The function outputs the envelopes of the signals after OHC compression
and IHC loss attenuation.
Arguments:
reference (np.ndarray): reference signal: should be adjusted to 65 dB SPL
(itype=0 or 1) or to 65 dB SPL plus NAL-R gain (itype=2)
reference_freq (int): sampling rate for the reference signal, Hz
processed (np.ndarray): processed signal (e.g. hearing-aid output) includes
HA gain
processed_freq (int): sampling rate for the processed signal, Hz
hearing_loss (np.ndarray): audiogram giving the hearing loss in dB at 6
audiometric frequencies: [250, 500, 1000, 2000, 4000, 6000] Hz
itype (int): purpose for the calculation:
0=intelligibility: reference is normal hearing and must not
include NAL-R EQ
1=quality: reference does not include NAL-R EQ
2=quality: reference already has NAL-R EQ applied
level1: level calibration: signal RMS=1 corresponds to Level1 dB SPL
nchan (int): auditory frequency bands
m_delay (int): Compensate for the gammatone group delay.
shift (float): Basal shift of the basilar membrane length
Returns:
reference_db (): envelope for the reference in each band
reference_basilar_membrane (): BM motion for the reference in each band
processed_db (): envelope for the processed signal in each band
processed_basilar_membrane (): BM motion for the processed signal in each band
reference_sl (): compressed RMS average reference in each band converted
to dB SL
processed_sl (): compressed RMS average output in each band converted to dB SL
freq_sample (): sampling rate in Hz for the model outputs
Updates:
James M. Kates, 27 October 2011.
Basilar Membrane added 30 Dec 2011.
Revised 19 June 2012.
Remove match of reference RMS level to processed 29 August 2012.
IHC adaptation added 1 October 2012.
Basilar Membrane envelope converted to dB SL, 2 Oct 2012.
Filterbank group delay corrected, 14 Dec 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
Updated by Gerardo Roa Dabike, September 2022.
"""
# Processing parameters
# OHC and IHC parameters for the hearing loss
# Auditory filter center frequencies span 80 to 8000 Hz.
_center_freq = center_frequency(nchan) # Center frequencies on an ERB scale
# Cochlear model parameters for the processed signal
(
attn_ohc_y,
bandwidth_min_y,
low_knee_y,
compression_ratio_y,
attn_ihc_y,
) = loss_parameters(hearing_loss, _center_freq)
# The cochlear model parameters for the reference are the same as for the hearing
# loss if calculating quality, but are for normal hearing if calculating
# intelligibility.
if itype == 0:
hearing_loss_x = np.zeros(len(hearing_loss))
else:
hearing_loss_x = hearing_loss
[
attn_ohc_x,
bandwidth_min_x,
low_knee_x,
compression_ratio_x,
attn_ihc_x,
] = loss_parameters(hearing_loss_x, _center_freq)
# Compute center frequencies for the control
_center_freq_control = center_frequency(nchan, shift)
# Maximum BW for the control
_, bandwidth_1, _, _, _ = loss_parameters(np.full(6, 100), _center_freq_control)
# Input signal adjustments
# Convert the signals to 24 kHz sampling rate.
# Using 24 kHz guarantees that all of the cochlear filters have the same shape
# independent of the incoming signal sampling rates
reference_24hz, _ = resample_24khz(reference, reference_freq)
processed_24hz, freq_sample = resample_24khz(processed, processed_freq)
# Check file sizes
min_signal_length = min(len(reference_24hz), len(processed_24hz))
reference_24hz = reference_24hz[:min_signal_length]
processed_24hz = processed_24hz[:min_signal_length]
# Bulk broadband signal alignment
reference_24hz, processed_24hz = input_align(reference_24hz, processed_24hz)
nsamp = len(reference_24hz)
# For HASQI, here add NAL-R equalization if the quality reference doesn't
# already have it.
if itype == 1:
nfir = 140 # Length in samples of the FIR NAL-R EQ filter (24-kHz rate)
enhancer = NALR(nfir, freq_sample)
audiogram = Audiogram(
levels=hearing_loss,
frequencies=np.array([250, 500, 1000, 2000, 4000, 6000]),
)
nalr_fir, _ = enhancer.build(audiogram)
reference_24hz = enhancer.apply(nalr_fir, reference_24hz)
reference_24hz = reference_24hz[nfir : nfir + nsamp]
# Cochlear model
# Middle ear
reference_mid = middle_ear(reference_24hz, freq_sample)
processed_mid = middle_ear(processed_24hz, freq_sample)
# Initialize storage
# Reference and processed envelopes and BM motion
reference_db = np.zeros((nchan, nsamp))
processed_db = np.zeros((nchan, nsamp))
# Reference and processed average spectral values
reference_average = np.zeros(nchan)
processed_average = np.zeros(nchan)
reference_control_average = np.zeros(nchan)
processed_control_average = np.zeros(nchan)
# Filter bandwidths adjusted for intensity
reference_bandwidth = np.zeros(nchan)
processed_bandwidth = np.zeros(nchan)
reference_b = np.zeros((nchan, nsamp))
processed_b = np.zeros((nchan, nsamp))
# Loop over each filter in the auditory filter bank
for n in range(nchan):
# Control signal envelopes for the reference and processed signals
reference_control, _, processed_control, _ = gammatone_basilar_membrane(
reference_mid,
bandwidth_1[n],
processed_mid,
bandwidth_1[n],
freq_sample,
_center_freq_control[n],
)
# Adjust the auditory filter bandwidths for the average signal level
reference_bandwidth[n] = bandwidth_adjust(
reference_control, bandwidth_min_x[n], bandwidth_1[n], level1
)
processed_bandwidth[n] = bandwidth_adjust(
processed_control, bandwidth_min_y[n], bandwidth_1[n], level1
)
# Envelopes and BM motion of the reference and processed signals
xenv, xbm, yenv, ybm = gammatone_basilar_membrane(
reference_mid,
reference_bandwidth[n],
processed_mid,
processed_bandwidth[n],
freq_sample,
_center_freq[n],
)
# RMS levels of the ref and output envelopes for linear metric
reference_average[n] = np.sqrt(np.mean(xenv**2))
processed_average[n] = np.sqrt(np.mean(yenv**2))
reference_control_average[n] = np.sqrt(np.mean(reference_control**2))
processed_control_average[n] = np.sqrt(np.mean(processed_control**2))
# Cochlear compression for the signal envelopes and BM motion
reference_cochlear_compression, reference_b[n] = env_compress_basilar_membrane(
xenv,
xbm,
reference_control,
attn_ohc_x[n],
low_knee_x[n],
compression_ratio_x[n],
freq_sample,
level1,
)
processed_cochlear_compression, processed_b[n] = env_compress_basilar_membrane(
yenv,
ybm,
processed_control,
attn_ohc_y[n],
low_knee_y[n],
compression_ratio_y[n],
freq_sample,
level1,
)
# Correct for the delay between the reference and output
processed_cochlear_compression = envelope_align(
reference_cochlear_compression, processed_cochlear_compression
) # Align processed envelope to reference
processed_b[n] = envelope_align(
reference_b[n], processed_b[n]
) # Align processed BM motion to reference
# Convert the compressed envelopes and BM vibration envelopes to dB SPL
reference_cochlear_compression, reference_b[n] = envelope_sl(
reference_cochlear_compression, reference_b[n], attn_ihc_x[n], level1
)
processed_cochlear_compression, processed_b[n] = envelope_sl(
processed_cochlear_compression, processed_b[n], attn_ihc_y[n], level1
)
# Apply the IHC rapid and short-term adaptation
delta = 2 # Amount of overshoot
reference_db[n], reference_b[n] = inner_hair_cell_adaptation(
reference_cochlear_compression, reference_b[n], delta, freq_sample
)
processed_db[n], processed_b[n] = inner_hair_cell_adaptation(
processed_cochlear_compression, processed_b[n], delta, freq_sample
)
# Additive noise level to give the auditory threshold
ihc_threshold = -10 # Additive noise level, dB re: auditory threshold
reference_basilar_membrane = basilar_membrane_add_noise(
reference_b, ihc_threshold, level1
)
processed_basilar_membrane = basilar_membrane_add_noise(
processed_b, ihc_threshold, level1
)
# Correct for the gammatone filterbank interchannel group delay.
if m_delay > 0:
reference_db = group_delay_compensate(
reference_db, reference_bandwidth, _center_freq, freq_sample
)
processed_db = group_delay_compensate(
processed_db, reference_bandwidth, _center_freq, freq_sample
)
reference_basilar_membrane = group_delay_compensate(
reference_basilar_membrane, reference_bandwidth, _center_freq, freq_sample
)
processed_basilar_membrane = group_delay_compensate(
processed_basilar_membrane, reference_bandwidth, _center_freq, freq_sample
)
# Convert average gammatone outputs to dB SPL
reference_sl = convert_rms_to_sl(
reference_average,
reference_control_average,
attn_ohc_x,
low_knee_x,
compression_ratio_x,
attn_ihc_x,
level1,
)
processed_sl = convert_rms_to_sl(
processed_average,
processed_control_average,
attn_ohc_y,
low_knee_y,
compression_ratio_y,
attn_ihc_y,
level1,
)
return (
reference_db,
reference_basilar_membrane,
processed_db,
processed_basilar_membrane,
reference_sl,
processed_sl,
freq_sample,
)
[docs]
def center_frequency(
nchan: int,
shift: float | None = None,
low_freq: int = 80,
high_freq: int = 8000,
ear_q: float = 9.26449,
min_bw: float = 24.7,
) -> ndarray:
"""
Compute the Equivalent Rectangular Bandwidth_[1] frequency spacing for the
gammatone filter bank. The equation comes from Malcolm Slaney[2].
Arguments:
nchan (int): number of filters in the filter bank
low_freq (int): Low Frequency level.
high_freq (int): High Frequency level.
shift (): optional frequency shift of the filter bank specified as a fractional
shift in distance along the BM. A positive shift is an increase in frequency
(basal shift), and negative is a decrease in frequency (apical shift). The
total length of the BM is normalized to 1. The frequency-to-distance map is
from D.D. Greenwood[3].
ear_q (float):
min_bw (float):
Returns:
References:
.. [1] Moore BCJ, Glasberg BR (1983) Suggested formulae for calculating
auditory-filter bandwidths and excitation patterns. J Acoustical
Soc America 74:750-753. Available at
<https://doi.org/10.1121/1.389861>
.. [2] Slaney M (1993) An Efficient Implemtnation of the Patterson-
Holdsworth Auditory Filter Bank. Available at:
<https://asset-pdf.scinapse.io/prod/396690109/396690109.pdf>.
.. [3] Greenwood DD (1990) A cochlear frequency-position function for
several species--29 years later. J Acoust Soc Am 87(6):2592-
2605. Available at
<https://doi.o10.1121/1.399052>
Updates:
James M. Kates, 25 January 2007.
Frequency shift added 22 August 2008.
Lower and upper frequencies fixed at 80 and 8000 Hz, 19 June 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
if shift is not None:
k = 1
A = 165.4 # pylint: disable=invalid-name
a = 2.1 # shift specified as a fraction of the total length
# Locations of the low and high frequencies on the BM between 0 and 1
x_low = (1 / a) * np.log10(k + (low_freq / A))
x_high = (1 / a) * np.log10(k + (high_freq / A))
# Shift the locations
x_low = x_low * (1 + shift)
x_high = x_high * (1 + shift)
# Compute the new frequency range
low_freq = A * (10 ** (a * x_low) - k)
high_freq = A * (10 ** (a * x_high) - k)
# All of the following expressions are derived in Apple TR #35,
# "An Efficient Implementation of the Patterson-Holdsworth Cochlear
# Filter Bank" by Malcolm Slaney.
# https://engineering.purdue.edu/~malcolm/apple/tr35/PattersonsEar.pdf
_center_freq = -(ear_q * min_bw) + np.exp(
np.arange(1, nchan)
* (-np.log(high_freq + ear_q * min_bw) + np.log(low_freq + ear_q * min_bw))
/ (nchan - 1)
) * (high_freq + ear_q * min_bw)
_center_freq = np.insert(
_center_freq, 0, high_freq
) # Last center frequency is set to highFreq
_center_freq = np.flip(_center_freq)
return _center_freq
[docs]
def loss_parameters(
hearing_loss: ndarray,
center_freq: ndarray,
audiometric_freq: ndarray | None = None,
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
"""
Apportion the hearing loss to the outer hair cells (OHC) and the inner
hair cells (IHC) and to increase the bandwidth of the cochlear filters
in proportion to the OHC fraction of the total loss.
Arguments:
hearing_loss (np.ndarray): hearing loss at the 6 audiometric frequencies
center_freq (np.ndarray): array containing the center frequencies of the
gammatone filters arranged from low to high
audiometric_freq (list):
Returns:
attenuated_ohc (): attenuation in dB for the OHC gammatone filters
bandwidth (): OHC filter bandwidth expressed in terms of normal
low_knee (): Lower kneepoint for the low-level linear amplification
compression_ratio (): Ranges from 1.4:1 at 150 Hz to 3.5:1 at 8 kHz for normal
hearing. Reduced in proportion to the OHC loss to 1:1.
attenuated_ihc (): attenuation in dB for the input to the IHC synapse
Updates:
James M. Kates, 25 January 2007.
Version for loss in dB and match of OHC loss to CR, 9 March 2007.
Low-frequency extent changed to 80 Hz, 27 Oct 2011.
Lower kneepoint set to 30 dB, 19 June 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Audiometric frequencies in Hz
if audiometric_freq is None:
audiometric_freq = np.array([250, 500, 1000, 2000, 4000, 6000])
# Interpolation to give the loss at the gammatone center frequencies
# Use linear interpolation in dB. The interpolation assumes that
# cfreq[1] < aud[1] and cfreq[nfilt] > aud[6]
nfilt = len(center_freq)
f_v = np.insert(
audiometric_freq, [0, len(audiometric_freq)], [center_freq[0], center_freq[-1]]
)
# Interpolated gain in dB
loss = np.interp(
center_freq,
f_v,
np.insert(
hearing_loss, [0, len(hearing_loss)], [hearing_loss[0], hearing_loss[-1]]
),
)
loss = np.maximum(loss, 0)
# Make sure there are no negative losses
# Compression ratio changes linearly with ERB rate from 1.25:1 in the 80-Hz
# frequency band to 3.5:1 in the 8-kHz frequency band
compression_ratio = 1.25 + 2.25 * np.arange(nfilt) / (nfilt - 1)
# Maximum OHC sensitivity loss depends on the compression ratio. The compression
# I/O curves assume linear below 30 and above 100 dB SPL in normal ears.
max_ohc = 70 * (
1 - (1 / compression_ratio)
) # HC loss that results in 1:1 compression
theoretical_ohc = 1.25 * max_ohc # Loss threshold for adjusting the OHC parameters
# Apportion the loss in dB to the outer and inner hair cells based on the data of
# Moore et al (1999), JASA 106, 2761-2778.
# Reduce the CR towards 1:1 in proportion to the OHC loss.
attenuated_ohc = 0.8 * np.copy(loss)
attenuated_ihc = 0.2 * np.copy(loss)
attenuated_ohc[loss >= theoretical_ohc] = (
0.8 * theoretical_ohc[loss >= theoretical_ohc]
)
attenuated_ihc[loss >= theoretical_ohc] = 0.2 * theoretical_ohc[
loss >= theoretical_ohc
] + (loss[loss >= theoretical_ohc] - theoretical_ohc[loss >= theoretical_ohc])
# Adjust the OHC bandwidth in proportion to the OHC loss
bandwidth = np.ones(nfilt)
bandwidth = bandwidth + (attenuated_ohc / 50.0) + 2.0 * (attenuated_ohc / 50.0) ** 6
# Compute the compression lower kneepoint and compression ratio
low_knee = attenuated_ohc + 30
upamp = 30 + (70 / compression_ratio) # Output level for an input of 100 dB SPL
compression_ratio = (100 - low_knee) / (
upamp + attenuated_ohc - low_knee
) # OHC loss Compression ratio
return attenuated_ohc, bandwidth, low_knee, compression_ratio, attenuated_ihc
[docs]
def resample_24khz(
reference_signal: ndarray, reference_freq: float, freq_sample_hz: float = 24000.0
) -> tuple[ndarray, float]:
"""
Resample the input signal at 24 kHz. The input sampling rate is
rounded to the nearest kHz to compute the sampling rate conversion
ratio.
Arguments:
reference_signal (np.ndarray): input signal
reference_freq (int): sampling rate for the input in Hz
freq_sample_hz (int): Frequency sample in Hz
Returns:
reference_signal_24 signal resampled at kHz (default 24Khz)
freq_sample_hz output sampling rate in Hz
Updates
James M. Kates, 20 June 2011.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Sampling rate information
sample_rate_target_khz = np.round(
freq_sample_hz / 1000
) # output rate to nearest kHz
reference_freq_khz = np.round(reference_freq / 1000)
# Resample the signal
if reference_freq_khz == sample_rate_target_khz:
# No resampling performed if the rates match
return reference_signal, freq_sample_hz
if reference_freq_khz < sample_rate_target_khz:
# Resample for the input rate lower than the output
resample_signal = resample_poly(
reference_signal, sample_rate_target_khz, reference_freq_khz
)
# Match the RMS level of the resampled signal to that of the input
reference_rms = np.sqrt(np.mean(reference_signal**2))
resample_rms = np.sqrt(np.mean(resample_signal**2))
resample_signal = (reference_rms / resample_rms) * resample_signal
return resample_signal, freq_sample_hz
# Resample for the input rate higher than the output
resample_signal = resample_poly(
reference_signal, sample_rate_target_khz, reference_freq_khz
)
# Reduce the input signal bandwidth to 21 kHz (-10.5 to +10.5 kHz)
# The power equalization is designed to match the signal intensities
# over the frequency range spanned by the gammatone filter bank.
# Chebyshev Type 2 LP
order = 7
attenuation = 30 # sidelobe attenuation in dB
reference_freq_cut = 21 / reference_freq_khz
reference_b, reference_a = cheby2(order, attenuation, reference_freq_cut)
reference_filter = lfilter(reference_b, reference_a, reference_signal, axis=0)
# Reduce the resampled signal bandwidth to 21 kHz (-10.5 to +10.5 kHz)
resample_rate_cut = 21 / sample_rate_target_khz
target_b, target_a = cheby2(order, attenuation, resample_rate_cut)
target_filter = lfilter(target_b, target_a, resample_signal, axis=0)
# Compute the input and output RMS levels within the 21 kHz bandwidth and
# match the output to the input
reference_rms = np.sqrt(np.mean(reference_filter**2))
resample_rms = np.sqrt(np.mean(target_filter**2))
resample_signal = (reference_rms / resample_rms) * resample_signal
return resample_signal, freq_sample_hz
[docs]
def middle_ear(reference: ndarray, freq_sample: float) -> ndarray:
"""
Design the middle ear filters and process the input through the
cascade of filters. The middle ear model is a 2-pole HP filter
at 350 Hz in series with a 1-pole LP filter at 5000 Hz. The
result is a rough approximation to the equal-loudness contour
at threshold.
Arguments:
reference (np.ndarray): input signal
freq_sample (float): sampling rate in Hz
Returns:
xout (): filtered output
Updates:
James M. Kates, 18 January 2007.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Design the 1-pole Butterworth LP using the bilinear transformation
butterworth_low_pass, low_pass = butter(1, 5000 / (0.5 * freq_sample))
# LP filter the input
y = lfilter(butterworth_low_pass, low_pass, reference)
# Design the 2-pole Butterworth HP using the bilinear transformation
butterworth_high_pass, high_pass = butter(2, 350 / (0.5 * freq_sample), "high")
# HP filter the signal
return lfilter(butterworth_high_pass, high_pass, y)
[docs]
def gammatone_basilar_membrane(
reference: ndarray,
reference_bandwidth: float,
processed: ndarray,
processed_bandwidth: float,
freq_sample: float,
center_freq: float,
ear_q: float = 9.26449,
min_bandwidth: float = 24.7,
) -> tuple[ndarray, ndarray, ndarray, ndarray]:
"""
4th-order gammatone auditory filter. This implementation is based on the c program
published on-line by Ning Ma, U. Sheffield, UK[1]_ that gives an implementation of
the Martin Cooke filters[2]_: an impulse-invariant transformation of the gammatone
filter. The signal is demodulated down to baseband using a complex exponential,
and then passed through a cascade of four one-pole low-pass filters.
This version filters two signals that have the same sampling rate and the same
gammatone filter center frequencies. The lengths of the two signals should match;
if they don't, the signals are truncated to the shorter of the two lengths.
Arguments:
reference (): first sequence to be filtered
reference_bandwidth: bandwidth for x relative to that of a normal ear
processed (): second sequence to be filtered
processed_bandwidth (): bandwidth for x relative to that of a normal ear
freq_sample (): sampling rate in Hz
center_frequency (int): filter center frequency in Hz
ear_q: (float): ???
min_bandwidth (float): ???
Returns:
reference_envelope (): filter envelope output (modulated down to baseband)
1st signal
reference_basilar_membrane (): Basilar Membrane for the first signal
processed_envelope (): filter envelope output (modulated down to baseband)
2nd signal
processed_basilar_membrane (): Basilar Membrane for the second signal
References:
.. [1] Ma N, Green P, Barker J, Coy A (2007) Exploiting correlogram
structure for robust speech recognition with multiple speech
sources. Speech Communication, 49 (12): 874-891. Available at
<https://doi.org/10.1016/j.specom.2007.05.003>
<https://staffwww.dcs.shef.ac.uk/people/N.Ma/resources/gammatone/>
.. [2] Cooke, M. (1993) Modelling auditory processing and organisation.
Cambridge University Press
Updates:
James M. Kates, 8 Jan 2007.
Vectorized version for efficient MATLAB execution, 4 February 2007.
Cosine and sine generation, 29 June 2011.
Output sine and cosine sequences, 19 June 2012.
Cosine/sine loop speed increased, 9 August 2013.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Filter Equivalent Rectangular Bandwidth from Moore and Glasberg (1983)
# doi: 10.1121/1.389861
erb = min_bandwidth + (center_freq / ear_q)
# Check the lengths of the two signals and trim to shortest
min_sample = min(len(reference), len(processed))
x = reference[:min_sample]
y = processed[:min_sample]
# Filter the first signal
# Initialize the filter coefficients
tpt = 2 * np.pi / freq_sample
tpt_bw = reference_bandwidth * tpt * erb * 1.019
a = np.exp(-tpt_bw)
a_1 = 4.0 * a
a_2 = -6.0 * a * a
a_3 = 4.0 * a * a * a
a_4 = -a * a * a * a
a_5 = 4.0 * a * a
gain = 2.0 * (1 - a_1 - a_2 - a_3 - a_4) / (1 + a_1 + a_5)
# Initialize the complex demodulation
npts = len(x)
sincf, coscf = gammatone_bandwidth_demodulation(
npts, tpt, center_freq, np.zeros(npts), np.zeros(npts)
)
# Filter the real and imaginary parts of the signal
ureal = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], x * coscf)
uimag = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], x * sincf)
assert isinstance(ureal, np.ndarray) # lfilter can return different types
assert isinstance(uimag, np.ndarray)
# Extract the BM velocity and the envelope
reference_basilar_membrane = gain * (ureal * coscf + uimag * sincf)
reference_envelope = gain * np.sqrt(ureal * ureal + uimag * uimag)
# Filter the second signal using the existing cosine and sine sequences
tpt_bw = processed_bandwidth * tpt * erb * 1.019
a = np.exp(-tpt_bw)
a_1 = 4.0 * a
a_2 = -6.0 * a * a
a_3 = 4.0 * a * a * a
a_4 = -a * a * a * a
a_5 = 4.0 * a * a
gain = 2.0 * (1 - a_1 - a_2 - a_3 - a_4) / (1 + a_1 + a_5)
# Filter the real and imaginary parts of the signal
ureal = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], y * coscf)
uimag = lfilter([1, a_1, a_5], [1, -a_1, -a_2, -a_3, -a_4], y * sincf)
assert isinstance(ureal, np.ndarray)
assert isinstance(uimag, np.ndarray)
# Extract the BM velocity and the envelope
processed_basilar_membrane = gain * (ureal * coscf + uimag * sincf)
processed_envelope = gain * np.sqrt(ureal * ureal + uimag * uimag)
return (
reference_envelope,
reference_basilar_membrane,
processed_envelope,
processed_basilar_membrane,
)
[docs]
@njit
def gammatone_bandwidth_demodulation(
npts: int,
tpt: float,
center_freq: float,
center_freq_cos: np.ndarray,
center_freq_sin: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
"""Create the carriers for demodulaton, using the 2d Rotation method from
https://ccrma.stanford.edu/~jos/pasp/Digital_Sinusoid_Generators.html
to generate the sin and cos components. More efficient, perhaps, than
calculating the sin and cos at each point in time.
Arguments:
npts (): How many points are needed.
tpt (): Phase change (2pi/T) due to each sample time.
center_freq (): The carrier frequency
center_freq_cos (): Array to overwrite for the output.
center_freq_sin (): Array to overwrite for the output.
Returns:
sincf (): Samples of the carrier frequency in sin phase.
coscf (): Samples of the carrier frequency in cos phase.
"""
cos_n = np.cos(tpt * center_freq)
sin_n = np.sin(tpt * center_freq)
cold = 1.0
sold = 0.0
center_freq_cos[0] = cold
center_freq_sin[0] = sold
for n in range(1, npts):
arg = cold * cos_n + sold * sin_n
sold = sold * cos_n - cold * sin_n
cold = arg
center_freq_cos[n] = cold
center_freq_sin[n] = sold
return center_freq_sin, center_freq_cos
[docs]
def bandwidth_adjust(
control: ndarray,
bandwidth_min: float,
bandwidth_max: float,
level1: float,
) -> float:
"""
Compute the increase in auditory filter bandwidth in response to high signal
levels. The RMS of the control signal, a scalar, is used to set the
bandwidth for the entire signal.
Arguments:
control (): envelope output in the control filter band
bandwidth_min (): auditory filter bandwidth computed for the loss (or NH)
bandwidth_max (): auditory filter bandwidth at maximum OHC damage
level1 (): RMS=1 corresponds to Level1 dB SPL
Returns:
bandwidth (): filter bandwidth increased for high signal levels
Updates:
James M. Kates, 21 June 2011.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Compute the control signal level
control_rms = np.sqrt(np.mean(control**2))
control_db = 20 * np.log10(control_rms) + level1
# Adjust the auditory filter bandwidth
if control_db < 50:
# No BW adjustment for a signal below 50 dB SPL
return bandwidth_min
if control_db > 100:
# Maximum BW if signal is above 100 dB SPL
return bandwidth_max
return bandwidth_min + ((control_db - 50) / 50) * (bandwidth_max - bandwidth_min)
[docs]
def env_compress_basilar_membrane(
envsig: ndarray,
bm: ndarray, # pylint: disable=invalid-name
control: ndarray,
attn_ohc: float,
threshold_low: float,
compression_ratio: float,
fsamp: float,
level1: float,
small: float = 1e-30,
threshold_high: int = 100,
) -> tuple[ndarray, ndarray]:
"""
Compute the cochlear compression in one auditory filter band. The gain is linear
below the lower threshold, compressive with a compression ratio of CR:1 between the
lower and upper thresholds, and reverts to linear above the upper threshold. The
compressor assumes that auditory threshold is 0 dB SPL.
Arguments:
envsig (): analytic signal envelope (magnitude) returned by the
gammatone filter bank
bm (): BM motion output by the filter bank
control (): analytic control envelope returned by the wide control
path filter bank
attn_ohc (): OHC attenuation at the input to the compressor
threshold_Low (): kneepoint for the low-level linear amplification
compression_ratio (): compression ratio
fsamp (): sampling rate in Hz
level1 (): dB reference level: a signal having an RMS value of 1 is
assigned to Level1 dB SPL.
small (): ???
threshold_high: kneepoint for the high-level linear amplification
Returns:
compressed_signal (): compressed version of the signal envelope
compressed_basilar_membrane (): compressed version of the BM motion
Updates:
James M. Kates, 19 January 2007.
LP filter added 15 Feb 2007 (Ref: Zhang et al., 2001)
Version to compress the envelope, 20 Feb 2007.
Change in the OHC I/O function, 9 March 2007.
Two-tone suppression added 22 August 2008.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Convert the control envelope to dB SPL
logenv = np.maximum(control, small)
logenv = level1 + 20 * np.log10(logenv)
logenv = np.minimum(
logenv, threshold_high
) # Clip signal levels above the upper threshold
logenv = np.maximum(logenv, threshold_low) # Clip signal at the lower threshold
# Compute the compression gain in dB
gain = -attn_ohc - (logenv - threshold_low) * (1 - (1 / compression_ratio))
# Convert the gain to linear and apply a LP filter to give a 0.2 ms delay
gain = 10 ** (gain / 20)
flp = 800
b, a = butter(1, flp / (0.5 * fsamp))
gain = lfilter(b, a, gain)
# Apply the gain to the signals
compressed_signal = gain * envsig
compressed_basilar_membrane = gain * bm
return compressed_signal, compressed_basilar_membrane
[docs]
def envelope_align(
reference: ndarray, output: ndarray, freq_sample: int = 24000, corr_range: int = 100
) -> ndarray:
"""
Align the envelope of the processed signal to that of the reference signal.
Arguments:
reference (): envelope or BM motion of the reference signal
output (): envelope or BM motion of the output signal
freq_sample (int): Frequency sample rate in Hz
corr_range (int): range in msec for the correlation
Returns:
y (): shifted output envelope to match the input
Updates:
James M. Kates, 28 October 2011.
Absolute value of the cross-correlation peak removed, 22 June 2012.
Cross-correlation range reduced, 13 August 2013.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# The MATLAB code limits the range of lags to search (to 100 ms) to save computation
# time - no such option exists in numpy, but the code below limits the delay to the
# same range as in Matlab, for consistent results
lags = np.rint(0.001 * corr_range * freq_sample).astype(int) # Range in samples
npts = len(reference)
lags = min(lags, npts)
ref_out_correlation = correlate(reference, output, "full")
location = np.argmax(
ref_out_correlation[npts - lags : npts + lags]
) # Limit the range in which
delay = lags - location - 1
# Time shift the output sequence
if delay > 0:
# Output delayed relative to the reference
return np.concatenate((output[delay:npts], np.zeros(delay)))
return np.concatenate((np.zeros(-delay), output[: npts + delay]))
[docs]
def envelope_sl(
reference: ndarray,
basilar_membrane: ndarray,
attenuated_ihc: float,
level1: float,
small: float = 1e-30,
) -> tuple[ndarray, ndarray]:
"""
Convert the compressed envelope returned by cochlear_envcomp to dB SL.
Arguments:
reference (): linear envelope after compression
basilar_membrane (): linear Basilar Membrane vibration after compression
attenuated_ihc (): IHC attenuation at the input to the synapse
level1 (): level in dB SPL corresponding to 1 RMS
small (float): ???
Returns:
_reference (): reference envelope in dB SL
_basilar_membrane (): Basilar Membrane vibration with envelope converted to
dB SL
Updates:
James M. Kates, 20 Feb 07.
IHC attenuation added 9 March 2007.
Basilar membrane vibration conversion added 2 October 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Convert the envelope to dB SL
_reference = level1 - attenuated_ihc + 20 * np.log10(reference + small)
_reference = np.maximum(_reference, 0)
# Convert the linear BM motion to have a dB SL envelope
gain = (_reference + small) / (reference + small)
_basilar_membrane = gain * basilar_membrane
return _reference, _basilar_membrane
[docs]
@njit
def inner_hair_cell_adaptation(
reference_db, reference_basilar_membrane, delta, freq_sample
):
"""
Provide inner hair cell (IHC) adaptation. The adaptation is based on an
equivalent RC circuit model, and the derivatives are mapped into
1st-order backward differences. Rapid and short-term adaptation are
provided. The input is the signal envelope in dB SL, with IHC attenuation
already applied to the envelope. The outputs are the envelope in dB SL
with adaptation providing overshoot of the long-term output level, and
the BM motion is multiplied by a gain vs. time function that reproduces
the adaptation. IHC attenuation and additive noise for the equivalent
auditory threshold are provided by a subsequent call to eb_BMatten.
Arguments:
reference_db (np.ndarray): signal envelope in one frequency band in dB SL
contains OHC compression and IHC attenuation
reference_basilar_membrane (): basilar membrane vibration with OHC compression
but no IHC attenuation
delta (): overshoot factor = delta x steady-state
freq_sample (int): sampling rate in Hz
Returns:
output_db (): envelope in dB SL with IHC adaptation
output_basilar_membrane (): Basilar Membrane multiplied by the IHC adaptation
gain function
Updates:
James M. Kates, 1 October 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Test the amount of overshoot
dsmall = 1.0001
delta = max(delta, dsmall)
# Initialize adaptation time constants
tau1 = 2 # Rapid adaptation in msec
tau2 = 60 # Short-term adaptation in msec
tau1 = 0.001 * tau1 # Convert to seconds
tau2 = 0.001 * tau2
# Equivalent circuit parameters
freq_sample_inverse = 1 / freq_sample
r_1 = 1 / delta
r_2 = 0.5 * (1 - r_1)
r_3 = r_2
c_1 = tau1 * (r_1 + r_2) / (r_1 * r_2)
c_2 = tau2 / ((r_1 + r_2) * r_3)
# Intermediate values used for the voltage update matrix inversion
a11 = r_1 + r_2 + r_1 * r_2 * (c_1 / freq_sample_inverse)
a12 = -r_1
a21 = -r_3
a22 = r_2 + r_3 + r_2 * r_3 * (c_2 / freq_sample_inverse)
denom = 1 / ((a11 * a22) - (a21 * a12))
# Additional intermediate values
r_1_inv = 1 / r_1
product_r1_r2_c1 = r_1 * r_2 * (c_1 / freq_sample_inverse)
product_r2_r3_c2 = r_2 * r_3 * (c_2 / freq_sample_inverse)
# Initialize the outputs and state of the equivalent circuit
nsamp = len(reference_db)
gain = np.ones_like(
reference_db
) # Gain vector to apply to the BM motion, default is 1
output_db = np.zeros_like(reference_db)
v_1 = 0
v_2 = 0
small = 1e-30
# Loop to process the envelope signal
# The gain asymptote is 1 for an input envelope of 0 dB SPL
for n in range(nsamp):
v_0 = reference_db[n]
b_1 = v_0 * r_2 + product_r1_r2_c1 * v_1
b_2 = product_r2_r3_c2 * v_2
v_1 = denom * (a22 * b_1 - a12 * b_2)
v_2 = denom * (-a21 * b_1 + a11 * b_2)
out = (v_0 - v_1) * r_1_inv
output_db[n] = out
output_db = np.maximum(output_db, 0)
gain = (output_db + small) / (reference_db + small)
output_basilar_membrane = gain * reference_basilar_membrane
return output_db, output_basilar_membrane
[docs]
def basilar_membrane_add_noise(
reference: ndarray, threshold: int, level1: float
) -> ndarray:
"""
Apply the IHC attenuation to the BM motion and to add a low-level Gaussian noise to
give the auditory threshold.
Arguments:
reference (): BM motion to be attenuated
threshold (): additive noise level in dB re:auditory threshold
level1 (): an input having RMS=1 corresponds to Level1 dB SPL
Returns:
Attenuated signal with threshold noise added
Updates:
James M. Kates, 19 June 2012.
Just additive noise, 2 Oct 2012.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
gain = 10 ** ((threshold - level1) / 20) # Linear gain for the noise
# rng = np.random.default_rng()
noise = gain * np.random.standard_normal(
reference.shape
) # Gaussian RMS=1, then attenuated
return reference + noise
[docs]
def group_delay_compensate(
reference: ndarray,
bandwidths: ndarray,
center_freq: ndarray,
freq_sample: float,
ear_q: float = 9.26449,
min_bandwidth: float = 24.7,
) -> ndarray:
"""
Compensate for the group delay of the gammatone filter bank. The group
delay is computed for each filter at its center frequency. The firing
rate output of the IHC model is then adjusted so that all outputs have
the same group delay.
Arguments:
xenv (np.ndarray): matrix of signal envelopes or BM motion
bandwidths (): gammatone filter bandwidths adjusted for loss
center_freq (): center frequencies of the bands
freq_sample (): sampling rate for the input signal in Hz (e.g. 24,000 Hz)
ear_q (float):
min_bandwidth (float) :
Returns:
processed (): envelopes or BM motion compensated for the group delay.
Updates:
James M. Kates, 28 October 2011.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Processing parameters
nchan = len(bandwidths)
# Filter ERB from Moore and Glasberg (1983)
erb = min_bandwidth + (center_freq / ear_q)
# Initialize the gammatone filter coefficients
tpt = 2 * np.pi / freq_sample
tpt_bandwidth = tpt * 1.019 * bandwidths * erb
a = np.exp(-tpt_bandwidth)
a_1 = 4.0 * a
a_2 = -6.0 * a * a
a_3 = 4.0 * a * a * a
a_4 = -a * a * a * a
a_5 = 4.0 * a * a
# Compute the group delay in samples at fsamp for each filter
_group_delay = np.zeros(nchan)
for n in range(nchan):
_, _group_delay[n] = group_delay(
([1, a_1[n], a_5[n]], [1, -a_1[n], -a_2[n], -a_3[n], -a_4[n]]), 1
)
_group_delay = np.rint(_group_delay).astype(int) # convert to integer samples
# Compute the delay correlation
group_delay_min = np.min(_group_delay)
_group_delay = (
_group_delay - group_delay_min
) # Remove the minimum delay from all the over values
group_delay_max = np.max(_group_delay)
correct = (
group_delay_max - _group_delay
) # Samples delay needed to add to give alignment
# Add delay correction to each frequency band
processed = np.zeros(reference.shape)
for n in range(nchan):
ref = reference[n]
npts = len(ref)
processed[n] = np.concatenate((np.zeros(correct[n]), ref[: npts - correct[n]]))
return processed
[docs]
def convert_rms_to_sl(
reference: ndarray,
control: ndarray,
attenuated_ohc: ndarray | float,
threshold_low: ndarray | int,
compression_ratio: ndarray | int,
attenuated_ihc: ndarray | float,
level1: float,
threshold_high: int = 100,
small: float = 1e-30,
) -> ndarray:
"""
Covert the Root Mean Square average output of the gammatone filter bank
into dB SL. The gain is linear below the lower threshold, compressive
with a compression ratio of CR:1 between the lower and upper thresholds,
and reverts to linear above the upper threshold. The compressor
assumes that auditory threshold is 0 dB SPL.
Arguments:
reference (): analytic signal envelope (magnitude) returned by the
gammatone filter bank, RMS average level
control (): control signal envelope
attenuated_ohc (): OHC attenuation at the input to the compressor
threshold_low (): kneepoint for the low-level linear amplification
compression_ratio (): compression ratio
attenuated_ihc (): IHC attenuation at the input to the synapse
level1 (): dB reference level: a signal having an RMS value of 1 is
assigned to Level1 dB SPL.
threshold_high (int):
small (float):
Returns:
reference_db (): compressed output in dB above the impaired threshold
Updates:
James M. Kates, 6 August 2007.
Version for two-tone suppression, 29 August 2008.
Translated from MATLAB to Python by Zuzanna Podwinska, March 2022.
"""
# Initialize the compression parameters
threshold_high = 100 # Upper compression threshold
# Convert the control to dB SPL
small = 1e-30
control_db_spl = np.maximum(control, small)
control_db_spl = level1 + 20 * np.log10(control_db_spl)
control_db_spl = np.minimum(control_db_spl, threshold_high)
control_db_spl = np.maximum(control_db_spl, threshold_low)
# Compute compression gain in dB
gain = -attenuated_ohc - (control_db_spl - threshold_low) * (
1 - (1 / compression_ratio)
)
# Convert the signal envelope to dB SPL
control_db_spl = np.maximum(reference, small)
control_db_spl = level1 + 20 * np.log10(control_db_spl)
control_db_spl = np.maximum(control_db_spl, 0)
reference_db = control_db_spl + gain - attenuated_ihc
reference_db = np.maximum(reference_db, 0)
return reference_db
[docs]
def env_smooth(envelopes: np.ndarray, segment_size: int, sample_rate: float) -> ndarray:
"""
Function to smooth the envelope returned by the cochlear model. The
envelope is divided into segments having a 50% overlap. Each segment is
windowed, summed, and divided by the window sum to produce the average.
A raised cosine window is used. The envelope sub-sampling frequency is
2*(1000/segsize).
Arguments:
envelopes (np.ndarray): matrix of envelopes in each of the auditory bands
segment_size: averaging segment size in msec
freq_sample (int): input envelope sampling rate in Hz
Returns:
smooth: matrix of subsampled windowed averages in each band
Updates:
James M. Kates, 26 January 2007.
Final half segment added 27 August 2012.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Compute the window
# Segment size in samples
n_samples = int(np.around(segment_size * (0.001 * sample_rate)))
n_samples += n_samples % 2
window = np.hanning(n_samples) # Raised cosine von Hann window
wsum = np.sum(window) # Sum for normalization
# The first segment has a half window
nhalf = int(n_samples / 2)
halfwindow = window[nhalf:n_samples]
halfsum = np.sum(halfwindow)
# Number of segments and assign the matrix storage
n_channels = np.size(envelopes, 0)
npts = np.size(envelopes, 1)
nseg = int(
1 + np.floor(npts / n_samples) + np.floor((npts - n_samples / 2) / n_samples)
)
smooth = np.zeros((n_channels, nseg))
# Loop to compute the envelope in each frequency band
for k in range(n_channels):
# Extract the envelope in the frequency band
r = envelopes[k, :] # pylint: disable=invalid-name
# The first (half) windowed segment
nstart = 0
smooth[k, 0] = np.sum(r[nstart:nhalf] * halfwindow.conj().transpose()) / halfsum
# Loop over the remaining full segments, 50% overlap
for n in range(1, nseg - 1):
nstart = int(nstart + nhalf)
nstop = int(nstart + n_samples)
smooth[k, n] = sum(r[nstart:nstop] * window.conj().transpose()) / wsum
# The last (half) windowed segment
nstart = nstart + nhalf
nstop = nstart + nhalf
smooth[k, nseg - 1] = (
np.sum(r[nstart:nstop] * window[:nhalf].conj().transpose()) / halfsum
)
return smooth
[docs]
def mel_cepstrum_correlation(
reference: ndarray,
distorted: ndarray,
threshold: float,
addnoise: float,
) -> tuple[float, ndarray]:
"""
Compute the cross-correlations between the input signal time-frequency
envelope and the distortion time-frequency envelope.
For each time interval, the log spectrum is fitted with a set of
half-cosine basis functions. The spectrum weighted by the basis
functions corresponds to Mel Cepstral Coefficients computed in the
frequency domain. The amplitude-normalized cross-covariance between
the time-varying basis functions for the input and output signals is
then computed.
Arguments:
reference (): subsampled input signal envelope in dB SL in each critical band
distorted (): subsampled distorted output signal envelope
threshold (): threshold in dB SPL to include segment in calculation
addnoise (): additive Gaussian noise to ensure 0 cross-corr at low levels
Returns:
average_cepstral_correlation : average cepstral correlation 2-6, input vs output
individual_cepstral_correlations : individual cepstral correlations,
input vs output
Updates:
James M. Kates, 24 October 2006.
Difference signal removed for cochlear model, 31 January 2007.
Absolute value added 13 May 2011.
Changed to loudness criterion for silence threshsold, 28 August 2012.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Processing parameters
nbands = reference.shape[0]
# Mel cepstrum basis functions (mel cepstrum because of auditory bands)
nbasis = 6 # Number of cepstral coefficients to be used
freq = np.arange(nbasis)
k = np.arange(nbands)
basis = np.cos(np.outer(k, freq) * np.pi / float(nbands - 1))
mel_cepstral = basis / np.linalg.norm(basis, axis=0)
# Find the segments that lie sufficiently above the quiescent rate
reference_linear = 10 ** (
reference / 20
) # Convert envelope dB to linear (specific loudness)
xsum = np.sum(reference_linear, 0) / nbands # Proportional to loudness in sones
xsum = 20 * np.log10(xsum) # Convert back to dB (loudness in phons)
index = np.where(xsum > threshold)[0] # Identify those segments above threshold
nsamp = index.shape[0] # Number of segments above threshold
# Exit if not enough segments above zero
average_cepstral_correlation = 0.0
individual_cepstral_correlations = np.zeros(nbasis)
if nsamp <= 1:
logger.warning(
"Function MelCepstrumCorrelation: Signal below threshold, outputs set to 0."
)
return average_cepstral_correlation, individual_cepstral_correlations
# Remove the silent intervals
ref = reference[:, index]
proc = distorted[:, index]
# Add the low-level noise to the envelopes
ref = ref + addnoise * np.random.standard_normal(ref.shape)
proc = proc + addnoise * np.random.standard_normal(proc.shape)
# Compute the mel cepstrum coefficients using only those segments
# above threshold
reference_cep = np.dot(mel_cepstral.T, ref)
processed_cep = np.dot(mel_cepstral.T, proc)
# Remove the average value from the cepstral coefficients. The
# cross-correlation thus becomes a cross-covariance, and there
# is no effect of the absolute signal level in dB.
reference_cep -= np.mean(reference_cep, axis=1, keepdims=True)
processed_cep -= np.mean(processed_cep, axis=1, keepdims=True)
# Normalized cross-correlations between the time-varying cepstral coeff
# individual_cepstral_correlations = np.zeros(nbasis) # Input vs output
small = 1.0e-30
xsum = np.sum(reference_cep**2, axis=1)
ysum = np.sum(processed_cep**2, axis=1)
mask = (xsum < small) | (ysum < small)
individual_cepstral_correlations = np.zeros(nbasis)
individual_cepstral_correlations[~mask] = np.abs(
np.sum(reference_cep[~mask] * processed_cep[~mask], axis=1)
/ np.sqrt(xsum[~mask] * ysum[~mask])
)
# Figure of merit is the average of the cepstral correlations, ignoring
# the first (average spectrum level).
average_cepstral_correlation = np.sum(
individual_cepstral_correlations[1:nbasis]
) / (nbasis - 1)
return average_cepstral_correlation, individual_cepstral_correlations
[docs]
def melcor9(
reference: ndarray,
distorted: ndarray,
threshold: float,
add_noise: float,
segment_size: int,
n_cepstral_coef: int = 6,
) -> tuple[float, float, float, ndarray]:
"""
Compute the cross-correlations between the input signal
time-frequency envelope and the distortion time-frequency envelope. For
each time interval, the log spectrum is fitted with a set of half-cosine
basis functions. The spectrum weighted by the basis functions corresponds
to mel cepstral coefficients computed in the frequency domain. The
amplitude-normalized cross-covariance between the time-varying basis
functions for the input and output signals is then computed for each of
the 8 modulation frequencies.
Arguments:
reference (): subsampled input signal envelope in dB SL in each critical band
distorted (): subsampled distorted output signal envelope
threshold (): threshold in dB SPL to include segment in calculation
add_noise (): additive Gaussian noise to ensure 0 cross-corr at low levels
segment_size (): segment size in ms used for the envelope LP filter (8 msec)
n_cepstral_coef (int): Number of cepstral coefficients
Returns:
mel_cepstral_average (): average of the modulation correlations across analysis
frequency bands and modulation frequency bands, basis functions 2 -6
mel_cepstral_low (): average over the four lower mod freq bands, 0 - 20 Hz
mel_cepstral_high (): average over the four higher mod freq bands, 20 - 125 Hz
mel_cepstral_modulation (): vector of cross-correlations by modulation
frequency, averaged over analysis frequency band
Updates:
James M. Kates, 24 October 2006.
Difference signal removed for cochlear model, 31 January 2007.
Absolute value added 13 May 2011.
Changed to loudness criterion for silence threshold, 28 August 2012.
Version using envelope modulation filters, 15 July 2014.
Modulation frequency vector output added 27 August 2014.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Processing parameters
nbands = reference.shape[0]
# Mel cepstrum basis functions (mel cepstrum because of auditory bands)
freq = np.arange(n_cepstral_coef)
k = np.arange(nbands)
basis = np.cos(np.outer(k, freq) * np.pi / (nbands - 1))
cepm = basis / np.linalg.norm(basis, axis=0, keepdims=True)
# Find the segments that lie sufficiently above the quiescent rate
# Convert envelope dB to linear (specific loudness)
reference_linear = 10 ** (reference / 20)
# Proportional to loudness in sones
reference_sum = np.sum(reference_linear, 0) / nbands
# Convert back to dB (loudness in phons)
reference_sum = 20 * np.log10(reference_sum)
# Identify those segments above threshold
index = np.where(reference_sum > threshold)[0]
segments_above_threshold = index.shape[0] # Number of segments above threshold
# Modulation filter bands, segment size is 8 msec
edge = [4.0, 8.0, 12.5, 20.0, 32.0, 50.0, 80.0] # 8 bands covering 0 to 125 Hz
n_modulation_filter_bands = 1 + len(edge) # Number of modulation filter bands
# Exit if not enough segments above zero
mel_cepstral_average = 0.0
mel_cepstral_low = 0.0
mel_cepstral_high = 0.0
mel_cepstral_modulation = np.zeros(n_modulation_filter_bands)
if segments_above_threshold <= 1:
logger.warning("Function melcor9: Signal below threshold, outputs set to 0.")
return (
mel_cepstral_average,
mel_cepstral_low,
mel_cepstral_high,
mel_cepstral_modulation,
)
# Remove the silent intervals
_reference = reference[:, index]
_distorted = distorted[:, index]
# Add the low-level noise to the envelopes
_reference += add_noise * np.random.standard_normal(_reference.shape)
_distorted += add_noise * np.random.standard_normal(_distorted.shape)
# Compute the mel cepstrum coefficients using only those segments
# above threshold
reference_cep = np.dot(cepm.T, _reference[:, :segments_above_threshold])
distorted_cep = np.dot(cepm.T, _distorted[:, :segments_above_threshold])
reference_cep -= np.mean(reference_cep, axis=1, keepdims=True)
distorted_cep -= np.mean(distorted_cep, axis=1, keepdims=True)
# Envelope sampling parameters
sampling_freq = 1000.0 / (0.5 * segment_size) # Envelope sampling frequency in Hz
nyquist_freq = 0.5 * sampling_freq # Envelope Nyquist frequency
# Design the linear-phase envelope modulation filters
n_fir = np.around(
128 * (nyquist_freq / 125)
) # Adjust filter length to sampling rate
n_fir = int(2 * np.floor(n_fir / 2)) # Force an even filter length
b = np.zeros((n_modulation_filter_bands, n_fir + 1))
# LP filter 0-4 Hz
b[0, :] = firwin(
n_fir + 1, edge[0] / nyquist_freq, window="hann", pass_zero="lowpass"
)
# HP 80-125 Hz
b[n_modulation_filter_bands - 1, :] = firwin(
n_fir + 1,
edge[n_modulation_filter_bands - 2] / nyquist_freq,
window="hann",
pass_zero="highpass",
)
# Bandpass filter
for m in range(1, n_modulation_filter_bands - 1):
b[m, :] = firwin(
n_fir + 1,
[edge[m - 1] / nyquist_freq, edge[m] / nyquist_freq],
window="hann",
pass_zero="bandpass",
)
mel_cepstral_cross_covar = melcor9_crosscovmatrix(
b,
n_modulation_filter_bands,
n_cepstral_coef,
segments_above_threshold,
n_fir,
reference_cep,
distorted_cep,
)
mel_cepstral_average = np.sum(mel_cepstral_cross_covar[:, 1:], axis=(0, 1))
mel_cepstral_average /= n_modulation_filter_bands * (n_cepstral_coef - 1)
mel_cepstral_low = np.sum(mel_cepstral_cross_covar[:4, 1:])
mel_cepstral_low /= 4 * (n_cepstral_coef - 1)
mel_cepstral_high = np.sum(mel_cepstral_cross_covar[4:8, 1:])
mel_cepstral_high /= 4 * (n_cepstral_coef - 1)
mel_cepstral_modulation = np.mean(mel_cepstral_cross_covar[:, 1:], axis=1)
return (
mel_cepstral_average,
mel_cepstral_low,
mel_cepstral_high,
mel_cepstral_modulation,
)
[docs]
def melcor9_crosscovmatrix(
b: ndarray,
nmod: int,
nbasis: int,
nsamp: int,
nfir: int,
reference_cep: ndarray,
processed_cep: ndarray,
) -> ndarray:
"""Compute the cross-covariance matrix.
Arguments:
b (): ???
nmod (): ???
nbasis (): ???
nsamp (): ???
nfir (): ???
xcep (): ???
ycep (): ???
Returns:
cross_covariance_matrix ():
"""
small = 1.0e-30
nfir2 = nfir / 2
# Convolve the input and output envelopes with the modulation filters
reference = np.zeros((nmod, nbasis, nsamp))
processed = np.zeros((nmod, nbasis, nsamp))
for m in range(nmod):
for j in range(nbasis):
# Convolve and remove transients
c = convolve(b[m], reference_cep[j, :], mode="full")
reference[m, j, :] = c[int(nfir2) : int(nfir2 + nsamp)]
c = convolve(b[m], processed_cep[j, :], mode="full")
processed[m, j, :] = c[int(nfir2) : int(nfir2 + nsamp)]
# Compute the cross-covariance matrix
cross_covariance_matrix = np.zeros((nmod, nbasis))
for m in range(nmod):
# Input freq band j, modulation freq m
x_j = reference[m]
x_j -= np.mean(x_j, axis=1, keepdims=True)
reference_sum = np.sum(x_j**2, axis=1)
# Processed signal band
y_j = processed[m]
y_j -= np.mean(y_j, axis=1, keepdims=True)
processed_sum = np.sum(y_j**2, axis=1)
xy = np.sum(x_j * y_j, axis=1)
mask = (reference_sum < small) | (processed_sum < small)
cross_covariance_matrix[m, ~mask] = np.abs(xy[~mask]) / np.sqrt(
reference_sum[~mask] * processed_sum[~mask]
)
return cross_covariance_matrix
[docs]
def spectrum_diff(
reference_sl: ndarray, processed_sl: ndarray
) -> tuple[ndarray, ndarray, ndarray]:
"""
Compute changes in the long-term spectrum and spectral slope.
The metric is based on the spectral distortion metric of Moore and Tan[1]_
(JAES, Vol 52, pp 900-914). The log envelopes in dB SL are converted to
linear to approximate specific loudness. The outputs are the sum of the
absolute differences, the standard deviation of the differences, and the
maximum absolute difference. The same three outputs are provided for the
normalized spectral difference and for the slope. The output is
calibrated so that a processed signal having 0 amplitude produces a
value of 1 for the spectrum difference.
Abs diff: weight all deviations uniformly
Std diff: weight larger deviations more than smaller deviations
Max diff: only weight the largest deviation
Arguments:
reference_sl (np.ndarray): reference signal spectrum in dB SL
processed_sl (np.ndarray): degraded signal spectrum in dB SL
Returns:
dloud (np.array) : [sum abs diff, std dev diff, max diff] spectra
dnorm (np.array) : [sum abs diff, std dev diff, max diff] norm spectra
dslope (np.array) : [sum abs diff, std dev diff, max diff] slope
References:
.. [1] Moore BCJ, Tan, CT (2004) Development and Validation of a Method
for Predicting the Perceived Naturalness of Sounds Subjected to
Spectral Distortion J Audio Eng Soc 52(9):900-914. Available at.
<http://www.aes.org/e-lib/browse.cfm?elib=13018>.
Updates:
James M. Kates, 28 June 2012.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Convert the dB SL to linear magnitude values. Because of the auditory
# filter bank, the OHC compression, and auditory threshold, the linear
# values are closely related to specific loudness.
nbands = reference_sl.shape[0]
reference_linear_magnitude = 10 ** (reference_sl / 20)
processed_linear_magnitude = 10 ** (processed_sl / 20)
# Normalize the level of the reference and degraded signals to have the
# same loudness. Thus overall level is ignored while differences in
# spectral shape are measured.
reference_sum = np.sum(reference_linear_magnitude)
reference_linear_magnitude /= (
reference_sum # Loudness sum = 1 (arbitrary amplitude, proportional to sones)
)
processed_sum = np.sum(processed_linear_magnitude)
processed_linear_magnitude /= processed_sum
# Compute the spectrum difference
dloud = np.zeros(3)
diff_spectrum = (
reference_linear_magnitude - processed_linear_magnitude
) # Difference in specific loudness in each band
dloud[0] = np.sum(np.abs(diff_spectrum))
dloud[1] = nbands * np.std(diff_spectrum) # Biased std: second moment
dloud[2] = np.max(np.abs(diff_spectrum))
# Compute the normalized spectrum difference
dnorm = np.zeros(3)
diff_normalised_spectrum = (
reference_linear_magnitude - processed_linear_magnitude
) / (
reference_linear_magnitude + processed_linear_magnitude
) # Relative difference in specific loudness
dnorm[0] = np.sum(np.abs(diff_normalised_spectrum))
dnorm[1] = nbands * np.std(diff_normalised_spectrum)
dnorm[2] = np.max(np.abs(diff_normalised_spectrum))
# Compute the slope difference
dslope = np.zeros(3)
reference_slope = (
reference_linear_magnitude[1:nbands]
- reference_linear_magnitude[0 : nbands - 1]
)
processed_slope = (
processed_linear_magnitude[1:nbands]
- processed_linear_magnitude[0 : nbands - 1]
)
diff_slope = reference_slope - processed_slope # Slope difference
dslope[0] = np.sum(np.abs(diff_slope))
dslope[1] = nbands * np.std(diff_slope)
dslope[2] = np.max(np.abs(diff_slope))
return dloud, dnorm, dslope
[docs]
def bm_covary(
reference_basilar_membrane: ndarray,
processed_basilar_membrane: ndarray,
segment_size: int,
sample_rate: float,
) -> tuple[ndarray, ndarray, ndarray]:
"""
Compute the cross-covariance (normalized cross-correlation) between the reference
and processed signals in each auditory band. The signals are divided into segments
having 50% overlap.
Arguments:
reference_basilar_membrane (): Basilar Membrane movement, reference signal
processed_basilar_membrane (): Basilar Membrane movement, processed signal
segment_size (): signal segment size, msec
freq_sample (int): sampling rate in Hz
Returns:
signal_cross_covariance (np.array) : [nchan,nseg] of cross-covariance values
reference_mean_square (np.array) : [nchan,nseg] of MS input signal energy values
processed_mean_square (np.array) : [nchan,nseg] of MS processed signal energy
values
Updates:
James M. Kates, 28 August 2012.
Output amplitude adjustment added, 30 october 2012.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Initialize parameters
small = 1.0e-30
# Lag for computing the cross-covariance
lagsize = 1.0 # Lag (+/-) in msec
maxlag = np.around(lagsize * (0.001 * sample_rate)) # Lag in samples
# Compute the segment size in samples
nwin = int(np.around(segment_size * (0.001 * sample_rate)))
nwin += nwin % 2 == 1 # Force window length to be even
window = np.hanning(nwin).conj().transpose() # Raised cosine von Hann window
# compute inverted Window autocorrelation
win_corr = correlate(window, window, "full")
start_sample = int(len(window) - 1 - maxlag)
end_sample = int(maxlag + len(window))
if start_sample < 0:
raise ValueError("segment size too small")
win_corr = 1 / win_corr[start_sample:end_sample]
win_sum2 = 1.0 / np.sum(window**2) # Window power, inverted
# The first segment has a half window
nhalf = int(nwin / 2)
half_window = window[nhalf:nwin]
half_corr = correlate(half_window, half_window, "full")
start_sample = int(len(half_window) - 1 - maxlag)
end_sample = int(maxlag + len(half_window))
if start_sample < 0:
raise ValueError("segment size too small")
half_corr = 1 / half_corr[start_sample:end_sample]
halfsum2 = 1.0 / np.sum(half_window**2) # MS sum normalization, first segment
# Number of segments
nchan = reference_basilar_membrane.shape[0]
npts = reference_basilar_membrane.shape[1]
nseg = int(1 + np.floor(npts / nwin) + np.floor((npts - nwin / 2) / nwin))
reference_mean_square = np.zeros((nchan, nseg))
processed_mean_square = np.zeros((nchan, nseg))
signal_cross_covariance = np.zeros((nchan, nseg))
# Loop to compute the signal mean-squared level in each band for each
# segment and to compute the cross-corvariances.
for k in range(nchan):
# Extract the BM motion in the frequency band
x = reference_basilar_membrane[k, :]
y = processed_basilar_membrane[k, :]
# The first (half) windowed segment
nstart = 0
reference_seg = x[nstart:nhalf] * half_window # Window the reference
processed_seg = y[nstart:nhalf] * half_window # Window the processed signal
reference_seg = reference_seg - np.mean(reference_seg) # Make 0-mean
processed_seg = processed_seg - np.mean(processed_seg)
# Normalize signal MS value by the window
ref_mean_square = np.sum(reference_seg**2) * halfsum2
proc_mean_squared = np.sum(processed_seg**2) * halfsum2
correlation = correlate(reference_seg, processed_seg, "full")
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]
unbiased_cross_correlation = np.max(np.abs(correlation * half_corr))
if (ref_mean_square > small) and (proc_mean_squared > small):
# Normalize cross-covariance
signal_cross_covariance[k, 0] = unbiased_cross_correlation / np.sqrt(
ref_mean_square * proc_mean_squared
)
else:
signal_cross_covariance[k, 0] = 0.0
# Save the reference MS level
reference_mean_square[k, 0] = ref_mean_square
processed_mean_square[k, 0] = proc_mean_squared
# Loop over the remaining full segments, 50% overlap
for n in range(1, nseg - 1):
nstart = nstart + nhalf
nstop = nstart + nwin
reference_seg = x[nstart:nstop] * window # Window the reference
processed_seg = y[nstart:nstop] * window # Window the processed signal
reference_seg = reference_seg - np.mean(reference_seg) # Make 0-mean
processed_seg = processed_seg - np.mean(processed_seg)
# Normalize signal MS value by the window
ref_mean_square = np.sum(reference_seg**2) * win_sum2
proc_mean_squared = np.sum(processed_seg**2) * win_sum2
correlation = correlate(reference_seg, processed_seg, "full")
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]
unbiased_cross_correlation = np.max(np.abs(correlation * win_corr))
if (ref_mean_square > small) and (proc_mean_squared > small):
# Normalize cross-covariance
signal_cross_covariance[k, n] = unbiased_cross_correlation / np.sqrt(
ref_mean_square * proc_mean_squared
)
else:
signal_cross_covariance[k, n] = 0.0
reference_mean_square[k, n] = ref_mean_square
processed_mean_square[k, n] = proc_mean_squared
# The last (half) windowed segment
nstart = nstart + nhalf
nstop = nstart + nhalf
reference_seg = x[nstart:nstop] * window[0:nhalf] # Window the reference
processed_seg = y[nstart:nstop] * window[0:nhalf] # Window the processed signal
reference_seg = reference_seg - np.mean(reference_seg) # Make 0-mean
processed_seg = processed_seg - np.mean(processed_seg)
# Normalize signal MS value by the window
ref_mean_square = np.sum(reference_seg**2) * halfsum2
proc_mean_squared = np.sum(processed_seg**2) * halfsum2
correlation = correlate(reference_seg, processed_seg, "full")
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]
unbiased_cross_correlation = np.max(np.abs(correlation * half_corr))
if (ref_mean_square > small) and (proc_mean_squared > small):
# Normalized cross-covariance
signal_cross_covariance[k, nseg - 1] = unbiased_cross_correlation / np.sqrt(
ref_mean_square * proc_mean_squared
)
else:
signal_cross_covariance[k, nseg - 1] = 0.0
# Save the reference and processed MS level
reference_mean_square[k, nseg - 1] = ref_mean_square
processed_mean_square[k, nseg - 1] = proc_mean_squared
# Limit the cross-covariance to lie between 0 and 1
signal_cross_covariance = np.clip(signal_cross_covariance, 0, 1)
# Adjust the BM magnitude to correspond to the envelope in dB SL
reference_mean_square *= 2.0
processed_mean_square *= 2.0
return signal_cross_covariance, reference_mean_square, processed_mean_square
[docs]
def ave_covary2(
signal_cross_covariance: np.ndarray,
reference_signal_mean_square: np.ndarray,
threshold_db: float,
lp_filter_order: ndarray | None = None,
freq_cutoff: ndarray | None = None,
) -> tuple[float, ndarray]:
"""
Compute the average cross-covariance between the reference and processed
signals in each auditory band.
The silent time-frequency tiles are removed from consideration. The
cross-covariance is computed for each segment in each frequency band. The
values are weighted by 1 for inclusion or 0 if the tile is below
threshold. The sum of the covariance values across time and frequency are
then divided by the total number of tiles above threshold. The calculation
is a modification of Tan et al.[1]_ . The cross-covariance is also output
with a frequency weighting that reflects the loss of IHC synchronization at high
frequencies Johnson[2]_.
Arguments:
signal_cross_covariance (np.array): [nchan,nseg] of cross-covariance values
reference_signal_mean_square (np.array): [nchan,nseg] of reference signal MS
values
threshold_db (): threshold in dB SL to include segment ave over freq in
average
lp_filter (list): LP filter order
freq_cutoff (list): Cutoff frequencies in Hz
Returns:
average_covariance (): cross-covariance in segments averaged over time and
frequency
ihc_sync_covariance (): cross-covariance array, 6 different weightings for loss
of IHC synchronization at high frequencies:
LP Filter Order Cutoff Freq, kHz
1 1.5
3 2.0
5 2.5, 3.0, 3.5, 4.0
References:
.. [1] Tan CT, Moore, BCJ, Zacharov N, Mattila VV (2004) Predicting the Perceived
Quality of Nonlinearly Distorted Music and Speech Signals. J Audio Eng Soc
52(9):900-914. Available at.
<http://www.aes.org/e-lib/browse.cfm?elib=13013>.
.. [2] Johnson DH (1980) The relationship between spike rate and synchrony in
responses of auditory‐nerve fibers to single tones J Acoustical Soc of Am
68:1115 Available at.
<https://doi.org/10.1121/1.384982>
Updates:
James M. Kates, 28 August 2012.
Adjusted for BM vibration in dB SL, 30 October 2012.
Threshold for including time-freq tile modified, 30 January 2013.
Version for different sync loss, 15 February 2013.
Translated from MATLAB to Python by Gerardo Roa Dabike, September 2022.
"""
# Array dimensions
n_channels = signal_cross_covariance.shape[0]
# Initialize the LP filter for loss of IHC synchronization
# Center frequencies in Hz on an ERB scale
_center_freq = center_frequency(n_channels)
# Default LP filter order
if lp_filter_order is None:
lp_filter_order = np.array([1, 3, 5, 5, 5, 5])
# Default cutoff frequencies in Hz
if freq_cutoff is None:
freq_cutoff = 1000 * np.array([1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
fc2p = (
np.atleast_2d(freq_cutoff ** (2 * lp_filter_order)).repeat(n_channels, axis=0).T
)
freq2p = _center_freq ** (
2 * np.atleast_2d(lp_filter_order).repeat(n_channels, axis=0).T
)
fsync = np.sqrt(fc2p / (fc2p + freq2p))
# Find the segments that lie sufficiently above the threshold.
# Convert squared amplitude to dB envelope
signal_rms = np.sqrt(reference_signal_mean_square)
# Linear amplitude (specific loudness)
signal_linear_amplitude = 10 ** (signal_rms / 20)
# Intensity averaged over frequency bands
reference_mean = np.sum(signal_linear_amplitude, 0) / n_channels
# Convert back to dB (loudness in phons)
reference_mean = 20 * np.log10(reference_mean)
# Identify those segments above threshold
index = np.argwhere(reference_mean > threshold_db).T
if index.size != 1:
index = index.squeeze()
nseg = index.shape[0] # Number of segments above threshold
# Exit if not enough segments above zero
if nseg <= 1:
logger.warning(
"Function AveCovary2: Ave signal below threshold, outputs set to 0."
)
average_covariance = 0
# syncov = 0
ihc_sync_covariance = np.zeros(6)
return average_covariance, ihc_sync_covariance
# Remove the silent segments
signal_cross_covariance = signal_cross_covariance[:, index]
signal_rms = signal_rms[:, index]
# Compute the time-frequency weights. The weight=1 if a segment in a
# frequency band is above threshold, and weight=0 if below threshold.
weight = np.zeros((n_channels, nseg)) # No IHC synchronization roll-off
weight[signal_rms > threshold_db] = 1
# The wsync tensor should be constructed as follows:
#
# wsync = np.zeros((6, n_channels, nseg))
# for k in range(n_channels):
# for n in range(nseg):
# # Thresh in dB SL for including time-freq tile
# if signal_rms[k, n] > threshold_db:
# wsync[:, k, n] = fsync[:, k]
#
# This can be written is an efficient vectorsized form as follows:
wsync = np.zeros((6, n_channels, nseg))
mask = signal_rms > threshold_db
fsync3d = np.repeat(fsync[..., None], nseg, axis=2)
wsync[:, mask] = fsync3d[:, mask]
# Sum the weighted covariance values
# Sum of weighted time-freq tiles
csum = np.sum(np.sum(weight * signal_cross_covariance))
wsum = np.sum(np.sum(weight)) # Total number of tiles above threshold
tiles_above_threshold = np.zeros(6)
# Sum of weighted time-freq tiles
sum_weighted_time_freq = np.sum(wsync * signal_cross_covariance, axis=(1, 2))
tiles_above_threshold = np.sum(wsync, axis=(1, 2))
# Exit if not enough segments above zero
if wsum < 1:
average_covariance = 0
logger.warning(
"Function AveCovary2: Signal tiles below threshold, outputs set to 0."
)
else:
average_covariance = csum / wsum
ihc_sync_covariance = sum_weighted_time_freq / tiles_above_threshold
return average_covariance, ihc_sync_covariance