# 参考 https://www.cnblogs.com/LXP-Never/p/11561355.html
# 谢谢作者~
# audio_hjk2是我用来提取PPG+mel+mfcc+linear的标准
# Rayhane是Tacotron合成效果比较好的标准
# 看看Rayhane和PPG标准是否一样
# (具体对比在2. Rayhane的超参那里)
# Hyperparameter count: 16
# Hyperparameters of the PPG/mel/mfcc/linear extraction pipeline below.
hparams = {
    'sample_rate': 16000,
    'preemphasis': 0.97,
    'n_fft': 400,
    'hop_length': 160,
    'win_length': 400,
    'num_mels': 80,
    'n_mfcc': 13,
    'window': 'hann',
    'fmin': 30.,
    'fmax': 7600.,
    'ref_db': 20,
    'min_db': -80.0,
    'griffin_lim_power': 1.5,
    'griffin_lim_iterations': 60,
    'silence_db': -28.0,
    'center': True,  # unclear why PPG extraction requires center=True -- TODO confirm
}
# 问题:
# fmin和fmax到底有什么用,目前mfcc的fmin=0,fmax=none;mel的fmin=30,fmax=7600;spec又没有限制。所以怎么办?
# 以后只用power谱了,统一起来,都用stft之后先算平方,然后转换log后乘以10,但是其实不懂区别,哪一个更好?
# Griffinlim超参数临时使用1.2和80,区别在哪里?
# 取log的时候,浮点数(power值)统一加上了1e-5
# min_db没有详细统计,直接用的-80
# 户建坤-hujk17为了理解长河10ms版本cbhg-ppg代码进行了一次梳理,抄写的。2020-10-14-16-13
import librosa
import numpy as np
from scipy.io import wavfile
from scipy import signal
from scipy.fftpack import dct
import matplotlib.pyplot as plt
# Hyperparameter count: 16
# NOTE(review): this dict re-binds the `hparams` already defined near the top of
# the file (same values); this is the binding the functions below actually read.
hparams = {
    'sample_rate': 16000,
    'preemphasis': 0.97,
    'n_fft': 400,
    'hop_length': 160,
    'win_length': 400,
    'num_mels': 80,
    'n_mfcc': 13,
    'window': 'hann',
    'fmin': 30.,
    'fmax': 7600.,
    'ref_db': 20,
    'min_db': -80.0,
    'griffin_lim_power': 1.5,
    'griffin_lim_iterations': 60,
    'silence_db': -28.0,
    'center': True,  # unclear why PPG extraction requires center=True -- TODO confirm
}
# Lazily-built mel filterbank and its pseudo-inverse, cached at module level.
_mel_basis = None
_inv_mel_basis = None
# 超参数个数:1
def load_wav(wav_f, sr=hparams['sample_rate']):
    """Load an audio file, resampled to `sr`; returns a float waveform array."""
    audio, _sr = librosa.load(wav_f, sr=sr)
    return audio
# 超参数个数:1
def write_wav(write_path, wav_arr, sr=hparams['sample_rate']):
    """Peak-normalize `wav_arr` into int16 range and write it as a PCM wav.

    Fix: scale into a temporary array instead of `*=` so the caller's
    waveform buffer is not silently modified in place.
    """
    peak = max(0.01, np.max(np.abs(wav_arr)))
    scaled = wav_arr * (32767 / peak)
    wavfile.write(write_path, sr, scaled.astype(np.int16))
    return
# 超参数个数:1
def split_wav(wav_arr, top_db=-hparams['silence_db']):
    """Return the non-silent intervals of `wav_arr` (librosa.effects.split)."""
    return librosa.effects.split(wav_arr, top_db=top_db)
# 超参数个数:12
def wav2unnormalized_mfcc(wav_arr, sr=hparams['sample_rate'], preemphasis=hparams['preemphasis'],
                          n_fft=hparams['n_fft'], hop_len=hparams['hop_length'],
                          win_len=hparams['win_length'], num_mels=hparams['num_mels'],
                          n_mfcc=hparams['n_mfcc'], window=hparams['window'], fmin=0.0,
                          fmax=None, ref_db=hparams['ref_db'],
                          center=hparams['center']):
    """wav -> MFCC + delta + delta-delta features, shape (time, 3 * n_mfcc).

    NOTE(review): this path builds its mel filterbank with fmin=0.0 / fmax=None,
    while the mel-spectrogram path uses hparams fmin/fmax; because
    _power_spec2power_mel caches one basis globally, whichever runs first wins
    -- verify before mixing both paths in one process.
    """
    emph_wav_arr = _preempahsis(wav_arr, pre_param=preemphasis)
    power_spec = _power_spec(emph_wav_arr, n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)
    power_mel = _power_spec2power_mel(power_spec, sr=sr, n_fft=n_fft, num_mels=num_mels, fmin=fmin, fmax=fmax)
    db_mel = _power2db(power_mel, ref_db=ref_db)
    # No normalization is applied here (hence "unnormalized" in the name).
    mfcc = dct(x=db_mel.T, axis=0, type=2, norm='ortho')[:n_mfcc]
    deltas = librosa.feature.delta(mfcc)
    delta_deltas = librosa.feature.delta(mfcc, order=2)
    mfcc_feature = np.concatenate((mfcc, deltas, delta_deltas), axis=0)
    return mfcc_feature.T
# 超参数个数:12
def wav2normalized_db_mel(wav_arr, sr=hparams['sample_rate'], preemphasis=hparams['preemphasis'],
                          n_fft=hparams['n_fft'], hop_len=hparams['hop_length'],
                          win_len=hparams['win_length'], num_mels=hparams['num_mels'],
                          window=hparams['window'], fmin=hparams['fmin'],
                          fmax=hparams['fmax'], ref_db=hparams['ref_db'], min_db=hparams['min_db'],
                          center=hparams['center']):
    """wav -> db-scale mel power spectrogram normalized to [0, 1], shape (time, num_mels)."""
    emph_wav_arr = _preempahsis(wav_arr, pre_param=preemphasis)
    power_spec = _power_spec(emph_wav_arr, n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)  # (time, n_fft/2+1)
    power_mel = _power_spec2power_mel(power_spec, sr=sr, n_fft=n_fft, num_mels=num_mels, fmin=fmin, fmax=fmax)
    db_mel = _power2db(power_mel, ref_db=ref_db)
    normalized_db_mel = _db_normalize(db_mel, min_db=min_db)
    return normalized_db_mel
# 超参数个数:9
def wav2normalized_db_spec(wav_arr, sr=hparams['sample_rate'], preemphasis=hparams['preemphasis'],
                           n_fft=hparams['n_fft'], hop_len=hparams['hop_length'],
                           win_len=hparams['win_length'],
                           window=hparams['window'], ref_db=hparams['ref_db'], min_db=hparams['min_db'],
                           center=hparams['center']):
    """wav -> db-scale linear power spectrogram normalized to [0, 1], shape (time, n_fft/2+1).

    Same chain as wav2normalized_db_mel but without the mel projection.
    """
    emph_wav_arr = _preempahsis(wav_arr, pre_param=preemphasis)
    power_spec = _power_spec(emph_wav_arr, n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)  # (time, n_fft/2+1)
    db_spec = _power2db(power_spec, ref_db=ref_db)
    normalized_db_spec = _db_normalize(db_spec, min_db=min_db)
    return normalized_db_spec
# inv操作
# 超参数个数:14
def normalized_db_mel2wav(normalized_db_mel, sr=hparams['sample_rate'], preemphasis=hparams['preemphasis'],
                          n_fft=hparams['n_fft'], hop_len=hparams['hop_length'],
                          win_len=hparams['win_length'], num_mels=hparams['num_mels'],
                          window=hparams['window'], fmin=hparams['fmin'],
                          fmax=hparams['fmax'],
                          ref_db=hparams['ref_db'], min_db=hparams['min_db'],
                          center=hparams['center'], griffin_lim_power=hparams['griffin_lim_power'],
                          griffin_lim_iterations=hparams['griffin_lim_iterations']):
    """Inverse of wav2normalized_db_mel: normalized db mel -> waveform via Griffin-Lim."""
    db_mel = _db_denormalize(normalized_db_mel, min_db=min_db)
    power_mel = _db2power(db_mel, ref_db=ref_db)
    # Linear spectrum estimated through the pseudo-inverse of the mel basis.
    power_spec = _power_mel2power_spec(power_mel, sr=sr, n_fft=n_fft, num_mels=num_mels, fmin=fmin, fmax=fmax)
    magnitude_spec = power_spec ** 0.5  # (time, n_fft/2+1)
    # Raising the magnitude before Griffin-Lim sharpens the result.
    griffinlim_powered_magnitude_spec = magnitude_spec ** griffin_lim_power  # (time, n_fft/2+1)
    # Griffin-Lim consumes the [time, n_fft/2+1] layout and transposes internally.
    emph_wav_arr = _griffin_lim(griffinlim_powered_magnitude_spec, gl_iterations=griffin_lim_iterations,
                                n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)
    wav_arr = _deemphasis(emph_wav_arr, pre_param=preemphasis)
    return wav_arr
# inv操作
# 超参数个数:11
def normalized_db_spec2wav(normalized_db_spec, sr=hparams['sample_rate'], preemphasis=hparams['preemphasis'],
                           n_fft=hparams['n_fft'], hop_len=hparams['hop_length'],
                           win_len=hparams['win_length'],
                           window=hparams['window'], ref_db=hparams['ref_db'], min_db=hparams['min_db'],
                           center=hparams['center'], griffin_lim_power=hparams['griffin_lim_power'],
                           griffin_lim_iterations=hparams['griffin_lim_iterations']):
    """Inverse of wav2normalized_db_spec: normalized db spectrogram -> waveform via Griffin-Lim."""
    db_spec = _db_denormalize(normalized_db_spec, min_db=min_db)
    power_spec = _db2power(db_spec, ref_db=ref_db)  # (time, n_fft/2+1)
    magnitude_spec = power_spec ** 0.5  # (time, n_fft/2+1)
    # Raising the magnitude before Griffin-Lim sharpens the result.
    griffinlim_powered_magnitude_spec = magnitude_spec ** griffin_lim_power
    emph_wav_arr = _griffin_lim(griffinlim_powered_magnitude_spec, gl_iterations=griffin_lim_iterations,
                                n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)
    wav_arr = _deemphasis(emph_wav_arr, pre_param=preemphasis)
    return wav_arr
# 超参数个数:1
def _preempahsis(wav_arr, pre_param):
return signal.lfilter([1, -pre_param], [1], wav_arr)
# 超参数个数:1
def _deemphasis(wav_arr, pre_param):
return signal.lfilter([1], [1, -pre_param], wav_arr)
# 超参数个数:5
# 注意center的参数
# return shape: [n_freqs, time]
def _stft(wav_arr, n_fft, hop_len, win_len, window, center):
    """Complex STFT of `wav_arr`, returned as [n_freqs, time]; note `center`."""
    spec = librosa.core.stft(wav_arr, n_fft=n_fft, hop_length=hop_len,
                             win_length=win_len, window=window, center=center)
    return spec
# 超参数个数:3
# stft_matrix shape [n_freqs, time],复数
def _istft(stft_matrix, hop_len, win_len, window):
    """Inverse STFT of a complex [n_freqs, time] matrix back to a waveform."""
    wav = librosa.core.istft(stft_matrix, hop_length=hop_len,
                             win_length=win_len, window=window)
    return wav
# 超参数个数:5
# 注意center的参数
# 以后只用power谱了,统一起来,都用stft之后先算平方,然后转换log后乘以10,但是其实不懂区别,哪一个更好?
# return shape: [time, n_freqs]
def _power_spec(wav_arr, n_fft, hop_len, win_len, window, center):
    """Power spectrogram |STFT|^2, transposed to [time, n_freqs]; note `center`."""
    stft_t = _stft(wav_arr, n_fft=n_fft, hop_len=hop_len, win_len=win_len,
                   window=window, center=center).T
    return np.square(np.abs(stft_t))
# 超参数个数:5
# input shape: [time, n_freqs]
# return shape: [time, n_mels]
def _power_spec2power_mel(power_spec, sr, n_fft, num_mels, fmin, fmax):
    """Project a [time, n_freqs] power spectrogram onto [time, num_mels] mel bands.

    Fixes:
    - librosa.filters.mel is called with keyword arguments (positional sr/n_fft
      were removed in librosa 0.10).
    - the filterbank cache is keyed by its parameters; the old module-level
      cache returned a stale basis when callers used different fmin/fmax
      (the MFCC path uses fmin=0/fmax=None, the mel path 30/7600).
    """
    cache = getattr(_power_spec2power_mel, '_cache', None)
    if cache is None:
        cache = {}
        _power_spec2power_mel._cache = cache
    key = (sr, n_fft, num_mels, fmin, fmax)
    if key not in cache:
        cache[key] = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=num_mels,
                                         fmin=fmin, fmax=fmax)  # [n_mels, 1+n_fft/2]
    power_mel_t = np.dot(cache[key], power_spec.T)  # [n_mels, time]
    return power_mel_t.T
# inv操作
# 超参数个数:5
# input shape: [time, n_mels]
# return shape: [time, n_freqs]
def _power_mel2power_spec(power_mel, sr, n_fft, num_mels, fmin, fmax):
    """Approximate inverse of _power_spec2power_mel: [time, num_mels] -> [time, n_freqs].

    Uses the pseudo-inverse of the mel filterbank.
    Fixes: keyword arguments for librosa.filters.mel (librosa 0.10+) and a
    parameter-keyed cache instead of a single shared module-level matrix.
    """
    cache = getattr(_power_mel2power_spec, '_cache', None)
    if cache is None:
        cache = {}
        _power_mel2power_spec._cache = cache
    key = (sr, n_fft, num_mels, fmin, fmax)
    if key not in cache:
        mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=num_mels,
                                        fmin=fmin, fmax=fmax)  # [n_mels, 1+n_fft/2]
        cache[key] = np.linalg.pinv(mel_basis)
    power_spec_t = np.dot(cache[key], power_mel.T)
    # Floor at a tiny positive value so the later log / Griffin-Lim stay stable.
    power_spec_t = np.maximum(1e-10, power_spec_t)
    return power_spec_t.T
# 超参数个数:1
# returned value: (10. * log10(power_spec) - ref_db)
def _power2db(power_spec, ref_db, tol=1e-5):
return 10. * np.log10(power_spec + tol) - ref_db
# inv操作
# 超参数个数:1
def _db2power(power_db, ref_db):
return np.power(10.0, 0.1 * (power_db + ref_db))
# 超参数个数:1
# return: db normalized to [0., 1.]
def _db_normalize(db, min_db):
return np.clip((db - min_db) / -min_db, 0., 1.)
# inv操作
# 超参数个数:1
def _db_denormalize(normalized_db, min_db):
return np.clip(normalized_db, 0., 1.) * -min_db + min_db
# 超参数个数:6
# input: magnitude spectrogram of shape [time, n_freqs]
# return: waveform array
def _griffin_lim(magnitude_spec, gl_iterations, n_fft, hop_len, win_len, window, center):
    """Griffin-Lim phase reconstruction.

    input: magnitude spectrogram of shape [time, n_freqs] (the caller has
    already raised it to griffin_lim_power); returns a waveform array.

    Fix: `np.complex` (deprecated since NumPy 1.20, removed in 1.24) replaced
    by the builtin `complex`.
    """
    mag = magnitude_spec.T  # [n_freqs, time] -- the layout _stft/_istft use
    # Start from random phase, then iteratively re-estimate it.
    angles = np.exp(2j * np.pi * np.random.rand(*mag.shape))
    complex_mag = np.abs(mag).astype(complex)
    y = _istft(complex_mag * angles, hop_len=hop_len, win_len=win_len, window=window)
    for _i in range(gl_iterations):
        angles = np.exp(1j * np.angle(_stft(y, n_fft=n_fft, hop_len=hop_len, win_len=win_len, window=window, center=center)))
        y = _istft(complex_mag * angles, hop_len=hop_len, win_len=win_len, window=window)
    return y
def _wav2unnormalized_mfcc_test(wav_path, mfcc_path):
    """Compare freshly-extracted MFCCs against a reference .npy dump and plot both."""
    wav_arr = load_wav(wav_path)
    mfcc = wav2unnormalized_mfcc(wav_arr)
    mfcc_label = np.load(mfcc_path)
    # Numeric comparison against the reference features.
    print(mfcc.min(), mfcc_label.min())
    print(mfcc.max(), mfcc_label.max())
    print(mfcc.mean(), mfcc_label.mean())
    print(np.abs(mfcc - mfcc_label))
    print(np.mean(np.abs(mfcc - mfcc_label)))
    # Visual comparison: computed (top) vs reference (bottom).
    plt.figure()
    plt.subplot(211)
    plt.imshow(mfcc.T, origin='lower')
    plt.subplot(212)
    plt.imshow(mfcc_label.T, origin='lower')
    plt.tight_layout()
    plt.show()
    return
def _wav2normalized_db_mel_test(wav_path, wav_rec_path):
    """Round-trip smoke test: wav -> normalized db MEL -> reconstructed wav.

    Fix: this mel test previously called the linear-spectrogram functions
    (and the spec test called the mel ones) -- the two bodies were swapped.
    """
    wav_arr = load_wav(wav_path)
    mel = wav2normalized_db_mel(wav_arr)
    wav_arr_rec = normalized_db_mel2wav(mel)
    write_wav(wav_rec_path, wav_arr_rec)
def _wav2normalized_db_spec_test(wav_path, wav_rec_path):
    """Round-trip smoke test: wav -> normalized db SPEC -> reconstructed wav.

    Fix: this spec test previously called the mel functions (and the mel test
    called the spec ones) -- the two bodies were swapped.
    """
    wav_arr = load_wav(wav_path)
    spec = wav2normalized_db_spec(wav_arr)
    wav_arr_rec = normalized_db_spec2wav(spec)
    write_wav(wav_rec_path, wav_arr_rec)
if __name__ == '__main__':
    # Smoke tests: expect test.wav / test_mfcc.npy in the working directory.
    _wav2unnormalized_mfcc_test('test.wav', 'test_mfcc.npy')
    _wav2normalized_db_mel_test('test.wav', 'test_mel_rec.wav')
    _wav2normalized_db_spec_test('test.wav', 'test_spec_rec.wav')
# Hyperparameter count: 19
# PPG standard vs Rayhane's Tacotron-2 hyperparameters, annotated per key.
hparams = {
    'sample_rate': 16000,  # same
    'preemphasis': 0.97,  # same
    'n_fft': 1024,  # differs: PPG 400; Rayhane 800 -> 1024
    'hop_length': 160,  # differs: PPG 160 -> 10 ms; Rayhane 200 -> 12.5 ms frame shift
    'win_length': 800,  # differs: PPG 400 -> 25 ms; Rayhane 800 -> 50 ms window length
    'num_mels': 80,  # same
    'n_mfcc': 13,  # Rayhane: not used
    'window': 'hann',  # Rayhane: not set, but the default is also hann
    'fmin': 30.,  # differs: PPG 30 Hz; Rayhane 55 (55 for male / 95 for female voices helps remove noise; pitch info: male ~[65, 260], female ~[100, 525])
    'fmax': 7600.,  # same
    'ref_db': 20,  # same for both -- exact meaning still unclear, TODO
    'min_db': -100.0,  # differs: PPG -80.0; Rayhane -100.0; PPG [-80, 0] -> [-70, -9] (eps=1e-5), [-100, 0] -> [-100, -9]
    'griffin_lim_power': 1.5,
    'griffin_lim_iterations': 60,
    'silence_db': -28.0,  # PPG does not trim leading/trailing silence, Rayhane does; ignore for now
    'center': True,  # Rayhane: not set, but the default is also True
    'rescaling_max' : 0.999,  # differs: added by Rayhane
    'magnitude_power' : 2,  # same; PPG works on power spectra by default, so this knob never appears in its code
    'value_scale': 4,  # differs: PPG maps to [0, 1]; Rayhane maps to [-4, 4]
}
# Hyperparameter count: 19
# NOTE(review): exact duplicate of the comparison dict directly above; this
# re-binding is the one visible to the Rayhane-style code below.
hparams = {
    'sample_rate': 16000,  # same
    'preemphasis': 0.97,  # same
    'n_fft': 1024,  # differs: PPG 400; Rayhane 800 -> 1024
    'hop_length': 160,  # differs: PPG 160 -> 10 ms; Rayhane 200 -> 12.5 ms frame shift
    'win_length': 800,  # differs: PPG 400 -> 25 ms; Rayhane 800 -> 50 ms window length
    'num_mels': 80,  # same
    'n_mfcc': 13,  # Rayhane: not used
    'window': 'hann',  # Rayhane: not set, but the default is also hann
    'fmin': 30.,  # differs: PPG 30 Hz; Rayhane 55 (55 for male / 95 for female voices helps remove noise; pitch info: male ~[65, 260], female ~[100, 525])
    'fmax': 7600.,  # same
    'ref_db': 20,  # same for both -- exact meaning still unclear, TODO
    'min_db': -100.0,  # differs: PPG -80.0; Rayhane -100.0; PPG [-80, 0] -> [-70, -9] (eps=1e-5), [-100, 0] -> [-100, -9]
    'griffin_lim_power': 1.5,
    'griffin_lim_iterations': 60,
    'silence_db': -28.0,  # PPG does not trim leading/trailing silence, Rayhane does; ignore for now
    'center': True,  # Rayhane: not set, but the default is also True
    'rescaling_max' : 0.999,  # differs: added by Rayhane
    'magnitude_power' : 2,  # same; PPG works on power spectra by default, so this knob never appears in its code
    'value_scale': 4,  # differs: PPG maps to [0, 1]; Rayhane maps to [-4, 4]
}
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
def load_wav(path, sr):
    """Load `path` resampled to `sr`, returning only the waveform array."""
    wav, _sr = librosa.core.load(path, sr=sr)
    return wav
def save_wav(wav, path, sr):
    """Peak-normalize `wav` into int16 range and write it as a PCM wav.

    Fix: scale into a new array instead of `*=` so the caller's buffer is
    not mutated in place.
    """
    peak = max(0.01, np.max(np.abs(wav)))
    scaled = wav * (32767 / peak)
    # proposed by @dsmiller
    wavfile.write(path, sr, scaled.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
    """Write a float waveform to disk unscaled (WaveNet-style output).

    Fix: librosa.output.write_wav was removed in librosa 0.8; write an IEEE
    float32 wav via scipy instead (same float payload, no rescaling).
    """
    wavfile.write(path, sr, wav.astype(np.float32))
def preemphasis(wav, k, preemphasize=True):
    """First-order pre-emphasis y[n] = x[n] - k*x[n-1]; pass-through when disabled."""
    if not preemphasize:
        return wav
    return signal.lfilter([1, -k], [1], wav)
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Inverse pre-emphasis IIR filter y[n] = x[n] + k*y[n-1]; pass-through when disabled."""
    if not inv_preemphasize:
        return wav
    return signal.lfilter([1], [1, -k], wav)
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
    """Find the first and last indices of `quantized` (mu-law, midpoint 127)
    that fall outside the silence band of +/- silence_threshold around 127.

    NOTE(review): the backward scan stops at index 2, so indices 0 and 1 can
    never be returned as `end` -- kept as-is from the upstream
    r9y9/wavenet_vocoder implementation this was copied from.
    """
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break
    # Both scans must actually have found a non-silent sample.
    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold
    return start, end
def trim_silence(wav, hparams):
    '''Trim leading and trailing silence from `wav`.

    Useful for the M-AILABS dataset if we choose to trim the extra 0.5 s of
    silence at beginning and end.
    '''
    # Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction.
    # The trim_* params are separate from the STFT ones and tunable per dataset.
    return librosa.effects.trim(wav, top_db=hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0]
def get_hop_size(hparams):
    """Resolve the hop size in samples, deriving it from frame_shift_ms when unset."""
    if hparams.hop_size is not None:
        return hparams.hop_size
    assert hparams.frame_shift_ms is not None
    return int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
def linearspectrogram(wav, hparams):
    """wav -> db-scale linear spectrogram, normalized when hparams.signal_normalization is set."""
    stft_out = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    db_spec = _amp_to_db(np.abs(stft_out), hparams) - hparams.ref_level_db
    if not hparams.signal_normalization:
        return db_spec
    return _normalize(db_spec, hparams)
def melspectrogram(wav, hparams):
    """wav -> db-scale mel spectrogram, normalized when hparams.signal_normalization is set."""
    stft_out = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    db_mel = _amp_to_db(_linear_to_mel(np.abs(stft_out), hparams), hparams) - hparams.ref_level_db
    if not hparams.signal_normalization:
        return db_mel
    return _normalize(db_mel, hparams)
def inv_linear_spectrogram(linear_spectrogram, hparams):
    '''Convert a (possibly normalized) db linear spectrogram back to a waveform.

    Uses lws phase estimation when hparams.use_lws is set, Griffin-Lim otherwise.
    '''
    if hparams.signal_normalization:
        D = _denormalize(linear_spectrogram, hparams)
    else:
        D = linear_spectrogram
    S = _db_to_amp(D + hparams.ref_level_db)  # Convert back to linear amplitude
    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
    '''Convert a (possibly normalized) db mel spectrogram back to a waveform.

    Projects back to the linear spectrum via the pseudo-inverse mel basis, then
    uses lws phase estimation when hparams.use_lws is set, Griffin-Lim otherwise.
    '''
    if hparams.signal_normalization:
        D = _denormalize(mel_spectrogram, hparams)
    else:
        D = mel_spectrogram
    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear
    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
    """Build an lws STFT/phase processor configured from hparams."""
    # Local import: lws is an optional dependency, only needed when hparams.use_lws is set.
    import lws
    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
    '''librosa implementation of Griffin-Lim.
    Based on https://github.com/librosa/librosa/issues/434

    Fix: `np.complex` (deprecated since NumPy 1.20, removed in 1.24) replaced
    by the builtin `complex`.
    '''
    # Start from random phase, then iteratively re-estimate it.
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(complex)
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y
def _stft(y, hparams):
    """STFT via lws (transposed to [freq, time] to match librosa) or librosa."""
    if hparams.use_lws:
        return _lws_processor(hparams).stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
    """Inverse STFT of a complex [freq, time] matrix via librosa."""
    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Number of spectrogram time frames for a signal of `length` samples (lws padding)."""
    pad = fsize - fshift
    padded = length + pad * 2
    base = (padded - fsize) // fshift
    # One extra frame when the length does not divide evenly by the shift.
    return base + 1 if length % fshift == 0 else base + 2
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
    '''Symmetric padding used with librosa's centered STFT.

    Returns a single scalar (used as pad on both sides by np.pad); `x` and
    `fshift` are unused but kept for API parity with pad_lr.
    '''
    return int(fsize // 2)
# Conversions
# Conversions
# NOTE(review): re-binds the cache globals declared earlier in the file; the
# functions below lazily fill them from the Rayhane-style hparams object.
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
    """Apply the (lazily cached) mel filterbank to a linear-magnitude spectrogram."""
    global _mel_basis
    _mel_basis = _build_mel_basis(hparams) if _mel_basis is None else _mel_basis
    return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
    """Approximately invert the mel projection via the (lazily cached) pseudo-inverse basis."""
    global _inv_mel_basis
    _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) if _inv_mel_basis is None else _inv_mel_basis
    # Floor at a tiny positive value to keep downstream log/GL stable.
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
    """Build the [num_mels, 1 + n_fft//2] mel filterbank from hparams.

    Fix: pass sr/n_fft as keywords -- positional use was deprecated in
    librosa 0.9 and removed in 0.10.
    """
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft,
                               n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
def wav2spectrograms(wav, hparams):
    """Process a waveform and generate its mel and linear spectrograms.

    Returns:
        A tuple (audio_data, mel_spectrogram, linear_spectrogram, time_steps,
        mel_frames), or None when the clip exceeds hparams.max_mel_frames and
        hparams.clip_mels_length is set.
    """
    # `wav` is assumed to be in [-1, 1]  -- TODO confirm against callers.
    out = wav
    constant_values = 0.
    # Compute the mel scale spectrogram from the wav.
    mel_spectrogram = melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None
    # Compute the linear scale spectrogram from the wav.
    linear_spectrogram = linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]
    # Sanity check: both spectrograms must cover the same number of frames.
    assert linear_frames == mel_frames
    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram.
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = pad_lr(wav, fft_size, get_hop_size(hparams))
        # Zero-pad the audio signal.
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram.
        pad = librosa_pad_lr(wav, hparams.n_fft, get_hop_size(hparams))
        # Reflect-pad the audio signal (like librosa does, to avoid frame inconsistency).
        out = np.pad(out, pad, mode='reflect')
    assert len(out) >= mel_frames * get_hop_size(hparams)
    # Time resolution adjustment: make the raw audio length a multiple of the
    # hop size so transposed convolution can upsample it exactly.
    out = out[:mel_frames * get_hop_size(hparams)]
    assert len(out) % get_hop_size(hparams) == 0
    time_steps = len(out)
    return (out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames)