rtphone/src/libs/libevs/lib_enc/amr_wb_enc.cpp

544 lines
21 KiB
C++

/*====================================================================================
EVS Codec 3GPP TS26.443 Nov 13, 2018. Version 12.11.0 / 13.7.0 / 14.3.0 / 15.1.0
====================================================================================*/
#include <stdlib.h>
#include <math.h>
#include "options.h"
#include "cnst.h"
#include "rom_enc.h"
#include "rom_com.h"
#include "prot.h"
namespace evs {
/*-------------------------------------------------------------------*
* amr_wb_enc()
*
* AMR-WB encoder: encodes one frame in AMR-WB IO mode.
*
* Per call, the function
*   - forces the core to AMR_WB_CORE with a frame length of L_FRAME and
*     runs the memory updates needed when the previous frame used
*     another core / frame length,
*   - converts the 'short' input to float, zero-pads it up to the input
*     frame length and applies the hp20() HP filter,
*   - resamples to 12.8 kHz, applies pre-emphasis, then runs spectral
*     analysis, VAD, DTX and noise estimation,
*   - runs LP analysis, weighted-signal computation and open-loop
*     pitch analysis,
*   - encodes either a SID / FRAME__NO_DATA frame (CNG path) or a
*     regular frame (ISF quantization + excitation coding),
*   - updates the encoder memories and "last_*" fields for the next call.
*
* All results are written through 'st'; nothing is returned.
* If n_samples is smaller than the input frame length (input_Fs / 50),
* the remainder of the frame is filled with zeros.
*-------------------------------------------------------------------*/
void amr_wb_enc(
Encoder_State *st, /* i/o: encoder state structure */
const short *input_sp, /* i : input signal */
const short n_samples /* i : number of input samples */
)
{
short i, delay, harm_flag;
float old_inp[L_INP_12k8], *new_inp, *inp; /* buffer of old input signal */
float old_inp_16k[L_INP_12k8+L_SUBFR], *inp_16k, *new_inp_16k;/* buffer of old input signal @16kHz*/
float old_exc[L_EXC], *exc; /* excitation signal buffer */
float old_wsp[L_WSP], *wsp; /* weighted input signal buffer */
short input_frame; /* frame length at input sampling freq. */
float fr_bands[2*NB_BANDS]; /* energy in frequency bands */
float lf_E[2*VOIC_BINS]; /* per bin spectrum energy in lf */
float tmpN[NB_BANDS]; /* temporary noise update */
float tmpE[NB_BANDS], PS[L_FFT/2]; /* temporary averaged energy of 2 sf. */
float corr_shift; /* correlation shift */
float relE; /* frame relative energy */
float non_staX, cor_map_sum, sp_div;
short vad_flag; /* value returned by wb_vad() */
short localVAD; /* passed by address to wb_vad() */
float Etot; /* total energy */
float ener; /* residual energy from Levinson-Durbin */
short pitch[3]; /* open-loop pitch values */
float voicing[3]; /* open-loop pitch gains */
float A[NB_SUBFR*(M+1)]; /* A(z) unquantized for the 4 subframes */
float Aw[NB_SUBFR*(M+1)]; /* weighted A(z) unquant. for 4 subframes*/
float epsP[M+1]; /* LP prediction errors */
float isp_new[M]; /* ISPs at the end of the frame */
float isf_new[M]; /* ISFs at the end of the frame */
float isp_tmp[M]; /* scratch vector for the lsp -> isp conversion */
float Aq[NB_SUBFR*(M+1)]; /* A(z) quantized for the 4 subframes */
float syn[L_FRAME]; /* synthesis vector */
float res[L_FRAME]; /* residual signal for FER protection */
float exc2[L_FRAME]; /* enhanced excitation */
float pitch_buf[NB_SUBFR]; /* floating pitch for each subframe */
float dummy_buf[L_FRAME32k]; /* dummy buffer - no usage */
float snr_sum_he; /* passed by address to wb_vad() */
short allow_cn_step; /* passed by address to CNG_enc(), then by value to CNG_exc() */
short localVAD_HE_SAD; /* passed by address to wb_vad(), then to long_enr() */
short tmps; /* scratch short */
short vad_flag_dtx; /* value returned by dtx_hangover_addition() */
short vad_hover_flag; /* passed by address to dtx_hangover_addition(); not read in this function */
short coder_type; /* GENERIC, or INACTIVE when vad_flag == 0 */
short hf_gain[NB_SUBFR]; /* zeroed below, then passed to encod_amr_wb() */
short high_lpn_flag; /* always 0 here; passed to long_enr() */
float lp_bckr, hp_bckr;
float q_env[NUM_ENV_CNG]; /* shared between CNG_enc() and CNG_exc() */
short sid_bw = 0;
float exc3[L_FRAME]; /* passed to CNG_exc() */
float fft_buff[2*L_FFT]; /* passed to analy_sp() */
float sp_floor; /* passed by address to noise_est() */
float tmp; /* scaling factor for the old pitch values */
/*------------------------------------------------------------------*
* Initialization
*------------------------------------------------------------------*/
st->L_frame = L_FRAME;
st->gamma = GAMMA1;
st->core = AMR_WB_CORE;
/* the whole bitrate is assigned to the core */
st->core_brate = st->total_brate;
/* start from the bandwidths of the previous frame */
st->input_bwidth = st->last_input_bwidth;
st->bwidth = st->last_bwidth;
st->extl = -1;
coder_type = GENERIC;
input_frame = (short)(st->input_Fs / 50); /* frame length of the input signal */
st->encoderPastSamples_enc = (L_FRAME*9)/16;
st->encoderLookahead_enc = L_LOOK_12k8;
/* BPF is switched off when the previous frame was HQ core or MODE2 */
st->bpf_off = 0;
if( st->last_core == HQ_CORE || st->last_codec_mode == MODE2 )
{
st->bpf_off = 1;
}
st->igf = 0;
/* Updates in case of EVS primary mode -> AMR-WB IO mode switching */
if( st->last_core != AMR_WB_CORE )
{
updt_IO_switch_enc( st, input_frame );
}
/* Updates in case of HQ -> AMR-WB IO switching */
core_switching_pre_enc( st, &(st->LPDmem), NULL, NULL );
set_s( hf_gain, 0, NB_SUBFR );
set_f( old_inp, 0.0f, L_INP_12k8 );
exc = old_exc + L_EXC_MEM; /* pointer to excitation signal in the current frame */
/* restore the saved excitation memory at the start of the local buffer */
mvr2r( st->LPDmem.old_exc, old_exc, L_EXC_MEM );
new_inp = old_inp + L_INP_MEM; /* pointer to new samples of the input signal */
inp = new_inp - L_LOOK_12k8; /* pointer to current frame of input signal */
wsp = old_wsp + L_WSP_MEM; /* pointer to current frame of weighted signal */
/* restore the saved 12.8 kHz input and weighted-signal memories */
mvr2r( st->old_inp_12k8, old_inp, L_INP_MEM );
mvr2r( st->old_wsp, old_wsp, L_WSP_MEM );
new_inp_16k = old_inp_16k + L_INP_MEM; /* pointer to new samples of the input signal in 16kHz core */
inp_16k = new_inp_16k - L_LOOK_16k; /* pointer to the current frame of input signal in 16kHz core */
mvr2r( st->old_inp_16k, old_inp_16k, L_INP_MEM );
/* in case of switching, reset AMR-WB BWE memories */
if( st->total_brate == ACELP_23k85 && st->last_core_brate != ACELP_23k85 )
{
hf_cod_init( st->mem_hp400_enc, st->mem_hf_enc, st->mem_syn_hf_enc, st->mem_hf2_enc, &st->gain_alpha );
}
/*----------------------------------------------------------------*
* set input samples buffer
*----------------------------------------------------------------*/
/* get delay to synchronize ACELP and MDCT frame */
delay = NS2SA(st->input_Fs, DELAY_FIR_RESAMPL_NS);
/* copy input_frame+delay samples, starting 'delay' samples before st->input,
   into old_input_signal; this runs before the new samples are stored in
   st->input below, so it captures the buffer content left by the previous call */
mvr2r( st->input - delay, st->old_input_signal, input_frame+delay );
/*----------------------------------------------------------------*
* Buffering of input signal
* (convert 'short' input data to 'float')
* HP filtering
*----------------------------------------------------------------*/
mvs2r( input_sp, st->input, n_samples );
/* zero-pad when fewer than input_frame samples were supplied */
for( i = n_samples; i < input_frame; i++ )
{
st->input[i] = 0;
}
hp20( st->input, input_frame, st->mem_hp20_in, st->input_Fs );
/*-----------------------------------------------------------------*
* switching from ACELP@16k core to AMR-WB IO mode
*-----------------------------------------------------------------*/
st->rate_switching_reset = 0;
if( st->last_core != AMR_WB_CORE && st->last_L_frame == L_FRAME16k && st->last_core != HQ_CORE)
{
/* in case of switching, do not apply BPF */
st->bpf_off = 1;
/* convert old quantized LSP vector */
st->rate_switching_reset = lsp_convert_poly( st->lsp_old, L_FRAME, 1 );
/* convert old quantized LSF vector */
lsp2lsf( st->lsp_old, st->lsf_old, M, INT_FS_12k8 );
/* Reset LPC mem */
mvr2r( GEWB_Ave, st->mem_AR, M );
set_zero( st->mem_MA, M );
/* update synthesis filter memories */
synth_mem_updt2( L_FRAME, st->last_L_frame, st->LPDmem.old_exc, st->LPDmem.mem_syn_r, st->mem_syn1, st->LPDmem.mem_syn, ENC );
/* reload the local excitation memory, since old_exc in the state was just passed to synth_mem_updt2() */
mvr2r( st->LPDmem.old_exc, old_exc, L_EXC_MEM );
/* propagate mem_syn1 to mem_syn2 and then to mem_syn3 */
mvr2r( st->mem_syn1, st->LPDmem.mem_syn2, M );
mvr2r( st->LPDmem.mem_syn2, st->LPDmem.mem_syn3, M );
/* lsp -> isp */
mvr2r( stable_ISP, isp_tmp, M );
lsp2isp( st->lsp_old, st->lsp_old, isp_tmp, M );
}
/* update buffer of old subframe pitch values */
if( st->last_L_frame != L_FRAME )
{
/* tmp = 12800 / (previous internal sampling rate) */
if( st->last_L_frame == L_FRAME32k )
{
tmp = (float)12800/(float)32000;
}
else if( st->last_L_frame == 512 )
{
tmp = (float)12800/(float)25600;
}
else /* st->last_L_frame == L_FRAME16k */
{
tmp = (float)12800/(float)16000;
}
/* scale the last NB_SUBFR entries of the first group of NB_SUBFR16k values
   and store them one position lower */
for( i=NB_SUBFR16k-NB_SUBFR; i<NB_SUBFR16k; i++ )
{
st->old_pitch_buf[i-1] = tmp * st->old_pitch_buf[i];
}
/* same for the second group of NB_SUBFR16k values, stored two positions lower */
for( i=2*NB_SUBFR16k-NB_SUBFR; i<2*NB_SUBFR16k; i++ )
{
st->old_pitch_buf[i-2] = tmp * st->old_pitch_buf[i];
}
}
/* previous bandwidth was NB and this is not the first frame: force the reset flag */
if( st->last_bwidth == NB && st->ini_frame != 0 )
{
st->rate_switching_reset = 1;
}
/*----------------------------------------------------------------*
* Change the sampling frequency to 12.8 kHz
*----------------------------------------------------------------*/
modify_Fs( st->input, input_frame, st->input_Fs, new_inp, 12800, st->mem_decim, 0 );
/* update signal buffer */
mvr2r( new_inp, st->buf_speech_enc+L_FRAME, L_FRAME );
/*------------------------------------------------------------------*
* Perform fixed preemphasis through 1 - g*z^-1
*-----------------------------------------------------------------*/
preemph( new_inp, PREEMPH_FAC, L_FRAME, &st->mem_preemph);
/*----------------------------------------------------------------*
* Compute spectrum, find energy per critical frequency band
* Track energy and signal dynamics
* Detect NB spectrum in a 16kHz-sampled input
*----------------------------------------------------------------*/
analy_sp( inp, st->Bin_E, st->Bin_E_old, fr_bands, lf_E, &Etot, st->min_band, st->max_band, dummy_buf, PS, fft_buff );
noise_est_pre( Etot, st->ini_frame, &st->Etot_l, &st->Etot_h, &st->Etot_l_lp, &st->Etot_last, &st->Etot_v_h2, &st->sign_dyn_lp, st->harm_cor_cnt, &st->Etot_lp );
/*----------------------------------------------------------------*
* VAD
*----------------------------------------------------------------*/
/* tmps is passed for three pointer arguments; whatever wb_vad() stores there
   is not read in this function (tmps is reassigned before its next use) */
vad_flag = wb_vad( st, fr_bands, &localVAD, &tmps, &tmps, &tmps, &snr_sum_he, &localVAD_HE_SAD, &st->flag_noisy_speech_snr );
if( vad_flag == 0 )
{
coder_type = INACTIVE;
}
/* apply DTX hangover for CNG analysis */
vad_flag_dtx = dtx_hangover_addition( st, localVAD, vad_flag, st->lp_speech-st->lp_noise, 0, &vad_hover_flag );
/*-----------------------------------------------------------------*
* Select SID or FRAME__NO_DATA frame if DTX enabled
*-----------------------------------------------------------------*/
/* fd_cng_reset_flag: set to 1 on the first AMR-WB frame after another core,
   incremented on following frames while it is in 1..9, otherwise cleared */
if ( st->last_core != AMR_WB_CORE )
{
st->fd_cng_reset_flag = 1;
}
else if ( st->fd_cng_reset_flag > 0 && st->fd_cng_reset_flag < 10 )
{
st->fd_cng_reset_flag++;
}
else
{
st->fd_cng_reset_flag = 0;
}
dtx( st, vad_flag_dtx, inp );
/*----------------------------------------------------------------*
* Noise energy down-ward update and total noise energy estimation
* Long-term energies and relative frame energy updates
* Correlation correction as a function of total noise level
*----------------------------------------------------------------*/
noise_est_down( fr_bands, st->bckr, tmpN, tmpE, st->min_band, st->max_band, &st->totalNoise, Etot, &st->Etot_last, &st->Etot_v_h2 );
high_lpn_flag = 0;
long_enr( st, Etot, localVAD_HE_SAD, high_lpn_flag );
relE = Etot - st->lp_speech;
/* low-band background level: bckr[0..9], or bckr[1..9] for NB */
if( st->bwidth != NB )
{
lp_bckr = mean( st->bckr, 10 );
}
else
{
lp_bckr = mean( st->bckr+1, 9 );
}
/* high-band background level: average of the two bands at max_band-1 and max_band */
hp_bckr = 0.5f * (st->bckr[st->max_band-1] + st->bckr[st->max_band]);
/* long-term tilt: 0.9 * previous value + 0.1 * (lp_bckr / hp_bckr) */
/* NOTE(review): no guard against hp_bckr == 0 here; presumably bckr[] is kept
   strictly positive by the noise estimator - confirm */
st->bckr_tilt_lt = 0.9f * st->bckr_tilt_lt + 0.1f * lp_bckr / hp_bckr;
corr_shift = correlation_shift(st->totalNoise);
/*----------------------------------------------------------------*
* WB, SWB and FB bandwidth detector
*----------------------------------------------------------------*/
bw_detect( st, st->input, localVAD, NULL );
/* in AMR_WB IO, limit the maximum band-width to WB */
if( st->bwidth > WB )
{
st->bwidth = WB;
}
/*----------------------------------------------------------------*
* Perform LP analysis
* Compute weighted inp
* Perform open-loop pitch analysis
* Perform 1/4 pitch precision improvement
*----------------------------------------------------------------*/
if ( vad_flag == 0 )
{
/* reset the OL pitch tracker memories during inactive frames */
pitch_ol_init( &st->old_thres, &st->old_pitch, &st->delta_pit, &st->old_corr ) ;
}
/* LP analysis */
analy_lp_AMR_WB( inp, &ener, A, epsP, isp_new, st->lsp_old1, isf_new, st->old_pitch_la, st->old_voicing_la );
/* compute weighted input */
find_wsp( L_FRAME, L_SUBFR, NB_SUBFR, A, Aw, inp, TILT_FAC, wsp, &st->mem_wsp, GAMMA1, L_LOOK_12k8 );
/* open-loop pitch analysis */
pitch_ol( pitch,voicing, &st->old_pitch, &st->old_corr, corr_shift, &st->old_thres,
&st->delta_pit, st->old_wsp2, wsp, st->mem_decim2, relE, L_LOOK_12k8, 0, st->bwidth, 0 );
/* keep the third (last) pitch / voicing value for the LP analysis of the next frame */
st->old_pitch_la = pitch[2];
st->old_voicing_la = voicing[2];
vad_param_updt( st, pitch, voicing, corr_shift, A );
/*------------------------------------------------------------------*
* Update estimated noise energy and voicing cut-off frequency
*-----------------------------------------------------------------*/
noise_est( st, tmpN, pitch, voicing, epsP, Etot, relE, corr_shift, tmpE, fr_bands, &cor_map_sum,
&sp_div, &non_staX, &harm_flag, lf_E, &st->harm_cor_cnt, st->Etot_l_lp, &sp_floor );
/*----------------------------------------------------------------*
* Change the sampling frequency to 16 kHz,
* input@16kHz needed for AMR-WB IO BWE @23.85kbps
*----------------------------------------------------------------*/
/* NOTE(review): for any input_Fs other than 16000, 32000 or 48000 neither
   branch runs, so the samples at new_inp_16k are not written in this function */
if ( st->input_Fs == 16000 )
{
/* no resampling needed, only delay adjustment to account for the FIR resampling delay */
tmps = NS2SA(16000, DELAY_FIR_RESAMPL_NS);
/* first tmps output samples come from the second part of the saved memory */
mvr2r( st->mem_decim16k + tmps, new_inp_16k, tmps );
/* remaining output samples are the start of the current input frame */
mvr2r( st->input, new_inp_16k + tmps, input_frame - tmps );
/* save the last 2*tmps input samples for the next frame */
mvr2r( st->input + input_frame - 2*tmps, st->mem_decim16k, 2*tmps );
}
else if( st->input_Fs == 32000 || st->input_Fs == 48000 )
{
modify_Fs( st->input, input_frame, st->input_Fs, new_inp_16k, 16000, st->mem_decim16k, 0 );
}
/*----------------------------------------------------------------*
* Encoding of SID frames
*----------------------------------------------------------------*/
if ( st->core_brate == SID_1k75 || st->core_brate == FRAME__NO_DATA )
{
/* encode CNG parameters */
CNG_enc( st, L_FRAME, Aq, inp, ener, isp_new, isf_new , &allow_cn_step, st->burst_ho_cnt, q_env, &sid_bw, st->exc_mem2 );
/* comfort noise generation */
CNG_exc( st->core_brate, L_FRAME, &st->Enew, &st->cng_seed, exc, exc2, &st->lp_ener,
st->last_core_brate, &st->first_CNG, &st->cng_ener_seed, dummy_buf, allow_cn_step, &st->last_allow_cn_step, st->num_ho,
q_env, st->lp_env, st->old_env, st->exc_mem, st->exc_mem1, &sid_bw, &st->cng_ener_seed1, exc3, st->Opt_AMR_WB );
/* remember that at least one CNG frame has been produced */
if ( st->first_CNG == 0 )
{
st->first_CNG = 1;
}
/* synthesis */
syn_12k8( L_FRAME, Aq, exc2, dummy_buf, st->LPDmem.mem_syn3, 1 ); /* dummy_buf = temporary buffer to handle syn1[] */
/* reset the encoder */
CNG_reset_enc( st, &(st->LPDmem), pitch_buf, dummy_buf+L_FRAME, 0 );
/* update st->mem_syn1 for ACELP core switching */
mvr2r( st->LPDmem.mem_syn3, st->mem_syn1, M );
/* update ACELP core synthesis filter memory */
mvr2r( st->LPDmem.mem_syn3, st->LPDmem.mem_syn, M );
/* update old synthesis buffer - needed for ACELP internal sampling rate switching */
mvr2r( dummy_buf + L_FRAME - L_SYN_MEM, st->LPDmem.mem_syn_r, L_SYN_MEM );
/* Update MODE2 core switching memory */
deemph( dummy_buf, PREEMPH_FAC, L_FRAME, &(st->LPDmem.syn[M]) );
mvr2r( dummy_buf+L_FRAME-M-1, st->LPDmem.syn, M+1 );
}
/*----------------------------------------------------------------*
* Encoding of all other frames
*----------------------------------------------------------------*/
else
{
/*-----------------------------------------------------------------*
* After inactive period, use the most up-to-date ISPs
*-----------------------------------------------------------------*/
if( st->last_core_brate == FRAME__NO_DATA || st->last_core_brate == SID_1k75 )
{
mvr2r( st->lspCNG, st->lsp_old, M );
isp2isf( st->lspCNG, st->lsf_old, M, INT_FS_12k8 );
/* clear the memory part of the local excitation buffer */
set_f( old_exc, 0, L_EXC_MEM );
}
/*-----------------------------------------------------------------*
* ISF Quantization and interpolation
*-----------------------------------------------------------------*/
isf_enc_amr_wb( st, isf_new, isp_new, Aq, &st->stab_fac );
/*---------------------------------------------------------------*
* Calculation of LP residual (filtering through A[z] filter)
*---------------------------------------------------------------*/
calc_residu( inp, res, Aq, L_FRAME );
/* this frame is not a CNG frame: clear the burst hangover counter */
st->burst_ho_cnt = 0;
/*------------------------------------------------------------*
* Encode excitation
*------------------------------------------------------------*/
encod_amr_wb( st, &(st->LPDmem), inp, Aw, Aq, pitch, voicing, res, syn, exc, exc2, pitch_buf, hf_gain, inp_16k );
/* update st->mem_syn1 for ACELP core switching */
mvr2r( st->LPDmem.mem_syn, st->mem_syn1, M );
/* update old synthesis buffer - needed for ACELP internal sampling rate switching */
mvr2r( syn + L_FRAME - L_SYN_MEM, st->LPDmem.mem_syn_r, L_SYN_MEM );
/* Update MODE2 core switching memory */
/* de-emphasis is run on a copy in dummy_buf so that syn[] itself is left unchanged */
mvr2r( syn, dummy_buf, L_FRAME );
deemph( dummy_buf, PREEMPH_FAC, L_FRAME, &(st->LPDmem.syn[M]) );
mvr2r( dummy_buf+L_FRAME-M-1, st->LPDmem.syn, M+1 );
/*--------------------------------------------------------------------------------------*
* Write VAD information into the bitstream in AMR-WB IO mode
*--------------------------------------------------------------------------------------*/
push_indice( st, IND_VAD_FLAG, vad_flag, 1 );
}
/*-----------------------------------------------------------------*
* Updates
*-----------------------------------------------------------------*/
/* update old weighted speech buffer - for OL pitch analysis */
mvr2r( &old_wsp[L_FRAME], st->old_wsp, L_WSP_MEM );
/* update old input signal buffer */
mvr2r( &old_inp[L_FRAME], st->old_inp_12k8, L_INP_MEM );
/* update old input signal @16kHz buffer */
if( st->input_Fs > 8000 )
{
mvr2r( &old_inp_16k[L_FRAME16k], st->old_inp_16k, L_INP_MEM );
}
/* update of old per-band energy spectrum */
mvr2r( fr_bands + NB_BANDS, st->enrO, NB_BANDS );
/* update the last bandwidth */
st->last_input_bwidth = st->input_bwidth;
st->last_bwidth = st->bwidth;
/* update signal buffers */
mvr2r( new_inp, st->buf_speech_enc_pe+L_FRAME, L_FRAME );
mvr2r( wsp, st->buf_wspeech_enc+L_FRAME+L_SUBFR, L_FRAME + L_LOOK_12k8 );
updt_enc( st, L_FRAME, coder_type, old_exc, pitch_buf, 0, Aq, isf_new, isp_new, dummy_buf );
core_encode_update( st );
/* update main codec parameters */
st->last_extl = -1;
st->last_core = st->core;
st->last_L_frame = L_FRAME;
st->last_core_brate = st->core_brate;
st->last_total_brate = st->total_brate;
st->Etot_last = Etot;
st->last_coder_type_raw = st->coder_type_raw;
st->last_codec_mode = st->codec_mode;
/* Increase the counter of initialization frames */
if( st->ini_frame < MAX_FRAME_COUNTER )
{
(st->ini_frame)++;
}
/* frames with a core bitrate above SID_1k75: remember the bitrate */
if( st->core_brate > SID_1k75 )
{
st->last_active_brate = st->total_brate;
}
/* such frames, once a CNG frame has been produced: update the activity counters */
if ( st->core_brate > SID_1k75 && st->first_CNG )
{
/* act_cnt wraps to 0 when it has reached BUF_DEC_RATE, then counts this frame */
if( st->act_cnt >= BUF_DEC_RATE )
{
st->act_cnt = 0;
}
st->act_cnt++;
/* each time act_cnt reaches BUF_DEC_RATE, shrink ho_hist_size by one (not below 0) */
if( st->act_cnt == BUF_DEC_RATE && st->ho_hist_size > 0 )
{
st->ho_hist_size--;
}
/* act_cnt2 counts such frames and saturates at MIN_ACT_CNG_UPD */
if( ++(st->act_cnt2) >= MIN_ACT_CNG_UPD )
{
st->act_cnt2 = MIN_ACT_CNG_UPD;
}
}
return;
}
} // end of namespace