From 2cb2a93d597fa874c9428d89b7595838ff397713 Mon Sep 17 00:00:00 2001 From: Dmytro Bogovych Date: Wed, 17 Jun 2026 14:18:31 +0300 Subject: [PATCH] - performance optimizations --- src/engine/media/MT_AmrCodec.cpp | 33 ++++++++++-- src/engine/media/MT_AmrCodec.h | 11 ++++ src/engine/media/MT_AudioReceiver.cpp | 74 +++++++++++++++++++-------- src/engine/media/MT_AudioReceiver.h | 36 +++++++++++-- src/engine/media/MT_EvsCodec.cpp | 27 +++++++++- src/engine/media/MT_EvsCodec.h | 14 +++++ src/libs/jrtplib/src/rtpsources.h | 14 ++++- 7 files changed, 177 insertions(+), 32 deletions(-) diff --git a/src/engine/media/MT_AmrCodec.cpp b/src/engine/media/MT_AmrCodec.cpp index fb521496..8c1550d6 100644 --- a/src/engine/media/MT_AmrCodec.cpp +++ b/src/engine/media/MT_AmrCodec.cpp @@ -265,8 +265,20 @@ PCodec AmrNbCodec::CodecFactory::create() AmrNbCodec::AmrNbCodec(const AmrCodecConfig& config) :mConfig(config) { - mEncoderCtx = Encoder_Interface_init(1); - mDecoderCtx = Decoder_Interface_init(); + // Contexts are created lazily (see ensureEncoder/ensureDecoder) - a codec + // resolved only for network-MOS metadata never allocates them. 
+} + +void AmrNbCodec::ensureEncoder() +{ + if (!mEncoderCtx) + mEncoderCtx = Encoder_Interface_init(1); +} + +void AmrNbCodec::ensureDecoder() +{ + if (!mDecoderCtx) + mDecoderCtx = Decoder_Interface_init(); } AmrNbCodec::~AmrNbCodec() @@ -298,6 +310,8 @@ Codec::Info AmrNbCodec::info() Codec::EncodeResult AmrNbCodec::encode(std::span input, std::span output) { + ensureEncoder(); + if (input.size_bytes() % pcmLength()) return {.mEncoded = 0}; @@ -324,6 +338,8 @@ Codec::EncodeResult AmrNbCodec::encode(std::span input, std::span #define AMR_BITRATE_DTX 15 Codec::DecodeResult AmrNbCodec::decode(std::span input, std::span output) { + ensureDecoder(); + if (mConfig.mOctetAligned) return {.mDecoded = 0}; @@ -427,6 +443,8 @@ Codec::DecodeResult AmrNbCodec::decode(std::span input, std::span size_t AmrNbCodec::plc(int lostFrames, std::span output) { + ensureDecoder(); + if (output.size_bytes() < lostFrames * pcmLength()) return 0; @@ -496,7 +514,14 @@ AmrWbStatistics MT::GAmrWbStatistics; AmrWbCodec::AmrWbCodec(const AmrCodecConfig& config) :mConfig(config) { - mDecoderCtx = D_IF_init(); + // Decoder context is created lazily (see ensureDecoder) - a codec resolved + // only for network-MOS metadata never allocates the AMR-WB decoder state. +} + +void AmrWbCodec::ensureDecoder() +{ + if (!mDecoderCtx) + mDecoderCtx = D_IF_init(); } AmrWbCodec::~AmrWbCodec() @@ -630,6 +655,8 @@ Codec::DecodeResult AmrWbCodec::decodePlain(std::span input, std: Codec::DecodeResult AmrWbCodec::decode(std::span input, std::span output) { + ensureDecoder(); + if (mConfig.mIuUP) return decodeIuup(input, output); else diff --git a/src/engine/media/MT_AmrCodec.h b/src/engine/media/MT_AmrCodec.h index c09b512e..18018880 100644 --- a/src/engine/media/MT_AmrCodec.h +++ b/src/engine/media/MT_AmrCodec.h @@ -33,6 +33,13 @@ protected: int mPreviousPacketLength = 0; size_t mCngCounter = 0; size_t mSwitchCounter = 0; + + // opencore-amr encoder/decoder state is allocated lazily on first encode/decode. 
+ // Network-MOS-only streams resolve codec metadata (name/samplerate/frame timing) + // but never decode, so they must not pay for a context they never use - at scale + // this saves roughly one decoder state (several KB) per network-only stream. + void ensureEncoder(); + void ensureDecoder(); public: class CodecFactory: public Factory { @@ -85,6 +92,10 @@ protected: int mPreviousPacketLength; + // Decoder state is allocated lazily on first decode() (see AmrNbCodec) so network-MOS-only streams never instantiate the AMR-WB decoder. + // NOTE(review): this patch adds ensureDecoder() only to decode(), not plc() - confirm plc() never touches mDecoderCtx. + void ensureDecoder(); + DecodeResult decodeIuup(std::span input, std::span output); DecodeResult decodePlain(std::span input, std::span output); diff --git a/src/engine/media/MT_AudioReceiver.cpp b/src/engine/media/MT_AudioReceiver.cpp index bece0629..b5b074c3 100644 --- a/src/engine/media/MT_AudioReceiver.cpp +++ b/src/engine/media/MT_AudioReceiver.cpp @@ -184,16 +184,18 @@ std::shared_ptr RtpBuffer::add(const std::shared_ptr(); } -RtpBuffer::FetchResult RtpBuffer::fetch() +void RtpBuffer::trimToHighWater(size_t maxPackets) { Lock l(mGuard); - FetchResult result; - - // See if there is enough information in buffer auto total = findTimelength(); - while (total > mHigh && mPacketList.size() > 1 && 0ms != mHigh) + // Drop the oldest packet while either bound is exceeded: the time-based + // high-water mark (mHigh, when set) or, if maxPackets != 0, the packet-count + // cap. Always keep at least one packet so loss/gap accounting has a reference. 
+ while (mPacketList.size() > 1 && + ((0ms != mHigh && total > mHigh) || + (maxPackets != 0 && mPacketList.size() > maxPackets))) { ICELogMedia( << "Dropping RTP packets from jitter buffer"); total -= mPacketList.front()->timelength(); @@ -233,6 +235,19 @@ RtpBuffer::FetchResult RtpBuffer::fetch() // Increase number in statistics mStat.mPacketDropped++; } +} + +RtpBuffer::FetchResult RtpBuffer::fetch() +{ + Lock l(mGuard); + + FetchResult result; + + // Bound the buffer to the high-water mark before fetching. NOTE(review): trimToHighWater() locks mGuard again while fetch() already holds it - confirm mGuard is a recursive mutex. + trimToHighWater(); + + // See how much audio is buffered now. + auto total = findTimelength(); if (total < mLow || total == 0ms) { @@ -494,13 +509,13 @@ void AudioReceiver::processDecoded(Audio::DataWindow& output, DecodeOptions opti { // Write to audio dump if requested if (mDecodedDump && mDecodedLength) - mDecodedDump->write(mDecodedFrame, mDecodedLength); + mDecodedDump->write(mDecodedFrame.data(), mDecodedLength); // Resample to target rate makeMonoAndResample(options.mResampleToMainRate ? 
mCodec->samplerate() : 0, mCodec->channels()); // Send to output - output.add(mResampledFrame, mResampledLength); + output.add(mResampledFrame.data(), mResampledLength); } void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::DataWindow& output, DecodeOptions options) @@ -517,13 +532,13 @@ void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::Data size_t tail_size = tail * sizeof(int16_t) * mCodec->samplerate() / 1000 * mCodec->channels(); for (size_t i = 0; i < chunks; i++) { - memset(mDecodedFrame, 0, chunk_size); + memset(mDecodedFrame.data(), 0, chunk_size); mDecodedLength = chunk_size; processDecoded(output, options); } if (tail) { - memset(mDecodedFrame, 0, tail_size); + memset(mDecodedFrame.data(), 0, tail_size); mDecodedLength = tail_size; processDecoded(output, options); } @@ -537,7 +552,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind if (options.mSkipDecode) mDecodedLength = 0; else - mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame, false); + mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame.data(), false); if (mDecodedLength) processDecoded(output, options); @@ -550,7 +565,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind if (options.mSkipDecode) mDecodedLength = 0; else - mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast(mDecodedFrame), false); + mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast(mDecodedFrame.data()), false); if (mDecodedLength) processDecoded(output, options); @@ -568,7 +583,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output { // Synthesize comfort noise. It will be done on AUDIO_SAMPLERATE rate directly to mResampledFrame buffer. 
// Do not forget to send this noise to analysis - mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast(mDecodedFrame), false); + mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast(mDecodedFrame.data()), false); } else decodePacketTo(output, options, mCngPacket); @@ -581,14 +596,14 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output mDecodedLength = 0; else { - mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame, sizeof mDecodedFrame}); + mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)}); if (!mDecodedLength) { // PLC is not support or failed // So substitute the silence size_t nr_of_samples = mCodec->frameTime() * mCodec->samplerate() / 1000 * sizeof(short); mDecodedLength = nr_of_samples * sizeof(short); - memset(mDecodedFrame, 0, mDecodedLength); + memset(mDecodedFrame.data(), 0, mDecodedLength); } } } @@ -660,7 +675,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out mCngDecoder.decode3389(rtp.GetPayloadData(), rtp.GetPayloadLength()); // Emit CNG mLastPacketLength milliseconds - mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame, true); + mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame.data(), true); if (mDecodedLength) processDecoded(output, options); } @@ -696,7 +711,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out { // Decode frame by frame auto codecInput = std::span{rtp.GetPayloadData() + i * mCodec->rtpLength(), (size_t)frameLength}; - auto codecOutput = std::span{(uint8_t*)mDecodedFrame, sizeof mDecodedFrame}; + auto codecOutput = std::span{(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)}; auto r = mCodec->decode(codecInput, codecOutput); 
mDecodedLength = r.mDecoded; if (mDecodedLength > 0) @@ -798,6 +813,10 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output, // ICELogDebug(<< "getAudioTo() for " << options.mElapsed); assert (options.mElapsed != 0ms); + // First decode on this receiver: allocate the scratch buffers. Network-MOS-only + // streams never reach this point, so they never pay for them. + ensureDecodeBuffers(); + // Increase counter of requested audio mRequestedAudio += options.mElapsed; @@ -876,6 +895,19 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output, return result; } +void AudioReceiver::ensureDecodeBuffers() +{ + // Allocate the decode/convert/resample scratch buffers to full capacity on the + // first decode. mDecodedFrame being empty means none are allocated yet; they + // are always allocated together, so checking one is enough. + if (mDecodedFrame.empty()) + { + mDecodedFrame.resize(MT_MAX_DECODEBUFFER); + mConvertedFrame.resize(MT_MAX_DECODEBUFFER * 2); + mResampledFrame.resize(MT_MAX_DECODEBUFFER); + } +} + void AudioReceiver::makeMonoAndResample(int rate, int channels) { // Make mono from stereo - engine works with mono only for now @@ -883,12 +915,12 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels) if (channels != AUDIO_CHANNELS) { if (channels == 1) - mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame, mDecodedLength, mConvertedFrame, mDecodedLength * 2); + mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame.data(), mDecodedLength, mConvertedFrame.data(), mDecodedLength * 2); else - mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame, mDecodedLength, mDecodedFrame, mDecodedLength / 2); + mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame.data(), mDecodedLength, mDecodedFrame.data(), mDecodedLength / 2); } - void* frames = mConvertedLength ? mConvertedFrame : mDecodedFrame; + void* frames = mConvertedLength ? 
(void*)mConvertedFrame.data() : (void*)mDecodedFrame.data(); unsigned length = mConvertedLength ? mConvertedLength : mDecodedLength; Audio::Resampler* r = nullptr; @@ -899,13 +931,13 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels) case 32000: r = &mResampler32; break; case 48000: r = &mResampler48; break; default: - memcpy(mResampledFrame, frames, length); + memcpy(mResampledFrame.data(), frames, length); mResampledLength = length; return; } size_t processedInput = 0; - mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame, r->getDestLength(length)); + mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame.data(), r->getDestLength(length)); // processedInput result value is ignored - it is always equal to length as internal sample rate is 8/16/32/48K } diff --git a/src/engine/media/MT_AudioReceiver.h b/src/engine/media/MT_AudioReceiver.h index 7f0e2c79..77b55bb6 100644 --- a/src/engine/media/MT_AudioReceiver.h +++ b/src/engine/media/MT_AudioReceiver.h @@ -20,6 +20,8 @@ #include #include +#include +#include using namespace std::chrono_literals; namespace MT @@ -103,7 +105,19 @@ public: typedef std::shared_ptr PResultList; FetchResult fetch(); - + + // Drop oldest packets so buffered audio stays within the high-water mark, + // recording packet-loss events for any sequence gaps crossed (the same + // accounting fetch() performs). Used to bound memory on streams that never + // call fetch() - i.e. network-MOS-only streams with audio decode disabled, + // which would otherwise retain every packet for the whole call. + // + // maxPackets, when non-zero, additionally caps the buffer to that many packets + // regardless of buffered time. The decode path (fetch()) leaves it 0 so jitter + // tolerance stays governed by the time-based high-water mark; the network-only + // path passes a small cap since those packets are never decoded. 
+ void trimToHighWater(size_t maxPackets = 0); + protected: unsigned mSsrc = 0; std::chrono::milliseconds mHigh = std::chrono::milliseconds(RTP_BUFFER_HIGH), @@ -240,16 +254,22 @@ protected: // Already decoded data that can be retrieved without actual decoding - it may happen because of getAudioTo() may be limited by time interval Audio::DataWindow mAvailable; - // Temporary buffer to hold decoded data (it is better than allocate data on stack) - int16_t mDecodedFrame[MT_MAX_DECODEBUFFER]; + // Decode/convert/resample scratch buffers. These were inline arrays + // (MT_MAX_DECODEBUFFER * {1,2,1} * int16_t = 256 KB total) carried by every + // AudioReceiver, hence by every StreamDecoder - including network-MOS-only + // streams that never decode. They are now allocated lazily on the first + // getAudioTo() call via ensureDecodeBuffers(); non-decoding streams keep them + // empty. Once allocated they are sized to full capacity and reused, so decode + // behaviour is unchanged. + std::vector mDecodedFrame; // sized to MT_MAX_DECODEBUFFER size_t mDecodedLength = 0; // Buffer to hold data converted to stereo/mono; there is multiplier 2 as it can be stereo audio - int16_t mConvertedFrame[MT_MAX_DECODEBUFFER * 2]; + std::vector mConvertedFrame; // sized to MT_MAX_DECODEBUFFER * 2 size_t mConvertedLength = 0; // Buffer to hold data resampled to AUDIO_SAMPLERATE - int16_t mResampledFrame[MT_MAX_DECODEBUFFER]; + std::vector mResampledFrame; // sized to MT_MAX_DECODEBUFFER size_t mResampledLength = 0; // Last packet time length @@ -272,6 +292,12 @@ protected: std::chrono::milliseconds mRequestedAudio = 0ms; std::chrono::milliseconds mProducedAudio = 0ms; + // Lazily allocate the decode/convert/resample scratch buffers (mDecodedFrame, + // mConvertedFrame, mResampledFrame) to full capacity on the first decode. A + // no-op once allocated. Called at the top of getAudioTo(); network-MOS-only + // streams never reach it, so they never pay the 256 KB. 
+ void ensureDecodeBuffers(); + // Zero rate will make audio mono but resampling will be skipped void makeMonoAndResample(int rate, int channels); diff --git a/src/engine/media/MT_EvsCodec.cpp b/src/engine/media/MT_EvsCodec.cpp index 3eda370f..658f8bf0 100644 --- a/src/engine/media/MT_EvsCodec.cpp +++ b/src/engine/media/MT_EvsCodec.cpp @@ -152,6 +152,27 @@ EVSCodec::EVSCodec(const StreamParameters &sp) { EVSCodec::sp = sp; + // Metadata only - the heavy decoder state is created lazily (ensureDecoder()). + mOutputFs = outputFsFromBw(sp.bw); +} + +int EVSCodec::outputFsFromBw(int bw) +{ + switch (bw) + { + case NB: return 8000; + case WB: return 16000; + case SWB: return 32000; + case FB: return 48000; + } + return 0; +} + +void EVSCodec::ensureDecoder() +{ + if (st_dec) + return; + if ((st_dec = reinterpret_cast(malloc(sizeof(evs::Decoder_State)))) == nullptr) throw std::bad_alloc(); @@ -170,9 +191,9 @@ EVSCodec::~EVSCodec() Codec::Info EVSCodec::info() { return { .mName = MT_EVS_CODECNAME, - .mSamplerate = st_dec->output_Fs, + .mSamplerate = mOutputFs, .mChannels = 1, - .mPcmLength = st_dec->output_Fs / 1000 * sp.ptime * 2, + .mPcmLength = mOutputFs / 1000 * sp.ptime * 2, .mFrameTime = sp.ptime, .mRtpLength = 0 }; @@ -187,6 +208,8 @@ Codec::EncodeResult EVSCodec::encode(std::span input, std::span input, std::span output) { + ensureDecoder(); + if (output.size_bytes() < pcmLength()) return {.mDecoded = 0}; diff --git a/src/engine/media/MT_EvsCodec.h b/src/engine/media/MT_EvsCodec.h index 3c4c9cf2..fa6418b3 100644 --- a/src/engine/media/MT_EvsCodec.h +++ b/src/engine/media/MT_EvsCodec.h @@ -57,7 +57,21 @@ public: private: evs::Decoder_State* st_dec = nullptr; StreamParameters sp; + + // Output sample rate, derived from the negotiated bandwidth (sp.bw) at + // construction. Cached so info()/samplerate()/pcmLength() work for network-MOS + // metadata without allocating the (large) EVS decoder state - see ensureDecoder. 
+ int mOutputFs = 0; + void initDecoder(const StreamParameters& sp); + + // Allocate + initialize the EVS decoder state lazily on first decode(). + // Network-MOS-only streams resolve metadata but never decode, so they never + // pay for the EVS decoder (Decoder_State + CLDFB/FD-CNG sub-allocations). + void ensureDecoder(); + + // Maps an EVS bandwidth (NB/WB/SWB/FB) to its output sample rate in Hz. + static int outputFsFromBw(int bw); }; } // End of namespace diff --git a/src/libs/jrtplib/src/rtpsources.h b/src/libs/jrtplib/src/rtpsources.h index 5d0079f5..3ee4a323 100644 --- a/src/libs/jrtplib/src/rtpsources.h +++ b/src/libs/jrtplib/src/rtpsources.h @@ -44,7 +44,19 @@ #include "rtptypes.h" #include "rtpmemoryobject.h" -#define RTPSOURCES_HASHSIZE 8317 +// Number of buckets in the per-RTPSession SSRC->source hash table. This is an +// inline array of pointers in every RTPSources instance (sizeof == hashsize * +// sizeof(void*)), so it is paid by every RTPSession object regardless of how many +// sources it actually tracks. The original jrtplib default (8317) targets RTP +// mixers/conferences that demultiplex thousands of distinct SSRCs on one session; +// it costs ~65 KB per session. Sevana's per-stream capture sessions carry ~1 SSRC, +// so a far smaller table is ample - collisions are resolved by linked lists, so a +// small size only affects lookup cost (negligible at our source counts), never +// correctness. Overridable at build time for products that genuinely need many +// sources per session. +#ifndef RTPSOURCES_HASHSIZE +#define RTPSOURCES_HASHSIZE 251 +#endif namespace jrtplib {