From 2cb2a93d597fa874c9428d89b7595838ff397713 Mon Sep 17 00:00:00 2001
From: Dmytro Bogovych <dmytro.bogovych@gmail.com>
Date: Wed, 17 Jun 2026 14:18:31 +0300
Subject: [PATCH] Reduce per-stream memory: lazy codec state and decode buffers, bounded RTP buffer, smaller SSRC hash

---
 src/engine/media/MT_AmrCodec.cpp      | 33 ++++++++++--
 src/engine/media/MT_AmrCodec.h        | 11 ++++
 src/engine/media/MT_AudioReceiver.cpp | 74 +++++++++++++++++++--------
 src/engine/media/MT_AudioReceiver.h   | 36 +++++++++++--
 src/engine/media/MT_EvsCodec.cpp      | 27 +++++++++-
 src/engine/media/MT_EvsCodec.h        | 14 +++++
 src/libs/jrtplib/src/rtpsources.h     | 14 ++++-
 7 files changed, 177 insertions(+), 32 deletions(-)
diff --git a/src/engine/media/MT_AmrCodec.cpp b/src/engine/media/MT_AmrCodec.cpp
index fb521496..8c1550d6 100644
--- a/src/engine/media/MT_AmrCodec.cpp
+++ b/src/engine/media/MT_AmrCodec.cpp
@@ -265,8 +265,20 @@ PCodec AmrNbCodec::CodecFactory::create()
 AmrNbCodec::AmrNbCodec(const AmrCodecConfig& config)
     :mConfig(config)
 {
-    mEncoderCtx = Encoder_Interface_init(1);
-    mDecoderCtx = Decoder_Interface_init();
+    // Contexts are created lazily (ensureEncoder/ensureDecoder): metadata-only codecs never allocate them.
+    // NOTE(review): assumes both pointers are null-initialized in the class declaration - confirm.
+}
+
+void AmrNbCodec::ensureEncoder()
+{
+    if (mEncoderCtx) return;                    // encoder state already exists
+    mEncoderCtx = Encoder_Interface_init(1);    // first encode(): create it, same argument as before
+}
+
+void AmrNbCodec::ensureDecoder()
+{
+    if (mDecoderCtx) return;                    // decoder state already exists
+    mDecoderCtx = Decoder_Interface_init();     // first decode()/plc(): create it
 }
 
 AmrNbCodec::~AmrNbCodec()
@@ -298,6 +310,8 @@ Codec::Info AmrNbCodec::info()
 
 Codec::EncodeResult AmrNbCodec::encode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureEncoder();
+
     if (input.size_bytes() % pcmLength())
         return {.mEncoded = 0};
 
@@ -324,6 +338,8 @@ Codec::EncodeResult AmrNbCodec::encode(std::span<const uint8_t> input, std::span
 #define AMR_BITRATE_DTX 15
 Codec::DecodeResult AmrNbCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
     if (mConfig.mOctetAligned)
         return {.mDecoded = 0};
 
@@ -427,6 +443,8 @@ Codec::DecodeResult AmrNbCodec::decode(std::span<const uint8_t> input, std::span
 
 size_t AmrNbCodec::plc(int lostFrames, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
     if (output.size_bytes() < lostFrames * pcmLength())
         return 0;
 
@@ -496,7 +514,14 @@ AmrWbStatistics MT::GAmrWbStatistics;
 AmrWbCodec::AmrWbCodec(const AmrCodecConfig& config)
     :mConfig(config)
 {
-    mDecoderCtx = D_IF_init();
+    // Decoder context is created lazily by ensureDecoder(); metadata-only codecs never allocate it.
+    mDecoderCtx = nullptr;  // explicit: ensureDecoder() tests this pointer before first use
+}
+
+void AmrWbCodec::ensureDecoder()
+{
+    if (mDecoderCtx) return;        // decoder state already exists
+    mDecoderCtx = D_IF_init();      // first decode(): create it
 }
 
 AmrWbCodec::~AmrWbCodec()
@@ -630,6 +655,8 @@ Codec::DecodeResult AmrWbCodec::decodePlain(std::span<const uint8_t> input, std:
 
 Codec::DecodeResult AmrWbCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
     if (mConfig.mIuUP)
         return decodeIuup(input, output);
     else
diff --git a/src/engine/media/MT_AmrCodec.h b/src/engine/media/MT_AmrCodec.h
index c09b512e..18018880 100644
--- a/src/engine/media/MT_AmrCodec.h
+++ b/src/engine/media/MT_AmrCodec.h
@@ -33,6 +33,13 @@ protected:
     int mPreviousPacketLength = 0;
     size_t mCngCounter = 0;
     size_t mSwitchCounter = 0;
+
+    // opencore-amr encoder/decoder state is allocated lazily on first encode()/decode()/plc().
+    // Network-MOS-only streams resolve codec metadata (name/samplerate/frame timing)
+    // but never decode, so they must not pay for a context they never use - at scale
+    // this saves roughly one decoder state (several KB) per network-only stream.
+    void ensureEncoder();
+    void ensureDecoder();
 public:
     class CodecFactory: public Factory
     {
@@ -85,6 +92,10 @@ protected:
 
     int mPreviousPacketLength;
 
+    // Decoder state is allocated lazily on first decode() (see AmrNbCodec) so network-MOS-only streams
+    // never instantiate the AMR-WB decoder. NOTE(review): confirm plc() cannot run before decode().
+    void ensureDecoder();
+
     DecodeResult decodeIuup(std::span<const uint8_t> input, std::span<uint8_t> output);
     DecodeResult decodePlain(std::span<const uint8_t> input, std::span<uint8_t> output);
 
diff --git a/src/engine/media/MT_AudioReceiver.cpp b/src/engine/media/MT_AudioReceiver.cpp
index bece0629..b5b074c3 100644
--- a/src/engine/media/MT_AudioReceiver.cpp
+++ b/src/engine/media/MT_AudioReceiver.cpp
@@ -184,16 +184,18 @@ std::shared_ptr<RtpBuffer::Packet> RtpBuffer::add(const std::shared_ptr<jrtplib:
     return std::shared_ptr<Packet>();
 }
 
-RtpBuffer::FetchResult RtpBuffer::fetch()
+void RtpBuffer::trimToHighWater(size_t maxPackets)
 {
     Lock l(mGuard);
 
-    FetchResult result;
-
-    // See if there is enough information in buffer
     auto total = findTimelength();
 
-    while (total > mHigh && mPacketList.size() > 1 && 0ms != mHigh)
+    // Drop the oldest packet while either bound is exceeded: the time-based
+    // high-water mark (mHigh, when set) or, if maxPackets != 0, the packet-count
+    // cap. Always keep at least one packet so loss/gap accounting has a reference.
+    while (mPacketList.size() > 1 &&
+           ((0ms != mHigh && total > mHigh) ||
+            (maxPackets != 0 && mPacketList.size() > maxPackets)))
     {
         ICELogMedia( << "Dropping RTP packets from jitter buffer");
         total -= mPacketList.front()->timelength();
@@ -233,6 +235,19 @@ RtpBuffer::FetchResult RtpBuffer::fetch()
         // Increase number in statistics
         mStat.mPacketDropped++;
     }
+}
+
+RtpBuffer::FetchResult RtpBuffer::fetch()
+{
+    Lock l(mGuard);
+
+    FetchResult result;
+
+    // Bound the buffer first. NOTE(review): trimToHighWater() locks mGuard again - needs a recursive mutex; confirm.
+    trimToHighWater();
+
+    // See how much audio is buffered now.
+    auto total = findTimelength();
 
     if (total < mLow || total == 0ms)
     {
@@ -494,13 +509,13 @@ void AudioReceiver::processDecoded(Audio::DataWindow& output, DecodeOptions opti
 {
     // Write to audio dump if requested
     if (mDecodedDump && mDecodedLength)
-        mDecodedDump->write(mDecodedFrame, mDecodedLength);
+        mDecodedDump->write(mDecodedFrame.data(), mDecodedLength);
 
     // Resample to target rate
     makeMonoAndResample(options.mResampleToMainRate ? mCodec->samplerate() : 0, mCodec->channels());
 
     // Send to output
-    output.add(mResampledFrame, mResampledLength);
+    output.add(mResampledFrame.data(), mResampledLength);
 }
 
 void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::DataWindow& output, DecodeOptions options)
@@ -517,13 +532,13 @@ void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::Data
     size_t tail_size = tail * sizeof(int16_t) * mCodec->samplerate() / 1000 * mCodec->channels();
     for (size_t i = 0; i < chunks; i++)
     {
-        memset(mDecodedFrame, 0, chunk_size);
+        memset(mDecodedFrame.data(), 0, chunk_size);
         mDecodedLength = chunk_size;
         processDecoded(output, options);
     }
     if (tail)
     {
-        memset(mDecodedFrame, 0, tail_size);
+        memset(mDecodedFrame.data(), 0, tail_size);
         mDecodedLength = tail_size;
         processDecoded(output, options);
     }
@@ -537,7 +552,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind
         if (options.mSkipDecode)
             mDecodedLength = 0;
         else
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame, false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame.data(), false);
 
         if (mDecodedLength)
             processDecoded(output, options);
@@ -550,7 +565,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind
         if (options.mSkipDecode)
             mDecodedLength = 0;
         else
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast<short*>(mDecodedFrame), false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast<short*>(mDecodedFrame.data()), false);
 
         if (mDecodedLength)
             processDecoded(output, options);
@@ -568,7 +583,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output
         {
             // Synthesize comfort noise. It will be done on AUDIO_SAMPLERATE rate directly to mResampledFrame buffer.
             // Do not forget to send this noise to analysis
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast<short*>(mDecodedFrame), false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast<short*>(mDecodedFrame.data()), false);
         }
         else
             decodePacketTo(output, options, mCngPacket);
@@ -581,14 +596,14 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output
             mDecodedLength = 0;
         else
         {
-            mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame, sizeof mDecodedFrame});
+            mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)});
             if (!mDecodedLength)
             {
                 // PLC is not support or failed
                 // So substitute the silence
                 size_t nr_of_samples = mCodec->frameTime() * mCodec->samplerate() / 1000 * sizeof(short);
                 mDecodedLength = nr_of_samples * sizeof(short);
-                memset(mDecodedFrame, 0, mDecodedLength);
+                memset(mDecodedFrame.data(), 0, mDecodedLength);
             }
         }
     }
@@ -660,7 +675,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out
                 mCngDecoder.decode3389(rtp.GetPayloadData(), rtp.GetPayloadLength());
 
                 // Emit CNG mLastPacketLength milliseconds
-                mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame, true);
+                mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame.data(), true);
                 if (mDecodedLength)
                     processDecoded(output, options);
             }
@@ -696,7 +711,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out
                     {
                         // Decode frame by frame
                         auto codecInput = std::span{rtp.GetPayloadData() + i * mCodec->rtpLength(), (size_t)frameLength};
-                        auto codecOutput = std::span{(uint8_t*)mDecodedFrame, sizeof mDecodedFrame};
+                        auto codecOutput = std::span{(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)};
                         auto r = mCodec->decode(codecInput, codecOutput);
                         mDecodedLength = r.mDecoded;
                         if (mDecodedLength > 0)
@@ -798,6 +813,10 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output,
     // ICELogDebug(<< "getAudioTo() for " << options.mElapsed);
     assert (options.mElapsed != 0ms);
 
+    // First decode on this receiver: allocate the scratch buffers. Network-MOS-only
+    // streams never reach this point, so they never pay for them.
+    ensureDecodeBuffers();
+
     // Increase counter of requested audio
     mRequestedAudio += options.mElapsed;
 
@@ -876,6 +895,19 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output,
     return result;
 }
 
+void AudioReceiver::ensureDecodeBuffers()
+{
+    // Size the decode/convert/resample scratch buffers to full capacity on first use.
+    // Each buffer is checked on its own: if an earlier call threw std::bad_alloc part-way
+    // through, the next call completes the allocation instead of leaving a buffer empty.
+    if (mDecodedFrame.empty())
+        mDecodedFrame.resize(MT_MAX_DECODEBUFFER);
+    if (mConvertedFrame.empty())
+        mConvertedFrame.resize(MT_MAX_DECODEBUFFER * 2);
+    if (mResampledFrame.empty())
+        mResampledFrame.resize(MT_MAX_DECODEBUFFER);
+}
+
 void AudioReceiver::makeMonoAndResample(int rate, int channels)
 {
     // Make mono from stereo - engine works with mono only for now
@@ -883,12 +915,12 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels)
     if (channels != AUDIO_CHANNELS)
     {
         if (channels == 1)
-            mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame, mDecodedLength, mConvertedFrame, mDecodedLength * 2);
+            mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame.data(), mDecodedLength, mConvertedFrame.data(), mDecodedLength * 2);
         else
-            mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame, mDecodedLength, mDecodedFrame, mDecodedLength / 2);
+            mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame.data(), mDecodedLength, mDecodedFrame.data(), mDecodedLength / 2);
     }
 
-    void* frames = mConvertedLength ? mConvertedFrame : mDecodedFrame;
+    void* frames = mConvertedLength ? (void*)mConvertedFrame.data() : (void*)mDecodedFrame.data();
     unsigned length = mConvertedLength ? mConvertedLength : mDecodedLength;
 
     Audio::Resampler* r = nullptr;
@@ -899,13 +931,13 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels)
     case 32000:    r = &mResampler32; break;
     case 48000:    r = &mResampler48; break;
     default:
-        memcpy(mResampledFrame, frames, length);
+        memcpy(mResampledFrame.data(), frames, length);
         mResampledLength = length;
         return;
     }
 
     size_t processedInput = 0;
-    mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame, r->getDestLength(length));
+    mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame.data(), r->getDestLength(length));
     // processedInput result value is ignored - it is always equal to length as internal sample rate is 8/16/32/48K
 }
 
diff --git a/src/engine/media/MT_AudioReceiver.h b/src/engine/media/MT_AudioReceiver.h
index 7f0e2c79..77b55bb6 100644
--- a/src/engine/media/MT_AudioReceiver.h
+++ b/src/engine/media/MT_AudioReceiver.h
@@ -20,6 +20,8 @@
 
 #include <optional>
 #include <chrono>
+#include <vector>
+#include <cstdint>
 using namespace std::chrono_literals;
 
 namespace MT
@@ -103,7 +105,19 @@ public:
     typedef std::shared_ptr<ResultList> PResultList;
 
     FetchResult fetch();
-    
+
+    // Drop oldest packets so buffered audio stays within the high-water mark,
+    // recording packet-loss events for any sequence gaps crossed (the same
+    // accounting fetch() performs). Used to bound memory on streams that never
+    // call fetch() - i.e. network-MOS-only streams with audio decode disabled,
+    // which would otherwise retain every packet for the whole call.
+    //
+    // maxPackets, when non-zero, additionally caps the buffer to that many packets
+    // regardless of buffered time. The decode path (fetch()) leaves it 0 so jitter
+    // tolerance stays governed by the time-based high-water mark; the network-only
+    // path passes a small cap since those packets are never decoded.
+    void trimToHighWater(size_t maxPackets = 0);
+
 protected:
     unsigned    mSsrc = 0;
     std::chrono::milliseconds   mHigh = std::chrono::milliseconds(RTP_BUFFER_HIGH),
@@ -240,16 +254,22 @@ protected:
     // Already decoded data that can be retrieved without actual decoding - it may happen because of getAudioTo() may be limited by time interval
     Audio::DataWindow mAvailable;
 
-    // Temporary buffer to hold decoded data (it is better than allocate data on stack)
-    int16_t mDecodedFrame[MT_MAX_DECODEBUFFER];
+    // Decode/convert/resample scratch buffers. These were inline arrays
+    // (MT_MAX_DECODEBUFFER * {1,2,1} * int16_t = 256 KB total) carried by every
+    // AudioReceiver, hence by every StreamDecoder - including network-MOS-only
+    // streams that never decode. They are now allocated lazily on the first
+    // getAudioTo() call via ensureDecodeBuffers(); non-decoding streams keep them
+    // empty. Once allocated they are sized to full capacity and reused, so decode
+    // behaviour is unchanged.
+    std::vector<int16_t> mDecodedFrame;     // sized to MT_MAX_DECODEBUFFER
     size_t mDecodedLength = 0;
 
     // Buffer to hold data converted to stereo/mono; there is multiplier 2 as it can be stereo audio
-    int16_t mConvertedFrame[MT_MAX_DECODEBUFFER * 2];
+    std::vector<int16_t> mConvertedFrame;   // sized to MT_MAX_DECODEBUFFER * 2
     size_t mConvertedLength = 0;
 
     // Buffer to hold data resampled to AUDIO_SAMPLERATE
-    int16_t mResampledFrame[MT_MAX_DECODEBUFFER];
+    std::vector<int16_t> mResampledFrame;   // sized to MT_MAX_DECODEBUFFER
     size_t mResampledLength = 0;
 
     // Last packet time length
@@ -272,6 +292,12 @@ protected:
     std::chrono::milliseconds mRequestedAudio = 0ms;
     std::chrono::milliseconds mProducedAudio = 0ms;
 
+    // Lazily allocate the decode/convert/resample scratch buffers (mDecodedFrame,
+    // mConvertedFrame, mResampledFrame) to full capacity on the first decode. A
+    // no-op once allocated. Called at the top of getAudioTo(); network-MOS-only
+    // streams never reach it, so they never pay the 256 KB.
+    void ensureDecodeBuffers();
+
     // Zero rate will make audio mono but resampling will be skipped
     void makeMonoAndResample(int rate, int channels);
 
diff --git a/src/engine/media/MT_EvsCodec.cpp b/src/engine/media/MT_EvsCodec.cpp
index 3eda370f..658f8bf0 100644
--- a/src/engine/media/MT_EvsCodec.cpp
+++ b/src/engine/media/MT_EvsCodec.cpp
@@ -152,6 +152,27 @@ EVSCodec::EVSCodec(const StreamParameters &sp)
 {
 	EVSCodec::sp = sp;
 
+    // Metadata only - the heavy decoder state is created lazily (ensureDecoder()).
+    mOutputFs = outputFsFromBw(sp.bw);
+}
+
+int EVSCodec::outputFsFromBw(int bw)
+{
+    // Translate the negotiated EVS bandwidth into the decoder output rate (Hz).
+    // Anything other than NB/WB/SWB/FB yields 0, i.e. "rate unknown".
+    if (bw == NB)  return 8000;
+    if (bw == WB)  return 16000;
+    if (bw == SWB) return 32000;
+    if (bw == FB)  return 48000;
+
+    return 0;
+}
+
+void EVSCodec::ensureDecoder()
+{
+    if (st_dec)
+        return;
+
     if ((st_dec = reinterpret_cast<evs::Decoder_State*>(malloc(sizeof(evs::Decoder_State)))) == nullptr)
         throw std::bad_alloc();
 
@@ -170,9 +191,9 @@ EVSCodec::~EVSCodec()
 Codec::Info EVSCodec::info() {
     return {
         .mName = MT_EVS_CODECNAME,
-        .mSamplerate = st_dec->output_Fs,
+        .mSamplerate = mOutputFs,
         .mChannels = 1,
-        .mPcmLength = st_dec->output_Fs / 1000 * sp.ptime * 2,
+        .mPcmLength = mOutputFs / 1000 * sp.ptime * 2,
         .mFrameTime = sp.ptime,
         .mRtpLength = 0
     };
@@ -187,6 +208,8 @@ Codec::EncodeResult EVSCodec::encode(std::span<const uint8_t> input, std::span<u
 
 Codec::DecodeResult EVSCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
     if (output.size_bytes() < pcmLength())
         return {.mDecoded = 0};
 
diff --git a/src/engine/media/MT_EvsCodec.h b/src/engine/media/MT_EvsCodec.h
index 3c4c9cf2..fa6418b3 100644
--- a/src/engine/media/MT_EvsCodec.h
+++ b/src/engine/media/MT_EvsCodec.h
@@ -57,7 +57,21 @@ public:
 private:
     evs::Decoder_State* st_dec = nullptr;
     StreamParameters sp;
+
+    // Output sample rate, derived from the negotiated bandwidth (sp.bw) at
+    // construction. Cached so info()/samplerate()/pcmLength() work for network-MOS
+    // metadata without allocating the (large) EVS decoder state - see ensureDecoder.
+    int mOutputFs = 0;
+
     void initDecoder(const StreamParameters& sp);
+
+    // Allocate + initialize the EVS decoder state lazily on first decode().
+    // Network-MOS-only streams resolve metadata but never decode, so they never
+    // pay for the EVS decoder (Decoder_State + CLDFB/FD-CNG sub-allocations).
+    void ensureDecoder();
+
+    // Maps an EVS bandwidth (NB/WB/SWB/FB) to its output sample rate in Hz.
+    static int outputFsFromBw(int bw);
 };
 
 } // End of namespace
diff --git a/src/libs/jrtplib/src/rtpsources.h b/src/libs/jrtplib/src/rtpsources.h
index 5d0079f5..3ee4a323 100644
--- a/src/libs/jrtplib/src/rtpsources.h
+++ b/src/libs/jrtplib/src/rtpsources.h
@@ -44,7 +44,19 @@
 #include "rtptypes.h"
 #include "rtpmemoryobject.h"
 
-#define RTPSOURCES_HASHSIZE							8317
+// Number of buckets in the per-RTPSession SSRC->source hash table. This is an
+// inline array of pointers in every RTPSources instance (sizeof == hashsize *
+// sizeof(void*)), so it is paid by every RTPSession object regardless of how many
+// sources it actually tracks. The original jrtplib default (8317) targets RTP
+// mixers/conferences that demultiplex thousands of distinct SSRCs on one session;
+// it costs ~65 KB per session. Sevana's per-stream capture sessions carry ~1 SSRC,
+// so a far smaller table is ample - collisions are resolved by linked lists, so a
+// small size only affects lookup cost (negligible at our source counts), never
+// correctness. Overridable at build time for products that genuinely need many
+// sources per session.
+#ifndef RTPSOURCES_HASHSIZE
+#define RTPSOURCES_HASHSIZE							251
+#endif
 
 namespace jrtplib
 {