- performance optimizations

2026-06-17 14:18:31 +03:00
parent 40f3b34b16
commit 2cb2a93d59
7 changed files with 177 additions and 32 deletions
@@ -265,8 +265,20 @@ PCodec AmrNbCodec::CodecFactory::create()
 AmrNbCodec::AmrNbCodec(const AmrCodecConfig& config)
    :mConfig(config)
 {
-    mEncoderCtx = Encoder_Interface_init(1);
-    mDecoderCtx = Decoder_Interface_init();
+    // Contexts are created lazily (see ensureEncoder/ensureDecoder) - a codec
+    // resolved only for network-MOS metadata never allocates them.
+}
+
+void AmrNbCodec::ensureEncoder()
+{
+    if (!mEncoderCtx)
+        mEncoderCtx = Encoder_Interface_init(1);
+}
+
+void AmrNbCodec::ensureDecoder()
+{
+    if (!mDecoderCtx)
+        mDecoderCtx = Decoder_Interface_init();
 }

 AmrNbCodec::~AmrNbCodec()
@@ -298,6 +310,8 @@ Codec::Info AmrNbCodec::info()

 Codec::EncodeResult AmrNbCodec::encode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureEncoder();
+
    if (input.size_bytes() % pcmLength())
        return {.mEncoded = 0};

@@ -324,6 +338,8 @@ Codec::EncodeResult AmrNbCodec::encode(std::span<const uint8_t> input, std::span
 #define AMR_BITRATE_DTX 15
 Codec::DecodeResult AmrNbCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
    if (mConfig.mOctetAligned)
        return {.mDecoded = 0};

@@ -427,6 +443,8 @@ Codec::DecodeResult AmrNbCodec::decode(std::span<const uint8_t> input, std::span

 size_t AmrNbCodec::plc(int lostFrames, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
    if (output.size_bytes() < lostFrames * pcmLength())
        return 0;

@@ -496,7 +514,14 @@ AmrWbStatistics MT::GAmrWbStatistics;
 AmrWbCodec::AmrWbCodec(const AmrCodecConfig& config)
    :mConfig(config)
 {
-    mDecoderCtx = D_IF_init();
+    // Decoder context is created lazily (see ensureDecoder) - a codec resolved
+    // only for network-MOS metadata never allocates the AMR-WB decoder state.
+}
+
+void AmrWbCodec::ensureDecoder()
+{
+    if (!mDecoderCtx)
+        mDecoderCtx = D_IF_init();
 }

 AmrWbCodec::~AmrWbCodec()
@@ -630,6 +655,8 @@ Codec::DecodeResult AmrWbCodec::decodePlain(std::span<const uint8_t> input, std:

 Codec::DecodeResult AmrWbCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
    if (mConfig.mIuUP)
        return decodeIuup(input, output);
    else
@@ -33,6 +33,13 @@ protected:
    int mPreviousPacketLength = 0;
    size_t mCngCounter = 0;
    size_t mSwitchCounter = 0;
+
+    // opencore-amr encoder/decoder state is allocated lazily on first encode/decode.
+    // Network-MOS-only streams resolve codec metadata (name/samplerate/frame timing)
+    // but never decode, so they must not pay for a context they never use - at scale
+    // this is ~a decoder state (several KB) saved per network-only stream.
+    void ensureEncoder();
+    void ensureDecoder();
 public:
    class CodecFactory: public Factory
    {
@@ -85,6 +92,10 @@ protected:

    int mPreviousPacketLength;

+    // Decoder state is allocated lazily on first decode/plc (see AmrNbCodec) so
+    // network-MOS-only streams never instantiate the AMR-WB decoder.
+    void ensureDecoder();
+
    DecodeResult decodeIuup(std::span<const uint8_t> input, std::span<uint8_t> output);
    DecodeResult decodePlain(std::span<const uint8_t> input, std::span<uint8_t> output);

@@ -184,16 +184,18 @@ std::shared_ptr<RtpBuffer::Packet> RtpBuffer::add(const std::shared_ptr<jrtplib:
    return std::shared_ptr<Packet>();
 }

-RtpBuffer::FetchResult RtpBuffer::fetch()
+void RtpBuffer::trimToHighWater(size_t maxPackets)
 {
    Lock l(mGuard);

-    FetchResult result;
-
-    // See if there is enough information in buffer
    auto total = findTimelength();

-    while (total > mHigh && mPacketList.size() > 1 && 0ms != mHigh)
+    // Drop the oldest packet while either bound is exceeded: the time-based
+    // high-water mark (mHigh, when set) or, if maxPackets != 0, the packet-count
+    // cap. Always keep at least one packet so loss/gap accounting has a reference.
+    while (mPacketList.size() > 1 &&
+           ((0ms != mHigh && total > mHigh) ||
+            (maxPackets != 0 && mPacketList.size() > maxPackets)))
    {
        ICELogMedia( << "Dropping RTP packets from jitter buffer");
        total -= mPacketList.front()->timelength();
@@ -233,6 +235,19 @@ RtpBuffer::FetchResult RtpBuffer::fetch()
        // Increase number in statistics
        mStat.mPacketDropped++;
    }
+}
+
+RtpBuffer::FetchResult RtpBuffer::fetch()
+{
+    Lock l(mGuard);
+
+    FetchResult result;
+
+    // Bound the buffer to the high-water mark before fetching.
+    trimToHighWater();
+
+    // See how much audio is buffered now.
+    auto total = findTimelength();

    if (total < mLow || total == 0ms)
    {
@@ -494,13 +509,13 @@ void AudioReceiver::processDecoded(Audio::DataWindow& output, DecodeOptions opti
 {
    // Write to audio dump if requested
    if (mDecodedDump && mDecodedLength)
-        mDecodedDump->write(mDecodedFrame, mDecodedLength);
+        mDecodedDump->write(mDecodedFrame.data(), mDecodedLength);

    // Resample to target rate
    makeMonoAndResample(options.mResampleToMainRate ? mCodec->samplerate() : 0, mCodec->channels());

    // Send to output
-    output.add(mResampledFrame, mResampledLength);
+    output.add(mResampledFrame.data(), mResampledLength);
 }

 void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::DataWindow& output, DecodeOptions options)
@@ -517,13 +532,13 @@ void AudioReceiver::produceSilence(std::chrono::milliseconds length, Audio::Data
    size_t tail_size = tail * sizeof(int16_t) * mCodec->samplerate() / 1000 * mCodec->channels();
    for (size_t i = 0; i < chunks; i++)
    {
-        memset(mDecodedFrame, 0, chunk_size);
+        memset(mDecodedFrame.data(), 0, chunk_size);
        mDecodedLength = chunk_size;
        processDecoded(output, options);
    }
    if (tail)
    {
-        memset(mDecodedFrame, 0, tail_size);
+        memset(mDecodedFrame.data(), 0, tail_size);
        mDecodedLength = tail_size;
        processDecoded(output, options);
    }
@@ -537,7 +552,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind
        if (options.mSkipDecode)
            mDecodedLength = 0;
        else
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame, false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), 100, mDecodedFrame.data(), false);

        if (mDecodedLength)
            processDecoded(output, options);
@@ -550,7 +565,7 @@ void AudioReceiver::produceCNG(std::chrono::milliseconds length, Audio::DataWind
        if (options.mSkipDecode)
            mDecodedLength = 0;
        else
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast<short*>(mDecodedFrame), false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), tail, reinterpret_cast<short*>(mDecodedFrame.data()), false);

        if (mDecodedLength)
            processDecoded(output, options);
@@ -568,7 +583,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output
        {
            // Synthesize comfort noise. It will be done on AUDIO_SAMPLERATE rate directly to mResampledFrame buffer.
            // Do not forget to send this noise to analysis
-            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast<short*>(mDecodedFrame), false);
+            mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, reinterpret_cast<short*>(mDecodedFrame.data()), false);
        }
        else
            decodePacketTo(output, options, mCngPacket);
@@ -581,14 +596,14 @@ AudioReceiver::DecodeResult AudioReceiver::decodeGapTo(Audio::DataWindow& output
            mDecodedLength = 0;
        else
        {
-            mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame, sizeof mDecodedFrame});
+            mDecodedLength = mCodec->plc(mFrameCount, {(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)});
            if (!mDecodedLength)
            {
                // PLC is not support or failed
                // So substitute the silence
                size_t nr_of_samples = mCodec->frameTime() * mCodec->samplerate() / 1000 * sizeof(short);
                mDecodedLength = nr_of_samples * sizeof(short);
-                memset(mDecodedFrame, 0, mDecodedLength);
+                memset(mDecodedFrame.data(), 0, mDecodedLength);
            }
        }
    }
@@ -660,7 +675,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out
                mCngDecoder.decode3389(rtp.GetPayloadData(), rtp.GetPayloadLength());

                // Emit CNG mLastPacketLength milliseconds
-                mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame, true);
+                mDecodedLength = mCngDecoder.produce(mCodec->samplerate(), mLastPacketTimeLength, (short*)mDecodedFrame.data(), true);
                if (mDecodedLength)
                    processDecoded(output, options);
            }
@@ -696,7 +711,7 @@ AudioReceiver::DecodeResult AudioReceiver::decodePacketTo(Audio::DataWindow& out
                    {
                        // Decode frame by frame
                        auto codecInput = std::span{rtp.GetPayloadData() + i * mCodec->rtpLength(), (size_t)frameLength};
-                        auto codecOutput = std::span{(uint8_t*)mDecodedFrame, sizeof mDecodedFrame};
+                        auto codecOutput = std::span{(uint8_t*)mDecodedFrame.data(), mDecodedFrame.size() * sizeof(int16_t)};
                        auto r = mCodec->decode(codecInput, codecOutput);
                        mDecodedLength = r.mDecoded;
                        if (mDecodedLength > 0)
@@ -798,6 +813,10 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output,
    // ICELogDebug(<< "getAudioTo() for " << options.mElapsed);
    assert (options.mElapsed != 0ms);

+    // First decode on this receiver: allocate the scratch buffers. Network-MOS-only
+    // streams never reach this point, so they never pay for them.
+    ensureDecodeBuffers();
+
    // Increase counter of requested audio
    mRequestedAudio += options.mElapsed;

@@ -876,6 +895,19 @@ AudioReceiver::DecodeResult AudioReceiver::getAudioTo(Audio::DataWindow& output,
    return result;
 }

+void AudioReceiver::ensureDecodeBuffers()
+{
+    // Allocate the decode/convert/resample scratch buffers to full capacity on the
+    // first decode. mDecodedFrame being empty means none are allocated yet; they
+    // are always allocated together, so checking one is enough.
+    if (mDecodedFrame.empty())
+    {
+        mDecodedFrame.resize(MT_MAX_DECODEBUFFER);
+        mConvertedFrame.resize(MT_MAX_DECODEBUFFER * 2);
+        mResampledFrame.resize(MT_MAX_DECODEBUFFER);
+    }
+}
+
 void AudioReceiver::makeMonoAndResample(int rate, int channels)
 {
    // Make mono from stereo - engine works with mono only for now
@@ -883,12 +915,12 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels)
    if (channels != AUDIO_CHANNELS)
    {
        if (channels == 1)
-            mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame, mDecodedLength, mConvertedFrame, mDecodedLength * 2);
+            mConvertedLength = Audio::ChannelConverter::monoToStereo(mDecodedFrame.data(), mDecodedLength, mConvertedFrame.data(), mDecodedLength * 2);
        else
-            mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame, mDecodedLength, mDecodedFrame, mDecodedLength / 2);
+            mDecodedLength = Audio::ChannelConverter::stereoToMono(mDecodedFrame.data(), mDecodedLength, mDecodedFrame.data(), mDecodedLength / 2);
    }

-    void* frames = mConvertedLength ? mConvertedFrame : mDecodedFrame;
+    void* frames = mConvertedLength ? (void*)mConvertedFrame.data() : (void*)mDecodedFrame.data();
    unsigned length = mConvertedLength ? mConvertedLength : mDecodedLength;

    Audio::Resampler* r = nullptr;
@@ -899,13 +931,13 @@ void AudioReceiver::makeMonoAndResample(int rate, int channels)
    case 32000:    r = &mResampler32; break;
    case 48000:    r = &mResampler48; break;
    default:
-        memcpy(mResampledFrame, frames, length);
+        memcpy(mResampledFrame.data(), frames, length);
        mResampledLength = length;
        return;
    }

    size_t processedInput = 0;
-    mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame, r->getDestLength(length));
+    mResampledLength = r->processBuffer(frames, length, processedInput, mResampledFrame.data(), r->getDestLength(length));
    // processedInput result value is ignored - it is always equal to length as internal sample rate is 8/16/32/48K
 }

@@ -20,6 +20,8 @@

 #include <optional>
 #include <chrono>
+#include <vector>
+#include <cstdint>
 using namespace std::chrono_literals;

 namespace MT
@@ -103,7 +105,19 @@ public:
    typedef std::shared_ptr<ResultList> PResultList;

    FetchResult fetch();
-    
+
+    // Drop oldest packets so buffered audio stays within the high-water mark,
+    // recording packet-loss events for any sequence gaps crossed (the same
+    // accounting fetch() performs). Used to bound memory on streams that never
+    // call fetch() - i.e. network-MOS-only streams with audio decode disabled,
+    // which would otherwise retain every packet for the whole call.
+    //
+    // maxPackets, when non-zero, additionally caps the buffer to that many packets
+    // regardless of buffered time. The decode path (fetch()) leaves it 0 so jitter
+    // tolerance stays governed by the time-based high-water mark; the network-only
+    // path passes a small cap since those packets are never decoded.
+    void trimToHighWater(size_t maxPackets = 0);
+
 protected:
    unsigned    mSsrc = 0;
    std::chrono::milliseconds   mHigh = std::chrono::milliseconds(RTP_BUFFER_HIGH),
@@ -240,16 +254,22 @@ protected:
    // Already decoded data that can be retrieved without actual decoding - it may happen because of getAudioTo() may be limited by time interval
    Audio::DataWindow mAvailable;

-    // Temporary buffer to hold decoded data (it is better than allocate data on stack)
-    int16_t mDecodedFrame[MT_MAX_DECODEBUFFER];
+    // Decode/convert/resample scratch buffers. These were inline arrays
+    // (MT_MAX_DECODEBUFFER * {1,2,1} * int16_t = 256 KB total) carried by every
+    // AudioReceiver, hence by every StreamDecoder - including network-MOS-only
+    // streams that never decode. They are now allocated lazily on the first
+    // getAudioTo() call via ensureDecodeBuffers(); non-decoding streams keep them
+    // empty. Once allocated they are sized to full capacity and reused, so decode
+    // behaviour is unchanged.
+    std::vector<int16_t> mDecodedFrame;     // sized to MT_MAX_DECODEBUFFER
    size_t mDecodedLength = 0;

    // Buffer to hold data converted to stereo/mono; there is multiplier 2 as it can be stereo audio
-    int16_t mConvertedFrame[MT_MAX_DECODEBUFFER * 2];
+    std::vector<int16_t> mConvertedFrame;   // sized to MT_MAX_DECODEBUFFER * 2
    size_t mConvertedLength = 0;

    // Buffer to hold data resampled to AUDIO_SAMPLERATE
-    int16_t mResampledFrame[MT_MAX_DECODEBUFFER];
+    std::vector<int16_t> mResampledFrame;   // sized to MT_MAX_DECODEBUFFER
    size_t mResampledLength = 0;

    // Last packet time length
@@ -272,6 +292,12 @@ protected:
    std::chrono::milliseconds mRequestedAudio = 0ms;
    std::chrono::milliseconds mProducedAudio = 0ms;

+    // Lazily allocate the decode/convert/resample scratch buffers (mDecodedFrame,
+    // mConvertedFrame, mResampledFrame) to full capacity on the first decode. A
+    // no-op once allocated. Called at the top of getAudioTo(); network-MOS-only
+    // streams never reach it, so they never pay the 256 KB.
+    void ensureDecodeBuffers();
+
    // Zero rate will make audio mono but resampling will be skipped
    void makeMonoAndResample(int rate, int channels);

@@ -152,6 +152,27 @@ EVSCodec::EVSCodec(const StreamParameters &sp)
 {
 	EVSCodec::sp = sp;

+    // Metadata only - the heavy decoder state is created lazily (ensureDecoder()).
+    mOutputFs = outputFsFromBw(sp.bw);
+}
+
+int EVSCodec::outputFsFromBw(int bw)
+{
+    switch (bw)
+    {
+    case NB:  return 8000;
+    case WB:  return 16000;
+    case SWB: return 32000;
+    case FB:  return 48000;
+    }
+    return 0;
+}
+
+void EVSCodec::ensureDecoder()
+{
+    if (st_dec)
+        return;
+
    if ((st_dec = reinterpret_cast<evs::Decoder_State*>(malloc(sizeof(evs::Decoder_State)))) == nullptr)
        throw std::bad_alloc();

@@ -170,9 +191,9 @@ EVSCodec::~EVSCodec()
 Codec::Info EVSCodec::info() {
    return {
        .mName = MT_EVS_CODECNAME,
-        .mSamplerate = st_dec->output_Fs,
+        .mSamplerate = mOutputFs,
        .mChannels = 1,
-        .mPcmLength = st_dec->output_Fs / 1000 * sp.ptime * 2,
+        .mPcmLength = mOutputFs / 1000 * sp.ptime * 2,
        .mFrameTime = sp.ptime,
        .mRtpLength = 0
    };
@@ -187,6 +208,8 @@ Codec::EncodeResult EVSCodec::encode(std::span<const uint8_t> input, std::span<u

 Codec::DecodeResult EVSCodec::decode(std::span<const uint8_t> input, std::span<uint8_t> output)
 {
+    ensureDecoder();
+
    if (output.size_bytes() < pcmLength())
        return {.mDecoded = 0};

@@ -57,7 +57,21 @@ public:
 private:
    evs::Decoder_State* st_dec = nullptr;
    StreamParameters sp;
+
+    // Output sample rate, derived from the negotiated bandwidth (sp.bw) at
+    // construction. Cached so info()/samplerate()/pcmLength() work for network-MOS
+    // metadata without allocating the (large) EVS decoder state - see ensureDecoder.
+    int mOutputFs = 0;
+
    void initDecoder(const StreamParameters& sp);
+
+    // Allocate + initialize the EVS decoder state lazily on first decode().
+    // Network-MOS-only streams resolve metadata but never decode, so they never
+    // pay for the EVS decoder (Decoder_State + CLDFB/FD-CNG sub-allocations).
+    void ensureDecoder();
+
+    // Maps an EVS bandwidth (NB/WB/SWB/FB) to its output sample rate in Hz.
+    static int outputFsFromBw(int bw);
 };

 } // End of namespace
@@ -44,7 +44,19 @@
 #include "rtptypes.h"
 #include "rtpmemoryobject.h"

-#define RTPSOURCES_HASHSIZE							8317
+// Number of buckets in the per-RTPSession SSRC->source hash table. This is an
+// inline array of pointers in every RTPSources instance (sizeof == hashsize *
+// sizeof(void*)), so it is paid by every RTPSession object regardless of how many
+// sources it actually tracks. The original jrtplib default (8317) targets RTP
+// mixers/conferences that demultiplex thousands of distinct SSRCs on one session;
+// it costs ~65 KB per session. Sevana's per-stream capture sessions carry ~1 SSRC,
+// so a far smaller table is ample - collisions are resolved by linked lists, so a
+// small size only affects lookup cost (negligible at our source counts), never
+// correctness. Overridable at build time for products that genuinely need many
+// sources per session.
+#ifndef RTPSOURCES_HASHSIZE
+#define RTPSOURCES_HASHSIZE							251
+#endif

 namespace jrtplib
 {