WebRTC实现背景声音的混流

Posted 2021-02-21 Jax

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了WebRTC实现背景声音的混流相关的知识，希望对你有一定的参考价值。

背景

在Windows上使用WebRTC做音视频采集，然后使用RTMP进行直播推流。默认情况下WebRTC只会采集麦克风的声音，而不会采集机器正在播放的声音（即背景声音）。因此需要自行编码实现背景声音的采集和混音功能。

思路

Windows提供的API中有音频采集的相关方法，官方也给出了简单的说明和示例（虽然示例不能直接运行：））。因此可以通过Windows的API以环回（loopback）方式采集PCM格式的背景声音，然后通过WebRTC的群聊混音机制（AudioConferenceMixer）将其与麦克风声音进行合成。

核心代码

音频采集部分

// Capture-thread entry point for background (system playback) audio.
//
// Opens the default render endpoint as a shared-mode loopback stream with a
// fixed 16-bit stereo PCM format at the endpoint's mix sample rate, then loops
// while keepRecording_ is true: captured frames are accumulated in a local
// sync buffer and handed to _ptrAudioBuffer in blocks of _recBlockSize frames
// (1/100 of a second).
//
// Returns 1 if COM initialization fails or the audio client disappears before
// the loop starts, the HRESULT of InitCaptureThreadPriority() if that fails,
// and otherwise the last HRESULT cast to DWORD.
//
// Locking: lockHeld mirrors whether this thread currently holds _Lock(), so
// every exit path releases the lock exactly once.
DWORD AudioCaptureCore::DoCaptureThread()
{
    // Pause used when the endpoint buffer is empty. Audio is delivered
    // downstream in 10 ms blocks, so polling at half of that keeps latency low
    // without spinning.
    const DWORD kEmptyBufferPollMs = 5;

    keepRecording_ = true;
    HRESULT hr = S_OK;

    LARGE_INTEGER t1;
    LARGE_INTEGER t2;
    int32_t time(0);

    BYTE* syncBuffer = NULL;
    UINT32 syncBufIndex = 0;
    UINT32 syncBufferSize = 0;

    // All locals that live until the Exit label are declared (and initialized)
    // up front so that no "goto Exit" jumps over an initialization.
    bool lockHeld = false;
    REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
    UINT32 bufferLength = 0;
    IMMDeviceEnumerator *pEnumerator = NULL;
    IMMDevice *pDevice = NULL;
    WAVEFORMATEX *pwfx = NULL;
    WAVEFORMATEX waveFormat;
    REFERENCE_TIME latency = 0;
    REFERENCE_TIME devPeriod = 0;
    REFERENCE_TIME devPeriodMin = 0;
    double extraDelayMS = 0.0;
    double endpointBufferSizeMS = 0.0;

    _readSamples = 0;

    // Initialize COM as MTA in this thread.
    ScopedCOMInitializer comInit(ScopedCOMInitializer::kMTA);
    if (!comInit.succeeded()) {
        WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
            "failed to initialize COM in capture thread");
        return 1;
    }

    hr = InitCaptureThreadPriority();
    if (FAILED(hr))
    {
        return hr;
    }

    _Lock();
    lockHeld = true;

    // Open the default *render* endpoint; it is captured in loopback mode below.
    hr = CoCreateInstance(CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL,
        IID_IMMDeviceEnumerator, (void**)&pEnumerator);
    EXIT_ON_ERROR(hr);

    hr = pEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &pDevice);
    EXIT_ON_ERROR(hr);

    hr = pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&_ptrAudioClientIn);
    EXIT_ON_ERROR(hr);

    // The mix format is only queried for its sample rate; everything else is
    // forced to 16-bit stereo PCM.
    hr = _ptrAudioClientIn->GetMixFormat(&pwfx);
    EXIT_ON_ERROR(hr);

    waveFormat.wFormatTag = WAVE_FORMAT_PCM;
    waveFormat.nChannels = 2;
    waveFormat.nSamplesPerSec = pwfx->nSamplesPerSec;
    waveFormat.nAvgBytesPerSec = pwfx->nSamplesPerSec * 4;
    waveFormat.wBitsPerSample = 16;
    waveFormat.nBlockAlign = 4;
    waveFormat.cbSize = 0;

    *pwfx = waveFormat;

    hr = _ptrAudioClientIn->Initialize(AUDCLNT_SHAREMODE_SHARED,
        AUDCLNT_STREAMFLAGS_LOOPBACK,
        hnsRequestedDuration,
        0,
        pwfx,
        NULL);
    EXIT_ON_ERROR(hr);

    // Set the VoE format equal to the AEC output format.
    _recAudioFrameSize = pwfx->nBlockAlign;
    _recSampleRate = pwfx->nSamplesPerSec;
    _recBlockSize = pwfx->nSamplesPerSec / 100;
    _recChannels = pwfx->nChannels;

    if (_ptrAudioBuffer)
    {
        // Update the audio buffer with the selected parameters
        _ptrAudioBuffer->SetRecordingSampleRate(_recSampleRate);
        _ptrAudioBuffer->SetRecordingChannels((uint8_t)_recChannels);
    }
    else
    {
        // We can enter this state during CoreAudioIsSupported() when no AudioDeviceImplementation
        // has been created, hence the AudioDeviceBuffer does not exist.
        // It is OK to end up here since we don't initiate any media in CoreAudioIsSupported().
        WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "AudioDeviceBuffer must be attached before streaming can start");
    }

    hr = _ptrAudioClientIn->GetService(__uuidof(IAudioCaptureClient), (void**)&_ptrCaptureClient);
    EXIT_ON_ERROR(hr);

    // Notify the audio sink which format to use.
    // In the reference sample the negotiated format is passed on to a separate,
    // user-defined sink class here. The format is fixed by hand above, so no
    // notification is needed.
    // hr = pMySink->SetFormat(pwfx);
    // EXIT_ON_ERROR(hr)

    // The enumerator, the endpoint and the format block are only needed during
    // setup; release them now instead of holding them for the thread's lifetime.
    CoTaskMemFree(pwfx);
    pwfx = NULL;
    pDevice->Release();
    pDevice = NULL;
    pEnumerator->Release();
    pEnumerator = NULL;

    // Get size of capturing buffer (length is expressed as the number of audio frames the buffer can hold).
    // This value is fixed during the capturing session.
    //
    if (_ptrAudioClientIn == NULL)
    {
        WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
            "input state has been modified before capture loop starts.");
        // Release the lock and undo the priority change before bailing out.
        _UnLock();
        lockHeld = false;
        RevertCaptureThreadPriority();
        return 1;
    }
    hr = _ptrAudioClientIn->GetBufferSize(&bufferLength);
    EXIT_ON_ERROR(hr);
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] size of buffer       : %u", bufferLength);

    // Allocate memory for sync buffer.
    // It is used for compensation between native 44.1 and internal 44.0 and
    // for cases when the capture buffer is larger than 10ms.
    //
    syncBufferSize = 2 * (bufferLength * _recAudioFrameSize);
    syncBuffer = new BYTE[syncBufferSize];
    if (syncBuffer == NULL)
    {
        // Leave through Exit so the lock and the thread priority are restored;
        // the returned value is still (DWORD)E_POINTER.
        hr = E_POINTER;
        goto Exit;
    }
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] size of sync buffer  : %u [bytes]", syncBufferSize);

    // Get maximum latency for the current stream (will not change for the lifetime of the IAudioClient object).
    //
    _ptrAudioClientIn->GetStreamLatency(&latency);
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] max stream latency   : %u (%3.2f ms)",
        (DWORD)latency, (double)(latency / 10000.0));

    // Get the length of the periodic interval separating successive processing passes by
    // the audio engine on the data in the endpoint buffer.
    //
    _ptrAudioClientIn->GetDevicePeriod(&devPeriod, &devPeriodMin);
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] device period        : %u (%3.2f ms)",
        (DWORD)devPeriod, (double)(devPeriod / 10000.0));

    extraDelayMS = (double)((latency + devPeriod) / 10000.0);
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] extraDelayMS         : %3.2f", extraDelayMS);

    endpointBufferSizeMS = 10.0 * ((double)bufferLength / (double)_recBlockSize);
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id, "[CAPT] endpointBufferSizeMS : %3.2f", endpointBufferSizeMS);

    // Start up the capturing stream.
    //
    hr = _ptrAudioClientIn->Start();
    EXIT_ON_ERROR(hr);

    _UnLock();
    lockHeld = false;

    // Set event which will ensure that the calling thread modifies the recording state to true.
    //
    SetEvent(_hCaptureStartedEvent);

    // >> ---------------------------- THREAD LOOP ----------------------------

    while (keepRecording_)
    {
        BYTE *pData = 0;
        UINT32 framesAvailable = 0;
        DWORD flags = 0;
        UINT64 recTime = 0;
        UINT64 recPos = 0;

        std::cout << "bgm audio capturing" << std::endl;

        _Lock();
        lockHeld = true;

        // Sanity check to ensure that essential states are not modified
        // during the unlocked period.
        if (_ptrCaptureClient == NULL || _ptrAudioClientIn == NULL)
        {
            _UnLock();
            lockHeld = false;
            WEBRTC_TRACE(kTraceCritical, kTraceAudioDevice, _id,
                "input state has been modified during unlocked period");
            goto Exit;
        }

        //  Find out how much capture data is available
        //
        hr = _ptrCaptureClient->GetBuffer(&pData,           // packet which is ready to be read by used
            &framesAvailable, // #frames in the captured packet (can be zero)
            &flags,           // support flags (check)
            &recPos,          // device position of first audio frame in data packet
            &recTime);        // value of performance counter at the time of recording the first audio frame

        if (SUCCEEDED(hr))
        {
            if (AUDCLNT_S_BUFFER_EMPTY == hr)
            {
                // Buffer was empty. Nothing in this loop blocks on a capture
                // event, so pause briefly; without it the loop would spin at
                // full speed, re-taking the lock and printing, until data arrives.
                _UnLock();
                lockHeld = false;
                Sleep(kEmptyBufferPollMs);
                continue;
            }

            if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
            {
                // Treat all of the data in the packet as silence and ignore the actual data values.
                WEBRTC_TRACE(kTraceWarning, kTraceAudioDevice, _id, "AUDCLNT_BUFFERFLAGS_SILENT");
                pData = NULL;
            }

            assert(framesAvailable != 0);

            // Check the destination range before writing to it.
            assert(syncBufferSize >= (syncBufIndex*_recAudioFrameSize) + framesAvailable*_recAudioFrameSize);

            if (pData)
            {
                CopyMemory(&syncBuffer[syncBufIndex*_recAudioFrameSize], pData, framesAvailable*_recAudioFrameSize);
            }
            else
            {
                ZeroMemory(&syncBuffer[syncBufIndex*_recAudioFrameSize], framesAvailable*_recAudioFrameSize);
            }

            // Release the capture buffer
            //
            hr = _ptrCaptureClient->ReleaseBuffer(framesAvailable);
            EXIT_ON_ERROR(hr);

            _readSamples += framesAvailable;
            syncBufIndex += framesAvailable;

            QueryPerformanceCounter(&t1);

            // Get the current recording and playout delay.
            uint32_t sndCardRecDelay = (uint32_t)
                (((((UINT64)t1.QuadPart * _perfCounterFactor) - recTime)
                    / 10000) + (10 * syncBufIndex) / _recBlockSize - 10);
            uint32_t sndCardPlayDelay =
                static_cast<uint32_t>(_sndCardPlayDelay);

            _sndCardRecDelay = sndCardRecDelay;

            // Drain the sync buffer one 10 ms block at a time.
            while (syncBufIndex >= _recBlockSize)
            {
                if (_ptrAudioBuffer)
                {
                    _ptrAudioBuffer->SetRecordedBuffer((const int8_t*)syncBuffer, _recBlockSize);
                    _ptrAudioBuffer->SetVQEData(sndCardPlayDelay,
                        sndCardRecDelay,
                        0);

                    _ptrAudioBuffer->SetTypingStatus(KeyPressed());

                    QueryPerformanceCounter(&t1);    // measure time: START

                    _UnLock();  // release lock while making the callback
                    lockHeld = false;
                    _ptrAudioBuffer->DeliverRecordedData();
                    _Lock();    // restore the lock
                    lockHeld = true;

                    QueryPerformanceCounter(&t2);    // measure time: STOP

                    // Measure "average CPU load".
                    // Basically what we do here is to measure how many percent of our 10ms period
                    // is used for encoding and decoding. This value should be used as a warning indicator
                    // only and not seen as an absolute value. Running at ~100% will lead to bad QoS.
                    time = (int)(t2.QuadPart - t1.QuadPart);
                    _avgCPULoad = (float)(_avgCPULoad*.99 + (time + _playAcc) / (double)(_perfCounterFreq.QuadPart));
                    _playAcc = 0;

                    // Sanity check to ensure that essential states are not modified during the unlocked period
                    if (_ptrCaptureClient == NULL || _ptrAudioClientIn == NULL)
                    {
                        _UnLock();
                        lockHeld = false;
                        WEBRTC_TRACE(kTraceCritical, kTraceAudioDevice, _id, "input state has been modified during unlocked period");
                        goto Exit;
                    }
                }

                // store remaining data which was not able to deliver as 10ms segment
                MoveMemory(&syncBuffer[0], &syncBuffer[_recBlockSize*_recAudioFrameSize], (syncBufIndex - _recBlockSize)*_recAudioFrameSize);
                syncBufIndex -= _recBlockSize;
                sndCardRecDelay -= 10;
            }

            // _ptrAudioBuffer may be NULL (see the setup above), so guard the
            // AGC query the same way as the delivery path.
            if (_AGC && _ptrAudioBuffer)
            {
                uint32_t newMicLevel = _ptrAudioBuffer->NewMicLevel();
                if (newMicLevel != 0)
                {
                    // The VQE will only deliver non-zero microphone levels when a change is needed.
                    // Set this new mic level (received from the observer as return value in the callback).
                    WEBRTC_TRACE(kTraceStream, kTraceAudioDevice, _id, "AGC change of volume: new=%u", newMicLevel);
                    // We store this outside of the audio buffer to avoid
                    // having it overwritten by the getter thread.
                    _newMicLevel = newMicLevel;
                    SetEvent(_hSetCaptureVolumeEvent);
                }
            }
        }
        else
        {
            // If GetBuffer returns AUDCLNT_E_BUFFER_ERROR, the thread consuming the audio samples
            // must wait for the next processing pass. The client might benefit from keeping a count
            // of the failed GetBuffer calls. If GetBuffer returns this error repeatedly, the client
            // can start a new processing loop after shutting down the current client by calling
            // IAudioClient::Stop, IAudioClient::Reset, and releasing the audio client.
            WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
                "IAudioCaptureClient::GetBuffer returned AUDCLNT_E_BUFFER_ERROR, hr = 0x%08X", hr);
            goto Exit;
        }

        _UnLock();
        lockHeld = false;
    }

    // ---------------------------- THREAD LOOP ---------------------------- <<

    if (_ptrAudioClientIn)
    {
        hr = _ptrAudioClientIn->Stop();
    }

Exit:
    if (FAILED(hr))
    {
        // The client pointer is NULL when activation itself failed.
        if (_ptrAudioClientIn)
        {
            _ptrAudioClientIn->Stop();
        }
        // Only unlock if this thread actually holds the lock; a failing Stop()
        // after the loop arrives here with the lock already released.
        if (lockHeld)
        {
            _UnLock();
            lockHeld = false;
        }
        _TraceCOMError(hr);
    }

    // Release whatever setup objects are still alive (failure during setup).
    // CoTaskMemFree accepts NULL.
    CoTaskMemFree(pwfx);
    pwfx = NULL;
    if (pDevice)
    {
        pDevice->Release();
        pDevice = NULL;
    }
    if (pEnumerator)
    {
        pEnumerator->Release();
        pEnumerator = NULL;
    }

    RevertCaptureThreadPriority();

    delete[] syncBuffer;

    return (DWORD)hr;
}

声音合成

创建混音器成员 webrtc::AudioConferenceMixer *audio_mixer_ = nullptr; 当同时启用麦克风和背景声音（即存在多路声音）时，用它进行混音；只有一路声音时则不混音，直接回调输出。

// Receives one block of recorded audio.
//
// When both the microphone and the background sound are enabled, the block is
// forwarded to audio_device_mixer_ptr_ and, if a mixer exists, one mixer
// Process() pass is run. Otherwise the block goes straight to
// audio_record_callback_, resampled first when its sample rate or channel
// count differs from audio_record_sample_hz_ / audio_record_channels_.
//
// The first parameter is named audioSamples to match its uses in the body;
// the previous spelling (audiosamples) left the body referring to an
// undeclared name.
//
// Always returns 0.
int32_t AnyRtmpCore::RecordedDataIsAvailable(const void* audioSamples, const size_t nSamples,
    const size_t nBytesPerSample, const size_t nChannels, const uint32_t samplesPerSec, const uint32_t totalDelayMS,
    const int32_t clockDrift, const uint32_t currentMicLevel, const bool keyPressed, uint32_t& newMicLevel)
{
    // Debug trace; the values are separated so the line is unambiguous.
    std::cout << "[-----------] record data avaliable " << nSamples << " " << nBytesPerSample << " " << nChannels << " " << samplesPerSec << std::endl;
    rtc::CritScope cs(&cs_audio_record_);

    if (microphone_enable_ && bgm_enable_) {
        // Two sources are active: hand this block to the mixing path.
        audio_device_mixer_ptr_->RecordedDataIsAvailable(audioSamples, nSamples,
            nBytesPerSample, nChannels, samplesPerSec, totalDelayMS,
            clockDrift, currentMicLevel, keyPressed, newMicLevel);
        if (audio_mixer_) {
            audio_mixer_->Process();
        }
    }
    else
    {
        // Only one source is active, so no mixing is performed.
        if (audio_record_callback_) {
            if (audio_record_sample_hz_ != samplesPerSec || nChannels != audio_record_channels_) {
                int16_t temp_output[kMaxDataSizeSamples];
                // Rate and channel count are folded into a single "frequency"
                // and the data is resampled as one channel.
                int samples_per_channel_int = resampler_record_.Resample10Msec((int16_t*)audioSamples, samplesPerSec * nChannels,
                    audio_record_sample_hz_ * audio_record_channels_, 1, kMaxDataSizeSamples, temp_output);
                if (samples_per_channel_int < 0) {
                    // NOTE(review): assumes a negative return signals failure - confirm
                    // against the resampler's API. temp_output is uninitialized in that
                    // case, so do not deliver it.
                    return 0;
                }
                audio_record_callback_->OnRecordAudio(temp_output, audio_record_sample_hz_ / 100, nBytesPerSample, audio_record_channels_, audio_record_sample_hz_, totalDelayMS);
            }
            else {
                audio_record_callback_->OnRecordAudio(audioSamples, nSamples, nBytesPerSample, audio_record_channels_, samplesPerSec, totalDelayMS);
            }
        }
    }

    return 0;
}

以上是关于WebRTC实现背景声音的混流的主要内容，如果未能解决你的问题，请参考以下文章