Bug 1397793 - Move to APM - Part 2 - Actual processing. r=pehrsons
author Paul Adenot <paul@paul.cx>
Mon, 04 Dec 2017 13:34:14 +0100
changeset 396336 2b871bc4a4ba59b94dfb9319b7e6debe91022320
parent 396335 0ec5d3fdaf1bd40f1aeac1d697738d9b2626e8c3
child 396337 2ef19550733d9b053ca945ba613a93977d77bade
push id 56975
push user dluca@mozilla.com
push date Thu, 14 Dec 2017 09:59:07 +0000
treeherder autoland@16bcfaad13e1
reviewers pehrsons
bugs 1397793
milestone 59.0a1
Bug 1397793 - Move to APM - Part 2 - Actual processing. r=pehrsons

This also is long, but simple.

First, we switch to floats everywhere. This lets us work at any rate, is more flexible with channel layouts, and is a stable API (see audio_processing.h in webrtc.org).

Then, 10ms worth of audio (already at the graph rate) is popped from the lock-free queue (fed on the other end by the MSG mixer), and the following is done:
- Downmix to stereo (if needed)
- Deinterleave into a planar buffer
- Prepare the input and output configs
- Actually make the API call
- Free the data

Now, first, we should use a ring buffer and not have to free any data. We also should not use a lock-free queue, and should process the reverse stream synchronously, but this is enough code already.

Then, the actual mic data processing:
- Pop a packet from the packetizer (that gives us 10ms worth of audio; note that we switch from int16_t to float, i.e. we don't do this conversion anymore).
- Convert to planar buffers, deinterleaving.
- Prepare the input and output configs.
- Allocate a SharedBuffer of the right size.
- Process the data with the processing algorithm selected in UpdateSingleSource.
- Append to a MediaSegment, and append to the right MediaStreamTrack for the correct SourceMediaStream (the data is already planar and all is well).

MozReview-Commit-ID: 2IjgHP0GAmw
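For illustration, here is a minimal standalone sketch of the 10ms float-planar flow this patch implements, using only the webrtc::AudioProcessing calls that appear in the diff (ProcessReverseStream, ProcessStream, StreamConfig, set_stream_delay_ms). The deinterleave helper and the std::vector buffers below are illustrative stand-ins for Gecko's AudioConverter/AudioPacketizer and AlignedFloatBuffer, not the actual Gecko code.

#include <vector>
#include "webrtc/modules/audio_processing/include/audio_processing.h"

// Illustrative helper: split an interleaved 10ms packet into planar channels.
static void DeinterleavePacket(const float* interleaved, size_t frames,
                               size_t channels, std::vector<float>& planar,
                               std::vector<float*>& channelPtrs)
{
  planar.resize(frames * channels);
  channelPtrs.resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    channelPtrs[c] = planar.data() + c * frames;
    for (size_t i = 0; i < frames; ++i) {
      channelPtrs[c][i] = interleaved[i * channels + c];
    }
  }
}

// Far end (speakers): feed one 10ms planar packet to the AEC's reverse stream.
// Passing the same pointers as source and destination avoids a copy, as the
// patch does.
static void FeedFarEndPacket(webrtc::AudioProcessing* apm,
                             float* const* planar, int rate, size_t channels)
{
  webrtc::StreamConfig config(rate, channels, false /* no keyboard channel */);
  apm->ProcessReverseStream(planar, config, config, planar);
}

// Near end (microphone): deinterleave one 10ms packet, then ProcessStream.
static void ProcessMicPacket(webrtc::AudioProcessing* apm,
                             const float* interleaved, int rate, size_t channels)
{
  const size_t frames = rate / 100; // 10ms worth of audio
  std::vector<float> planarIn;
  std::vector<float*> inPtrs;
  DeinterleavePacket(interleaved, frames, channels, planarIn, inPtrs);

  std::vector<float> planarOut(frames * channels);
  std::vector<float*> outPtrs(channels);
  for (size_t c = 0; c < channels; ++c) {
    outPtrs[c] = planarOut.data() + c * frames;
  }

  webrtc::StreamConfig inputConfig(rate, channels, false);
  webrtc::StreamConfig outputConfig = inputConfig; // same config saves work

  apm->set_stream_delay_ms(0); // Bug 1404965: pass the real delay here
  apm->ProcessStream(inPtrs.data(), inputConfig, outputConfig, outPtrs.data());
  // planarOut now holds processed planar float audio, ready to be wrapped in a
  // SharedBuffer/AudioSegment and appended to the SourceMediaStream tracks.
}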
dom/media/webrtc/AudioOutputObserver.h
dom/media/webrtc/MediaEngineWebRTC.h
dom/media/webrtc/MediaEngineWebRTCAudio.cpp
--- a/dom/media/webrtc/AudioOutputObserver.h
+++ b/dom/media/webrtc/AudioOutputObserver.h
@@ -12,23 +12,23 @@
 
 namespace webrtc {
 class SingleRwFifo;
 }
 
 namespace mozilla {
 
 typedef struct FarEndAudioChunk_ {
-  uint16_t mSamples;
+  size_t mSamples;
   bool mOverrun;
-  int16_t mData[1]; // variable-length
+  AudioDataValue mData[1]; // variable-length
 } FarEndAudioChunk;
 
 // This class is used to packetize and send the mixed audio from an MSG, in
-// int16, to the AEC module of WebRTC.org.
+// float, to the AEC module of WebRTC.org.
 class AudioOutputObserver
 {
 public:
   AudioOutputObserver();
 
   NS_INLINE_DECL_THREADSAFE_REFCOUNTING(AudioOutputObserver);
 
   void Clear();
--- a/dom/media/webrtc/MediaEngineWebRTC.h
+++ b/dom/media/webrtc/MediaEngineWebRTC.h
@@ -511,17 +511,17 @@ private:
   RefPtr<AudioOutputObserver> mAudioOutputObserver;
 
   // Note: shared across all microphone sources
   static int sChannelsOpen;
 
   const UniquePtr<webrtc::AudioProcessing> mAudioProcessing;
 
   // accessed from the GraphDriver thread except for deletion
-  nsAutoPtr<AudioPacketizer<AudioDataValue, AudioDataValue>> mPacketizer;
+  nsAutoPtr<AudioPacketizer<AudioDataValue, float>> mPacketizer;
   ScopedCustomReleasePtr<webrtc::VoEExternalMedia> mVoERenderListener;
 
   // mMonitor protects mSources[] and mPrinicpalIds[] access/changes, and
   // transitions of mState from kStarted to kStopped (which are combined with
   // EndTrack()). mSources[] and mPrincipalHandles[] are accessed from webrtc
   // threads.
   Monitor mMonitor;
   nsTArray<RefPtr<SourceMediaStream>> mSources;
@@ -544,17 +544,18 @@ private:
   // because of prefs or constraints. This allows simply copying the audio into
   // the MSG, skipping resampling and the whole webrtc.org code.
   bool mSkipProcessing;
 
   // To only update microphone when needed, we keep track of previous settings.
   MediaEnginePrefs mLastPrefs;
 
   AlignedFloatBuffer mInputBuffer;
-  AlignedFloatBuffer mInputDownmixBuffer;
+  AlignedFloatBuffer mDeinterleavedBuffer;
+  AlignedAudioBuffer mInputDownmixBuffer;
 };
 
 class MediaEngineWebRTC : public MediaEngine
 {
   typedef MediaEngine Super;
 public:
   explicit MediaEngineWebRTC(MediaEnginePrefs& aPrefs);
 
--- a/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
+++ b/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
@@ -17,23 +17,16 @@
 #endif
 #include "webrtc/modules/audio_device/opensl/single_rw_fifo.h"
 #include "webrtc/voice_engine/voice_engine_defines.h"
 #include "webrtc/modules/audio_processing/include/audio_processing.h"
 #include "webrtc/common_audio/include/audio_util.h"
 
 using namespace webrtc;
 
-#define CHANNELS 1
-#define ENCODING "L16"
-#define DEFAULT_PORT 5555
-
-#define SAMPLE_RATE(freq) ((freq)*2*8) // bps, 16-bit samples
-#define SAMPLE_LENGTH(freq) (((freq)*10)/1000)
-
 // These are restrictions from the webrtc.org code
 #define MAX_CHANNELS 2
 #define MAX_SAMPLING_FREQ 48000 // Hz - multiple of 100
 
 #define MAX_AEC_FIFO_DEPTH 200 // ms - multiple of 10
 static_assert(!(MAX_AEC_FIFO_DEPTH % 10), "Invalid MAX_AEC_FIFO_DEPTH");
 
 namespace mozilla {
@@ -139,38 +132,38 @@ AudioOutputObserver::InsertFarEnd(const 
     aOverran = false;
   }
   // Rechunk to 10ms.
   // The AnalyzeReverseStream() and WebRtcAec_BufferFarend() functions insist on 10ms
   // samples per call.  Annoying...
   while (aFrames) {
     if (!mSaved) {
       mSaved = (FarEndAudioChunk *) moz_xmalloc(sizeof(FarEndAudioChunk) +
-                                                (mChunkSize * channels - 1)*sizeof(int16_t));
+                                                (mChunkSize * channels - 1)*sizeof(AudioDataValue));
       mSaved->mSamples = mChunkSize;
       mSaved->mOverrun = aOverran;
       aOverran = false;
     }
     uint32_t to_copy = mChunkSize - mSamplesSaved;
     if (to_copy > aFrames) {
       to_copy = aFrames;
     }
 
-    int16_t* dest = &(mSaved->mData[mSamplesSaved * channels]);
+    AudioDataValue* dest = &(mSaved->mData[mSamplesSaved * channels]);
     if (aChannels > MAX_CHANNELS) {
       AudioConverter converter(AudioConfig(aChannels, 0), AudioConfig(channels, 0));
       converter.Process(mDownmixBuffer, aBuffer, to_copy);
       ConvertAudioSamples(mDownmixBuffer.Data(), dest, to_copy * channels);
     } else {
       ConvertAudioSamples(aBuffer, dest, to_copy * channels);
     }
 
 #ifdef LOG_FAREND_INSERTION
     if (fp) {
-      fwrite(&(mSaved->mData[mSamplesSaved * aChannels]), to_copy * aChannels, sizeof(int16_t), fp);
+      fwrite(&(mSaved->mData[mSamplesSaved * aChannels]), to_copy * aChannels, sizeof(AudioDataValue), fp);
     }
 #endif
     aFrames -= to_copy;
     mSamplesSaved += to_copy;
     aBuffer += to_copy * aChannels;
 
     if (mSamplesSaved >= mChunkSize) {
       int free_slots = mPlayoutFifo->capacity() - mPlayoutFifo->size();
@@ -198,17 +191,17 @@ MediaEngineWebRTCMicrophoneSource::Media
   , mAudioInput(aAudioInput)
   , mAudioProcessing(AudioProcessing::Create())
   , mMonitor("WebRTCMic.Monitor")
   , mCapIndex(aIndex)
   , mDelayAgnostic(aDelayAgnostic)
   , mExtendedFilter(aExtendedFilter)
   , mTrackID(TRACK_NONE)
   , mStarted(false)
-  , mSampleFrequency(MediaEngine::DEFAULT_SAMPLE_RATE)
+  , mSampleFrequency(MediaEngine::USE_GRAPH_RATE)
   , mTotalFrames(0)
   , mLastLogFrames(0)
   , mSkipProcessing(false)
   , mInputDownmixBuffer(MAX_SAMPLING_FREQ * MAX_CHANNELS / 100)
 {
   MOZ_ASSERT(aAudioInput);
   mDeviceName.Assign(NS_ConvertUTF8toUTF16(name));
   mDeviceUUID.Assign(uuid);
@@ -449,22 +442,34 @@ MediaEngineWebRTCMicrophoneSource::Updat
   switch (mState) {
     case kReleased:
       MOZ_ASSERT(aHandle);
       if (sChannelsOpen != 0) {
         // Until we fix (or wallpaper) support for multiple mic input
         // (Bug 1238038) fail allocation for a second device
         return NS_ERROR_FAILURE;
       }
+      if (mAudioInput->SetRecordingDevice(mCapIndex)) {
+         return NS_ERROR_FAILURE;
+      }
       mAudioInput->SetUserChannelCount(prefs.mChannels);
       if (!AllocChannel()) {
         FreeChannel();
         LOG(("Audio device is not initalized"));
         return NS_ERROR_FAILURE;
       }
+      LOG(("Audio device %d allocated", mCapIndex));
+      {
+        // Update with the actual applied channelCount in order
+        // to store it in settings.
+        uint32_t channelCount = 0;
+        mAudioInput->GetChannelCount(channelCount);
+        MOZ_ASSERT(channelCount > 0);
+        prefs.mChannels = channelCount;
+      }
       break;
 
     case kStarted:
       if (prefs == mLastPrefs) {
         return NS_OK;
       }
 
       if (prefs.mChannels != mLastPrefs.mChannels) {
@@ -671,64 +676,222 @@ MediaEngineWebRTCMicrophoneSource::Notif
                                                     uint32_t aChannels)
 {
   if (mAudioOutputObserver) {
     mAudioOutputObserver->InsertFarEnd(aBuffer, aFrames, false,
                                   aRate, aChannels);
   }
 }
 
+// Only called if we're not in passthrough mode
 void
 MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
                                                        const AudioDataValue* aBuffer,
                                                        size_t aFrames,
                                                        TrackRate aRate,
                                                        uint32_t aChannels)
 {
   MOZ_ASSERT(!PassThrough(), "This should be bypassed when in PassThrough mode.");
   size_t offset = 0;
 
   if (!mPacketizer ||
       mPacketizer->PacketSize() != aRate/100u ||
       mPacketizer->Channels() != aChannels) {
     // It's ok to drop the audio still in the packetizer here.
     mPacketizer =
-      new AudioPacketizer<AudioDataValue, AudioDataValue>(aRate/100, aChannels);
+      new AudioPacketizer<AudioDataValue, float>(aRate/100, aChannels);
   }
 
   // On initial capture, throw away all far-end data except the most recent sample
   // since it's already irrelevant and we want to keep avoid confusing the AEC far-end
   // input code with "old" audio.
   if (!mStarted) {
     mStarted  = true;
     while (mAudioOutputObserver->Size() > 1) {
       free(mAudioOutputObserver->Pop()); // only call if size() > 0
     }
   }
 
+  // Feed the far-end audio data (speakers) to the feedback input of the AEC.
+  while (mAudioOutputObserver->Size() > 0) {
+    // Bug 1414837: This will call `free()`, and we should remove it.
+    // Pop gives ownership.
+    UniquePtr<FarEndAudioChunk> buffer(mAudioOutputObserver->Pop()); // only call if size() > 0
+    if (!buffer) {
+      continue;
+    }
+    AudioDataValue* packetDataPointer = buffer->mData;
+    AutoTArray<AudioDataValue*, MAX_CHANNELS> deinterleavedPacketDataChannelPointers;
+    AudioDataValue* interleavedFarend = nullptr;
+    uint32_t channelCountFarend = 0;
+    uint32_t framesPerPacketFarend = 0;
+
+    // Downmix from aChannels to MAX_CHANNELS if needed
+    if (mAudioOutputObserver->PlayoutChannels() > MAX_CHANNELS) {
+      AudioConverter converter(AudioConfig(aChannels, 0, AudioConfig::FORMAT_DEFAULT),
+                               AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_DEFAULT));
+      framesPerPacketFarend =
+        buffer->mSamples;
+      framesPerPacketFarend =
+        converter.Process(mInputDownmixBuffer,
+                          packetDataPointer,
+                          framesPerPacketFarend);
+      interleavedFarend = mInputDownmixBuffer.Data();
+      channelCountFarend = MAX_CHANNELS;
+      deinterleavedPacketDataChannelPointers.SetLength(MAX_CHANNELS);
+    } else {
+      uint32_t outputChannels = mAudioOutputObserver->PlayoutChannels();
+      interleavedFarend = packetDataPointer;
+      channelCountFarend = outputChannels;
+      framesPerPacketFarend = buffer->mSamples;
+      deinterleavedPacketDataChannelPointers.SetLength(outputChannels);
+    }
+
+    MOZ_ASSERT(interleavedFarend &&
+               (channelCountFarend == 1 || channelCountFarend == 2) &&
+               framesPerPacketFarend);
+
+    offset = 0;
+    for (size_t i = 0; i < deinterleavedPacketDataChannelPointers.Length(); ++i) {
+      deinterleavedPacketDataChannelPointers[i] = packetDataPointer + offset;
+      offset += framesPerPacketFarend;
+    }
+
+    // deinterleave back into the FarEndAudioChunk buffer to save an alloc.
+    // There is enough room because either there is the same number of
+    // channels/frames or we've just downmixed.
+    Deinterleave(interleavedFarend,
+                 framesPerPacketFarend,
+                 channelCountFarend,
+                 deinterleavedPacketDataChannelPointers.Elements());
+
+    // Having the same config for input and output means we potentially save
+    // some CPU. We won't need the output here, the API forces us to set a
+    // valid pointer with enough space.
+    StreamConfig inputConfig(mAudioOutputObserver->PlayoutFrequency(),
+                             channelCountFarend,
+                             false /* we don't use typing detection*/);
+    StreamConfig outputConfig = inputConfig;
+
+    // Prepare a channel pointers array, with enough storage for the
+    // frames.
+    //
+    // If this is a platform that uses s16 for audio input and output,
+    // convert to floats, the APM API we use only accepts floats.
+
+    float* inputData = nullptr;
+#ifdef MOZ_SAMPLE_TYPE_S16
+    // Convert to floats, use mInputBuffer for this.
+    size_t sampleCount = framesPerPacketFarend * channelCountFarend;
+    if (mInputBuffer.Length() < sampleCount) {
+      mInputBuffer.SetLength(sampleCount);
+    }
+    ConvertAudioSamples(buffer->mData, mInputBuffer.Data(), sampleCount);
+    inputData = mInputBuffer.Data();
+#else // MOZ_SAMPLE_TYPE_F32
+    inputData = buffer->mData;
+#endif
+
+    AutoTArray<float*, MAX_CHANNELS> channelsPointers;
+    channelsPointers.SetLength(channelCountFarend);
+    offset = 0;
+    for (size_t i = 0; i < channelsPointers.Length(); ++i) {
+      channelsPointers[i]  = inputData + offset;
+      offset += framesPerPacketFarend;
+    }
+
+    // Passing the same pointers here saves a copy inside this function.
+    int err =
+      mAudioProcessing->ProcessReverseStream(channelsPointers.Elements(),
+                                             inputConfig,
+                                             outputConfig,
+                                             channelsPointers.Elements());
+
+    if (err) {
+      MOZ_LOG(GetMediaManagerLog(), LogLevel::Error,
+          ("error in audio ProcessReverseStream(): %d", err));
+      return;
+    }
+  }
+
+  // Packetize our input data into 10ms chunks, deinterleave into planar channel
+  // buffers, process, and append to the right MediaStreamTrack.
   mPacketizer->Input(aBuffer, static_cast<uint32_t>(aFrames));
 
   while (mPacketizer->PacketsAvailable()) {
     uint32_t samplesPerPacket = mPacketizer->PacketSize() *
       mPacketizer->Channels();
     if (mInputBuffer.Length() < samplesPerPacket) {
       mInputBuffer.SetLength(samplesPerPacket);
+      mDeinterleavedBuffer.SetLength(samplesPerPacket);
     }
-    int16_t* packet = mInputBuffer.Elements();
+    float* packet = mInputBuffer.Data();
     mPacketizer->Output(packet);
 
-    if (aChannels > MAX_CHANNELS) {
-      AudioConverter converter(AudioConfig(aChannels, 0, AudioConfig::FORMAT_S16),
-                               AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_S16));
-      converter.Process(mInputDownmixBuffer, packet, mPacketizer->PacketSize());
-      mVoERender->ExternalRecordingInsertData(mInputDownmixBuffer.Data(),
-                                              mPacketizer->PacketSize() * MAX_CHANNELS,
-                                              aRate, 0);
-    } else {
-      mVoERender->ExternalRecordingInsertData(packet, samplesPerPacket, aRate, 0);
+    // Deinterleave the input data
+    // Prepare an array pointing to deinterleaved channels.
+    AutoTArray<float*, 8> deinterleavedPacketizedInputDataChannelPointers;
+    deinterleavedPacketizedInputDataChannelPointers.SetLength(aChannels);
+    offset = 0;
+    for (size_t i = 0; i < deinterleavedPacketizedInputDataChannelPointers.Length(); ++i) {
+      deinterleavedPacketizedInputDataChannelPointers[i] = mDeinterleavedBuffer.Data() + offset;
+      offset += aFrames;
+    }
+
+    // Deinterleave to mInputBuffer, pointed to by inputBufferChannelPointers.
+    Deinterleave(packet, mPacketizer->PacketSize(), aChannels,
+        deinterleavedPacketizedInputDataChannelPointers.Elements());
+
+    StreamConfig inputConfig(aRate,
+                             aChannels,
+                             false /* we don't use typing detection*/);
+    StreamConfig outputConfig = inputConfig;
+
+    // Bug 1404965: Get the right delay here, it saves some work down the line.
+    mAudioProcessing->set_stream_delay_ms(0);
+
+    // Bug 1414837: find a way to not allocate here.
+    RefPtr<SharedBuffer> buffer =
+      SharedBuffer::Create(mPacketizer->PacketSize() * aChannels * sizeof(float));
+    AudioSegment segment;
+
+    // Prepare channel pointers to the SharedBuffer created above.
+    AutoTArray<float*, 8> processedOutputChannelPointers;
+    AutoTArray<const float*, 8> processedOutputChannelPointersConst;
+    processedOutputChannelPointers.SetLength(aChannels);
+    processedOutputChannelPointersConst.SetLength(aChannels);
+
+    offset = 0;
+    for (size_t i = 0; i < processedOutputChannelPointers.Length(); ++i) {
+      processedOutputChannelPointers[i] = static_cast<float*>(buffer->Data()) + offset;
+      processedOutputChannelPointersConst[i] = static_cast<float*>(buffer->Data()) + offset;
+      offset += aFrames;
+    }
+
+    mAudioProcessing->ProcessStream(deinterleavedPacketizedInputDataChannelPointers.Elements(),
+                                    inputConfig,
+                                    outputConfig,
+                                    processedOutputChannelPointers.Elements());
+    MonitorAutoLock lock(mMonitor);
+    if (mState != kStarted)
+      return;
+
+    for (size_t i = 0; i < mSources.Length(); ++i) {
+      if (!mSources[i]) { // why ?!
+        continue;
+      }
+
+      // We already have planar audio data of the right format. Insert into the
+      // MSG.
+      MOZ_ASSERT(processedOutputChannelPointers.Length() == aChannels);
+      segment.AppendFrames(buffer.forget(),
+                           processedOutputChannelPointersConst,
+                           mPacketizer->PacketSize(),
+                           mPrincipalHandles[i]);
+      mSources[i]->AppendToTrack(mTrackID, &segment);
     }
   }
 }
 
 template<typename T>
 void
 MediaEngineWebRTCMicrophoneSource::InsertInGraph(const T* aBuffer,
                                                  size_t aFrames,
@@ -745,33 +908,33 @@ MediaEngineWebRTCMicrophoneSource::Inser
       MOZ_LOG(AudioLogModule(), LogLevel::Debug,
               ("%p: Inserting %zu samples into graph, total frames = %" PRIu64,
                (void*)this, aFrames, mTotalFrames));
       mLastLogFrames = mTotalFrames;
     }
   }
 
   size_t len = mSources.Length();
-  for (size_t i = 0; i < len; i++) {
+  for (size_t i = 0; i < len; ++i) {
     if (!mSources[i]) {
       continue;
     }
 
     TimeStamp insertTime;
     // Make sure we include the stream and the track.
     // The 0:1 is a flag to note when we've done the final insert for a given input block.
     LogTime(AsyncLatencyLogger::AudioTrackInsertion,
             LATENCY_STREAM_ID(mSources[i].get(), mTrackID),
             (i+1 < len) ? 0 : 1, insertTime);
 
     // Bug 971528 - Support stereo capture in gUM
     MOZ_ASSERT(aChannels >= 1 && aChannels <= 8,
                "Support up to 8 channels");
 
-    nsAutoPtr<AudioSegment> segment(new AudioSegment());
+    AudioSegment segment;
     RefPtr<SharedBuffer> buffer =
       SharedBuffer::Create(aFrames * aChannels * sizeof(T));
     AutoTArray<const T*, 8> channels;
     if (aChannels == 1) {
       PodCopy(static_cast<T*>(buffer->Data()), aBuffer, aFrames);
       channels.AppendElement(static_cast<T*>(buffer->Data()));
     } else {
       channels.SetLength(aChannels);
@@ -787,21 +950,21 @@ MediaEngineWebRTCMicrophoneSource::Inser
 
       DeinterleaveAndConvertBuffer(aBuffer,
                                    aFrames,
                                    aChannels,
                                    write_channels.Elements());
     }
 
     MOZ_ASSERT(aChannels == channels.Length());
-    segment->AppendFrames(buffer.forget(), channels, aFrames,
+    segment.AppendFrames(buffer.forget(), channels, aFrames,
                          mPrincipalHandles[i]);
-    segment->GetStartTime(insertTime);
+    segment.GetStartTime(insertTime);
 
-    mSources[i]->AppendToTrack(mTrackID, segment);
+    mSources[i]->AppendToTrack(mTrackID, &segment);
   }
 }
 
 // Called back on GraphDriver thread!
 // Note this can be called back after ::Shutdown()
 void
 MediaEngineWebRTCMicrophoneSource::NotifyInputData(MediaStreamGraph* aGraph,
                                                    const AudioDataValue* aBuffer,