bug 1474222 change ConvolverNode output to mono for single channel convolution r=padenot
author Karl Tomlinson <karlt+@karlt.net>
Mon, 06 Aug 2018 21:24:15 +1200
changeset 486988 3ac736416349f088fb710450f85da8ac51d33374
parent 486987 e6fc5ab5dd8c2293196f897c068f63f9a41c2919
child 486989 a10cd240dcaf8d0597a8685df99526c30344b12f
push id 9719
push user ffxbld-merge
push date Fri, 24 Aug 2018 17:49:46 +0000
treeherder mozilla-beta@719ec98fba77
reviewers padenot
bugs 1474222
milestone 63.0a1
bug 1474222 change ConvolverNode output to mono for single channel convolution r=padenot

This also returns to using a single convolver for processing of mono input, which introduces complexity in up-mixing the state of the convolver when a second channel is added.

MozReview-Commit-ID: KeBrAswQbtF
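Editor's note: the Difference mode introduced in this patch rests on the distributive property of convolution, r * g = l * g + (r - l) * g, together with the observation that r - l is zero while a mono source is up-mixed with "speakers" interpretation, so an empty second convolver already holds the correct history. The standalone sketch below (not part of the patch; a naive direct-form convolution with made-up signals) checks that identity numerically.

// Standalone sketch (not part of the patch): verifies the identity
//   r * g = l * g + (r - l) * g
// that ConvolverNode's Difference mode relies on.  Naive O(N*M) FIR
// convolution; all signal values are made up for illustration.
#include <cmath>
#include <cstdio>
#include <vector>

// Direct-form convolution of x with impulse response g (full length).
static std::vector<float> Convolve(const std::vector<float>& x,
                                   const std::vector<float>& g)
{
  std::vector<float> y(x.size() + g.size() - 1, 0.0f);
  for (size_t n = 0; n < x.size(); ++n) {
    for (size_t k = 0; k < g.size(); ++k) {
      y[n + k] += x[n] * g[k];
    }
  }
  return y;
}

int main()
{
  // A mono section followed by a stereo section: while the source is mono
  // (up-mixed with "speakers" interpretation), r == l, so r - l == 0 and the
  // difference convolver sees only silence -- exactly the history an empty
  // convolver already has.
  std::vector<float> l = {1.0f, 0.5f, 0.25f, 0.3f, -0.2f, 0.7f};
  std::vector<float> r = {1.0f, 0.5f, 0.25f, 0.9f, 0.1f, -0.4f}; // stereo from index 3
  std::vector<float> g = {0.5f, 0.25f, 0.125f};                  // impulse response

  std::vector<float> diff(l.size());
  for (size_t i = 0; i < l.size(); ++i) {
    diff[i] = r[i] - l[i];
  }

  std::vector<float> direct = Convolve(r, g);  // what Direct mode would produce
  std::vector<float> lg = Convolve(l, g);      // left convolver output
  std::vector<float> dg = Convolve(diff, g);   // difference convolver output

  float maxError = 0.0f;
  for (size_t i = 0; i < direct.size(); ++i) {
    maxError = std::fmax(maxError, std::fabs(direct[i] - (lg[i] + dg[i])));
  }
  std::printf("max |r*g - (l*g + (r-l)*g)| = %g\n", maxError);
  return 0;
}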
dom/media/webaudio/ConvolverNode.cpp
testing/web-platform/meta/webaudio/the-audio-api/the-convolvernode-interface/convolver-response-1-chan.html.ini
--- a/dom/media/webaudio/ConvolverNode.cpp
+++ b/dom/media/webaudio/ConvolverNode.cpp
@@ -25,23 +25,74 @@ NS_IMPL_ADDREF_INHERITED(ConvolverNode, 
 NS_IMPL_RELEASE_INHERITED(ConvolverNode, AudioNode)
 
 class ConvolverNodeEngine final : public AudioNodeEngine
 {
   typedef PlayingRefChangeHandler PlayingRefChanged;
 public:
   ConvolverNodeEngine(AudioNode* aNode, bool aNormalize)
     : AudioNodeEngine(aNode)
-    , mLeftOverData(INT32_MIN)
-    , mSampleRate(0.0f)
     , mUseBackgroundThreads(!aNode->Context()->IsOffline())
     , mNormalize(aNormalize)
   {
   }
 
+  // Indicates how the right output channel is generated.
+  enum class RightConvolverMode {
+    // A right convolver is always used when there is more than one impulse
+    // response channel.
+    Always,
+    // With a single response channel, the mode may be either Direct or
+    // Difference.  The decision on which to use is made when stereo input is
+    // received.  Once the right convolver is in use, convolver state is
+    // suitable only for the selected mode, and so the mode cannot change
+    // until the right convolver contains only silent history.
+    //
+    // With Direct mode, each convolver processes a corresponding channel.
+    // This mode is selected when input is initially stereo or
+    // channelInterpretation is "discrete" at the time or starting the right
+    // convolver when input changes from non-silent mono to stereo.
+    Direct,
+    // Difference mode is selected if channelInterpretation is "speakers" at
+    // the time of starting the right convolver, when the input changes from
+    // mono to stereo.
+    //
+    // When non-silent input is initially mono, with a single response
+    // channel, the right output channel is not produced until input becomes
+    // stereo.  Only a single convolver is used for mono processing.  When
+    // stereo input arrives after mono input, output must be as if the mono
+    // signal remaining in the left convolver is up-mixed, but the right
+    // convolver has not been initialized with the history of the mono input.
+    // Copying the state of the left convolver into the right convolver is not
+    // desirable, because there is considerable state to copy, and the
+    // different convolvers are intended to process out of phase, which means
+    // that state from one convolver would not directly map to state in
+    // another convolver.
+    //
+    // Instead the distributive property of convolution is used to generate
+    // the right output channel using information in the left output channel.
+    // Using l and r to denote the left and right channel input signals, g the
+    // impulse response, and * convolution, the convolution of the right
+    // channel can be given by
+    //
+    //   r * g = (l + (r - l)) * g
+    //         = l * g + (r - l) * g
+    //
+    // The left convolver continues to process the left channel l to produce
+    // l * g.  The right convolver processes the difference of input channel
+    // signals r - l to produce (r - l) * g.  The outputs of the two
+    // convolvers are added to generate the right channel output r * g.
+    //
+    // The benefit of doing this is that the history of the r - l input for a
+    // "speakers" up-mixed mono signal is zero, and so an empty convolver
+    // already has exactly the right history for mixing the previous mono
+    // signal with the new stereo signal.
+    Difference
+  };
+
   enum Parameters {
     SAMPLE_RATE,
     NORMALIZE
   };
   void SetInt32Parameter(uint32_t aIndex, int32_t aParam) override
   {
     switch (aIndex) {
     case NORMALIZE:
@@ -68,36 +119,64 @@ public:
     // Note about empirical tuning (this is copied from Blink)
     // The maximum FFT size affects reverb performance and accuracy.
     // If the reverb is single-threaded and processes entirely in the real-time audio thread,
     // it's important not to make this too high.  In this case 8192 is a good value.
     // But, the Reverb object is multi-threaded, so we want this as high as possible without losing too much accuracy.
     // Very large FFTs will have worse phase errors. Given these constraints 32768 is a good compromise.
     const size_t MaxFFTSize = 32768;
 
-    mLeftOverData = INT32_MIN; // reset
+    // Reset.
+    mRemainingLeftOutput = INT32_MIN;
+    mRemainingRightOutput = 0;
+    mRemainingRightHistory = 0;
 
     if (aBuffer.IsNull() || !mSampleRate) {
       mReverb = nullptr;
       return;
     }
 
+    // Assume for now that convolution of channel difference is not required.
+    // Direct may change to Difference during processing.
+    mRightConvolverMode =
+      aBuffer.ChannelCount() == 1 ? RightConvolverMode::Direct
+      : RightConvolverMode::Always;
+
     mReverb = new WebCore::Reverb(aBuffer, MaxFFTSize, mUseBackgroundThreads,
                                   mNormalize, mSampleRate);
   }
 
+  void AllocateReverbInput(const AudioBlock& aInput,
+                           uint32_t aTotalChannelCount)
+  {
+    uint32_t inputChannelCount = aInput.ChannelCount();
+    MOZ_ASSERT(inputChannelCount <= aTotalChannelCount);
+    mReverbInput.AllocateChannels(aTotalChannelCount);
+    // Pre-multiply the input's volume
+    for (uint32_t i = 0; i < inputChannelCount; ++i) {
+      const float* src = static_cast<const float*>(aInput.mChannelData[i]);
+      float* dest = mReverbInput.ChannelFloatsForWrite(i);
+      AudioBlockCopyChannelWithScale(src, aInput.mVolume, dest);
+    }
+    // Fill remaining channels with silence
+    for (uint32_t i = inputChannelCount; i < aTotalChannelCount; ++i) {
+      float* dest = mReverbInput.ChannelFloatsForWrite(i);
+      std::fill_n(dest, WEBAUDIO_BLOCK_SIZE, 0.0f);
+    }
+  }
+
   void ProcessBlock(AudioNodeStream* aStream,
                     GraphTime aFrom,
                     const AudioBlock& aInput,
                     AudioBlock* aOutput,
                     bool* aFinished) override;
 
   bool IsActive() const override
   {
-    return mLeftOverData != INT32_MIN;
+    return mRemainingLeftOutput != INT32_MIN;
   }
 
   size_t SizeOfExcludingThis(MallocSizeOf aMallocSizeOf) const override
   {
     size_t amount = AudioNodeEngine::SizeOfExcludingThis(aMallocSizeOf);
 
     amount += mReverbInput.SizeOfExcludingThis(aMallocSizeOf, false);
 
@@ -112,77 +191,208 @@ public:
   {
     return aMallocSizeOf(this) + SizeOfExcludingThis(aMallocSizeOf);
   }
 
 private:
   // Keeping mReverbInput across process calls avoids unnecessary reallocation.
   AudioBlock mReverbInput;
   nsAutoPtr<WebCore::Reverb> mReverb;
-  int32_t mLeftOverData;
-  float mSampleRate;
+  // Tracks samples of the tail remaining to be output.  INT32_MIN is a
+  // special value to indicate that the end of any previous tail has been
+  // handled.
+  int32_t mRemainingLeftOutput = INT32_MIN;
+  // mRemainingRightOutput and mRemainingRightHistory are only used when
+  // mRightConvolverMode != Always.  There is no special handling required at
+  // the end of tail times and so INT32_MIN is not used.
+  // mRemainingRightOutput tracks how much longer this node needs to continue
+  // to produce a right output channel.
+  int32_t mRemainingRightOutput = 0;
+  // mRemainingRightHistory tracks how much silent input would be required to
+  // drain the right convolver, which may sometimes be longer than the period
+  // a right output channel is required.
+  int32_t mRemainingRightHistory = 0;
+  float mSampleRate = 0.0f;
+  RightConvolverMode mRightConvolverMode = RightConvolverMode::Always;
   bool mUseBackgroundThreads;
   bool mNormalize;
 };
 
+static void
+AddScaledLeftToRight(AudioBlock* aBlock, float aScale)
+{
+  const float* left = static_cast<const float*>(aBlock->mChannelData[0]);
+  float* right = aBlock->ChannelFloatsForWrite(1);
+  AudioBlockAddChannelWithScale(left, aScale, right);
+}
+
 void
 ConvolverNodeEngine::ProcessBlock(AudioNodeStream* aStream,
                                   GraphTime aFrom,
                                   const AudioBlock& aInput,
                                   AudioBlock* aOutput,
                                   bool* aFinished)
 {
   if (!mReverb) {
     aOutput->SetNull(WEBAUDIO_BLOCK_SIZE);
     return;
   }
 
+  uint32_t inputChannelCount = aInput.ChannelCount();
   if (aInput.IsNull()) {
-    if (mLeftOverData > 0) {
-      mLeftOverData -= WEBAUDIO_BLOCK_SIZE;
-      mReverbInput.AllocateChannels(1);
-      WriteZeroesToAudioBlock(&mReverbInput, 0, WEBAUDIO_BLOCK_SIZE);
+    if (mRemainingLeftOutput > 0) {
+      mRemainingLeftOutput -= WEBAUDIO_BLOCK_SIZE;
+      AllocateReverbInput(aInput, 1); // floats for silence
     } else {
-      if (mLeftOverData != INT32_MIN) {
-        mLeftOverData = INT32_MIN;
+      if (mRemainingLeftOutput != INT32_MIN) {
+        mRemainingLeftOutput = INT32_MIN;
+        MOZ_ASSERT(mRemainingRightOutput <= 0);
+        MOZ_ASSERT(mRemainingRightHistory <= 0);
         aStream->ScheduleCheckForInactive();
         RefPtr<PlayingRefChanged> refchanged =
           new PlayingRefChanged(aStream, PlayingRefChanged::RELEASE);
         aStream->Graph()->
           DispatchToMainThreadAfterStreamStateUpdate(refchanged.forget());
       }
       aOutput->SetNull(WEBAUDIO_BLOCK_SIZE);
       return;
     }
   } else {
-    if (aInput.mVolume != 1.0f) {
-      // Pre-multiply the input's volume
-      uint32_t numChannels = aInput.ChannelCount();
-      mReverbInput.AllocateChannels(numChannels);
-      for (uint32_t i = 0; i < numChannels; ++i) {
-        const float* src = static_cast<const float*>(aInput.mChannelData[i]);
-        float* dest = mReverbInput.ChannelFloatsForWrite(i);
-        AudioBlockCopyChannelWithScale(src, aInput.mVolume, dest);
-      }
-    } else {
-      mReverbInput = aInput;
-    }
-
-    if (mLeftOverData <= 0) {
+    if (mRemainingLeftOutput <= 0) {
       RefPtr<PlayingRefChanged> refchanged =
         new PlayingRefChanged(aStream, PlayingRefChanged::ADDREF);
       aStream->Graph()->
         DispatchToMainThreadAfterStreamStateUpdate(refchanged.forget());
     }
-    mLeftOverData = mReverb->impulseResponseLength();
-    MOZ_ASSERT(mLeftOverData > 0);
+
+    // Use mVolume as a flag to detect whether AllocateReverbInput() gets
+    // called.
+    mReverbInput.mVolume = 0.0f;
+
+    // Special handling of input channel count changes is used when there is
+    // only a single impulse response channel.  See RightConvolverMode.
+    if (mRightConvolverMode != RightConvolverMode::Always) {
+      ChannelInterpretation channelInterpretation =
+        aStream->GetChannelInterpretation();
+      if (inputChannelCount == 2) {
+        if (mRemainingRightHistory <= 0) {
+          // Will start the second convolver.  Choose to convolve the right
+          // channel directly if there is no left tail to up-mix or up-mixing
+          // is "discrete".
+          mRightConvolverMode =
+            (mRemainingLeftOutput <= 0 ||
+             channelInterpretation == ChannelInterpretation::Discrete) ?
+            RightConvolverMode::Direct : RightConvolverMode::Difference;
+        }
+        // The extra WEBAUDIO_BLOCK_SIZE is subtracted below.
+        mRemainingRightOutput =
+          mReverb->impulseResponseLength() + WEBAUDIO_BLOCK_SIZE;
+        mRemainingRightHistory = mRemainingRightOutput;
+        if (mRightConvolverMode == RightConvolverMode::Difference) {
+          AllocateReverbInput(aInput, 2);
+          // Subtract left from right.
+          AddScaledLeftToRight(&mReverbInput, -1.0f);
+        }
+      } else if (mRemainingRightHistory > 0) {
+        // There is one channel of input, but a second convolver also
+        // requires input.  Up-mix appropriately for the second convolver.
+        if ((mRightConvolverMode == RightConvolverMode::Difference) ^
+            (channelInterpretation == ChannelInterpretation::Discrete)) {
+          MOZ_ASSERT(
+            (mRightConvolverMode == RightConvolverMode::Difference &&
+             channelInterpretation == ChannelInterpretation::Speakers) ||
+            (mRightConvolverMode == RightConvolverMode::Direct &&
+             channelInterpretation == ChannelInterpretation::Discrete));
+          // The state is one of the following combinations:
+          // 1) Difference and speakers.
+          //    Up-mixing gives r = l.
+          //    The input to the second convolver is r - l.
+          // 2) Direct and discrete.
+          //    Up-mixing gives r = 0.
+          //    The input to the second convolver is r.
+          //
+          // In each case the input for the second convolver is silence, which
+          // will drain the convolver.
+          AllocateReverbInput(aInput, 2);
+        } else {
+          if (channelInterpretation == ChannelInterpretation::Discrete) {
+            MOZ_ASSERT(mRightConvolverMode == RightConvolverMode::Difference);
+            // channelInterpretation has changed since the second convolver
+            // was added.  "discrete" up-mixing of input would produce a
+            // silent right channel r = 0, but the second convolver needs
+            // r - l for RightConvolverMode::Difference.
+            AllocateReverbInput(aInput, 2);
+            AddScaledLeftToRight(&mReverbInput, -1.0f);
+          } else {
+            MOZ_ASSERT(channelInterpretation ==
+                       ChannelInterpretation::Speakers);
+            MOZ_ASSERT(mRightConvolverMode == RightConvolverMode::Direct);
+            // The Reverb will essentially up-mix the single input channel by
+            // feeding it into both convolvers.
+          }
+          // The second convolver does not have silent input, and so it will
+          // not drain.  It will need to continue processing up-mixed input
+          // because the next input block may be stereo, which would be mixed
+          // with the signal remaining in the convolvers.
+          // The extra WEBAUDIO_BLOCK_SIZE is subtracted below.
+          mRemainingRightHistory =
+            mReverb->impulseResponseLength() + WEBAUDIO_BLOCK_SIZE;
+        }
+      }
+    }
+
+    if (mReverbInput.mVolume == 0.0f) { // not yet set
+      if (aInput.mVolume != 1.0f) {
+        AllocateReverbInput(aInput, inputChannelCount); // pre-multiply
+      } else {
+        mReverbInput = aInput;
+      }
+    }
+
+    mRemainingLeftOutput = mReverb->impulseResponseLength();
+    MOZ_ASSERT(mRemainingLeftOutput > 0);
   }
-  aOutput->AllocateChannels(2);
+
+  // "The ConvolverNode produces a mono output only in the single case where
+  // there is a single input channel and a single-channel buffer."
+  uint32_t outputChannelCount = 2;
+  uint32_t reverbOutputChannelCount = 2;
+  if (mRightConvolverMode != RightConvolverMode::Always) {
+    // When the input changes from stereo to mono, the output continues to be
+    // stereo for the length of the tail time, during which the two channels
+    // may differ.
+    if (mRemainingRightOutput > 0) {
+      MOZ_ASSERT(mRemainingRightHistory > 0);
+      mRemainingRightOutput -= WEBAUDIO_BLOCK_SIZE;
+    } else {
+      outputChannelCount = 1;
+    }
+    // The second convolver keeps processing until it drains.
+    if (mRemainingRightHistory > 0) {
+      mRemainingRightHistory -= WEBAUDIO_BLOCK_SIZE;
+    } else {
+      reverbOutputChannelCount = 1;
+    }
+  }
+
+  // If there are two convolvers, then they each need an output buffer, even
+  // if the second convolver is only processing to keep history of up-mixed
+  // input.
+  aOutput->AllocateChannels(reverbOutputChannelCount);
 
   mReverb->process(&mReverbInput, aOutput);
+
+  if (mRightConvolverMode == RightConvolverMode::Difference &&
+      outputChannelCount == 2) {
+    // Add left to right.
+    AddScaledLeftToRight(aOutput, 1.0f);
+  } else {
+    // Trim if outputChannelCount < reverbOutputChannelCount
+    aOutput->mChannelData.TruncateLength(outputChannelCount);
+  }
 }
 
 ConvolverNode::ConvolverNode(AudioContext* aContext)
   : AudioNode(aContext,
               2,
               ChannelCountMode::Clamped_max,
               ChannelInterpretation::Speakers)
   , mNormalize(true)
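
Editor's note: the tail accounting added in ProcessBlock() above can be followed with the counter sketch below (not part of the patch; kBlockSize and kTailFrames are made-up stand-ins for WEBAUDIO_BLOCK_SIZE and mReverb->impulseResponseLength()). It walks a run of mono-input blocks that follow a stereo section and shows why mRemainingRightHistory can outlive mRemainingRightOutput: the node's output drops back to mono once the right output tail ends, while the second convolver may keep a second output buffer for as long as its up-mixed input is non-silent.

// Standalone sketch (not part of the patch) of the per-block tail accounting
// in ProcessBlock().  After the last stereo input block, the right output
// channel persists only for the tail time, while the second convolver may
// keep processing (and so keep a second output buffer) for longer.
#include <cstdio>

int main()
{
  const int kBlockSize = 128;   // stands in for WEBAUDIO_BLOCK_SIZE
  const int kTailFrames = 500;  // stands in for mReverb->impulseResponseLength()

  // False models Direct mode with "speakers" up-mixing, where the second
  // convolver keeps receiving the (non-silent) up-mixed mono channel.
  const bool secondConvolverInputIsSilent = false;

  // State right after the last stereo block, as set in ProcessBlock():
  int remainingRightOutput = kTailFrames + kBlockSize;
  int remainingRightHistory = remainingRightOutput;

  for (int block = 1; block <= 8; ++block) {  // subsequent mono-input blocks
    if (!secondConvolverInputIsSilent && remainingRightHistory > 0) {
      // Non-silent input: the second convolver will not drain, so its
      // history window is refreshed every block.
      remainingRightHistory = kTailFrames + kBlockSize;
    }

    int outputChannels = 2;
    int reverbOutputChannels = 2;
    if (remainingRightOutput > 0) {
      remainingRightOutput -= kBlockSize;
    } else {
      outputChannels = 1;        // right output tail has ended
    }
    if (remainingRightHistory > 0) {
      remainingRightHistory -= kBlockSize;
    } else {
      reverbOutputChannels = 1;  // second convolver has drained
    }
    std::printf("mono block %d: node output channels = %d, "
                "convolver output buffers = %d\n",
                block, outputChannels, reverbOutputChannels);
  }
  return 0;
}
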
--- a/testing/web-platform/meta/webaudio/the-audio-api/the-convolvernode-interface/convolver-response-1-chan.html.ini
+++ b/testing/web-platform/meta/webaudio/the-audio-api/the-convolvernode-interface/convolver-response-1-chan.html.ini
@@ -1,15 +1,9 @@
 [convolver-response-1-chan.html]
-  [X 1: Channel 1: Expected 0 for all values but found 1280 unexpected values: \n\tIndex\tActual\n\t[0\]\t-1.1920928955078125e-7\n\t[1\]\t-4.470348358154297e-8\n\t[2\]\t0.3311062455177307\n\t[3\]\t0.6248593926429749\n\t...and 1276 more errors.]
-    expected: FAIL
-
-  [< [1-channel input\] 1 out of 2 assertions were failed.]
-    expected: FAIL
-
   [X 2: Channel 0 expected to be equal to the array [0,0,0.9458408951759338,0.8448333740234375,0.8210252523422241,0.8620985746383667,0.8430315852165222,0.855602502822876,0.7933436632156372,0.9865825176239014,0.3972480297088623,-0.7786127924919128,-0.9223549962043762,-0.7896472215652466,-0.8727429509162903,-0.8325281143188477...\] but differs in 966 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t2.9802322387695313e-8\t0.0000000000000000e+0\n\t[1\]\t-7.4505805969238281e-8\t0.0000000000000000e+0\n\t[2\]\t9.4584077596664429e-1\t9.4584089517593384e-1\n\t[3\]\t8.4483331441879272e-1\t8.4483337402343750e-1\n\t...and 962 more errors.]
     expected: FAIL
 
   [X 2: Channel 1 expected to be equal to the array [0,0,0.9918842315673828,0.7683960199356079,0.9083511829376221,0.7684863805770874,0.9814503192901611,0.3193226158618927,-0.9322392344474792,-0.8032255172729492,-0.8812425136566162,-0.7985008358955383,-0.9260328412055969,-0.600982666015625,0.7887306809425354,0.8655399680137634...\] but differs in 1034 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t-5.9604644775390625e-8\t0.0000000000000000e+0\n\t[1\]\t4.4703483581542969e-8\t0.0000000000000000e+0\n\t[3\]\t7.6839596033096313e-1\t7.6839601993560791e-1\n\t[4\]\t9.0835124254226685e-1\t9.0835118293762207e-1\n\t...and 1030 more errors.]
     expected: FAIL
 
   [< [2-channel input\] 2 out of 2 assertions were failed.]
     expected: FAIL
@@ -36,20 +30,17 @@
     expected: FAIL
 
   [X 5.1: Channel 1 expected to be equal to the array [0,0,2.4002127647399902,1.8464014530181885,1.242234230041504,0.578858494758606,0.3615039587020874,0.16441935300827026,-0.7429117560386658,-1.5301964282989502,-1.898935079574585,-0.7277188301086426,0.01055973768234253,0.7105643153190613,1.7486152648925781,0.26711004972457886...\] but differs in 1033 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t2.9802322387695313e-7\t0.0000000000000000e+0\n\t[1\]\t-1.8626451492309570e-8\t0.0000000000000000e+0\n\t[4\]\t1.2422341108322144e+0\t1.2422342300415039e+0\n\t[5\]\t5.7885855436325073e-1\t5.7885849475860596e-1\n\t...and 1029 more errors.]
     expected: FAIL
 
   [< [5.1-channel input\] 2 out of 2 assertions were failed.]
     expected: FAIL
 
-  [# AUDIT TASK RUNNER FINISHED: 5 out of 6 tasks were failed.]
-    expected: FAIL
-
-  [X 1: Channel 1: Expected 0 for all values but found 1279 unexpected values: \n\tIndex\tActual\n\t[1\]\t-2.9802322387695312e-8\n\t[2\]\t0.33110618591308594\n\t[3\]\t0.6248594522476196\n\t[4\]\t0.8481202721595764\n\t...and 1275 more errors.]
+  [# AUDIT TASK RUNNER FINISHED: 4 out of 6 tasks were failed.]
     expected: FAIL
 
   [X 2: Channel 0 expected to be equal to the array [0,0,0.9458407163619995,0.844833254814148,0.821025013923645,0.8620984554290771,0.8430314660072327,0.8556023836135864,0.7933435440063477,0.9865822792053223,0.39724797010421753,-0.7786126136779785,-0.9223548769950867,-0.7896471619606018,-0.8727428317070007,-0.8325279355049133...\] but differs in 993 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t-2.0861625671386719e-7\t0.0000000000000000e+0\n\t[1\]\t-2.9802322387695313e-8\t0.0000000000000000e+0\n\t[2\]\t9.4584059715270996e-1\t9.4584071636199951e-1\n\t[4\]\t8.2102489471435547e-1\t8.2102501392364502e-1\n\t...and 989 more errors.]
     expected: FAIL
 
   [X 2: Channel 1 expected to be equal to the array [0,0,0.9918840527534485,0.7683959007263184,0.9083510637283325,0.7684863805770874,0.9814502000808716,0.31932249665260315,-0.9322391152381897,-0.8032253384590149,-0.8812423348426819,-0.7985007762908936,-0.9260326027870178,-0.6009824872016907,0.7887305617332458,0.8655398488044739...\] but differs in 1078 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t5.9604644775390625e-8\t0.0000000000000000e+0\n\t[1\]\t2.9802322387695313e-8\t0.0000000000000000e+0\n\t[2\]\t9.9188399314880371e-1\t9.9188405275344849e-1\n\t[3\]\t7.6839596033096313e-1\t7.6839590072631836e-1\n\t...and 1074 more errors.]
     expected: FAIL