Bug 525444 - (Part 2/3) Added speech service API. r=smaug
authorEitan Isaacson <eitan@monotonous.org>
Wed, 03 Apr 2013 15:13:17 -0700
changeset 127553 408e6df83e2cd851e6194b565d3efee644a5f512
parent 127552 9d72648fb76932f8d72a2a794190ec19cb3e4d2d
child 127554 ddaf0ebcd9277f87a2e91525e93700de1ecb0386
push id 1655
push user bhackett@mozilla.com
push date Thu, 11 Apr 2013 23:17:41 +0000
reviewers smaug
bugs 525444
milestone 23.0a1
content/media/webspeech/synth/Makefile.in
content/media/webspeech/synth/SpeechSynthesis.cpp
content/media/webspeech/synth/SpeechSynthesis.h
content/media/webspeech/synth/SpeechSynthesisUtterance.h
content/media/webspeech/synth/SpeechSynthesisVoice.h
content/media/webspeech/synth/moz.build
content/media/webspeech/synth/nsISpeechService.idl
content/media/webspeech/synth/nsISynthVoiceRegistry.idl
content/media/webspeech/synth/nsSpeechTask.cpp
content/media/webspeech/synth/nsSpeechTask.h
content/media/webspeech/synth/nsSynthVoiceRegistry.cpp
content/media/webspeech/synth/nsSynthVoiceRegistry.h
content/media/webspeech/synth/test/Makefile.in
content/media/webspeech/synth/test/common.js
content/media/webspeech/synth/test/test_speech_queue.html
content/media/webspeech/synth/test/test_speech_simple.html
--- a/content/media/webspeech/synth/Makefile.in
+++ b/content/media/webspeech/synth/Makefile.in
@@ -24,25 +24,27 @@ LOCAL_INCLUDES += $(VPATH:%=-I%)
 CPPSRCS := \
   EnableSpeechSynthesisCheck.cpp \
   SpeechSynthesisUtterance.cpp \
   SpeechSynthesisVoice.cpp \
   SpeechSynthesis.cpp \
   SpeechSynthesisChild.cpp \
   SpeechSynthesisParent.cpp \
   nsSynthVoiceRegistry.cpp \
+  nsSpeechTask.cpp \
   $(NULL)
 
 EXPORTS_NAMESPACES := mozilla/dom
 EXPORTS_mozilla/dom := \
   EnableSpeechSynthesisCheck.h \
   SpeechSynthesisUtterance.h \
   SpeechSynthesisVoice.h \
   SpeechSynthesis.h \
   SpeechSynthesisChild.h \
   SpeechSynthesisParent.h \
   nsSynthVoiceRegistry.h \
+  nsSpeechTask.h \
   $(NULL)
 
 FORCE_STATIC_LIB := 1
 
 include $(topsrcdir)/config/config.mk
 include $(topsrcdir)/config/rules.mk
--- a/content/media/webspeech/synth/SpeechSynthesis.cpp
+++ b/content/media/webspeech/synth/SpeechSynthesis.cpp
@@ -1,14 +1,15 @@
 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim:set ts=2 sw=2 sts=2 et cindent: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include "nsSpeechTask.h"
 #include "prlog.h"
 
 #include "mozilla/dom/Element.h"
 
 #include "mozilla/dom/SpeechSynthesisBinding.h"
 #include "SpeechSynthesis.h"
 #include "nsSynthVoiceRegistry.h"
 
@@ -38,22 +39,24 @@ TraverseCachedVoices(const nsAString& aK
 {
   nsCycleCollectionTraversalCallback* cb = static_cast<nsCycleCollectionTraversalCallback*>(aData);
   cb->NoteXPCOMChild(aEntry);
   return PL_DHASH_NEXT;
 }
 
 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(SpeechSynthesis)
   NS_IMPL_CYCLE_COLLECTION_UNLINK(mParent)
+  NS_IMPL_CYCLE_COLLECTION_UNLINK(mCurrentTask)
   NS_IMPL_CYCLE_COLLECTION_UNLINK_PRESERVED_WRAPPER
   tmp->mVoiceCache.Clear();
 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
 
 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(SpeechSynthesis)
   NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mParent)
+  NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mCurrentTask)
   NS_IMPL_CYCLE_COLLECTION_TRAVERSE_SCRIPT_OBJECTS
   tmp->mVoiceCache.EnumerateRead(TraverseCachedVoices, &cb);
 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
 
 NS_IMPL_CYCLE_COLLECTION_TRACE_BEGIN(SpeechSynthesis)
   NS_IMPL_CYCLE_COLLECTION_TRACE_PRESERVED_WRAPPER
 NS_IMPL_CYCLE_COLLECTION_TRACE_END
 
@@ -159,41 +162,62 @@ SpeechSynthesis::AdvanceQueue()
   if (doc) {
     Element* elm = doc->GetHtmlElement();
 
     if (elm) {
       elm->GetLang(docLang);
     }
   }
 
+  mCurrentTask =
+    nsSynthVoiceRegistry::GetInstance()->SpeakUtterance(*utterance, docLang);
+
+  if (mCurrentTask) {
+    mCurrentTask->SetSpeechSynthesis(this);
+  }
+
   return;
 }
 
 void
 SpeechSynthesis::Cancel()
 {
+  mSpeechQueue.Clear();
+
+  if (mCurrentTask) {
+    mCurrentTask->Cancel();
+  }
 }
 
 void
 SpeechSynthesis::Pause()
 {
+  if (mCurrentTask) {
+    mCurrentTask->Pause();
+  }
 }
 
 void
 SpeechSynthesis::Resume()
 {
+  if (mCurrentTask) {
+    mCurrentTask->Resume();
+  }
 }
 
 void
-SpeechSynthesis::OnEnd()
+SpeechSynthesis::OnEnd(const nsSpeechTask* aTask)
 {
+  MOZ_ASSERT(mCurrentTask == aTask);
+
   if (!mSpeechQueue.IsEmpty()) {
     mSpeechQueue.RemoveElementAt(0);
   }
 
+  mCurrentTask = nullptr;
   AdvanceQueue();
 }
 
 void
 SpeechSynthesis::GetVoices(nsTArray< nsRefPtr<SpeechSynthesisVoice> >& aResult)
 {
   aResult.Clear();
   uint32_t voiceCount = 0;
--- a/content/media/webspeech/synth/SpeechSynthesis.h
+++ b/content/media/webspeech/synth/SpeechSynthesis.h
@@ -47,25 +47,27 @@ public:
   void Speak(SpeechSynthesisUtterance& aUtterance);
 
   void Cancel();
 
   void Pause();
 
   void Resume();
 
-  void OnEnd();
+  void OnEnd(const nsSpeechTask* aTask);
 
   void GetVoices(nsTArray< nsRefPtr<SpeechSynthesisVoice> >& aResult);
 
 private:
 
   void AdvanceQueue();
 
   nsCOMPtr<nsPIDOMWindow> mParent;
 
   nsTArray<nsRefPtr<SpeechSynthesisUtterance>> mSpeechQueue;
 
+  nsRefPtr<nsSpeechTask> mCurrentTask;
+
   nsRefPtrHashtable<nsStringHashKey, SpeechSynthesisVoice> mVoiceCache;
 };
 
 } // namespace dom
 } // namespace mozilla
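
Taken together, Speak(), AdvanceQueue() and OnEnd() implement the utterance
queue declared above. For orientation, a minimal sketch of how page script
exercises it; this is plain Web Speech API usage, not part of the patch:

  var u1 = new SpeechSynthesisUtterance("First utterance");
  var u2 = new SpeechSynthesisUtterance("Second utterance");

  u1.addEventListener('end', function(e) {
    // When the current task finishes, OnEnd() pops the queue and
    // AdvanceQueue() starts speaking u2.
    console.log("u1 finished after " + e.elapsedTime + " seconds");
  });

  speechSynthesis.speak(u1);  // spoken immediately
  speechSynthesis.speak(u2);  // pending until u1 ends
  speechSynthesis.pause();    // forwarded to the current nsSpeechTask
  speechSynthesis.resume();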
--- a/content/media/webspeech/synth/SpeechSynthesisUtterance.h
+++ b/content/media/webspeech/synth/SpeechSynthesisUtterance.h
@@ -6,28 +6,33 @@
 
 #pragma once
 
 #include "nsCOMPtr.h"
 #include "nsDOMEventTargetHelper.h"
 #include "nsString.h"
 
 #include "EnableSpeechSynthesisCheck.h"
+#include "nsSpeechTask.h"
 
 struct JSContext;
 
 namespace mozilla {
 namespace dom {
 
 class SpeechSynthesisVoice;
+class SpeechSynthesis;
+class nsSynthVoiceRegistry;
 
 class SpeechSynthesisUtterance MOZ_FINAL : public nsDOMEventTargetHelper,
                                            public EnableSpeechSynthesisCheck
 {
   friend class SpeechSynthesis;
+  friend class nsSpeechTask;
+  friend class nsSynthVoiceRegistry;
 
 public:
   SpeechSynthesisUtterance(const nsAString& aText);
   virtual ~SpeechSynthesisUtterance();
 
   NS_DECL_ISUPPORTS_INHERITED
 
   NS_FORWARD_NSIDOMEVENTTARGET(nsDOMEventTargetHelper::)
--- a/content/media/webspeech/synth/SpeechSynthesisVoice.h
+++ b/content/media/webspeech/synth/SpeechSynthesisVoice.h
@@ -6,28 +6,31 @@
 
 #pragma once
 
 #include "nsCOMPtr.h"
 #include "nsString.h"
 #include "nsWrapperCache.h"
 
 #include "EnableSpeechSynthesisCheck.h"
+#include "nsISpeechService.h"
 
 struct JSContext;
 
 namespace mozilla {
 namespace dom {
 
+class nsSynthVoiceRegistry;
 class SpeechSynthesis;
 
 class SpeechSynthesisVoice MOZ_FINAL : public nsISupports,
                                        public nsWrapperCache,
                                        public EnableSpeechSynthesisCheck
 {
+  friend class nsSynthVoiceRegistry;
   friend class SpeechSynthesis;
 
 public:
   SpeechSynthesisVoice(nsISupports* aParent, const nsAString& aUri);
 
   virtual ~SpeechSynthesisVoice();
 
   NS_DECL_CYCLE_COLLECTING_ISUPPORTS
--- a/content/media/webspeech/synth/moz.build
+++ b/content/media/webspeech/synth/moz.build
@@ -6,10 +6,11 @@
 MODULE = 'content'
 
 TEST_DIRS += ['test']
 
 XPIDL_MODULE = 'dom_webspeechsynth'
 
 XPIDL_SOURCES += [
     'nsIDOMSpeechSynthesisEvent.idl',
+    'nsISpeechService.idl',
     'nsISynthVoiceRegistry.idl'
     ]
new file mode 100644
--- /dev/null
+++ b/content/media/webspeech/synth/nsISpeechService.idl
@@ -0,0 +1,157 @@
+/* -*- Mode: IDL; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsISupports.idl"
+
+typedef unsigned short SpeechServiceType;
+
+/**
+ * A callback implemented by the service. Direct audio services are required
+ * to implement these methods; for other services, the cancel method can
+ * still be helpful for shutting down the speech resources.
+ */
+[scriptable, uuid(408251b0-1d7b-4876-888f-718859ce8c9d)]
+interface nsISpeechTaskCallback : nsISupports
+{
+  /**
+   * The user or application has paused the speech.
+   */
+  void onPause();
+
+  /**
+   * The user or application has resumed the speech.
+   */
+  void onResume();
+
+  /**
+   * The user or application has canceled the speech.
+   */
+  void onCancel();
+};
+
+
+/**
+ * A task is associated with a single utterance. It is provided by the browser
+ * to the service in the speak() method.
+ */
+[scriptable, builtinclass, uuid(3a60c397-7a04-4cf7-99ea-7432e7a0a1c1)]
+interface nsISpeechTask : nsISupports
+{
+  /**
+   * Prepare browser for speech.
+   *
+   * @param aCallback callback object for mid-speech operations.
+   * @param aChannels number of audio channels. Only required for
+   *                    direct audio services.
+   * @param aRate     audio rate. Only required for direct audio services.
+   */
+  [optional_argc] void setup(in nsISpeechTaskCallback aCallback,
+                             [optional] in uint32_t aChannels,
+                             [optional] in uint32_t aRate);
+
+  /**
+   * Send audio data to browser.
+   *
+   * @param aData     an Int16Array with PCM-16 audio data.
+   * @param aLandmarks an array of sample offset and landmark pairs.
+   *                     Used for emitting boundary and mark events.
+   */
+  [implicit_jscontext]
+  void sendAudio(in jsval aData, in jsval aLandmarks);
+
+  /**
+   * Dispatch start event.
+   */
+  void dispatchStart();
+
+  /**
+   * Dispatch end event.
+   *
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchEnd(in float aElapsedTime, in unsigned long aCharIndex);
+
+  /**
+   * Dispatch pause event. Should not be called directly by service.
+   *
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchPause(in float aElapsedTime, in unsigned long aCharIndex);
+
+  /**
+   * Dispatch resume event. Should not be called directly by service.
+   *
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchResume(in float aElapsedTime, in unsigned long aCharIndex);
+
+  /**
+   * Dispatch error event.
+   *
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchError(in float aElapsedTime, in unsigned long aCharIndex);
+
+  /**
+   * Dispatch boundary event.
+   *
+   * @param aName        name of boundary, 'word' or 'sentence'.
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchBoundary(in DOMString aName, in float aElapsedTime,
+                        in unsigned long aCharIndex);
+
+  /**
+   * Dispatch mark event.
+   *
+   * @param aName        mark identifier.
+   * @param aElapsedTime time in seconds since speech has started.
+   * @param aCharIndex   offset of spoken characters.
+   */
+  void dispatchMark(in DOMString aName, in float aElapsedTime, in unsigned long aCharIndex);
+};
+
+/**
+ * The main interface of a speech synthesis service.
+ *
+ * A service's speak method can be implemented in one of two ways:
+ *  1. Indirect audio - the service is responsible for outputting audio.
+ *    The service calls the nsISpeechTask.dispatch* methods directly, starting
+ *    with dispatchStart() and ending with dispatchEnd() or dispatchError().
+ *
+ *  2. Direct audio - the service provides us with PCM-16 data, and we output it.
+ *    The service does not call the task's dispatch methods directly. Instead,
+ *    audio information is provided at setup(), and audio data is sent with
+ *    sendAudio(). The utterance is terminated with an empty sendAudio() call.
+ */
+[scriptable, uuid(3952d388-050c-47ba-a70f-5fc1cadf1db0)]
+interface nsISpeechService : nsISupports
+{
+  /**
+   * Speak the given text using the voice identified by the given uri. See
+   * W3C Speech API spec for information about pitch and rate.
+   * https://dvcs.w3.org/hg/speech-api/raw-file/tip/speechapi.html#utterance-attributes
+   *
+   * @param aText  text to utter.
+   * @param aUri   unique voice identifier.
+   * @param aRate  rate at which to speak the text.
+   * @param aPitch pitch at which to speak the text.
+   * @param aTask  task instance for the utterance, used for sending events or
+   *                 audio data back to the browser.
+   */
+  void speak(in DOMString aText, in DOMString aUri,
+             in float aRate, in float aPitch,
+             in nsISpeechTask aTask);
+
+  const SpeechServiceType SERVICETYPE_DIRECT_AUDIO = 1;
+  const SpeechServiceType SERVICETYPE_INDIRECT_AUDIO = 2;
+
+  readonly attribute SpeechServiceType serviceType;
+};
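
For illustration, here is what a minimal indirect-audio service could look
like in chrome JavaScript, along the lines of the test services added later
in this patch. The names are made up; XPCOMUtils.generateQI is the usual
QueryInterface helper:

  Components.utils.import("resource://gre/modules/XPCOMUtils.jsm");
  const Ci = Components.interfaces;

  var SampleIndirectService = {
    serviceType: Ci.nsISpeechService.SERVICETYPE_INDIRECT_AUDIO,

    speak: function(aText, aUri, aRate, aPitch, aTask) {
      // The callback lets the browser pause, resume or cancel our engine.
      aTask.setup({
        QueryInterface: XPCOMUtils.generateQI([Ci.nsISpeechTaskCallback]),
        onPause: function() { /* pause the engine */ },
        onResume: function() { /* resume the engine */ },
        onCancel: function() { /* stop the engine, free its resources */ }
      });
      aTask.dispatchStart();
      // ...hand aText to a platform TTS engine here...
      aTask.dispatchEnd(aText.length / 2.0, aText.length);
    },

    QueryInterface: XPCOMUtils.generateQI([Ci.nsISpeechService])
  };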
--- a/content/media/webspeech/synth/nsISynthVoiceRegistry.idl
+++ b/content/media/webspeech/synth/nsISynthVoiceRegistry.idl
@@ -1,38 +1,40 @@
 /* -*- Mode: IDL; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "nsISupports.idl"
 
+interface nsISpeechService;
+
 [scriptable, builtinclass, uuid(53dcc868-4193-4c3c-a1d9-fe5a0a6af2fb)]
 interface nsISynthVoiceRegistry : nsISupports
 {
   /**
    * Register a speech synthesis voice.
    *
    * @param aService      the service that provides this voice.
    * @param aUri          a unique identifier for this voice.
    * @param aName         human-readable name for this voice.
    * @param aLang         a BCP 47 language tag.
    * @param aLocalService true if service does not require network.
    */
-  void addVoice(in nsISupports aService, in DOMString aUri,
+  void addVoice(in nsISpeechService aService, in DOMString aUri,
                 in DOMString aName, in DOMString aLang,
                 in boolean aLocalService);
 
   /**
    * Remove a speech synthesis voice.
    *
    * @param aService the service that was used to add the voice.
    * @param aUri     a unique identifier of an existing voice.
    */
-  void removeVoice(in nsISupports aService, in DOMString aUri);
+  void removeVoice(in nsISpeechService aService, in DOMString aUri);
 
   /**
    * Set a voice as default.
    *
    * @param aUri       a unique identifier of an existing voice.
    * @param aIsDefault true if this voice should be toggled as default.
    */
   void setDefaultVoice(in DOMString aUri, in boolean aIsDefault);
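
A service only becomes reachable from content once its voices are registered.
Continuing the sketch above (the contract ID is the one the tests below use;
the voice URI is illustrative):

  var registry = Components.classes["@mozilla.org/synth-voice-registry;1"]
                           .getService(Ci.nsISynthVoiceRegistry);

  registry.addVoice(SampleIndirectService, "urn:moz-tts:sample:bob",
                    "Bob", "en-GB", true /* local service, no network */);
  registry.setDefaultVoice("urn:moz-tts:sample:bob", true);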
new file mode 100644
--- /dev/null
+++ b/content/media/webspeech/synth/nsSpeechTask.cpp
@@ -0,0 +1,479 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioSegment.h"
+#include "nsSpeechTask.h"
+#include "SpeechSynthesis.h"
+
+#undef LOG
+#ifdef PR_LOGGING
+extern PRLogModuleInfo* GetSpeechSynthLog();
+#define LOG(type, msg) PR_LOG(GetSpeechSynthLog(), type, msg)
+#else
+#define LOG(type, msg)
+#endif
+
+namespace mozilla {
+namespace dom {
+
+class SynthStreamListener : public MediaStreamListener
+{
+public:
+  SynthStreamListener(nsSpeechTask* aSpeechTask) :
+    mSpeechTask(aSpeechTask),
+    mStarted(false)
+  {
+  }
+
+  void DoNotifyStarted()
+  {
+    if (mSpeechTask) {
+      mSpeechTask->DispatchStartImpl();
+    }
+  }
+
+  void DoNotifyFinished()
+  {
+    if (mSpeechTask) {
+      mSpeechTask->DispatchEndImpl(mSpeechTask->GetCurrentTime(),
+                                   mSpeechTask->GetCurrentCharOffset());
+    }
+  }
+
+  virtual void NotifyFinished(MediaStreamGraph* aGraph)
+  {
+    nsCOMPtr<nsIRunnable> event =
+      NS_NewRunnableMethod(this, &SynthStreamListener::DoNotifyFinished);
+    aGraph->DispatchToMainThreadAfterStreamStateUpdate(event.forget());
+  }
+
+  virtual void NotifyBlockingChanged(MediaStreamGraph* aGraph, Blocking aBlocked)
+  {
+    if (aBlocked == Blocking::UNBLOCKED && !mStarted) {
+      mStarted = true;
+      nsCOMPtr<nsIRunnable> event =
+        NS_NewRunnableMethod(this, &SynthStreamListener::DoNotifyStarted);
+      aGraph->DispatchToMainThreadAfterStreamStateUpdate(event.forget());
+    }
+  }
+
+private:
+  // Raw pointer; if we exist, the stream exists,
+  // and 'mSpeechTask' exclusively owns it and therefore exists as well.
+  nsSpeechTask* mSpeechTask;
+
+  bool mStarted;
+};
+
+// nsSpeechTask
+
+NS_IMPL_CYCLE_COLLECTION_1(nsSpeechTask, mSpeechSynthesis);
+
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsSpeechTask)
+  NS_INTERFACE_MAP_ENTRY(nsISpeechTask)
+  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTask)
+NS_INTERFACE_MAP_END
+
+NS_IMPL_CYCLE_COLLECTING_ADDREF(nsSpeechTask)
+NS_IMPL_CYCLE_COLLECTING_RELEASE(nsSpeechTask)
+
+nsSpeechTask::nsSpeechTask(SpeechSynthesisUtterance* aUtterance)
+  : mUtterance(aUtterance)
+  , mCallback(nullptr)
+  , mIndirectAudio(false)
+{
+  mText = aUtterance->mText;
+  mVolume = aUtterance->Volume();
+}
+
+nsSpeechTask::nsSpeechTask(float aVolume, const nsAString& aText)
+  : mUtterance(nullptr)
+  , mVolume(aVolume)
+  , mText(aText)
+  , mCallback(nullptr)
+  , mIndirectAudio(false)
+{
+}
+
+nsSpeechTask::~nsSpeechTask()
+{
+  if (mStream) {
+    if (!mStream->IsDestroyed()) {
+      mStream->Destroy();
+    }
+
+    mStream = nullptr;
+  }
+}
+
+NS_IMETHODIMP
+nsSpeechTask::Setup(nsISpeechTaskCallback* aCallback,
+                    uint32_t aChannels, uint32_t aRate, uint8_t argc)
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::Setup"));
+
+  mCallback = aCallback;
+
+  if (argc < 2) {
+    return NS_OK;
+  }
+
+  if (mIndirectAudio) {
+    NS_WARNING("Audio info arguments in Setup() are ignored for indirect audio services.");
+  }
+
+  // XXX: Is there setup overhead here that hurts latency?
+  mStream = MediaStreamGraph::GetInstance()->CreateSourceStream(nullptr);
+  mStream->AddListener(new SynthStreamListener(this));
+
+  // XXX: Support more than one channel
+  NS_ENSURE_TRUE(aChannels == 1, NS_ERROR_FAILURE);
+
+  mChannels = aChannels;
+
+  AudioSegment* segment = new AudioSegment();
+  mStream->AddTrack(1, aRate, 0, segment);
+  mStream->AddAudioOutput(this);
+  mStream->SetAudioOutputVolume(this, mVolume);
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::SendAudio(const JS::Value& aData, const JS::Value& aLandmarks,
+                        JSContext* aCx)
+{
+  NS_ENSURE_TRUE(mStream, NS_ERROR_NOT_AVAILABLE);
+  NS_ENSURE_FALSE(mStream->IsDestroyed(), NS_ERROR_NOT_AVAILABLE);
+  NS_ENSURE_TRUE(mChannels, NS_ERROR_FAILURE);
+
+  if (mIndirectAudio) {
+    NS_WARNING("Can't call SendAudio from an indirect audio speech service.");
+    return NS_ERROR_FAILURE;
+  }
+
+  JSAutoRequest ar(aCx);
+  JS::AutoObjectRooter tvr(aCx);
+
+  JSObject* darray = &aData.toObject();
+  JSAutoCompartment ac(aCx, darray);
+
+  JSObject* tsrc = nullptr;
+
+  // Allow either Int16Array or plain JS Array
+  if (JS_IsInt16Array(darray)) {
+    tsrc = darray;
+  } else if (JS_IsArrayObject(aCx, darray)) {
+    JSObject* nobj = JS_NewInt16ArrayFromArray(aCx, darray);
+
+    if (!nobj) {
+      return NS_ERROR_DOM_TYPE_MISMATCH_ERR;
+    }
+
+    tsrc = nobj;
+  } else {
+    return NS_ERROR_DOM_TYPE_MISMATCH_ERR;
+  }
+
+  tvr.setObject(tsrc);
+
+  uint32_t dataLength = JS_GetTypedArrayLength(tsrc);
+
+  if (dataLength == 0) {
+    // XXX: We should end the track too, but an undetermined bug prevents that.
+    mStream->Finish();
+    return NS_OK;
+  }
+
+  nsRefPtr<mozilla::SharedBuffer> samples =
+    SharedBuffer::Create(dataLength * sizeof(int16_t));
+  int16_t* frames = static_cast<int16_t*>(samples->Data());
+  int16_t* sframes = JS_GetInt16ArrayData(tsrc);
+
+  for (uint32_t i = 0; i < dataLength; i++) {
+    frames[i] = sframes[i];
+  }
+
+  AudioSegment segment;
+  nsAutoTArray<const int16_t*, 1> channelData;
+  channelData.AppendElement(frames);
+  segment.AppendFrames(samples.forget(), channelData, dataLength);
+  mStream->AppendToTrack(1, &segment);
+  mStream->AdvanceKnownTracksTime(STREAM_TIME_MAX);
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchStart()
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchStart() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchStartImpl();
+}
+
+nsresult
+nsSpeechTask::DispatchStartImpl()
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::DispatchStart"));
+
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_TRUE(mUtterance->mState == SpeechSynthesisUtterance::STATE_PENDING,
+                 NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->mState = SpeechSynthesisUtterance::STATE_SPEAKING;
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("start"), 0, 0,
+                                           NS_LITERAL_STRING(""));
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchEnd(float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchEnd() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchEndImpl(aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchEndImpl(float aElapsedTime, uint32_t aCharIndex)
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::DispatchEnd\n"));
+
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_FALSE(mUtterance->mState == SpeechSynthesisUtterance::STATE_ENDED,
+                  NS_ERROR_NOT_AVAILABLE);
+
+  // XXX: This should not be here, but it prevents a crash in MSG.
+  if (mStream) {
+    mStream->Destroy();
+  }
+
+  if (mSpeechSynthesis) {
+    mSpeechSynthesis->OnEnd(this);
+  }
+
+  mUtterance->mState = SpeechSynthesisUtterance::STATE_ENDED;
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("end"),
+                                           aCharIndex, aElapsedTime,
+                                           NS_LITERAL_STRING(""));
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchPause(float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchPause() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchPauseImpl(aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchPauseImpl(float aElapsedTime, uint32_t aCharIndex)
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::DispatchPause"));
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_FALSE(mUtterance->mPaused, NS_ERROR_NOT_AVAILABLE);
+  NS_ENSURE_FALSE(mUtterance->mState == SpeechSynthesisUtterance::STATE_ENDED,
+                  NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->mPaused = true;
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("pause"),
+                                           aCharIndex, aElapsedTime,
+                                           NS_LITERAL_STRING(""));
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchResume(float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchResume() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchResumeImpl(aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchResumeImpl(float aElapsedTime, uint32_t aCharIndex)
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::DispatchResume"));
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_TRUE(mUtterance->mPaused, NS_ERROR_NOT_AVAILABLE);
+  NS_ENSURE_FALSE(mUtterance->mState == SpeechSynthesisUtterance::STATE_ENDED,
+                  NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->mPaused = false;
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("resume"),
+                                           aCharIndex, aElapsedTime,
+                                           NS_LITERAL_STRING(""));
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchError(float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchError() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchErrorImpl(aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchErrorImpl(float aElapsedTime, uint32_t aCharIndex)
+{
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_FALSE(mUtterance->mState == SpeechSynthesisUtterance::STATE_ENDED,
+                  NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->mState = SpeechSynthesisUtterance::STATE_ENDED;
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("error"),
+                                           aCharIndex, aElapsedTime,
+                                           NS_LITERAL_STRING(""));
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchBoundary(const nsAString& aName,
+                               float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchBoundary() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchBoundaryImpl(aName, aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchBoundaryImpl(const nsAString& aName,
+                               float aElapsedTime, uint32_t aCharIndex)
+{
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_TRUE(mUtterance->mState == SpeechSynthesisUtterance::STATE_SPEAKING,
+                 NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("boundary"),
+                                           aCharIndex, aElapsedTime,
+                                           aName);
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsSpeechTask::DispatchMark(const nsAString& aName,
+                           float aElapsedTime, uint32_t aCharIndex)
+{
+  if (!mIndirectAudio) {
+    NS_WARNING("Can't call DispatchMark() from a direct audio speech service");
+    return NS_ERROR_FAILURE;
+  }
+
+  return DispatchMarkImpl(aName, aElapsedTime, aCharIndex);
+}
+
+nsresult
+nsSpeechTask::DispatchMarkImpl(const nsAString& aName,
+                               float aElapsedTime, uint32_t aCharIndex)
+{
+  MOZ_ASSERT(mUtterance);
+  NS_ENSURE_TRUE(mUtterance->mState == SpeechSynthesisUtterance::STATE_SPEAKING,
+                 NS_ERROR_NOT_AVAILABLE);
+
+  mUtterance->DispatchSpeechSynthesisEvent(NS_LITERAL_STRING("mark"),
+                                           aCharIndex, aElapsedTime,
+                                           aName);
+  return NS_OK;
+}
+
+void
+nsSpeechTask::Pause()
+{
+  if (mUtterance->IsPaused() ||
+      mUtterance->GetState() == SpeechSynthesisUtterance::STATE_ENDED) {
+    return;
+  }
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnPause();
+    NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Unable to call onPause() callback");
+  }
+
+  if (mStream) {
+    mStream->ChangeExplicitBlockerCount(1);
+  }
+
+  DispatchPauseImpl(GetCurrentTime(), GetCurrentCharOffset());
+}
+
+void
+nsSpeechTask::Resume()
+{
+  if (!mUtterance->IsPaused()) {
+    return;
+  }
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnResume();
+    NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Unable to call onResume() callback");
+  }
+
+  if (mStream) {
+    mStream->ChangeExplicitBlockerCount(-1);
+  }
+
+  DispatchResumeImpl(GetCurrentTime(), GetCurrentCharOffset());
+}
+
+void
+nsSpeechTask::Cancel()
+{
+  LOG(PR_LOG_DEBUG, ("nsSpeechTask::Cancel"));
+
+  if (mCallback) {
+    DebugOnly<nsresult> rv = mCallback->OnCancel();
+    NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Unable to call onCancel() callback");
+  }
+
+  if (mStream) {
+    mStream->ChangeExplicitBlockerCount(1);
+  }
+
+  DispatchEndImpl(GetCurrentTime(), GetCurrentCharOffset());
+}
+
+float
+nsSpeechTask::GetCurrentTime()
+{
+  return mStream ? (float)(mStream->GetCurrentTime() / 1000000.0) : 0;
+}
+
+uint32_t
+nsSpeechTask::GetCurrentCharOffset()
+{
+  return mStream && mStream->IsFinished() ? mText.Length() : 0;
+}
+
+void
+nsSpeechTask::SetSpeechSynthesis(SpeechSynthesis* aSpeechSynthesis)
+{
+  mSpeechSynthesis = aSpeechSynthesis;
+}
+
+} // namespace dom
+} // namespace mozilla
new file mode 100644
--- /dev/null
+++ b/content/media/webspeech/synth/nsSpeechTask.h
@@ -0,0 +1,88 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#pragma once
+
+#include "MediaStreamGraph.h"
+#include "SpeechSynthesisUtterance.h"
+#include "nsISpeechService.h"
+
+namespace mozilla {
+namespace dom {
+
+class SpeechSynthesisUtterance;
+class SpeechSynthesis;
+class SynthStreamListener;
+
+class nsSpeechTask : public nsISpeechTask
+{
+  friend class SynthStreamListener;
+
+public:
+  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+  NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsSpeechTask, nsISpeechTask)
+
+  NS_DECL_NSISPEECHTASK
+
+  nsSpeechTask(SpeechSynthesisUtterance* aUtterance);
+  nsSpeechTask(float aVolume, const nsAString& aText);
+
+  virtual ~nsSpeechTask();
+
+  virtual void Pause();
+
+  virtual void Resume();
+
+  virtual void Cancel();
+
+  float GetCurrentTime();
+
+  uint32_t GetCurrentCharOffset();
+
+  void SetSpeechSynthesis(SpeechSynthesis* aSpeechSynthesis);
+
+  void SetIndirectAudio(bool aIndirectAudio) { mIndirectAudio = aIndirectAudio; }
+
+protected:
+  virtual nsresult DispatchStartImpl();
+
+  virtual nsresult DispatchEndImpl(float aElapsedTime, uint32_t aCharIndex);
+
+  virtual nsresult DispatchPauseImpl(float aElapsedTime, uint32_t aCharIndex);
+
+  virtual nsresult DispatchResumeImpl(float aElapsedTime, uint32_t aCharIndex);
+
+  virtual nsresult DispatchErrorImpl(float aElapsedTime, uint32_t aCharIndex);
+
+  virtual nsresult DispatchBoundaryImpl(const nsAString& aName,
+                                        float aElapsedTime,
+                                        uint32_t aCharIndex);
+
+  virtual nsresult DispatchMarkImpl(const nsAString& aName,
+                                    float aElapsedTime, uint32_t aCharIndex);
+
+  nsRefPtr<SpeechSynthesisUtterance> mUtterance;
+
+  float mVolume;
+
+  nsString mText;
+
+private:
+  void End();
+
+  nsRefPtr<SourceMediaStream> mStream;
+
+  nsCOMPtr<nsISpeechTaskCallback> mCallback;
+
+  uint32_t mChannels;
+
+  nsRefPtr<SpeechSynthesis> mSpeechSynthesis;
+
+  bool mIndirectAudio;
+};
+
+} // namespace dom
+} // namespace mozilla
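
From a service's point of view, the direct-audio path through nsSpeechTask is
Setup(), one or more SendAudio() calls, then an empty SendAudio() to finish
the stream. A sketch modeled on TestSpeechServiceWithAudio in common.js below,
reusing its SpeechTaskCallback helper and the Ci/XPCOMUtils names from the
earlier sketch:

  var SampleDirectService = {
    serviceType: Ci.nsISpeechService.SERVICETYPE_DIRECT_AUDIO,

    speak: function(aText, aUri, aRate, aPitch, aTask) {
      // Declare the stream format; nsSpeechTask::Setup() creates the media
      // stream, and SynthStreamListener reports start/end back to the DOM.
      aTask.setup(new SpeechTaskCallback(), 1 /* channel */, 16000 /* Hz */);
      // Append PCM-16 samples: 400 samples per character is 0.025 s at
      // 16 kHz, the same timing the test service uses.
      aTask.sendAudio(new Int16Array(400 * aText.length), []);
      // An empty buffer finishes the stream, which ends the utterance.
      aTask.sendAudio(new Int16Array(0), []);
    },

    QueryInterface: XPCOMUtils.generateQI([Ci.nsISpeechService])
  };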
--- a/content/media/webspeech/synth/nsSynthVoiceRegistry.cpp
+++ b/content/media/webspeech/synth/nsSynthVoiceRegistry.cpp
@@ -1,21 +1,22 @@
 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim:set ts=2 sw=2 sts=2 et cindent: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "nsILocaleService.h"
-#include "nsISupports.h"
+#include "nsISpeechService.h"
 #include "nsServiceManagerUtils.h"
 
 #include "SpeechSynthesisUtterance.h"
 #include "SpeechSynthesisVoice.h"
 #include "nsSynthVoiceRegistry.h"
+#include "nsSpeechTask.h"
 
 #include "nsString.h"
 #include "mozilla/StaticPtr.h"
 
 #undef LOG
 #ifdef PR_LOGGING
 extern PRLogModuleInfo* GetSpeechSynthLog();
 #define LOG(type, msg) PR_LOG(GetSpeechSynthLog(), type, msg)
@@ -26,29 +27,29 @@ extern PRLogModuleInfo* GetSpeechSynthLo
 namespace mozilla {
 namespace dom {
 
 // VoiceData
 
 class VoiceData
 {
 public:
-  VoiceData(nsISupports* aService, const nsAString& aUri,
+  VoiceData(nsISpeechService* aService, const nsAString& aUri,
             const nsAString& aName, const nsAString& aLang, bool aIsLocal)
     : mService(aService)
     , mUri(aUri)
     , mName(aName)
     , mLang(aLang)
     , mIsLocal(aIsLocal) {}
 
   ~VoiceData() {}
 
   NS_INLINE_DECL_REFCOUNTING(VoiceData)
 
-  nsCOMPtr<nsISupports> mService;
+  nsCOMPtr<nsISpeechService> mService;
 
   nsString mUri;
 
   nsString mName;
 
   nsString mLang;
 
   bool mIsLocal;
@@ -94,37 +95,35 @@ nsSynthVoiceRegistry::GetInstanceForServ
 
 void
 nsSynthVoiceRegistry::Shutdown()
 {
   LOG(PR_LOG_DEBUG, ("nsSynthVoiceRegistry::Shutdown()"));
   gSynthVoiceRegistry = nullptr;
 }
 
-// nsISynthVoiceRegistry
-
 NS_IMETHODIMP
-nsSynthVoiceRegistry::AddVoice(nsISupports* aService,
+nsSynthVoiceRegistry::AddVoice(nsISpeechService* aService,
                                const nsAString& aUri,
                                const nsAString& aName,
                                const nsAString& aLang,
                                bool aLocalService)
 {
   LOG(PR_LOG_DEBUG,
       ("nsSynthVoiceRegistry::AddVoice uri='%s' name='%s' lang='%s' local=%s",
        NS_ConvertUTF16toUTF8(aUri).get(), NS_ConvertUTF16toUTF8(aName).get(),
        NS_ConvertUTF16toUTF8(aLang).get(),
        aLocalService ? "true" : "false"));
 
   return AddVoiceImpl(aService, aUri, aName, aLang,
                       aLocalService);
 }
 
 NS_IMETHODIMP
-nsSynthVoiceRegistry::RemoveVoice(nsISupports* aService,
+nsSynthVoiceRegistry::RemoveVoice(nsISpeechService* aService,
                                   const nsAString& aUri)
 {
   LOG(PR_LOG_DEBUG,
       ("nsSynthVoiceRegistry::RemoveVoice uri='%s'",
        NS_ConvertUTF16toUTF8(aUri).get()));
 
   bool found = false;
   VoiceData* retval = mUriVoiceMap.GetWeak(aUri, &found);
@@ -227,17 +226,17 @@ nsSynthVoiceRegistry::GetVoiceName(const
   VoiceData* voice = mUriVoiceMap.GetWeak(aUri, &found);
   NS_ENSURE_TRUE(found, NS_ERROR_NOT_AVAILABLE);
 
   aRetval = voice->mName;
   return NS_OK;
 }
 
 nsresult
-nsSynthVoiceRegistry::AddVoiceImpl(nsISupports* aService,
+nsSynthVoiceRegistry::AddVoiceImpl(nsISpeechService* aService,
                                    const nsAString& aUri,
                                    const nsAString& aName,
                                    const nsAString& aLang,
                                    bool aLocalService)
 {
   bool found = false;
   mUriVoiceMap.GetWeak(aUri, &found);
   NS_ENSURE_FALSE(found, NS_ERROR_INVALID_ARG);
@@ -246,10 +245,168 @@ nsSynthVoiceRegistry::AddVoiceImpl(nsISu
                                             aLocalService);
 
   mVoices.AppendElement(voice);
   mUriVoiceMap.Put(aUri, voice);
 
   return NS_OK;
 }
 
+bool
+nsSynthVoiceRegistry::FindVoiceByLang(const nsAString& aLang,
+                                      VoiceData** aRetval)
+{
+  nsAString::const_iterator dashPos, start, end;
+  aLang.BeginReading(start);
+  aLang.EndReading(end);
+
+  while (true) {
+    nsAutoString langPrefix(Substring(start, end));
+
+    for (int32_t i = mDefaultVoices.Length(); i > 0; ) {
+      VoiceData* voice = mDefaultVoices[--i];
+
+      if (StringBeginsWith(voice->mLang, langPrefix)) {
+        *aRetval = voice;
+        return true;
+      }
+    }
+
+    for (int32_t i = mVoices.Length(); i > 0; ) {
+      VoiceData* voice = mVoices[--i];
+
+      if (StringBeginsWith(voice->mLang, langPrefix)) {
+        *aRetval = voice;
+        return true;
+      }
+    }
+
+    dashPos = end;
+    end = start;
+
+    if (!RFindInReadable(NS_LITERAL_STRING("-"), end, dashPos)) {
+      break;
+    }
+  }
+
+  return false;
+}
+
+VoiceData*
+nsSynthVoiceRegistry::FindBestMatch(const nsAString& aUri,
+                                    const nsAString& aLang)
+{
+  if (mVoices.IsEmpty()) {
+    return nullptr;
+  }
+
+  bool found = false;
+  VoiceData* retval = mUriVoiceMap.GetWeak(aUri, &found);
+
+  if (found) {
+    LOG(PR_LOG_DEBUG, ("nsSynthVoiceRegistry::FindBestMatch - Matched URI"));
+    return retval;
+  }
+
+  // Try finding a match for the given language.
+  if (!aLang.IsVoid() && !aLang.IsEmpty()) {
+    if (FindVoiceByLang(aLang, &retval)) {
+      LOG(PR_LOG_DEBUG,
+          ("nsSynthVoiceRegistry::FindBestMatch - Matched language (%s ~= %s)",
+           NS_ConvertUTF16toUTF8(aLang).get(),
+           NS_ConvertUTF16toUTF8(retval->mLang).get()));
+
+      return retval;
+    }
+  }
+
+  // Try UI language.
+  nsresult rv;
+  nsCOMPtr<nsILocaleService> localeService = do_GetService(NS_LOCALESERVICE_CONTRACTID, &rv);
+  NS_ENSURE_SUCCESS(rv, nullptr);
+
+  nsAutoString uiLang;
+  rv = localeService->GetLocaleComponentForUserAgent(uiLang);
+  NS_ENSURE_SUCCESS(rv, nullptr);
+
+  if (FindVoiceByLang(uiLang, &retval)) {
+    LOG(PR_LOG_DEBUG,
+        ("nsSynthVoiceRegistry::FindBestMatch - Matched UI language (%s ~= %s)",
+         NS_ConvertUTF16toUTF8(uiLang).get(),
+         NS_ConvertUTF16toUTF8(retval->mLang).get()));
+
+    return retval;
+  }
+
+  // Try en-US, the language of locale "C"
+  if (FindVoiceByLang(NS_LITERAL_STRING("en-US"), &retval)) {
+    LOG(PR_LOG_DEBUG,
+        ("nsSynthVoiceRegistry::FindBestMatch - Matched C locale language (en-US ~= %s)",
+         NS_ConvertUTF16toUTF8(retval->mLang).get()));
+
+    return retval;
+  }
+
+  // The top default voice is better than nothing...
+  if (!mDefaultVoices.IsEmpty()) {
+    return mDefaultVoices.LastElement();
+  }
+
+  return nullptr;
+}
+
+already_AddRefed<nsSpeechTask>
+nsSynthVoiceRegistry::SpeakUtterance(SpeechSynthesisUtterance& aUtterance,
+                                     const nsAString& aDocLang)
+{
+  nsString lang = nsString(aUtterance.mLang.IsEmpty() ? aDocLang : aUtterance.mLang);
+  nsAutoString uri;
+
+  if (aUtterance.mVoice) {
+    aUtterance.mVoice->GetVoiceURI(uri);
+  }
+
+  nsSpeechTask* task = new nsSpeechTask(&aUtterance);
+  Speak(aUtterance.mText, lang, uri,
+        aUtterance.Rate(), aUtterance.Pitch(), task);
+
+  NS_IF_ADDREF(task);
+  return task;
+}
+
+void
+nsSynthVoiceRegistry::Speak(const nsAString& aText,
+                            const nsAString& aLang,
+                            const nsAString& aUri,
+                            const float& aRate,
+                            const float& aPitch,
+                            nsSpeechTask* aTask)
+{
+  LOG(PR_LOG_DEBUG,
+      ("nsSynthVoiceRegistry::Speak text='%s' lang='%s' uri='%s' rate=%f pitch=%f",
+       NS_ConvertUTF16toUTF8(aText).get(), NS_ConvertUTF16toUTF8(aLang).get(),
+       NS_ConvertUTF16toUTF8(aUri).get(), aRate, aPitch));
+
+  VoiceData* voice = FindBestMatch(aUri, aLang);
+
+  if (!voice) {
+    NS_WARNING("No voices found.");
+    aTask->DispatchError(0, 0);
+    return;
+  }
+
+  LOG(PR_LOG_DEBUG, ("nsSynthVoiceRegistry::Speak - Using voice URI: %s",
+                     NS_ConvertUTF16toUTF8(voice->mUri).get()));
+
+  SpeechServiceType serviceType;
+
+  DebugOnly<nsresult> rv = voice->mService->GetServiceType(&serviceType);
+  NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Failed to get speech service type");
+
+  if (serviceType == nsISpeechService::SERVICETYPE_INDIRECT_AUDIO) {
+    aTask->SetIndirectAudio(true);
+  }
+
+  voice->mService->Speak(aText, voice->mUri, aRate, aPitch, aTask);
+}
+
 } // namespace dom
 } // namespace mozilla
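
FindBestMatch() falls back in order: exact voice URI, utterance or document
language, UI locale, en-US, and finally the most recently flagged default
voice. The language step, FindVoiceByLang(), walks BCP 47 subtags from most
to least specific; here is the same loop in JavaScript, flattened to a single
voice list for brevity:

  function findVoiceByLang(aLang, aVoices) {
    var prefix = aLang;
    while (true) {
      // Later registrations win, hence the reverse scan.
      for (var i = aVoices.length - 1; i >= 0; i--) {
        if (aVoices[i].lang.indexOf(prefix) == 0)
          return aVoices[i];
      }
      var dash = prefix.lastIndexOf('-');
      if (dash < 0)
        break;
      prefix = prefix.substring(0, dash);  // e.g. "en-GB" falls back to "en"
    }
    return null;
  }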
--- a/content/media/webspeech/synth/nsSynthVoiceRegistry.h
+++ b/content/media/webspeech/synth/nsSynthVoiceRegistry.h
@@ -6,16 +6,18 @@
 
 #pragma once
 
 #include "nsAutoPtr.h"
 #include "nsISynthVoiceRegistry.h"
 #include "nsRefPtrHashtable.h"
 #include "nsTArray.h"
 
+class nsISpeechService;
+
 namespace mozilla {
 namespace dom {
 
 class RemoteVoice;
 class SpeechSynthesisUtterance;
 class nsSpeechTask;
 class VoiceData;
 
@@ -23,24 +25,35 @@ class nsSynthVoiceRegistry : public nsIS
 {
 public:
   NS_DECL_ISUPPORTS
   NS_DECL_NSISYNTHVOICEREGISTRY
 
   nsSynthVoiceRegistry();
   virtual ~nsSynthVoiceRegistry();
 
+  already_AddRefed<nsSpeechTask> SpeakUtterance(SpeechSynthesisUtterance& aUtterance,
+                                                const nsAString& aDocLang);
+
+  void Speak(const nsAString& aText, const nsAString& aLang,
+             const nsAString& aUri, const float& aRate, const float& aPitch,
+             nsSpeechTask* aTask);
+
   static nsSynthVoiceRegistry* GetInstance();
 
   static already_AddRefed<nsSynthVoiceRegistry> GetInstanceForService();
 
   static void Shutdown();
 
 private:
-  nsresult AddVoiceImpl(nsISupports* aService,
+  VoiceData* FindBestMatch(const nsAString& aUri, const nsAString& aLang);
+
+  bool FindVoiceByLang(const nsAString& aLang, VoiceData** aRetval);
+
+  nsresult AddVoiceImpl(nsISpeechService* aService,
                         const nsAString& aUri,
                         const nsAString& aName,
                         const nsAString& aLang,
                         bool aLocalService);
 
   nsTArray<nsRefPtr<VoiceData> > mVoices;
 
   nsTArray<nsRefPtr<VoiceData> > mDefaultVoices;
--- a/content/media/webspeech/synth/test/Makefile.in
+++ b/content/media/webspeech/synth/test/Makefile.in
@@ -8,11 +8,13 @@ srcdir         := @srcdir@
 VPATH          := @srcdir@
 relativesrcdir := @relativesrcdir@
 
 include $(DEPTH)/config/autoconf.mk
 
 MOCHITEST_FILES := \
   common.js \
   test_setup.html \
+  test_speech_simple.html \
+  test_speech_queue.html \
   $(NULL)
 
 include $(topsrcdir)/config/rules.mk
--- a/content/media/webspeech/synth/test/common.js
+++ b/content/media/webspeech/synth/test/common.js
@@ -1,25 +1,115 @@
 SpecialPowers.setBoolPref("media.webspeech.synth.enabled", true);
 
 var gSpeechRegistry = SpecialPowers.Cc["@mozilla.org/synth-voice-registry;1"]
   .getService(SpecialPowers.Ci.nsISynthVoiceRegistry);
 
 var gAddedVoices = [];
 
-var TestSpeechServiceNoAudio = {
+function SpeechTaskCallback(onpause, onresume, oncancel) {
+  this.onpause = onpause;
+  this.onresume = onresume;
+  this.oncancel = oncancel;
+}
+
+SpeechTaskCallback.prototype = {
+  QueryInterface: function(iid) {
+    return this;
+  },
+
+  getInterfaces: function(c) {},
+
+  getHelperForLanguage: function() {},
+
+  onPause: function onPause() {
+    if (this.onpause)
+      this.onpause();
+  },
+
+  onResume: function onResume() {
+    if (this.onresume)
+      this.onresume();
+  },
+
+  onCancel: function onCancel() {
+    if (this.oncancel)
+      this.oncancel();
+  }
+};
+
+var TestSpeechServiceWithAudio = {
+  CHANNELS: 1,
+  SAMPLE_RATE: 16000,
+
+  serviceType: SpecialPowers.Ci.nsISpeechService.SERVICETYPE_DIRECT_AUDIO,
+
+  speak: function speak(aText, aUri, aRate, aPitch, aTask) {
+    var task = SpecialPowers.wrap(aTask);
+
+    window.setTimeout(
+      function () {
+        task.setup(new SpeechTaskCallback(), this.CHANNELS, this.SAMPLE_RATE);
+        // 0.025 seconds per character.
+        task.sendAudio(new Int16Array((this.SAMPLE_RATE/40)*aText.length), []);
+        task.sendAudio(new Int16Array(0), []);
+      }.bind(this), 0);
+  },
+
   QueryInterface: function(iid) {
     return this;
   },
 
   getInterfaces: function(c) {},
 
   getHelperForLanguage: function() {}
 };
 
+var TestSpeechServiceNoAudio = {
+  serviceType: SpecialPowers.Ci.nsISpeechService.SERVICETYPE_INDIRECT_AUDIO,
+
+  speak: function speak(aText, aUri, aRate, aPitch, aTask) {
+    var pair = this.expectedSpeaks.shift();
+    if (pair) {
+      // XXX: These tests do not happen in OOP
+      var utterance = pair[0];
+      var expected = pair[1];
+
+      is(aText, utterance.text, "Speak text matches utterance text");
+
+      var args = {uri: aUri, rate: aRate, pitch: aPitch};
+
+      for (var attr in args) {
+        if (expected[attr] != undefined)
+          is(args[attr], expected[attr], "expected service arg " + attr);
+      }
+    }
+
+    var task = SpecialPowers.wrap(aTask);
+    task.setup(new SpeechTaskCallback());
+    setTimeout(function () {
+                 task.dispatchStart();
+                 setTimeout(function () {
+                              task.dispatchEnd(aText.length / 2.0, aText.length);
+                            }, 0);
+
+               }, 0);
+  },
+
+  QueryInterface: function(iid) {
+    return this;
+  },
+
+  getInterfaces: function(c) {},
+
+  getHelperForLanguage: function() {},
+
+  expectedSpeaks: []
+};
+
 function synthAddVoice(aServiceName, aName, aLang, aIsLocal) {
   var voicesBefore = speechSynthesis.getVoices().length;
   var uri = "urn:moz-tts:mylittleservice:" + encodeURI(aName + '?' + aLang);
   gSpeechRegistry.addVoice(window[aServiceName], uri, aName, aLang, aIsLocal);
 
   gAddedVoices.push([window[aServiceName], uri]);
   var voicesAfter = speechSynthesis.getVoices().length;
 
@@ -47,8 +137,51 @@ function synthCleanup() {
   var toRemove = gAddedVoices.length;
   var removeArgs;
   while ((removeArgs = gAddedVoices.shift()))
    gSpeechRegistry.removeVoice.apply(gSpeechRegistry, removeArgs);
 
   var voicesAfter = speechSynthesis.getVoices().length;
   is(voicesAfter, voicesBefore - toRemove, "Successfully removed test voices");
 }
+
+function synthTestQueue(aTestArgs, aEndFunc) {
+  var utterances = [];
+  for (var i in aTestArgs) {
+    var uargs = aTestArgs[i][0];
+    var u = new SpeechSynthesisUtterance(uargs.text);
+
+    delete uargs.text;
+
+    for (var attr in uargs)
+      u[attr] = uargs[attr];
+
+    function onend_handler(e) {
+      is(e.target, utterances.shift(), "Target matches utterances");
+      ok(!speechSynthesis.speaking, "speechSynthesis is not speaking.");
+
+      isnot(e.type, 'error', "Error in utterance");
+
+      if (utterances.length) {
+        ok(speechSynthesis.pending, "other utterances queued");
+      } else {
+        ok(!speechSynthesis.pending, "queue is empty, nothing pending.");
+        if (aEndFunc)
+          aEndFunc();
+      }
+    }
+
+    u.addEventListener('end', onend_handler);
+    u.addEventListener('error', onend_handler);
+
+    u.addEventListener(
+      'error', function onerror_handler(e) {
+        ok(false, "Error in speech utterance '" + e.target.text + "'");
+      });
+
+    utterances.push(u);
+    TestSpeechServiceNoAudio.expectedSpeaks.push([u, aTestArgs[i][1]]);
+    speechSynthesis.speak(u);
+  }
+
+  ok(!speechSynthesis.speaking, "speechSynthesis is not speaking yet.");
+  ok(speechSynthesis.pending, "speechSynthesis has an utterance queued.");
+}
new file mode 100644
--- /dev/null
+++ b/content/media/webspeech/synth/test/test_speech_queue.html
@@ -0,0 +1,70 @@
+<!DOCTYPE HTML>
+<html lang="en-US">
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=525444
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 525444: Web Speech API, check speech synth queue</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <script type="application/javascript" src="common.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body onunload="synthCleanup();">
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=525444">Mozilla Bug 525444</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+  
+</div>
+<pre id="test">
+<script type="application/javascript">
+
+/** Test for Bug 525444 **/
+
+SimpleTest.waitForExplicitFinish();
+
+var englishJamaican = synthAddVoice('TestSpeechServiceNoAudio',
+                                    'Bob Marley', 'en-JM', true);
+var englishBritish = synthAddVoice('TestSpeechServiceNoAudio',
+                                   'Amy Winehouse', 'en-GB', true);
+var englishCanadian = synthAddVoice('TestSpeechServiceNoAudio',
+                                    'Leonard Cohen', 'en-CA', true);
+var frenchCanadian = synthAddVoice('TestSpeechServiceNoAudio',
+                                   'Celine Dion', 'fr-CA', true);
+var spanishMexican = synthAddVoice('TestSpeechServiceNoAudio',
+                                   'Julieta Venegas', 'es-MX', true);
+
+synthSetDefault(englishBritish, true);
+
+synthTestQueue(
+  [[{text: "Hello, world."},
+    { uri: englishBritish }],
+   [{text: "Bonjour tout le monde .", lang: "fr", rate: 0.5, pitch: 0.75},
+    { uri: frenchCanadian, rate: 0.5, pitch: 0.75}],
+   [{text: "How are you doing?", lang: "en-GB"},
+    { rate: 1, pitch: 1, uri: englishBritish}],
+   [{text: "¡hasta mañana", lang: "es-ES"},
+    { uri: spanishMexican }]],
+  function () {
+    synthSetDefault(englishJamaican, true);
+    var test_data = [[{text: "I shot the sheriff."},
+                      { uri: englishJamaican }]];
+    var voices = speechSynthesis.getVoices();
+    for (var i in voices) {
+      test_data.push([{text: "Hello world", voice: voices[i]},
+                      {uri: voices[i].voiceURI}]);
+    }
+
+    synthTestQueue(test_data,
+                   function () {
+                     synthCleanup();
+                     SimpleTest.finish();
+                   });
+  });
+
+
+
+</script>
+</pre>
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/content/media/webspeech/synth/test/test_speech_simple.html
@@ -0,0 +1,53 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=525444
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 525444: Web Speech API, check simple speech synthesis</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <script type="application/javascript" src="common.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body onunload="synthCleanup();">
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=525444">Mozilla Bug 525444</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+  
+</div>
+<pre id="test">
+<script type="application/javascript">
+
+/** Test for Bug 525444 **/
+
+SimpleTest.waitForExplicitFinish();
+
+synthAddVoice('TestSpeechServiceWithAudio', 'Male 1', 'en-GB', true);
+
+var gotStartEvent = false;
+var gotBoundaryEvent = false;
+var utterance = new SpeechSynthesisUtterance("Hello, world!");
+utterance.addEventListener('start', function(e) {
+  ok(speechSynthesis.speaking, "speechSynthesis is speaking.");
+  ok(!speechSynthesis.pending, "speechSynthesis has no other utterances queued.");
+  gotStartEvent = true;
+});
+
+utterance.addEventListener('end', function(e) {
+  ok(!speechSynthesis.speaking, "speechSynthesis is not speaking.");
+  ok(!speechSynthesis.pending, "speechSynthesis has no other utterances queued.");
+  ok(gotStartEvent, "Got 'start' event.");
+  info('end ' + e.elapsedTime);
+  synthCleanup();
+  SimpleTest.finish();
+});
+
+speechSynthesis.speak(utterance);
+ok(!speechSynthesis.speaking, "speechSynthesis is not speaking yet.");
+ok(speechSynthesis.pending, "speechSynthesis has an utterance queued.");
+
+</script>
+</pre>
+</body>
+</html>