Bug 1434884 - WebVTT parser must be able to deal with partial lines, r=rillian
authorAndrea Marchesini <amarchesini@mozilla.com>
Thu, 08 Feb 2018 11:45:06 +0100
changeset 402950 55a9b2fe876c5f9ac8d33b861b1df4679e5a3191
parent 402949 fdab702b72fd058c47dc776d8735e62b44bff202
child 402951 042c5cd8abb0dd86c7ee59abf14fbf5b3e335b5c
push id33407
push usercbrindusan@mozilla.com
push dateThu, 08 Feb 2018 19:02:31 +0000
treeherdermozilla-central@c5120bcaf7bd [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersrillian
bugs1434884
milestone60.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1434884 - WebVTT parser must be able to deal with partial lines, r=rillian
dom/media/webvtt/moz.build
dom/media/webvtt/tests/test_parser.js
dom/media/webvtt/tests/xpcshell.ini
dom/media/webvtt/vtt.jsm
--- a/dom/media/webvtt/moz.build
+++ b/dom/media/webvtt/moz.build
@@ -14,8 +14,10 @@ XPIDL_MODULE = 'webvtt'
 EXTRA_COMPONENTS += [
   'WebVTT.manifest',
   'WebVTTParserWrapper.js',
 ]
 
 EXTRA_JS_MODULES += [
   'vtt.jsm',
 ]
+
+XPCSHELL_TESTS_MANIFESTS += ['tests/xpcshell.ini']
new file mode 100644
--- /dev/null
+++ b/dom/media/webvtt/tests/test_parser.js
@@ -0,0 +1,77 @@
+"use strict";
+
+ChromeUtils.import("resource://gre/modules/vtt.jsm");
+ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
+
+let fakeWindow = {
+  VTTCue: function() {},
+  VTTRegion: function() {},
+};
+
+// We have a better parser check in WPT. Here I want to check that incomplete
+// lines are correctly parsable.
+let tests = [
+  // Signature
+  { input: [ "WEBVTT" ], cue: 0, region: 0 },
+  { input: [ "", "WE", "BVT", "T" ], cue: 0, region: 0 },
+  { input: [ "WEBVTT - This file has no cues." ], cue: 0, region: 0 },
+  { input: [ "WEBVTT", " - ", "This file has no cues." ], cue: 0, region: 0 },
+
+  // Body with IDs
+  { input: [ "WEB", "VTT - This file has cues.\n", "\n", "14\n",
+             "00:01:14", ".815 --> 00:0", "1:18.114\n", "- What?\n", "- Where are we now?\n", "\n",
+             "15\n", "00:01:18.171 --> 00:01:20.991\n", "- T", "his is big bat country.\n", "\n",
+             "16\n", "00:01:21.058 --> 00:01:23.868\n", "- [ Bat", "s Screeching ]\n",
+             "- They won't get in your hair. They're after the bug", "s.\n", ],
+    cue: 3, region: 0 },
+
+  // Body without IDs
+  { input: [ "WEBVTT - This file has c", "ues.\n", "\n",
+             "00:01:14.815 --> 00:01:18.114\n", "- What?\n", "- Where are we now?\n", "\n",
+             "00:01:18.171 --> 00:01:2", "0.991\n", "- ", "This is big bat country.\n", "\n",
+             "00:01:21.058 --> 00:01:23.868\n", "- [ Bats S", "creeching ]\n",
+             "- They won't get in your hair. They're after the bugs.\n", ],
+    cue: 3, region: 0 },
+
+  // Note
+  { input: [ "WEBVTT - This file has no cues.\n", "\n", "NOTE what" ],
+    cue: 0, region: 0 },
+
+  // Regions - This vtt is taken from a WPT
+  { input: [ "WE", "BVTT\n", "\n", "REGION\n", "id:0\n", "\n", "REGION\n", "id:1\n",
+             "region", "an", "chor:0%,0%\n", "\n", "R", "EGION\n", "id:2\n",
+             "regionanchor:18446744073709552000%,18446744", "073709552000%\n", "\n",
+             "REGION\n", "id:3\n", "regionanchor: 100%,100%\n", "regio", "nanchor :100%,100%\n",
+             "regionanchor:100% ,100%\n", "regionanchor:100%, 100%\n",
+             "regionanchor:100 %,100%\n", "regionanchor:10", "0%,100 %\n", "\n",
+             "00:00:00.000 --> 00:00:01.000", " region:0\n", "text\n", "\n",
+             "00:00:00.000 --> 00:00:01.000 region:1\n", "text\n", "\n",
+             "00:00:00.000 --> 00:00:01.000 region:3\n", "text\n" ], cue: 3, region: 4 },
+];
+
+function run_test() {
+
+  tests.forEach(test => {
+    let parser = new WebVTT.Parser(fakeWindow, null);
+    ok(!!parser, "Ok... this is a good starting point");
+
+    let cue = 0;
+    parser.oncue = () => { ++cue; };
+
+    let region = 0;
+    parser.onregion = () => { ++region; };
+
+    parser.onparsingerror = () => {
+      ok(false, "No error accepted");
+    }
+
+    test.input.forEach(input => {
+      parser.parse(new TextEncoder().encode(input));
+    });
+
+    parser.flush();
+
+    equal(cue, test.cue, "Cue value matches");
+    equal(region, test.region, "Region value matches");
+  });
+}
new file mode 100644
--- /dev/null
+++ b/dom/media/webvtt/tests/xpcshell.ini
@@ -0,0 +1,3 @@
+[DEFAULT]
+
+[test_parser.js]
--- a/dom/media/webvtt/vtt.jsm
+++ b/dom/media/webvtt/vtt.jsm
@@ -23,18 +23,17 @@ this.EXPORTED_SYMBOLS = ["WebVTT"];
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 ChromeUtils.import('resource://gre/modules/Services.jsm');
-const { require } = ChromeUtils.import("resource://devtools/shared/Loader.jsm", {});
-const { XPCOMUtils } = require("resource://gre/modules/XPCOMUtils.jsm");
+ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
 
 (function(global) {
 
   var _objCreate = Object.create || (function() {
     function F() {}
     return function(o) {
       if (arguments.length !== 1) {
         throw new Error('Object.create shim only accepts one parameter.');
@@ -270,18 +269,18 @@ const { XPCOMUtils } = require("resource
     skipWhitespace();
     cue.endTime = consumeTimeStamp();     // (5) collect cue end time
 
     // 4.1 WebVTT cue settings list.
     skipWhitespace();
     consumeCueSettings(input, cue);
   }
 
-  function onlyContainsWhiteSpaces(input) {
-    return /^[ \f\n\r\t]+$/.test(input);
+  function emptyOrOnlyContainsWhiteSpaces(input) {
+    return input == "" || /^[ \f\n\r\t]+$/.test(input);
   }
 
   function containsTimeDirectionSymbol(input) {
     return input.includes("-->");
   }
 
   function maybeIsTimeStampFormat(input) {
     return /^\s*(\d+:)?(\d{2}):(\d{2})\.(\d+)\s*-->\s*(\d+:)?(\d{2}):(\d{2})\.(\d+)\s*/.test(input);
@@ -1122,66 +1121,72 @@ const { XPCOMUtils } = require("resource
         }
       }
     })();
   };
 
   WebVTT.Parser = function(window, decoder) {
     this.window = window;
     this.state = "INITIAL";
+    this.substate = "";
+    this.substatebuffer = "";
     this.buffer = "";
     this.decoder = decoder || new TextDecoder("utf8");
     this.regionList = [];
   };
 
   WebVTT.Parser.prototype = {
     // If the error is a ParsingError then report it to the consumer if
     // possible. If it's not a ParsingError then throw it like normal.
     reportOrThrowError: function(e) {
       if (e instanceof ParsingError) {
         this.onparsingerror && this.onparsingerror(e);
       } else {
         throw e;
       }
     },
     parse: function (data) {
-      var self = this;
-
       // If there is no data then we won't decode it, but will just try to parse
       // whatever is in buffer already. This may occur in circumstances, for
       // example when flush() is called.
       if (data) {
         // Try to decode the data that we received.
-        self.buffer += self.decoder.decode(data, {stream: true});
+        this.buffer += this.decoder.decode(data, {stream: true});
       }
 
-      function collectNextLine() {
-        var buffer = self.buffer;
+      // This parser is line-based. Let's see if we have a line to parse.
+      while (/\r\n|\n|\r/.test(this.buffer)) {
+        var buffer = this.buffer;
         var pos = 0;
-        while (pos < buffer.length && buffer[pos] !== '\r' && buffer[pos] !== '\n') {
+        while (buffer[pos] !== '\r' && buffer[pos] !== '\n') {
           ++pos;
         }
         var line = buffer.substr(0, pos);
         // Advance the buffer early in case we fail below.
         if (buffer[pos] === '\r') {
           ++pos;
         }
         if (buffer[pos] === '\n') {
           ++pos;
         }
-        self.buffer = buffer.substr(pos);
+        this.buffer = buffer.substr(pos);
+
         // Spec defined replacement.
         line = line.replace(/[\u0000]/g, "\uFFFD");
 
-        if (/^NOTE($|[ \t])/.test(line)) {
-          line = null;
+        if (!/^NOTE($|[ \t])/.test(line)) {
+          this.parseLine(line);
         }
-        return line;
       }
 
+      return this;
+    },
+    parseLine: function(line) {
+      var self = this;
+
       function createCueIfNeeded() {
         if (!self.cue) {
           self.cue = new self.window.VTTCue(0, 0, "");
         }
       }
 
       // Parsing cue identifier and the identifier should be unique.
       // Return true if the input is a cue identifier.
@@ -1277,18 +1282,17 @@ const { XPCOMUtils } = require("resource
             var regionPref = Services.prefs.getBoolPref("media.webvtt.regions.enabled");
             dump("regionPref " + regionPref + "\n");
           }
         }
       }
 
       // Parsing the WebVTT signature, it contains parsing algo step1 to step9.
       // See spec, https://w3c.github.io/webvtt/#file-parsing
-      function parseSignatureMayThrow(input) {
-        let signature = collectNextLine();
+      function parseSignatureMayThrow(signature) {
         if (!/^WEBVTT([ \t].*)?$/.test(signature)) {
           throw new ParsingError(ParsingError.Errors.BadSignature);
         } else {
           self.state = "HEADER";
         }
       }
 
       function parseRegionOrStyle(input) {
@@ -1305,128 +1309,127 @@ const { XPCOMUtils } = require("resource
       // See spec, https://w3c.github.io/webvtt/#collect-a-webvtt-block
       //
       // There are sereval things would appear in header,
       //   1. Region or Style setting
       //   2. Garbage (meaningless string)
       //   3. Empty line
       //   4. Cue's timestamp
       // The case 4 happens when there is no line interval between the header
-      // and the cue blocks. In this case, we should preserve the line and
-      // return it for the next phase parsing.
-      function parseHeader() {
-        let line = null;
-        while (self.buffer && self.state === "HEADER") {
-          line = collectNextLine();
-          var tempStr = "";
-          if (/^REGION|^STYLE/.test(line)) {
-            self.substate = /^REGION/.test(line) ? "REGION" : "STYLE";
+      // and the cue blocks. In this case, we should preserve the line for the
+      // next phase parsing, returning "true".
+      function parseHeader(line) {
+        if (!self.substate && /^REGION|^STYLE/.test(line)) {
+          self.substate = /^REGION/.test(line) ? "REGION" : "STYLE";
+          return false;
+        }
 
-            while (true) {
-              line = collectNextLine();
-              if (!line || maybeIsTimeStampFormat(line) || onlyContainsWhiteSpaces(line) || containsTimeDirectionSymbol(line)) {
-                // parse the tempStr and break the while loop.
-                parseRegionOrStyle(tempStr);
-                break;
-              } else if (/^REGION|^STYLE/.test(line)) {
-                // The line is another REGION/STYLE, parse tempStr then reset tempStr.
-                // Don't break the while loop to parse the next REGION/STYLE.
-                parseRegionOrStyle(tempStr);
-                self.substate = /^REGION/.test(line) ? "REGION" : "STYLE";
-                tempStr = "";
-              } else {
-                tempStr = tempStr + " " + line;
-              }
-            }
+        if (self.substate === "REGION" || self.substate === "STYLE") {
+          if (maybeIsTimeStampFormat(line) ||
+              emptyOrOnlyContainsWhiteSpaces(line) ||
+              containsTimeDirectionSymbol(line)) {
+            parseRegionOrStyle(self.substatebuffer);
+            self.substatebuffer = "";
+            self.substate = null;
+
+            // This is the end of the region or style state.
+            return parseHeader(line);
           }
 
-          if (!line || onlyContainsWhiteSpaces(line)) {
-            // empty line, whitespaces
-            continue;
-          } else if (maybeIsTimeStampFormat(line)) {
-            self.state = "CUE";
-            break;
-          } else if (containsTimeDirectionSymbol(line)) {
-            // string contains "-->"
-            break;
-          } else {
-            //It is an ID.
-            break;
+          if (/^REGION|^STYLE/.test(line)) {
+            // The line is another REGION/STYLE, parse and reset substatebuffer.
+            // Don't break the while loop to parse the next REGION/STYLE.
+            parseRegionOrStyle(self.substatebuffer);
+            self.substatebuffer = "";
+            self.substate = /^REGION/.test(line) ? "REGION" : "STYLE";
+            return false;
           }
-        } // self.state === "HEADER"
 
-        // End parsing header part and doesn't see the timestamp.
-        if (self.state === "HEADER") {
-          self.state = "ID";
+          // We weren't able to parse the line as a header. Accumulate and
+          // return.
+          self.substatebuffer += " " + line;
+          return false;
         }
-        return line;
-      }
 
-      // 5.1 WebVTT file parsing.
-      try {
-        if (self.state === "INITIAL") {
-          parseSignatureMayThrow();
+        if (emptyOrOnlyContainsWhiteSpaces(line)) {
+          // empty line, whitespaces, nothing to do.
+          return false;
+        }
+
+        if (maybeIsTimeStampFormat(line)) {
+          self.state = "CUE";
+          // We want to process the same line again.
+          return true;
         }
 
-        var line;
-        if (self.state === "HEADER") {
-          line = parseHeader();
+        // string contains "-->" or an ID
+        self.state = "ID";
+        return true;
+      }
+
+      try {
+        // 5.1 WebVTT file parsing.
+        if (self.state === "INITIAL") {
+          parseSignatureMayThrow(line);
+          return;
         }
 
-        var nextIteration = false;
-        while (nextIteration || self.buffer) {
-          nextIteration = false;
-          if (!line) {
-            // Since the data receiving is async, we need to wait until the
-            // buffer gets the full line.
-            if (!/\r\n|\n|\r/.test(self.buffer)) {
-              return this;
-            }
-            line = collectNextLine();
+        if (self.state === "HEADER") {
+          // parseHeader returns false if the same line doesn't need to be
+          // parsed again.
+          if (!parseHeader(line)) {
+            return;
+          }
+        }
+
+        if (self.state === "ID") {
+          // If there is no cue identifier, read the next line. 
+          if (line == "") {
+            return;
           }
 
-          switch (self.state) {
-          case "ID":
-            // If there is no cue identifier, keep the line and reuse this line
-            // in next iteration.
-            if (!line || !parseCueIdentifier(line)) {
-              nextIteration = true;
-              continue;
+          // If there is no cue identifier, parse the line again.
+          if (!parseCueIdentifier(line)) {
+            return self.parseLine(line);
+          }
+          return;
+        }
+
+        if (self.state === "CUE") {
+          parseCueMayThrow(line);
+          return;
+        }
+
+        if (self.state === "CUETEXT") {
+          // Report the cue when (1) get an empty line (2) get the "-->""
+          if (emptyOrOnlyContainsWhiteSpaces(line) ||
+              containsTimeDirectionSymbol(line)) {
+            // We are done parsing self cue.
+            self.oncue && self.oncue(self.cue);
+            self.cue = null;
+            self.state = "ID";
+
+            if (emptyOrOnlyContainsWhiteSpaces(line)) {
+              return;
             }
-            break;
-          case "CUE":
-            parseCueMayThrow(line);
-            break;
-          case "CUETEXT":
-            // Report the cue when (1) get an empty line (2) get the "-->""
-            if (!line || containsTimeDirectionSymbol(line)) {
-              // We are done parsing self cue.
-              self.oncue && self.oncue(self.cue);
-              self.cue = null;
-              self.state = "ID";
-              // Keep the line and reuse this line in next iteration.
-              nextIteration = true;
-              continue;
-            }
-            if (self.cue.text) {
-              self.cue.text += "\n";
-            }
-            self.cue.text += line;
-            break;
-          case "BADCUE": // BADCUE
-            // 54-62 - Collect and discard the remaining cue.
-            self.state = "ID";
-            if (line) { // keep this line to ID state.
-              continue;
-            }
-            break;
+
+            // Reuse the same line.
+            return self.parseLine(line);
+          }
+          if (self.cue.text) {
+            self.cue.text += "\n";
           }
-          // The line was already parsed, empty it to ensure we can get the
-          // new line in next iteration.
-          line = null;
+          self.cue.text += line;
+          return;
+        }
+
+        if (self.state === "BADCUE") {
+          // 54-62 - Collect and discard the remaining cue.
+          self.state = "ID";
+          return self.parseLine(line);
         }
       } catch (e) {
         self.reportOrThrowError(e);
 
         // If we are currently parsing a cue, report what we have.
         if (self.state === "CUETEXT" && self.cue && self.oncue) {
           self.oncue(self.cue);
         }
@@ -1437,27 +1440,18 @@ const { XPCOMUtils } = require("resource
       }
       return this;
     },
     flush: function () {
       var self = this;
       try {
         // Finish decoding the stream.
         self.buffer += self.decoder.decode();
-        // Synthesize the end of the current cue or region.
-        if (self.cue || self.state === "HEADER") {
-          self.buffer += "\n\n";
-          self.parse();
-        }
-        // If we've flushed, parsed, and we're still on the INITIAL state then
-        // that means we don't have enough of the stream to parse the first
-        // line.
-        if (self.state === "INITIAL") {
-          throw new ParsingError(ParsingError.Errors.BadSignature);
-        }
+        self.buffer += "\n\n";
+        self.parse();
       } catch(e) {
         self.reportOrThrowError(e);
       }
       self.onflush && self.onflush();
       return this;
     }
   };