Bug 875562 - Part 7: Implement limits for stored crash data; r=Yoric
authorGregory Szorc <gps@mozilla.com>
Tue, 28 Jan 2014 17:33:38 -0800
changeset 168871 96accde0e1dc77f85f2de3a29dfb2aa6dc8f2fd2
parent 168870 0ed0f9d090ca48b45f68727e1a4469c9d6fd29ad
child 168872 f8b656efb478868e4c889c42540b77eb93ce91c5
push id270
push userpvanderbeken@mozilla.com
push dateThu, 06 Mar 2014 09:24:21 +0000
reviewersYoric
bugs875562
milestone30.0a1
Bug 875562 - Part 7: Implement limits for stored crash data; r=Yoric This patch limits how much crash data can be stored by capping the number of crash events recorded per day. If more events than the limit are encountered, we record that the high water mark has been reached and silently discard the payload of subsequent events. We chose not to increment the version of the store payload because no clients are actively saving this data yet, so it doesn't make sense to incur a version bump.
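For readers skimming the patch, here is a minimal standalone sketch of the limiting scheme described above: per-day, per-type event counting with a high water mark. It is illustrative only; the names EventCounter and record are hypothetical and do not appear in the patch, while dateToDays and the threshold mirror the implementation below.

// Illustrative sketch only (not part of the patch). "EventCounter" and
// "record" are hypothetical names; dateToDays and the threshold mirror
// the CrashStore changes in this changeset.
const MILLISECONDS_IN_DAY = 24 * 60 * 60 * 1000;
const HIGH_WATER_DAILY_THRESHOLD = 100;

function dateToDays(date) {
  return Math.floor(date.getTime() / MILLISECONDS_IN_DAY);
}

function EventCounter() {
  // Maps days since UNIX epoch to a Map of event type -> count.
  this.countsByDay = new Map();
}

EventCounter.prototype.record = function (type, date, isMainProcess) {
  let day = dateToDays(date);
  if (!this.countsByDay.has(day)) {
    this.countsByDay.set(day, new Map());
  }

  let counts = this.countsByDay.get(day);
  let count = (counts.get(type) || 0) + 1;
  counts.set(type, count);

  // The count above is always kept, even when the payload is dropped.
  // Main process events are never dropped.
  if (count > HIGH_WATER_DAILY_THRESHOLD && !isMainProcess) {
    return false; // caller should discard the event's payload
  }

  return true; // caller may store the full record
};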
toolkit/components/crashes/CrashManager.jsm
toolkit/components/crashes/docs/crash-events.rst
toolkit/components/crashes/tests/xpcshell/test_crash_manager.js
toolkit/components/crashes/tests/xpcshell/test_crash_store.js
--- a/toolkit/components/crashes/CrashManager.jsm
+++ b/toolkit/components/crashes/CrashManager.jsm
@@ -25,16 +25,23 @@ this.EXPORTED_SYMBOLS = [
  *
  * We defer aggregation for performance reasons, as we don't want too many
  * services competing for I/O immediately after startup.
  */
 const AGGREGATE_STARTUP_DELAY_MS = 57000;
 
 const MILLISECONDS_IN_DAY = 24 * 60 * 60 * 1000;
 
+// Converts Date to days since UNIX epoch.
+// This was copied from /services/metrics.storage.jsm. The implementation
+// does not account for leap seconds.
+function dateToDays(date) {
+  return Math.floor(date.getTime() / MILLISECONDS_IN_DAY);
+}
+
 
 /**
  * A gateway to crash-related data.
  *
  * This type is generic and can be instantiated any number of times.
  * However, most applications will typically only have one instance
  * instantiated and that instance will point to profile and user appdata
  * directories.
@@ -515,63 +522,122 @@ let gCrashManager;
  * from the crash manager for performance reasons: since all crash metadata
  * needs to be loaded into memory for access, we wish to easily dispose of all
  * associated memory when this data is no longer needed. Having an isolated
  * object whose references can easily be lost facilitates that simple disposal.
  *
  * When metadata is updated, the caller must explicitly persist the changes
  * to disk. This prevents excessive I/O during updates.
  *
+ * The store has a mechanism for ensuring it doesn't grow too large. A ceiling
+ * is placed on the number of daily events that will be stored for event types
+ * that can occur with relatively high frequency, notably plugin crashes and
+ * hangs (plugins can enter cycles where they repeatedly crash). If we've
+ * reached the high water mark and new data arrives, it's silently dropped.
+ * However, the count of actual events is always preserved. This allows
+ * us to report on the severity of problems beyond the storage threshold.
+ *
+ * Main process crashes are excluded from limits because they are both
+ * important and should be rare.
+ *
  * @param storeDir (string)
  *        Directory the store should be located in.
  * @param telemetrySizeKey (string)
  *        The telemetry histogram that should be used to store the size
  *        of the data file.
  */
 function CrashStore(storeDir, telemetrySizeKey) {
   this._storeDir = storeDir;
   this._telemetrySizeKey = telemetrySizeKey;
 
   this._storePath = OS.Path.join(storeDir, "store.json.mozlz4");
 
   // Holds the read data from disk.
   this._data = null;
+
+  // Maps days since UNIX epoch to a Map of event types to counts.
+  // This data structure is populated when the JSON file is loaded
+  // and is also updated when new events are added.
+  this._countsByDay = new Map();
 }
 
 CrashStore.prototype = Object.freeze({
+  // A crash that occurred in the main process.
   TYPE_MAIN_CRASH: "main-crash",
+
+  // A crash in a plugin process.
   TYPE_PLUGIN_CRASH: "plugin-crash",
+
+  // A hang in a plugin process.
   TYPE_PLUGIN_HANG: "plugin-hang",
 
+  // Maximum number of events of a given type to store per day. This
+  // establishes a ceiling on the number of per-type/per-day records
+  // that will be stored.
+  HIGH_WATER_DAILY_THRESHOLD: 100,
+
   /**
    * Load data from disk.
    *
-   * @return Promise<null>
+   * @return Promise
    */
   load: function () {
     return Task.spawn(function* () {
+      // Loading replaces existing data, so reset the data structures.
       this._data = {
         v: 1,
         crashes: new Map(),
         corruptDate: null,
       };
+      this._countsByDay = new Map();
 
       try {
         let decoder = new TextDecoder();
         let data = yield OS.File.read(this._storePath, null, {compression: "lz4"});
         data = JSON.parse(decoder.decode(data));
 
         if (data.corruptDate) {
           this._data.corruptDate = new Date(data.corruptDate);
         }
 
+        // actualCounts is used to validate that the per-day counts stored
+        // in the payload match the actual crash data.
+        let actualCounts = new Map();
+
         for (let id in data.crashes) {
           let crash = data.crashes[id];
           let denormalized = this._denormalize(crash);
+
           this._data.crashes.set(id, denormalized);
+
+          let key = dateToDays(denormalized.crashDate) + "-" + denormalized.type;
+          actualCounts.set(key, (actualCounts.get(key) || 0) + 1);
+        }
+
+        // The validation in this loop is arguably not necessary. We perform
+        // it as a defense against unknown bugs.
+        for (let dayKey in data.countsByDay) {
+          let day = parseInt(dayKey, 10);
+          for (let type in data.countsByDay[day]) {
+            this._ensureCountsForDay(day);
+
+            let count = data.countsByDay[day][type];
+            let key = day + "-" + type;
+
+            // If the payload says we have data for a given day but we
+            // don't, the payload is wrong. Ignore it.
+            if (!actualCounts.has(key)) {
+              continue;
+            }
+
+            // If there are more actual crash records than the stored count
+            // claims, the stored count is wrong; use the larger value.
+            count = Math.max(count, actualCounts.get(key));
+
+            this._countsByDay.get(day).set(type, count);
+          }
         }
       } catch (ex if ex instanceof OS.File.Error && ex.becauseNoSuchFile) {
         // Missing files (first use) are allowed.
       } catch (ex) {
         // If we can't load for any reason, mark a corrupt date in the instance
         // and swallow the error.
         //
         // The marking of a corrupted file is intentionally not persisted to
@@ -589,30 +655,51 @@ CrashStore.prototype = Object.freeze({
    */
   save: function () {
     return Task.spawn(function* () {
       if (!this._data) {
         return;
       }
 
       let normalized = {
+        // The version should be incremented whenever the format
+        // changes.
         v: 1,
+        // Maps crash IDs to objects defining the crash.
         crashes: {},
+        // Maps days since UNIX epoch to objects mapping event types to
+        // counts. This is a mirror of this._countsByDay. e.g.
+        // {
+        //    15000: {
+        //        "main-crash": 2,
+        //        "plugin-crash": 1
+        //    }
+        // }
+        countsByDay: {},
+
+        // When the store was last corrupted.
         corruptDate: null,
       };
 
       if (this._data.corruptDate) {
         normalized.corruptDate = this._data.corruptDate.getTime();
       }
 
       for (let [id, crash] of this._data.crashes) {
         let c = this._normalize(crash);
         normalized.crashes[id] = c;
       }
 
+      for (let [day, m] of this._countsByDay) {
+        normalized.countsByDay[day] = {};
+        for (let [type, count] of m) {
+          normalized.countsByDay[day][type] = count;
+        }
+      }
+
       let encoder = new TextEncoder();
       let data = encoder.encode(JSON.stringify(normalized));
       let size = yield OS.File.writeAtomic(this._storePath, data, {
                                            tmpPath: this._storePath + ".tmp",
                                            compression: "lz4"});
       if (this._telemetrySizeKey) {
         Services.telemetry.getHistogramById(this._telemetrySizeKey).add(size);
       }
@@ -724,62 +811,91 @@ CrashStore.prototype = Object.freeze({
       if (crash.id == id) {
         return crash;
       }
     }
 
     return null;
   },
 
-  _ensureCrashRecord: function (id) {
+  _ensureCountsForDay: function (day) {
+    if (!this._countsByDay.has(day)) {
+      this._countsByDay.set(day, new Map());
+    }
+  },
+
+  /**
+   * Ensure the crash record is present in storage.
+   *
+   * Returns the crash record if we're allowed to store it or null
+   * if we've hit the high water mark.
+   *
+   * @param id
+   *        (string) The crash ID.
+   * @param type
+   *        (string) One of the this.TYPE_* constants describing the crash type.
+   * @param date
+   *        (Date) When this crash occurred.
+   *
+   * @return null | object crash record
+   */
+  _ensureCrashRecord: function (id, type, date) {
+    let day = dateToDays(date);
+    this._ensureCountsForDay(day);
+
+    let count = (this._countsByDay.get(day).get(type) || 0) + 1;
+    this._countsByDay.get(day).set(type, count);
+
+    if (count > this.HIGH_WATER_DAILY_THRESHOLD && type != this.TYPE_MAIN_CRASH) {
+      return null;
+    }
+
     if (!this._data.crashes.has(id)) {
       this._data.crashes.set(id, {
         id: id,
-        type: null,
-        crashDate: null,
+        type: type,
+        crashDate: date,
       });
     }
 
-    return this._data.crashes.get(id);
+    let crash = this._data.crashes.get(id);
+    crash.type = type;
+    crash.crashDate = date;
+
+    return crash;
   },
 
   /**
    * Record the occurrence of a crash in the main process.
    *
    * @param id (string) Crash ID. Likely a UUID.
    * @param date (Date) When the crash occurred.
    */
   addMainProcessCrash: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_MAIN_CRASH;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_MAIN_CRASH, date);
   },
 
   /**
    * Record the occurrence of a crash in a plugin process.
    *
    * @param id (string) Crash ID. Likely a UUID.
    * @param date (Date) When the crash occurred.
    */
   addPluginCrash: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_PLUGIN_CRASH;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_PLUGIN_CRASH, date);
   },
 
   /**
    * Record the occurrence of a hang in a plugin process.
    *
    * @param id (string) Crash ID. Likely a UUID.
    * @param date (Date) When the hang was reported.
    */
   addPluginHang: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_PLUGIN_HANG;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_PLUGIN_HANG, date);
   },
 
   get mainProcessCrashes() {
     let crashes = [];
     for (let crash of this.crashes) {
       if (crash.isMainProcessCrash) {
         crashes.push(crash);
       }
@@ -842,16 +958,20 @@ CrashRecord.prototype = Object.freeze({
    * This is a convenience getter. The returned value is used to determine when
    * to expire a record.
    */
   get newestDate() {
     // We currently only have 1 date, so this is easy.
     return this._o.crashDate;
   },
 
+  get oldestDate() {
+    // We currently only have 1 date, so oldest and newest are the same.
+    return this._o.crashDate;
+  },
+
   get type() {
     return this._o.type;
   },
 
   get isMainProcessCrash() {
     return this._o.type == CrashStore.prototype.TYPE_MAIN_CRASH;
   },
 
--- a/toolkit/components/crashes/docs/crash-events.rst
+++ b/toolkit/components/crashes/docs/crash-events.rst
@@ -150,14 +150,13 @@ First, in well-behaving installs, crash 
 hangs will be rare and thus the size of the crash data should remain small
 over time.
 
 The choice of a single JSON file has larger implications as the amount of
 crash data grows. As new data is accumulated, we need to read and write
 an entire file to make small updates. LZ4 compression helps reduce I/O.
 But, there is a potential for unbounded file growth. We establish a
 limit for the max age of records. Anything older than that limit is
-pruned. Future patches will also limit the maximum number of records. This
-will establish a hard limit on the size of the file, at least in terms of
-crashes.
-
-Care must be taken when new crash data is recorded, as this will increase
-the size of the file and make I/O a larger concern.
+pruned. We also establish a daily limit on the number of crashes of each
+type that we will store. Crashes beyond the first N in a day have no
+payload and are recorded only as an incremented count. This count ensures
+we can still distinguish between ``N`` and ``100 * N`` crashes, which are
+very different situations!
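For context, an illustrative example (not part of this patch or the documentation it modifies): the persisted store payload gains a ``countsByDay`` member alongside the existing ``crashes`` and ``corruptDate`` members. The day keys are days since the UNIX epoch and the values map event types to raw counts, per the ``save()`` code above. The values below are made up, the day key is an arbitrary example, and the ``crashes`` map is left empty for brevity; note how the plugin counts can exceed the threshold of 100 even though only the first 100 records per type are stored.

{
  "v": 1,
  "crashes": {},
  "countsByDay": {
    "16098": {
      "main-crash": 1,
      "plugin-crash": 250,
      "plugin-hang": 108
    }
  },
  "corruptDate": null
}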
--- a/toolkit/components/crashes/tests/xpcshell/test_crash_manager.js
+++ b/toolkit/components/crashes/tests/xpcshell/test_crash_manager.js
@@ -253,8 +253,29 @@ add_task(function* test_plugin_hang_even
   Assert.equal(crashes.length, 1);
   Assert.equal(crashes[0].id, "id1");
   Assert.equal(crashes[0].type, "plugin-hang");
   Assert.deepEqual(crashes[0].crashDate, DUMMY_DATE);
 
   count = yield m.aggregateEventsFiles();
   Assert.equal(count, 0);
 });
+
+// Excessive numbers of event files should be processed properly.
+add_task(function* test_high_water_mark() {
+  let m = yield getManager();
+
+  let store = yield m._getStore();
+
+  for (let i = 0; i < store.HIGH_WATER_DAILY_THRESHOLD + 1; i++) {
+    yield m.createEventsFile("m" + i, "crash.main.1", DUMMY_DATE, "m" + i);
+    yield m.createEventsFile("pc" + i, "crash.plugin.1", DUMMY_DATE, "pc" + i);
+    yield m.createEventsFile("ph" + i, "hang.plugin.1", DUMMY_DATE, "ph" + i);
+  }
+
+  let count = yield m.aggregateEventsFiles();
+  Assert.equal(count, 3 * bsp.CrashStore.prototype.HIGH_WATER_DAILY_THRESHOLD + 3);
+
+  // Need to fetch again in case the first one was garbage collected.
+  store = yield m._getStore();
+  // The +1 is the main process crash beyond the threshold, which is
+  // always preserved.
+  Assert.equal(store.crashesCount, 3 * store.HIGH_WATER_DAILY_THRESHOLD + 1);
+});
--- a/toolkit/components/crashes/tests/xpcshell/test_crash_store.js
+++ b/toolkit/components/crashes/tests/xpcshell/test_crash_store.js
@@ -176,8 +176,56 @@ add_task(function* test_add_mixed_types(
   yield s.load();
 
   Assert.equal(s.crashesCount, 3);
 
   Assert.equal(s.mainProcessCrashes.length, 1);
   Assert.equal(s.pluginCrashes.length, 1);
   Assert.equal(s.pluginHangs.length, 1);
 });
+
+// Crashes added beyond the high water mark should be handled properly.
+add_task(function* test_high_water() {
+  let s = yield getStore();
+
+  let d1 = new Date(2014, 0, 1, 0, 0, 0);
+  let d2 = new Date(2014, 0, 2, 0, 0, 0);
+
+  for (let i = 0; i < s.HIGH_WATER_DAILY_THRESHOLD + 1; i++) {
+    s.addMainProcessCrash("m1" + i, d1);
+    s.addMainProcessCrash("m2" + i, d2);
+    s.addPluginCrash("pc1" + i, d1);
+    s.addPluginCrash("pc2" + i, d2);
+    s.addPluginHang("ph1" + i, d1);
+    s.addPluginHang("ph2" + i, d2);
+  }
+
+  // We preserve all main process crashes. Plugin crashes and hangs beyond
+  // the threshold should be discarded.
+  Assert.equal(s.crashesCount, 6 * s.HIGH_WATER_DAILY_THRESHOLD + 2);
+  Assert.equal(s.mainProcessCrashes.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD + 2);
+  Assert.equal(s.pluginCrashes.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD);
+  Assert.equal(s.pluginHangs.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD);
+
+  // But raw counts should be preserved.
+  let day1 = bsp.dateToDays(d1);
+  let day2 = bsp.dateToDays(d2);
+  Assert.ok(s._countsByDay.has(day1));
+  Assert.ok(s._countsByDay.has(day2));
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_MAIN_CRASH),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_CRASH),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_HANG),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+
+  yield s.save();
+  yield s.load();
+
+  Assert.ok(s._countsByDay.has(day1));
+  Assert.ok(s._countsByDay.has(day2));
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_MAIN_CRASH),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_CRASH),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+  Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_HANG),
+               s.HIGH_WATER_DAILY_THRESHOLD + 1);
+});