Bug 1375635 - Add telemetry for how often and long users are in bad authentication states. r=Dexter,rweiss+418169,tcsc
authorMark Hammond <mhammond@skippinet.com.au>
Thu, 27 Jul 2017 11:59:24 +1000
changeset 427582 bd25a5e1a355a31efcdb63b3c5954746fdfb7dec
parent 427581 de726c1fbb1e85aad4ed2f3f3d91366024818739
child 427583 85e0bc442b6188e1f706f7681c7a586919053294
push id7761
push userjlund@mozilla.com
push dateFri, 15 Sep 2017 00:19:52 +0000
treeherdermozilla-beta@c38455951db4 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersDexter, rweiss
bugs1375635, 418169
milestone57.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1375635 - Add telemetry for how often and long users are in bad authentication states. r=Dexter,rweiss+418169,tcsc MozReview-Commit-ID: BW8ixM3wmxJ
services/sync/modules-testing/utils.js
services/sync/modules/browserid_identity.js
services/sync/tests/unit/test_browserid_identity.js
services/sync/tests/unit/test_browserid_identity_telemetry.js
services/sync/tests/unit/xpcshell.ini
toolkit/components/telemetry/Histograms.json
toolkit/components/telemetry/Scalars.yaml
tools/lint/eslint/modules.json
--- a/services/sync/modules-testing/utils.js
+++ b/services/sync/modules-testing/utils.js
@@ -12,16 +12,17 @@ this.EXPORTED_SYMBOLS = [
   "configureIdentity",
   "SyncTestingInfrastructure",
   "waitForZeroTimer",
   "promiseZeroTimer",
   "promiseNamedTimer",
   "MockFxaStorageManager",
   "AccountState", // from a module import
   "sumHistogram",
+  "getLoginTelemetryScalar",
 ];
 
 var {utils: Cu} = Components;
 
 Cu.import("resource://services-sync/status.js");
 Cu.import("resource://services-common/utils.js");
 Cu.import("resource://services-crypto/utils.js");
 Cu.import("resource://services-sync/util.js");
@@ -264,8 +265,14 @@ this.sumHistogram = function(name, optio
   let snapshot = histogram.snapshot(options.key);
   let sum = -Infinity;
   if (snapshot) {
     sum = snapshot.sum;
   }
   histogram.clear();
   return sum;
 }
+
+this.getLoginTelemetryScalar = function() {
+  let dataset = Services.telemetry.DATASET_RELEASE_CHANNEL_OPTOUT;
+  let snapshot = Services.telemetry.snapshotKeyedScalars(dataset, true);
+  return snapshot.parent ? snapshot.parent["services.sync.sync_login_state_transitions"] : {};
+}
--- a/services/sync/modules/browserid_identity.js
+++ b/services/sync/modules/browserid_identity.js
@@ -41,16 +41,104 @@ var fxAccountsCommon = {};
 Cu.import("resource://gre/modules/FxAccountsCommon.js", fxAccountsCommon);
 
 const OBSERVER_TOPICS = [
   fxAccountsCommon.ONLOGIN_NOTIFICATION,
   fxAccountsCommon.ONLOGOUT_NOTIFICATION,
   fxAccountsCommon.ON_ACCOUNT_STATE_CHANGE_NOTIFICATION,
 ];
 
+// A telemetry helper that records how long a user was in a "bad" state.
+// It is recorded in the *main* ping, *not* the Sync ping.
+// These bad states may persist across browser restarts, and may never change
+// (eg, users may *never* validate)
+this.telemetryHelper = {
+  // These are both the "status" values passed to maybeRecordLoginState and
+  // the key we use for our keyed scalar.
+  STATES: {
+    SUCCESS: "SUCCESS",
+    NOTVERIFIED: "NOTVERIFIED",
+    REJECTED: "REJECTED",
+  },
+
+  PREFS: {
+    REJECTED_AT: "identity.telemetry.loginRejectedAt",
+    APPEARS_PERMANENTLY_REJECTED: "identity.telemetry.loginAppearsPermanentlyRejected",
+    LAST_RECORDED_STATE: "identity.telemetry.lastRecordedState",
+  },
+
+  // How long, in minutes, that we continue to wait for a user to transition
+  // from a "bad" state to a success state. After this has expired, we record
+  // the "how long were they rejected for?" histogram.
+  NUM_MINUTES_TO_RECORD_REJECTED_TELEMETRY: 20160, // 14 days in minutes.
+
+  SCALAR: "services.sync.sync_login_state_transitions", // The scalar we use to report
+
+  maybeRecordLoginState(status) {
+    try {
+      this._maybeRecordLoginState(status);
+    } catch (ex) {
+      log.error("Failed to record login telemetry", ex);
+    }
+  },
+
+  _maybeRecordLoginState(status) {
+    let key = this.STATES[status];
+    if (!key) {
+      throw new Error(`invalid state ${status}`);
+    }
+
+    let when = Svc.Prefs.get(this.PREFS.REJECTED_AT);
+    let howLong = when ? this.nowInMinutes() - when : 0; // minutes.
+    let isNewState = Svc.Prefs.get(this.PREFS.LAST_RECORDED_STATE) != status;
+
+    if (status == this.STATES.SUCCESS) {
+      if (isNewState) {
+        Services.telemetry.keyedScalarSet(this.SCALAR, key, true);
+        Svc.Prefs.set(this.PREFS.LAST_RECORDED_STATE, status);
+      }
+      // If we previously recorded an error state, report how long they were
+      // in the bad state for (in minutes)
+      if (when) {
+        // If we are "permanently rejected" we've already recorded for how
+        // long, so don't do it again.
+        if (!Svc.Prefs.get(this.PREFS.APPEARS_PERMANENTLY_REJECTED)) {
+          Services.telemetry.getHistogramById("WEAVE_LOGIN_FAILED_FOR").add(howLong);
+        }
+      }
+      Svc.Prefs.reset(this.PREFS.REJECTED_AT);
+      Svc.Prefs.reset(this.PREFS.APPEARS_PERMANENTLY_REJECTED);
+    } else {
+      // We are in a failure state.
+      if (Svc.Prefs.get(this.PREFS.APPEARS_PERMANENTLY_REJECTED)) {
+        return; // we've given up, so don't record errors.
+      }
+      if (isNewState) {
+        Services.telemetry.keyedScalarSet(this.SCALAR, key, true);
+        Svc.Prefs.set(this.PREFS.LAST_RECORDED_STATE, status);
+      }
+      if (howLong > this.NUM_MINUTES_TO_RECORD_REJECTED_TELEMETRY) {
+        // We are giving up for this user, so report this "max time" as how
+        // long they were in this state for.
+        Services.telemetry.getHistogramById("WEAVE_LOGIN_FAILED_FOR").add(howLong);
+        Svc.Prefs.set(this.PREFS.APPEARS_PERMANENTLY_REJECTED, true);
+      }
+      if (!Svc.Prefs.has(this.PREFS.REJECTED_AT)) {
+        Svc.Prefs.set(this.PREFS.REJECTED_AT, this.nowInMinutes());
+      }
+    }
+  },
+
+  // hookable by tests.
+  nowInMinutes() {
+    return Math.floor(Date.now() / 1000 / 60);
+  },
+}
+
+
 function deriveKeyBundle(kB) {
   let out = CryptoUtils.hkdf(kB, undefined,
                              "identity.mozilla.com/picl/v1/oldsync", 2 * 32);
   let bundle = new BulkKeyBundle();
   // [encryptionKey, hmacKey]
   bundle.keyPair = [out.slice(0, 32), out.slice(32, 64)];
   return bundle;
 }
@@ -192,16 +280,19 @@ this.BrowserIDManager.prototype = {
       }
 
       this.username = accountData.email;
       this._updateSignedInUser(accountData);
       // The user must be verified before we can do anything at all; we kick
       // this and the rest of initialization off in the background (ie, we
       // don't return the promise)
       this._log.info("Waiting for user to be verified.");
+      if (!accountData.verified) {
+        telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+      }
       this._fxaService.whenVerified(accountData).then(accountData => {
         this._updateSignedInUser(accountData);
 
         this._log.info("Starting fetch for key bundle.");
         return this._fetchTokenForUser();
       }).then(token => {
         this._token = token;
         if (token) {
@@ -606,16 +697,17 @@ this.BrowserIDManager.prototype = {
         // TODO: Make it be only 80% of the duration, so refresh the token
         // before it actually expires. This is to avoid sync storage errors
         // otherwise, we get a nasty notification bar briefly. Bug 966568.
         token.expiration = this._now() + (token.duration * 1000) * 0.80;
         if (!this._syncKeyBundle) {
           // We are given kA/kB as hex.
           this._syncKeyBundle = deriveKeyBundle(CommonUtils.hexToBytes(userData.kB));
         }
+        telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
         return token;
       })
       .catch(err => {
         // TODO: unify these errors - we need to handle errors thrown by
         // both tokenserverclient and hawkclient.
         // A tokenserver error thrown based on a bad response.
         if (err.response && err.response.status === 401) {
           err = new AuthenticationError(err, "tokenserver");
@@ -629,16 +721,17 @@ this.BrowserIDManager.prototype = {
 
         // TODO: write tests to make sure that different auth error cases are handled here
         // properly: auth error getting assertion, auth error getting token (invalid generation
         // and client-state error)
         if (err instanceof AuthenticationError) {
           this._log.error("Authentication error in _fetchTokenForUser", err);
           // set it to the "fatal" LOGIN_FAILED_LOGIN_REJECTED reason.
           this._authFailureReason = LOGIN_FAILED_LOGIN_REJECTED;
+          telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.REJECTED);
         } else {
           this._log.error("Non-authentication error in _fetchTokenForUser", err);
           // for now assume it is just a transient network related problem
           // (although sadly, it might also be a regular unhandled exception)
           this._authFailureReason = LOGIN_FAILED_NETWORK_ERROR;
         }
         // this._authFailureReason being set to be non-null in the above if clause
         // ensures we are in the correct currentAuthState, and
--- a/services/sync/tests/unit/test_browserid_identity.js
+++ b/services/sync/tests/unit/test_browserid_identity.js
@@ -78,16 +78,17 @@ add_test(function test_initial_state() {
 );
 
 add_task(async function test_initialializeWithCurrentIdentity() {
     _("Verify start after initializeWithCurrentIdentity");
     globalBrowseridManager.initializeWithCurrentIdentity();
     await globalBrowseridManager.whenReadyToAuthenticate.promise;
     do_check_true(!!globalBrowseridManager._token);
     do_check_true(globalBrowseridManager.hasValidToken());
+    Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
   }
 );
 
 add_task(async function test_initialializeWithAuthErrorAndDeletedAccount() {
     _("Verify sync unpair after initializeWithCurrentIdentity with auth error + account deleted");
 
     var identityConfig = makeIdentityConfig();
     var browseridManager = new BrowserIDManager();
@@ -127,16 +128,17 @@ add_task(async function test_initialiali
     await browseridManager.initializeWithCurrentIdentity();
     await Assert.rejects(browseridManager.whenReadyToAuthenticate.promise,
                      "should reject due to an auth error");
 
     do_check_true(signCertificateCalled);
     do_check_true(accountStatusCalled);
     do_check_false(browseridManager._token);
     do_check_false(browseridManager.hasValidToken());
+    Assert.deepEqual(getLoginTelemetryScalar(), {REJECTED: 1});
 });
 
 add_task(async function test_initialializeWithNoKeys() {
     _("Verify start after initializeWithCurrentIdentity without kA, kB or keyFetchToken");
     let identityConfig = makeIdentityConfig();
     delete identityConfig.fxaccount.user.kA;
     delete identityConfig.fxaccount.user.kB;
     // there's no keyFetchToken by default, so the initialize should fail.
new file mode 100644
--- /dev/null
+++ b/services/sync/tests/unit/test_browserid_identity_telemetry.js
@@ -0,0 +1,120 @@
+Cu.import("resource://gre/modules/Preferences.jsm");
+Cu.import("resource://testing-common/services/sync/utils.js");
+let {telemetryHelper} = Cu.import("resource://services-sync/browserid_identity.js", {});
+
+const prefs = new Preferences("services.sync.");
+
+function cleanup() {
+  prefs.resetBranch();
+}
+
+add_test(function test_success() {
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  // doing it again should not bump the count.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+
+  // check the prefs.
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), undefined);
+  Assert.equal(prefs.get(telemetryHelper.PREFS.APPEARS_PERMANENTLY_REJECTED), undefined);
+  Assert.equal(prefs.get(telemetryHelper.PREFS.LAST_RECORDED_STATE), telemetryHelper.STATES.SUCCESS);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_success_to_error() {
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  // transition to an error state
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_unverified() {
+  telemetryHelper.nowInMinutes = () => 10000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // doing it again should not bump the count and should not change the
+  // original time it was rejected at.
+  telemetryHelper.nowInMinutes = () => 10010;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // record unverified again - should not record telemetry as it's the same state.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // now record success - should get the new state recorded and the duration.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+
+  // we are now at 10010 minutes, so should have recorded it took 10.
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 10);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_unverified_give_up() {
+  telemetryHelper.nowInMinutes = () => 10000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // bump our timestamp to 50k minutes, which is past our "give up" period.
+  telemetryHelper.nowInMinutes = () => 50000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+
+  // We've already recorded the unverified state, so shouldn't again.
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  // But we *should* have recorded how long it failed for even though it still
+  // fails.
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 40000);
+
+  // Record failure again - we should *not* have any telemetry recorded.
+  telemetryHelper.nowInMinutes = () => 50001;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // Now record success - it also should *not* again record the failure duration.
+  telemetryHelper.nowInMinutes = () => 60000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // Even though we were permanently rejected, the SUCCESS recording should
+  // have reset that state, so new error states should work as normal.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  telemetryHelper.nowInMinutes = () => 60001;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 1);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_bad_state() {
+  // We call the internal implementation to check it throws as the public
+  // method catches and logs.
+  Assert.throws(() => telemetryHelper._maybeRecordLoginState("foo"), /invalid state/);
+  cleanup();
+  run_next_test();
+});
--- a/services/sync/tests/unit/xpcshell.ini
+++ b/services/sync/tests/unit/xpcshell.ini
@@ -44,16 +44,17 @@ tags = addons
 [test_resource.js]
 [test_resource_async.js]
 [test_resource_header.js]
 [test_resource_ua.js]
 [test_syncstoragerequest.js]
 
 # Generic Sync types.
 [test_browserid_identity.js]
+[test_browserid_identity_telemetry.js]
 [test_collection_getBatched.js]
 [test_collections_recovery.js]
 [test_keys.js]
 [test_records_crypto.js]
 [test_records_wbo.js]
 
 # Engine APIs.
 [test_engine.js]
--- a/toolkit/components/telemetry/Histograms.json
+++ b/toolkit/components/telemetry/Histograms.json
@@ -10832,16 +10832,27 @@
   "WEAVE_COMPLETE_SUCCESS_COUNT": {
     "record_in_processes": ["main", "content"],
     "expires_in_version": "default",
     "kind": "exponential",
     "high": 1000,
     "n_buckets": 10,
     "description": "The number of times a sync successfully completed in this session"
   },
+  "WEAVE_LOGIN_FAILED_FOR": {
+    "record_in_processes": ["main"],
+    "expires_in_version": "65",
+    "alert_emails": ["sync-dev@mozilla.org"],
+    "kind": "exponential",
+    "high": 20000,
+    "n_buckets": 100,
+    "releaseChannelCollection": "opt-out",
+    "bug_numbers": [1375635],
+    "description": "The number of minutes the user was in a bad login state."
+  },
   "WEAVE_WIPE_SERVER_SUCCEEDED": {
     "record_in_processes": ["main", "content"],
     "expires_in_version": "55",
     "alert_emails": ["sync-dev@mozilla.org"],
     "kind": "boolean",
     "bug_numbers": [1241699],
     "description": "Stores 1 if a wipeServer call succeeded, and 0 if it failed."
   },
--- a/toolkit/components/telemetry/Scalars.yaml
+++ b/toolkit/components/telemetry/Scalars.yaml
@@ -423,16 +423,31 @@ services.sync:
     kind: string
     keyed: false
     notification_emails:
       - sync-staff@mozilla.com
     release_channel_collection: opt-out
     record_in_processes:
       - main
 
+  sync_login_state_transitions:
+    bug_numbers:
+      - 1375635
+    description: >
+      Records transitions between login states of "success", "unverified"
+      and "rejected.
+    expires: "65"
+    kind: boolean
+    keyed: true
+    notification_emails:
+      - sync-staff@mozilla.com
+    release_channel_collection: opt-out
+    record_in_processes:
+      - main
+
 security:
   pkcs11_modules_loaded:
     bug_numbers:
       - 1369911
     description: >
       A keyed boolean indicating the library names of the PKCS#11 modules that
       have been loaded by the browser.
     expires: "62"
--- a/tools/lint/eslint/modules.json
+++ b/tools/lint/eslint/modules.json
@@ -220,17 +220,17 @@
   "tokenserverclient.js": ["TokenServerClient", "TokenServerClientError", "TokenServerClientNetworkError", "TokenServerClientServerError"],
   "ToolboxProcess.jsm": ["BrowserToolboxProcess"],
   "tps.jsm": ["ACTIONS", "TPS"],
   "Translation.jsm": ["Translation", "TranslationTelemetry"],
   "Traversal.jsm": ["TraversalRules", "TraversalHelper"],
   "UpdateTelemetry.jsm": ["AUSTLMY"],
   "UpdateTopLevelContentWindowIDHelper.jsm": ["trackBrowserWindow"],
   "util.js": ["getChromeWindow", "Utils", "Svc"],
-  "utils.js": ["applicationName", "assert", "Copy", "getBrowserObject", "getChromeWindow", "getWindows", "getWindowByTitle", "getWindowByType", "getWindowId", "getMethodInWindows", "getPreference", "saveDataURL", "setPreference", "sleep", "startTimer", "stopTimer", "takeScreenshot", "unwrapNode", "waitFor", "btoa", "encryptPayload", "makeIdentityConfig", "makeFxAccountsInternalMock", "configureFxAccountIdentity", "configureIdentity", "SyncTestingInfrastructure", "waitForZeroTimer", "Promise", "MockFxaStorageManager", "AccountState", "sumHistogram", "CommonUtils", "CryptoUtils", "TestingUtils", "promiseZeroTimer", "promiseNamedTimer"],
+  "utils.js": ["applicationName", "assert", "Copy", "getBrowserObject", "getChromeWindow", "getWindows", "getWindowByTitle", "getWindowByType", "getWindowId", "getMethodInWindows", "getPreference", "saveDataURL", "setPreference", "sleep", "startTimer", "stopTimer", "takeScreenshot", "unwrapNode", "waitFor", "btoa", "encryptPayload", "makeIdentityConfig", "makeFxAccountsInternalMock", "configureFxAccountIdentity", "configureIdentity", "SyncTestingInfrastructure", "waitForZeroTimer", "Promise", "MockFxaStorageManager", "AccountState", "sumHistogram", "CommonUtils", "CryptoUtils", "TestingUtils", "promiseZeroTimer", "promiseNamedTimer", "getLoginTelemetryScalar"],
   "Utils.jsm": ["Utils", "Logger", "PivotContext", "PrefCache"],
   "VariablesView.jsm": ["VariablesView", "escapeHTML"],
   "VariablesViewController.jsm": ["VariablesViewController", "StackFrameUtils"],
   "version.jsm": ["VERSION"],
   "vtt.jsm": ["WebVTT"],
   "WebChannel.jsm": ["WebChannel", "WebChannelBroker"],
   "WindowDraggingUtils.jsm": ["WindowDraggingElement"],
   "windows.js": ["init", "map"],