Bug 1012999: When STUN global rate limit is exceeded, record this in telemetry. r=ekr
authorEKR <ekr@rtfm.com>
Mon, 19 May 2014 19:16:38 -0700
changeset 203248 0253f2720564f5daac9efdf45a9082328c0e4cfe
parent 203247 a7a36e1dca08277aba3620c788bdb10edb0b7d5f
child 203249 d489e0bc4e5dc16c423913c0e4c99d334b4a665e
push id3741
push userasasaki@mozilla.com
push dateMon, 21 Jul 2014 20:25:18 +0000
treeherdermozilla-beta@4d6f46f5af68 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersekr
bugs1012999
milestone32.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1012999: When STUN global rate limit is exceeded, record this in telemetry. r=ekr
media/mtransport/nr_socket_prsock.cpp
media/mtransport/nr_socket_prsock.h
media/mtransport/nricectx.cpp
media/mtransport/nricectx.h
media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.cpp
media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.h
media/webrtc/signaling/src/peerconnection/WebrtcGlobalInformation.cpp
toolkit/components/telemetry/Histograms.json
--- a/media/mtransport/nr_socket_prsock.cpp
+++ b/media/mtransport/nr_socket_prsock.cpp
@@ -115,16 +115,27 @@ extern "C" {
 #include "stun_hint.h"
 }
 #include "nr_socket_prsock.h"
 #include "simpletokenbucket.h"
 
 // Implement the nsISupports ref counting
 namespace mozilla {
 
+static TimeStamp nr_socket_short_term_violation_time;
+static TimeStamp nr_socket_long_term_violation_time;
+
+TimeStamp NrSocketBase::short_term_violation_time() {
+  return nr_socket_short_term_violation_time;
+}
+
+TimeStamp NrSocketBase::long_term_violation_time() {
+  return nr_socket_long_term_violation_time;
+}
+
 // NrSocketBase implementation
 // async_event APIs
 int NrSocketBase::async_wait(int how, NR_async_cb cb, void *cb_arg,
                              char *function, int line) {
   uint16_t flag;
 
   switch (how) {
     case NR_ASYNC_WAIT_READ:
@@ -507,25 +518,44 @@ int NrSocket::sendto(const void *msg, si
     // (see http://tools.ietf.org/html/draft-thomson-mmusic-ice-webrtc)
 
     // Tolerate rate of 8k/sec, for one second.
     static SimpleTokenBucket burst(8192*1, 8192);
     // Tolerate rate of 3.6k/sec over twenty seconds.
     static SimpleTokenBucket sustained(3686*20, 3686);
 
     // Check number of tokens in each bucket.
-    if (burst.getTokens(UINT32_MAX) < len ||
-        sustained.getTokens(UINT32_MAX) < len) {
+    if (burst.getTokens(UINT32_MAX) < len) {
       r_log(LOG_GENERIC, LOG_ERR,
-                 "Global rate limit for STUN requests exceeded.");
+                 "Short term global rate limit for STUN requests exceeded.");
       MOZ_ASSERT(false,
-                 "Global rate limit for STUN requests exceeded. Go bug "
-                 "bcampen@mozilla.com if you weren't intentionally spamming "
-                 "ICE candidates, or don't know what that means.");
-      ABORT(R_WOULDBLOCK);
+                 "Short term global rate limit for STUN requests exceeded. Go "
+                 "bug bcampen@mozilla.com if you weren't intentionally "
+                 "spamming ICE candidates, or don't know what that means.");
+#ifdef MOZILLA_INTERNAL_API
+      nr_socket_short_term_violation_time = TimeStamp::Now();
+#endif
+      // TODO(bcampen@mozilla.com): Bug 1013007 Once we have better data on
+      // normal usage, re-enable this.
+      // ABORT(R_WOULDBLOCK);
+    }
+
+    if (sustained.getTokens(UINT32_MAX) < len) {
+      r_log(LOG_GENERIC, LOG_ERR,
+                 "Long term global rate limit for STUN requests exceeded.");
+      MOZ_ASSERT(false,
+                 "Long term global rate limit for STUN requests exceeded. Go "
+                 "bug bcampen@mozilla.com if you weren't intentionally "
+                 "spamming ICE candidates, or don't know what that means.");
+#ifdef MOZILLA_INTERNAL_API
+      nr_socket_long_term_violation_time = TimeStamp::Now();
+#endif
+      // TODO(bcampen@mozilla.com): Bug 1013007 Once we have better data on
+      // normal usage, re-enable this.
+      // ABORT(R_WOULDBLOCK);
     }
 
     // Take len tokens from both buckets.
     // (not threadsafe, but no problem since this is only called from STS)
     burst.getTokens(len);
     sustained.getTokens(len);
   }
 
--- a/media/mtransport/nr_socket_prsock.h
+++ b/media/mtransport/nr_socket_prsock.h
@@ -58,16 +58,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 #include "nsXPCOM.h"
 #include "nsIEventTarget.h"
 #include "nsIUDPSocketChild.h"
 
 #include "databuffer.h"
 #include "m_cpp_utils.h"
 #include "mozilla/ReentrantMonitor.h"
 #include "mozilla/RefPtr.h"
+#include "mozilla/TimeStamp.h"
 
 // Stub declaration for nICEr type
 typedef struct nr_socket_vtbl_ nr_socket_vtbl;
 
 namespace mozilla {
 
 namespace net {
   union NetAddr;
@@ -105,16 +106,19 @@ public:
   NS_IMETHOD_(MozExternalRefCountType) Release(void) = 0;
 
   uint32_t poll_flags() {
     return poll_flags_;
   }
 
   virtual nr_socket_vtbl *vtbl();  // To access in test classes.
 
+  static TimeStamp short_term_violation_time();
+  static TimeStamp long_term_violation_time();
+
 protected:
   void fire_callback(int how);
 
   bool connect_invoked_;
   nr_transport_addr my_addr_;
 
 private:
   NR_async_cb cbs_[NR_ASYNC_WAIT_WRITE + 1];
--- a/media/mtransport/nricectx.cpp
+++ b/media/mtransport/nricectx.cpp
@@ -84,16 +84,24 @@ extern "C" {
 #include "nricectx.h"
 #include "nricemediastream.h"
 #include "nr_socket_prsock.h"
 #include "nrinterfaceprioritizer.h"
 #include "rlogringbuffer.h"
 
 namespace mozilla {
 
+TimeStamp nr_socket_short_term_violation_time() {
+  return NrSocketBase::short_term_violation_time();
+}
+
+TimeStamp nr_socket_long_term_violation_time() {
+  return NrSocketBase::long_term_violation_time();
+}
+
 MOZ_MTLOG_MODULE("mtransport")
 
 const char kNrIceTransportUdp[] = "udp";
 const char kNrIceTransportTcp[] = "tcp";
 
 static bool initialized = false;
 
 // Implement NSPR-based crypto algorithms
--- a/media/mtransport/nricectx.h
+++ b/media/mtransport/nricectx.h
@@ -45,24 +45,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // Original author: ekr@rtfm.com
 
 // This is a wrapper around the nICEr ICE stack
 #ifndef nricectx_h__
 #define nricectx_h__
 
+#include <string>
 #include <vector>
 
 #include "sigslot.h"
 
 #include "prnetdb.h"
 
 #include "mozilla/RefPtr.h"
 #include "mozilla/Scoped.h"
+#include "mozilla/TimeStamp.h"
 #include "nsAutoPtr.h"
 #include "nsIEventTarget.h"
 #include "nsITimer.h"
 
 #include "m_cpp_utils.h"
 
 typedef struct nr_ice_ctx_ nr_ice_ctx;
 typedef struct nr_ice_peer_ctx_ nr_ice_peer_ctx;
@@ -74,16 +76,21 @@ typedef struct nr_ice_cand_pair_ nr_ice_
 typedef struct nr_ice_stun_server_ nr_ice_stun_server;
 typedef struct nr_ice_turn_server_ nr_ice_turn_server;
 typedef struct nr_resolver_ nr_resolver;
 
 typedef void* NR_SOCKET;
 
 namespace mozilla {
 
+// Timestamps set whenever a packet is dropped due to global rate limiting
+// (see nr_socket_prsock.cpp)
+TimeStamp nr_socket_short_term_violation_time();
+TimeStamp nr_socket_long_term_violation_time();
+
 class NrIceMediaStream;
 
 extern const char kNrIceTransportUdp[];
 extern const char kNrIceTransportTcp[];
 
 class NrIceStunServer {
  public:
   NrIceStunServer(const PRNetAddr& addr) : has_addr_(true) {
--- a/media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.cpp
+++ b/media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.cpp
@@ -153,17 +153,19 @@ template<>
 struct nsISupportsWeakReference::COMTypeInfo<nsSupportsWeakReference, void> {
   static const nsIID kIID NS_HIDDEN;
 };
 const nsIID nsISupportsWeakReference::COMTypeInfo<nsSupportsWeakReference, void>::kIID = NS_ISUPPORTSWEAKREFERENCE_IID;
 
 namespace sipcc {
 
 #ifdef MOZILLA_INTERNAL_API
-RTCStatsQuery::RTCStatsQuery(bool internal) : internalStats(internal) {
+RTCStatsQuery::RTCStatsQuery(bool internal) :
+  failed(false),
+  internalStats(internal) {
 }
 
 RTCStatsQuery::~RTCStatsQuery() {
   MOZ_ASSERT(NS_IsMainThread());
 }
 
 #endif
 
@@ -2148,16 +2150,19 @@ PeerConnectionImpl::BuildStatsQuery_m(
     return rv;
   }
 
   // We do not use the pcHandle here, since that's risky to expose to content.
   query->report = RTCStatsReportInternalConstruct(
       NS_ConvertASCIItoUTF16(mName.c_str()),
       query->now);
 
+  query->iceStartTime = mIceStartTime;
+  query->failed = isFailed(mIceConnectionState);
+
   // Populate SDP on main
   if (query->internalStats) {
     query->report.mLocalSdp.Construct(
         NS_ConvertASCIItoUTF16(mLocalSDP.c_str()));
     query->report.mRemoteSdp.Construct(
         NS_ConvertASCIItoUTF16(mRemoteSDP.c_str()));
   }
 
--- a/media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.h
+++ b/media/webrtc/signaling/src/peerconnection/PeerConnectionImpl.h
@@ -169,16 +169,20 @@ private:
 // Not an inner class so we can forward declare.
 class RTCStatsQuery {
   public:
     explicit RTCStatsQuery(bool internalStats);
     ~RTCStatsQuery();
 
     mozilla::dom::RTCStatsReportInternal report;
     std::string error;
+    // A timestamp to help with telemetry.
+    mozilla::TimeStamp iceStartTime;
+    // Just for convenience, maybe integrate into the report later
+    bool failed;
 
   private:
     friend class PeerConnectionImpl;
     std::string pcName;
     bool internalStats;
     nsTArray<mozilla::RefPtr<mozilla::MediaPipeline>> pipelines;
     mozilla::RefPtr<NrIceCtx> iceCtx;
     nsTArray<mozilla::RefPtr<NrIceMediaStream>> streams;
--- a/media/webrtc/signaling/src/peerconnection/WebrtcGlobalInformation.cpp
+++ b/media/webrtc/signaling/src/peerconnection/WebrtcGlobalInformation.cpp
@@ -350,16 +350,38 @@ static void StoreLongTermICEStatisticsIm
 
 static void GetStatsForLongTermStorage_s(
     nsAutoPtr<RTCStatsQuery> query) {
 
   MOZ_ASSERT(query);
 
   nsresult rv = PeerConnectionImpl::ExecuteStatsQuery_s(query.get());
 
+  // Check whether packets were dropped due to rate limiting during
+  // this call. (These calls must be made on STS)
+  unsigned char rate_limit_bit_pattern = 0;
+  if (!mozilla::nr_socket_short_term_violation_time().IsNull() &&
+      mozilla::nr_socket_short_term_violation_time() >= query->iceStartTime) {
+    rate_limit_bit_pattern |= 1;
+  }
+  if (!mozilla::nr_socket_long_term_violation_time().IsNull() &&
+      mozilla::nr_socket_long_term_violation_time() >= query->iceStartTime) {
+    rate_limit_bit_pattern |= 2;
+  }
+
+  if (query->failed) {
+    Telemetry::Accumulate(
+        Telemetry::WEBRTC_STUN_RATE_LIMIT_EXCEEDED_BY_TYPE_GIVEN_FAILURE,
+        rate_limit_bit_pattern);
+  } else {
+    Telemetry::Accumulate(
+        Telemetry::WEBRTC_STUN_RATE_LIMIT_EXCEEDED_BY_TYPE_GIVEN_SUCCESS,
+        rate_limit_bit_pattern);
+  }
+
   // Even if Telemetry::Accumulate is threadsafe, we still need to send the
   // query back to main, since that is where it must be destroyed.
   NS_DispatchToMainThread(
       WrapRunnableNM(
           &StoreLongTermICEStatisticsImpl_m,
           rv,
           query),
       NS_DISPATCH_NORMAL);
--- a/toolkit/components/telemetry/Histograms.json
+++ b/toolkit/components/telemetry/Histograms.json
@@ -5133,16 +5133,28 @@
     "description": "A bitpattern indicating what types of candidates were present. Bit 0: Remote server reflexive. Bit 1: Remote relayed. Bit 2: Local server reflexive. Bits 3-6: Local UDP, TCP, TLS, and HTTPS relay respectively."
   },
   "WEBRTC_CANDIDATE_TYPES_GIVEN_FAILURE": {
     "expires_in_version": "never",
     "kind": "enumerated",
     "n_values": 128,
     "description": "Identical to WEBRTC_CANDIDATE_TYPES_GIVEN_SUCCEESS, except recorded only when ICE fails on the stream in question."
   },
+  "WEBRTC_STUN_RATE_LIMIT_EXCEEDED_BY_TYPE_GIVEN_SUCCESS": {
+    "expires_in_version": "never",
+    "kind": "enumerated",
+    "n_values": 4,
+    "description": "For each successful PeerConnection, bit 0 indicates the short-duration rate limit was reached, bit 1 indicates the long-duration rate limit was reached"
+  },
+  "WEBRTC_STUN_RATE_LIMIT_EXCEEDED_BY_TYPE_GIVEN_FAILURE": {
+    "expires_in_version": "never",
+    "kind": "enumerated",
+    "n_values": 4,
+    "description": "For each failed PeerConnection, bit 0 indicates the short-duration rate limit was reached, bit 1 indicates the long-duration rate limit was reached"
+  },
   "WEBRTC_AVSYNC_WHEN_AUDIO_LAGS_VIDEO_MS": {
     "expires_in_version": "never",
     "kind": "exponential",
     "high": 60000,
     "n_buckets": 1000,
     "description": "The delay (in milliseconds) when audio is behind video. Zero delay is counted. Measured every second of a call."
   },
   "WEBRTC_AVSYNC_WHEN_VIDEO_LAGS_AUDIO_MS": {