Bug 1806042 - Replace Narrow No-Break Space (U+202F) and Thin Space (U+2009) in DateTimeFormat/DateTimeIntervalFormat output with regular Space to mitigate breakage on fragile websites. r=anba
author Jonathan Kew <jkew@mozilla.com>
Sat, 24 Dec 2022 10:35:10 +0000 (2022-12-24)
changeset 647383 40e2c54d56186383e063bf99f06e8f691b05a154
parent 647382 4ed93c62e8ed1035b71935791960bb3fc4b3bfe3
child 647384 4b27c1d6a589f9fb2805f915be920893a454916b
push id 40505
push user smolnar@mozilla.com
push date Sun, 25 Dec 2022 09:26:05 +0000 (2022-12-25)
treeherder mozilla-central@d039318db151 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers anba
bugs 1806042
milestone 110.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1806042 - Replace Narrow No-Break Space (U+202F) and Thin Space (U+2009) in DateTimeFormat/DateTimeIntervalFormat output with regular Space to mitigate breakage on fragile websites. r=anba

The data for a bunch of locales was updated in ICU 72 to use U+202F and U+2009 in places where previously it had regular Space characters. Unfortunately, this breaks some sites that attempt to parse the formatted output using naive regular expressions (or similar) that just expect space, rather than "any whitespace", and fail to match against the new formatted output.

To mitigate this, until more browsers update to the newer ICU/CLDR data and pressure builds on sites to fix such fragile scripts, we can post-process the formatted output from ICU to replace these "special" spaces with standard ASCII space characters.

This workaround is designed to be easily disabled at build time by just changing the DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES #define, when we're ready to try re-enabling the updated formats.

Differential Revision: https://phabricator.services.mozilla.com/D165408
intl/components/src/DateIntervalFormat.cpp
intl/components/src/DateTimeFormat.h
--- a/intl/components/src/DateIntervalFormat.cpp
+++ b/intl/components/src/DateIntervalFormat.cpp
@@ -1,12 +1,13 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include "DateTimeFormat.h"  // for DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
 #include "DateTimeFormatUtils.h"
 #include "ScopedICUObject.h"
 
 #include "mozilla/intl/Calendar.h"
 #include "mozilla/intl/DateIntervalFormat.h"
 
 namespace mozilla::intl {
 
@@ -65,46 +66,74 @@ Result<UniquePtr<DateIntervalFormat>, IC
   return UniquePtr<DateIntervalFormat>(new DateIntervalFormat(dif));
 }
 
 DateIntervalFormat::~DateIntervalFormat() {
   MOZ_ASSERT(mDateIntervalFormat);
   udtitvfmt_close(mDateIntervalFormat.GetMut());
 }
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+// Hack: reach inside the UFormattedValue and replace special spaces in its
+// internal string with ' '. (It's crucial that this is just an in-place
+// replacement that doesn't alter any field positions, etc.)
+static void ReplaceSpecialSpaces(const UFormattedValue* aValue) {
+  UErrorCode status = U_ZERO_ERROR;
+  int32_t len;
+  const UChar* str = ufmtval_getString(aValue, &len, &status);
+  if (U_FAILURE(status)) {
+    return;  // best effort: leave the formatted value untouched
+  }
+
+  for (const auto& c : Span(str, len)) {
+    if (IsSpecialSpace(c)) {
+      const_cast<UChar&>(c) = ' ';  // the string is const; overwrite in place
+    }
+  }
+}
+#endif
+
 ICUResult DateIntervalFormat::TryFormatCalendar(
     const Calendar& aStart, const Calendar& aEnd,
     AutoFormattedDateInterval& aFormatted, bool* aPracticallyEqual) const {
   MOZ_ASSERT(aFormatted.IsValid());
 
   UErrorCode status = U_ZERO_ERROR;
   udtitvfmt_formatCalendarToResult(mDateIntervalFormat.GetConst(),
                                    aStart.GetUCalendar(), aEnd.GetUCalendar(),
                                    aFormatted.GetFormatted(), &status);
 
   if (U_FAILURE(status)) {
     return Err(ToICUError(status));
   }
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+  ReplaceSpecialSpaces(aFormatted.Value());  // U+202F/U+2009 -> U+0020
+#endif
+
   MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual));
   return Ok();
 }
 
 ICUResult DateIntervalFormat::TryFormatDateTime(
     double aStart, double aEnd, AutoFormattedDateInterval& aFormatted,
     bool* aPracticallyEqual) const {
   MOZ_ASSERT(aFormatted.IsValid());
 
   UErrorCode status = U_ZERO_ERROR;
   udtitvfmt_formatToResult(mDateIntervalFormat.GetConst(), aStart, aEnd,
                            aFormatted.GetFormatted(), &status);
   if (U_FAILURE(status)) {
     return Err(ToICUError(status));
   }
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+  ReplaceSpecialSpaces(aFormatted.Value());  // U+202F/U+2009 -> U+0020
+#endif
+
   MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual));
   return Ok();
 }
 
 ICUResult DateIntervalFormat::TryFormattedToParts(
     const AutoFormattedDateInterval& aFormatted,
     DateTimePartVector& aParts) const {
   MOZ_ASSERT(aFormatted.IsValid());
--- a/intl/components/src/DateTimeFormat.h
+++ b/intl/components/src/DateTimeFormat.h
@@ -15,18 +15,39 @@
 #include "mozilla/Maybe.h"
 #include "mozilla/Result.h"
 #include "mozilla/Span.h"
 #include "mozilla/UniquePtr.h"
 #include "mozilla/Utf8.h"
 #include "mozilla/Variant.h"
 #include "mozilla/Vector.h"
 
+/*
+ * To work around webcompat problems caused by Narrow No-Break Space and
+ * Thin Space in formatted date/time output, where existing code on the web
+ * naively assumes there will be a normal Space, we replace any occurrences
+ * of U+202F and U+2009 in the formatted results with U+0020.
+ *
+ * The intention is to undo this hack once other major browsers are also
+ * ready to ship with the updated (ICU72) i18n data that uses NNBSP.
+ *
+ * See https://bugzilla.mozilla.org/show_bug.cgi?id=1806042 for details,
+ * and see DateIntervalFormat.cpp for the other piece of this hack.
+ */
+#define DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 1
+
 namespace mozilla::intl {
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+static inline bool IsSpecialSpace(char16_t c) {
+  // U+202F NARROW NO-BREAK SPACE and U+2009 THIN SPACE
+  return c == 0x202F || c == 0x2009;
+}
+#endif
+
 class Calendar;
 
 /**
  * Intro to mozilla::intl::DateTimeFormat
  * ======================================
  *
  * This component is a Mozilla-focused API for the date formatting provided by
  * ICU. The methods internally call out to ICU4C. This is responsible for and
@@ -324,29 +345,50 @@ class DateTimeFormat final {
                                       UErrorCode* status) {
             return udat_format(mDateFormat, aUnixEpoch, target, length,
                                /* UFieldPosition* */ nullptr, status);
           });
       if (result.isErr()) {
         return result;
       }
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+      for (auto& c : u16Vec) {
+        if (IsSpecialSpace(c)) {
+          c = ' ';
+        }
+      }
+#endif
+
       if (!FillBuffer(u16Vec, aBuffer)) {
         return Err(ICUError::OutOfMemory);
       }
       return Ok{};
     } else {
       static_assert(std::is_same_v<typename B::CharType, char16_t>);
 
       // The output buffer is UTF-16. ICU can output directly into this buffer.
-      return FillBufferWithICUCall(
+      auto result = FillBufferWithICUCall(
           aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
             return udat_format(mDateFormat, aUnixEpoch, target, length, nullptr,
                                status);
           });
+      if (result.isErr()) {
+        return result;
+      }
+
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+      for (auto& c : Span(aBuffer.data(), aBuffer.length())) {
+        if (IsSpecialSpace(c)) {
+          c = ' ';
+        }
+      }
+#endif
+
+      return Ok{};
     }
   };
 
   /**
    * Format the Unix epoch time into a DateTimePartVector.
    *
    * The caller has to create the buffer and the vector and pass to this method.
    * The formatted string will be stored in the buffer and formatted parts in
@@ -375,16 +417,24 @@ class DateTimeFormat final {
           return udat_formatForFields(mDateFormat, aUnixEpoch, chars, size,
                                       fpositer, status);
         });
     if (result.isErr()) {
       ufieldpositer_close(fpositer);
       return result.propagateErr();
     }
 
+#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
+    for (auto& c : Span(aBuffer.data(), aBuffer.length())) {
+      if (IsSpecialSpace(c)) {
+        c = ' ';
+      }
+    }
+#endif
+
     return TryFormatToParts(fpositer, aBuffer.length(), aParts);
   }
 
   /**
    * Copies the pattern for the current DateTimeFormat to a buffer.
    *
    * Warning: This method should not be added to new code. In the near future we
    * plan to remove it.