intl/unicharutil/util/IrishCasing.cpp
author B2G Bumper Bot <release+b2gbumper@mozilla.com>
Mon, 11 May 2015 00:49:50 -0700
changeset 243235 bfb045ce7547ae5738ac5e4f89252098a0474ab9
parent 200143 5fb5adcc3835685c8ec779c35301a3f0251b1275
child 361176 2f09a955dbd63dabfe4ae4d256078252492855e3
permissions -rw-r--r--
Bumping manifests a=b2g-bump

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/******************************************************************************

This file provides a finite state machine to support Irish Gaelic uppercasing
rules.

The caller will need to iterate through a string, passing a State variable
along with the current character to each UpperCase call and checking the flags
that are returned:

  If aMarkPos is true, caller must remember the current index in the string as
  a possible target for a future action.

  If aAction is non-zero, then one or more characters from the marked index are
  to be modified:
    1  lowercase the marked letter
    2  lowercase the marked letter and its successor
    3  lowercase the marked letter, and delete its successor


### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
### comments 1 and 4:

v = [a,á,e,é,i,í,o,ó,u,ú]
V = [A,Á,E,É,I,Í,O,Ó,U,Ú]

bhf -> bhF
bhF -> bhF
bp  -> bP
bP  -> bP
dt  -> dT
dT  -> dT
gc  -> gC
gC  -> gC
h{V}  -> h{V}
mb  -> mB
mB  -> mB
n-{v} -> n{V}
n{V} -> n{V}
nd  -> nD
nD  -> nD
ng  -> nG
nG  -> nG
t-{v} -> t{V}
t{V} -> t{V}
ts{v} -> tS{V}
tS{v} -> tS{V}
tS{V} -> tS{V}
tsl  -> tSL
tSl  -> tSL
tSL  -> tSL
tsn  -> tSN
tSn  -> tSN
tSN  -> tSN
tsr  -> tSR
tSr  -> tSR
tSR  -> tSR

### Create table of states and actions for each input class.

Start (non-word) state is #; generic in-word state is _, once we know there's
no special action to do in this word.

         #   _   b   bh  d   g   h   m   n   n-  t   t-  ts
input\state
b        b'  _   _   _   _   _   _   1   _   _   _   _   _
B        _   _   _   _   _   _   _   1   _   _   _   _   _
c        _   _   _   _   _   1   _   _   _   _   _   _   _
C        _   _   _   _   _   1   _   _   _   _   _   _   _
d        d'  _   _   _   _   _   _   _   1   _   _   _   _
D        _   _   _   _   _   _   _   _   1   _   _   _   _
f        _   _   _   2   _   _   _   _   _   _   _   _   _
F        _   _   _   2   _   _   _   _   _   _   _   _   _
g        g'  _   _   _   _   _   _   _   1   _   _   _   _
G        _   _   _   _   _   _   _   _   1   _   _   _   _
h        h'  _   bh  _   _   _   _   _   _   _   _   _   _
l        _   _   _   _   _   _   _   _   _   _   _   _   1
L        _   _   _   _   _   _   _   _   _   _   _   _   1
m        m'  _   _   _   _   _   _   _   _   _   _   _   _
n        n'  _   _   _   _   _   _   _   _   _   _   _   1
N        _   _   _   _   _   _   _   _   _   _   _   _   1
p        _   _   1   _   _   _   _   _   _   _   _   _   _
P        _   _   1   _   _   _   _   _   _   _   _   _   _
r        _   _   _   _   _   _   _   _   _   _   _   _   1
R        _   _   _   _   _   _   _   _   _   _   _   _   1
s        _   _   _   _   _   _   _   _   _   _   ts  _   _
S        _   _   _   _   _   _   _   _   _   _   ts  _   _
t        t'  _   _   _   1   _   _   _   _   _   _   _   _
T        _   _   _   _   1   _   _   _   _   _   _   _   _
vowel    _   _   _   _   _   _   _   _   _   1d  _   1d  1
Vowel    _   _   _   _   _   _   1   _   1   _   1   _   1
hyph     _   _   _   _   _   _   _   _   n-  _   t-  _   _
letter   _   _   _   _   _   _   _   _   _   _   _   _   _
other    #   #   #   #   #   #   #   #   #   #   #   #   #

Actions:
  1            lowercase one letter at start of word
  2            lowercase two letters at start of word
  1d           lowercase one letter at start of word, and delete next
               (and then go to state _, nothing further to do in this word)

else just go to the given state; suffix ' indicates mark start-of-word.

### Consolidate identical states and classes:

         0   1   2   3   4   5   6   7   8   9   A   B
         #   _   b   bh  d   g   h   m   n [nt]- t   ts
input\state
b        b'  _   _   _   _   _   _   1   _   _   _   _
B        _   _   _   _   _   _   _   1   _   _   _   _
[cC]     _   _   _   _   _   1   _   _   _   _   _   _
d        d'  _   _   _   _   _   _   _   1   _   _   _
[DG]     _   _   _   _   _   _   _   _   1   _   _   _
[fF]     _   _   _   2   _   _   _   _   _   _   _   _
g        g'  _   _   _   _   _   _   _   1   _   _   _
h        h'  _   bh  _   _   _   _   _   _   _   _   _
[lLNrR]  _   _   _   _   _   _   _   _   _   _   _   1
m        m'  _   _   _   _   _   _   _   _   _   _   _
n        n'  _   _   _   _   _   _   _   _   _   _   1
[pP]     _   _   1   _   _   _   _   _   _   _   _   _
[sS]     _   _   _   _   _   _   _   _   _   _   ts  _
t        t'  _   _   _   1   _   _   _   _   _   _   _
T        _   _   _   _   1   _   _   _   _   _   _   _
vowel    _   _   _   _   _   _   _   _   _   1d  _   1
Vowel    _   _   _   _   _   _   1   _   1   _   1   1
hyph     _   _   _   _   _   _   _   _ [nt]- _ [nt]- _
letter   _   _   _   _   _   _   _   _   _   _   _   _
other    #   #   #   #   #   #   #   #   #   #   #   #

So we have 20 input classes, and 12 states.

State table array will contain bytes that encode action and new state:

  0x80  -  bit flag: mark start-of-word position
  0x40  -  currently unused
  0x30  -  action mask: 4 values
           0x00  -  do nothing
           0x10  -  lowercase one letter
           0x20  -  lowercase two letters
           0x30  -  lowercase one, delete one
  0x0F  -  next-state mask
******************************************************************************/

#include "IrishCasing.h"

#include "nsUnicodeProperties.h"
#include "nsUnicharUtils.h"

namespace mozilla {

// Transition table for the Irish uppercasing state machine, indexed as
// [input class][current state]. Each byte packs:
//   0x80  flag: mark the current position as start-of-word
//   0x30  action mask (0x10 lowercase one letter, 0x20 lowercase two letters,
//         0x30 lowercase one letter and delete the next)
//   0x0F  next state
// Every entry that carries an action also moves to state 0x01 (generic
// in-word), because nothing further needs to happen in that word; an action
// entry must never fall back to state 0x00 (start), or the following letter
// would be treated as the beginning of a new word.
const uint8_t
IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
//  #     _     b     bh    d     g     h     m     n     [nt]- t     ts
  { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
  // gc/gC: lowercase the 'g', then continue in the generic in-word state.
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC]
  { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
  { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
  { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
  { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
  { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
  { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
  { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
  { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
  { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }  // other
};

// Non-ASCII code points that the character classifier treats specially.

// Hyphen characters recognised in addition to ASCII '-'.
#define HYPHEN          0x2010
#define NO_BREAK_HYPHEN 0x2011

// Acute-accented vowels, lowercase form followed by uppercase form.
#define a_ACUTE         0x00E1
#define A_ACUTE         0x00C1
#define e_ACUTE         0x00E9
#define E_ACUTE         0x00C9
#define i_ACUTE         0x00ED
#define I_ACUTE         0x00CD
#define o_ACUTE         0x00F3
#define O_ACUTE         0x00D3
#define u_ACUTE         0x00FA
#define U_ACUTE         0x00DA

// Input class for each ASCII lowercase letter; GetClass indexes this table
// with (aCh - 'a').
const uint8_t IrishCasing::sLcClasses[26] = {
  kClass_vowel,   // a
  kClass_b,       // b
  kClass_cC,      // c
  kClass_d,       // d
  kClass_vowel,   // e
  kClass_fF,      // f
  kClass_g,       // g
  kClass_h,       // h
  kClass_vowel,   // i
  kClass_letter,  // j
  kClass_letter,  // k
  kClass_lLNrR,   // l
  kClass_m,       // m
  kClass_n,       // n
  kClass_vowel,   // o
  kClass_pP,      // p
  kClass_letter,  // q
  kClass_lLNrR,   // r
  kClass_sS,      // s
  kClass_t,       // t
  kClass_vowel,   // u
  kClass_letter,  // v
  kClass_letter,  // w
  kClass_letter,  // x
  kClass_letter,  // y
  kClass_letter   // z
};

// Input class for each ASCII uppercase letter; GetClass indexes this table
// with (aCh - 'A'). Uppercase H and M map to the generic letter class, and
// uppercase N shares a class with l/L/r/R.
const uint8_t IrishCasing::sUcClasses[26] = {
  kClass_Vowel,   // A
  kClass_B,       // B
  kClass_cC,      // C
  kClass_DG,      // D
  kClass_Vowel,   // E
  kClass_fF,      // F
  kClass_DG,      // G
  kClass_letter,  // H
  kClass_Vowel,   // I
  kClass_letter,  // J
  kClass_letter,  // K
  kClass_lLNrR,   // L
  kClass_letter,  // M
  kClass_lLNrR,   // N
  kClass_Vowel,   // O
  kClass_pP,      // P
  kClass_letter,  // Q
  kClass_lLNrR,   // R
  kClass_sS,      // S
  kClass_T,       // T
  kClass_Vowel,   // U
  kClass_letter,  // V
  kClass_letter,  // W
  kClass_letter,  // X
  kClass_letter,  // Y
  kClass_letter   // Z
};

// Map a code point to the input class used to index the uppercasing state
// table.
//
// ASCII letters are resolved through the per-letter lookup tables. Any other
// character whose general category is "letter" is a vowel class if it is one
// of the ten acute-accented vowels, otherwise the generic letter class.
// Non-letters are the hyphen class for '-', U+2010 and U+2011, and the
// "other" class for everything else.
uint8_t
IrishCasing::GetClass(uint32_t aCh)
{
  if (aCh >= 'a' && aCh <= 'z') {
    return sLcClasses[aCh - 'a'];
  }
  if (aCh >= 'A' && aCh <= 'Z') {
    return sUcClasses[aCh - 'A'];
  }

  if (mozilla::unicode::GetGenCategory(aCh) != nsIUGenCategory::kLetter) {
    // Not a letter: only hyphens matter to the state machine.
    if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
      return kClass_hyph;
    }
    return kClass_other;
  }

  // Non-ASCII letter: single out the accented vowels.
  switch (aCh) {
    case a_ACUTE:
    case e_ACUTE:
    case i_ACUTE:
    case o_ACUTE:
    case u_ACUTE:
      return kClass_vowel;

    case A_ACUTE:
    case E_ACUTE:
    case I_ACUTE:
    case O_ACUTE:
    case U_ACUTE:
      return kClass_Vowel;

    default:
      return kClass_letter;
  }
}

uint32_t
IrishCasing::UpperCase(uint32_t aCh, State& aState,
                       bool& aMarkPos, uint8_t& aAction)
{
  uint8_t cls = GetClass(aCh);
  uint8_t stateEntry = sUppercaseStateTable[cls][aState];
  aMarkPos = !!(stateEntry & kMarkPositionFlag);
  aAction = (stateEntry & kActionMask) >> kActionShift;
  aState = State(stateEntry & kNextStateMask);

  return ToUpperCase(aCh);
}

} // namespace mozilla