Bug 1390550 - Update to latest libhyphen code from upstream. r=ryanvm, a=gchang
author Jonathan Kew <jkew@mozilla.com>
Thu, 17 Aug 2017 09:58:29 +0100
changeset 356264 76c25987a2759a4c649aea1dacf7eb79e983c29f
parent 356263 fbddb5cdd3c705d1d70afa96530a9fb0522cd7ea
child 356265 e45e2146178462a5f21f95e8b0630fd76ce036e1
push id 7278
push user ryanvm@gmail.com
push date Mon, 28 Aug 2017 14:01:40 +0000
treeherder mozilla-esr52@e45e21461784 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers ryanvm, gchang
bugs 1390550
milestone 52.3.1
Bug 1390550 - Update to latest libhyphen code from upstream. r=ryanvm, a=gchang
intl/hyphenation/README.mozilla
intl/hyphenation/hyphen/AUTHORS
intl/hyphenation/hyphen/README
intl/hyphenation/hyphen/hyphen.c
--- a/intl/hyphenation/README.mozilla
+++ b/intl/hyphenation/README.mozilla
@@ -1,13 +1,13 @@
 About the hyphenation code in this directory
 ============================================
 
 The hyphen directory comes from the Hyphen library, part of the hunspell project.
-  http://sourceforge.net/projects/hunspell/files/Hyphen/.
+  https://github.com/hunspell/hyphen
 
 This code is distributed under the GPL 2.0/LGPL 2.1/MPL 1.1 tri-license, as
 detailed in the associated README and COPYING files.
 
 Note that we do not include other tools and resources found in the complete
 Hyphen package from upstream, so the original README.* files may refer to
 additional files that are not present in the Mozilla source tree.
 
new file mode 100644
--- /dev/null
+++ b/intl/hyphenation/hyphen/AUTHORS
@@ -0,0 +1,17 @@
+Libhnj was written by Raph Levien <raph at acm dot org>.
+
+Original Libhnj source with OOo's patches are managed by Rene Engelhard and
+Chris Halls at Debian: http://packages.debian.org/stable/libdevel/libhnj-dev
+and http://packages.debian.org/unstable/source/libhnj
+
+This subset of Libhnj was extended by
+Peter Novodvorsky <nidd at alt-linux dot org> (OOo integration),
+László Németh <nemeth at numbertext dot org> (non-standard and compound
+hyphenation with Unicode support),
+Nanning Buitenhuis <nanning at elvenkind dot com> (substrings.c)
+
+Write bug reports to László Németh or in the bug tracker of hunspell.sf.net.
+
+---
+Please contact Raph Levien for information about licensing for
+proprietary applications.
--- a/intl/hyphenation/hyphen/README
+++ b/intl/hyphenation/hyphen/README
@@ -43,16 +43,17 @@ tbhyphext.tex: hyphenation exception log
   Generated with the hyphenex script
   (http://www.ctan.org/tex-archive/info/digests/tugboat/hyphenex.sh)
 
   sh hyphenex.sh <tb0hyf.tex >tbhyphext.tex
 
 
 INSTALLATION
 
+autoreconf -fvi
 ./configure
 make
 make install
 
 UNIT TESTS (WITH VALGRIND DEBUGGER)
 
 make check
 VALGRIND=memcheck make check
@@ -65,16 +66,22 @@ or (under Linux)
 
 echo example | ./example hyph_en_US.dic /dev/stdin
 
 NOTE: In the case of Unicode encoded input, convert your words
 to lowercase before hyphenation (under UTF-8 console environment):
 
 cat mywords.txt | awk '{print tolower($0)}' >mywordslow.txt
 
+BUILD DLL USING CROSS-COMPILATION
+
+./configure --host i586-mingw32 --prefix=/tmp/hyphen-dll
+make
+make install
+
 DEVELOPMENT
 
 See README.hyphen for hyphenation algorithm, README.nonstandard
 and doc/tb87nemeth.pdf for non-standard hyphenation,
 README.compound for compound word hyphenation, and tests/*.
 
 Description of the dictionary format:
 
--- a/intl/hyphenation/hyphen/hyphen.c
+++ b/intl/hyphenation/hyphen/hyphen.c
@@ -34,16 +34,17 @@
  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
  * for the specific language governing rights and limitations under the
  * MPL.
  *
  */
 #include <stdlib.h> /* for NULL, malloc */
 #include <stdio.h>  /* for fprintf */
 #include <string.h> /* for strdup */
+#include <limits.h> /* for INT_MAX */
 
 #ifdef UNX
 #include <unistd.h> /* for exit */
 #endif
 
 #define noVERBOSE
 
 /* calculate hyphenmin values with long ligature length (2 or 3 characters
@@ -300,17 +301,17 @@ void hnj_hyphen_load_line(char * buf, Hy
                 }
             } else {
                 hnj_strchomp(repl + 1);
                 replindex = 0;
                 replcut = (signed char) strlen(buf);
             }
             repl = hnj_strdup(repl + 1);
           }
-	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
+	  for (i = 0; (unsigned char)buf[i] > (unsigned char)' '; i++)
 	    {
 	      if (buf[i] >= '0' && buf[i] <= '9')
 		pattern[j] = buf[i];
 	      else
 		{
 		  word[j] = buf[i];
 		  pattern[++j] = '0';
 		}
@@ -323,17 +324,17 @@ void hnj_hyphen_load_line(char * buf, Hy
 	    /* Optimize away leading zeroes */
             for (; pattern[i] == '0'; i++);
           } else {
             if (*word == '.') i++;
             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
             if (dict->utf8) {
                 int pu = -1;        /* unicode character position */
                 int ps = -1;        /* unicode start position (original replindex) */
-                int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
+                size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
                 for (; pc < (strlen(word) + 1); pc++) {
                 /* beginning of an UTF-8 character (not '10' start bits) */
                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
                     if ((ps < 0) && (replindex == pu)) {
                         ps = replindex;
                         replindex = (signed char) pc;
                     }
                     if ((ps >= 0) && ((pu - ps) == replcut)) {
@@ -391,17 +392,17 @@ hnj_hyphen_load_file (FILE *f)
 {
   HyphenDict *dict[2];
   HashTab *hashtab;
   char buf[MAX_CHARS];
   int nextlevel = 0;
   int i, j, k;
   HashEntry *e;
   int state_num = 0;
-// loading one or two dictionaries (separated by NEXTLEVEL keyword)
+/* loading one or two dictionaries (separated by NEXTLEVEL keyword) */
 for (k = 0; k < 2; k++) { 
   hashtab = hnj_hash_new ();
 #ifdef VERBOSE
   global[k] = hashtab;
 #endif
   hnj_hash_insert (hashtab, "", 0);
   dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
   dict[k]->num_states = 1;
@@ -442,17 +443,17 @@ for (k = 0; k < 2; k++) {
 	nextlevel = 1;
 	break;
       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
     }
   } else if (k == 1) {
     /* default first level: hyphen and ASCII apostrophe */
     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
-    strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
+    strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */
     buf[MAX_CHARS-1] = '\0';
     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
     if (dict[0]->utf8) {
       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
     }
   }
@@ -689,89 +690,89 @@ int hnj_ligature(unsigned char c) {
 
 /* character length of the first n byte of the input word */
 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
 {
     int i = 0;
     int j = 0;
     while (j < n && word[j] != '\0') {
       i++;
-      // Unicode ligature support
+      /* Unicode ligature support */
       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
         i += hnj_ligature(word[j + 2]);
       }
       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
     }
     return i;
 }
 
 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
 	char *** rep, int ** pos, int ** cut, int lhmin)
 {
     int i = 1, j;
 
-    // Unicode ligature support
+    /* Unicode ligature support */
     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
       i += hnj_ligature(word[2]);
     }
 
-    // ignore numbers
+    /* ignore numbers */
     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
 
     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
-      // check length of the non-standard part
+      /* check length of the non-standard part */
       if (*rep && *pos && *cut && (*rep)[j]) {
         char * rh = strchr((*rep)[j], '=');
         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
             free((*rep)[j]);
             (*rep)[j] = NULL;
             hyphens[j] = '0';
           }
        } else {
          hyphens[j] = '0';
        }
        j++;
 
-       // Unicode ligature support
+       /* Unicode ligature support */
        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
          i += hnj_ligature(word[j + 2]);
        }
     } while (utf8 && (word[j] & 0xc0) == 0x80);
     return 0;
 }
 
 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
 	char *** rep, int ** pos, int ** cut, int rhmin)
 {
     int i = 0;
     int j;
 
-    // ignore numbers
+    /* ignore numbers */
     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
 
     for (j = word_size - 1; i < rhmin && j > 0; j--) {
-      // check length of the non-standard part
+      /* check length of the non-standard part */
       if (*rep && *pos && *cut && (*rep)[j]) {
         char * rh = strchr((*rep)[j], '=');
         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
             free((*rep)[j]);
             (*rep)[j] = NULL;
             hyphens[j] = '0';
           }
        } else {
          hyphens[j] = '0';
        }
        if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
     }
     return 0;
 }
 
-// recursive function for compound level hyphenation
+/* recursive function for compound level hyphenation */
 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
     char * hyphens, char *** rep, int ** pos, int ** cut,
     int clhmin, int crhmin, int lend, int rend)
 {
   char *prep_word;
   int i, j, k;
   int state;
   char ch;
@@ -943,49 +944,49 @@ int hnj_hyphen_hyph_(HyphenDict *dict, c
                 i += matchlen[i] - 1;
           }
        }
 
   hnj_free (matchrepl);
   hnj_free (matchlen);
   hnj_free (matchindex);
 
-  // recursive hyphenation of the first (compound) level segments
+  /* recursive hyphenation of the first (compound) level segments */
   if (dict->nextlevel) {
      char ** rep2;
      int * pos2;
      int * cut2;
      char * hyphens2;
      int begin = 0;
 
      rep2 = (char**) hnj_malloc (word_size * sizeof(char *));
      pos2 = (int*) hnj_malloc (word_size * sizeof(int));
      cut2 = (int*) hnj_malloc (word_size * sizeof(int));
      hyphens2 = (char*) hnj_malloc (word_size + 3);
      for (i = 0; i < word_size; i++) rep2[i] = NULL;
      for (i = 0; i < word_size; i++) if 
         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
-        if (i - begin > 1) {
+        if (i - begin > 0) {
             int hyph = 0;
             prep_word[i + 2] = '\0';
             /* non-standard hyphenation at compound boundary (Schiffahrt) */
             if (rep && *rep && *pos && *cut && (*rep)[i]) {
                 char * l = strchr((*rep)[i], '=');
                 size_t offset = 2 + i - (*pos)[i];
                 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
                 prep_word[prep_word_size - 1] = '\0';
                 if (l) {
                     hyph = (l - (*rep)[i]) - (*pos)[i];
                     prep_word[2 + i + hyph] = '\0';
                 }
             }
             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
                 hyphens2, &rep2, &pos2, &cut2, clhmin,
                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
-            for (j = 0; j < i - begin - 1; j++) {
+            for (j = 0; j < i - begin; j++) {
                 hyphens[begin + j] = hyphens2[j];
                 if (rep2[j] && rep && pos && cut) {
                     if (!*rep && !*pos && !*cut) {
                         int k;
                         *rep = (char **) malloc(sizeof(char *) * word_size);
                         *pos = (int *) malloc(sizeof(int) * word_size);
                         *cut = (int *) malloc(sizeof(int) * word_size);
                         for (k = 0; k < word_size; k++) {
@@ -1005,17 +1006,17 @@ int hnj_hyphen_hyph_(HyphenDict *dict, c
                 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
                 prep_word[prep_word_size - 1] = '\0';
             }
         }
         begin = i + 1;
         for (j = 0; j < word_size; j++) rep2[j] = NULL;
      }
      
-     // non-compound
+     /* non-compound */
      if (begin == 0) {
         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
             rep, pos, cut, clhmin);
         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
             rep, pos, cut, crhmin);
      }
@@ -1068,33 +1069,51 @@ int hnj_hyphen_norm(const char *word, in
   hyphens[j + 1] = '\0';
 #ifdef VERBOSE
   printf ("nums: %s\n", hyphens);
 #endif
   return 0;
 }
 
 /* get the word with all possible hyphenations (output: hyphword) */
-void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
+void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens,
     char * hyphword, char *** rep, int ** pos, int ** cut)
 {
-  int hyphenslen = l + 5;
+  
+  if (word_size <= 0 || word_size > INT_MAX / 2) {
+    hyphword[0] = '\0';
+    return;
+  }
+  
+  /* hyphword buffer size must be at least 2 * l */
+  int hyphword_size = 2 * word_size - 1;
+
+  int nonstandard = 0;
+  if (*rep && *pos && *cut) {
+    nonstandard = 1;
+  }
 
-  int i, j;
-  for (i = 0, j = 0; i < l; i++, j++) {
-    if (hyphens[i]&1) {
-      hyphword[j] = word[i];
-      if (*rep && *pos && *cut && (*rep)[i]) {
-        size_t offset = j - (*pos)[i] + 1;
-        strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
-        hyphword[hyphenslen-1] = '\0';
-        j += strlen((*rep)[i]) - (*pos)[i];
+  int i;
+  int j = 0;
+  for (i = 0; i < word_size && j < hyphword_size; i++) {
+    hyphword[j++] = word[i];
+    if (hyphens[i]&1 && j < hyphword_size) {
+      if (nonstandard && (*rep)[i] && j >= (*pos)[i]) {
+        /* non-standard */
+        j -= (*pos)[i];
+        char *s = (*rep)[i];
+        while (*s && j < hyphword_size) {
+          hyphword[j++] = *s++;
+        }
         i += (*cut)[i] - (*pos)[i];
-      } else hyphword[++j] = '=';
-    } else hyphword[j] = word[i];
+      } else {
+        /* standard */
+        hyphword[j++] = '=';
+      }
+    }
   }
   hyphword[j] = '\0';
 }
 
 
 /* main api function with default hyphenmin parameters */
 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
 			   const char *word, int word_size, char * hyphens,