Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug
author: Andre Natal <anatal@gmail.com>
Wed, 05 Aug 2015 00:33:00 +0200
changeset: 256301 5eafba8094d02df16400d14401d7859e23bb26b9
parent: 256300 d44ecfd932525da059cc9b020e02e55c83ffe55b
child: 256302 9238f5ea7bdea6014180924296fe45b2fd3b5f3a
push id: 63290
push user: cbook@mozilla.com
push date: Wed, 05 Aug 2015 11:57:35 +0000
treeherder: mozilla-inbound@25af02694bad [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: smaug
bugs: 1180113
milestone: 42.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug Signed-off-by: Andre Natal <anatal@gmail.com>
media/pocketsphinx/src/dict.c
media/pocketsphinx/src/dict.h
media/pocketsphinx/src/fsg_search.c
media/pocketsphinx/src/fsg_search_internal.h
media/pocketsphinx/src/pocketsphinx.c
media/pocketsphinx/src/ps_lattice.c
--- a/media/pocketsphinx/src/dict.c
+++ b/media/pocketsphinx/src/dict.c
@@ -32,16 +32,17 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * ====================================================================
  *
  */
 
 /* System headers. */
 #include <string.h>
+#include <limits.h> // We need this for LONG_MIN
 
 /* SphinxBase headers. */
 #include <sphinxbase/pio.h>
 #include <sphinxbase/strfuncs.h>
 
 /* Local headers. */
 #include "dict.h"
 
@@ -244,24 +245,24 @@ dict_write(dict_t *dict, char const *fil
         ckd_free(phones);
     }
     fclose(fh);
     return 0;
 }
 
 
 dict_t *
-dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
+dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
 {
     FILE *fp, *fp2;
     int32 n;
     lineiter_t *li;
     dict_t *d;
     s3cipid_t sil;
-    char const *dictfile = NULL, *fillerfile = NULL;
+    char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
 
     if (config) {
         dictfile = cmd_ln_str_r(config, "-dict");
         fillerfile = cmd_ln_str_r(config, "-fdict");
     }
 
     /*
      * First obtain #words in dictionary (for hash table allocation).
@@ -298,16 +299,29 @@ dict_init(cmd_ln_t *config, bin_mdef_t *
         fseek(fp2, 0L, SEEK_SET);
     }
 
     /*
      * Allocate dict entries.  HACK!!  Allow some extra entries for words not in file.
      * Also check for type size restrictions.
      */
     d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
+    if (config){
+        arpafile = string_join(dictfile, ".dmp",  NULL);
+    }
+    if (arpafile) {
+        ngram_model_t *ngram_g2p_model = ngram_model_read(NULL,arpafile,NGRAM_AUTO,logmath);
+        ckd_free(arpafile);
+        if (!ngram_g2p_model) {
+            E_ERROR("No arpa model found  \n");
+            return NULL;
+        }
+        d->ngram_g2p_model = ngram_g2p_model;
+    }
+
     d->refcnt = 1;
     d->max_words =
         (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
     if (n >= MAX_S3WID) {
         E_ERROR("Number of words in dictionaries (%d) exceeds limit (%d)\n", n,
                 MAX_S3WID);
         fclose(fp);
         fclose(fp2);
@@ -469,21 +483,253 @@ dict_free(dict_t * d)
     }
 
     if (d->word)
         ckd_free((void *) d->word);
     if (d->ht)
         hash_table_free(d->ht);
     if (d->mdef)
         bin_mdef_free(d->mdef);
+    if (d->ngram_g2p_model)
+        ngram_model_free(d->ngram_g2p_model);
     ckd_free((void *) d);
 
     return 0;
 }
 
 void
 dict_report(dict_t * d)
 {
     E_INFO_NOFN("Initialization of dict_t, report:\n");
     E_INFO_NOFN("Max word: %d\n", d->max_words);
     E_INFO_NOFN("No of word: %d\n", d->n_word);
     E_INFO_NOFN("\n");
 }
+
+// Returns 1 when str begins with the prefix pre (an empty pre always matches), 0 otherwise.
+int
+dict_starts_with(const char *pre, const char *str)
+{
+    size_t prefix_len = strlen(pre);
+    return (strlen(str) >= prefix_len && strncmp(pre, str, prefix_len) == 0) ? 1 : 0;
+}
+
+// Releases both strings held by *unigram (phone, then word); the unigram_t object itself is not freed.
+void
+free_unigram_t(unigram_t *unigram)
+{
+    ckd_free(unigram->phone);
+    ckd_free(unigram->word);
+}
+
+// Splits one g2p vocabulary entry (in format e|w}UW) at its first '}' and returns both halves in a
+// structure: unigram.word holds the graphemes with every '|' removed, unigram.phone the phonemes
+// with every '|' turned into a blank. An entry without '}' yields an empty unigram.phone.
+// Both strings are allocated here with ckd_calloc(); pass the result to free_unigram_t() when done.
+unigram_t
+dict_split_unigram(const char * word)
+{
+    unigram_t unigram;
+    const char *separator;
+    const char *grapheme_end;
+    const char *phone_start;
+    const char *src;
+    char *dst;
+    size_t total_graphemes;
+    size_t total_phone;
+
+    // The separator is tracked as a pointer, not as an index where 0 doubles as
+    // "not found", so an entry that begins with '}' gets an empty grapheme part
+    // instead of being copied wholesale into unigram.word.
+    separator = strchr(word, '}');
+    if (separator) {
+        grapheme_end = separator;
+        phone_start = separator + 1;
+    } else {
+        grapheme_end = word + strlen(word);
+        phone_start = grapheme_end;
+    }
+    total_graphemes = (size_t) (grapheme_end - word);
+    total_phone = strlen(phone_start);
+
+    // Graphemes: the buffer is sized for the unfiltered span, so dropping the
+    // '|' joiners can only leave spare room. The terminator is written
+    // explicitly rather than relying on the allocator's fill.
+    unigram.word = ckd_calloc(1, total_graphemes + 1);
+    dst = unigram.word;
+    for (src = word; src < grapheme_end; src++) {
+        if (*src != '|')
+            *dst++ = *src;
+    }
+    *dst = '\0';
+
+    // Phonemes: copied one to one, with '|' becoming the blank that separates
+    // the phones of a multi-phone entry.
+    unigram.phone = ckd_calloc(1, total_phone + 1);
+    dst = unigram.phone;
+    for (src = phone_start; *src != '\0'; src++) {
+        *dst++ = (*src == '|') ? ' ' : *src;
+    }
+    *dst = '\0';
+
+    return unigram;
+}
+
+// Picks, among all model unigrams whose grapheme part is a non-empty prefix of the not yet transcribed
+// tail of the word (word_grapheme + word_offset), the one the n-gram model scores highest given the latest
+// winners in history_list. Returns its wid and the lengths of its grapheme and phoneme parts; every field is 0 when nothing matched.
+struct winner_t
+dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
+{
+    long current_prob = LONG_MIN;
+    struct winner_t winner = { 0, 0, 0 }; // zeroed: "no match" must reach the caller as length_match == 0
+    int32 i = 0, j = 0, prob;
+    int nused;
+    int32 ngram_order = ngram_model_get_size(model);
+    int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
+    gnode_t *gn;
+    const char *vocab;
+    const char *sub = word_grapheme + word_offset; // loop invariant, so computed once
+    unigram_t unigram;
+    const int32 *total_unigrams = ngram_model_get_counts(model);
+
+    for (gn = history_list; gn; gn = gnode_next(gn)) {
+        // we need to build history from last to first because glist returns items from last to first
+        // NOTE(review): with fewer than ngram_order ids the leading slots stay 0 while j entries are passed on - confirm
+        history[ngram_order - j - 1] = gnode_int32(gn);
+        j++;
+        if (j >= ngram_order)
+            break;
+    }
+
+    for (i = 0; i < *total_unigrams; i++) {
+        vocab = ngram_word(model, i);
+        unigram  = dict_split_unigram(vocab);
+        // An empty grapheme part is a prefix of anything but can never advance through the word.
+        if (unigram.word[0] != '\0' && dict_starts_with(unigram.word, sub)) {
+            prob = ngram_ng_prob(model, i, history, j, &nused);
+            if (current_prob < prob) {
+                current_prob = prob;
+                winner.winner_wid = i;
+                winner.length_match = strlen(unigram.word);
+                winner.len_phoneme = strlen(unigram.phone);
+            }
+        }
+
+        free_unigram_t(&unigram);
+    }
+
+    if (history)
+        ckd_free(history);
+
+    return winner;
+}
+
+// Turns word_grapheme into a phone string with the g2p n-gram model. First it walks the word left to right, asking
+// dict_get_winner_wid() for the most likely unigram at each position and keeping the winners' wids as history for the next lookup; then it
+// puts that history in order and concatenates the winners' phonemes. Returns a ckd-allocated string (each phone followed by a blank) or NULL when a position has no match.
+char *
+dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
+{
+    char *final_phone = NULL, *out;
+    size_t word_offset = 0;
+    size_t grapheme_len = 0, final_phoneme_len = 0;
+    size_t total_winners = 0, phone_len;
+    glist_t history_list = NULL;
+    gnode_t *gn;
+    int first = 0;
+    struct winner_t winner;
+    const char *word;
+    unigram_t unigram;
+
+    int32 wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); // start with sentence
+    history_list = glist_add_int32(history_list, wid_sentence);
+    grapheme_len = strlen(word_grapheme);
+    while (word_offset < grapheme_len) {
+        winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, (int) word_offset);
+        if (winner.length_match == 0) {
+            E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
+            // history_list is a glist: release it with glist_free(), as the success path does
+            glist_free(history_list);
+            return NULL;
+        }
+        history_list = glist_add_int32(history_list, winner.winner_wid);
+        word_offset += winner.length_match;
+        final_phoneme_len += winner.len_phoneme;
+        total_winners++;
+    }
+
+    history_list = glist_reverse(history_list);
+    // each winner contributes at most its phone string plus one blank, so this bound cannot be exceeded
+    final_phone = ckd_calloc(1, final_phoneme_len + total_winners + 1);
+    out = final_phone;
+    for (gn = history_list; gn; gn = gnode_next(gn)) {
+        if (!first) {
+            first = 1; // the first node is the "<s>" marker that seeded the history
+            continue;
+        }
+        word = ngram_word(ngram_g2p_model, gnode_int32(gn));
+
+        if (!word)
+            continue;
+
+        unigram  = dict_split_unigram(word);
+        if (strcmp(unigram.phone, "_") != 0) { // "_" entries add nothing to the output
+            phone_len = strlen(unigram.phone);
+            memcpy(out, unigram.phone, phone_len);
+            out += phone_len;
+            *out++ = ' ';
+        }
+
+        free_unigram_t(&unigram);
+    }
+    *out = '\0';
+
+    if (history_list)
+        glist_free(history_list);
+
+    return final_phone;
+}
+
+// Receives a word missing from the dict (from fsg_search), generates its pronunciation with dict_g2p() and adds the word to the memory dict.
+// Returns the new word id, 0 when dict_g2p() yields no pronunciation, or -1 on an unknown phone or when dict_add_word() fails.
+// The second part of this function is the same as pocketsphinx.c: https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
+int
+dict_add_g2p_word(dict_t *dict, char const *word)
+{
+    int32 wid = -1;
+    s3cipid_t *pron = NULL;
+    char **phonestr = NULL, *tmp = NULL;
+    int np, i;
+    char *phones;
+
+    phones = dict_g2p(word, dict->ngram_g2p_model);
+    if (phones == NULL)
+        return 0;
+
+    E_INFO("Adding phone %s for word %s \n",  phones, word);
+    // NOTE(review): str2words() appears to tokenize tmp in place (phonestr[] is never freed per
+    // element), so it runs on a copy and phones stays whole for the error message below.
+    tmp = ckd_salloc(phones);
+    np = str2words(tmp, NULL, 0);
+    phonestr = ckd_calloc(np, sizeof(*phonestr));
+    str2words(tmp, phonestr, np);
+    pron = ckd_calloc(np, sizeof(*pron));
+    for (i = 0; i < np; ++i) {
+        pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
+        if (pron[i] == -1) {
+            E_ERROR("Unknown phone %s in phone string %s\n",
+                    phonestr[i], phones);
+            goto cleanup;
+        }
+    }
+    // dict_add_word() reports failure as -1, which is passed straight on to the caller
+    wid = dict_add_word(dict, word, pron, np);
+
+cleanup:
+    // single exit: every temporary is released whether or not the word was added
+    ckd_free(phonestr);
+    ckd_free(tmp);
+    ckd_free(phones);
+    ckd_free(pron);
+
+    return wid;
+}
--- a/media/pocketsphinx/src/dict.h
+++ b/media/pocketsphinx/src/dict.h
@@ -39,16 +39,17 @@
 #define _S3_DICT_H_
 
 /** \file dict.h
  * \brief Operations on dictionary. 
  */
 
 /* SphinxBase headers. */
 #include <sphinxbase/hash_table.h>
+#include <sphinxbase/ngram_model.h>
 
 /* Local headers. */
 #include "s3types.h"
 #include "bin_mdef.h"
 #include "pocketsphinx_export.h"
 
 #define S3DICT_INC_SZ 4096
 
@@ -81,32 +82,46 @@ typedef struct {
     int32 max_words;	/**< #Entries allocated in dict, including empty slots */
     int32 n_word;	/**< #Occupied entries in dict; ie, excluding empty slots */
     int32 filler_start;	/**< First filler word id (read from filler dict) */
     int32 filler_end;	/**< Last filler word id (read from filler dict) */
     s3wid_t startwid;	/**< FOR INTERNAL-USE ONLY */
     s3wid_t finishwid;	/**< FOR INTERNAL-USE ONLY */
     s3wid_t silwid;	/**< FOR INTERNAL-USE ONLY */
     int nocase;
+    ngram_model_t *ngram_g2p_model;
 } dict_t;
 
+struct winner_t             /* Best unigram found by dict_get_winner_wid() for one position in a word */
+{
+    size_t length_match;    /**< strlen() of the winning unigram's grapheme part */
+    int winner_wid;         /**< Index of the winning unigram in the g2p n-gram model */
+    size_t len_phoneme;     /**< strlen() of the winning unigram's phoneme part */
+};
+
+typedef struct              /* One g2p vocabulary entry as split by dict_split_unigram() */
+{
+    char *word;             /**< Grapheme part (text before '}'); released by free_unigram_t() */
+    char *phone;            /**< Phoneme part (text after '}'); released by free_unigram_t() */
+} unigram_t;
 
 /**
  * Initialize a new dictionary.
  *
  * If config and mdef are supplied, then the dictionary will be read
  * from the files specified by the -dict and -fdict options in config,
  * with case sensitivity determined by the -dictcase option.
  *
  * Otherwise an empty case-sensitive dictionary will be created.
  *
  * Return ptr to dict_t if successful, NULL otherwise.
  */
 dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */
-                  bin_mdef_t *mdef  /**< For looking up CI phone IDs (or NULL) */
+                  bin_mdef_t *mdef,  /**< For looking up CI phone IDs (or NULL) */
+                  logmath_t *logmath /**< Passed to ngram_model_read() when the g2p n-gram model is loaded. logmath must be retained with logmath_retain() if it is to be used elsewhere. */
     );
 
 /**
  * Write dictionary to a file.
  */
 int dict_write(dict_t *dict, char const *filename, char const *format);
 
 /** Return word id for given word string if present.  Otherwise return BAD_S3WID */
@@ -198,13 +213,16 @@ dict_t *dict_retain(dict_t *d);
  * Release a pointer to a dictionary.
  */
 int dict_free(dict_t *d);
 
 /** Report a dictionary structure */
 void dict_report(dict_t *d /**< A dictionary structure */
     );
 
+// g2p: generates a pronunciation for word and adds it to dict. Returns the new word id, 0 if no pronunciation could be generated, -1 on any other failure.
+int dict_add_g2p_word(dict_t * dict, char const *word);
+
 #ifdef __cplusplus
 }
 #endif
 
 #endif
--- a/media/pocketsphinx/src/fsg_search.c
+++ b/media/pocketsphinx/src/fsg_search.c
@@ -60,16 +60,17 @@
 #include <sphinxbase/cmd_ln.h>
 
 /* Local headers. */
 #include "pocketsphinx_internal.h"
 #include "ps_lattice_internal.h"
 #include "fsg_search_internal.h"
 #include "fsg_history.h"
 #include "fsg_lextree.h"
+#include "dict.h"
 
 /* Turn this on for detailed debugging dump */
 #define __FSG_DBG__		0
 #define __FSG_DBG_CHAN__	0
 
 static ps_seg_t *fsg_search_seg_iter(ps_search_t *search, int32 *out_score);
 static ps_lattice_t *fsg_search_lattice(ps_search_t *search);
 static int fsg_search_prob(ps_search_t *search);
@@ -134,19 +135,31 @@ fsg_search_check_dict(fsg_search_t *fsgs
     dict = ps_search_dict(fsgs);
     for (i = 0; i < fsg_model_n_word(fsg); ++i) {
         char const *word;
         int32 wid;
 
         word = fsg_model_word_str(fsg, i);
         wid = dict_wordid(dict, word);
         if (wid == BAD_S3WID) {
-    	    E_ERROR("The word '%s' is missing in the dictionary\n", word);
-    	    return FALSE;
-    	}
+            E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word);
+            if (!dict->ngram_g2p_model) {
+                E_ERROR("NO dict->ngram_g2p_model. Aborting..");
+                return FALSE;
+            }
+
+            int new_wid = dict_add_g2p_word(dict, word);
+            if (new_wid > 0){
+                /* Now we also have to add it to dict2pid. */
+                dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid);
+            } else {
+                E_ERROR("Exiting... \n");
+                return FALSE;
+            }
+        }
     }
 
     return TRUE;
 }
 
 static int
 fsg_search_add_altpron(fsg_search_t *fsgs, fsg_model_t *fsg)
 {
--- a/media/pocketsphinx/src/fsg_search_internal.h
+++ b/media/pocketsphinx/src/fsg_search_internal.h
@@ -64,17 +64,18 @@ typedef struct fsg_seg_s {
 
 /**
  * Implementation of FSG search (and "FSG set") structure.
  */
 typedef struct fsg_search_s {
     ps_search_t base;
 
     hmm_context_t *hmmctx; /**< HMM context. */
-
+    char const *arpafile;
+    cmd_ln_t *config;
     fsg_model_t *fsg;		/**< FSG model */
     struct fsg_lextree_s *lextree;/**< Lextree structure for the currently
 				   active FSG */
     struct fsg_history_s *history;/**< For storing the Viterbi search history */
   
     glist_t pnode_active;	/**< Those active in this frame */
     glist_t pnode_active_next;	/**< Those activated for the next frame */
   
--- a/media/pocketsphinx/src/pocketsphinx.c
+++ b/media/pocketsphinx/src/pocketsphinx.c
@@ -273,17 +273,17 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *co
             return -1;
         hash_table_enter(ps->searches,
                          ckd_salloc(ps_search_name(ps->phone_loop)),
                          ps->phone_loop);
     }
 
     /* Dictionary and triphone mappings (depends on acmod). */
     /* FIXME: pass config, change arguments, implement LTS, etc. */
-    if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
+    if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL)
         return -1;
     if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
         return -1;
 
     lw = cmd_ln_float32_r(config, "-lw");
 
     /* Determine whether we are starting out in FSG or N-Gram search mode.
      * If neither is used skip search initialization. */
@@ -715,17 +715,17 @@ ps_load_dict(ps_decoder_t *ps, char cons
     cmd_ln_set_str_r(newconfig, "-dict", dictfile);
     if (fdictfile)
         cmd_ln_set_str_r(newconfig, "-fdict", fdictfile);
     else
         cmd_ln_set_str_r(newconfig, "-fdict",
                          cmd_ln_str_r(ps->config, "-fdict"));
 
     /* Try to load it. */
-    if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
+    if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) {
         cmd_ln_free_r(newconfig);
         return -1;
     }
 
     /* Reinit the dict2pid. */
     if ((d2p = dict2pid_build(ps->acmod->mdef, dict)) == NULL) {
         cmd_ln_free_r(newconfig);
         return -1;
--- a/media/pocketsphinx/src/ps_lattice.c
+++ b/media/pocketsphinx/src/ps_lattice.c
@@ -399,20 +399,21 @@ ps_lattice_read(ps_decoder_t *ps,
     int32 pip, silpen, fillpen;
 
     dag = ckd_calloc(1, sizeof(*dag));
 
     if (ps) {
         dag->search = ps->search;
         dag->dict = dict_retain(ps->dict);
         dag->lmath = logmath_retain(ps->lmath);
+        dag->dict = dict_init(NULL, NULL, dag->lmath);
         dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
     }
     else {
-        dag->dict = dict_init(NULL, NULL);
+        dag->dict = dict_init(NULL, NULL, dag->lmath);
         dag->lmath = logmath_init(1.0001, 0, FALSE);
         dag->frate = 100;
     }
     dag->silence = dict_silwid(dag->dict);
     dag->latnode_alloc = listelem_alloc_init(sizeof(ps_latnode_t));
     dag->latlink_alloc = listelem_alloc_init(sizeof(ps_latlink_t));
     dag->latlink_list_alloc = listelem_alloc_init(sizeof(latlink_list_t));
     dag->refcount = 1;