author | Andre Natal <anatal@gmail.com> |
Wed, 05 Aug 2015 00:33:00 +0200 | |
changeset 256301 | 5eafba8094d02df16400d14401d7859e23bb26b9 |
parent 256300 | d44ecfd932525da059cc9b020e02e55c83ffe55b |
child 256302 | 9238f5ea7bdea6014180924296fe45b2fd3b5f3a |
push id | 63290 |
push user | cbook@mozilla.com |
push date | Wed, 05 Aug 2015 11:57:35 +0000 |
treeherder | mozilla-inbound@25af02694bad [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | smaug |
bugs | 1180113 |
milestone | 42.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/media/pocketsphinx/src/dict.c
+++ b/media/pocketsphinx/src/dict.c
@@ -32,16 +32,17 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * ====================================================================
  *
  */
 
 /* System headers. */
 #include <string.h>
+#include <limits.h> // We need this for LONG_MIN
 
 /* SphinxBase headers. */
 #include <sphinxbase/pio.h>
 #include <sphinxbase/strfuncs.h>
 
 /* Local headers. */
 #include "dict.h"
 
@@ -244,24 +245,24 @@ dict_write(dict_t *dict, char const *fil
         ckd_free(phones);
     }
     fclose(fh);
     return 0;
 }
 
 
 dict_t *
-dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
+dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
 {
     FILE *fp, *fp2;
     int32 n;
     lineiter_t *li;
     dict_t *d;
     s3cipid_t sil;
-    char const *dictfile = NULL, *fillerfile = NULL;
+    char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
 
     if (config) {
         dictfile = cmd_ln_str_r(config, "-dict");
         fillerfile = cmd_ln_str_r(config, "-fdict");
     }
 
     /*
      * First obtain #words in dictionary (for hash table allocation).
@@ -298,16 +299,39 @@ dict_init(cmd_ln_t *config, bin_mdef_t *
         fseek(fp2, 0L, SEEK_SET);
     }
 
     /*
      * Allocate dict entries. HACK!! Allow some extra entries for words not in file.
      * Also check for type size restrictions.
      */
     d = (dict_t *) ckd_calloc(1, sizeof(dict_t)); /* freed in dict_free() */
+    /*
+     * Load the grapheme-to-phoneme model that accompanies the dictionary as
+     * "<dictfile>.dmp". Without a dictionary file there is no name to derive,
+     * so the model is left unset.
+     */
+    if (dictfile)
+        arpafile = string_join(dictfile, ".dmp", NULL);
+    if (arpafile) {
+        d->ngram_g2p_model = ngram_model_read(NULL, arpafile, NGRAM_AUTO, logmath);
+        ckd_free((void *) arpafile);
+        if (!d->ngram_g2p_model) {
+            E_ERROR("No arpa model found \n");
+            /* Same cleanup as the size-limit failure below; the NULL guards
+             * assume unopened files are left NULL -- confirm outside this hunk. */
+            if (fp)
+                fclose(fp);
+            if (fp2)
+                fclose(fp2);
+            ckd_free(d);
+            return NULL;
+        }
+    }
+
     d->refcnt = 1;
     d->max_words =
         (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
     if (n >= MAX_S3WID) {
         E_ERROR("Number of words in dictionaries (%d) exceeds limit (%d)\n", n,
                 MAX_S3WID);
         fclose(fp);
         fclose(fp2);
@@ -469,21 +493,230 @@ dict_free(dict_t * d)
     }
 
     if (d->word)
         ckd_free((void *) d->word);
     if (d->ht)
         hash_table_free(d->ht);
     if (d->mdef)
         bin_mdef_free(d->mdef);
+    if (d->ngram_g2p_model)
+        ngram_model_free(d->ngram_g2p_model);
     ckd_free((void *) d);
 
     return 0;
 }
 
 void
 dict_report(dict_t * d)
 {
     E_INFO_NOFN("Initialization of dict_t, report:\n");
     E_INFO_NOFN("Max word: %d\n", d->max_words);
     E_INFO_NOFN("No of word: %d\n", d->n_word);
     E_INFO_NOFN("\n");
 }
+
+// Return non-zero when str begins with the prefix pre.
+int
+dict_starts_with(const char *pre, const char *str)
+{
+    size_t lenpre = strlen(pre), lenstr = strlen(str);
+    return lenstr < lenpre ? 0 : strncmp(pre, str, lenpre) == 0;
+}
+
+// Release the two strings held by a unigram_t; the struct itself is not freed.
+void
+free_unigram_t(unigram_t *unigram)
+{
+    ckd_free(unigram->word);
+    ckd_free(unigram->phone);
+}
+
+// Split a model unigram such as "e|w}UW" at its first '}'.
+// unigram.word gets the graphemes before it with the '|' joiners removed ("ew");
+// unigram.phone gets the text after it with every '|' turned into a space.
+// A unigram without '}' yields an empty phone string. Both strings are
+// allocated here; release them with free_unigram_t().
+unigram_t
+dict_split_unigram(const char * word)
+{
+    unigram_t unigram;
+    const char *sep = strchr(word, '}');
+    size_t n_graph = sep ? (size_t) (sep - word) : strlen(word);
+    const char *phone_src = word + n_graph + (sep ? 1 : 0);
+    size_t n_phone = strlen(phone_src);
+    size_t i, out = 0;
+
+    unigram.word = ckd_calloc(1, n_graph + 1);
+    for (i = 0; i < n_graph; i++) {
+        if (word[i] != '|')
+            unigram.word[out++] = word[i];
+    }
+
+    unigram.phone = ckd_calloc(1, n_phone + 1);
+    for (i = 0; i < n_phone; i++)
+        unigram.phone[i] = (phone_src[i] == '|') ? ' ' : phone_src[i];
+
+    return unigram;
+}
+
+// Choose the unigram most likely to come next at word_grapheme + word_offset.
+// Every vocabulary entry whose grapheme part is a non-empty prefix of the
+// remaining input is scored with ngram_ng_prob() against the newest entries
+// of history_list. The result holds the best entry's word id (winner_wid)
+// and the lengths of its grapheme (length_match) and phoneme (len_phoneme)
+// strings; length_match is 0 when nothing matched.
+struct winner_t
+dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
+{
+    long current_prob = LONG_MIN;
+    struct winner_t winner = { 0, 0, 0 };
+    int32 i, n_hist = 0;
+    int nused;
+    int32 ngram_order = ngram_model_get_size(model);
+    int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
+    gnode_t *gn;
+    const char *vocab;
+    const char *sub = word_grapheme + word_offset;
+    int32 prob;
+    unigram_t unigram;
+    const int32 *total_unigrams = ngram_model_get_counts(model);
+
+    // The list is walked newest entry first, which is the order the history
+    // array needs: history[0] is the unigram chosen right before this one.
+    for (gn = history_list; gn && n_hist < ngram_order; gn = gnode_next(gn))
+        history[n_hist++] = gnode_int32(gn);
+
+    for (i = 0; i < *total_unigrams; i++) {
+        vocab = ngram_word(model, i);
+        if (!vocab)
+            continue;
+        unigram = dict_split_unigram(vocab);
+        // An empty grapheme would match without consuming any input.
+        if (unigram.word[0] != '\0' && dict_starts_with(unigram.word, sub)) {
+            prob = ngram_ng_prob(model, i, history, n_hist, &nused);
+            if (winner.length_match == 0 || current_prob < prob) {
+                current_prob = prob;
+                winner.winner_wid = i;
+                winner.length_match = strlen(unigram.word);
+                winner.len_phoneme = strlen(unigram.phone);
+            }
+        }
+        free_unigram_t(&unigram);
+    }
+
+    ckd_free(history);
+
+    return winner;
+}
+
+// Convert the spelling word_grapheme into a phone string with the G2P model.
+// First the word is consumed left to right, each step taking the winning
+// unigram and appending its word id to the history. The history is then
+// replayed oldest first (skipping the leading "<s>") and the phoneme part of
+// each unigram is appended, followed by a space; a phoneme of "_" adds nothing.
+// Returns a string the caller must ckd_free(), or NULL when some part of the
+// word matches no unigram.
+char *
+dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
+{
+    char *final_phone = NULL;
+    size_t increment = 1;
+    size_t offset;
+    size_t grapheme_len = 0, final_phoneme_len = 0;
+    glist_t history_list = NULL;
+    gnode_t *gn;
+    int first = 0;
+    struct winner_t winner;
+    const char *word;
+    unigram_t unigram;
+
+    int32 wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); // start with sentence
+    history_list = glist_add_int32(history_list, wid_sentence);
+    grapheme_len = strlen(word_grapheme);
+    for (offset = 0; offset < grapheme_len; offset += increment) {
+        winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, (int) offset);
+        increment = winner.length_match;
+        if (increment == 0) {
+            E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
+            glist_free(history_list);
+            return NULL;
+        }
+        history_list = glist_add_int32(history_list, winner.winner_wid);
+        // Phoneme text plus the space appended after it below.
+        final_phoneme_len += winner.len_phoneme + 1;
+    }
+
+    history_list = glist_reverse(history_list);
+    final_phone = ckd_calloc(1, final_phoneme_len + 1);
+    for (gn = history_list; gn; gn = gnode_next(gn)) {
+        if (!first) {
+            first = 1;
+            continue;
+        }
+        word = ngram_word(ngram_g2p_model, gnode_int32(gn));
+
+        if (!word)
+            continue;
+
+        unigram = dict_split_unigram(word);
+
+        if (strcmp(unigram.phone, "_") == 0) {
+            free_unigram_t(&unigram);
+            continue;
+        }
+        strcat(final_phone, unigram.phone);
+        strcat(final_phone, " ");
+
+        free_unigram_t(&unigram);
+    }
+
+    glist_free(history_list);
+
+    return final_phone;
+}
+
+// Generate a pronunciation for word with the G2P model and add the word to
+// the in-memory dictionary; fsg_search calls this for words the dictionary lacks.
+// The second part of this function is the same as pocketsphinx.c: https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
+// Returns the id of the new word, or -1 when no pronunciation could be
+// generated, a phone is unknown to dict->mdef, or dict_add_word() fails.
+int
+dict_add_g2p_word(dict_t *dict, char const *word)
+{
+    int32 wid = -1;
+    s3cipid_t *pron = NULL;
+    char **phonestr = NULL, *tmp;
+    int np, i;
+    char *phones;
+
+    phones = dict_g2p(word, dict->ngram_g2p_model);
+    if (phones == NULL)
+        return -1;
+
+    E_INFO("Adding phone %s for word %s \n", phones, word);
+    // Split a scratch copy so the error message below can still show phones whole.
+    tmp = ckd_salloc(phones);
+    np = str2words(tmp, NULL, 0);
+    if (np <= 0) {
+        E_ERROR("No phones generated for word %s\n", word);
+        goto done;
+    }
+    phonestr = ckd_calloc(np, sizeof(*phonestr));
+    str2words(tmp, phonestr, np);
+    pron = ckd_calloc(np, sizeof(*pron));
+    for (i = 0; i < np; ++i) {
+        pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
+        if (pron[i] == -1) {
+            E_ERROR("Unknown phone %s in phone string %s\n",
+                    phonestr[i], phones);
+            goto done;
+        }
+    }
+    wid = dict_add_word(dict, word, pron, np);
+
+done:
+    ckd_free(pron);
+    ckd_free(phonestr);
+    ckd_free(tmp);
+    ckd_free(phones);
+    return wid;
+}
--- a/media/pocketsphinx/src/dict.h +++ b/media/pocketsphinx/src/dict.h @@ -39,16 +39,17 @@ #define _S3_DICT_H_ /** \file dict.h * \brief Operations on dictionary. */ /* SphinxBase headers. */ #include <sphinxbase/hash_table.h> +#include <sphinxbase/ngram_model.h> /* Local headers. */ #include "s3types.h" #include "bin_mdef.h" #include "pocketsphinx_export.h" #define S3DICT_INC_SZ 4096 @@ -81,32 +82,46 @@ typedef struct { int32 max_words; /**< #Entries allocated in dict, including empty slots */ int32 n_word; /**< #Occupied entries in dict; ie, excluding empty slots */ int32 filler_start; /**< First filler word id (read from filler dict) */ int32 filler_end; /**< Last filler word id (read from filler dict) */ s3wid_t startwid; /**< FOR INTERNAL-USE ONLY */ s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */ s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */ int nocase; + ngram_model_t *ngram_g2p_model; } dict_t; +struct winner_t +{ + size_t length_match; + int winner_wid; + size_t len_phoneme; +}; + +typedef struct +{ + char *word; + char *phone; +} unigram_t; /** * Initialize a new dictionary. * * If config and mdef are supplied, then the dictionary will be read * from the files specified by the -dict and -fdict options in config, * with case sensitivity determined by the -dictcase option. * * Otherwise an empty case-sensitive dictionary will be created. * * Return ptr to dict_t if successful, NULL otherwise. */ dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */ - bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */ + bin_mdef_t *mdef, /**< For looking up CI phone IDs (or NULL) */ + logmath_t *logmath // To load ngram_model for g2p load. logmath must be retained with logmath_retain() if it is to be used elsewhere. ); /** * Write dictionary to a file. */ int dict_write(dict_t *dict, char const *filename, char const *format); /** Return word id for given word string if present. 
Otherwise return BAD_S3WID */ @@ -198,13 +213,16 @@ dict_t *dict_retain(dict_t *d); * Release a pointer to a dictionary. */ int dict_free(dict_t *d); /** Report a dictionary structure */ void dict_report(dict_t *d /**< A dictionary structure */ ); +// g2p functions +int dict_add_g2p_word(dict_t * dict, char const *word); + #ifdef __cplusplus } #endif #endif
--- a/media/pocketsphinx/src/fsg_search.c +++ b/media/pocketsphinx/src/fsg_search.c @@ -60,16 +60,17 @@ #include <sphinxbase/cmd_ln.h> /* Local headers. */ #include "pocketsphinx_internal.h" #include "ps_lattice_internal.h" #include "fsg_search_internal.h" #include "fsg_history.h" #include "fsg_lextree.h" +#include "dict.h" /* Turn this on for detailed debugging dump */ #define __FSG_DBG__ 0 #define __FSG_DBG_CHAN__ 0 static ps_seg_t *fsg_search_seg_iter(ps_search_t *search, int32 *out_score); static ps_lattice_t *fsg_search_lattice(ps_search_t *search); static int fsg_search_prob(ps_search_t *search); @@ -134,19 +135,31 @@ fsg_search_check_dict(fsg_search_t *fsgs dict = ps_search_dict(fsgs); for (i = 0; i < fsg_model_n_word(fsg); ++i) { char const *word; int32 wid; word = fsg_model_word_str(fsg, i); wid = dict_wordid(dict, word); if (wid == BAD_S3WID) { - E_ERROR("The word '%s' is missing in the dictionary\n", word); - return FALSE; - } + E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word); + if (!dict->ngram_g2p_model) { + E_ERROR("NO dict->ngram_g2p_model. Aborting.."); + return FALSE; + } + + int new_wid = dict_add_g2p_word(dict, word); + if (new_wid > 0){ + /* Now we also have to add it to dict2pid. */ + dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid); + } else { + E_ERROR("Exiting... \n"); + return FALSE; + } + } } return TRUE; } static int fsg_search_add_altpron(fsg_search_t *fsgs, fsg_model_t *fsg) {
--- a/media/pocketsphinx/src/fsg_search_internal.h +++ b/media/pocketsphinx/src/fsg_search_internal.h @@ -64,17 +64,18 @@ typedef struct fsg_seg_s { /** * Implementation of FSG search (and "FSG set") structure. */ typedef struct fsg_search_s { ps_search_t base; hmm_context_t *hmmctx; /**< HMM context. */ - + char const *arpafile; + cmd_ln_t *config; fsg_model_t *fsg; /**< FSG model */ struct fsg_lextree_s *lextree;/**< Lextree structure for the currently active FSG */ struct fsg_history_s *history;/**< For storing the Viterbi search history */ glist_t pnode_active; /**< Those active in this frame */ glist_t pnode_active_next; /**< Those activated for the next frame */
--- a/media/pocketsphinx/src/pocketsphinx.c +++ b/media/pocketsphinx/src/pocketsphinx.c @@ -273,17 +273,17 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *co return -1; hash_table_enter(ps->searches, ckd_salloc(ps_search_name(ps->phone_loop)), ps->phone_loop); } /* Dictionary and triphone mappings (depends on acmod). */ /* FIXME: pass config, change arguments, implement LTS, etc. */ - if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL) + if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL) return -1; if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL) return -1; lw = cmd_ln_float32_r(config, "-lw"); /* Determine whether we are starting out in FSG or N-Gram search mode. * If neither is used skip search initialization. */ @@ -715,17 +715,17 @@ ps_load_dict(ps_decoder_t *ps, char cons cmd_ln_set_str_r(newconfig, "-dict", dictfile); if (fdictfile) cmd_ln_set_str_r(newconfig, "-fdict", fdictfile); else cmd_ln_set_str_r(newconfig, "-fdict", cmd_ln_str_r(ps->config, "-fdict")); /* Try to load it. */ - if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) { + if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) { cmd_ln_free_r(newconfig); return -1; } /* Reinit the dict2pid. */ if ((d2p = dict2pid_build(ps->acmod->mdef, dict)) == NULL) { cmd_ln_free_r(newconfig); return -1;
--- a/media/pocketsphinx/src/ps_lattice.c
+++ b/media/pocketsphinx/src/ps_lattice.c
@@ -399,20 +399,21 @@ ps_lattice_read(ps_decoder_t *ps,
     int32 pip, silpen, fillpen;
 
     dag = ckd_calloc(1, sizeof(*dag));
 
     if (ps) {
         dag->search = ps->search;
         dag->dict = dict_retain(ps->dict);
         dag->lmath = logmath_retain(ps->lmath);
         dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
     }
     else {
-        dag->dict = dict_init(NULL, NULL);
         dag->lmath = logmath_init(1.0001, 0, FALSE);
+        /* dict_init() is handed the log-math object, so it is created first. */
+        dag->dict = dict_init(NULL, NULL, dag->lmath);
         dag->frate = 100;
     }
     dag->silence = dict_silwid(dag->dict);
     dag->latnode_alloc = listelem_alloc_init(sizeof(ps_latnode_t));
     dag->latlink_alloc = listelem_alloc_init(sizeof(ps_latlink_t));
     dag->latlink_list_alloc = listelem_alloc_init(sizeof(latlink_list_t));
     dag->refcount = 1;