author | Kelly Davis <kdavis@mozilla.com> |
Thu, 21 May 2015 22:18:00 -0400 | |
changeset 247007 | 72d3499011ff584d99b67fef64c5bb231f4d1a78 |
parent 247006 | eee2aca9e032dacadab51e0f2337e9f256883004 |
child 247008 | 123b9a110a78d54a2a8d4c34cbad92cd30a16b49 |
push id | 28848 |
push user | ryanvm@gmail.com |
push date | Wed, 03 Jun 2015 20:00:13 +0000 |
treeherder | mozilla-central@0920f2325a6d [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | smaug, gerv |
bugs | 1051146 |
milestone | 41.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/cmdln_macro.h @@ -0,0 +1,386 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2006 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +/* cmdln_macro.h - Command line definitions for PocketSphinx */ + +#ifndef __PS_CMDLN_MACRO_H__ +#define __PS_CMDLN_MACRO_H__ + +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/feat.h> +#include <sphinxbase/fe.h> + +/** Minimal set of command-line options for PocketSphinx. */ +#define POCKETSPHINX_OPTIONS \ + waveform_to_cepstral_command_line_macro(), \ + cepstral_to_feature_command_line_macro(), \ + POCKETSPHINX_ACMOD_OPTIONS, \ + POCKETSPHINX_BEAM_OPTIONS, \ + POCKETSPHINX_SEARCH_OPTIONS, \ + POCKETSPHINX_DICT_OPTIONS, \ + POCKETSPHINX_NGRAM_OPTIONS, \ + POCKETSPHINX_FSG_OPTIONS, \ + POCKETSPHINX_KWS_OPTIONS, \ + POCKETSPHINX_DEBUG_OPTIONS + +/** Options for debugging and logging. */ +#define POCKETSPHINX_DEBUG_OPTIONS \ + { "-logfn", \ + ARG_STRING, \ + NULL, \ + "File to write log messages in" \ + }, \ + { "-debug", \ + ARG_INT32, \ + NULL, \ + "Verbosity level for debugging messages" \ + }, \ + { "-mfclogdir", \ + ARG_STRING, \ + NULL, \ + "Directory to log feature files to" \ + }, \ + { "-rawlogdir", \ + ARG_STRING, \ + NULL, \ + "Directory to log raw audio files to" }, \ + { "-senlogdir", \ + ARG_STRING, \ + NULL, \ + "Directory to log senone score files to" \ + } + +/** Options defining beam width parameters for tuning the search. 
*/ +#define POCKETSPHINX_BEAM_OPTIONS \ +{ "-beam", \ + ARG_FLOAT64, \ + "1e-48", \ + "Beam width applied to every frame in Viterbi search (smaller values mean wider beam)" }, \ +{ "-wbeam", \ + ARG_FLOAT64, \ + "7e-29", \ + "Beam width applied to word exits" }, \ +{ "-pbeam", \ + ARG_FLOAT64, \ + "1e-48", \ + "Beam width applied to phone transitions" }, \ +{ "-lpbeam", \ + ARG_FLOAT64, \ + "1e-40", \ + "Beam width applied to last phone in words" }, \ +{ "-lponlybeam", \ + ARG_FLOAT64, \ + "7e-29", \ + "Beam width applied to last phone in single-phone words" }, \ +{ "-fwdflatbeam", \ + ARG_FLOAT64, \ + "1e-64", \ + "Beam width applied to every frame in second-pass flat search" }, \ +{ "-fwdflatwbeam", \ + ARG_FLOAT64, \ + "7e-29", \ + "Beam width applied to word exits in second-pass flat search" }, \ +{ "-pl_window", \ + ARG_INT32, \ + "5", \ + "Phoneme lookahead window size, in frames" }, \ +{ "-pl_beam", \ + ARG_FLOAT64, \ + "1e-10", \ + "Beam width applied to phone loop search for lookahead" }, \ +{ "-pl_pbeam", \ + ARG_FLOAT64, \ + "1e-10", \ + "Beam width applied to phone loop transitions for lookahead" }, \ +{ "-pl_pip", \ + ARG_FLOAT32, \ + "1.0", \ + "Phone insertion penalty for phone loop" }, \ +{ "-pl_weight", \ + ARG_FLOAT64, \ + "3.0", \ + "Weight for phoneme lookahead penalties" } \ + +/** Options defining other parameters for tuning the search. */ +#define POCKETSPHINX_SEARCH_OPTIONS \ +{ "-compallsen", \ + ARG_BOOLEAN, \ + "no", \ + "Compute all senone scores in every frame (can be faster when there are many senones)" }, \ +{ "-fwdtree", \ + ARG_BOOLEAN, \ + "yes", \ + "Run forward lexicon-tree search (1st pass)" }, \ +{ "-fwdflat", \ + ARG_BOOLEAN, \ + "yes", \ + "Run forward flat-lexicon search over word lattice (2nd pass)" }, \ +{ "-bestpath", \ + ARG_BOOLEAN, \ + "yes", \ + "Run bestpath (Dijkstra) search over word lattice (3rd pass)" }, \ +{ "-backtrace", \ + ARG_BOOLEAN, \ + "no", \ + "Print results and backtraces to log file." 
}, \ +{ "-latsize", \ + ARG_INT32, \ + "5000", \ + "Initial backpointer table size" }, \ +{ "-maxwpf", \ + ARG_INT32, \ + "-1", \ + "Maximum number of distinct word exits at each frame (or -1 for no pruning)" }, \ +{ "-maxhmmpf", \ + ARG_INT32, \ + "30000", \ + "Maximum number of active HMMs to maintain at each frame (or -1 for no pruning)" }, \ +{ "-min_endfr", \ + ARG_INT32, \ + "0", \ + "Nodes ignored in lattice construction if they persist for fewer than N frames" }, \ +{ "-fwdflatefwid", \ + ARG_INT32, \ + "4", \ + "Minimum number of end frames for a word to be searched in fwdflat search" }, \ +{ "-fwdflatsfwin", \ + ARG_INT32, \ + "25", \ + "Window of frames in lattice to search for successor words in fwdflat search " } + +/** Command-line options for keyword spotting */ +#define POCKETSPHINX_KWS_OPTIONS \ +{ "-keyphrase", \ + ARG_STRING, \ + NULL, \ + "Keyphrase to spot"}, \ +{ "-kws", \ + ARG_STRING, \ + NULL, \ + "A file with keyphrases to spot, one per line"}, \ +{ "-kws_plp", \ + ARG_FLOAT64, \ + "1e-1", \ + "Phone loop probability for keyword spotting" }, \ +{ "-kws_threshold", \ + ARG_FLOAT64, \ + "1", \ + "Threshold for p(hyp)/p(alternatives) ratio" } + +/** Command-line options for finite state grammars. */ +#define POCKETSPHINX_FSG_OPTIONS \ + { "-fsg", \ + ARG_STRING, \ + NULL, \ + "Sphinx format finite state grammar file"}, \ +{ "-jsgf", \ + ARG_STRING, \ + NULL, \ + "JSGF grammar file" }, \ +{ "-toprule", \ + ARG_STRING, \ + NULL, \ + "Start rule for JSGF (first public rule is default)" }, \ +{ "-fsgusealtpron", \ + ARG_BOOLEAN, \ + "yes", \ + "Add alternate pronunciations to FSG"}, \ +{ "-fsgusefiller", \ + ARG_BOOLEAN, \ + "yes", \ + "Insert filler words at each state."} + +/** Command-line options for statistical language models. 
*/ +#define POCKETSPHINX_NGRAM_OPTIONS \ +{ "-allphone", \ + ARG_STRING, \ + NULL, \ + "Perform phoneme decoding with phonetic lm" }, \ +{ "-allphone_ci", \ + ARG_BOOLEAN, \ + "no", \ + "Perform phoneme decoding with phonetic lm and context-independent units only" }, \ +{ "-lm", \ + ARG_STRING, \ + NULL, \ + "Word trigram language model input file" }, \ +{ "-lmctl", \ + ARG_STRING, \ + NULL, \ + "Specify a set of language model\n"}, \ +{ "-lmname", \ + ARG_STRING, \ + NULL, \ + "Which language model in -lmctl to use by default"}, \ +{ "-lw", \ + ARG_FLOAT32, \ + "6.5", \ + "Language model probability weight" }, \ +{ "-fwdflatlw", \ + ARG_FLOAT32, \ + "8.5", \ + "Language model probability weight for flat lexicon (2nd pass) decoding" }, \ +{ "-bestpathlw", \ + ARG_FLOAT32, \ + "9.5", \ + "Language model probability weight for bestpath search" }, \ +{ "-ascale", \ + ARG_FLOAT32, \ + "20.0", \ + "Inverse of acoustic model scale for confidence score calculation" }, \ +{ "-wip", \ + ARG_FLOAT32, \ + "0.65", \ + "Word insertion penalty" }, \ +{ "-nwpen", \ + ARG_FLOAT32, \ + "1.0", \ + "New word transition penalty" }, \ +{ "-pip", \ + ARG_FLOAT32, \ + "1.0", \ + "Phone insertion penalty" }, \ +{ "-uw", \ + ARG_FLOAT32, \ + "1.0", \ + "Unigram weight" }, \ +{ "-silprob", \ + ARG_FLOAT32, \ + "0.005", \ + "Silence word transition probability" }, \ +{ "-fillprob", \ + ARG_FLOAT32, \ + "1e-8", \ + "Filler word transition probability" } \ + +/** Command-line options for dictionaries. 
*/ +#define POCKETSPHINX_DICT_OPTIONS \ + { "-dict", \ + REQARG_STRING, \ + NULL, \ + "Main pronunciation dictionary (lexicon) input file" }, \ + { "-fdict", \ + ARG_STRING, \ + NULL, \ + "Noise word pronunciation dictionary input file" }, \ + { "-dictcase", \ + ARG_BOOLEAN, \ + "no", \ + "Dictionary is case sensitive (NOTE: case insensitivity applies to ASCII characters only)" } \ + +/** Command-line options for acoustic modeling */ +#define POCKETSPHINX_ACMOD_OPTIONS \ +{ "-hmm", \ + ARG_STRING, \ + NULL, \ + "Directory containing acoustic model files."}, \ +{ "-featparams", \ + ARG_STRING, \ + NULL, \ + "File containing feature extraction parameters."}, \ +{ "-mdef", \ + ARG_STRING, \ + NULL, \ + "Model definition input file" }, \ +{ "-senmgau", \ + ARG_STRING, \ + NULL, \ + "Senone to codebook mapping input file (usually not needed)" }, \ +{ "-tmat", \ + ARG_STRING, \ + NULL, \ + "HMM state transition matrix input file" }, \ +{ "-tmatfloor", \ + ARG_FLOAT32, \ + "0.0001", \ + "HMM state transition probability floor (applied to -tmat file)" }, \ +{ "-mean", \ + ARG_STRING, \ + NULL, \ + "Mixture gaussian means input file" }, \ +{ "-var", \ + ARG_STRING, \ + NULL, \ + "Mixture gaussian variances input file" }, \ +{ "-varfloor", \ + ARG_FLOAT32, \ + "0.0001", \ + "Mixture gaussian variance floor (applied to data from -var file)" }, \ +{ "-mixw", \ + ARG_STRING, \ + NULL, \ + "Senone mixture weights input file (uncompressed)" }, \ +{ "-mixwfloor", \ + ARG_FLOAT32, \ + "0.0000001", \ + "Senone mixture weights floor (applied to data from -mixw file)" }, \ +{ "-aw", \ + ARG_INT32, \ + "1", \ + "Inverse weight applied to acoustic scores." 
}, \ +{ "-sendump", \ + ARG_STRING, \ + NULL, \ + "Senone dump (compressed mixture weights) input file" }, \ +{ "-mllr", \ + ARG_STRING, \ + NULL, \ + "MLLR transformation to apply to means and variances" }, \ +{ "-mmap", \ + ARG_BOOLEAN, \ + "yes", \ + "Use memory-mapped I/O (if possible) for model files" }, \ +{ "-ds", \ + ARG_INT32, \ + "1", \ + "Frame GMM computation downsampling ratio" }, \ +{ "-topn", \ + ARG_INT32, \ + "4", \ + "Maximum number of top Gaussians to use in scoring." }, \ +{ "-topn_beam", \ + ARG_STRING, \ + "0", \ + "Beam width used to determine top-N Gaussians (or a list, per-feature)" },\ +{ "-logbase", \ + ARG_FLOAT32, \ + "1.0001", \ + "Base in which all log-likelihoods calculated" } + +#define CMDLN_EMPTY_OPTION { NULL, 0, NULL, NULL } + +#endif /* __PS_CMDLN_MACRO_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/pocketsphinx-COPYING.txt @@ -0,0 +1,36 @@ +/* ==================================================================== + * Copyright (c) 1999-2015 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/pocketsphinx.h @@ -0,0 +1,658 @@ +/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/** + * @file pocketsphinx.h Main header file for the PocketSphinx decoder. + */ + +#ifndef __POCKETSPHINX_H__ +#define __POCKETSPHINX_H__ + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} +#endif + +/* System headers we need. 
*/ +#include <stdio.h> + +/* SphinxBase headers we need. */ +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/logmath.h> +#include <sphinxbase/fe.h> +#include <sphinxbase/feat.h> + +/* PocketSphinx headers (not many of them!) */ +#include <pocketsphinx_export.h> +#include <cmdln_macro.h> +#include <ps_lattice.h> +#include <ps_mllr.h> + +/** + * PocketSphinx speech recognizer object. + */ +typedef struct ps_decoder_s ps_decoder_t; + +#include <ps_search.h> + +/** + * PocketSphinx N-best hypothesis iterator object. + */ +typedef struct ps_astar_s ps_nbest_t; + +/** + * PocketSphinx segmentation iterator object. + */ +typedef struct ps_seg_s ps_seg_t; + +/** + * Sets default grammar and language model if they are not set explicitly and + * are present in the default search path. + */ +POCKETSPHINX_EXPORT void +ps_default_search_args(cmd_ln_t *); + +/** + * Initialize the decoder from a configuration object. + * + * @note The decoder retains ownership of the pointer + * <code>config</code>, so you must not attempt to free it manually. + * If you wish to reuse it elsewhere, call cmd_ln_retain() on it. + * + * @param config a command-line structure, as created by + * cmd_ln_parse_r() or cmd_ln_parse_file_r(). + */ +POCKETSPHINX_EXPORT +ps_decoder_t *ps_init(cmd_ln_t *config); + +/** + * Reinitialize the decoder with updated configuration. + * + * This function allows you to switch the acoustic model, dictionary, + * or other configuration without creating an entirely new decoding + * object. + * + * @note The decoder retains ownership of the pointer + * <code>config</code>, so you must not attempt to free it manually. + * If you wish to reuse it elsewhere, call cmd_ln_retain() on it. + * + * @param ps Decoder. + * @param config An optional new configuration to use. If this is + * NULL, the previous configuration will be reloaded, + * with any changes applied. + * @return 0 for success, <0 for failure. 
+ */ +POCKETSPHINX_EXPORT +int ps_reinit(ps_decoder_t *ps, cmd_ln_t *config); + +/** + * Returns the argument definitions used in ps_init(). + * + * This is here to avoid exporting global data, which is problematic + * on Win32 and Symbian (and possibly other platforms). + */ +POCKETSPHINX_EXPORT +arg_t const *ps_args(void); + +/** + * Retain a pointer to the decoder. + * + * This increments the reference count on the decoder, allowing it to + * be shared between multiple parent objects. In general you will not + * need to use this function, ever. It is mainly here for the + * convenience of scripting language bindings. + * + * @return pointer to retained decoder. + */ +POCKETSPHINX_EXPORT +ps_decoder_t *ps_retain(ps_decoder_t *ps); + +/** + * Finalize the decoder. + * + * This releases all resources associated with the decoder, including + * any language models or grammars which have been added to it, and + * the initial configuration object passed to ps_init(). + * + * @param ps Decoder to be freed. + * @return New reference count (0 if freed). + */ +POCKETSPHINX_EXPORT +int ps_free(ps_decoder_t *ps); + +/** + * Get the configuration object for this decoder. + * + * @return The configuration object for this decoder. The decoder + * retains ownership of this pointer, so you should not + * attempt to free it manually. Use cmd_ln_retain() if you + * wish to reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +cmd_ln_t *ps_get_config(ps_decoder_t *ps); + +/** + * Get the log-math computation object for this decoder. + * + * @return The log-math object for this decoder. The decoder retains + * ownership of this pointer, so you should not attempt to + * free it manually. Use logmath_retain() if you wish to + * reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +logmath_t *ps_get_logmath(ps_decoder_t *ps); + +/** + * Get the feature extraction object for this decoder. + * + * @return The feature extraction object for this decoder. 
The + * decoder retains ownership of this pointer, so you should + * not attempt to free it manually. Use fe_retain() if you + * wish to reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +fe_t *ps_get_fe(ps_decoder_t *ps); + +/** + * Get the dynamic feature computation object for this decoder. + * + * @return The dynamic feature computation object for this decoder. The + * decoder retains ownership of this pointer, so you should + * not attempt to free it manually. Use feat_retain() if you + * wish to reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +feat_t *ps_get_feat(ps_decoder_t *ps); + +/** + * Adapt current acoustic model using a linear transform. + * + * @param mllr The new transform to use, or NULL to update the existing + * transform. The decoder retains ownership of this pointer, + * so you should not attempt to free it manually. Use + * ps_mllr_retain() if you wish to reuse it + * elsewhere. + * @return The updated transform object for this decoder, or + * NULL on failure. + */ +POCKETSPHINX_EXPORT +ps_mllr_t *ps_update_mllr(ps_decoder_t *ps, ps_mllr_t *mllr); + +/** + * Reload the pronunciation dictionary from a file. + * + * This function replaces the current pronunciation dictionary with + * the one stored in dictfile. This also causes the active search + * module(s) to be reinitialized, in the same manner as calling + * ps_add_word() with update=TRUE. + * + * @param dictfile Path to dictionary file to load. + * @param fdictfile Path to filler dictionary to load, or NULL to keep + * the existing filler dictionary. + * @param format Format of the dictionary file, or NULL to determine + * automatically (currently unused,should be NULL) + */ +POCKETSPHINX_EXPORT +int ps_load_dict(ps_decoder_t *ps, char const *dictfile, + char const *fdictfile, char const *format); + +/** + * Dump the current pronunciation dictionary to a file. 
+ * + * This function dumps the current pronunciation dictionary to a text file. + * + * @param dictfile Path to file where dictionary will be written. + * @param format Format of the dictionary file, or NULL for the + * default (text) format (currently unused, should be NULL) + */ +POCKETSPHINX_EXPORT +int ps_save_dict(ps_decoder_t *ps, char const *dictfile, char const *format); + +/** + * Add a word to the pronunciation dictionary. + * + * This function adds a word to the pronunciation dictionary and the + * current language model (but, obviously, not to the current FSG if + * FSG mode is enabled). If the word is already present in one or the + * other, it does whatever is necessary to ensure that the word can be + * recognized. + * + * @param word Word string to add. + * @param phones Whitespace-separated list of phoneme strings + * describing pronunciation of <code>word</code>. + * @param update If TRUE, update the search module (whichever one is + * currently active) to recognize the newly added word. + * If adding multiple words, it is more efficient to + * pass FALSE here in all but the last word. + * @return The internal ID (>= 0) of the newly added word, or <0 on + * failure. + */ +POCKETSPHINX_EXPORT +int ps_add_word(ps_decoder_t *ps, + char const *word, + char const *phones, + int update); + +/** + * Look up the word in the dictionary and return the phone transcription + * for it. + * + * @param ps Pocketsphinx decoder + * @param word Word to look for + * + * @return Whitespace-separated phone string describing the pronunciation of the <code>word</code> + * or NULL if word is not present in the dictionary. The string is + * allocated and must be freed by the user. + */ +POCKETSPHINX_EXPORT +char *ps_lookup_word(ps_decoder_t *ps, + const char *word); + +/** + * Decode a raw audio stream. + * + * No headers are recognized in this file. 
The configuration + * parameters <tt>-samprate</tt> and <tt>-input_endian</tt> are used + * to determine the sampling rate and endianness of the stream, + * respectively. Audio is always assumed to be 16-bit signed PCM. + * + * @param ps Decoder. + * @param rawfh Previously opened file stream. + * @param maxsamps Maximum number of samples to read from rawfh, or -1 + * to read until end-of-file. + * @return Number of samples of audio. + */ +POCKETSPHINX_EXPORT +long ps_decode_raw(ps_decoder_t *ps, FILE *rawfh, + long maxsamps); + +/** + * Decode a senone score dump file. + * + * @param ps Decoder + * @param senfh Previously opened file handle positioned at start of file. + * @return Number of frames read. + */ +POCKETSPHINX_EXPORT +int ps_decode_senscr(ps_decoder_t *ps, FILE *senfh); + +/** + * Start processing of the stream of speech. Channel parameters like + * noise-level are maintained for the stream and reused among utterances. + * Times returned in segment iterators are also stream-wide. + * + * @return 0 for success, <0 on error. + */ +POCKETSPHINX_EXPORT +int ps_start_stream(ps_decoder_t *ps); + +/** + * Start utterance processing. + * + * This function should be called before any utterance data is passed + * to the decoder. It marks the start of a new utterance and + * reinitializes internal data structures. + * + * @param ps Decoder to be started. + * @return 0 for success, <0 on error. + */ +POCKETSPHINX_EXPORT +int ps_start_utt(ps_decoder_t *ps); + +/** + * Decode raw audio data. + * + * @param ps Decoder. + * @param no_search If non-zero, perform feature extraction but don't + * do any recognition yet. This may be necessary if + * your processor has trouble doing recognition in + * real-time. + * @param full_utt If non-zero, this block of data is a full utterance + * worth of data. This may allow the recognizer to + * produce more accurate results. + * @return Number of frames of data searched, or <0 for error. 
+ */ +POCKETSPHINX_EXPORT +int ps_process_raw(ps_decoder_t *ps, + int16 const *data, + size_t n_samples, + int no_search, + int full_utt); + +/** + * Decode acoustic feature data. + * + * @param ps Decoder. + * @param no_search If non-zero, perform feature extraction but don't + * do any recognition yet. This may be necessary if + * your processor has trouble doing recognition in + * real-time. + * @param full_utt If non-zero, this block of data is a full utterance + * worth of data. This may allow the recognizer to + * produce more accurate results. + * @return Number of frames of data searched, or <0 for error. + */ +POCKETSPHINX_EXPORT +int ps_process_cep(ps_decoder_t *ps, + mfcc_t **data, + int n_frames, + int no_search, + int full_utt); + +/** + * Get the number of frames of data searched. + * + * Note that there is a delay between this and the number of frames of + * audio which have been input to the system. This is due to the fact + * that acoustic features are computed using a sliding window of + * audio, and dynamic features are computed over a sliding window of + * acoustic features. + * + * @param ps Decoder. + * @return Number of frames of speech data which have been recognized + * so far. + */ +POCKETSPHINX_EXPORT +int ps_get_n_frames(ps_decoder_t *ps); + +/** + * End utterance processing. + * + * @param ps Decoder. + * @return 0 for success, <0 on error + */ +POCKETSPHINX_EXPORT +int ps_end_utt(ps_decoder_t *ps); + +/** + * Get hypothesis string and path score. + * + * @param ps Decoder. + * @param out_best_score Output: path score corresponding to returned string. + * @return String containing best hypothesis at this point in + * decoding. NULL if no hypothesis is available. + */ +POCKETSPHINX_EXPORT +char const *ps_get_hyp(ps_decoder_t *ps, int32 *out_best_score); + +/** + * Get hypothesis string and final flag. + * + * @param ps Decoder. + * @param out_is_final Output: whether the hypothesis has reached a final state in the grammar. 
+ * @return String containing best hypothesis at this point in + * decoding. NULL if no hypothesis is available. + */ +POCKETSPHINX_EXPORT +char const *ps_get_hyp_final(ps_decoder_t *ps, int32 *out_is_final); + +/** + * Get posterior probability. + * + * @note Unless the -bestpath option is enabled, this function will + * always return zero (corresponding to a posterior probability of + * 1.0). Even if -bestpath is enabled, it will also return zero when + * called on a partial result. Ongoing research into effective + * confidence annotation for partial hypotheses may result in these + * restrictions being lifted in future versions. + * + * @param ps Decoder. + * @return Posterior probability of the best hypothesis. + */ +POCKETSPHINX_EXPORT +int32 ps_get_prob(ps_decoder_t *ps); + +/** + * Get word lattice. + * + * There isn't much you can do with this so far, a public API will + * appear in the future. + * + * @param ps Decoder. + * @return Word lattice object containing all hypotheses so far. NULL + * if no hypotheses are available. This pointer is owned by + * the decoder and you should not attempt to free it manually. + * It is only valid until the next utterance, unless you use + * ps_lattice_retain() to retain it. + */ +POCKETSPHINX_EXPORT +ps_lattice_t *ps_get_lattice(ps_decoder_t *ps); + +/** + * Get an iterator over the word segmentation for the best hypothesis. + * + * @param ps Decoder. + * @param out_best_score Output: path score corresponding to hypothesis. + * @return Iterator over the best hypothesis at this point in + * decoding. NULL if no hypothesis is available. + */ +POCKETSPHINX_EXPORT +ps_seg_t *ps_seg_iter(ps_decoder_t *ps, int32 *out_best_score); + +/** + * Get the next segment in a word segmentation. + * + * @param seg Segment iterator. + * @return Updated iterator with the next segment. NULL at end of + * utterance (the iterator will be freed in this case). 
+ */ +POCKETSPHINX_EXPORT +ps_seg_t *ps_seg_next(ps_seg_t *seg); + +/** + * Get word string from a segmentation iterator. + * + * @param seg Segment iterator. + * @return Read-only string giving string name of this segment. This + * is only valid until the next call to ps_seg_next(). + */ +POCKETSPHINX_EXPORT +char const *ps_seg_word(ps_seg_t *seg); + +/** + * Get inclusive start and end frames from a segmentation iterator. + * + * @note These frame numbers are inclusive, i.e. the end frame refers + * to the last frame in which the given word or other segment was + * active. Therefore, the actual duration is *out_ef - *out_sf + 1. + * + * @param seg Segment iterator. + * @param out_sf Output: First frame index in segment. + * @param out_ef Output: Last frame index in segment. + */ +POCKETSPHINX_EXPORT +void ps_seg_frames(ps_seg_t *seg, int *out_sf, int *out_ef); + +/** + * Get language, acoustic, and posterior probabilities from a + * segmentation iterator. + * + * @note Unless the -bestpath option is enabled, this function will + * always return zero (corresponding to a posterior probability of + * 1.0). Even if -bestpath is enabled, it will also return zero when + * called on a partial result. Ongoing research into effective + * confidence annotation for partial hypotheses may result in these + * restrictions being lifted in future versions. + * + * @param out_ascr Output: acoustic model score for this segment. + * @param out_lscr Output: language model score for this segment. + * @param out_lback Output: language model backoff mode for this + * segment (i.e. the number of words used in + * calculating lscr). This field is, of course, only + * meaningful for N-Gram models. + * @return Log posterior probability of current segment. Log is + * expressed in the log-base used in the decoder. To convert + * to linear floating-point, use logmath_exp(ps_get_logmath(), + * pprob). 
+ */ +POCKETSPHINX_EXPORT +int32 ps_seg_prob(ps_seg_t *seg, int32 *out_ascr, int32 *out_lscr, int32 *out_lback); + +/** + * Finish iterating over a word segmentation early, freeing resources. + */ +POCKETSPHINX_EXPORT +void ps_seg_free(ps_seg_t *seg); + +/** + * Get an iterator over the best hypotheses, optionally within a + * selected region of the utterance. The iterator is initially empty; it must + * be advanced with ps_nbest_next() first. The function may also + * return NULL, which means that there is no hypothesis available for this + * utterance. + * + * @param ps Decoder. + * @param sf Start frame for N-best search (0 for whole utterance) + * @param ef End frame for N-best search (-1 for whole utterance) + * @param ctx1 First word of trigram context (NULL for whole utterance) + * @param ctx2 Second word of trigram context (NULL for whole utterance) + * @return Iterator over N-best hypotheses or NULL if no hypothesis is available + */ +POCKETSPHINX_EXPORT +ps_nbest_t *ps_nbest(ps_decoder_t *ps, int sf, int ef, + char const *ctx1, char const *ctx2); + +/** + * Move an N-best list iterator forward. + * + * @param nbest N-best iterator. + * @return Updated N-best iterator, or NULL if no more hypotheses are + * available (iterator is freed in this case). + */ +POCKETSPHINX_EXPORT +ps_nbest_t *ps_nbest_next(ps_nbest_t *nbest); + +/** + * Get the hypothesis string from an N-best list iterator. + * + * @param nbest N-best iterator. + * @param out_score Output: Path score for this hypothesis. + * @return String containing next best hypothesis. + */ +POCKETSPHINX_EXPORT +char const *ps_nbest_hyp(ps_nbest_t *nbest, int32 *out_score); + +/** + * Get the word segmentation from an N-best list iterator. + * + * @param nbest N-best iterator. + * @param out_score Output: Path score for this hypothesis. + * @return Iterator over the next best hypothesis. 
+ */ +POCKETSPHINX_EXPORT +ps_seg_t *ps_nbest_seg(ps_nbest_t *nbest, int32 *out_score); + +/** + * Finish N-best search early, releasing resources. + * + * @param nbest N-best iterator. + */ +POCKETSPHINX_EXPORT +void ps_nbest_free(ps_nbest_t *nbest); + +/** + * Get performance information for the current utterance. + * + * @param ps Decoder. + * @param out_nspeech Output: Number of seconds of speech. + * @param out_ncpu Output: Number of seconds of CPU time used. + * @param out_nwall Output: Number of seconds of wall time used. + */ +POCKETSPHINX_EXPORT +void ps_get_utt_time(ps_decoder_t *ps, double *out_nspeech, + double *out_ncpu, double *out_nwall); + +/** + * Get overall performance information. + * + * @param ps Decoder. + * @param out_nspeech Output: Number of seconds of speech. + * @param out_ncpu Output: Number of seconds of CPU time used. + * @param out_nwall Output: Number of seconds of wall time used. + */ +POCKETSPHINX_EXPORT +void ps_get_all_time(ps_decoder_t *ps, double *out_nspeech, + double *out_ncpu, double *out_nwall); + +/** + * Checks if the last fed audio buffer contained speech + * + * @param ps Decoder. + * @return 1 if last buffer contained speech, 0 - otherwise + */ +POCKETSPHINX_EXPORT +uint8 ps_get_in_speech(ps_decoder_t *ps); + + +/** + * Sets the limit of the raw audio data to store in decoder + * to retrieve it later on ps_get_rawdata. + * + * @param ps Decoder + * @param size bytes of the utterance to store + */ +POCKETSPHINX_EXPORT +void ps_set_rawdata_size(ps_decoder_t *ps, int32 size); + + +/** + * Retrieves the raw data collected during utterance decoding. + * + * @param ps Decoder + * @param buffer preallocated buffer to store the data, must be within the limit + * set before + * @param size size of the data collected in samples (not bytes). 
+ */ +POCKETSPHINX_EXPORT +void ps_get_rawdata(ps_decoder_t *ps, int16 **buffer, int32 *size); + +/** + * @mainpage PocketSphinx API Documentation + * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + * @version 0.6 + * @date March, 2010 + * + * @section intro_sec Introduction + * + * This is the API documentation for the PocketSphinx speech + * recognition engine. The main API calls are documented in + * <pocketsphinx.h>. + */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __POCKETSPHINX_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/pocketsphinx_export.h @@ -0,0 +1,15 @@ +#ifndef __POCKETSPHINX_EXPORT_H__ +#define __POCKETSPHINX_EXPORT_H__ + +/* + * POCKETSPHINX_EXPORT is the linkage decoration placed in front of the + * public API declarations. On Win32/WinCE compilers other than MinGW, + * Cygwin, WINSCW and Symbian (and when _WIN32_WP is not defined) it + * expands to __declspec(dllexport) if POCKETSPHINX_EXPORTS is defined + * and to __declspec(dllimport) otherwise. Everywhere else it expands + * to nothing. + * NOTE(review): POCKETSPHINX_EXPORTS is presumably defined only while + * building the library itself -- confirm against the build files. + */ +#if (defined(_WIN32) || defined(_WIN32_WCE)) && !defined(_WIN32_WP) && !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(__WINSCW__) && !defined(__SYMBIAN32__) +#ifdef POCKETSPHINX_EXPORTS /* Visual Studio */ +#define POCKETSPHINX_EXPORT __declspec(dllexport) +#else +#define POCKETSPHINX_EXPORT __declspec(dllimport) +#endif +#else /* any other platform or toolchain: no decoration */ +#define POCKETSPHINX_EXPORT +#endif + +#endif /* __POCKETSPHINX_EXPORT_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/ps_lattice.h @@ -0,0 +1,445 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +/** + * @file ps_lattice.h Word graph search + */ + +#ifndef __PS_LATTICE_H__ +#define __PS_LATTICE_H__ + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/ngram_model.h> + +/* PocketSphinx headers. */ +#include <pocketsphinx_export.h> + +/** + * Word graph structure used in bestpath/nbest search. + */ +typedef struct ps_lattice_s ps_lattice_t; + +/** + * DAG nodes. + * + * A node corresponds to a number of hypothesized instances of a word + * which all share the same starting point. + */ +typedef struct ps_latnode_s ps_latnode_t; + +/** + * Iterator over DAG nodes. + */ +typedef struct ps_latnode_s ps_latnode_iter_t; /* pay no attention to the man behind the curtain */ + +/** + * Links between DAG nodes. + * + * A link corresponds to a single hypothesized instance of a word with + * a given start and end point. + */ +typedef struct ps_latlink_s ps_latlink_t; + +/** + * Iterator over DAG links. + */ +typedef struct latlink_list_s ps_latlink_iter_t; + +/* Forward declaration needed to avoid circular includes */ +struct ps_decoder_s; + +/** + * Read a lattice from a file on disk. + * + * @param ps Decoder to use for processing this lattice, or NULL. + * @param file Path to lattice file. + * @return Newly created lattice, or NULL for failure. + */ +POCKETSPHINX_EXPORT +ps_lattice_t *ps_lattice_read(struct ps_decoder_s *ps, + char const *file); + +/** + * Retain a lattice. + * + * This function retains ownership of a lattice for the caller, + * preventing it from being freed automatically. You must call + * ps_lattice_free() to free it after having called this function. + * + * @return pointer to the retained lattice. + */ +POCKETSPHINX_EXPORT +ps_lattice_t *ps_lattice_retain(ps_lattice_t *dag); + +/** + * Free a lattice. 
+ * + * @return new reference count (0 if dag was freed) + */ +POCKETSPHINX_EXPORT +int ps_lattice_free(ps_lattice_t *dag); + +/** + * Write a lattice to disk. + * + * @return 0 for success, <0 on failure. + */ +POCKETSPHINX_EXPORT +int ps_lattice_write(ps_lattice_t *dag, char const *filename); + +/** + * Write a lattice to disk in HTK format + * + * @return 0 for success, <0 on failure. + */ +POCKETSPHINX_EXPORT +int ps_lattice_write_htk(ps_lattice_t *dag, char const *filename); + +/** + * Get the log-math computation object for this lattice + * + * @return The log-math object for this lattice. The lattice retains + * ownership of this pointer, so you should not attempt to + * free it manually. Use logmath_retain() if you wish to + * reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +logmath_t *ps_lattice_get_logmath(ps_lattice_t *dag); + + +/** + * Start iterating over nodes in the lattice. + * + * @note No particular order of traversal is guaranteed, and you + * should not depend on this. + * + * @param dag Lattice to iterate over. + * @return Iterator over lattice nodes. + */ +POCKETSPHINX_EXPORT +ps_latnode_iter_t *ps_latnode_iter(ps_lattice_t *dag); + +/** + * Move to next node in iteration. + * @param itor Node iterator. + * @return Updated node iterator, or NULL if finished + */ +POCKETSPHINX_EXPORT +ps_latnode_iter_t *ps_latnode_iter_next(ps_latnode_iter_t *itor); + +/** + * Stop iterating over nodes. + * @param itor Node iterator. + */ +POCKETSPHINX_EXPORT +void ps_latnode_iter_free(ps_latnode_iter_t *itor); + +/** + * Get node from iterator. + */ +POCKETSPHINX_EXPORT +ps_latnode_t *ps_latnode_iter_node(ps_latnode_iter_t *itor); + +/** + * Get start and end time range for a node. + * + * @param node Node inquired about. + * @param out_fef Output: End frame of first exit from this node. + * @param out_lef Output: End frame of last exit from this node. + * @return Start frame for all edges exiting this node. 
+ */ +POCKETSPHINX_EXPORT +int ps_latnode_times(ps_latnode_t *node, int16 *out_fef, int16 *out_lef); + +/** + * Get word string for this node. + * + * @param dag Lattice to which node belongs. + * @param node Node inquired about. + * @return Word string for this node (possibly a pronunciation variant). + */ +POCKETSPHINX_EXPORT +char const *ps_latnode_word(ps_lattice_t *dag, ps_latnode_t *node); + +/** + * Get base word string for this node. + * + * @param dag Lattice to which node belongs. + * @param node Node inquired about. + * @return Base word string for this node. + */ +POCKETSPHINX_EXPORT +char const *ps_latnode_baseword(ps_lattice_t *dag, ps_latnode_t *node); + +/** + * Iterate over exits from this node. + * + * @param node Node inquired about. + * @return Iterator over exit links from this node. + */ +POCKETSPHINX_EXPORT +ps_latlink_iter_t *ps_latnode_exits(ps_latnode_t *node); + +/** + * Iterate over entries to this node. + * + * @param node Node inquired about. + * @return Iterator over entry links to this node. + */ +POCKETSPHINX_EXPORT +ps_latlink_iter_t *ps_latnode_entries(ps_latnode_t *node); + +/** + * Get best posterior probability and associated acoustic score from a lattice node. + * + * @param dag Lattice to which node belongs. + * @param node Node inquired about. + * @param out_link Output: exit link with highest posterior probability + * @return Posterior probability of the best link exiting this node. + * Log is expressed in the log-base used in the decoder. To + * convert to linear floating-point, use + * logmath_exp(ps_lattice_get_logmath(), pprob). + */ +POCKETSPHINX_EXPORT +int32 ps_latnode_prob(ps_lattice_t *dag, ps_latnode_t *node, + ps_latlink_t **out_link); + +/** + * Get next link from a lattice link iterator. + * + * @param itor Iterator. + * @return Updated iterator, or NULL if finished. + */ +POCKETSPHINX_EXPORT +ps_latlink_iter_t *ps_latlink_iter_next(ps_latlink_iter_t *itor); + +/** + * Stop iterating over links. 
+ * @param itor Link iterator. + */ +POCKETSPHINX_EXPORT +void ps_latlink_iter_free(ps_latlink_iter_t *itor); + +/** + * Get link from iterator. + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_latlink_iter_link(ps_latlink_iter_t *itor); + +/** + * Get start and end times from a lattice link. + * + * @note these are <strong>inclusive</strong> - i.e. the last frame of + * this word is ef, not ef-1. + * + * @param link Link inquired about. + * @param out_sf Output: (optional) start frame of this link. + * @return End frame of this link. + */ +POCKETSPHINX_EXPORT +int ps_latlink_times(ps_latlink_t *link, int16 *out_sf); + +/** + * Get destination and source nodes from a lattice link + * + * @param link Link inquired about + * @param out_src Output: (optional) source node. + * @return destination node + */ +POCKETSPHINX_EXPORT +ps_latnode_t *ps_latlink_nodes(ps_latlink_t *link, ps_latnode_t **out_src); + +/** + * Get word string from a lattice link. + * + * @param dag Lattice to which node belongs. + * @param link Link inquired about + * @return Word string for this link (possibly a pronunciation variant). + */ +POCKETSPHINX_EXPORT +char const *ps_latlink_word(ps_lattice_t *dag, ps_latlink_t *link); + +/** + * Get base word string from a lattice link. + * + * @param dag Lattice to which node belongs. + * @param link Link inquired about + * @return Base word string for this link + */ +POCKETSPHINX_EXPORT +char const *ps_latlink_baseword(ps_lattice_t *dag, ps_latlink_t *link); + +/** + * Get predecessor link in best path. + * + * @param link Link inquired about + * @return Best previous link from bestpath search, if any. Otherwise NULL + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_latlink_pred(ps_latlink_t *link); + +/** + * Get acoustic score and posterior probability from a lattice link. + * + * @param dag Lattice to which node belongs. + * @param link Link inquired about + * @param out_ascr Output: (optional) acoustic score. + * @return Posterior probability for this link. 
Log is expressed in + * the log-base used in the decoder. To convert to linear + * floating-point, use logmath_exp(ps_lattice_get_logmath(), pprob). + */ +POCKETSPHINX_EXPORT +int32 ps_latlink_prob(ps_lattice_t *dag, ps_latlink_t *link, int32 *out_ascr); + +/** + * Create a directed link between "from" and "to" nodes, but if a link already exists, + * choose one with the best link_scr. + */ +POCKETSPHINX_EXPORT +void ps_lattice_link(ps_lattice_t *dag, ps_latnode_t *from, ps_latnode_t *to, + int32 score, int32 ef); + +/** + * Start a forward traversal of edges in a word graph. + * + * @note A keen eye will notice an inconsistency in this API versus + * other types of iterators in PocketSphinx. The reason for this is + * that the traversal algorithm is much more efficient when it is able + * to modify the lattice structure. Therefore, to avoid giving the + * impression that multiple traversals are possible at once, no + * separate iterator structure is provided. + * + * @param dag Lattice to be traversed. + * @param start Start node (source) of traversal. + * @param end End node (goal) of traversal. + * @return First link in traversal. + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_lattice_traverse_edges(ps_lattice_t *dag, ps_latnode_t *start, ps_latnode_t *end); + +/** + * Get the next link in forward traversal. + * + * @param dag Lattice to be traversed. + * @param end End node (goal) of traversal. + * @return Next link in traversal. + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_lattice_traverse_next(ps_lattice_t *dag, ps_latnode_t *end); + +/** + * Start a reverse traversal of edges in a word graph. + * + * @note See ps_lattice_traverse_edges() for why this API is the way it is. + * + * @param dag Lattice to be traversed. + * @param start Start node (goal) of traversal. + * @param end End node (source) of traversal. + * @return First link in traversal. 
+ */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_lattice_reverse_edges(ps_lattice_t *dag, ps_latnode_t *start, ps_latnode_t *end); + +/** + * Get the next link in reverse traversal. + * + * @param dag Lattice to be traversed. + * @param start Start node (goal) of traversal. + * @return Next link in traversal. + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_lattice_reverse_next(ps_lattice_t *dag, ps_latnode_t *start); + +/** + * Do N-Gram based best-path search on a word graph. + * + * This function calculates both the best path as well as the forward + * probability used in confidence estimation. + * + * @return Final link in best path, NULL on error. + */ +POCKETSPHINX_EXPORT +ps_latlink_t *ps_lattice_bestpath(ps_lattice_t *dag, ngram_model_t *lmset, + float32 lwf, float32 ascale); + +/** + * Calculate link posterior probabilities on a word graph. + * + * This function assumes that bestpath search has already been done. + * + * @return Posterior probability of the utterance as a whole. + */ +POCKETSPHINX_EXPORT +int32 ps_lattice_posterior(ps_lattice_t *dag, ngram_model_t *lmset, + float32 ascale); + +/** + * Prune all links (and associated nodes) below a certain posterior probability. + * + * This function assumes that ps_lattice_posterior() has already been called. + * + * @param beam Minimum posterior probability for links. This is + * expressed in the log-base used in the decoder. To convert + * from linear floating-point, use + * logmath_log(ps_lattice_get_logmath(), prob). + * @return number of arcs removed. + */ +POCKETSPHINX_EXPORT +int32 ps_lattice_posterior_prune(ps_lattice_t *dag, int32 beam); + +#ifdef NOT_IMPLEMENTED_YET +/** + * Expand lattice using an N-gram language model. + * + * This function expands the lattice such that each node represents a + * unique N-gram history, and adds language model scores to the links. 
+ */ +POCKETSPHINX_EXPORT +int32 ps_lattice_ngram_expand(ps_lattice_t *dag, ngram_model_t *lm); +#endif + +/** + * Get the number of frames in the lattice. + * + * @param dag The lattice in question. + * @return Number of frames in this lattice. + */ +POCKETSPHINX_EXPORT +int ps_lattice_n_frames(ps_lattice_t *dag); + +#endif /* __PS_LATTICE_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/ps_mllr.h @@ -0,0 +1,75 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +/** + * @file ps_mllr.h Model-space linear transforms for speaker adaptation + */ + +#ifndef __PS_MLLR_H__ +#define __PS_MLLR_H__ + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/ngram_model.h> + +/* PocketSphinx headers. */ +#include <pocketsphinx_export.h> + +/** + * Model-space linear transform object (opaque to API users). + */ +typedef struct ps_mllr_s ps_mllr_t; + +/** + * Read a speaker-adaptive linear transform from a file. + * + * @param file Path of the transform file to read. + * @return The transform that was read, or NULL on failure. + */ +POCKETSPHINX_EXPORT +ps_mllr_t *ps_mllr_read(char const *file); + +/** + * Retain a pointer to a linear transform. + * + * @param mllr Transform to retain. + */ +POCKETSPHINX_EXPORT +ps_mllr_t *ps_mllr_retain(ps_mllr_t *mllr); + +/** + * Release a pointer to a linear transform. + * + * @param mllr Transform to release. + */ +POCKETSPHINX_EXPORT +int ps_mllr_free(ps_mllr_t *mllr); + +#endif /* __PS_MLLR_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/ps_search.h @@ -0,0 +1,296 @@ +/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2014 Alpha Cephei Inc.. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/** + * @file ps_search.h User can configure several "search" objects with + * different grammars and language models and switch between them at runtime to + * provide an interactive experience for the user. 
+ * + * There are different possible search modes: + * + * <ul> + * <li>keyword - efficiently looks for keyphrase and ignores other speech. Allows configuring the detection threshold.</li> + * <li>grammar - recognizes speech according to JSGF grammar. Unlike keyphrase search, grammar search doesn't ignore words which are not in grammar but tries to recognize them.</li> + * <li>ngram/lm - recognizes natural speech with a language model.</li> + * <li>allphone - recognizes phonemes with a phonetic language model.</li> + * </ul> + * + * Each search has a name and can be referenced by that name; names are + * application-specific. The function ps_set_search() activates + * a search previously added under a name. + * + * To add the search one needs to point to the grammar/language model + * describing the search. The location of the grammar is specific to the + * application. + * + * The exact design of the searches depends on your application. For + * example, you might want to listen for an activation keyword first and once + * the keyword is recognized switch to ngram search to recognize the actual + * command. Once you have recognized the command you can switch to grammar + * search to recognize the confirmation and then switch back to keyword listening + * mode to wait for another command. + * + * If only a simple recognition is required it is sufficient to add a single search or + * just configure the required mode with configuration options. + */ + +#ifndef __PS_SEARCH_H__ +#define __PS_SEARCH_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sphinxbase/fsg_model.h> +#include <sphinxbase/ngram_model.h> + +/** + * PocketSphinx search iterator. + */ +typedef struct ps_search_iter_s ps_search_iter_t; + + +/** + * Activates search with the provided name. + * + * Activates search with the provided name. The search must be added before + * using either ps_set_fsg(), ps_set_lm() or ps_set_kws(). 
+ * + * @return 0 on success, 1 on failure + */ +POCKETSPHINX_EXPORT +int ps_set_search(ps_decoder_t *ps, const char *name); + +/** + * Returns name of current search in decoder + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +const char* ps_get_search(ps_decoder_t *ps); + +/** + * Unsets the search and releases related resources. + * + * Unsets the search previously added using either + * ps_set_fsg(), ps_set_lm() or ps_set_kws(). + * + * @see ps_set_fsg + * @see ps_set_lm + * @see ps_set_kws + */ +POCKETSPHINX_EXPORT +int ps_unset_search(ps_decoder_t *ps, const char *name); + +/** + * Returns iterator over current searches + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +ps_search_iter_t *ps_search_iter(ps_decoder_t *ps); + +/** + * Updates search iterator to point to the next position. + * + * This function automatically frees the iterator object upon reaching + * the final entry. + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +ps_search_iter_t *ps_search_iter_next(ps_search_iter_t *itor); + +/** + * Retrieves the name of the search the iterator points to. + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +const char* ps_search_iter_val(ps_search_iter_t *itor); + +/** + * Delete an unfinished search iterator + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +void ps_search_iter_free(ps_search_iter_t *itor); + + +/** + * Get the language model set object for this decoder. + * + * If N-Gram decoding is not enabled, this will return NULL. You will + * need to enable it using ps_set_lm(). + * + * @return The language model set object for this decoder. The + * decoder retains ownership of this pointer, so you should + * not attempt to free it manually. 
Use ngram_model_retain() + * if you wish to reuse it elsewhere. + */ +POCKETSPHINX_EXPORT +ngram_model_t *ps_get_lm(ps_decoder_t *ps, const char *name); + +/** + * Adds new search based on N-gram language model. + * + * Associates N-gram search with the provided name. The search can be activated + * using ps_set_search(). + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +int ps_set_lm(ps_decoder_t *ps, const char *name, ngram_model_t *lm); + +/** + * Adds new search based on N-gram language model loaded from a file. + * + * Convenience method to load N-gram model and create a search. + * + * @see ps_set_lm + */ +POCKETSPHINX_EXPORT +int ps_set_lm_file(ps_decoder_t *ps, const char *name, const char *path); + +/** + * Get the finite-state grammar set object for this decoder. + * + * If FSG decoding is not enabled, this returns NULL. Call + * ps_set_fsg() to enable it. + * + * @return The current FSG set object for this decoder, or + * NULL if none is available. + */ +POCKETSPHINX_EXPORT +fsg_model_t *ps_get_fsg(ps_decoder_t *ps, const char *name); + +/** + * Adds new search based on finite state grammar. + * + * Associates FSG search with the provided name. The search can be activated + * using ps_set_search(). + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +int ps_set_fsg(ps_decoder_t *ps, const char *name, fsg_model_t *fsg); + +/** + * Adds new search using JSGF model loaded from a file. + * + * Convenience method to load JSGF model and create a search. + * + * @see ps_set_fsg + */ +POCKETSPHINX_EXPORT +int ps_set_jsgf_file(ps_decoder_t *ps, const char *name, const char *path); + +/** + * Adds new search using JSGF model. + * + * Convenience method to parse JSGF model from string and create a search. + * + * @see ps_set_fsg + */ +POCKETSPHINX_EXPORT +int ps_set_jsgf_string(ps_decoder_t *ps, const char *name, const char *jsgf_string); + +/** + * Get the current Key phrase to spot + * + * If KWS is not enabled, this returns NULL. Call + * ps_set_kws() or ps_set_keyphrase() to enable it. 
+ * + * @return The current keyphrase to spot + */ +POCKETSPHINX_EXPORT +const char* ps_get_kws(ps_decoder_t *ps, const char *name); + +/** + * Adds a keyword spotting search with keywords taken from a file + * + * Associates KWS search with the provided name. The search can be activated + * using ps_set_search(). + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +int ps_set_kws(ps_decoder_t *ps, const char *name, const char *keyfile); + +/** + * Adds a keyword spotting search for a single keyphrase + * + * Associates KWS search with the provided name. The search can be activated + * using ps_set_search(). + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +int ps_set_keyphrase(ps_decoder_t *ps, const char *name, const char *keyphrase); + +/** + * Adds new search based on phone N-gram language model. + * + * Associates N-gram search with the provided name. The search can be activated + * using ps_set_search(). + * + * @see ps_set_search + */ +POCKETSPHINX_EXPORT +int ps_set_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm); + +/** + * Adds new search based on phone N-gram language model loaded from a file. + * + * Convenience method to load N-gram model and create a search. + * + * @see ps_set_allphone + */ +POCKETSPHINX_EXPORT +int ps_set_allphone_file(ps_decoder_t *ps, const char *name, const char *path); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __PS_SEARCH_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/acmod.c @@ -0,0 +1,1377 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + + +/** + * @file acmod.c Acoustic model structures for PocketSphinx. 
+ * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +/* System headers. */ +#include <assert.h> +#include <string.h> +#include <math.h> + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/err.h> +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/strfuncs.h> +#include <sphinxbase/byteorder.h> +#include <sphinxbase/feat.h> +#include <sphinxbase/bio.h> + +/* Local headers. */ +#include "cmdln_macro.h" +#include "acmod.h" +#include "s2_semi_mgau.h" +#include "ptm_mgau.h" +#include "ms_mgau.h" + +/* Feature and front-end parameters that may be in feat.params */ +static const arg_t feat_defn[] = { + waveform_to_cepstral_command_line_macro(), + cepstral_to_feature_command_line_macro(), + CMDLN_EMPTY_OPTION +}; + +#ifndef WORDS_BIGENDIAN /* NOTE(review): forces a definition when the build supplies none; confirm that the value 1 is intended. */ +#define WORDS_BIGENDIAN 1 +#endif + +static int32 acmod_process_mfcbuf(acmod_t *acmod); +/** + * Load the acoustic model described by acmod->config. + * + * Reads the model definition (-mdef) and the transition matrices + * (-tmat), selects a Gaussian computation module and, when -mllr is + * set, reads that transform and passes it to acmod_update_mllr(). + * + * @param acmod Acoustic model; its config and lmath fields are read and + * its mdef, tmat and mgau fields are set here. + * @return 0 on success, -1 when a required option is missing or a + * component fails to load. + */ +static int +acmod_init_am(acmod_t *acmod) +{ + char const *mdeffn, *tmatfn, *mllrfn, *hmmdir; + + /* Read model definition. */ + if ((mdeffn = cmd_ln_str_r(acmod->config, "-mdef")) == NULL) { /* tailor the error to whether -hmm was given */ + if ((hmmdir = cmd_ln_str_r(acmod->config, "-hmm")) == NULL) + E_ERROR("Acoustic model definition is not specified either " + "with -mdef option or with -hmm\n"); + else + E_ERROR("Folder '%s' does not contain acoustic model " + "definition 'mdef'\n", hmmdir); + + return -1; + } + + if ((acmod->mdef = bin_mdef_read(acmod->config, mdeffn)) == NULL) { + E_ERROR("Failed to read acoustic model definition from %s\n", mdeffn); + return -1; + } + + /* Read transition matrices. */ + if ((tmatfn = cmd_ln_str_r(acmod->config, "-tmat")) == NULL) { + E_ERROR("No tmat file specified\n"); + return -1; + } + acmod->tmat = tmat_init(tmatfn, acmod->lmath, + cmd_ln_float32_r(acmod->config, "-tmatfloor"), + TRUE); /* NOTE(review): the tmat_init() result is stored without a NULL check. */ + + /* Read the acoustic models. 
*/ + if ((cmd_ln_str_r(acmod->config, "-mean") == NULL) + || (cmd_ln_str_r(acmod->config, "-var") == NULL) + || (cmd_ln_str_r(acmod->config, "-tmat") == NULL)) { /* -tmat was already checked above */ + E_ERROR("No mean/var/tmat files specified\n"); + return -1; + } + + if (cmd_ln_str_r(acmod->config, "-senmgau")) { /* -senmgau given: use the multi-stream module directly */ + E_INFO("Using general multi-stream GMM computation\n"); + acmod->mgau = ms_mgau_init(acmod, acmod->lmath, acmod->mdef); + if (acmod->mgau == NULL) + return -1; + } + else { /* otherwise try PTM, then semi-continuous, then multi-stream */ + E_INFO("Attempting to use PTM computation module\n"); + if ((acmod->mgau = ptm_mgau_init(acmod, acmod->mdef)) == NULL) { + E_INFO("Attempting to use semi-continuous computation module\n"); + if ((acmod->mgau = s2_semi_mgau_init(acmod)) == NULL) { + E_INFO("Falling back to general multi-stream GMM computation\n"); + acmod->mgau = ms_mgau_init(acmod, acmod->lmath, acmod->mdef); + if (acmod->mgau == NULL) + return -1; + } + } + } + + /* If there is an MLLR transform, apply it. */ + if ((mllrfn = cmd_ln_str_r(acmod->config, "-mllr"))) { + ps_mllr_t *mllr = ps_mllr_read(mllrfn); + if (mllr == NULL) + return -1; + acmod_update_mllr(acmod, mllr); /* NOTE(review): mllr is not released here; presumably acmod_update_mllr() keeps it -- confirm. */ + } + + return 0; +} +/** + * Create and configure the feature computation object, acmod->fcb. + * + * The object is built from -feat, -cmn, -varnorm, -agc and -ceplen. + * Afterwards the optional settings -lda/-ldadim, -svspec, -agcthresh + * and -cmninit are applied when present. + * + * @param acmod Acoustic model; its config is read and its fcb field is set. + * @return 0 on success, -1 if feat_init(), feat_read_lda(), + * parse_subvecs() or feat_set_subvecs() fails. + */ +static int +acmod_init_feat(acmod_t *acmod) +{ + acmod->fcb = + feat_init(cmd_ln_str_r(acmod->config, "-feat"), + cmn_type_from_str(cmd_ln_str_r(acmod->config,"-cmn")), + cmd_ln_boolean_r(acmod->config, "-varnorm"), + agc_type_from_str(cmd_ln_str_r(acmod->config, "-agc")), + 1, cmd_ln_int32_r(acmod->config, "-ceplen")); + if (acmod->fcb == NULL) + return -1; + + if (cmd_ln_str_r(acmod->config, "-lda")) { + E_INFO("Reading linear feature transformation from %s\n", + cmd_ln_str_r(acmod->config, "-lda")); + if (feat_read_lda(acmod->fcb, + cmd_ln_str_r(acmod->config, "-lda"), + cmd_ln_int32_r(acmod->config, "-ldadim")) < 0) + return -1; + } + + if (cmd_ln_str_r(acmod->config, "-svspec")) { + int32 **subvecs; + E_INFO("Using subvector specification %s\n", + cmd_ln_str_r(acmod->config, "-svspec")); + if ((subvecs = parse_subvecs(cmd_ln_str_r(acmod->config, 
"-svspec"))) == NULL) + return -1; + if ((feat_set_subvecs(acmod->fcb, subvecs)) < 0) + return -1; + } + + if (cmd_ln_exists_r(acmod->config, "-agcthresh") + && 0 != strcmp(cmd_ln_str_r(acmod->config, "-agc"), "none")) { /* the threshold is only applied when -agc is not "none" */ + agc_set_threshold(acmod->fcb->agc_struct, + cmd_ln_float32_r(acmod->config, "-agcthresh")); + } + + if (acmod->fcb->cmn_struct + && cmd_ln_exists_r(acmod->config, "-cmninit")) { /* -cmninit: comma-separated initial values for cmn_mean[] */ + char *c, *cc, *vallist; + int32 nvals; + + vallist = ckd_salloc(cmd_ln_str_r(acmod->config, "-cmninit")); /* parsed in place below (each ',' is overwritten), then freed */ + c = vallist; + nvals = 0; + while (nvals < acmod->fcb->cmn_struct->veclen + && (cc = strchr(c, ',')) != NULL) { + *cc = '\0'; + acmod->fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c)); + c = cc + 1; + ++nvals; + } + if (nvals < acmod->fcb->cmn_struct->veclen && *c != '\0') { /* value after the last comma, if there is still room for it */ + acmod->fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c)); + } + ckd_free(vallist); + } + return 0; +} +/** + * Check whether a front-end object is incompatible with this acoustic model. + * + * Only the front-end output size is compared, against the -ceplen setting. + * + * @param acmod Acoustic model whose configuration is consulted. + * @param fe Front-end object to check. + * @return TRUE if a mismatch was found (an error is logged), FALSE otherwise. + */ +int +acmod_fe_mismatch(acmod_t *acmod, fe_t *fe) +{ + /* Output vector dimension needs to be the same. */ + if (cmd_ln_int32_r(acmod->config, "-ceplen") != fe_get_output_size(fe)) { + E_ERROR("Configured feature length %d doesn't match feature " + "extraction output size %d\n", + cmd_ln_int32_r(acmod->config, "-ceplen"), + fe_get_output_size(fe)); + return TRUE; + } + /* Feature parameters need to be the same. */ + /* ... */ + return FALSE; +} + +int +acmod_feat_mismatch(acmod_t *acmod, feat_t *fcb) +{ + /* Feature type needs to be the same. */ + if (0 != strcmp(cmd_ln_str_r(acmod->config, "-feat"), feat_name(fcb))) + return TRUE; + /* Input vector dimension needs to be the same. */ + if (cmd_ln_int32_r(acmod->config, "-ceplen") != feat_cepsize(fcb)) + return TRUE; + /* FIXME: Need to check LDA and stuff too. 
*/ + return FALSE; +} + +acmod_t * +acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb) +{ + acmod_t *acmod; + char const *featparams; + + acmod = ckd_calloc(1, sizeof(*acmod)); + acmod->config = cmd_ln_retain(config); + acmod->lmath = lmath; + acmod->state = ACMOD_IDLE; + + /* Look for feat.params in acoustic model dir. */ + if ((featparams = cmd_ln_str_r(acmod->config, "-featparams"))) { + if (NULL != + cmd_ln_parse_file_r(acmod->config, feat_defn, featparams, FALSE)) + E_INFO("Parsed model-specific feature parameters from %s\n", + featparams); + } + + /* Initialize feature computation. */ + if (fe) { + if (acmod_fe_mismatch(acmod, fe)) + goto error_out; + fe_retain(fe); + acmod->fe = fe; + } + else { + /* Initialize a new front end. */ + acmod->fe = fe_init_auto_r(config); + if (acmod->fe == NULL) + goto error_out; + if (acmod_fe_mismatch(acmod, acmod->fe)) + goto error_out; + } + if (fcb) { + if (acmod_feat_mismatch(acmod, fcb)) + goto error_out; + feat_retain(fcb); + acmod->fcb = fcb; + } + else { + /* Initialize a new fcb. */ + if (acmod_init_feat(acmod) < 0) + goto error_out; + } + + /* Load acoustic model parameters. */ + if (acmod_init_am(acmod) < 0) + goto error_out; + + + /* The MFCC buffer needs to be at least as large as the dynamic + * feature window. */ + acmod->n_mfc_alloc = acmod->fcb->window_size * 2 + 1; + acmod->mfc_buf = (mfcc_t **) + ckd_calloc_2d(acmod->n_mfc_alloc, acmod->fcb->cepsize, + sizeof(**acmod->mfc_buf)); + + /* Feature buffer has to be at least as large as MFCC buffer. */ + acmod->n_feat_alloc = acmod->n_mfc_alloc + cmd_ln_int32_r(config, "-pl_window"); + acmod->feat_buf = feat_array_alloc(acmod->fcb, acmod->n_feat_alloc); + acmod->framepos = ckd_calloc(acmod->n_feat_alloc, sizeof(*acmod->framepos)); + + acmod->utt_start_frame = 0; + + /* Senone computation stuff. 
*/ + acmod->senone_scores = ckd_calloc(bin_mdef_n_sen(acmod->mdef), + sizeof(*acmod->senone_scores)); + acmod->senone_active_vec = bitvec_alloc(bin_mdef_n_sen(acmod->mdef)); + acmod->senone_active = ckd_calloc(bin_mdef_n_sen(acmod->mdef), + sizeof(*acmod->senone_active)); + acmod->log_zero = logmath_get_zero(acmod->lmath); + acmod->compallsen = cmd_ln_boolean_r(config, "-compallsen"); + return acmod; + +error_out: + acmod_free(acmod); + return NULL; +} + +void +acmod_free(acmod_t *acmod) +{ + if (acmod == NULL) + return; + + feat_free(acmod->fcb); + fe_free(acmod->fe); + cmd_ln_free_r(acmod->config); + + if (acmod->mfc_buf) + ckd_free_2d((void **)acmod->mfc_buf); + if (acmod->feat_buf) + feat_array_free(acmod->feat_buf); + + if (acmod->mfcfh) + fclose(acmod->mfcfh); + if (acmod->rawfh) + fclose(acmod->rawfh); + if (acmod->senfh) + fclose(acmod->senfh); + + ckd_free(acmod->framepos); + ckd_free(acmod->senone_scores); + ckd_free(acmod->senone_active_vec); + ckd_free(acmod->senone_active); + ckd_free(acmod->rawdata); + + if (acmod->mdef) + bin_mdef_free(acmod->mdef); + if (acmod->tmat) + tmat_free(acmod->tmat); + if (acmod->mgau) + ps_mgau_free(acmod->mgau); + if (acmod->mllr) + ps_mllr_free(acmod->mllr); + + ckd_free(acmod); +} + +ps_mllr_t * +acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr) +{ + if (acmod->mllr) + ps_mllr_free(acmod->mllr); + acmod->mllr = mllr; + ps_mgau_transform(acmod->mgau, mllr); + + return mllr; +} + +int +acmod_write_senfh_header(acmod_t *acmod, FILE *logfh) +{ + char nsenstr[64], logbasestr[64]; + + sprintf(nsenstr, "%d", bin_mdef_n_sen(acmod->mdef)); + sprintf(logbasestr, "%f", logmath_get_base(acmod->lmath)); + return bio_writehdr(logfh, + "version", "0.1", + "mdef_file", cmd_ln_str_r(acmod->config, "-mdef"), + "n_sen", nsenstr, + "logbase", logbasestr, NULL); +} + +int +acmod_set_senfh(acmod_t *acmod, FILE *logfh) +{ + if (acmod->senfh) + fclose(acmod->senfh); + acmod->senfh = logfh; + if (logfh == NULL) + return 0; + return 
acmod_write_senfh_header(acmod, logfh); +} + +int +acmod_set_mfcfh(acmod_t *acmod, FILE *logfh) +{ + int rv = 0; + + if (acmod->mfcfh) + fclose(acmod->mfcfh); + acmod->mfcfh = logfh; + fwrite(&rv, 4, 1, acmod->mfcfh); + return rv; +} + +int +acmod_set_rawfh(acmod_t *acmod, FILE *logfh) +{ + if (acmod->rawfh) + fclose(acmod->rawfh); + acmod->rawfh = logfh; + return 0; +} + +void +acmod_grow_feat_buf(acmod_t *acmod, int nfr) +{ + if (nfr > MAX_N_FRAMES) + E_FATAL("Decoder can not process more than %d frames at once, " + "requested %d\n", MAX_N_FRAMES, nfr); + + acmod->feat_buf = feat_array_realloc(acmod->fcb, acmod->feat_buf, + acmod->n_feat_alloc, nfr); + acmod->framepos = ckd_realloc(acmod->framepos, + nfr * sizeof(*acmod->framepos)); + acmod->n_feat_alloc = nfr; +} + +int +acmod_set_grow(acmod_t *acmod, int grow_feat) +{ + int tmp = acmod->grow_feat; + acmod->grow_feat = grow_feat; + + /* Expand feat_buf to a reasonable size to start with. */ + if (grow_feat && acmod->n_feat_alloc < 128) + acmod_grow_feat_buf(acmod, 128); + + return tmp; +} + +int +acmod_start_utt(acmod_t *acmod) +{ + fe_start_utt(acmod->fe); + acmod->state = ACMOD_STARTED; + acmod->n_mfc_frame = 0; + acmod->n_feat_frame = 0; + acmod->mfc_outidx = 0; + acmod->feat_outidx = 0; + acmod->output_frame = 0; + acmod->senscr_frame = -1; + acmod->n_senone_active = 0; + acmod->mgau->frame_idx = 0; + acmod->rawdata_pos = 0; + + return 0; +} + +int +acmod_end_utt(acmod_t *acmod) +{ + int32 nfr = 0; + + acmod->state = ACMOD_ENDED; + if (acmod->n_mfc_frame < acmod->n_mfc_alloc) { + int inptr; + /* Where to start writing them (circular buffer) */ + inptr = (acmod->mfc_outidx + acmod->n_mfc_frame) % acmod->n_mfc_alloc; + /* nfr is always either zero or one. */ + fe_end_utt(acmod->fe, acmod->mfc_buf[inptr], &nfr); + acmod->n_mfc_frame += nfr; + + /* Process whatever's left, and any leadout or update stats if needed. 
*/ + if (nfr) + nfr = acmod_process_mfcbuf(acmod); + else + feat_update_stats(acmod->fcb); + } + if (acmod->mfcfh) { + long outlen; + int32 rv; + outlen = (ftell(acmod->mfcfh) - 4) / 4; + if (!WORDS_BIGENDIAN) + SWAP_INT32(&outlen); + /* Try to seek and write */ + if ((rv = fseek(acmod->mfcfh, 0, SEEK_SET)) == 0) { + fwrite(&outlen, 4, 1, acmod->mfcfh); + } + fclose(acmod->mfcfh); + acmod->mfcfh = NULL; + } + if (acmod->rawfh) { + fclose(acmod->rawfh); + acmod->rawfh = NULL; + } + + if (acmod->senfh) { + fclose(acmod->senfh); + acmod->senfh = NULL; + } + + return nfr; +} + +static int +acmod_log_mfc(acmod_t *acmod, + mfcc_t **cep, int n_frames) +{ + int i, n; + int32 *ptr = (int32 *)cep[0]; + + n = n_frames * feat_cepsize(acmod->fcb); + /* Swap bytes. */ + if (!WORDS_BIGENDIAN) { + for (i = 0; i < (n * sizeof(mfcc_t)); ++i) { + SWAP_INT32(ptr + i); + } + } + /* Write features. */ + if (fwrite(cep[0], sizeof(mfcc_t), n, acmod->mfcfh) != n) { + E_ERROR_SYSTEM("Failed to write %d values to log file", n); + } + + /* Swap them back. */ + if (!WORDS_BIGENDIAN) { + for (i = 0; i < (n * sizeof(mfcc_t)); ++i) { + SWAP_INT32(ptr + i); + } + } + return 0; +} + +static int +acmod_process_full_cep(acmod_t *acmod, + mfcc_t ***inout_cep, + int *inout_n_frames) +{ + int32 nfr; + + /* Write to log file. */ + if (acmod->mfcfh) + acmod_log_mfc(acmod, *inout_cep, *inout_n_frames); + + /* Resize feat_buf to fit. */ + if (acmod->n_feat_alloc < *inout_n_frames) { + + if (*inout_n_frames > MAX_N_FRAMES) + E_FATAL("Batch processing can not process more than %d frames " + "at once, requested %d\n", MAX_N_FRAMES, *inout_n_frames); + + feat_array_free(acmod->feat_buf); + acmod->feat_buf = feat_array_alloc(acmod->fcb, *inout_n_frames); + acmod->n_feat_alloc = *inout_n_frames; + acmod->n_feat_frame = 0; + acmod->feat_outidx = 0; + } + /* Make dynamic features. 
*/ + nfr = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, inout_n_frames, + TRUE, TRUE, acmod->feat_buf); + acmod->n_feat_frame = nfr; + assert(acmod->n_feat_frame <= acmod->n_feat_alloc); + *inout_cep += *inout_n_frames; + *inout_n_frames = 0; + + return nfr; +} + +static int +acmod_process_full_raw(acmod_t *acmod, + int16 const **inout_raw, + size_t *inout_n_samps) +{ + int32 nfr, ntail; + mfcc_t **cepptr; + + /* Write to logging file if any. */ + if (*inout_n_samps + acmod->rawdata_pos < acmod->rawdata_size) { + memcpy(acmod->rawdata + acmod->rawdata_pos, *inout_raw, *inout_n_samps * sizeof(int16)); + acmod->rawdata_pos += *inout_n_samps; + } + if (acmod->rawfh) + fwrite(*inout_raw, sizeof(int16), *inout_n_samps, acmod->rawfh); + /* Resize mfc_buf to fit. */ + if (fe_process_frames(acmod->fe, NULL, inout_n_samps, NULL, &nfr, NULL) < 0) + return -1; + if (acmod->n_mfc_alloc < nfr + 1) { + ckd_free_2d(acmod->mfc_buf); + acmod->mfc_buf = ckd_calloc_2d(nfr + 1, fe_get_output_size(acmod->fe), + sizeof(**acmod->mfc_buf)); + acmod->n_mfc_alloc = nfr + 1; + } + acmod->n_mfc_frame = 0; + acmod->mfc_outidx = 0; + fe_start_utt(acmod->fe); + if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, + acmod->mfc_buf, &nfr, NULL) < 0) + return -1; + fe_end_utt(acmod->fe, acmod->mfc_buf[nfr], &ntail); + nfr += ntail; + + cepptr = acmod->mfc_buf; + nfr = acmod_process_full_cep(acmod, &cepptr, &nfr); + acmod->n_mfc_frame = 0; + return nfr; +} + +/** + * Process MFCCs that are in the internal buffer into features. + */ +static int32 +acmod_process_mfcbuf(acmod_t *acmod) +{ + mfcc_t **mfcptr; + int32 ncep; + + ncep = acmod->n_mfc_frame; + /* Also do this in two parts because of the circular mfc_buf. */ + if (acmod->mfc_outidx + ncep > acmod->n_mfc_alloc) { + int32 ncep1 = acmod->n_mfc_alloc - acmod->mfc_outidx; + int saved_state = acmod->state; + + /* Make sure we don't end the utterance here. 
*/ + if (acmod->state == ACMOD_ENDED) + acmod->state = ACMOD_PROCESSING; + mfcptr = acmod->mfc_buf + acmod->mfc_outidx; + ncep1 = acmod_process_cep(acmod, &mfcptr, &ncep1, FALSE); + /* It's possible that not all available frames were filled. */ + ncep -= ncep1; + acmod->n_mfc_frame -= ncep1; + acmod->mfc_outidx += ncep1; + acmod->mfc_outidx %= acmod->n_mfc_alloc; + /* Restore original state (could this really be the end) */ + acmod->state = saved_state; + } + mfcptr = acmod->mfc_buf + acmod->mfc_outidx; + ncep = acmod_process_cep(acmod, &mfcptr, &ncep, FALSE); + acmod->n_mfc_frame -= ncep; + acmod->mfc_outidx += ncep; + acmod->mfc_outidx %= acmod->n_mfc_alloc; + return ncep; +} + +int +acmod_process_raw(acmod_t *acmod, + int16 const **inout_raw, + size_t *inout_n_samps, + int full_utt) +{ + int32 ncep; + int32 out_frameidx; + int16 const *prev_audio_inptr; + + /* If this is a full utterance, process it all at once. */ + if (full_utt) + return acmod_process_full_raw(acmod, inout_raw, inout_n_samps); + + /* Append MFCCs to the end of any that are previously in there + * (in practice, there will probably be none) */ + if (inout_n_samps && *inout_n_samps) { + int inptr; + int32 processed_samples; + + prev_audio_inptr = *inout_raw; + /* Total number of frames available. */ + ncep = acmod->n_mfc_alloc - acmod->n_mfc_frame; + /* Where to start writing them (circular buffer) */ + inptr = (acmod->mfc_outidx + acmod->n_mfc_frame) % acmod->n_mfc_alloc; + + /* Write them in two (or more) parts if there is wraparound. 
*/ + while (inptr + ncep > acmod->n_mfc_alloc) { + int32 ncep1 = acmod->n_mfc_alloc - inptr; + if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, + acmod->mfc_buf + inptr, &ncep1, &out_frameidx) < 0) + return -1; + + if (out_frameidx > 0) + acmod->utt_start_frame = out_frameidx; + + processed_samples = *inout_raw - prev_audio_inptr; + if (processed_samples + acmod->rawdata_pos < acmod->rawdata_size) { + memcpy(acmod->rawdata + acmod->rawdata_pos, prev_audio_inptr, processed_samples * sizeof(int16)); + acmod->rawdata_pos += processed_samples; + } + /* Write to logging file if any. */ + if (acmod->rawfh) { + fwrite(prev_audio_inptr, sizeof(int16), + processed_samples, + acmod->rawfh); + } + prev_audio_inptr = *inout_raw; + + /* ncep1 now contains the number of frames actually + * processed. This is a good thing, but it means we + * actually still might have some room left at the end of + * the buffer, hence the while loop. Unfortunately it + * also means that in the case where we are really + * actually done, we need to get out totally, hence the + * goto. */ + acmod->n_mfc_frame += ncep1; + ncep -= ncep1; + inptr += ncep1; + inptr %= acmod->n_mfc_alloc; + if (ncep1 == 0) + goto alldone; + } + + assert(inptr + ncep <= acmod->n_mfc_alloc); + if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, + acmod->mfc_buf + inptr, &ncep, &out_frameidx) < 0) + return -1; + + if (out_frameidx > 0) + acmod->utt_start_frame = out_frameidx; + + + processed_samples = *inout_raw - prev_audio_inptr; + if (processed_samples + acmod->rawdata_pos < acmod->rawdata_size) { + memcpy(acmod->rawdata + acmod->rawdata_pos, prev_audio_inptr, processed_samples * sizeof(int16)); + acmod->rawdata_pos += processed_samples; + } + if (acmod->rawfh) { + fwrite(prev_audio_inptr, sizeof(int16), + processed_samples, acmod->rawfh); + } + prev_audio_inptr = *inout_raw; + acmod->n_mfc_frame += ncep; + alldone: + ; + } + + /* Hand things off to acmod_process_cep. 
*/ + return acmod_process_mfcbuf(acmod); +} + +int +acmod_process_cep(acmod_t *acmod, + mfcc_t ***inout_cep, + int *inout_n_frames, + int full_utt) +{ + int32 nfeat, ncep, inptr; + int orig_n_frames; + + /* If this is a full utterance, process it all at once. */ + if (full_utt) + return acmod_process_full_cep(acmod, inout_cep, inout_n_frames); + + /* Write to log file. */ + if (acmod->mfcfh) + acmod_log_mfc(acmod, *inout_cep, *inout_n_frames); + + /* Maximum number of frames we're going to generate. */ + orig_n_frames = ncep = nfeat = *inout_n_frames; + + /* FIXME: This behaviour isn't guaranteed... */ + if (acmod->state == ACMOD_ENDED) + nfeat += feat_window_size(acmod->fcb); + else if (acmod->state == ACMOD_STARTED) + nfeat -= feat_window_size(acmod->fcb); + + /* Clamp number of features to fit available space. */ + if (nfeat > acmod->n_feat_alloc - acmod->n_feat_frame) { + /* Grow it as needed - we have to grow it at the end of an + * utterance because we can't return a short read there. */ + if (acmod->grow_feat || acmod->state == ACMOD_ENDED) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc + nfeat); + else + ncep -= (nfeat - (acmod->n_feat_alloc - acmod->n_feat_frame)); + } + + /* Where to start writing in the feature buffer. */ + if (acmod->grow_feat) { + /* Grow to avoid wraparound if grow_feat == TRUE. */ + inptr = acmod->feat_outidx + acmod->n_feat_frame; + while (inptr + nfeat >= acmod->n_feat_alloc) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); + } + else { + inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc; + } + + + /* FIXME: we can't split the last frame drop properly to be on the bounary, + * so just return + */ + if (inptr + nfeat > acmod->n_feat_alloc && acmod->state == ACMOD_ENDED) { + *inout_n_frames -= ncep; + *inout_cep += ncep; + return 0; + } + + /* Write them in two parts if there is wraparound. 
*/ + if (inptr + nfeat > acmod->n_feat_alloc) { + int32 ncep1 = acmod->n_feat_alloc - inptr; + + /* Make sure we don't end the utterance here. */ + nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, + &ncep1, + (acmod->state == ACMOD_STARTED), + FALSE, + acmod->feat_buf + inptr); + if (nfeat < 0) + return -1; + /* Move the output feature pointer forward. */ + acmod->n_feat_frame += nfeat; + assert(acmod->n_feat_frame <= acmod->n_feat_alloc); + inptr += nfeat; + inptr %= acmod->n_feat_alloc; + /* Move the input feature pointers forward. */ + *inout_n_frames -= ncep1; + *inout_cep += ncep1; + ncep -= ncep1; + } + + nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, + &ncep, + (acmod->state == ACMOD_STARTED), + (acmod->state == ACMOD_ENDED), + acmod->feat_buf + inptr); + if (nfeat < 0) + return -1; + acmod->n_feat_frame += nfeat; + assert(acmod->n_feat_frame <= acmod->n_feat_alloc); + /* Move the input feature pointers forward. */ + *inout_n_frames -= ncep; + *inout_cep += ncep; + if (acmod->state == ACMOD_STARTED) + acmod->state = ACMOD_PROCESSING; + + return orig_n_frames - *inout_n_frames; +} + +int +acmod_process_feat(acmod_t *acmod, + mfcc_t **feat) +{ + int i, inptr; + + if (acmod->n_feat_frame == acmod->n_feat_alloc) { + if (acmod->grow_feat) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); + else + return 0; + } + + if (acmod->grow_feat) { + /* Grow to avoid wraparound if grow_feat == TRUE. 
*/ + inptr = acmod->feat_outidx + acmod->n_feat_frame; + while (inptr + 1 >= acmod->n_feat_alloc) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); + } + else { + inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc; + } + for (i = 0; i < feat_dimension1(acmod->fcb); ++i) + memcpy(acmod->feat_buf[inptr][i], + feat[i], feat_dimension2(acmod->fcb, i) * sizeof(**feat)); + ++acmod->n_feat_frame; + assert(acmod->n_feat_frame <= acmod->n_feat_alloc); + + return 1; +} + +static int +acmod_read_senfh_header(acmod_t *acmod) +{ + char **name, **val; + int32 swap; + int i; + + if (bio_readhdr(acmod->insenfh, &name, &val, &swap) < 0) + goto error_out; + for (i = 0; name[i] != NULL; ++i) { + if (!strcmp(name[i], "n_sen")) { + if (atoi(val[i]) != bin_mdef_n_sen(acmod->mdef)) { + E_ERROR("Number of senones in senone file (%d) does not " + "match mdef (%d)\n", atoi(val[i]), + bin_mdef_n_sen(acmod->mdef)); + goto error_out; + } + } + + if (!strcmp(name[i], "logbase")) { + if (fabs(atof_c(val[i]) - logmath_get_base(acmod->lmath)) > 0.001) { + E_ERROR("Logbase in senone file (%f) does not match acmod " + "(%f)\n", atof_c(val[i]), + logmath_get_base(acmod->lmath)); + goto error_out; + } + } + } + acmod->insen_swap = swap; + bio_hdrarg_free(name, val); + return 0; +error_out: + bio_hdrarg_free(name, val); + return -1; +} + +int +acmod_set_insenfh(acmod_t *acmod, FILE *senfh) +{ + acmod->insenfh = senfh; + if (senfh == NULL) { + acmod->n_feat_frame = 0; + acmod->compallsen = cmd_ln_boolean_r(acmod->config, "-compallsen"); + return 0; + } + acmod->compallsen = TRUE; + return acmod_read_senfh_header(acmod); +} + +int +acmod_rewind(acmod_t *acmod) +{ + /* If the feature buffer is circular, this is not possible. 
*/ + if (acmod->output_frame > acmod->n_feat_alloc) { + E_ERROR("Circular feature buffer cannot be rewound (output frame %d, " + "alloc %d)\n", acmod->output_frame, acmod->n_feat_alloc); + return -1; + } + + /* Frames consumed + frames available */ + acmod->n_feat_frame = acmod->output_frame + acmod->n_feat_frame; + + /* Reset output pointers. */ + acmod->feat_outidx = 0; + acmod->output_frame = 0; + acmod->senscr_frame = -1; + acmod->mgau->frame_idx = 0; + + return 0; +} + +int +acmod_advance(acmod_t *acmod) +{ + /* Advance the output pointers. */ + if (++acmod->feat_outidx == acmod->n_feat_alloc) + acmod->feat_outidx = 0; + --acmod->n_feat_frame; + ++acmod->mgau->frame_idx; + + return ++acmod->output_frame; +} + +int +acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active, + int16 const *senscr, FILE *senfh) +{ + int16 n_active2; + + /* Uncompressed frame format: + * + * (2 bytes) n_active: Number of active senones + * If all senones active: + * (n_active * 2 bytes) scores of active senones + * + * Otherwise: + * (2 bytes) n_active: Number of active senones + * (n_active bytes) deltas to active senones + * (n_active * 2 bytes) scores of active senones + */ + n_active2 = n_active; + if (fwrite(&n_active2, 2, 1, senfh) != 1) + goto error_out; + if (n_active == bin_mdef_n_sen(acmod->mdef)) { + if (fwrite(senscr, 2, n_active, senfh) != n_active) + goto error_out; + } + else { + int i, n; + if (fwrite(active, 1, n_active, senfh) != n_active) + goto error_out; + for (i = n = 0; i < n_active; ++i) { + n += active[i]; + if (fwrite(senscr + n, 2, 1, senfh) != 1) + goto error_out; + } + } + return 0; +error_out: + E_ERROR_SYSTEM("Failed to write frame to senone file"); + return -1; +} + +/** + * Internal version, used for reading previous frames in acmod_score() + */ +static int +acmod_read_scores_internal(acmod_t *acmod) +{ + FILE *senfh = acmod->insenfh; + int16 n_active; + size_t rv; + + if (acmod->n_feat_frame == acmod->n_feat_alloc) { + if 
(acmod->grow_feat) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); + else + return 0; + } + + if (senfh == NULL) + return -1; + + if ((rv = fread(&n_active, 2, 1, senfh)) != 1) + goto error_out; + + acmod->n_senone_active = n_active; + if (acmod->n_senone_active == bin_mdef_n_sen(acmod->mdef)) { + if ((rv = fread(acmod->senone_scores, 2, + acmod->n_senone_active, senfh)) != acmod->n_senone_active) + goto error_out; + } + else { + int i, n; + + if ((rv = fread(acmod->senone_active, 1, + acmod->n_senone_active, senfh)) != acmod->n_senone_active) + goto error_out; + + for (i = 0, n = 0; i < acmod->n_senone_active; ++i) { + int j, sen = n + acmod->senone_active[i]; + for (j = n + 1; j < sen; ++j) + acmod->senone_scores[j] = SENSCR_DUMMY; + + if ((rv = fread(acmod->senone_scores + sen, 2, 1, senfh)) != 1) + goto error_out; + + n = sen; + } + + n++; + while (n < bin_mdef_n_sen(acmod->mdef)) + acmod->senone_scores[n++] = SENSCR_DUMMY; + } + return 1; + +error_out: + if (ferror(senfh)) { + E_ERROR_SYSTEM("Failed to read frame from senone file"); + return -1; + } + return 0; +} + +int +acmod_read_scores(acmod_t *acmod) +{ + int inptr, rv; + + if (acmod->grow_feat) { + /* Grow to avoid wraparound if grow_feat == TRUE. */ + inptr = acmod->feat_outidx + acmod->n_feat_frame; + /* Has to be +1, otherwise, next time acmod_advance() is + * called, this will wrap around. */ + while (inptr + 1 >= acmod->n_feat_alloc) + acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); + } + else { + inptr = (acmod->feat_outidx + acmod->n_feat_frame) % + acmod->n_feat_alloc; + } + + if ((rv = acmod_read_scores_internal(acmod)) != 1) + return rv; + + /* Set acmod->senscr_frame appropriately so that these scores + get reused below in acmod_score(). 
*/ + acmod->senscr_frame = acmod->output_frame + acmod->n_feat_frame; + + E_DEBUG(1,("Frame %d has %d active states\n", + acmod->senscr_frame, acmod->n_senone_active)); + + /* Increment the "feature frame counter" and record the file + * position for the relevant frame in the (possibly circular) + * buffer. */ + ++acmod->n_feat_frame; + acmod->framepos[inptr] = ftell(acmod->insenfh); + + return 1; +} + +static int +calc_frame_idx(acmod_t *acmod, int *inout_frame_idx) +{ + int frame_idx; + + /* Calculate the absolute frame index to be scored. */ + if (inout_frame_idx == NULL) + frame_idx = acmod->output_frame; + else if (*inout_frame_idx < 0) + frame_idx = acmod->output_frame + 1 + *inout_frame_idx; + else + frame_idx = *inout_frame_idx; + + return frame_idx; +} + +static int +calc_feat_idx(acmod_t *acmod, int frame_idx) +{ + int n_backfr, feat_idx; + + n_backfr = acmod->n_feat_alloc - acmod->n_feat_frame; + if (frame_idx < 0 || acmod->output_frame - frame_idx > n_backfr) { + E_ERROR("Frame %d outside queue of %d frames, %d alloc (%d > %d), " + "cannot score\n", frame_idx, acmod->n_feat_frame, + acmod->n_feat_alloc, acmod->output_frame - frame_idx, + n_backfr); + return -1; + } + + /* Get the index in feat_buf/framepos of the frame to be scored. */ + feat_idx = (acmod->feat_outidx + frame_idx - acmod->output_frame) % + acmod->n_feat_alloc; + if (feat_idx < 0) + feat_idx += acmod->n_feat_alloc; + + return feat_idx; +} + +mfcc_t ** +acmod_get_frame(acmod_t *acmod, int *inout_frame_idx) +{ + int frame_idx, feat_idx; + + /* Calculate the absolute frame index requested. */ + frame_idx = calc_frame_idx(acmod, inout_frame_idx); + + /* Calculate position of requested frame in circular buffer. 
*/ + if ((feat_idx = calc_feat_idx(acmod, frame_idx)) < 0) + return NULL; + + if (inout_frame_idx) + *inout_frame_idx = frame_idx; + + return acmod->feat_buf[feat_idx]; +} + +int16 const * +acmod_score(acmod_t *acmod, int *inout_frame_idx) +{ + int frame_idx, feat_idx; + + /* Calculate the absolute frame index to be scored. */ + frame_idx = calc_frame_idx(acmod, inout_frame_idx); + + /* If all senones are being computed, or we are using a senone file, + then we can reuse existing scores. */ + if ((acmod->compallsen || acmod->insenfh) + && frame_idx == acmod->senscr_frame) { + if (inout_frame_idx) + *inout_frame_idx = frame_idx; + return acmod->senone_scores; + } + + /* Calculate position of requested frame in circular buffer. */ + if ((feat_idx = calc_feat_idx(acmod, frame_idx)) < 0) + return NULL; + + /* + * If there is an input senone file locate the appropriate frame and read + * it. + */ + if (acmod->insenfh) { + fseek(acmod->insenfh, acmod->framepos[feat_idx], SEEK_SET); + if (acmod_read_scores_internal(acmod) < 0) + return NULL; + } + else { + /* Build active senone list. */ + acmod_flags2list(acmod); + + /* Generate scores for the next available frame */ + ps_mgau_frame_eval(acmod->mgau, + acmod->senone_scores, + acmod->senone_active, + acmod->n_senone_active, + acmod->feat_buf[feat_idx], + frame_idx, + acmod->compallsen); + } + + if (inout_frame_idx) + *inout_frame_idx = frame_idx; + acmod->senscr_frame = frame_idx; + + /* Dump scores to the senone dump file if one exists. 
*/ + if (acmod->senfh) { + if (acmod_write_scores(acmod, acmod->n_senone_active, + acmod->senone_active, + acmod->senone_scores, + acmod->senfh) < 0) + return NULL; + E_DEBUG(1,("Frame %d has %d active states\n", frame_idx, + acmod->n_senone_active)); + } + + return acmod->senone_scores; +} + +int +acmod_best_score(acmod_t *acmod, int *out_best_senid) +{ + int i, best; + + best = SENSCR_DUMMY; + if (acmod->compallsen) { + for (i = 0; i < bin_mdef_n_sen(acmod->mdef); ++i) { + if (acmod->senone_scores[i] < best) { + best = acmod->senone_scores[i]; + *out_best_senid = i; + } + } + } + else { + int16 *senscr; + senscr = acmod->senone_scores; + for (i = 0; i < acmod->n_senone_active; ++i) { + senscr += acmod->senone_active[i]; + if (*senscr < best) { + best = *senscr; + *out_best_senid = i; + } + } + } + return best; +} + + +void +acmod_clear_active(acmod_t *acmod) +{ + if (acmod->compallsen) + return; + bitvec_clear_all(acmod->senone_active_vec, bin_mdef_n_sen(acmod->mdef)); + acmod->n_senone_active = 0; +} + +#define MPX_BITVEC_SET(a,h,i) \ + if (hmm_mpx_ssid(h,i) != BAD_SSID) \ + bitvec_set((a)->senone_active_vec, hmm_mpx_senid(h,i)) +#define NONMPX_BITVEC_SET(a,h,i) \ + bitvec_set((a)->senone_active_vec, \ + hmm_nonmpx_senid(h,i)) + +void +acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm) +{ + int i; + + if (acmod->compallsen) + return; + if (hmm_is_mpx(hmm)) { + switch (hmm_n_emit_state(hmm)) { + case 5: + MPX_BITVEC_SET(acmod, hmm, 4); + MPX_BITVEC_SET(acmod, hmm, 3); + case 3: + MPX_BITVEC_SET(acmod, hmm, 2); + MPX_BITVEC_SET(acmod, hmm, 1); + MPX_BITVEC_SET(acmod, hmm, 0); + break; + default: + for (i = 0; i < hmm_n_emit_state(hmm); ++i) { + MPX_BITVEC_SET(acmod, hmm, i); + } + } + } + else { + switch (hmm_n_emit_state(hmm)) { + case 5: + NONMPX_BITVEC_SET(acmod, hmm, 4); + NONMPX_BITVEC_SET(acmod, hmm, 3); + case 3: + NONMPX_BITVEC_SET(acmod, hmm, 2); + NONMPX_BITVEC_SET(acmod, hmm, 1); + NONMPX_BITVEC_SET(acmod, hmm, 0); + break; + default: + for (i = 0; i < 
hmm_n_emit_state(hmm); ++i) { + NONMPX_BITVEC_SET(acmod, hmm, i); + } + } + } +} + +int32 +acmod_flags2list(acmod_t *acmod) +{ + int32 w, l, n, b, total_dists, total_words, extra_bits; + bitvec_t *flagptr; + + total_dists = bin_mdef_n_sen(acmod->mdef); + if (acmod->compallsen) { + acmod->n_senone_active = total_dists; + return total_dists; + } + total_words = total_dists / BITVEC_BITS; + extra_bits = total_dists % BITVEC_BITS; + w = n = l = 0; + for (flagptr = acmod->senone_active_vec; w < total_words; ++w, ++flagptr) { + if (*flagptr == 0) + continue; + for (b = 0; b < BITVEC_BITS; ++b) { + if (*flagptr & (1UL << b)) { + int32 sen = w * BITVEC_BITS + b; + int32 delta = sen - l; + /* Handle excessive deltas "lossily" by adding a few + extra senones to bridge the gap. */ + while (delta > 255) { + acmod->senone_active[n++] = 255; + delta -= 255; + } + acmod->senone_active[n++] = delta; + l = sen; + } + } + } + + for (b = 0; b < extra_bits; ++b) { + if (*flagptr & (1UL << b)) { + int32 sen = w * BITVEC_BITS + b; + int32 delta = sen - l; + /* Handle excessive deltas "lossily" by adding a few + extra senones to bridge the gap. 
*/ + while (delta > 255) { + acmod->senone_active[n++] = 255; + delta -= 255; + } + acmod->senone_active[n++] = delta; + l = sen; + } + } + + acmod->n_senone_active = n; + E_DEBUG(1, ("acmod_flags2list: %d active in frame %d\n", + acmod->n_senone_active, acmod->output_frame)); + return n; +} + +int32 +acmod_stream_offset(acmod_t *acmod) +{ + return acmod->utt_start_frame; +} + +void +acmod_start_stream(acmod_t *acmod) +{ + fe_start_stream(acmod->fe); + acmod->utt_start_frame = 0; +} + +void +acmod_set_rawdata_size(acmod_t *acmod, int32 size) +{ + assert(size >= 0); + acmod->rawdata_size = size; + if (acmod->rawdata_size > 0) { + ckd_free(acmod->rawdata); + acmod->rawdata = ckd_calloc(size, sizeof(int16)); + } +} + +void +acmod_get_rawdata(acmod_t *acmod, int16 **buffer, int32 *size) +{ + if (buffer) { + *buffer = acmod->rawdata; + } + if (size) { + *size = acmod->rawdata_pos; + } +} +
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/acmod.h @@ -0,0 +1,466 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/** + * @file acmod.h Acoustic model structures for PocketSphinx. 
+ * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#ifndef __ACMOD_H__ +#define __ACMOD_H__ + +/* System headers. */ +#include <stdio.h> + +/* SphinxBase headers. */ +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/logmath.h> +#include <sphinxbase/fe.h> +#include <sphinxbase/feat.h> +#include <sphinxbase/bitvec.h> +#include <sphinxbase/err.h> +#include <sphinxbase/prim_type.h> + +/* Local headers. */ +#include "ps_mllr.h" +#include "bin_mdef.h" +#include "tmat.h" +#include "hmm.h" + +/** + * States in utterance processing. + */ +typedef enum acmod_state_e { + ACMOD_IDLE, /**< Not in an utterance. */ + ACMOD_STARTED, /**< Utterance started, no data yet. */ + ACMOD_PROCESSING, /**< Utterance in progress. */ + ACMOD_ENDED /**< Utterance ended, still buffering. */ +} acmod_state_t; + +/** + * Dummy senone score value for unintentionally active states. + */ +#define SENSCR_DUMMY 0x7fff + +/** + * Feature space linear transform structure. + */ +struct ps_mllr_s { + int refcnt; /**< Reference count. */ + int n_class; /**< Number of MLLR classes. */ + int n_feat; /**< Number of feature streams. */ + int *veclen; /**< Length of input vectors for each stream. */ + float32 ****A; /**< Rotation part of mean transformations. */ + float32 ***b; /**< Bias part of mean transformations. */ + float32 ***h; /**< Diagonal transformation of variances. */ + int32 *cb2mllr; /**< Mapping from codebooks to transformations. */ +}; + +/** + * Acoustic model parameter structure. + * + * Implementations are driven through the function table in + * ps_mgaufuncs_t; the ps_mgau_frame_eval(), ps_mgau_transform() and + * ps_mgau_free() macros dispatch through that table after casting the + * object to ps_mgau_t. 
+ */ +typedef struct ps_mgau_s ps_mgau_t; + +typedef struct ps_mgaufuncs_s { + char const *name; + + int (*frame_eval)(ps_mgau_t *mgau, + int16 *senscr, + uint8 *senone_active, + int32 n_senone_active, + mfcc_t ** feat, + int32 frame, + int32 compallsen); + int (*transform)(ps_mgau_t *mgau, + ps_mllr_t *mllr); + void (*free)(ps_mgau_t *mgau); +} ps_mgaufuncs_t; + +struct ps_mgau_s { + ps_mgaufuncs_t *vt; /**< vtable of mgau functions. */ + int frame_idx; /**< frame counter. 
*/ +}; + +#define ps_mgau_base(mg) ((ps_mgau_t *)(mg)) +#define ps_mgau_frame_eval(mg,senscr,senone_active,n_senone_active,feat,frame,compallsen) \ + (*ps_mgau_base(mg)->vt->frame_eval) \ + (mg, senscr, senone_active, n_senone_active, feat, frame, compallsen) +#define ps_mgau_transform(mg, mllr) \ + (*ps_mgau_base(mg)->vt->transform)(mg, mllr) +#define ps_mgau_free(mg) \ + (*ps_mgau_base(mg)->vt->free)(mg) + +/** + * Acoustic model structure. + * + * This object encapsulates all stages of acoustic processing, from + * raw audio input to acoustic score output. The reason for grouping + * all of these modules together is that they all have to "agree" in + * their parameterizations, and the configuration of the acoustic and + * dynamic feature computation is completely dependent on the + * parameters used to build the original acoustic model (which should + * by now always be specified in a feat.params file). + * + * Because there is not a one-to-one correspondence from blocks of + * input audio or frames of input features to frames of acoustic + * scores (due to dynamic feature calculation), results may not be + * immediately available after input, and the output results will not + * correspond to the last piece of data input. + * + * TODO: In addition, this structure serves the purpose of queueing + * frames of features (and potentially also scores in the future) for + * asynchronous passes of recognition operating in parallel. + */ +struct acmod_s { + /* Global objects, not retained. */ + cmd_ln_t *config; /**< Configuration. */ + logmath_t *lmath; /**< Log-math computation. */ + glist_t strings; /**< Temporary acoustic model filenames. */ + + /* Feature computation: */ + fe_t *fe; /**< Acoustic feature computation. */ + feat_t *fcb; /**< Dynamic feature computation. */ + + /* Model parameters: */ + bin_mdef_t *mdef; /**< Model definition. */ + tmat_t *tmat; /**< Transition matrices. */ + ps_mgau_t *mgau; /**< Model parameters. 
*/ + ps_mllr_t *mllr; /**< Speaker transformation. */ + + /* Senone scoring: */ + int16 *senone_scores; /**< GMM scores for current frame. */ + bitvec_t *senone_active_vec; /**< Active GMMs in current frame. */ + uint8 *senone_active; /**< Array of deltas to active GMMs. */ + int senscr_frame; /**< Frame index for senone_scores. */ + int n_senone_active; /**< Number of active GMMs. */ + int log_zero; /**< Zero log-probability value. */ + + /* Utterance processing: */ + mfcc_t **mfc_buf; /**< Temporary buffer of acoustic features. */ + mfcc_t ***feat_buf; /**< Temporary buffer of dynamic features. */ + FILE *rawfh; /**< File for writing raw audio data. */ + FILE *mfcfh; /**< File for writing acoustic feature data. */ + FILE *senfh; /**< File for writing senone score data. */ + FILE *insenfh; /**< Input senone score file. */ + long *framepos; /**< File positions of recent frames in senone file. */ + + /* Rawdata collected during decoding */ + int16 *rawdata; /**< Raw sample buffer, (re)allocated by acmod_set_rawdata_size(). */ + int32 rawdata_size; /**< Capacity of rawdata in samples, as set by acmod_set_rawdata_size(). */ + int32 rawdata_pos; /**< Sample count reported by acmod_get_rawdata(). */ + + /* A whole bunch of flags and counters: */ + uint8 state; /**< State of utterance processing. */ + uint8 compallsen; /**< Compute all senones? */ + uint8 grow_feat; /**< Whether to grow feat_buf. */ + uint8 insen_swap; /**< Whether to swap input senone score. */ + + frame_idx_t utt_start_frame; /**< Index of the utterance start in the stream, all timings are relative to that. */ + + frame_idx_t output_frame; /**< Index of next frame of dynamic features. 
*/ + frame_idx_t n_mfc_alloc; /**< Number of frames allocated in mfc_buf */ + frame_idx_t n_mfc_frame; /**< Number of frames active in mfc_buf */ + frame_idx_t mfc_outidx; /**< Start of active frames in mfc_buf */ + frame_idx_t n_feat_alloc; /**< Number of frames allocated in feat_buf */ + frame_idx_t n_feat_frame; /**< Number of frames active in feat_buf */ + frame_idx_t feat_outidx; /**< Start of active frames in feat_buf */ +}; +typedef struct acmod_s acmod_t; + +/** + * Initialize an acoustic model. 
+ * + * @param config a command-line object containing parameters. This + * pointer is not retained by this object. + * @param lmath global log-math parameters. + * @param fe a previously-initialized acoustic feature module to use, + * or NULL to create one automatically. If this is supplied + * and its parameters do not match those in the acoustic + * model, this function will fail. This pointer is not retained. + * @param fcb a previously-initialized dynamic feature module to use, + * or NULL to create one automatically. If this is supplied + * and its parameters do not match those in the acoustic + * model, this function will fail. This pointer is not retained. + * @return a newly initialized acmod_t, or NULL on failure. + */ +acmod_t *acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb); + +/** + * Adapt acoustic model using a linear transform. + * + * @param acmod Acoustic model object. + * @param mllr The new transform to use, or NULL to update the existing + * transform. The decoder retains ownership of this pointer, + * so you should not attempt to free it manually. Use + * ps_mllr_retain() if you wish to reuse it + * elsewhere. + * @return The updated transform object for this decoder, or + * NULL on failure. + */ +ps_mllr_t *acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr); + +/** + * Start logging senone scores to a filehandle. + * + * @param acmod Acoustic model object. + * @param senfh Filehandle to log to. + * @return 0 for success, <0 on error. + */ +int acmod_set_senfh(acmod_t *acmod, FILE *senfh); + +/** + * Start logging MFCCs to a filehandle. + * + * @param acmod Acoustic model object. + * @param logfh Filehandle to log to. + * @return 0 for success, <0 on error. + */ +int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh); + +/** + * Start logging raw audio to a filehandle. + * + * @param acmod Acoustic model object. + * @param logfh Filehandle to log to. + * @return 0 for success, <0 on error. 
+ */ +int acmod_set_rawfh(acmod_t *acmod, FILE *logfh); + +/** + * Finalize an acoustic model. + */ +void acmod_free(acmod_t *acmod); + +/** + * Mark the start of an utterance. + */ +int acmod_start_utt(acmod_t *acmod); + +/** + * Mark the end of an utterance. + */ +int acmod_end_utt(acmod_t *acmod); + +/** + * Rewind the current utterance, allowing it to be rescored. + * + * After calling this function, the internal frame index is reset, and + * acmod_score() will return scores starting at the first frame of the + * current utterance. Currently, acmod_set_grow() must have been + * called to enable growing the feature buffer in order for this to + * work. In the future, senone scores may be cached instead. + * + * @param acmod Acoustic model object. + * @return 0 for success, <0 for failure (if the utterance can't be + * rewound due to no feature or score data available) + */ +int acmod_rewind(acmod_t *acmod); + +/** + * Advance the frame index. + * + * This function moves to the next frame of input data. Subsequent + * calls to acmod_score() will return scores for that frame, until the + * next call to acmod_advance(). + * + * @param acmod Acoustic model object. + * @return New frame index. + */ +int acmod_advance(acmod_t *acmod); + +/** + * Set memory allocation policy for utterance processing. + * + * @param acmod Acoustic model object. + * @param grow_feat If non-zero, the internal dynamic feature buffer + * will expand as necessary to encompass any amount of data fed to the + * model. + * @return previous allocation policy. + */ +int acmod_set_grow(acmod_t *acmod, int grow_feat); + +/** + * TODO: Set queue length for utterance processing. + * + * The planned function would allow multiple concurrent passes of + * search to operate on different parts of the utterance. It is not + * implemented yet, so no declaration follows this comment. + */ + +/** + * Feed raw audio data to the acoustic model for scoring. 
+ * + * @param acmod Acoustic model object. + * @param inout_raw In: Pointer to buffer of raw samples + * Out: Pointer to next sample to be read + * @param inout_n_samps In: Number of samples available + * Out: Number of samples remaining + * @param full_utt If non-zero, this block represents a full + * utterance and should be processed as such. + * @return Number of frames of data processed. + */ +int acmod_process_raw(acmod_t *acmod, + int16 const **inout_raw, + size_t *inout_n_samps, + int full_utt); + +/** + * Feed acoustic feature data into the acoustic model for scoring. + * + * @param acmod Acoustic model object. + * @param inout_cep In: Pointer to buffer of features + * Out: Pointer to next frame to be read + * @param inout_n_frames In: Number of frames available + * Out: Number of frames remaining + * @param full_utt If non-zero, this block represents a full + * utterance and should be processed as such. + * @return Number of frames of data processed. + */ +int acmod_process_cep(acmod_t *acmod, + mfcc_t ***inout_cep, + int *inout_n_frames, + int full_utt); + +/** + * Feed dynamic feature data into the acoustic model for scoring. + * + * Unlike acmod_process_raw() and acmod_process_cep(), this function + * accepts a single frame at a time. This is because there is no need + * to do buffering when using dynamic features as input. However, if + * the dynamic feature buffer is full, this function will fail, so you + * should either always check the return value, or always pair a call + * to it with a call to acmod_score(). + * + * @param acmod Acoustic model object. + * @param feat Pointer to one frame of dynamic features. + * @return Number of frames processed (either 0 or 1). + */ +int acmod_process_feat(acmod_t *acmod, + mfcc_t **feat); + +/** + * Set up a senone score dump file for input. + * + * @param acmod Acoustic model object. 
+ * @param insenfh File handle of dump file + * @return 0 for success, <0 for failure + */ +int acmod_set_insenfh(acmod_t *acmod, FILE *insenfh); + +/** + * Read one frame of scores from senone score dump file. + * + * @return Number of frames read or <0 on error. 
+ */ +int acmod_read_scores(acmod_t *acmod); + +/** + * Get a frame of dynamic feature data. + * + * @param inout_frame_idx Input: frame index to get, or NULL + * to obtain features for the most recent frame. + * Output: frame index corresponding to this + * set of features. + * @return Feature array, or NULL if requested frame is not available. + */ +mfcc_t **acmod_get_frame(acmod_t *acmod, int *inout_frame_idx); + +/** + * Score one frame of data. + * + * @param inout_frame_idx Input: frame index to score, or NULL + * to obtain scores for the most recent frame. + * Output: frame index corresponding to this + * set of scores. + * @return Array of senone scores for this frame, or NULL if no frame + * is available for scoring (such as if a frame index is + * requested that is not yet or no longer available). The + * data pointed to persists only until the next call to + * acmod_score() or acmod_advance(). + */ +int16 const *acmod_score(acmod_t *acmod, + int *inout_frame_idx); + +/** + * Write senone dump file header. + */ +int acmod_write_senfh_header(acmod_t *acmod, FILE *logfh); + +/** + * Write a frame of senone scores to a dump file. + */ +int acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active, + int16 const *senscr, FILE *senfh); + + +/** + * Get best score and senone index for current frame. + */ +int acmod_best_score(acmod_t *acmod, int *out_best_senid); + +/** + * Clear set of active senones. + */ +void acmod_clear_active(acmod_t *acmod); + +/** + * Activate senones associated with an HMM. + */ +void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm); + +/** + * Activate a single senone. + */ +#define acmod_activate_sen(acmod, sen) bitvec_set((acmod)->senone_active_vec, sen) + +/** + * Build the active senone list from the active-senone flags. + * + * The flags in senone_active_vec are converted into the array of + * deltas stored in senone_active. 
+ * + * @return Number of entries in the list, or the total number of + * senones when all senones are being computed. + */ +int32 acmod_flags2list(acmod_t *acmod); + +/** + * Get the offset of the utterance start of the current stream, helpful for stream-wide timing. 
+ */ +int32 acmod_stream_offset(acmod_t *acmod); + +/** + * Reset the current stream + */ +void acmod_start_stream(acmod_t *acmod); + +/** + * Sets the limit of the raw audio data to store + * + * @param size Buffer capacity in samples; must not be negative. + */ +void acmod_set_rawdata_size(acmod_t *acmod, int32 size); + +/** + * Retrieves the raw data collected during utterance decoding + * + * @param buffer Out: pointer to the internal sample buffer; may be NULL. + * @param size Out: number of samples collected; may be NULL. + */ +void acmod_get_rawdata(acmod_t *acmod, int16 **buffer, int32 *size); + +#endif /* __ACMOD_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/allphone_search.c @@ -0,0 +1,913 @@ +/* ==================================================================== + * Copyright (c) 2014 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* +* allphone_search.c -- Search for phonetic decoding. 
+*/ + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include <sphinxbase/err.h> +#include <sphinxbase/ckd_alloc.h> +#include <sphinxbase/strfuncs.h> +#include <sphinxbase/pio.h> +#include <sphinxbase/cmd_ln.h> + +#include "pocketsphinx_internal.h" +#include "allphone_search.h" + +static ps_lattice_t * +allphone_search_lattice(ps_search_t * search) +{ + //stub: allphone search builds no lattice, so this always returns NULL + return NULL; +} + +static int +allphone_search_prob(ps_search_t * search) +{ + return 0; +} + +static void +allphone_backtrace(allphone_search_t * allphs, int32 f); + +static void +allphone_search_seg_free(ps_seg_t * seg) +{ + ckd_free(seg); +} + +static void +allphone_search_fill_iter(ps_seg_t *seg, phseg_t *phseg) +{ + seg->sf = phseg->sf; + seg->ef = phseg->ef; + seg->ascr = phseg->score; + seg->lscr = phseg->tscore; + seg->word = bin_mdef_ciphone_str(ps_search_acmod(seg->search)->mdef, phseg->ci); +} + +static ps_seg_t * +allphone_search_seg_next(ps_seg_t * seg) +{ + phseg_iter_t *itor = (phseg_iter_t *) seg; + phseg_t *phseg; + + itor->seg = itor->seg->next; + + if (itor->seg == NULL) { + allphone_search_seg_free(seg); + return NULL; + } + phseg = gnode_ptr(itor->seg); + allphone_search_fill_iter(seg, phseg); + + return seg; +} + +static ps_segfuncs_t fsg_segfuncs = { + /* seg_next */ allphone_search_seg_next, + /* seg_free */ allphone_search_seg_free +}; + + +static ps_seg_t * +allphone_search_seg_iter(ps_search_t * search, int32 * out_score) +{ /* NOTE(review): out_score is never written in this function; confirm callers do not read it. 
*/ + allphone_search_t *allphs = (allphone_search_t *) search; + phseg_iter_t *iter; + + allphone_backtrace(allphs, allphs->frame - 1); + if (allphs->segments == NULL) + return NULL; + + iter = ckd_calloc(1, sizeof(phseg_iter_t)); + + iter->base.vt = &fsg_segfuncs; + iter->base.search = search; + iter->seg = allphs->segments; + allphone_search_fill_iter((ps_seg_t *)iter, gnode_ptr(iter->seg)); + + return (ps_seg_t *) iter; +} + +static ps_searchfuncs_t allphone_funcs = { + /* name: */ "allphone", + /* start: */ allphone_search_start, + /* step: 
*/ allphone_search_step, + /* finish: */ allphone_search_finish, + /* reinit: */ allphone_search_reinit, + /* free: */ allphone_search_free, + /* lattice: */ allphone_search_lattice, + /* hyp: */ allphone_search_hyp, + /* prob: */ allphone_search_prob, + /* seg_iter: */ allphone_search_seg_iter, +}; + +/** + * Find PHMM node with same senone sequence and tmat id as the given triphone. + * Only the list of PHMMs for the triphone's base (CI) phone is searched. + * Return ptr to PHMM node if found, NULL otherwise. + */ +static phmm_t * +phmm_lookup(allphone_search_t * allphs, s3pid_t pid) +{ + phmm_t *p; + bin_mdef_t *mdef; + phmm_t **ci_phmm; + + mdef = ((ps_search_t *) allphs)->acmod->mdef; + ci_phmm = allphs->ci_phmm; + + for (p = ci_phmm[bin_mdef_pid2ci(mdef, pid)]; p; p = p->next) { + if (mdef_pid2tmatid(mdef, p->pid) == mdef_pid2tmatid(mdef, pid)) + if (mdef_pid2ssid(mdef, p->pid) == mdef_pid2ssid(mdef, pid)) + return p; + } + + //not found + return NULL; +} + +static int32 +phmm_link(allphone_search_t * allphs) +{ + s3cipid_t ci, rc; + phmm_t *p, *p2; + int32 *rclist; + int32 i, n_link; + plink_t *l; + bin_mdef_t *mdef; + phmm_t **ci_phmm; + + mdef = ((ps_search_t *) allphs)->acmod->mdef; + ci_phmm = allphs->ci_phmm; + + rclist = (int32 *) ckd_calloc(mdef->n_ciphone + 1, sizeof(int32)); + + /* Create successor links between PHMM nodes; the number of links created is returned */ + n_link = 0; + for (ci = 0; ci < mdef->n_ciphone; ci++) { + for (p = ci_phmm[ci]; p; p = p->next) { + /* Build rclist for p: the CI phones set in p's right-context bitvector, terminated by BAD_S3CIPID */ + i = 0; + for (rc = 0; rc < mdef->n_ciphone; rc++) { + if (bitvec_is_set(p->rc, rc)) + rclist[i++] = rc; + } + rclist[i] = BAD_S3CIPID; + + /* For each rc in rclist, transition to PHMMs for rc if left context = ci */ + for (i = 0; IS_S3CIPID(rclist[i]); i++) { + for (p2 = ci_phmm[rclist[i]]; p2; p2 = p2->next) { + if (bitvec_is_set(p2->lc, ci)) { + /* transition from p to p2 (pushed onto the front of 
p's successor list) */ + l = (plink_t *) ckd_calloc(1, sizeof(*l)); + l->phmm = p2; + l->next = p->succlist; + p->succlist = l; + + n_link++; + } + } + } + } + } + + ckd_free(rclist); + + return n_link; +} + +/** + * Build 
net from phone HMMs + */ +static int +phmm_build(allphone_search_t * allphs) +{ + phmm_t *p, **pid2phmm; + bin_mdef_t *mdef; + int32 lrc_size; + uint32 *lc, *rc; + s3pid_t pid; + s3cipid_t ci; + s3cipid_t *filler; + int n_phmm, n_link; + int i, nphone; + + mdef = ((ps_search_t *) allphs)->acmod->mdef; + allphs->ci_phmm = + (phmm_t **) ckd_calloc(bin_mdef_n_ciphone(mdef), sizeof(phmm_t *)); + pid2phmm = + (phmm_t **) ckd_calloc(bin_mdef_n_phone(mdef), sizeof(phmm_t *)); + + /* For each unique ciphone/triphone entry in mdef, create a PHMM node; phones that share senone sequence and tmat reuse the node found by phmm_lookup() */ + n_phmm = 0; + nphone = allphs->ci_only ? bin_mdef_n_ciphone(mdef) : bin_mdef_n_phone(mdef); + E_INFO("Building PHMM net of %d phones\n", nphone); + for (pid = 0; pid < nphone; pid++) { + if ((p = phmm_lookup(allphs, pid)) == NULL) { + //not found, should be created + p = (phmm_t *) ckd_calloc(1, sizeof(*p)); + hmm_init(allphs->hmmctx, &(p->hmm), FALSE, + mdef_pid2ssid(mdef, pid), mdef->phone[pid].tmat); + p->pid = pid; + p->ci = bin_mdef_pid2ci(mdef, pid); + p->succlist = NULL; + p->next = allphs->ci_phmm[p->ci]; + allphs->ci_phmm[p->ci] = p; + n_phmm++; + } + pid2phmm[pid] = p; + } + + /* Fill out bitvecs of each PHMM node, alloc continuous memory chunk for context bitvectors. The first half of the chunk holds the lc vectors and the second half the rc vectors; the head node of ci_phmm[0] receives the base pointer, which is what phmm_free() releases. */ + lrc_size = bitvec_size(bin_mdef_n_ciphone(mdef)); + lc = ckd_calloc(n_phmm * 2 * lrc_size, sizeof(bitvec_t)); + rc = lc + (n_phmm * lrc_size); + for (ci = 0; ci < mdef->n_ciphone; ci++) { + for (p = allphs->ci_phmm[ci]; p; p = p->next) { + p->lc = lc; + lc += lrc_size; + p->rc = rc; + rc += lrc_size; + } + } + + /* Fill out lc and rc bitmaps (remember to map all fillers to each other!!) 
*/ + filler = + (s3cipid_t *) ckd_calloc(bin_mdef_n_ciphone(mdef) + 1, + sizeof(s3cipid_t)); + + /* Connect fillers: every CI phone's PHMM accepts all left and right contexts; the filler phones are collected in a BAD_S3CIPID-terminated list */ + i = 0; + for (ci = 0; ci < bin_mdef_n_ciphone(mdef); ci++) { + p = pid2phmm[ci]; + bitvec_set_all(p->lc, bin_mdef_n_ciphone(mdef)); + bitvec_set_all(p->rc, bin_mdef_n_ciphone(mdef)); + if (mdef->phone[ci].info.ci.filler) { + filler[i++] = ci; + } + } + filler[i] = BAD_S3CIPID; + + + /* Loop over cdphones only if ci_only is not set (with ci_only, nphone equals the CI phone count and the loop body never runs) */ + for (pid = bin_mdef_n_ciphone(mdef); pid < nphone; + pid++) { + p = pid2phmm[pid]; + + if (mdef->phone[mdef->phone[pid].info.cd.ctx[1]].info.ci.filler) { + for (i = 0; IS_S3CIPID(filler[i]); i++) + bitvec_set(p->lc, filler[i]); + } + else + bitvec_set(p->lc, mdef->phone[pid].info.cd.ctx[1]); + + if (mdef->phone[mdef->phone[pid].info.cd.ctx[2]].info.ci.filler) { + for (i = 0; IS_S3CIPID(filler[i]); i++) + bitvec_set(p->rc, filler[i]); + } + else + bitvec_set(p->rc, mdef->phone[pid].info.cd.ctx[2]); + } + ckd_free(pid2phmm); + ckd_free(filler); + + /* Create links between PHMM nodes */ + n_link = phmm_link(allphs); + + E_INFO("%d nodes, %d links\n", n_phmm, n_link); + return 0; +} + +static void +phmm_free(allphone_search_t * allphs) +{ + s3cipid_t ci; + bin_mdef_t *mdef; + + if (!allphs->ci_phmm) + //nothing to free + return; + ckd_free(allphs->ci_phmm[0]->lc); /* base of the single lc/rc chunk allocated in phmm_build() */ + mdef = ((ps_search_t *) allphs)->acmod->mdef; + for (ci = 0; ci < mdef_n_ciphone(mdef); ++ci) { + phmm_t *p, *next; + + for (p = allphs->ci_phmm[ci]; p; p = next) { + plink_t *l, *lnext; + + next = p->next; + for (l = p->succlist; l; l = lnext) { + lnext = l->next; + ckd_free(l); + } + hmm_deinit(&(p->hmm)); + ckd_free(p); + } + } + ckd_free(allphs->ci_phmm); +} + +/** Evaluate active PHMMs (those whose frame 
stamp equals the current frame) and return the best score found */ +static int32 +phmm_eval_all(allphone_search_t * allphs, const int16 * senscr) +{ + s3cipid_t ci; + phmm_t *p; + int32 best; + bin_mdef_t *mdef; + phmm_t **ci_phmm; + + mdef = ((ps_search_t *) allphs)->acmod->mdef; + ci_phmm = allphs->ci_phmm; + + best = WORST_SCORE; 
+ + hmm_context_set_senscore(allphs->hmmctx, senscr); + for (ci = 0; ci < mdef->n_ciphone; ci++) { + for (p = ci_phmm[(unsigned) ci]; p; p = p->next) { + if (hmm_frame(&(p->hmm)) == allphs->frame) { + int32 score; + allphs->n_hmm_eval++; + score = hmm_vit_eval((hmm_t *) p); + if (score > best) + best = score; + } + } + } + + return best; +} + +/** + * Handle PHMM exits for the current frame. + * + * Every PHMM active in this frame whose best score is within pbeam of + * the frame's best gets a history entry recording its exit and stays + * active for the next frame; all other active PHMMs are cleared. + * + * @param best Best HMM score of the current frame. + */ +static void +phmm_exit(allphone_search_t * allphs, int32 best) +{ + s3cipid_t ci; + phmm_t *p; + int32 th, nf; + history_t *h; + blkarray_list_t *history; + bin_mdef_t *mdef; + int32 curfrm; + phmm_t **ci_phmm; + int32 *ci2lmwid; + + th = best + allphs->pbeam; + + history = allphs->history; + mdef = ps_search_acmod(allphs)->mdef; + curfrm = allphs->frame; + ci_phmm = allphs->ci_phmm; + ci2lmwid = allphs->ci2lmwid; + + nf = curfrm + 1; + + for (ci = 0; ci < mdef->n_ciphone; ci++) { + for (p = ci_phmm[(unsigned) ci]; p; p = p->next) { + if (hmm_frame(&(p->hmm)) == curfrm) { + + if (hmm_bestscore(&(p->hmm)) >= th) { + + h = (history_t *) ckd_calloc(1, sizeof(*h)); + h->ef = curfrm; + h->phmm = p; + h->hist = hmm_out_history(&(p->hmm)); + h->score = hmm_out_score(&(p->hmm)); + + if (!allphs->lm) { + h->tscore = allphs->inspen; + } + else { + if (h->hist > 0) { + int32 n_used; + history_t *pred = + blkarray_list_get(history, h->hist); + + if (pred->hist > 0) { + /* The trigram context is the phone before + * the predecessor, so follow pred's own + * back-pointer; looking up h->hist again + * would just return pred itself. 
*/ + history_t *pred_pred = + blkarray_list_get(history, + pred->hist); + h->tscore = + ngram_tg_score(allphs->lm, + ci2lmwid + [pred_pred->phmm->ci], + ci2lmwid[pred-> + phmm->ci], + ci2lmwid[p->ci], + &n_used) >> + SENSCR_SHIFT; + } + else { + h->tscore = + ngram_bg_score(allphs->lm, + ci2lmwid + [pred->phmm->ci], + ci2lmwid[p->ci], + &n_used) >> + SENSCR_SHIFT; + } + } + else { + /* + * This is the beginning SIL and in srch_allphone_begin() + * its inscore is set to 0. 
+ */ + h->tscore = 0; + } + } + + blkarray_list_append(history, h); + + /* Mark PHMM active in next frame */ + hmm_frame(&(p->hmm)) = nf; + } + else { + /* Reset state scores */ + hmm_clear(&(p->hmm)); + } + } + } + } +} + +/** + * Propagate exits recorded in this frame into successor PHMMs. + * + * @param best Best HMM score of the current frame. + * @param frame_history_start Index of the first history entry that + * was appended for the current frame. + */ +static void +phmm_trans(allphone_search_t * allphs, int32 best, + int32 frame_history_start) +{ + history_t *h; + phmm_t *from, *to; + plink_t *l; + int32 newscore, nf, curfrm; + int32 *ci2lmwid; + int32 hist_idx; + + curfrm = allphs->frame; + nf = curfrm + 1; + ci2lmwid = allphs->ci2lmwid; + + /* Transition from exited nodes to initial states of HMMs */ + for (hist_idx = frame_history_start; + hist_idx < blkarray_list_n_valid(allphs->history); hist_idx++) { + h = blkarray_list_get(allphs->history, hist_idx); + from = h->phmm; + for (l = from->succlist; l; l = l->next) { + int32 tscore; + to = l->phmm; + + /* No LM, just use uniform (insertion penalty). */ + if (!allphs->lm) + tscore = allphs->inspen; + /* If they are not in the LM, kill this + * transition. */ + else if (ci2lmwid[to->ci] == NGRAM_INVALID_WID) + continue; + else { + int32 n_used; + if (h->hist > 0) { + history_t *pred = + blkarray_list_get(allphs->history, h->hist); + tscore = + ngram_tg_score(allphs->lm, + ci2lmwid[pred->phmm->ci], + ci2lmwid[from->ci], + ci2lmwid[to->ci], + &n_used) >> SENSCR_SHIFT; + } + else { + tscore = ngram_bg_score(allphs->lm, + ci2lmwid[from->ci], + ci2lmwid[to->ci], + &n_used) >> SENSCR_SHIFT; + } + } + + /* Enter the successor only if the path survives the beam and + * improves on the successor's current entry score. 
*/ + newscore = h->score + tscore; + if ((newscore > best + allphs->beam) + && (newscore > hmm_in_score(&(to->hmm)))) { + hmm_enter(&(to->hmm), newscore, hist_idx, nf); + } + } + } +} + +ps_search_t * +allphone_search_init(ngram_model_t * lm, + cmd_ln_t * config, + acmod_t * acmod, dict_t * dict, dict2pid_t * d2p) +{ + int i; + bin_mdef_t *mdef; + allphone_search_t *allphs; + static char *lmname = "default"; + + allphs = (allphone_search_t *) ckd_calloc(1, sizeof(*allphs)); + ps_search_init(ps_search_base(allphs), &allphone_funcs, config, acmod, + dict, 
d2p); + mdef = acmod->mdef; + + allphs->hmmctx = hmm_context_init(bin_mdef_n_emit_state(mdef), + acmod->tmat->tp, NULL, mdef->sseq); + if (allphs->hmmctx == NULL) { + ps_search_free(ps_search_base(allphs)); + return NULL; + } + + allphs->ci_only = cmd_ln_boolean_r(config, "-allphone_ci"); + allphs->lw = cmd_ln_float32_r(config, "-lw"); + + phmm_build(allphs); + + if (lm) { + //language model is defined + allphs->lm = ngram_model_set_init(config, &lm, &lmname, NULL, 1); + if (!allphs->lm) { + E_ERROR + ("Failed to initialize ngram model set for phoneme decoding\n"); + allphone_search_free((ps_search_t *) allphs); + return NULL; + } + allphs->ci2lmwid = + (int32 *) ckd_calloc(mdef->n_ciphone, + sizeof(*allphs->ci2lmwid)); + for (i = 0; i < mdef->n_ciphone; i++) { + allphs->ci2lmwid[i] = + ngram_wid(allphs->lm, + (char *) bin_mdef_ciphone_str(mdef, i)); + /* Map filler phones to silence if not found. + * NOTE(review): the test below only checks that the phone + * has a name, not that it is a filler -- confirm intent. */ + if (allphs->ci2lmwid[i] == NGRAM_INVALID_WID + && bin_mdef_ciphone_str(mdef, i)) + allphs->ci2lmwid[i] = + ngram_wid(allphs->lm, + (char *) bin_mdef_ciphone_str(mdef, + mdef_silphone + (mdef))); + } + } + else { + E_WARN + ("Failed to load language model specified in -allphone, doing unconstrained phone-loop decoding\n"); + allphs->inspen = + (int32) (logmath_log + (acmod->lmath, cmd_ln_float32_r(config, "-pip")) + * allphs->lw) >> SENSCR_SHIFT; + } + + allphs->n_tot_frame = 0; + allphs->frame = -1; + allphs->segments = NULL; + + /* Get search pruning parameters */ + allphs->beam + = + (int32) logmath_log(acmod->lmath, + cmd_ln_float64_r(config, "-beam")) + >> SENSCR_SHIFT; + allphs->pbeam + = + (int32) logmath_log(acmod->lmath, + cmd_ln_float64_r(config, "-pbeam")) + >> SENSCR_SHIFT; + + /* LM related weights/penalties */ + allphs->history = blkarray_list_init(); + + /* Acoustic score scale for posterior probabilities. 
*/ + allphs->ascale = 1.0 / cmd_ln_float32_r(config, "-ascale"); + + E_INFO("Allphone(beam: %d, pbeam: %d)\n", allphs->beam, allphs->pbeam); + + ptmr_init(&allphs->perf); + + return (ps_search_t *) allphs; +} + +/** + * Re-initialize the search with a new dictionary and dict2pid. + * + * Without a language model the phone insertion penalty is recomputed + * from -pip and the language weight. Always returns 0. + */ +int +allphone_search_reinit(ps_search_t * search, dict_t * dict, + dict2pid_t * d2p) +{ + allphone_search_t *allphs = (allphone_search_t *) search; + + /* Free old dict2pid, dict */ + ps_search_base_reinit(search, dict, d2p); + + if (!allphs->lm) { + E_WARN + ("-lm argument missing; doing unconstrained phone-loop decoding\n"); + allphs->inspen = + (int32) (logmath_log + (search->acmod->lmath, + cmd_ln_float32_r(search->config, + "-pip")) * + allphs->lw) >> SENSCR_SHIFT; + } + + return 0; +} + +/** + * Log total timing statistics and release everything owned by the + * allphone search, including the search object itself. + */ +void +allphone_search_free(ps_search_t * search) +{ + allphone_search_t *allphs = (allphone_search_t *) search; + + /* Seconds of audio processed: total frames divided by the frame rate. */ + double n_speech = (double)allphs->n_tot_frame + / cmd_ln_int32_r(ps_search_config(allphs), "-frate"); + + /* These totals belong to the allphone search, so label them as such + * rather than with the "fwdflat" tag they previously carried. 
*/ + E_INFO("TOTAL allphone %.2f CPU %.3f xRT\n", + allphs->perf.t_tot_cpu, + allphs->perf.t_tot_cpu / n_speech); + E_INFO("TOTAL allphone %.2f wall %.3f xRT\n", + allphs->perf.t_tot_elapsed, + allphs->perf.t_tot_elapsed / n_speech); + + ps_search_deinit(search); + hmm_context_free(allphs->hmmctx); + phmm_free(allphs); + if (allphs->lm) + ngram_model_free(allphs->lm); + if (allphs->ci2lmwid) + ckd_free(allphs->ci2lmwid); + + blkarray_list_free(allphs->history); + + ckd_free(allphs); +} + +int +allphone_search_start(ps_search_t * search) +{ + allphone_search_t *allphs; + bin_mdef_t *mdef; + s3cipid_t ci; + phmm_t *p; + + allphs = (allphone_search_t *) search; + mdef = search->acmod->mdef; + + /* Reset all HMMs. 
*/ + for (ci = 0; ci < bin_mdef_n_ciphone(mdef); ci++) { + for (p = allphs->ci_phmm[(unsigned) ci]; p; p = p->next) { + hmm_clear(&(p->hmm)); + } + } + + allphs->n_hmm_eval = 0; + allphs->n_sen_eval = 0; + + /* Free history nodes, if any */ + blkarray_list_reset(allphs->history); + + /* Initialize start state of the SILENCE PHMM */ + allphs->frame = 0; + ci = bin_mdef_silphone(mdef); + if (NOT_S3CIPID(ci)) + E_FATAL("Cannot find CI-phone %s\n", S3_SILENCE_CIPHONE); + for (p = allphs->ci_phmm[ci]; p && (p->pid != ci); p = p->next); + if (!p) + E_FATAL("Cannot find HMM for %s\n", S3_SILENCE_CIPHONE); + hmm_enter(&(p->hmm), 0, 0, allphs->frame); + + ptmr_reset(&allphs->perf); + ptmr_start(&allphs->perf); + + return 0; +} + +static void +allphone_search_sen_active(allphone_search_t * allphs) +{ + acmod_t *acmod; + bin_mdef_t *mdef; + phmm_t *p; + int32 ci; + + acmod = ps_search_acmod(allphs); + mdef = acmod->mdef; + + acmod_clear_active(acmod); + for (ci = 0; ci < bin_mdef_n_ciphone(mdef); ci++) + for (p = allphs->ci_phmm[ci]; p; p = p->next) + if (hmm_frame(&(p->hmm)) == allphs->frame) + acmod_activate_hmm(acmod, &(p->hmm)); +} + +int +allphone_search_step(ps_search_t * search, int frame_idx) +{ + int32 bestscr, frame_history_start; + const int16 *senscr; + allphone_search_t *allphs = (allphone_search_t *) search; + acmod_t *acmod = search->acmod; + + if (!acmod->compallsen) + allphone_search_sen_active(allphs); + senscr = acmod_score(acmod, &frame_idx); + allphs->n_sen_eval += acmod->n_senone_active; + bestscr = phmm_eval_all(allphs, senscr); + + frame_history_start = blkarray_list_n_valid(allphs->history); + phmm_exit(allphs, bestscr); + phmm_trans(allphs, bestscr, frame_history_start); + + allphs->frame++; + + return 0; +} + +static int32 +ascore(allphone_search_t * allphs, history_t * h) +{ + int32 score = h->score; + + if (h->hist > 0) { + history_t *pred = blkarray_list_get(allphs->history, h->hist); + score -= pred->score; + } + + return score - h->tscore; +} + 
+static void +allphone_clear_segments(allphone_search_t * allphs) +{ + gnode_t *gn; + for (gn = allphs->segments; gn; gn = gn->next) { + ckd_free(gnode_ptr(gn)); + } + glist_free(allphs->segments); + allphs->segments = NULL; +} + +static void +allphone_backtrace(allphone_search_t * allphs, int32 f) +{ + int32 best, hist_idx, best_idx; + int32 frm, last_frm; + history_t *h; + phseg_t *s; + + /* Clear old list */ + allphone_clear_segments(allphs); + + frm = last_frm = f; + /* Find the first history entry for the requested frame */ + hist_idx = blkarray_list_n_valid(allphs->history) - 1; + while (hist_idx > 0) { + h = blkarray_list_get(allphs->history, hist_idx); + if (h->ef <= f) { + frm = last_frm = h->ef; + break; + } + hist_idx--; + } + + if (hist_idx < 0) + return; + + /* Find bestscore */ + best = (int32) 0x80000000; + best_idx = -1; + while (frm == last_frm && hist_idx > 0) { + h = blkarray_list_get(allphs->history, hist_idx); + frm = h->ef; + if (h->score > best && frm == last_frm) { + best = h->score; + best_idx = hist_idx; + } + hist_idx--; + } + + if (best_idx < 0) + return; + + /* Backtrace */ + while (best_idx > 0) { + h = blkarray_list_get(allphs->history, best_idx); + s = (phseg_t *) ckd_calloc(1, sizeof(phseg_t)); + s->ci = h->phmm->ci; + s->sf = + (h->hist > + 0) ? ((history_t *) blkarray_list_get(allphs->history, + h->hist))->ef + 1 : 0; + s->ef = h->ef; + s->score = ascore(allphs, h); + s->tscore = h->tscore; + allphs->segments = glist_add_ptr(allphs->segments, s); + + best_idx = h->hist; + } + + return; +} + +int +allphone_search_finish(ps_search_t * search) +{ + allphone_search_t *allphs; + int32 cf, n_hist; + + allphs = (allphone_search_t *) search; + + allphs->n_tot_frame += allphs->frame; + n_hist = blkarray_list_n_valid(allphs->history); + E_INFO + ("%d frames, %d HMMs (%d/fr), %d senones (%d/fr), %d history entries (%d/fr)\n", + allphs->frame, allphs->n_hmm_eval, + (allphs->frame > 0) ? 
allphs->n_hmm_eval / allphs->frame : 0, + allphs->n_sen_eval, + (allphs->frame > 0) ? allphs->n_sen_eval / allphs->frame : 0, + n_hist, (allphs->frame > 0) ? n_hist / allphs->frame : 0); + + /* Now backtrace. */ + allphone_backtrace(allphs, allphs->frame - 1); + + /* Print out some statistics. */ + ptmr_stop(&allphs->perf); + /* This is the number of frames processed. */ + cf = ps_search_acmod(allphs)->output_frame; + if (cf > 0) { + double n_speech = (double) (cf + 1) + / cmd_ln_int32_r(ps_search_config(allphs), "-frate"); + E_INFO("allphone %.2f CPU %.3f xRT\n", + allphs->perf.t_cpu, allphs->perf.t_cpu / n_speech); + E_INFO("allphone %.2f wall %.3f xRT\n", + allphs->perf.t_elapsed, allphs->perf.t_elapsed / n_speech); + } + + + return 0; +} + +char const * +allphone_search_hyp(ps_search_t * search, int32 * out_score, + int32 * out_is_final) +{ + allphone_search_t *allphs; + phseg_t *p; + gnode_t *gn; + const char *phone_str; + bin_mdef_t *mdef; + int len, hyp_idx, phone_idx; + + allphs = (allphone_search_t *) search; + mdef = search->acmod->mdef; + + /* Create hypothesis */ + if (search->hyp_str) + ckd_free(search->hyp_str); + search->hyp_str = NULL; + + allphone_backtrace(allphs, allphs->frame - 1); + if (allphs->segments == NULL) { + return NULL; + } + + len = glist_count(allphs->segments) * 10; // maximum length of one phone with spacebar + + search->hyp_str = (char *) ckd_calloc(len, sizeof(*search->hyp_str)); + hyp_idx = 0; + for (gn = allphs->segments; gn; gn = gn->next) { + p = gnode_ptr(gn); + phone_str = bin_mdef_ciphone_str(mdef, p->ci); + phone_idx = 0; + while (phone_str[phone_idx] != '\0') + search->hyp_str[hyp_idx++] = phone_str[phone_idx++]; + search->hyp_str[hyp_idx++] = ' '; + } + search->hyp_str[--hyp_idx] = '\0'; + E_INFO("Hyp: %s\n", search->hyp_str); + return search->hyp_str; +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/allphone_search.h @@ -0,0 +1,179 @@ +/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2014 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* + * allphone_search.h -- Search structures for phoneme decoding. + */ + + +#ifndef __ALLPHONE_SEARCH_H__ +#define __ALLPHONE_SEARCH_H__ + + +/* SphinxBase headers. 
*/ +#include <sphinxbase/glist.h> +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/ngram_model.h> +#include <sphinxbase/bitvec.h> + +/* Local headers. */ +#include "pocketsphinx_internal.h" +#include "blkarray_list.h" +#include "hmm.h" + +/** + * Models a single unique <senone-sequence, tmat> pair. + * Can represent several different triphones, but all with the same parent basephone. + * (NOTE: Word-position attribute of triphone is ignored.) + */ +typedef struct phmm_s { + hmm_t hmm; /**< Base HMM structure */ + s3pid_t pid; /**< Phone id (temp. during init.) */ + s3cipid_t ci; /**< Parent basephone for this PHMM */ + bitvec_t *lc; /**< Set (bit-vector) of left context phones seen for this PHMM */ + bitvec_t *rc; /**< Set (bit-vector) of right context phones seen for this PHMM */ + struct phmm_s *next; /**< Next unique PHMM for same parent basephone */ + struct plink_s *succlist; /**< List of predecessor PHMM nodes */ +} phmm_t; + +/** + * List of links from a PHMM node to its successors; one link per successor. + */ +typedef struct plink_s { + phmm_t *phmm; /**< Successor PHMM node */ + struct plink_s *next; /**< Next link for parent PHMM node */ +} plink_t; + +/** + * History (paths) information at any point in allphone Viterbi search. 
+ */ +typedef struct history_s { + phmm_t *phmm; /**< PHMM ending this path */ + int32 score; /**< Path score for this path */ + int32 tscore; /**< Transition score for this path */ + frame_idx_t ef; /**< End frame */ + int32 hist; /**< Previous history entry */ +} history_t; + +/** + * Phone level segmentation information + */ +typedef struct phseg_s { + s3cipid_t ci; /* CI-phone id */ + frame_idx_t sf, ef; /* Start and end frame for this phone occurrence */ + int32 score; /* Acoustic score for this segment of alignment */ + int32 tscore; /* Transition ("LM") score for this segment */ +} phseg_t; + +/** + * Segment iterator over list of phseg + */ +typedef struct phseg_iter_s { + ps_seg_t base; + glist_t seg; +} phseg_iter_t; + +/** + * Implementation of allphone search structure. + */ +typedef struct allphone_search_s { + ps_search_t base; + + hmm_context_t *hmmctx; /**< HMM context. */ + ngram_model_t *lm; /**< Ngram model set */ + int32 ci_only; /**< Use context-independent phones for decoding */ + phmm_t **ci_phmm; /**< PHMM lists (for each CI phone) */ + int32 *ci2lmwid; /**< Mapping of CI phones to LM word IDs */ + + int32 beam, pbeam; /**< Effective beams after applying beam_factor */ + int32 lw, inspen; /**< Language weights */ + + frame_idx_t frame; /**< Current frame. */ + float32 ascale; /**< Acoustic score scale for posterior probabilities. */ + + int32 n_tot_frame; /**< Total number of frames processed */ + int32 n_hmm_eval; /**< Total HMMs evaluated this utt */ + int32 n_sen_eval; /**< Total senones evaluated this utt */ + + /* Backtrace information */ + blkarray_list_t *history; /**< List of history nodes allocated in each frame */ + /* Hypothesis DAG */ + glist_t segments; + + ptmr_t perf; /**< Performance counter */ + +} allphone_search_t; + +/** + * Create, initialize and return a search module. 
+ */ +ps_search_t *allphone_search_init(ngram_model_t * lm, + cmd_ln_t * config, + acmod_t * acmod, + dict_t * dict, dict2pid_t * d2p); + +/** + * Deallocate search structure. + */ +void allphone_search_free(ps_search_t * search); + +/** + * Update allphone search module. + */ +int allphone_search_reinit(ps_search_t * search, dict_t * dict, + dict2pid_t * d2p); + +/** + * Prepare the allphone search structure for beginning decoding of the next + * utterance. + */ +int allphone_search_start(ps_search_t * search); + +/** + * Step one frame forward through the Viterbi search. + */ +int allphone_search_step(ps_search_t * search, int frame_idx); + +/** + * Windup and clean the allphone search structure after utterance. + */ +int allphone_search_finish(ps_search_t * search); + +/** + * Get hypothesis string from the allphone search. + */ +char const *allphone_search_hyp(ps_search_t * search, int32 * out_score, + int32 * out_is_final); + +#endif /* __ALLPHONE_SEARCH_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/bin_mdef.c @@ -0,0 +1,887 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2005 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ +/********************************************************************* + * + * File: bin_mdef.c + * + * Description: + * Binary format model definition files, with support for + * heterogeneous topologies and variable-size N-phones + * + * Author: + * David Huggins-Daines <dhuggins@cs.cmu.edu> + *********************************************************************/ + +/* System headers. */ +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/ckd_alloc.h> +#include <sphinxbase/byteorder.h> +#include <sphinxbase/case.h> +#include <sphinxbase/err.h> + +/* Local headers. */ +#include "mdef.h" +#include "bin_mdef.h" + +bin_mdef_t * +bin_mdef_read_text(cmd_ln_t *config, const char *filename) +{ + bin_mdef_t *bmdef; + mdef_t *mdef; + int i, nodes, ci_idx, lc_idx, rc_idx; + int nchars; + + if ((mdef = mdef_init((char *) filename, TRUE)) == NULL) + return NULL; + + /* Enforce some limits. */ + if (mdef->n_sen > BAD_SENID) { + E_ERROR("Number of senones exceeds limit: %d > %d\n", + mdef->n_sen, BAD_SENID); + mdef_free(mdef); + return NULL; + } + if (mdef->n_sseq > BAD_SSID) { + E_ERROR("Number of senone sequences exceeds limit: %d > %d\n", + mdef->n_sseq, BAD_SSID); + mdef_free(mdef); + return NULL; + } + /* We use uint8 for ciphones */ + if (mdef->n_ciphone > 255) { + E_ERROR("Number of phones exceeds limit: %d > %d\n", + mdef->n_ciphone, 255); + mdef_free(mdef); + return NULL; + } + + bmdef = ckd_calloc(1, sizeof(*bmdef)); + bmdef->refcnt = 1; + + /* Easy stuff. The mdef.c code has done the heavy lifting for us. 
*/ + bmdef->n_ciphone = mdef->n_ciphone; + bmdef->n_phone = mdef->n_phone; + bmdef->n_emit_state = mdef->n_emit_state; + bmdef->n_ci_sen = mdef->n_ci_sen; + bmdef->n_sen = mdef->n_sen; + bmdef->n_tmat = mdef->n_tmat; + bmdef->n_sseq = mdef->n_sseq; + bmdef->sseq = mdef->sseq; + bmdef->cd2cisen = mdef->cd2cisen; + bmdef->sen2cimap = mdef->sen2cimap; + bmdef->n_ctx = 3; /* Triphones only. */ + bmdef->sil = mdef->sil; + mdef->sseq = NULL; /* We are taking over this one. */ + mdef->cd2cisen = NULL; /* And this one. */ + mdef->sen2cimap = NULL; /* And this one. */ + + /* Get the phone names. If they are not sorted + * ASCII-betically then we are in a world of hurt and + * therefore will simply refuse to continue. */ + bmdef->ciname = ckd_calloc(bmdef->n_ciphone, sizeof(*bmdef->ciname)); + nchars = 0; + for (i = 0; i < bmdef->n_ciphone; ++i) + nchars += strlen(mdef->ciphone[i].name) + 1; + bmdef->ciname[0] = ckd_calloc(nchars, 1); + strcpy(bmdef->ciname[0], mdef->ciphone[0].name); + for (i = 1; i < bmdef->n_ciphone; ++i) { + bmdef->ciname[i] = + bmdef->ciname[i - 1] + strlen(bmdef->ciname[i - 1]) + 1; + strcpy(bmdef->ciname[i], mdef->ciphone[i].name); + if (i > 0 && strcmp(bmdef->ciname[i - 1], bmdef->ciname[i]) > 0) { + /* FIXME: there should be a solution to this, actually. */ + E_ERROR("Phone names are not in sorted order, sorry."); + bin_mdef_free(bmdef); + return NULL; + } + } + + /* Copy over phone information. 
*/ + bmdef->phone = ckd_calloc(bmdef->n_phone, sizeof(*bmdef->phone)); + for (i = 0; i < mdef->n_phone; ++i) { + bmdef->phone[i].ssid = mdef->phone[i].ssid; + bmdef->phone[i].tmat = mdef->phone[i].tmat; + if (i < bmdef->n_ciphone) { + bmdef->phone[i].info.ci.filler = mdef->ciphone[i].filler; + } + else { + bmdef->phone[i].info.cd.wpos = mdef->phone[i].wpos; + bmdef->phone[i].info.cd.ctx[0] = mdef->phone[i].ci; + bmdef->phone[i].info.cd.ctx[1] = mdef->phone[i].lc; + bmdef->phone[i].info.cd.ctx[2] = mdef->phone[i].rc; + } + } + + /* Walk the wpos_ci_lclist once to find the total number of + * nodes and the starting locations for each level. */ + nodes = lc_idx = ci_idx = rc_idx = 0; + for (i = 0; i < N_WORD_POSN; ++i) { + int j; + for (j = 0; j < mdef->n_ciphone; ++j) { + ph_lc_t *lc; + + for (lc = mdef->wpos_ci_lclist[i][j]; lc; lc = lc->next) { + ph_rc_t *rc; + for (rc = lc->rclist; rc; rc = rc->next) { + ++nodes; /* RC node */ + } + ++nodes; /* LC node */ + ++rc_idx; /* Start of RC nodes (after LC nodes) */ + } + ++nodes; /* CI node */ + ++lc_idx; /* Start of LC nodes (after CI nodes) */ + ++rc_idx; /* Start of RC nodes (after CI and LC nodes) */ + } + ++nodes; /* wpos node */ + ++ci_idx; /* Start of CI nodes (after wpos nodes) */ + ++lc_idx; /* Start of LC nodes (after CI nodes) */ + ++rc_idx; /* STart of RC nodes (after wpos, CI, and LC nodes) */ + } + E_INFO("Allocating %d * %d bytes (%d KiB) for CD tree\n", + nodes, sizeof(*bmdef->cd_tree), + nodes * sizeof(*bmdef->cd_tree) / 1024); + bmdef->n_cd_tree = nodes; + bmdef->cd_tree = ckd_calloc(nodes, sizeof(*bmdef->cd_tree)); + for (i = 0; i < N_WORD_POSN; ++i) { + int j; + + bmdef->cd_tree[i].ctx = i; + bmdef->cd_tree[i].n_down = mdef->n_ciphone; + bmdef->cd_tree[i].c.down = ci_idx; +#if 0 + E_INFO("%d => %c (%d@%d)\n", + i, (WPOS_NAME)[i], + bmdef->cd_tree[i].n_down, bmdef->cd_tree[i].c.down); +#endif + + /* Now we can build the rest of the tree. 
*/ + for (j = 0; j < mdef->n_ciphone; ++j) { + ph_lc_t *lc; + + bmdef->cd_tree[ci_idx].ctx = j; + bmdef->cd_tree[ci_idx].c.down = lc_idx; + for (lc = mdef->wpos_ci_lclist[i][j]; lc; lc = lc->next) { + ph_rc_t *rc; + + bmdef->cd_tree[lc_idx].ctx = lc->lc; + bmdef->cd_tree[lc_idx].c.down = rc_idx; + for (rc = lc->rclist; rc; rc = rc->next) { + bmdef->cd_tree[rc_idx].ctx = rc->rc; + bmdef->cd_tree[rc_idx].n_down = 0; + bmdef->cd_tree[rc_idx].c.pid = rc->pid; +#if 0 + E_INFO("%d => %s %s %s %c (%d@%d)\n", + rc_idx, + bmdef->ciname[j], + bmdef->ciname[lc->lc], + bmdef->ciname[rc->rc], + (WPOS_NAME)[i], + bmdef->cd_tree[rc_idx].n_down, + bmdef->cd_tree[rc_idx].c.down); +#endif + + ++bmdef->cd_tree[lc_idx].n_down; + ++rc_idx; + } + /* If there are no triphones here, + * this is considered a leafnode, so + * set the pid to -1. */ + if (bmdef->cd_tree[lc_idx].n_down == 0) + bmdef->cd_tree[lc_idx].c.pid = -1; +#if 0 + E_INFO("%d => %s %s %c (%d@%d)\n", + lc_idx, + bmdef->ciname[j], + bmdef->ciname[lc->lc], + (WPOS_NAME)[i], + bmdef->cd_tree[lc_idx].n_down, + bmdef->cd_tree[lc_idx].c.down); +#endif + + ++bmdef->cd_tree[ci_idx].n_down; + ++lc_idx; + } + + /* As above, so below. 
*/ + if (bmdef->cd_tree[ci_idx].n_down == 0) + bmdef->cd_tree[ci_idx].c.pid = -1; +#if 0 + E_INFO("%d => %d=%s (%d@%d)\n", + ci_idx, j, bmdef->ciname[j], + bmdef->cd_tree[ci_idx].n_down, + bmdef->cd_tree[ci_idx].c.down); +#endif + + ++ci_idx; + } + } + + mdef_free(mdef); + + bmdef->alloc_mode = BIN_MDEF_FROM_TEXT; + return bmdef; +} + +bin_mdef_t * +bin_mdef_retain(bin_mdef_t *m) +{ + ++m->refcnt; + return m; +} + +int +bin_mdef_free(bin_mdef_t * m) +{ + if (m == NULL) + return 0; + if (--m->refcnt > 0) + return m->refcnt; + + switch (m->alloc_mode) { + case BIN_MDEF_FROM_TEXT: + ckd_free(m->ciname[0]); + ckd_free(m->sseq[0]); + ckd_free(m->phone); + ckd_free(m->cd_tree); + break; + case BIN_MDEF_IN_MEMORY: + ckd_free(m->ciname[0]); + break; + case BIN_MDEF_ON_DISK: + break; + } + if (m->filemap) + mmio_file_unmap(m->filemap); + ckd_free(m->cd2cisen); + ckd_free(m->sen2cimap); + ckd_free(m->ciname); + ckd_free(m->sseq); + ckd_free(m); + return 0; +} + +static const char format_desc[] = + "BEGIN FILE FORMAT DESCRIPTION\n" + "int32 n_ciphone; /**< Number of base (CI) phones */\n" + "int32 n_phone; /**< Number of base (CI) phones + (CD) triphones */\n" + "int32 n_emit_state; /**< Number of emitting states per phone (0 if heterogeneous) */\n" + "int32 n_ci_sen; /**< Number of CI senones; these are the first */\n" + "int32 n_sen; /**< Number of senones (CI+CD) */\n" + "int32 n_tmat; /**< Number of transition matrices */\n" + "int32 n_sseq; /**< Number of unique senone sequences */\n" + "int32 n_ctx; /**< Number of phones of context */\n" + "int32 n_cd_tree; /**< Number of nodes in CD tree structure */\n" + "int32 sil; /**< CI phone ID for silence */\n" + "char ciphones[][]; /**< CI phone strings (null-terminated) */\n" + "char padding[]; /**< Padding to a 4-bytes boundary */\n" + "struct { int16 ctx; int16 n_down; int32 pid/down } cd_tree[];\n" + "struct { int32 ssid; int32 tmat; int8 attr[4] } phones[];\n" + "int16 sseq[]; /**< Unique senone sequences */\n" + "int8 
sseq_len[]; /**< Number of states in each sseq (none if homogeneous) */\n" + "END FILE FORMAT DESCRIPTION\n"; + +bin_mdef_t * +bin_mdef_read(cmd_ln_t *config, const char *filename) +{ + bin_mdef_t *m; + FILE *fh; + size_t tree_start; + int32 val, i, do_mmap, swap; + long pos, end; + int32 *sseq_size; + + /* Try to read it as text first. */ + if ((m = bin_mdef_read_text(config, filename)) != NULL) + return m; + + E_INFO("Reading binary model definition: %s\n", filename); + if ((fh = fopen(filename, "rb")) == NULL) + return NULL; + + if (fread(&val, 4, 1, fh) != 1) { + fclose(fh); + E_ERROR_SYSTEM("Failed to read byte-order marker from %s\n", + filename); + return NULL; + } + swap = 0; + if (val == BIN_MDEF_OTHER_ENDIAN) { + swap = 1; + E_INFO("Must byte-swap %s\n", filename); + } + if (fread(&val, 4, 1, fh) != 1) { + fclose(fh); + E_ERROR_SYSTEM("Failed to read version from %s\n", filename); + return NULL; + } + if (swap) + SWAP_INT32(&val); + if (val > BIN_MDEF_FORMAT_VERSION) { + E_ERROR("File format version %d for %s is newer than library\n", + val, filename); + fclose(fh); + return NULL; + } + if (fread(&val, 4, 1, fh) != 1) { + fclose(fh); + E_ERROR_SYSTEM("Failed to read header length from %s\n", filename); + return NULL; + } + if (swap) + SWAP_INT32(&val); + /* Skip format descriptor. */ + fseek(fh, val, SEEK_CUR); + + /* Finally allocate it. */ + m = ckd_calloc(1, sizeof(*m)); + m->refcnt = 1; + + /* Check these, to make gcc/glibc shut up. 
*/ +#define FREAD_SWAP32_CHK(dest) \ + if (fread((dest), 4, 1, fh) != 1) { \ + fclose(fh); \ + ckd_free(m); \ + E_ERROR_SYSTEM("Failed to read %s from %s\n", #dest, filename); \ + return NULL; \ + } \ + if (swap) SWAP_INT32(dest); + + FREAD_SWAP32_CHK(&m->n_ciphone); + FREAD_SWAP32_CHK(&m->n_phone); + FREAD_SWAP32_CHK(&m->n_emit_state); + FREAD_SWAP32_CHK(&m->n_ci_sen); + FREAD_SWAP32_CHK(&m->n_sen); + FREAD_SWAP32_CHK(&m->n_tmat); + FREAD_SWAP32_CHK(&m->n_sseq); + FREAD_SWAP32_CHK(&m->n_ctx); + FREAD_SWAP32_CHK(&m->n_cd_tree); + FREAD_SWAP32_CHK(&m->sil); + + /* CI names are first in the file. */ + m->ciname = ckd_calloc(m->n_ciphone, sizeof(*m->ciname)); + + /* Decide whether to read in the whole file or mmap it. */ + do_mmap = config ? cmd_ln_boolean_r(config, "-mmap") : TRUE; + if (swap) { + E_WARN("-mmap specified, but mdef is other-endian. Will not memory-map.\n"); + do_mmap = FALSE; + } + /* Actually try to mmap it. */ + if (do_mmap) { + m->filemap = mmio_file_read(filename); + if (m->filemap == NULL) + do_mmap = FALSE; + } + pos = ftell(fh); + if (do_mmap) { + /* Get the base pointer from the memory map. */ + m->ciname[0] = (char *)mmio_file_ptr(m->filemap) + pos; + /* Success! */ + m->alloc_mode = BIN_MDEF_ON_DISK; + } + else { + /* Read everything into memory. */ + m->alloc_mode = BIN_MDEF_IN_MEMORY; + fseek(fh, 0, SEEK_END); + end = ftell(fh); + fseek(fh, pos, SEEK_SET); + m->ciname[0] = ckd_malloc(end - pos); + if (fread(m->ciname[0], 1, end - pos, fh) != end - pos) + E_FATAL("Failed to read %d bytes of data from %s\n", end - pos, filename); + } + + for (i = 1; i < m->n_ciphone; ++i) + m->ciname[i] = m->ciname[i - 1] + strlen(m->ciname[i - 1]) + 1; + + /* Skip past the padding. 
*/ + tree_start = + m->ciname[i - 1] + strlen(m->ciname[i - 1]) + 1 - m->ciname[0]; + tree_start = (tree_start + 3) & ~3; + m->cd_tree = (cd_tree_t *) (m->ciname[0] + tree_start); + if (swap) { + for (i = 0; i < m->n_cd_tree; ++i) { + SWAP_INT16(&m->cd_tree[i].ctx); + SWAP_INT16(&m->cd_tree[i].n_down); + SWAP_INT32(&m->cd_tree[i].c.down); + } + } + m->phone = (mdef_entry_t *) (m->cd_tree + m->n_cd_tree); + if (swap) { + for (i = 0; i < m->n_phone; ++i) { + SWAP_INT32(&m->phone[i].ssid); + SWAP_INT32(&m->phone[i].tmat); + } + } + sseq_size = (int32 *) (m->phone + m->n_phone); + if (swap) + SWAP_INT32(sseq_size); + m->sseq = ckd_calloc(m->n_sseq, sizeof(*m->sseq)); + m->sseq[0] = (uint16 *) (sseq_size + 1); + if (swap) { + for (i = 0; i < *sseq_size; ++i) + SWAP_INT16(m->sseq[0] + i); + } + if (m->n_emit_state) { + for (i = 1; i < m->n_sseq; ++i) + m->sseq[i] = m->sseq[0] + i * m->n_emit_state; + } + else { + m->sseq_len = (uint8 *) (m->sseq[0] + *sseq_size); + for (i = 1; i < m->n_sseq; ++i) + m->sseq[i] = m->sseq[i - 1] + m->sseq_len[i - 1]; + } + + /* Now build the CD-to-CI mappings using the senone sequences. + * This is the only really accurate way to do it, though it is + * still inaccurate in the case of heterogeneous topologies or + * cross-state tying. */ + m->cd2cisen = (int16 *) ckd_malloc(m->n_sen * sizeof(*m->cd2cisen)); + m->sen2cimap = (int16 *) ckd_malloc(m->n_sen * sizeof(*m->sen2cimap)); + + /* Default mappings (identity, none) */ + for (i = 0; i < m->n_ci_sen; ++i) + m->cd2cisen[i] = i; + for (; i < m->n_sen; ++i) + m->cd2cisen[i] = -1; + for (i = 0; i < m->n_sen; ++i) + m->sen2cimap[i] = -1; + for (i = 0; i < m->n_phone; ++i) { + int32 j, ssid = m->phone[i].ssid; + + for (j = 0; j < bin_mdef_n_emit_state_phone(m, i); ++j) { + int s = bin_mdef_sseq2sen(m, ssid, j); + int ci = bin_mdef_pid2ci(m, i); + /* Take the first one and warn if we have cross-state tying. 
*/ + if (m->sen2cimap[s] == -1) + m->sen2cimap[s] = ci; + if (m->sen2cimap[s] != ci) + E_WARN + ("Senone %d is shared between multiple base phones\n", + s); + + if (j > bin_mdef_n_emit_state_phone(m, ci)) + E_WARN("CD phone %d has fewer states than CI phone %d\n", + i, ci); + else + m->cd2cisen[s] = + bin_mdef_sseq2sen(m, m->phone[ci].ssid, j); + } + } + + /* Set the silence phone. */ + m->sil = bin_mdef_ciphone_id(m, S3_SILENCE_CIPHONE); + + E_INFO + ("%d CI-phone, %d CD-phone, %d emitstate/phone, %d CI-sen, %d Sen, %d Sen-Seq\n", + m->n_ciphone, m->n_phone - m->n_ciphone, m->n_emit_state, + m->n_ci_sen, m->n_sen, m->n_sseq); + fclose(fh); + return m; +} + +int +bin_mdef_write(bin_mdef_t * m, const char *filename) +{ + FILE *fh; + int32 val, i; + + if ((fh = fopen(filename, "wb")) == NULL) + return -1; + + /* Byteorder marker. */ + val = BIN_MDEF_NATIVE_ENDIAN; + fwrite(&val, 1, 4, fh); + /* Version. */ + val = BIN_MDEF_FORMAT_VERSION; + fwrite(&val, 1, sizeof(val), fh); + + /* Round the format descriptor size up to a 4-byte boundary. */ + val = ((sizeof(format_desc) + 3) & ~3); + fwrite(&val, 1, sizeof(val), fh); + fwrite(format_desc, 1, sizeof(format_desc), fh); + /* Pad it with zeros. */ + i = 0; + fwrite(&i, 1, val - sizeof(format_desc), fh); + + /* Binary header things. */ + fwrite(&m->n_ciphone, 4, 1, fh); + fwrite(&m->n_phone, 4, 1, fh); + fwrite(&m->n_emit_state, 4, 1, fh); + fwrite(&m->n_ci_sen, 4, 1, fh); + fwrite(&m->n_sen, 4, 1, fh); + fwrite(&m->n_tmat, 4, 1, fh); + fwrite(&m->n_sseq, 4, 1, fh); + fwrite(&m->n_ctx, 4, 1, fh); + fwrite(&m->n_cd_tree, 4, 1, fh); + /* Write this as a 32-bit value to preserve alignment for the + * non-mmap case (we want things aligned both from the + * beginning of the file and the beginning of the phone + * strings). */ + val = m->sil; + fwrite(&val, 4, 1, fh); + + /* Phone strings. */ + for (i = 0; i < m->n_ciphone; ++i) + fwrite(m->ciname[i], 1, strlen(m->ciname[i]) + 1, fh); + /* Pad with zeros. 
*/ + val = (ftell(fh) + 3) & ~3; + i = 0; + fwrite(&i, 1, val - ftell(fh), fh); + + /* Write CD-tree */ + fwrite(m->cd_tree, sizeof(*m->cd_tree), m->n_cd_tree, fh); + /* Write phones */ + fwrite(m->phone, sizeof(*m->phone), m->n_phone, fh); + if (m->n_emit_state) { + /* Write size of sseq */ + val = m->n_sseq * m->n_emit_state; + fwrite(&val, 4, 1, fh); + + /* Write sseq */ + fwrite(m->sseq[0], sizeof(**m->sseq), + m->n_sseq * m->n_emit_state, fh); + } + else { + int32 n; + + /* Calcluate size of sseq */ + n = 0; + for (i = 0; i < m->n_sseq; ++i) + n += m->sseq_len[i]; + + /* Write size of sseq */ + fwrite(&n, 4, 1, fh); + + /* Write sseq */ + fwrite(m->sseq[0], sizeof(**m->sseq), n, fh); + + /* Write sseq_len */ + fwrite(m->sseq_len, 1, m->n_sseq, fh); + } + fclose(fh); + + return 0; +} + +int +bin_mdef_write_text(bin_mdef_t * m, const char *filename) +{ + FILE *fh; + int p, i, n_total_state; + + if (strcmp(filename, "-") == 0) + fh = stdout; + else { + if ((fh = fopen(filename, "w")) == NULL) + return -1; + } + + fprintf(fh, "0.3\n"); + fprintf(fh, "%d n_base\n", m->n_ciphone); + fprintf(fh, "%d n_tri\n", m->n_phone - m->n_ciphone); + if (m->n_emit_state) + n_total_state = m->n_phone * (m->n_emit_state + 1); + else { + n_total_state = 0; + for (i = 0; i < m->n_phone; ++i) + n_total_state += m->sseq_len[m->phone[i].ssid] + 1; + } + fprintf(fh, "%d n_state_map\n", n_total_state); + fprintf(fh, "%d n_tied_state\n", m->n_sen); + fprintf(fh, "%d n_tied_ci_state\n", m->n_ci_sen); + fprintf(fh, "%d n_tied_tmat\n", m->n_tmat); + fprintf(fh, "#\n# Columns definitions\n"); + fprintf(fh, "#%4s %3s %3s %1s %6s %4s %s\n", + "base", "lft", "rt", "p", "attrib", "tmat", + " ... 
state id's ..."); + + for (p = 0; p < m->n_ciphone; p++) { + int n_state; + + fprintf(fh, "%5s %3s %3s %1s", m->ciname[p], "-", "-", "-"); + + if (bin_mdef_is_fillerphone(m, p)) + fprintf(fh, " %6s", "filler"); + else + fprintf(fh, " %6s", "n/a"); + fprintf(fh, " %4d", m->phone[p].tmat); + + if (m->n_emit_state) + n_state = m->n_emit_state; + else + n_state = m->sseq_len[m->phone[p].ssid]; + for (i = 0; i < n_state; i++) { + fprintf(fh, " %6u", m->sseq[m->phone[p].ssid][i]); + } + fprintf(fh, " N\n"); + } + + + for (; p < m->n_phone; p++) { + int n_state; + + fprintf(fh, "%5s %3s %3s %c", + m->ciname[m->phone[p].info.cd.ctx[0]], + m->ciname[m->phone[p].info.cd.ctx[1]], + m->ciname[m->phone[p].info.cd.ctx[2]], + (WPOS_NAME)[m->phone[p].info.cd.wpos]); + + if (bin_mdef_is_fillerphone(m, p)) + fprintf(fh, " %6s", "filler"); + else + fprintf(fh, " %6s", "n/a"); + fprintf(fh, " %4d", m->phone[p].tmat); + + + if (m->n_emit_state) + n_state = m->n_emit_state; + else + n_state = m->sseq_len[m->phone[p].ssid]; + for (i = 0; i < n_state; i++) { + fprintf(fh, " %6u", m->sseq[m->phone[p].ssid][i]); + } + fprintf(fh, " N\n"); + } + + if (strcmp(filename, "-") != 0) + fclose(fh); + return 0; +} + +int +bin_mdef_ciphone_id(bin_mdef_t * m, const char *ciphone) +{ + int low, mid, high; + + /* Exact binary search on m->ciphone */ + low = 0; + high = m->n_ciphone; + while (low < high) { + int c; + + mid = (low + high) / 2; + c = strcmp(ciphone, m->ciname[mid]); + if (c == 0) + return mid; + else if (c > 0) + low = mid + 1; + else if (c < 0) + high = mid; + } + return -1; +} + +int +bin_mdef_ciphone_id_nocase(bin_mdef_t * m, const char *ciphone) +{ + int low, mid, high; + + /* Exact binary search on m->ciphone */ + low = 0; + high = m->n_ciphone; + while (low < high) { + int c; + + mid = (low + high) / 2; + c = strcmp_nocase(ciphone, m->ciname[mid]); + if (c == 0) + return mid; + else if (c > 0) + low = mid + 1; + else if (c < 0) + high = mid; + } + return -1; +} + +const char * 
+bin_mdef_ciphone_str(bin_mdef_t * m, int32 ci) +{ + assert(m != NULL); + assert(ci < m->n_ciphone); + return m->ciname[ci]; +} + +int +bin_mdef_phone_id(bin_mdef_t * m, int32 ci, int32 lc, int32 rc, int32 wpos) +{ + cd_tree_t *cd_tree; + int level, max; + int16 ctx[4]; + + assert(m); + + /* In the future, we might back off when context is not available, + * but for now we'll just return the CI phone. */ + if (lc < 0 || rc < 0) + return ci; + + assert((ci >= 0) && (ci < m->n_ciphone)); + assert((lc >= 0) && (lc < m->n_ciphone)); + assert((rc >= 0) && (rc < m->n_ciphone)); + assert((wpos >= 0) && (wpos < N_WORD_POSN)); + + /* Create a context list, mapping fillers to silence. */ + ctx[0] = wpos; + ctx[1] = ci; + ctx[2] = (m->sil >= 0 + && m->phone[lc].info.ci.filler) ? m->sil : lc; + ctx[3] = (m->sil >= 0 + && m->phone[rc].info.ci.filler) ? m->sil : rc; + + /* Walk down the cd_tree. */ + cd_tree = m->cd_tree; + level = 0; /* What level we are on. */ + max = N_WORD_POSN; /* Number of nodes on this level. */ + while (level < 4) { + int i; + +#if 0 + E_INFO("Looking for context %d=%s in %d at %d\n", + ctx[level], m->ciname[ctx[level]], + max, cd_tree - m->cd_tree); +#endif + for (i = 0; i < max; ++i) { +#if 0 + E_INFO("Look at context %d=%s at %d\n", + cd_tree[i].ctx, + m->ciname[cd_tree[i].ctx], cd_tree + i - m->cd_tree); +#endif + if (cd_tree[i].ctx == ctx[level]) + break; + } + if (i == max) + return -1; +#if 0 + E_INFO("Found context %d=%s at %d, n_down=%d, down=%d\n", + ctx[level], m->ciname[ctx[level]], + cd_tree + i - m->cd_tree, + cd_tree[i].n_down, cd_tree[i].c.down); +#endif + /* Leaf node, stop here. */ + if (cd_tree[i].n_down == 0) + return cd_tree[i].c.pid; + + /* Go down one level. */ + max = cd_tree[i].n_down; + cd_tree = m->cd_tree + cd_tree[i].c.down; + ++level; + } + /* We probably shouldn't get here. 
*/ + return -1; +} + +int +bin_mdef_phone_id_nearest(bin_mdef_t * m, int32 b, int32 l, int32 r, int32 pos) +{ + int p, tmppos; + + + + /* In the future, we might back off when context is not available, + * but for now we'll just return the CI phone. */ + if (l < 0 || r < 0) + return b; + + p = bin_mdef_phone_id(m, b, l, r, pos); + if (p >= 0) + return p; + + /* Exact triphone not found; backoff to other word positions */ + for (tmppos = 0; tmppos < N_WORD_POSN; tmppos++) { + if (tmppos != pos) { + p = bin_mdef_phone_id(m, b, l, r, tmppos); + if (p >= 0) + return p; + } + } + + /* Nothing yet; backoff to silence phone if non-silence filler context */ + /* In addition, backoff to silence phone on left/right if in beginning/end position */ + if (m->sil >= 0) { + int newl = l, newr = r; + if (m->phone[(int)l].info.ci.filler + || pos == WORD_POSN_BEGIN || pos == WORD_POSN_SINGLE) + newl = m->sil; + if (m->phone[(int)r].info.ci.filler + || pos == WORD_POSN_END || pos == WORD_POSN_SINGLE) + newr = m->sil; + if ((newl != l) || (newr != r)) { + p = bin_mdef_phone_id(m, b, newl, newr, pos); + if (p >= 0) + return p; + + for (tmppos = 0; tmppos < N_WORD_POSN; tmppos++) { + if (tmppos != pos) { + p = bin_mdef_phone_id(m, b, newl, newr, tmppos); + if (p >= 0) + return p; + } + } + } + } + + /* Nothing yet; backoff to base phone */ + return b; +} + +int +bin_mdef_phone_str(bin_mdef_t * m, int pid, char *buf) +{ + char *wpos_name; + + assert(m); + assert((pid >= 0) && (pid < m->n_phone)); + wpos_name = WPOS_NAME; + + buf[0] = '\0'; + if (pid < m->n_ciphone) + sprintf(buf, "%s", bin_mdef_ciphone_str(m, pid)); + else { + sprintf(buf, "%s %s %s %c", + bin_mdef_ciphone_str(m, m->phone[pid].info.cd.ctx[0]), + bin_mdef_ciphone_str(m, m->phone[pid].info.cd.ctx[1]), + bin_mdef_ciphone_str(m, m->phone[pid].info.cd.ctx[2]), + wpos_name[m->phone[pid].info.cd.wpos]); + } + return 0; +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/bin_mdef.h @@ -0,0 +1,236 @@ +/* -*- c-file-style: "linux" -*- */ +/* ==================================================================== + * Copyright (c) 2005 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ +/** + * @file bin_mdef.h + * + * Binary format model definition files, with support for + * heterogeneous topologies and variable-size N-phones + * + * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + */ +#ifndef __BIN_MDEF_H__ +#define __BIN_MDEF_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* SphinxBase headers. */ +#include <sphinxbase/mmio.h> +#include <sphinxbase/cmd_ln.h> +#include <pocketsphinx_export.h> + +#include "mdef.h" + +#define BIN_MDEF_FORMAT_VERSION 1 +/* Little-endian machines will write "BMDF" to disk, big-endian ones "FDMB". */ +#define BIN_MDEF_NATIVE_ENDIAN 0x46444d42 /* 'BMDF' in little-endian order */ +#define BIN_MDEF_OTHER_ENDIAN 0x424d4446 /* 'BMDF' in big-endian order */ +#ifdef __GNUC__ +#define __ATTRIBUTE_PACKED __attribute__((packed)) +#else +#define __ATTRIBUTE_PACKED +#endif + +/** + * Phone entry (on-disk, 12 bytes) + */ +typedef struct mdef_entry_s mdef_entry_t; +struct mdef_entry_s { + int32 ssid; /**< Senone sequence ID */ + int32 tmat; /**< Transition matrix ID */ + /* FIXME: is any of this actually necessary? */ + union { + /**< CI phone information - attributes (just "filler" for now) */ + struct { + uint8 filler; + uint8 reserved[3]; + } ci; + /**< CD phone information - context info. */ + struct { + uint8 wpos; + uint8 ctx[3]; /**< quintphones will require hacking */ + } cd; + } info; +} __ATTRIBUTE_PACKED; + +/** + * Invalid senone sequence ID (limited to 16 bits for PocketSphinx). + */ +#define BAD_SSID 0xffff +/** + * Invalid senone ID (limited to 16 bits for PocketSphinx). + */ +#define BAD_SENID 0xffff + +/** + * Node in CD phone tree (on-disk, 8 bytes). 
+ */ +typedef struct cd_tree_s cd_tree_t; +struct cd_tree_s { + int16 ctx; /**< Context (word position or CI phone) */ + int16 n_down; /**< Number of children (0 for leafnode) */ + union { + int32 pid; /**< Phone ID (leafnode) */ + int32 down; /**< Next level of the tree (offset from start of cd_trees) */ + } c; +}; + +/** + * Model definition structure (in-memory). + */ +typedef struct bin_mdef_s bin_mdef_t; +struct bin_mdef_s { + int refcnt; + int32 n_ciphone; /**< Number of base (CI) phones */ + int32 n_phone; /**< Number of base (CI) phones + (CD) triphones */ + int32 n_emit_state; /**< Number of emitting states per phone (0 for heterogeneous) */ + int32 n_ci_sen; /**< Number of CI senones; these are the first */ + int32 n_sen; /**< Number of senones (CI+CD) */ + int32 n_tmat; /**< Number of transition matrices */ + int32 n_sseq; /**< Number of unique senone sequences */ + int32 n_ctx; /**< Number of phones of context */ + int32 n_cd_tree; /**< Number of nodes in cd_tree (below) */ + int16 sil; /**< CI phone ID for silence */ + + mmio_file_t *filemap;/**< File map for this file (if any) */ + char **ciname; /**< CI phone names */ + cd_tree_t *cd_tree; /**< Tree mapping CD phones to phone IDs */ + mdef_entry_t *phone; /**< All phone structures */ + uint16 **sseq; /**< Unique senone sequences (2D array built at load time) */ + uint8 *sseq_len; /**< Number of states in each sseq (NULL for homogeneous) */ + + /* These two are not stored on disk, but are generated at load time. */ + int16 *cd2cisen; /**< Parent CI-senone id for each senone */ + int16 *sen2cimap; /**< Parent CI-phone for each senone (CI or CD) */ + + /** Allocation mode for this object. */ + enum { BIN_MDEF_FROM_TEXT, BIN_MDEF_IN_MEMORY, BIN_MDEF_ON_DISK } alloc_mode; +}; + +#define bin_mdef_is_fillerphone(m,p) (((p) < (m)->n_ciphone) \ + ? 
(m)->phone[p].info.ci.filler \ + : (m)->phone[(m)->phone[p].info.cd.ctx[0]].info.ci.filler) +#define bin_mdef_is_ciphone(m,p) ((p) < (m)->n_ciphone) +#define bin_mdef_n_ciphone(m) ((m)->n_ciphone) +#define bin_mdef_n_phone(m) ((m)->n_phone) +#define bin_mdef_n_sseq(m) ((m)->n_sseq) +#define bin_mdef_n_emit_state(m) ((m)->n_emit_state) +#define bin_mdef_n_emit_state_phone(m,p) ((m)->n_emit_state ? (m)->n_emit_state \ + : (m)->sseq_len[(m)->phone[p].ssid]) +#define bin_mdef_n_sen(m) ((m)->n_sen) +#define bin_mdef_n_tmat(m) ((m)->n_tmat) +#define bin_mdef_pid2ssid(m,p) ((m)->phone[p].ssid) +#define bin_mdef_pid2tmatid(m,p) ((m)->phone[p].tmat) +#define bin_mdef_silphone(m) ((m)->sil) +#define bin_mdef_sen2cimap(m,s) ((m)->sen2cimap[s]) +#define bin_mdef_sseq2sen(m,ss,pos) ((m)->sseq[ss][pos]) +#define bin_mdef_pid2ci(m,p) (((p) < (m)->n_ciphone) ? (p) \ + : (m)->phone[p].info.cd.ctx[0]) + +/** + * Read a binary mdef from a file. + */ +POCKETSPHINX_EXPORT +bin_mdef_t *bin_mdef_read(cmd_ln_t *config, const char *filename); +/** + * Read a text mdef from a file (creating an in-memory binary mdef). + */ +POCKETSPHINX_EXPORT +bin_mdef_t *bin_mdef_read_text(cmd_ln_t *config, const char *filename); +/** + * Write a binary mdef to a file. + */ +POCKETSPHINX_EXPORT +int bin_mdef_write(bin_mdef_t *m, const char *filename); +/** + * Write a binary mdef to a text file. + */ +POCKETSPHINX_EXPORT +int bin_mdef_write_text(bin_mdef_t *m, const char *filename); +/** + * Retain a pointer to a bin_mdef_t. + */ +bin_mdef_t *bin_mdef_retain(bin_mdef_t *m); +/** + * Release a pointer to a binary mdef. + */ +int bin_mdef_free(bin_mdef_t *m); + +/** + * Context-independent phone lookup. + * @return phone id for ciphone. + */ +int bin_mdef_ciphone_id(bin_mdef_t *m, /**< In: Model structure being queried */ + const char *ciphone); /**< In: ciphone for which id wanted */ + +/** + * Case-insensitive context-independent phone lookup. + * @return phone id for ciphone. 
+ */ +int bin_mdef_ciphone_id_nocase(bin_mdef_t *m, /**< In: Model structure being queried */ + const char *ciphone); /**< In: ciphone for which id wanted */ + +/* Return value: READ-ONLY ciphone string name for the given ciphone id */ +const char *bin_mdef_ciphone_str(bin_mdef_t *m, /**< In: Model structure being queried */ + int32 ci); /**< In: ciphone id for which name wanted */ + +/* Return value: phone id for the given constituents if found, else -1 */ +int bin_mdef_phone_id(bin_mdef_t *m, /**< In: Model structure being queried */ + int32 b, /**< In: base ciphone id */ + int32 l, /**< In: left context ciphone id */ + int32 r, /**< In: right context ciphone id */ + int32 pos); /**< In: Word position */ + +/* Look up a phone id, backing off to other word positions. */ +int bin_mdef_phone_id_nearest(bin_mdef_t * m, int32 b, + int32 l, int32 r, int32 pos); + +/** + * Create a phone string for the given phone (base or triphone) id in the given buf. + * + * @return 0 if successful, -1 if error. + */ +int bin_mdef_phone_str(bin_mdef_t *m, /**< In: Model structure being queried */ + int pid, /**< In: phone id being queried */ + char *buf); /**< Out: On return, buf has the string */ + +#ifdef __cplusplus +}; /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* __BIN_MDEF_H__ */
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/blkarray_list.c @@ -0,0 +1,172 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* + * blkarray_list.c -- block array-based list structure. + * + * HISTORY + * + * 18-Feb-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon + * Started. + */ + +/* System headers. */ +#include <assert.h> + +/* SphinxBase headers. 
*/ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/err.h> +#include <sphinxbase/ckd_alloc.h> + +/* Local headers. */ +#include "blkarray_list.h" + + +#define BLKARRAY_DEFAULT_MAXBLKS 16380 +#define BLKARRAY_DEFAULT_BLKSIZE 16380 + + +blkarray_list_t * +_blkarray_list_init(int32 maxblks, int32 blksize) +{ + blkarray_list_t *bl; + + if ((maxblks <= 0) || (blksize <= 0)) { + E_ERROR("Cannot allocate %dx%d blkarray\n", maxblks, blksize); + return NULL; + } + + bl = (blkarray_list_t *) ckd_calloc(1, sizeof(blkarray_list_t)); + bl->ptr = (void ***) ckd_calloc(maxblks, sizeof(void **)); + bl->maxblks = maxblks; + bl->blksize = blksize; + bl->n_valid = 0; + bl->cur_row = -1; /* No row is allocated (dummy) */ + bl->cur_row_free = blksize; /* The dummy row is full */ + + return bl; +} + + +blkarray_list_t * +blkarray_list_init(void) +{ + return _blkarray_list_init(BLKARRAY_DEFAULT_MAXBLKS, + BLKARRAY_DEFAULT_BLKSIZE); +} + +void +blkarray_list_free(blkarray_list_t *bl) +{ + blkarray_list_reset(bl); + ckd_free(bl->ptr); + ckd_free(bl); +} + + +int32 +blkarray_list_append(blkarray_list_t * bl, void *data) +{ + int32 id; + + assert(bl); + + if (bl->cur_row_free >= bl->blksize) { + /* Previous row is filled; need to allocate a new row */ + bl->cur_row++; + + if (bl->cur_row >= bl->maxblks) { + E_ERROR("Block array (%dx%d) exhausted\n", + bl->maxblks, bl->blksize); + bl->cur_row--; + return -1; + } + + /* Allocate the new row */ + assert(bl->ptr[bl->cur_row] == NULL); + bl->ptr[bl->cur_row] = (void **) ckd_malloc(bl->blksize * + sizeof(void *)); + + bl->cur_row_free = 0; + } + + bl->ptr[bl->cur_row][bl->cur_row_free] = data; + (bl->cur_row_free)++; + + id = (bl->n_valid)++; + assert(id >= 0); + + return id; +} + + +void +blkarray_list_reset(blkarray_list_t * bl) +{ + int32 i, j; + + /* Free all the allocated elements as well as the blocks */ + for (i = 0; i < bl->cur_row; i++) { + for (j = 0; j < bl->blksize; j++) + ckd_free(bl->ptr[i][j]); + + ckd_free(bl->ptr[i]); + 
bl->ptr[i] = NULL; + } + if (i == bl->cur_row) { /* NEED THIS! (in case cur_row < 0) */ + for (j = 0; j < bl->cur_row_free; j++) + ckd_free(bl->ptr[i][j]); + + ckd_free(bl->ptr[i]); + bl->ptr[i] = NULL; + } + + bl->n_valid = 0; + bl->cur_row = -1; + bl->cur_row_free = bl->blksize; +} + +void * +blkarray_list_get(blkarray_list_t *list, int32 n) +{ + int32 r, c; + + if (n >= blkarray_list_n_valid(list)) + return NULL; + + r = n / blkarray_list_blksize(list); + c = n - (r * blkarray_list_blksize(list)); + + return blkarray_list_ptr(list, r, c); +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/blkarray_list.h @@ -0,0 +1,139 @@ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* + * blkarray_list.h -- array-based list structure, for memory and access + * efficiency. 
+ * + * HISTORY + * + * $Log: blkarray_list.h,v $ + * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins + * re-importation + * + * Revision 1.2 2004/12/10 16:48:58 rkm + * Added continuous density acoustic model handling + * + * Revision 1.1 2004/07/16 00:57:12 egouvea + * Added Ravi's implementation of FSG support. + * + * Revision 1.2 2004/05/27 14:22:57 rkm + * FSG cross-word triphones completed (but for single-phone words) + * + * Revision 1.1.1.1 2004/03/01 14:30:31 rkm + * + * + * Revision 1.1 2004/02/26 01:14:48 rkm + * *** empty log message *** + * + * + * 18-Feb-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon + * Started. + */ + + +#ifndef __S2_BLKARRAY_LIST_H__ +#define __S2_BLKARRAY_LIST_H__ + + +#include <sphinxbase/prim_type.h> + + +/* + * For maintaining a (conceptual) "list" of pointers to arbitrary data. + * The application is responsible for knowing the true data type. + * Use an array instead of a true list for efficiency (both memory and + * speed). But use a blocked (2-D) array to allow dynamic resizing at a + * coarse grain. An entire block is allocated or freed, as appropriate. + */ +typedef struct blkarray_list_s { + void ***ptr; /* ptr[][] is the user-supplied ptr */ + int32 maxblks; /* size of ptr (#rows) */ + int32 blksize; /* size of ptr[] (#cols, ie, size of each row) */ + int32 n_valid; /* # entries actually stored in the list */ + int32 cur_row; /* The current row being that has empty entry */ + int32 cur_row_free; /* First entry valid within the current row */ +} blkarray_list_t; + +/* Access macros */ +#define blkarray_list_ptr(l,r,c) ((l)->ptr[r][c]) +#define blkarray_list_maxblks(l) ((l)->maxblks) +#define blkarray_list_blksize(l) ((l)->blksize) +#define blkarray_list_n_valid(l) ((l)->n_valid) +#define blkarray_list_cur_row(l) ((l)->cur_row) +#define blkarray_list_cur_row_free(l) ((l)->cur_row_free) + + +/* + * Initialize and return a new blkarray_list containing an empty list + * (i.e., 0 length). 
Sized for the given values of maxblks and blksize. + * NOTE: (maxblks * blksize) should not overflow int32, but this is not + * checked. + * Return the allocated entry if successful, NULL if any error. + */ +blkarray_list_t *_blkarray_list_init (int32 maxblks, int32 blksize); + + +/* + * Like _blkarray_list_init() above, but for some default values of + * maxblks and blksize. + */ +blkarray_list_t *blkarray_list_init ( void ); + +/** + * Completely finalize a blkarray_list. + */ +void blkarray_list_free(blkarray_list_t *bl); + + +/* + * Append the given new entry (data) to the end of the list. + * Return the index of the entry if successful, -1 if any error. + * The returned indices are guaranteed to be successive integers (i.e., + * 0, 1, 2...) for successive append operations, until the list is reset, + * when they resume from 0. + */ +int32 blkarray_list_append (blkarray_list_t *, void *data); + + +/* + * Free all the entries in the list (using ckd_free) and reset the + * list length to 0. + */ +void blkarray_list_reset (blkarray_list_t *); + + +/* Gets n-th element of the array list */ +void * blkarray_list_get(blkarray_list_t *, int32 n); + +#endif
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/dict.c @@ -0,0 +1,489 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* System headers. 
*/ +#include <string.h> + +/* SphinxBase headers. */ +#include <sphinxbase/pio.h> +#include <sphinxbase/strfuncs.h> + +/* Local headers. */ +#include "dict.h" + + +#define DELIM " \t\n" /* Set of field separator characters */ +#define DEFAULT_NUM_PHONE (MAX_S3CIPID+1) + +#if WIN32 +#define snprintf sprintf_s +#endif + +extern const char *const cmu6_lts_phone_table[]; + +static s3cipid_t +dict_ciphone_id(dict_t * d, const char *str) +{ + if (d->nocase) + return bin_mdef_ciphone_id_nocase(d->mdef, str); + else + return bin_mdef_ciphone_id(d->mdef, str); +} + + +const char * +dict_ciphone_str(dict_t * d, s3wid_t wid, int32 pos) +{ + assert(d != NULL); + assert((wid >= 0) && (wid < d->n_word)); + assert((pos >= 0) && (pos < d->word[wid].pronlen)); + + return bin_mdef_ciphone_str(d->mdef, d->word[wid].ciphone[pos]); +} + + +s3wid_t +dict_add_word(dict_t * d, char const *word, s3cipid_t const * p, int32 np) +{ + int32 len; + dictword_t *wordp; + s3wid_t newwid; + char *wword; + + if (d->n_word >= d->max_words) { + E_INFO("Reallocating to %d KiB for word entries\n", + (d->max_words + S3DICT_INC_SZ) * sizeof(dictword_t) / 1024); + d->word = + (dictword_t *) ckd_realloc(d->word, + (d->max_words + + S3DICT_INC_SZ) * sizeof(dictword_t)); + d->max_words = d->max_words + S3DICT_INC_SZ; + } + + wordp = d->word + d->n_word; + wordp->word = (char *) ckd_salloc(word); /* Freed in dict_free */ + + /* Determine base/alt wids */ + wword = ckd_salloc(word); + if ((len = dict_word2basestr(wword)) > 0) { + int32 w; + + /* Truncated to a baseword string; find its ID */ + if (hash_table_lookup_int32(d->ht, wword, &w) < 0) { + E_ERROR("Missing base word for: %s\n", word); + ckd_free(wword); + ckd_free(wordp->word); + wordp->word = NULL; + return BAD_S3WID; + } + + /* Link into alt list */ + wordp->basewid = w; + wordp->alt = d->word[w].alt; + d->word[w].alt = d->n_word; + } else { + wordp->alt = BAD_S3WID; + wordp->basewid = d->n_word; + } + ckd_free(wword); + + /* Associate word string 
with d->n_word in hash table */ + if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) != d->n_word) { + ckd_free(wordp->word); + wordp->word = NULL; + return BAD_S3WID; + } + + /* Fill in word entry, and set defaults */ + if (p && (np > 0)) { + wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t)); /* Freed in dict_free */ + memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t)); + wordp->pronlen = np; + } + else { + wordp->ciphone = NULL; + wordp->pronlen = 0; + } + + newwid = d->n_word++; + + return newwid; +} + + +static int32 +dict_read(FILE * fp, dict_t * d) +{ + lineiter_t *li; + char **wptr; + s3cipid_t *p; + int32 lineno, nwd; + s3wid_t w; + int32 i, maxwd; + size_t stralloc, phnalloc; + + maxwd = 512; + p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p)); + wptr = (char **) ckd_calloc(maxwd, sizeof(char *)); /* Freed below */ + + lineno = 0; + stralloc = phnalloc = 0; + for (li = lineiter_start(fp); li; li = lineiter_next(li)) { + lineno++; + if (0 == strncmp(li->buf, "##", 2) + || 0 == strncmp(li->buf, ";;", 2)) + continue; + + if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) { + /* Increase size of p, wptr. */ + nwd = str2words(li->buf, NULL, 0); + assert(nwd > maxwd); /* why else would it fail? 
*/ + maxwd = nwd; + p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p)); + wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr)); + } + + if (nwd == 0) /* Empty line */ + continue; + /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */ + if (nwd == 1) { + E_ERROR("Line %d: No pronunciation for word '%s'; ignored\n", + lineno, wptr[0]); + continue; + } + + + /* Convert pronunciation string to CI-phone-ids */ + for (i = 1; i < nwd; i++) { + p[i - 1] = dict_ciphone_id(d, wptr[i]); + if (NOT_S3CIPID(p[i - 1])) { + E_ERROR("Line %d: Phone '%s' is mising in the acoustic model; word '%s' ignored\n", + lineno, wptr[i], wptr[0]); + break; + } + } + + if (i == nwd) { /* All CI-phones successfully converted to IDs */ + w = dict_add_word(d, wptr[0], p, nwd - 1); + if (NOT_S3WID(w)) + E_ERROR + ("Line %d: Failed to add the word '%s' (duplicate?); ignored\n", + lineno, wptr[0]); + else { + stralloc += strlen(d->word[w].word); + phnalloc += d->word[w].pronlen * sizeof(s3cipid_t); + } + } + } + E_INFO("Allocated %d KiB for strings, %d KiB for phones\n", + (int)stralloc / 1024, (int)phnalloc / 1024); + ckd_free(p); + ckd_free(wptr); + + return 0; +} + +int +dict_write(dict_t *dict, char const *filename, char const *format) +{ + FILE *fh; + int i; + + if ((fh = fopen(filename, "w")) == NULL) { + E_ERROR_SYSTEM("Failed to open '%s'", filename); + return -1; + } + for (i = 0; i < dict->n_word; ++i) { + char *phones; + int j, phlen; + if (!dict_real_word(dict, i)) + continue; + for (phlen = j = 0; j < dict_pronlen(dict, i); ++j) + phlen += strlen(dict_ciphone_str(dict, i, j)) + 1; + phones = ckd_calloc(1, phlen); + for (j = 0; j < dict_pronlen(dict, i); ++j) { + strcat(phones, dict_ciphone_str(dict, i, j)); + if (j != dict_pronlen(dict, i) - 1) + strcat(phones, " "); + } + fprintf(fh, "%-30s %s\n", dict_wordstr(dict, i), phones); + ckd_free(phones); + } + fclose(fh); + return 0; +} + + +dict_t * +dict_init(cmd_ln_t *config, bin_mdef_t * mdef) +{ + 
FILE *fp, *fp2; + int32 n; + lineiter_t *li; + dict_t *d; + s3cipid_t sil; + char const *dictfile = NULL, *fillerfile = NULL; + + if (config) { + dictfile = cmd_ln_str_r(config, "-dict"); + fillerfile = cmd_ln_str_r(config, "-fdict"); + } + + /* + * First obtain #words in dictionary (for hash table allocation). + * Reason: The PC NT system doesn't like to grow memory gradually. Better to allocate + * all the required memory in one go. + */ + fp = NULL; + n = 0; + if (dictfile) { + if ((fp = fopen(dictfile, "r")) == NULL) { + E_ERROR_SYSTEM("Failed to open dictionary file '%s' for reading", dictfile); + return NULL; + } + for (li = lineiter_start(fp); li; li = lineiter_next(li)) { + if (0 != strncmp(li->buf, "##", 2) + && 0 != strncmp(li->buf, ";;", 2)) + n++; + } + fseek(fp, 0L, SEEK_SET); + } + + fp2 = NULL; + if (fillerfile) { + if ((fp2 = fopen(fillerfile, "r")) == NULL) { + E_ERROR_SYSTEM("Failed to open filler dictionary file '%s' for reading", fillerfile); + fclose(fp); + return NULL; + } + for (li = lineiter_start(fp2); li; li = lineiter_next(li)) { + if (0 != strncmp(li->buf, "##", 2) + && 0 != strncmp(li->buf, ";;", 2)) + n++; + } + fseek(fp2, 0L, SEEK_SET); + } + + /* + * Allocate dict entries. HACK!! Allow some extra entries for words not in file. + * Also check for type size restrictions. + */ + d = (dict_t *) ckd_calloc(1, sizeof(dict_t)); /* freed in dict_free() */ + d->refcnt = 1; + d->max_words = + (n + S3DICT_INC_SZ < MAX_S3WID) ? 
n + S3DICT_INC_SZ : MAX_S3WID; + if (n >= MAX_S3WID) { + E_ERROR("Number of words in dictionaries (%d) exceeds limit (%d)\n", n, + MAX_S3WID); + fclose(fp); + fclose(fp2); + ckd_free(d); + return NULL; + } + + E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n", + d->max_words, sizeof(dictword_t), + d->max_words * sizeof(dictword_t) / 1024); + d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t)); /* freed in dict_free() */ + d->n_word = 0; + if (mdef) + d->mdef = bin_mdef_retain(mdef); + + /* Create new hash table for word strings; case-insensitive word strings */ + if (config && cmd_ln_exists_r(config, "-dictcase")) + d->nocase = cmd_ln_boolean_r(config, "-dictcase"); + d->ht = hash_table_new(d->max_words, d->nocase); + + /* Digest main dictionary file */ + if (fp) { + E_INFO("Reading main dictionary: %s\n", dictfile); + dict_read(fp, d); + fclose(fp); + E_INFO("%d words read\n", d->n_word); + } + + /* Now the filler dictionary file, if it exists */ + d->filler_start = d->n_word; + if (fillerfile) { + E_INFO("Reading filler dictionary: %s\n", fillerfile); + dict_read(fp2, d); + fclose(fp2); + E_INFO("%d words read\n", d->n_word - d->filler_start); + } + if (mdef) + sil = bin_mdef_silphone(mdef); + else + sil = 0; + if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) { + dict_add_word(d, S3_START_WORD, &sil, 1); + } + if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) { + dict_add_word(d, S3_FINISH_WORD, &sil, 1); + } + if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) { + dict_add_word(d, S3_SILENCE_WORD, &sil, 1); + } + + d->filler_end = d->n_word - 1; + + /* Initialize distinguished word-ids */ + d->startwid = dict_wordid(d, S3_START_WORD); + d->finishwid = dict_wordid(d, S3_FINISH_WORD); + d->silwid = dict_wordid(d, S3_SILENCE_WORD); + + if ((d->filler_start > d->filler_end) + || (!dict_filler_word(d, d->silwid))) { + E_ERROR("Word '%s' must occur (only) in filler dictionary\n", + S3_SILENCE_WORD); + dict_free(d); + return NULL; + } + + /* 
No check that alternative pronunciations for filler words are in filler range!! */ + + return d; +} + + +s3wid_t +dict_wordid(dict_t *d, const char *word) +{ + int32 w; + + assert(d); + assert(word); + + if (hash_table_lookup_int32(d->ht, word, &w) < 0) + return (BAD_S3WID); + return w; +} + + +int +dict_filler_word(dict_t *d, s3wid_t w) +{ + assert(d); + assert((w >= 0) && (w < d->n_word)); + + w = dict_basewid(d, w); + if ((w == d->startwid) || (w == d->finishwid)) + return 0; + if ((w >= d->filler_start) && (w <= d->filler_end)) + return 1; + return 0; +} + +int +dict_real_word(dict_t *d, s3wid_t w) +{ + assert(d); + assert((w >= 0) && (w < d->n_word)); + + w = dict_basewid(d, w); + if ((w == d->startwid) || (w == d->finishwid)) + return 0; + if ((w >= d->filler_start) && (w <= d->filler_end)) + return 0; + return 1; +} + + +int32 +dict_word2basestr(char *word) +{ + int32 i, len; + + len = strlen(word); + if (word[len - 1] == ')') { + for (i = len - 2; (i > 0) && (word[i] != '('); --i); + + if (i > 0) { + /* The word is of the form <baseword>(...); strip from left-paren */ + word[i] = '\0'; + return i; + } + } + + return -1; +} + +dict_t * +dict_retain(dict_t *d) +{ + ++d->refcnt; + return d; +} + +int +dict_free(dict_t * d) +{ + int i; + dictword_t *word; + + if (d == NULL) + return 0; + if (--d->refcnt > 0) + return d->refcnt; + + /* First Step, free all memory allocated for each word */ + for (i = 0; i < d->n_word; i++) { + word = (dictword_t *) & (d->word[i]); + if (word->word) + ckd_free((void *) word->word); + if (word->ciphone) + ckd_free((void *) word->ciphone); + } + + if (d->word) + ckd_free((void *) d->word); + if (d->ht) + hash_table_free(d->ht); + if (d->mdef) + bin_mdef_free(d->mdef); + ckd_free((void *) d); + + return 0; +} + +void +dict_report(dict_t * d) +{ + E_INFO_NOFN("Initialization of dict_t, report:\n"); + E_INFO_NOFN("Max word: %d\n", d->max_words); + E_INFO_NOFN("No of word: %d\n", d->n_word); + E_INFO_NOFN("\n"); +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/dict.h @@ -0,0 +1,210 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +#ifndef _S3_DICT_H_ +#define _S3_DICT_H_ + +/** \file dict.h + * \brief Operations on dictionary. + */ + +/* SphinxBase headers. */ +#include <sphinxbase/hash_table.h> + +/* Local headers. */ +#include "s3types.h" +#include "bin_mdef.h" +#include "pocketsphinx_export.h" + +#define S3DICT_INC_SZ 4096 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + \struct dictword_t + \brief a structure for one dictionary word. +*/ +typedef struct { + char *word; /**< Ascii word string */ + s3cipid_t *ciphone; /**< Pronunciation */ + int32 pronlen; /**< Pronunciation length */ + s3wid_t alt; /**< Next alternative pronunciation id, NOT_S3WID if none */ + s3wid_t basewid; /**< Base pronunciation id */ +} dictword_t; + +/** + \struct dict_t + \brief a structure for a dictionary. +*/ + +typedef struct { + int refcnt; + bin_mdef_t *mdef; /**< Model definition used for phone IDs; NULL if none used */ + dictword_t *word; /**< Array of entries in dictionary */ + hash_table_t *ht; /**< Hash table for mapping word strings to word ids */ + int32 max_words; /**< #Entries allocated in dict, including empty slots */ + int32 n_word; /**< #Occupied entries in dict; ie, excluding empty slots */ + int32 filler_start; /**< First filler word id (read from filler dict) */ + int32 filler_end; /**< Last filler word id (read from filler dict) */ + s3wid_t startwid; /**< FOR INTERNAL-USE ONLY */ + s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */ + s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */ + int nocase; +} dict_t; + + +/** + * Initialize a new dictionary. + * + * If config and mdef are supplied, then the dictionary will be read + * from the files specified by the -dict and -fdict options in config, + * with case sensitivity determined by the -dictcase option. + * + * Otherwise an empty case-sensitive dictionary will be created. + * + * Return ptr to dict_t if successful, NULL otherwise. 
+ */ +dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */ + bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */ + ); + +/** + * Write dictionary to a file. + */ +int dict_write(dict_t *dict, char const *filename, char const *format); + +/** Return word id for given word string if present. Otherwise return BAD_S3WID */ +POCKETSPHINX_EXPORT +s3wid_t dict_wordid(dict_t *d, const char *word); + +/** + * Return 1 if w is a filler word, 0 if not. A filler word is one that was read in from the + * filler dictionary; however, sentence START and FINISH words are not filler words. + */ +int dict_filler_word(dict_t *d, /**< The dictionary structure */ + s3wid_t w /**< The word ID */ + ); + +/** + * Test if w is a "real" word, i.e. neither a filler word nor START/FINISH. + */ +POCKETSPHINX_EXPORT +int dict_real_word(dict_t *d, /**< The dictionary structure */ + s3wid_t w /**< The word ID */ + ); + +/** + * Add a word with the given ciphone pronunciation list to the dictionary. + * Return value: Result word id if successful, BAD_S3WID otherwise + */ +s3wid_t dict_add_word(dict_t *d, /**< The dictionary structure. */ + char const *word, /**< The word. */ + s3cipid_t const *p, /**< The pronunciation. */ + int32 np /**< Number of phones. */ + ); + +/** + * Return value: CI phone string for the given word, phone position. + */ +const char *dict_ciphone_str(dict_t *d, /**< In: Dictionary to look up */ + s3wid_t wid, /**< In: Component word being looked up */ + int32 pos /**< In: Pronunciation phone position */ + ); + +/** Packaged macro access to dictionary members */ +#define dict_size(d) ((d)->n_word) +#define dict_num_fillers(d) (dict_filler_end(d) - dict_filler_start(d)) +/** + * Number of "real words" in the dictionary. + * + * This is the number of words that are not fillers, <s>, or </s>. 
+ */ +#define dict_num_real_words(d) \ + (dict_size(d) - (dict_filler_end(d) - dict_filler_start(d)) - 2) +#define dict_basewid(d,w) ((d)->word[w].basewid) +#define dict_wordstr(d,w) ((w) < 0 ? NULL : (d)->word[w].word) +#define dict_basestr(d,w) ((d)->word[dict_basewid(d,w)].word) +#define dict_nextalt(d,w) ((d)->word[w].alt) +#define dict_pronlen(d,w) ((d)->word[w].pronlen) +#define dict_pron(d,w,p) ((d)->word[w].ciphone[p]) /**< The CI phones of the word w at position p */ +#define dict_filler_start(d) ((d)->filler_start) +#define dict_filler_end(d) ((d)->filler_end) +#define dict_startwid(d) ((d)->startwid) +#define dict_finishwid(d) ((d)->finishwid) +#define dict_silwid(d) ((d)->silwid) +#define dict_is_single_phone(d,w) ((d)->word[w].pronlen == 1) +#define dict_first_phone(d,w) ((d)->word[w].ciphone[0]) +#define dict_second_phone(d,w) ((d)->word[w].ciphone[1]) +#define dict_second_last_phone(d,w) ((d)->word[w].ciphone[(d)->word[w].pronlen - 2]) +#define dict_last_phone(d,w) ((d)->word[w].ciphone[(d)->word[w].pronlen - 1]) + +/* Hard-coded special words */ +#define S3_START_WORD "<s>" +#define S3_FINISH_WORD "</s>" +#define S3_SILENCE_WORD "<sil>" +#define S3_UNKNOWN_WORD "<UNK>" + +/** + * If the given word contains a trailing "(....)" (i.e., a Sphinx-II style alternative + * pronunciation specification), strip that trailing portion from it. Note that the given + * string is modified. + * Return value: If string was modified, the character position at which the original string + * was truncated; otherwise -1. + */ +int32 dict_word2basestr(char *word); + +/** + * Retain a pointer to an dict_t. + */ +dict_t *dict_retain(dict_t *d); + +/** + * Release a pointer to a dictionary. + */ +int dict_free(dict_t *d); + +/** Report a dictionary structure */ +void dict_report(dict_t *d /**< A dictionary structure */ + ); + +#ifdef __cplusplus +} +#endif + +#endif
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/dict2pid.c @@ -0,0 +1,578 @@ +/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +#include <string.h> + +#include "dict2pid.h" +#include "hmm.h" + + +/** + * @file dict2pid.c - dictionary word to senone sequence mappings + */ + +void +compress_table(s3ssid_t * uncomp_tab, s3ssid_t * com_tab, + s3cipid_t * ci_map, int32 n_ci) +{ + int32 found; + int32 r; + int32 tmp_r; + + for (r = 0; r < n_ci; r++) { + com_tab[r] = BAD_S3SSID; + ci_map[r] = BAD_S3CIPID; + } + /** Compress this map */ + for (r = 0; r < n_ci; r++) { + + found = 0; + for (tmp_r = 0; tmp_r < r && com_tab[tmp_r] != BAD_S3SSID; tmp_r++) { /* If it appears before, just filled in cimap; */ + if (uncomp_tab[r] == com_tab[tmp_r]) { + found = 1; + ci_map[r] = tmp_r; + break; + } + } + + if (found == 0) { + com_tab[tmp_r] = uncomp_tab[r]; + ci_map[r] = tmp_r; + } + } +} + + +static void +compress_right_context_tree(dict2pid_t * d2p, + s3ssid_t ***rdiph_rc) +{ + int32 n_ci; + int32 b, l, r; + s3ssid_t *rmap; + s3ssid_t *tmpssid; + s3cipid_t *tmpcimap; + bin_mdef_t *mdef = d2p->mdef; + size_t alloc; + + n_ci = mdef->n_ciphone; + + tmpssid = ckd_calloc(n_ci, sizeof(s3ssid_t)); + tmpcimap = ckd_calloc(n_ci, sizeof(s3cipid_t)); + + d2p->rssid = + (xwdssid_t **) ckd_calloc(mdef->n_ciphone, sizeof(xwdssid_t *)); + alloc = mdef->n_ciphone * sizeof(xwdssid_t *); + + for (b = 0; b < n_ci; b++) { + d2p->rssid[b] = + (xwdssid_t *) ckd_calloc(mdef->n_ciphone, sizeof(xwdssid_t)); + alloc += mdef->n_ciphone * sizeof(xwdssid_t); + + for (l = 0; l < n_ci; l++) { + rmap = rdiph_rc[b][l]; + compress_table(rmap, tmpssid, tmpcimap, mdef->n_ciphone); + + for (r = 0; r < mdef->n_ciphone && tmpssid[r] != BAD_S3SSID; + r++); + + if (tmpssid[0] != BAD_S3SSID) { + d2p->rssid[b][l].ssid = ckd_calloc(r, sizeof(s3ssid_t)); + memcpy(d2p->rssid[b][l].ssid, tmpssid, + r * sizeof(s3ssid_t)); + d2p->rssid[b][l].cimap = + ckd_calloc(mdef->n_ciphone, sizeof(s3cipid_t)); + memcpy(d2p->rssid[b][l].cimap, tmpcimap, + (mdef->n_ciphone) * 
sizeof(s3cipid_t)); + d2p->rssid[b][l].n_ssid = r; + } + else { + d2p->rssid[b][l].ssid = NULL; + d2p->rssid[b][l].cimap = NULL; + d2p->rssid[b][l].n_ssid = 0; + } + } + } + + E_INFO("Allocated %d bytes (%d KiB) for word-final triphones\n", + (int)alloc, (int)alloc / 1024); + ckd_free(tmpssid); + ckd_free(tmpcimap); +} + +static void +compress_left_right_context_tree(dict2pid_t * d2p) +{ + int32 n_ci; + int32 b, l, r; + s3ssid_t *rmap; + s3ssid_t *tmpssid; + s3cipid_t *tmpcimap; + bin_mdef_t *mdef = d2p->mdef; + size_t alloc; + + n_ci = mdef->n_ciphone; + + tmpssid = ckd_calloc(n_ci, sizeof(s3ssid_t)); + tmpcimap = ckd_calloc(n_ci, sizeof(s3cipid_t)); + + assert(d2p->lrdiph_rc); + + d2p->lrssid = + (xwdssid_t **) ckd_calloc(mdef->n_ciphone, sizeof(xwdssid_t *)); + alloc = mdef->n_ciphone * sizeof(xwdssid_t *); + + for (b = 0; b < n_ci; b++) { + + d2p->lrssid[b] = + (xwdssid_t *) ckd_calloc(mdef->n_ciphone, sizeof(xwdssid_t)); + alloc += mdef->n_ciphone * sizeof(xwdssid_t); + + for (l = 0; l < n_ci; l++) { + rmap = d2p->lrdiph_rc[b][l]; + + compress_table(rmap, tmpssid, tmpcimap, mdef->n_ciphone); + + for (r = 0; r < mdef->n_ciphone && tmpssid[r] != BAD_S3SSID; + r++); + + if (tmpssid[0] != BAD_S3SSID) { + d2p->lrssid[b][l].ssid = ckd_calloc(r, sizeof(s3ssid_t)); + memcpy(d2p->lrssid[b][l].ssid, tmpssid, + r * sizeof(s3ssid_t)); + d2p->lrssid[b][l].cimap = + ckd_calloc(mdef->n_ciphone, sizeof(s3cipid_t)); + memcpy(d2p->lrssid[b][l].cimap, tmpcimap, + (mdef->n_ciphone) * sizeof(s3cipid_t)); + d2p->lrssid[b][l].n_ssid = r; + } + else { + d2p->lrssid[b][l].ssid = NULL; + d2p->lrssid[b][l].cimap = NULL; + d2p->lrssid[b][l].n_ssid = 0; + } + } + } + + /* Try to compress lrdiph_rc into lrdiph_rc_compressed */ + ckd_free(tmpssid); + ckd_free(tmpcimap); + + E_INFO("Allocated %d bytes (%d KiB) for single-phone word triphones\n", + (int)alloc, (int)alloc / 1024); +} + +/** + ARCHAN, A duplicate of get_rc_npid in ctxt_table.h. 
I doubt whether it is correct + because the compressed map has not been checked. +*/ +int32 +get_rc_nssid(dict2pid_t * d2p, s3wid_t w) +{ + int32 pronlen; + s3cipid_t b, lc; + dict_t *dict = d2p->dict; + + pronlen = dict->word[w].pronlen; + b = dict->word[w].ciphone[pronlen - 1]; + + if (pronlen == 1) { + /* Is this true ? + No known left context. But all cimaps (for any l) are identical; pick one + */ + /*E_INFO("Single phone word\n"); */ + return (d2p->lrssid[b][0].n_ssid); + } + else { + /* E_INFO("Multiple phone word\n"); */ + lc = dict->word[w].ciphone[pronlen - 2]; + return (d2p->rssid[b][lc].n_ssid); + } + +} + +s3cipid_t * +dict2pid_get_rcmap(dict2pid_t * d2p, s3wid_t w) +{ + int32 pronlen; + s3cipid_t b, lc; + dict_t *dict = d2p->dict; + + pronlen = dict->word[w].pronlen; + b = dict->word[w].ciphone[pronlen - 1]; + + if (pronlen == 1) { + /* Is this true ? + No known left context. But all cimaps (for any l) are identical; pick one + */ + /*E_INFO("Single phone word\n"); */ + return (d2p->lrssid[b][0].cimap); + } + else { + /* E_INFO("Multiple phone word\n"); */ + lc = dict->word[w].ciphone[pronlen - 2]; + return (d2p->rssid[b][lc].cimap); + } +} + +static void +free_compress_map(xwdssid_t ** tree, int32 n_ci) +{ + int32 b, l; + for (b = 0; b < n_ci; b++) { + for (l = 0; l < n_ci; l++) { + ckd_free(tree[b][l].ssid); + ckd_free(tree[b][l].cimap); + } + ckd_free(tree[b]); + } + ckd_free(tree); +} + +static void +populate_lrdiph(dict2pid_t *d2p, s3ssid_t ***rdiph_rc, s3cipid_t b) +{ + bin_mdef_t *mdef = d2p->mdef; + s3cipid_t l, r; + + for (l = 0; l < bin_mdef_n_ciphone(mdef); l++) { + for (r = 0; r < bin_mdef_n_ciphone(mdef); r++) { + s3pid_t p; + p = bin_mdef_phone_id_nearest(mdef, (s3cipid_t) b, + (s3cipid_t) l, + (s3cipid_t) r, + WORD_POSN_SINGLE); + d2p->lrdiph_rc[b][l][r] + = bin_mdef_pid2ssid(mdef, p); + if (r == bin_mdef_silphone(mdef)) + d2p->ldiph_lc[b][r][l] + = bin_mdef_pid2ssid(mdef, p); + if (rdiph_rc && l == bin_mdef_silphone(mdef)) + 
rdiph_rc[b][l][r] + = bin_mdef_pid2ssid(mdef, p); + assert(IS_S3SSID(bin_mdef_pid2ssid(mdef, p))); + E_DEBUG(2,("%s(%s,%s) => %d / %d\n", + bin_mdef_ciphone_str(mdef, b), + bin_mdef_ciphone_str(mdef, l), + bin_mdef_ciphone_str(mdef, r), + p, bin_mdef_pid2ssid(mdef, p))); + } + } +} + +int +dict2pid_add_word(dict2pid_t *d2p, + int32 wid) +{ + bin_mdef_t *mdef = d2p->mdef; + dict_t *d = d2p->dict; + + if (dict_pronlen(d, wid) > 1) { + s3cipid_t l; + /* Make sure we have left and right context diphones for this + * word. */ + if (d2p->ldiph_lc[dict_first_phone(d, wid)][dict_second_phone(d, wid)][0] + == BAD_S3SSID) { + E_DEBUG(2, ("Filling in left-context diphones for %s(?,%s)\n", + bin_mdef_ciphone_str(mdef, dict_first_phone(d, wid)), + bin_mdef_ciphone_str(mdef, dict_second_phone(d, wid)))); + for (l = 0; l < bin_mdef_n_ciphone(mdef); l++) { + int p + = bin_mdef_phone_id_nearest(mdef, + dict_first_phone(d, wid), l, + dict_second_phone(d, wid), + WORD_POSN_BEGIN); + d2p->ldiph_lc[dict_first_phone(d, wid)][dict_second_phone(d, wid)][l] + = bin_mdef_pid2ssid(mdef, p); + } + } + if (d2p->rssid[dict_last_phone(d, wid)][dict_second_last_phone(d, wid)].n_ssid + == 0) { + s3ssid_t *rmap; + s3ssid_t *tmpssid; + s3cipid_t *tmpcimap; + s3cipid_t r; + + E_DEBUG(2, ("Filling in right-context diphones for %s(%s,?)\n", + bin_mdef_ciphone_str(mdef, dict_last_phone(d, wid)), + bin_mdef_ciphone_str(mdef, dict_second_last_phone(d, wid)))); + rmap = ckd_calloc(bin_mdef_n_ciphone(mdef), sizeof(*rmap)); + for (r = 0; r < bin_mdef_n_ciphone(mdef); r++) { + int p + = bin_mdef_phone_id_nearest(mdef, + dict_last_phone(d, wid), + dict_second_last_phone(d, wid), r, + WORD_POSN_END); + rmap[r] = bin_mdef_pid2ssid(mdef, p); + } + tmpssid = ckd_calloc(bin_mdef_n_ciphone(mdef), sizeof(*tmpssid)); + tmpcimap = ckd_calloc(bin_mdef_n_ciphone(mdef), sizeof(*tmpcimap)); + compress_table(rmap, tmpssid, tmpcimap, bin_mdef_n_ciphone(mdef)); + for (r = 0; r < mdef->n_ciphone && tmpssid[r] != BAD_S3SSID; 
r++) + ; + d2p->rssid[dict_last_phone(d, wid)][dict_second_last_phone(d, wid)].ssid = tmpssid; + d2p->rssid[dict_last_phone(d, wid)][dict_second_last_phone(d, wid)].cimap = tmpcimap; + d2p->rssid[dict_last_phone(d, wid)][dict_second_last_phone(d, wid)].n_ssid = r; + ckd_free(rmap); + } + } + else { + /* Make sure we have a left-right context triphone entry for + * this word. */ + E_INFO("Filling in context triphones for %s(?,?)\n", + bin_mdef_ciphone_str(mdef, dict_first_phone(d, wid))); + if (d2p->lrdiph_rc[dict_first_phone(d, wid)][0][0] == BAD_S3SSID) { + populate_lrdiph(d2p, NULL, dict_first_phone(d, wid)); + } + } + + return 0; +} + +s3ssid_t +dict2pid_internal(dict2pid_t *d2p, + int32 wid, + int pos) +{ + int b, l, r, p; + dict_t *dict = d2p->dict; + bin_mdef_t *mdef = d2p->mdef; + + if (pos == 0 || pos == dict_pronlen(dict, wid)) + return BAD_S3SSID; + + b = dict_pron(dict, wid, pos); + l = dict_pron(dict, wid, pos - 1); + r = dict_pron(dict, wid, pos + 1); + p = bin_mdef_phone_id_nearest(mdef, (s3cipid_t) b, + (s3cipid_t) l, (s3cipid_t) r, + WORD_POSN_INTERNAL); + return bin_mdef_pid2ssid(mdef, p); +} + +dict2pid_t * +dict2pid_build(bin_mdef_t * mdef, dict_t * dict) +{ + dict2pid_t *dict2pid; + s3ssid_t ***rdiph_rc; + bitvec_t *ldiph, *rdiph, *single; + int32 pronlen; + int32 b, l, r, w, p; + + E_INFO("Building PID tables for dictionary\n"); + assert(mdef); + assert(dict); + + dict2pid = (dict2pid_t *) ckd_calloc(1, sizeof(dict2pid_t)); + dict2pid->refcount = 1; + dict2pid->mdef = bin_mdef_retain(mdef); + dict2pid->dict = dict_retain(dict); + E_INFO("Allocating %d^3 * %d bytes (%d KiB) for word-initial triphones\n", + mdef->n_ciphone, sizeof(s3ssid_t), + mdef->n_ciphone * mdef->n_ciphone * mdef->n_ciphone * sizeof(s3ssid_t) / 1024); + dict2pid->ldiph_lc = + (s3ssid_t ***) ckd_calloc_3d(mdef->n_ciphone, mdef->n_ciphone, + mdef->n_ciphone, sizeof(s3ssid_t)); + /* Only used internally to generate rssid */ + rdiph_rc = + (s3ssid_t ***) 
ckd_calloc_3d(mdef->n_ciphone, mdef->n_ciphone, + mdef->n_ciphone, sizeof(s3ssid_t)); + + dict2pid->lrdiph_rc = (s3ssid_t ***) ckd_calloc_3d(mdef->n_ciphone, + mdef->n_ciphone, + mdef->n_ciphone, + sizeof + (s3ssid_t)); + /* Actually could use memset for this, if BAD_S3SSID is guaranteed + * to be 65535... */ + for (b = 0; b < mdef->n_ciphone; ++b) { + for (r = 0; r < mdef->n_ciphone; ++r) { + for (l = 0; l < mdef->n_ciphone; ++l) { + dict2pid->ldiph_lc[b][r][l] = BAD_S3SSID; + dict2pid->lrdiph_rc[b][l][r] = BAD_S3SSID; + rdiph_rc[b][l][r] = BAD_S3SSID; + } + } + } + + /* Track which diphones / ciphones have been seen. */ + ldiph = bitvec_alloc(mdef->n_ciphone * mdef->n_ciphone); + rdiph = bitvec_alloc(mdef->n_ciphone * mdef->n_ciphone); + single = bitvec_alloc(mdef->n_ciphone); + + for (w = 0; w < dict_size(dict2pid->dict); w++) { + pronlen = dict_pronlen(dict, w); + + if (pronlen >= 2) { + b = dict_first_phone(dict, w); + r = dict_second_phone(dict, w); + /* Populate ldiph_lc */ + if (bitvec_is_clear(ldiph, b * mdef->n_ciphone + r)) { + /* Mark this diphone as done */ + bitvec_set(ldiph, b * mdef->n_ciphone + r); + + /* Record all possible ssids for b(?,r) */ + for (l = 0; l < bin_mdef_n_ciphone(mdef); l++) { + p = bin_mdef_phone_id_nearest(mdef, (s3cipid_t) b, + (s3cipid_t) l, (s3cipid_t) r, + WORD_POSN_BEGIN); + dict2pid->ldiph_lc[b][r][l] = bin_mdef_pid2ssid(mdef, p); + } + } + + + /* Populate rdiph_rc */ + l = dict_second_last_phone(dict, w); + b = dict_last_phone(dict, w); + if (bitvec_is_clear(rdiph, b * mdef->n_ciphone + l)) { + /* Mark this diphone as done */ + bitvec_set(rdiph, b * mdef->n_ciphone + l); + + for (r = 0; r < bin_mdef_n_ciphone(mdef); r++) { + p = bin_mdef_phone_id_nearest(mdef, (s3cipid_t) b, + (s3cipid_t) l, (s3cipid_t) r, + WORD_POSN_END); + rdiph_rc[b][l][r] = bin_mdef_pid2ssid(mdef, p); + } + } + } + else if (pronlen == 1) { + b = dict_pron(dict, w, 0); + E_DEBUG(1,("Building tables for single phone word %s phone %d = %s\n", + 
dict_wordstr(dict, w), b, bin_mdef_ciphone_str(mdef, b))); + /* Populate lrdiph_rc (and also ldiph_lc, rdiph_rc if needed) */ + if (bitvec_is_clear(single, b)) { + populate_lrdiph(dict2pid, rdiph_rc, b); + bitvec_set(single, b); + } + } + } + + bitvec_free(ldiph); + bitvec_free(rdiph); + bitvec_free(single); + + /* Try to compress rdiph_rc into rdiph_rc_compressed */ + compress_right_context_tree(dict2pid, rdiph_rc); + compress_left_right_context_tree(dict2pid); + + ckd_free_3d(rdiph_rc); + + dict2pid_report(dict2pid); + return dict2pid; +} + +dict2pid_t * +dict2pid_retain(dict2pid_t *d2p) +{ + ++d2p->refcount; + return d2p; +} + +int +dict2pid_free(dict2pid_t * d2p) +{ + if (d2p == NULL) + return 0; + if (--d2p->refcount > 0) + return d2p->refcount; + + if (d2p->ldiph_lc) + ckd_free_3d((void ***) d2p->ldiph_lc); + + if (d2p->lrdiph_rc) + ckd_free_3d((void ***) d2p->lrdiph_rc); + + if (d2p->rssid) + free_compress_map(d2p->rssid, bin_mdef_n_ciphone(d2p->mdef)); + + if (d2p->lrssid) + free_compress_map(d2p->lrssid, bin_mdef_n_ciphone(d2p->mdef)); + + bin_mdef_free(d2p->mdef); + dict_free(d2p->dict); + ckd_free(d2p); + return 0; +} + +void +dict2pid_report(dict2pid_t * d2p) +{ +} + +void +dict2pid_dump(FILE * fp, dict2pid_t * d2p) +{ + int32 w, p, pronlen; + int32 i, j, b, l, r; + bin_mdef_t *mdef = d2p->mdef; + dict_t *dict = d2p->dict; + + fprintf(fp, "# INTERNAL (wd comssid ssid ssid ... 
ssid comssid)\n"); + for (w = 0; w < dict_size(dict); w++) { + fprintf(fp, "%30s ", dict_wordstr(dict, w)); + + pronlen = dict_pronlen(dict, w); + for (p = 0; p < pronlen; p++) + fprintf(fp, " %5d", dict2pid_internal(d2p, w, p)); + fprintf(fp, "\n"); + } + fprintf(fp, "#\n"); + + fprintf(fp, "# LDIPH_LC (b r l ssid)\n"); + for (b = 0; b < bin_mdef_n_ciphone(mdef); b++) { + for (r = 0; r < bin_mdef_n_ciphone(mdef); r++) { + for (l = 0; l < bin_mdef_n_ciphone(mdef); l++) { + if (IS_S3SSID(d2p->ldiph_lc[b][r][l])) + fprintf(fp, "%6s %6s %6s %5d\n", bin_mdef_ciphone_str(mdef, (s3cipid_t) b), bin_mdef_ciphone_str(mdef, (s3cipid_t) r), bin_mdef_ciphone_str(mdef, (s3cipid_t) l), d2p->ldiph_lc[b][r][l]); /* RAH, ldiph_lc is returning an int32, %d expects an int16 */ + } + } + } + fprintf(fp, "#\n"); + + fprintf(fp, "# SSEQ %d (senid senid ...)\n", mdef->n_sseq); + for (i = 0; i < mdef->n_sseq; i++) { + fprintf(fp, "%5d ", i); + for (j = 0; j < bin_mdef_n_emit_state(mdef); j++) + fprintf(fp, " %5d", mdef->sseq[i][j]); + fprintf(fp, "\n"); + } + fprintf(fp, "#\n"); + fprintf(fp, "# END\n"); + + fflush(fp); +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/dict2pid.h @@ -0,0 +1,180 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2014 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ + +#ifndef _S3_DICT2PID_H_ +#define _S3_DICT2PID_H_ + +/* System headers. */ +#include <stdio.h> + +/* SphinxBase headers. */ +#include <sphinxbase/logmath.h> +#include <sphinxbase/bitvec.h> + +/* Local headers. */ +#include "s3types.h" +#include "bin_mdef.h" +#include "dict.h" + +/** \file dict2pid.h + * \brief Building triphones for a dictionary. + * + * This is one of the more complicated parts of a cross-word + * triphone model decoder. The first and last phones of each word + * get their left and right contexts, respectively, from other + * words. For single-phone words, both its contexts are from other + * words, simultaneously. As these words are not known beforehand, + * life gets complicated. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \struct xwdssid_t + * \brief cross word triphone model structure + */ + +typedef struct { + s3ssid_t *ssid; /**< Senone Sequence ID list for all context ciphones */ + s3cipid_t *cimap; /**< Index into ssid[] above for each ci phone */ + int32 n_ssid; /**< #Unique ssid in above, compressed ssid list */ +} xwdssid_t; + +/** + \struct dict2pid_t + \brief Building composite triphone (as well as word internal triphones) with the dictionary. +*/ + +typedef struct { + int refcount; + + bin_mdef_t *mdef; /**< Model definition, used to generate + internal ssids on the fly. */ + dict_t *dict; /**< Dictionary this table refers to. */ + + /*Notice the order of the arguments */ + /* FIXME: This is crying out for compression - in Mandarin we have + * 180 context independent phones, which makes this an 11MB + * array. */ + s3ssid_t ***ldiph_lc; /**< For multi-phone words, [base][rc][lc] -> ssid; filled out for + word-initial base x rc combinations in current vocabulary */ + + + xwdssid_t **rssid; /**< Right context state sequence id table + First dimension: base phone, + Second dimension: left context. 
+ */ + + + s3ssid_t ***lrdiph_rc; /**< For single-phone words, [base][lc][rc] -> ssid; filled out for + single-phone base x lc combinations in current vocabulary */ + + xwdssid_t **lrssid; /**< Left-Right context state sequence id table + First dimension: base phone, + Second dimension: left context. + */ +} dict2pid_t; + +/** Access macros; not designed for arbitrary use */ +#define dict2pid_rssid(d,ci,lc) (&(d)->rssid[ci][lc]) +#define dict2pid_ldiph_lc(d,b,r,l) ((d)->ldiph_lc[b][r][l]) +#define dict2pid_lrdiph_rc(d,b,l,r) ((d)->lrdiph_rc[b][l][r]) + +/** + * Build the dict2pid structure for the given model/dictionary + */ +dict2pid_t *dict2pid_build(bin_mdef_t *mdef, /**< A model definition*/ + dict_t *dict /**< An initialized dictionary */ + ); + +/** + * Retain a pointer to dict2pid + */ +dict2pid_t *dict2pid_retain(dict2pid_t *d2p); + +/** + * Free the memory dict2pid structure + */ +int dict2pid_free(dict2pid_t *d2p /**< In: the d2p */ + ); + +/** + * Return the senone sequence ID for the given word position. + */ +s3ssid_t dict2pid_internal(dict2pid_t *d2p, + int32 wid, + int pos); + +/** + * Add a word to the dict2pid structure (after adding it to dict). + */ +int dict2pid_add_word(dict2pid_t *d2p, + int32 wid); + +/** + * For debugging + */ +void dict2pid_dump(FILE *fp, /**< In: a file pointer */ + dict2pid_t *d2p /**< In: a dict2pid_t structure */ + ); + +/** Report a dict2pid data structure */ +void dict2pid_report(dict2pid_t *d2p /**< In: a dict2pid_t structure */ + ); + +/** + * Get number of rc + */ +int32 get_rc_nssid(dict2pid_t *d2p, /**< In: a dict2pid */ + s3wid_t w /**< In: a wid */ + ); + +/** + * Get RC map + */ +s3cipid_t* dict2pid_get_rcmap(dict2pid_t *d2p, /**< In: a dict2pid */ + s3wid_t w /**< In: a wid */ + ); + +#ifdef __cplusplus +} +#endif + + +#endif
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/fsg_history.c @@ -0,0 +1,317 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* + * fsg_history.c -- FSG Viterbi decode history + * + * ********************************************** + * CMU ARPA Speech Project + * + * Copyright (c) 1999 Carnegie Mellon University. + * ALL RIGHTS RESERVED. 
+ * ********************************************** + * + * HISTORY + * + * 25-Feb-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University + * Started.. + */ + +/* System headers. */ +#include <assert.h> + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/err.h> +#include <sphinxbase/ckd_alloc.h> + +/* Local headers. */ +#include "fsg_search_internal.h" +#include "fsg_history.h" + + +#define __FSG_DBG__ 0 + + +fsg_history_t * +fsg_history_init(fsg_model_t * fsg, dict_t *dict) +{ + fsg_history_t *h; + + h = (fsg_history_t *) ckd_calloc(1, sizeof(fsg_history_t)); + h->fsg = fsg; + h->entries = blkarray_list_init(); + + if (fsg && dict) { + h->n_ciphone = bin_mdef_n_ciphone(dict->mdef); + h->frame_entries = + (glist_t **) ckd_calloc_2d(fsg_model_n_state(fsg), + bin_mdef_n_ciphone(dict->mdef), + sizeof(**h->frame_entries)); + } + else { + h->frame_entries = NULL; + } + + return h; +} + +void +fsg_history_free(fsg_history_t *h) +{ + int32 s, lc, ns, np; + gnode_t *gn; + + if (h->fsg) { + ns = fsg_model_n_state(h->fsg); + np = h->n_ciphone; + + for (s = 0; s < ns; s++) { + for (lc = 0; lc < np; lc++) { + for (gn = h->frame_entries[s][lc]; gn; gn = gnode_next(gn)) { + ckd_free(gnode_ptr(gn)); + } + glist_free(h->frame_entries[s][lc]); + } + } + } + ckd_free_2d(h->frame_entries); + blkarray_list_free(h->entries); + ckd_free(h); +} + + +void +fsg_history_set_fsg(fsg_history_t *h, fsg_model_t *fsg, dict_t *dict) +{ + if (blkarray_list_n_valid(h->entries) != 0) { + E_WARN("Switching FSG while history not empty; history cleared\n"); + blkarray_list_reset(h->entries); + } + + if (h->frame_entries) + ckd_free_2d((void **) h->frame_entries); + h->frame_entries = NULL; + h->fsg = fsg; + + if (fsg && dict) { + h->n_ciphone = bin_mdef_n_ciphone(dict->mdef); + h->frame_entries = + (glist_t **) ckd_calloc_2d(fsg_model_n_state(fsg), + bin_mdef_n_ciphone(dict->mdef), + sizeof(glist_t)); + } +} + + +void +fsg_history_entry_add(fsg_history_t * 
h, + fsg_link_t * link, + int32 frame, int32 score, int32 pred, + int32 lc, fsg_pnode_ctxt_t rc) +{ + fsg_hist_entry_t *entry, *new_entry; + int32 s; + gnode_t *gn, *prev_gn; + + /* Skip the optimization for the initial dummy entries; always enter them */ + if (frame < 0) { + new_entry = + (fsg_hist_entry_t *) ckd_calloc(1, sizeof(fsg_hist_entry_t)); + new_entry->fsglink = link; + new_entry->frame = frame; + new_entry->score = score; + new_entry->pred = pred; + new_entry->lc = lc; + new_entry->rc = rc; + + blkarray_list_append(h->entries, (void *) new_entry); + return; + } + + s = fsg_link_to_state(link); + + /* Locate where this entry should be inserted in frame_entries[s][lc] */ + prev_gn = NULL; + for (gn = h->frame_entries[s][lc]; gn; gn = gnode_next(gn)) { + entry = (fsg_hist_entry_t *) gnode_ptr(gn); + + if (score BETTER_THAN entry->score) + break; /* Found where to insert new entry */ + + /* Existing entry score not worse than new score */ + if (FSG_PNODE_CTXT_SUB(&rc, &(entry->rc)) == 0) + return; /* rc set reduced to 0; new entry can be ignored */ + + prev_gn = gn; + } + + /* Create new entry after prev_gn (if prev_gn is NULL, at head) */ + new_entry = + (fsg_hist_entry_t *) ckd_calloc(1, sizeof(fsg_hist_entry_t)); + new_entry->fsglink = link; + new_entry->frame = frame; + new_entry->score = score; + new_entry->pred = pred; + new_entry->lc = lc; + new_entry->rc = rc; /* Note: rc set must be non-empty at this point */ + + if (!prev_gn) { + h->frame_entries[s][lc] = glist_add_ptr(h->frame_entries[s][lc], + (void *) new_entry); + prev_gn = h->frame_entries[s][lc]; + } + else + prev_gn = glist_insert_ptr(prev_gn, (void *) new_entry); + + /* + * Update the rc set of all the remaining entries in the list. At this + * point, gn is the entry, if any, immediately following new entry. 
+ */ + while (gn) { + entry = (fsg_hist_entry_t *) gnode_ptr(gn); + + if (FSG_PNODE_CTXT_SUB(&(entry->rc), &rc) == 0) { + /* rc set of entry reduced to 0; can prune this entry */ + ckd_free((void *) entry); + gn = gnode_free(gn, prev_gn); + } + else { + prev_gn = gn; + gn = gnode_next(gn); + } + } +} + + +/* + * Transfer the surviving history entries for this frame into the permanent + * history table. + */ +void +fsg_history_end_frame(fsg_history_t * h) +{ + int32 s, lc, ns, np; + gnode_t *gn; + fsg_hist_entry_t *entry; + + ns = fsg_model_n_state(h->fsg); + np = h->n_ciphone; + + for (s = 0; s < ns; s++) { + for (lc = 0; lc < np; lc++) { + for (gn = h->frame_entries[s][lc]; gn; gn = gnode_next(gn)) { + entry = (fsg_hist_entry_t *) gnode_ptr(gn); + blkarray_list_append(h->entries, (void *) entry); + } + + glist_free(h->frame_entries[s][lc]); + h->frame_entries[s][lc] = NULL; + } + } +} + + +fsg_hist_entry_t * +fsg_history_entry_get(fsg_history_t * h, int32 id) +{ + return ((fsg_hist_entry_t *) blkarray_list_get(h->entries, id)); +} + + +void +fsg_history_reset(fsg_history_t * h) +{ + blkarray_list_reset(h->entries); +} + + +int32 +fsg_history_n_entries(fsg_history_t * h) +{ + return (blkarray_list_n_valid(h->entries)); +} + +void +fsg_history_utt_start(fsg_history_t * h) +{ + int32 s, lc, ns, np; + + assert(blkarray_list_n_valid(h->entries) == 0); + assert(h->frame_entries); + + ns = fsg_model_n_state(h->fsg); + np = h->n_ciphone; + + for (s = 0; s < ns; s++) { + for (lc = 0; lc < np; lc++) { + assert(h->frame_entries[s][lc] == NULL); + } + } +} + +void +fsg_history_utt_end(fsg_history_t * h) +{ +} + +void +fsg_history_print(fsg_history_t *h, dict_t *dict) +{ + int bpidx, bp; + + for (bpidx = 0; bpidx < blkarray_list_n_valid(h->entries); bpidx++) { + bp = bpidx; + printf("History entry: "); + while (bp > 0) { + fsg_hist_entry_t *hist_entry = fsg_history_entry_get(h, bp); + fsg_link_t *fl = fsg_hist_entry_fsglink(hist_entry); + char const *baseword; + int32 wid; + 
bp = fsg_hist_entry_pred(hist_entry); + wid = fsg_link_wid(fl); + + if (fl == NULL) + continue; + + baseword = fsg_model_word_str(h->fsg, wid); + + printf("%s(%d->%d:%d) ", baseword, + fsg_link_from_state(hist_entry->fsglink), + fsg_link_to_state(hist_entry->fsglink), + hist_entry->frame); + } + printf("\n"); + } +}
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/fsg_history.h @@ -0,0 +1,215 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * fsg_history.h -- FSG Viterbi decode history + * + * ********************************************** + * CMU ARPA Speech Project + * + * Copyright (c) 1999 Carnegie Mellon University. + * ALL RIGHTS RESERVED. 
+ * ********************************************** + * + * HISTORY + * + * $Log: fsg_history.h,v $ + * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins + * re-importation + * + * Revision 1.1 2004/07/16 00:57:12 egouvea + * Added Ravi's implementation of FSG support. + * + * Revision 1.7 2004/07/07 22:30:35 rkm + * *** empty log message *** + * + * Revision 1.6 2004/07/07 13:56:33 rkm + * Added reporting of (acoustic score - best senone score)/frame + * + * Revision 1.5 2004/06/25 14:49:08 rkm + * Optimized size of history table and speed of word transitions by maintaining only best scoring word exits at each state + * + * Revision 1.4 2004/06/23 20:32:16 rkm + * *** empty log message *** + * + * Revision 1.3 2004/05/27 15:16:08 rkm + * *** empty log message *** + * + * + * 25-Feb-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University + * Started, based on S3.3 version. + */ + + +#ifndef __S2_FSG_HISTORY_H__ +#define __S2_FSG_HISTORY_H__ + + +/* SphinxBase headers. */ +#include <sphinxbase/prim_type.h> +#include <sphinxbase/fsg_model.h> + +/* Local headers. */ +#include "blkarray_list.h" +#include "fsg_lextree.h" +#include "dict.h" + +/* + * The Viterbi history structure. This is a tree, with the root at the + * FSG start state, at frame 0, with a null predecessor. 
+ */ + +/* + * A single Viterbi history entry + */ +typedef struct fsg_hist_entry_s { + fsg_link_t *fsglink; /* Link taken, resulting in this entry */ + int32 score; /* Total path score at the end of this + transition */ + int32 pred; /* Predecessor entry; -1 if none */ + frame_idx_t frame; /* Ending frame for this entry */ + int16 lc; /* Left context provided by this entry to + succeeding words */ + fsg_pnode_ctxt_t rc; /* Possible right contexts to which this entry + applies */ +} fsg_hist_entry_t; + +/* Access macros; v is a pointer to an fsg_hist_entry_t */ +#define fsg_hist_entry_fsglink(v) ((v)->fsglink) +#define fsg_hist_entry_frame(v) ((v)->frame) +#define fsg_hist_entry_score(v) ((v)->score) +#define fsg_hist_entry_pred(v) ((v)->pred) +#define fsg_hist_entry_lc(v) ((v)->lc) +#define fsg_hist_entry_rc(v) ((v)->rc) + + +/* + * The entire tree of history entries (fsg_history_t.entries). + * Optimization: In a given frame, there may be several history entries, with + * the same left and right phonetic context, terminating in a particular state. + * Only the best scoring one of these needs to be saved, since everything else + * will be pruned according to the Viterbi algorithm. frame_entries is used + * temporarily in each frame to determine these best scoring entries in that + * frame. Only the ones not pruned are transferred to entries at the end of + * the frame. However, null transitions are a problem since they create + * entries that depend on entries created in the CURRENT frame. Hence, this + * pruning is done in two stages: first for the non-null transitions, and then + * for the null transitions alone. (This solution is sub-optimal, and can be + * improved with a little more work. SMOP.) + * Why is frame_entries a list? Each entry has a unique terminating state, + * and has a unique lc CIphone. But it has a SET of rc CIphones. + * frame_entries[s][lc] is an ordered list of entries created in the current + * frame, terminating in state s, and with left context lc. 
The list is in + * descending order of path score. When a new entry with (s,lc) arrives, + * its position in the list is determined. Then its rc set is modified by + * subtracting the union of the rc's of all its predecessors (i.e., better + * scoring entries). If the resulting rc set is empty, the entry is discarded. + * Otherwise, it is inserted, and the rc sets of all downstream entries in the + * list are updated by subtracting the new entry's rc. If any of them becomes + * empty, it is also discarded. + * As mentioned earlier, this procedure is applied in two stages, for the + * non-null transitions, and the null transitions, separately. + */ +typedef struct fsg_history_s { + fsg_model_t *fsg; /* The FSG for which this object applies */ + blkarray_list_t *entries; /* A list of history table entries; the root + entry is the first element of the list */ + glist_t **frame_entries; /* Per-frame temporary entry lists, indexed [state][lc]; allocated only when both an FSG and a dictionary are supplied, NULL otherwise */ + int n_ciphone; /* Number of CI phones in the dictionary's model definition; the second dimension of frame_entries */ +} fsg_history_t; + + +/* + * One-time initialization: Allocate and return an initially empty history + * module. The per-frame table (frame_entries) is allocated only if both + * fsg and dict are non-NULL. + */ +fsg_history_t *fsg_history_init(fsg_model_t *fsg, dict_t *dict); + +void fsg_history_utt_start(fsg_history_t *h); /* Sanity check only: asserts that the history table and every per-frame list are empty */ + +void fsg_history_utt_end(fsg_history_t *h); /* Currently a no-op */ + + +/* + * Create a history entry recording the completion of the given FSG + * transition, at the end of the given frame, with the given score, and + * the given predecessor history entry. + * The entry is initially temporary, and may be superseded by another + * with a higher score. The surviving entries must be transferred to + * the main history table, via fsg_history_end_frame(). + * Exception: entries with frame < 0 are appended to the main history + * table immediately, bypassing the per-frame lists. + */ +void fsg_history_entry_add (fsg_history_t *h, + fsg_link_t *l, /* FSG transition */ + int32 frame, + int32 score, + int32 pred, + int32 lc, + fsg_pnode_ctxt_t rc); + +/* + * Transfer the surviving history entries for this frame into the permanent + * history table. This function can be called several times during a frame. 
+ * Each time, the entries surviving so far are transferred, and the temporary + * lists cleared. This feature is used to handle the entries due to non-null + * transitions and null transitions separately. + */ +void fsg_history_end_frame (fsg_history_t *h); + + +/* Clear the history table */ +void fsg_history_reset (fsg_history_t *h); + + +/* Return the number of valid entries in the given history table */ +int32 fsg_history_n_entries (fsg_history_t *h); + +/* + * Return a ptr to the history entry for the given ID; NULL if there is no + * such entry. + */ +fsg_hist_entry_t *fsg_history_entry_get(fsg_history_t *h, int32 id); + + +/* + * Switch the FSG associated with the given history module. Should be done + * when the history list is empty. If not empty, a warning is logged and + * the list is cleared. The per-frame table is freed and, if both fsg and + * dict are non-NULL, reallocated for the new FSG. + */ +void fsg_history_set_fsg (fsg_history_t *h, fsg_model_t *fsg, dict_t *dict); + +/* Free the given Viterbi search history object */ +void fsg_history_free (fsg_history_t *h); + +/* Print the entire history to stdout (the dict argument is currently unused) */ +void fsg_history_print(fsg_history_t *h, dict_t *dict); + +#endif
new file mode 100644 --- /dev/null +++ b/media/pocketsphinx/src/fsg_lextree.c @@ -0,0 +1,835 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2010 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/** + * @file fsg_lextree.c + * @brief The collection of all the lextrees for the entire FSM. + * @author M K Ravishankar <rkm@cs.cmu.edu> + * @author Bhiksha Raj <bhiksha@cs.cmu.edu> + */ + +/* System headers. 
*/ +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* SphinxBase headers. */ +#include <sphinxbase/ckd_alloc.h> +#include <sphinxbase/err.h> + +/* Local headers. */ +#include "fsg_lextree.h" + +#define __FSG_DBG__ 0 + +/* A linklist structure that is actually used to build local lextrees at grammar nodes */ +typedef struct fsg_glist_linklist_t { + int32 ci, rc; + glist_t glist; + struct fsg_glist_linklist_t *next