/usr/include/sphinxbase/cont_ad.h is in libsphinxbase-dev 0.4.1-0ubuntu4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2001 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* cont_ad.h -- Continuous A/D listening and silence filtering module.
*
* **********************************************
* CMU ARPA Speech Project
*
* Copyright (c) 1996 Carnegie Mellon University.
* ALL RIGHTS RESERVED.
* **********************************************
*
* HISTORY
*
* 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Added spf and adbufsize to cont_ad_t in order to support variable
* frame sizes depending on audio sampling rate.
*
* 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Added FILE* argument to cont_ad_powhist_dump().
*
* 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
* Changed to use dB instead of the weird power measure.
* Added most system parameters to cont_ad_t instead of hardwiring
* them in cont_ad.c.
* Added cont_ad_set_params() and cont_ad_get_params().
*
* 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Added cont_ad_t.siglvl.
*
* 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Added the option for cont_ad_read to return -1 on EOF.
*
* 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Added cont_ad_set_thresh().
*
* 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Separated thresholds for speech and silence.
*
* 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
* Created, based loosely on Steve Reed's original implementation.
*/
#ifndef _CONT_AD_H_
#define _CONT_AD_H_
/* Win32/WinCE DLL gunk */
#include <sphinxbase_export.h>
#include <prim_type.h>
/**
* \file cont_ad.h
* \brief Continuous A/D listening and silence filtering module.
*
* This module is intended to be interposed as a filter between any
* raw A/D source and the application to remove silence regions. Its
* main purpose is to remove regions of silence from the raw input
* speech. It is initialized with a raw A/D source function (during
* the cont_ad_init call). The application is responsible for setting
* up the A/D source, turning recording on and off as it desires.
* Filtered A/D data can be read by the application using the
* cont_ad_read function.
*
* In other words, the application calls cont_ad_read instead of the
* raw A/D source function (e.g., ad_read in libad) to obtain filtered
* A/D data with silence regions removed. This module itself does not
* enforce any other structural changes to the application.
*
* The cont_ad_read function also updates an "absolute" timestamp (see
* cont_ad_t.read_ts) at the end of each invocation. The timestamp
* indicates the total number of samples of A/D data read until this
* point, including data discarded as silence frames. The application
* is responsible for using this timestamp to make any policy
* decisions regarding utterance boundaries or whatever.
*/
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
#if 0
/* Fool Emacs. */
}
#endif
/**
 * States of the continuous listening module. These are the values stored
 * in cont_ad_t.state and cont_ad_t.tail_state.
 */
/** The module is in the SILENCE state. */
#define CONT_AD_STATE_SIL 0
/** The module is in the SPEECH (non-silence) state. */
#define CONT_AD_STATE_SPEECH 1
/**
 * \struct spseg_t
 * \brief (FOR INTERNAL USE) One speech (non-silence) segment that has not
 * yet been consumed by the application.
 *
 * Segments are chained through the next pointer; cont_ad_t.spseg_head and
 * cont_ad_t.spseg_tail point at the first and last unconsumed segments.
 */
typedef struct spseg_s {
int32 startfrm; /**< Frame-id in cont_ad_t.adbuf of the start of this segment */
int32 nfrm; /**< Number of frames in this segment (may wrap around adbuf) */
struct spseg_s *next; /**< Next speech segment (with some intervening silence) */
} spseg_t;
/**
 * \struct cont_ad_t
 * \brief Continuous listening module or object.
 *
 * An application can open and maintain several such objects, if necessary.
 *
 * FYI: The module is always in one of two states: SILENCE or SPEECH.
 * Transitions between the two are detected by sliding a window spanning
 * several frames and looking for some minimum number of frames of the
 * other type.
 *
 * Client applications may READ the state, read_ts, seglen and siglvl
 * fields; every other field is internal and should be left alone.
 *
 * NOTE(review): ad_rec_t is used below but is not declared by any header
 * that this file visibly includes; presumably callers include ad.h
 * first -- confirm.
 */
typedef struct {
/** Function to be called for obtaining A/D data (see prototype for ad_read in ad.h) */
int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max);
ad_rec_t *ad; /**< A/D device argument for adfunc. Also, ad->sps used to
                   determine frame size (spf, see below).
                   NOTE(review): the sps field below is described as having
                   been moved out of ad->sps; confirm which of the two is
                   actually consulted. */
int32 rawmode; /**< Pass all input data through, without filtering silence */
int16 *adbuf; /**< Circular buffer for maintaining A/D data read until consumed */
/* **************************************************************************
 * state, read_ts, seglen and siglvl are provided for READ-ONLY use by client
 * applications, and are updated by calls to cont_ad_read() (see below). All
 * other variables should be left alone.
 */
int32 state; /**< State of data returned by most recent cont_ad_read call;
                  CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. */
int32 read_ts; /**< Absolute timestamp (total no. of raw samples consumed
                    up to the most recent cont_ad_read call, starting from
                    the very beginning). Note that this is a 32-bit
                    integer; applications should guard against overflow. */
int32 seglen; /**< Total no. of raw samples consumed in the segment
                   returned by the most recent cont_ad_read call. Can be
                   used to detect silence segments that have stretched long
                   enough to terminate an utterance. */
int32 siglvl; /**< Max signal level for the data consumed by the most recent
                   cont_ad_read call (dB range: 0-99). Can be used to
                   update a V-U meter, for example. */
/* ************************************************************************ */
int32 sps; /**< Samples/sec; moved from ad->sps to break dependence on
                ad by N. Roy. */
int32 eof; /**< Whether the source ad device has encountered EOF */
int32 spf; /**< Samples/frame; audio level is analyzed within frames */
int32 adbufsize; /**< Buffer size (number of samples) */
int32 prev_sample; /**< Previous sample, kept for the pre-emphasis filter */
int32 headfrm; /**< Frame number in adbuf with unconsumed A/D data */
int32 n_frm; /**< Number of complete frames of unconsumed A/D data in adbuf */
int32 n_sample; /**< Number of samples of unconsumed data in adbuf */
int32 tot_frm; /**< Total number of frames of A/D data read, including consumed ones */
int32 noise_level; /**< PWP: what we claim as the "current" noise level */
int32 *pow_hist; /**< Histogram of frame power, moving window, decayed */
char *frm_pow; /**< Frame power */
int32 auto_thresh; /**< Do automatic threshold adjustment or not */
int32 delta_sil; /**< Max silence power/frame ABOVE noise level */
int32 delta_speech; /**< Min speech power/frame ABOVE noise level */
int32 min_noise; /**< Noise lower than this is ignored */
int32 max_noise; /**< Noise higher than this signals an error */
int32 winsize; /**< How many frames to look at for speech detection */
int32 speech_onset; /**< Start speech on >= this many frames out of winsize, of >= delta_speech */
int32 sil_onset; /**< End speech on >= this many frames out of winsize, of <= delta_sil */
int32 leader; /**< Pad beginning of speech with this many extra frames */
int32 trailer; /**< Pad end of speech with this many extra frames */
int32 thresh_speech;/**< Frame considered to be speech if power >= thresh_speech
                         (for transitioning from SILENCE to SPEECH state) */
int32 thresh_sil; /**< Frame considered to be silence if power <= thresh_sil
                       (for transitioning from SPEECH to SILENCE state) */
int32 thresh_update;/**< Number of frames before next update to pow_hist/thresholds */
float32 adapt_rate; /**< Linear interpolation constant for the rate at which the
                         noise level is adapted to each estimate;
                         range: 0-1; 0 => no adaptation, 1 => instant adaptation */
int32 tail_state; /**< State at the end of its internal buffer (internal use):
                       CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. Note: This is
                       different from cont_ad_t.state. */
int32 win_startfrm; /**< Where next analysis window begins */
int32 win_validfrm; /**< Number of frames currently available from win_startfrm for analysis */
int32 n_other; /**< If in SILENCE state, number of frames in analysis window considered to
                    be speech; otherwise number of frames considered to be silence */
spseg_t *spseg_head;/**< First of unconsumed speech segments */
spseg_t *spseg_tail;/**< Last of unconsumed speech segments */
FILE *rawfp; /**< If non-NULL, raw audio input data processed by cont_ad
                  is dumped to this file. Controlled by user application
                  via cont_ad_set_rawfp(). NULL when cont_ad object is
                  initially created. */
FILE *logfp; /**< If non-NULL, write detailed logs of this object's
                  progress to the file. Controlled by user application
                  via cont_ad_set_logfp(). NULL when cont_ad object is
                  initially created. */
int32 n_calib_frame; /**< Number of frames of calibration data seen so far. */
} cont_ad_t;
/**
 * Initialize a continuous listening/silence filtering object.
 *
 * One-time initialization of a continuous listening/silence filtering
 * object/module. This can work in either "stream mode", where it
 * reads data from an audio device represented by
 * <code>ad_rec_t</code>, or in "block mode", where it filters out
 * silence regions from blocks of data passed into it.
 *
 * @param ad The A/D source object (audio device) to be filtered, or NULL
 *           to operate in block mode.
 * @param adfunc The source function invoked to obtain raw A/D data from
 *               <code>ad</code>, or NULL to operate in block mode. This is
 *               usually ad_read(); see ad.h for the required prototype.
 * @return A pointer to a READ-ONLY structure used in other calls to
 *         the object. If any error occurs, the return value is NULL.
 */
SPHINXBASE_EXPORT
cont_ad_t *cont_ad_init (ad_rec_t *ad, /* In: documented as @param ad above */
int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max)
/* In: documented as @param adfunc above */
);
/**
 * Initialize a continuous listening object which simply passes data through (!)
 *
 * Like cont_ad_init, but puts the module in raw mode; i.e., all data is passed
 * through, unfiltered. (By special request.)
 *
 * @param ad The A/D source object, as for cont_ad_init().
 * @param adfunc The function used to read raw A/D data from <code>ad</code>,
 *               as for cont_ad_init().
 * @return Pointer to the new object. NOTE(review): presumably NULL on error,
 *         as for cont_ad_init() -- confirm against the implementation.
 */
SPHINXBASE_EXPORT
cont_ad_t *cont_ad_init_rawmode (ad_rec_t *ad,
int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max));
/**
 * Read raw audio data into the silence filter.
 *
 * The main read routine for reading speech/silence segmented audio data. Audio
 * data is copied into the caller-provided buffer, much like a file read routine.
 *
 * In "block mode", i.e. if NULL was passed as a read function to
 * <code>cont_ad_init</code>, the data in <code>buf</code> is taken as
 * input, and any non-silence data is written back to <code>buf</code>
 * on exit. In this case, you must take care that <code>max</code>
 * does not overflow the internal buffer of the silence filter. The
 * available number of samples can be obtained by calling
 * cont_ad_buffer_space(). Any excess data will be discarded.
 *
 * In normal mode, only speech segments are copied; silence segments are dropped.
 * In rawmode (cont_ad module initialized using cont_ad_init_rawmode()), all data
 * are passed through to the caller. But, in either case, any single call to
 * cont_ad_read will never return data that crosses a speech/silence segment
 * boundary.
 *
 * The following variables are updated for use by the caller (see cont_ad_t above):
 *   cont_ad_t.state,
 *   cont_ad_t.read_ts,
 *   cont_ad_t.seglen,
 *   cont_ad_t.siglvl.
 *
 * @return Number of samples actually read, possibly 0; <0 if EOF on A/D source.
 *         -1 is also returned if <code>max</code> is less than 256.
 */
SPHINXBASE_EXPORT
int32 cont_ad_read (cont_ad_t *r, /**< In: Object pointer returned by cont_ad_init */
int16 *buf, /**< In/Out: In block mode, contains input data.
                 On return, buf contains A/D data returned
                 by this function, if any. */
int32 max /**< In: Maximum number of samples to be filled into buf.
               NOTE: max must be at least 256; otherwise
               the function returns -1. */
);
/**
 * Get the maximum number of samples which can be passed into cont_ad_read().
 *
 * @param r Continuous listening object to query.
 * @return Maximum number of samples which can currently be passed into
 *         cont_ad_read().
 */
SPHINXBASE_EXPORT
int32 cont_ad_buffer_space(cont_ad_t *r);
/**
 * Calibrate the silence filter.
 *
 * Calibration to determine an initial silence threshold. This function can be called
 * any number of times. It should be called at least once immediately after cont_ad_init.
 * The silence threshold is also updated internally once in a while, so this function
 * only needs to be called in the middle if there is a definite change in the recording
 * environment.
 *
 * The application is responsible for making sure that the raw audio source is turned on
 * before the calibration.
 *
 * @return 0 if successful, <0 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_calib (cont_ad_t *cont /**< In: object pointer returned by cont_ad_init */
);
/**
 * Calibrate the silence filter without an audio device.
 *
 * If the application has not passed an audio device into the silence filter
 * at initialisation, this routine can be used to calibrate the filter. The
 * data passed in is assumed to be completely consumed. More than one call
 * may be necessary to fully calibrate.
 *
 * @param r Continuous listening object to calibrate.
 * @param buf Audio data to use for calibration.
 * @param max Length of <code>buf</code>, in samples.
 * @return 0 if successful, <0 on failure, >0 if calibration is not yet
 *         complete.
 */
SPHINXBASE_EXPORT
int32 cont_ad_calib_loop (cont_ad_t *r, int16 *buf, int32 max);
/**
 * Get the number of samples required to calibrate the silence filter.
 *
 * Since the calibration data passed to cont_ad_calib_loop() is assumed to
 * be fully consumed, it may be desirable to "hold onto" this data in
 * case it contains useful speech. This function returns the number
 * of samples required to calibrate the silence filter, which is
 * useful in allocating a buffer to store this data.
 *
 * @param r Continuous listening object to query.
 * @return Number of samples required for successful calibration.
 */
SPHINXBASE_EXPORT
int32 cont_ad_calib_size(cont_ad_t *r);
/**
 * Set silence and speech threshold parameters.
 *
 * The silence threshold is the max power level, RELATIVE to the peak
 * background noise level, in any silence frame. Similarly, the speech
 * threshold is the min power level, RELATIVE to the peak background noise
 * level, in any speech frame. In general, silence threshold <= speech
 * threshold.
 *
 * Increasing the thresholds (say, from the default value of 2 to 3 or 4)
 * reduces the sensitivity to background noise, but may also increase the
 * chances of clipping actual speech.
 *
 * @return 0 if successful, <0 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_set_thresh (cont_ad_t *cont, /**< In: Object ptr from cont_ad_init */
int32 sil, /**< In: silence threshold (default 2) */
int32 sp /**< In: speech threshold (default 2) */
);
/**
 * Set the changeable parameters.
 *
 * delta_sil, delta_speech, min_noise, and max_noise are in dB;
 * winsize, speech_onset, sil_onset, leader and trailer are in frames of
 * 16 ms length (256 samples @ 16kHz sampling). Each argument corresponds
 * to the cont_ad_t field of the same name.
 *
 * @param r Continuous listening object to modify.
 * @return NOTE(review): the meaning of the return value is not documented
 *         in this header; presumably 0 if successful, <0 otherwise, as for
 *         the other setters -- confirm against the implementation.
 */
SPHINXBASE_EXPORT
int32 cont_ad_set_params (cont_ad_t *r, int32 delta_sil, int32 delta_speech,
int32 min_noise, int32 max_noise,
int32 winsize, int32 speech_onset, int32 sil_onset,
int32 leader, int32 trailer,
float32 adapt_rate);
/**
 * PWP 1/14/98 -- get the changeable parameters.
 *
 * delta_sil, delta_speech, min_noise, and max_noise are in dB;
 * winsize, speech_onset, sil_onset, leader and trailer are in frames of
 * 16 ms length (256 samples @ 16kHz sampling). Each pointer argument is
 * named after the cont_ad_t field it reports.
 *
 * @param r Continuous listening object to query.
 * @return NOTE(review): the meaning of the return value is not documented
 *         in this header; presumably 0 if successful, <0 otherwise --
 *         confirm against the implementation.
 */
SPHINXBASE_EXPORT
int32 cont_ad_get_params (cont_ad_t *r, int32 *delta_sil, int32 *delta_speech,
int32 *min_noise, int32 *max_noise,
int32 *winsize, int32 *speech_onset, int32 *sil_onset,
int32 *leader, int32 *trailer,
float32 *adapt_rate);
/**
 * Reset, discarding any accumulated speech segments.
 *
 * @param cont Object pointer from cont_ad_init.
 * @return 0 if successful, <0 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_reset (cont_ad_t *cont); /* In: Object pointer from cont_ad_init */
/**
 * Close the continuous listening object.
 *
 * @param cont Object pointer from cont_ad_init.
 * @return NOTE(review): the meaning of the return value is not documented
 *         in this header; presumably 0 if successful, <0 otherwise --
 *         confirm against the implementation.
 */
SPHINXBASE_EXPORT
int32 cont_ad_close (cont_ad_t *cont); /* In: Object pointer from cont_ad_init */
/**
 * Dump the power histogram. For debugging...
 *
 * @param fp Presumably the stream the histogram is written to -- confirm
 *           against the implementation.
 * @param cont Continuous listening object whose power histogram
 *             (cont_ad_t.pow_hist) is dumped.
 */
SPHINXBASE_EXPORT
void cont_ad_powhist_dump (FILE *fp, cont_ad_t *cont);
/**
 * Detach the given continuous listening module from the associated audio device.
 *
 * @param c Continuous listening module to detach.
 * @return 0 if successful, -1 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_detach (cont_ad_t *c);
/**
 * Attach the continuous listening module to the given audio device/function.
 * (Like cont_ad_init, but without the calibration.)
 *
 * @param c Continuous listening module to attach.
 * @param a Audio device to attach to.
 * @param func Function used to read audio from <code>a</code>; it has the
 *             same signature as the adfunc argument of cont_ad_init().
 * @return 0 if successful, -1 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_attach (cont_ad_t *c, ad_rec_t *a, int32 (*func)(ad_rec_t *, int16 *, int32));
/**
 * Set a file for dumping raw audio input.
 *
 * The application can ask cont_ad to dump the raw audio input that cont_ad
 * processes to a file. Use this function to give the FILE* to the cont_ad
 * object. If invoked with fp == NULL, dumping is turned off. The application
 * is responsible for opening and closing the file. If fp is non-NULL, cont_ad
 * assumes the file pointer is valid and opened for writing.
 *
 * @param c The cont_ad object being addressed.
 * @param fp File to which raw audio data is to be dumped; NULL to stop
 *           dumping.
 * @return 0 if successful, -1 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_set_rawfp (cont_ad_t *c, /* The cont_ad object being addressed */
FILE *fp); /* File to which raw audio data is to
              be dumped; NULL to stop dumping. */
/**
 * Set the file to which cont_ad logs its progress.
 *
 * Mainly for debugging. If <code>fp</code> is NULL, logging is turned off.
 *
 * @param c The cont_ad object being addressed.
 * @param fp File to which logs are written; NULL to stop logging.
 * @return 0 if successful, -1 otherwise.
 */
SPHINXBASE_EXPORT
int32 cont_ad_set_logfp (cont_ad_t *c, /* The cont_ad object being addressed */
FILE *fp); /* File to which logs are written;
              NULL to stop logging. */
/**
 * Set the silence and speech thresholds.
 *
 * For this to remain permanently in effect, the auto_thresh field of
 * the continuous listening module should be set to FALSE or 0.
 * Otherwise the thresholds may be modified by the noise-level
 * adaptation.
 *
 * NOTE(review): this function lacks the cont_ad_ prefix used by the rest of
 * the API and its signature parallels cont_ad_set_thresh(); how the two
 * differ (and what the return value means) is not visible from this
 * header -- confirm against the implementation.
 *
 * @param r Continuous listening object to modify.
 * @param silence Silence threshold.
 * @param speech Speech threshold.
 */
SPHINXBASE_EXPORT
int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech);
#ifdef __cplusplus
}
#endif
#endif
|