This file is indexed.

/usr/include/sphinx3/lm.h is in libs3decoder-dev 0.8-0ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced 
 * Research Projects Agency and the National Science Foundation of the 
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * lm.h - Disk/memory based word-trigram backoff LM
 *
 * **********************************************
 * CMU ARPA Speech Project
 *
 * Copyright (c) 1997 Carnegie Mellon University.
 * ALL RIGHTS RESERVED.
 * **********************************************
 *
 * HISTORY
 * $Log: lm.h,v $
 * Revision 1.16  2006/03/02 22:10:36  arthchan2003
 * Add *g_write into the code.
 *
 * Revision 1.15  2006/02/28 22:26:51  egouvea
 * Moved definition of lm_wid() outside of the #if 0/#endif block, so
 * it's declared.
 *
 * Revision 1.14  2006/02/24 13:38:08  arthchan2003
 * Added lm_read, it is a simple version of lm_read_advance.
 *
 * Revision 1.13  2006/02/23 04:16:29  arthchan2003
 * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
 * Splited the original lm.c into five parts,
 * a, lm.c - a controller of other subroutines.
 * b, lm_3g.c - implement TXT-based lm operations
 * c, lm_3g_dmp.c - implement DMP-based lm operations
 * d, lm_attfsm.c - implement FSM-based lm operations
 * e, lmset.c - implement sets of lm.
 *
 * Revision 1.12.4.3  2006/01/16 19:56:37  arthchan2003
 * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format.  This code used Yannick Esteve's and LIUM code.
 *
 * Revision 1.12.4.2  2005/11/17 06:15:22  arthchan2003
 * Added input-encoding and output-encoding into the lm structure.
 *
 * Revision 1.12.4.1  2005/07/13 01:46:22  arthchan2003
 * 1, Fixed dox-doc, 2, Added more documentation for major functions such as lm_read and lm_write.
 *
 * Revision 1.12  2005/06/21 22:24:02  arthchan2003
 * Log. In this change, I introduced a new interface for lm ,which is
 * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
 * same structure and handle LM initialization (lm_init) switching,
 * (lmset_curlm_widx), delete LM (lmset_delete_lm).  The internal
 * structure is called lmarray and is an array of pointers of lm.  The
 * current lm is always maintained and pointed by a pointer called cur_lm
 * . This substantially clarify the structure of the code.  At this
 * check-in, not every core function of lmset is completed.
 * e.g. lmset_add_lm because that required testing of several LM reading
 * routines and could be quite time-consuming.
 *
 * Log. Another notable change is the fact dict2lmwid map is started to
 * be part of the LM. The reason of this is clearly described inside the
 * code. Don't want to repeat here.
 *
 * Log. The new interface has been already used broadly in both Sphinx
 * 3.0 and sphinx 3.x family of tools.
 *
 * Revision 1.5  2005/06/18 03:22:28  archan
 * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
 *
 * Revision 1.4  2005/06/17 23:44:40  archan
 * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend.  2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
 *
 * Revision 1.3  2005/06/13 04:02:59  archan
 * Fixed most doxygen-style documentation under libs3decoder.
 *
 * Revision 1.2  2005/05/10 21:21:54  archan
 * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
 *
 * Revision 1.1  2005/05/04 06:08:07  archan
 * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
 *
 * Revision 1.6  2005/05/04 04:02:24  archan
 * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search.  Not yet tested. Just want to keep up my own momentum.
 *
 * Revision 1.5  2005/04/21 23:50:26  archan
 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
 *
 * Revision 1.4  2005/04/20 03:37:59  archan
 * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
 *
 * Revision 1.3  2005/03/30 01:22:47  archan
 * Fixed mistakes in last updates. Add
 *
 * 
 * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
 *              Adding lm_free() to free allocated memory
 * 
 * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
 * 		Added lm_t.access_type; made lm_wid externally visible.
 * 
 * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
 * 		Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
 * 
 * 13-Feb-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
 *              Created from original S3 version.
 */


#ifndef _S3_LM_H_
#define _S3_LM_H_

#include <stdio.h>

#include <logmath.h>
#include <hash_table.h>
#include <cmd_ln.h>

#ifdef __cplusplus
extern "C" {
#endif
#if 0
} /* Fool Emacs into not indenting things. */
#endif

#define LM_DICTWID_BADMAP	(-16000)	/**< An illegal mapping */
#define LM_CLASSID_BASE		0x01000000	/**< Interpreted as LMclass ID; this base is
						   subtracted by LM_CLASSID_TO_CLASS to
						   index the lmclass array */

/** Upper limit of the words of Sphinx 3.X */
#define LM_LEGACY_CONSTANT      BAD_S3LMWID          /**< =65535 (~65k). This was introduced
							in 1996 when Ravi first wrote Sphinx 3.0
							and has been with us ever since.
						     */

#define LM_SPHINX_CONSTANT      BAD_S3LMWID32      /**< (4 billion), ARCHAN: this was introduced in Sphinx 3.6
						      during the time of Release Candidate I (March 2006). The caveat of using
						      this constant is that it is much harder to detect byte-swapping problems
						      in general. Also, if the world has more than 10000 cities, each of which has
						      1 million road names, we are stuck. I assume this will happen in
						      the year 3001.
						   */


/** Fetch the LM class for class ID i from m: entry (i - LM_CLASSID_BASE) of (m)->lmclass.
    No range check is performed on i. */
#define LM_CLASSID_TO_CLASS(m,i)	((m)->lmclass[(i)-LM_CLASSID_BASE])

#define MIN_PROB_F       (-99.0)  /**< The minimum value of probabilities and
                                   backoff weights. When changing it, note
                                   that both s2 and s3 may transform this
                                   number into a very small integer (the
                                   original note says -2e-31; presumably
                                   about -2^31 is meant), which will easily
                                   cause integer wrap-around.  -99 was
                                   chosen for that reason.
                                */

#define LM_ALLOC_BLOCK      16  /**< The number of LMs to allocate at a time.
                                 */

/**
   Success and error codes.
   Negative values are parenthesized so that they expand safely inside
   any surrounding expression.
 */
#define LM_SUCCESS           1  /**< Constant that indicates an operation succeeded
                                 */
#define LM_FAIL              0  /**< Constant that indicates an operation failed.  */
#define LM_NOT_FOUND        (-1)  /**< Constant which indicates an LM couldn't be
                                   found */
#define LM_OFFSET_TOO_LARGE (-2)  /**< Constant used when the 16 bit LM was
                                   used, but the tgcount is larger than
                                   LM_LEGACY_CONSTANT (65535). This
                                   breaks the addressing scheme in the
                                   current LM.
                                */
#define LM_NO_DATA_MARK     (-3)  /**< When reading a text-based LM,
                                   return this if we see no data
                                   mark  */
#define LM_UNKNOWN_NG       (-4)  /**< When reading the header of an LM, if
                                   there is an unknown K for K-gram */
#define LM_BAD_LM_COUNT     (-5)  /**< When reading an LM, if a count is bad,
                                   return this message */
#define LM_UNKNOWN_WORDS    (-6)  /**< When an unknown word is found
                                   while reading in an LM, return this
                                   message */
#define LM_BAD_BIGRAM       (-7)  /**< A bad bigram: it could be word
                                   ids larger than # of unigrams, it
                                   could be a word id smaller than 0.
                                   It could also be a bigram out of
                                   bounds.
                                */
#define LM_BAD_TRIGRAM      (-8)  /**< A bad trigram: it could be word
                                   ids larger than # of unigrams, it
                                   could be a word id smaller than 0.
                                   It could also be a bigram out of
                                   bounds.
                                */
#define LM_BAD_QUADGRAM     (-9)  /**< (RESERVED BUT NOT USED) A bad
                                   quadgram (4-gram): it could be word
                                   ids larger than # of unigrams, it
                                   could be a word id smaller than 0.
                                   It could also be a bigram out of
                                   bounds.
                                */
#define LM_BAD_QUINGRAM     (-10)  /**< (RESERVED BUT NOT USED) A bad
                                    quingram (5-gram): it could be
                                    word ids larger than # of unigrams,
                                    it could be a word id smaller than
                                    0.  It could also be a bigram out of
                                    bounds.  BTW, there is no need to
                                    remind me that the mixed use of
                                    quadgram and quingram is stupid
                                    English.  I read Manning and
                                    Schultze.
                                 */
#define LM_BAD_NGRAM       (-11)  /**< (RESERVED BUT NOT USED) A bad
                                   n-gram: a generalization of messages
                                   -7 to -10. In our case, we don't
                                   make the message as specific as
                                   possible.
                                 */
#define LM_TOO_MANY_NGRAM  (-12)  /**< When reading an LM, if the number of
                                   n-grams is more than the number
                                   specified in the header, return this
                                   message */
#define LM_NO_MINUS_1GRAM  (-13)  /**< When reading an n-gram, if the
                                   corresponding (n-1)-gram doesn't
                                   exist, return this message. */
#define LM_FILE_NOT_FOUND  (-14)  /**< When the LM file couldn't be found,
                                   return this message */
#define LM_CANNOT_ALLOCATE (-15)  /**< When tables in the LM cannot be
                                   allocated, return this message */

/** Versioning of LM */
#define LMDMP_VERSIONNULL 0   /**< VERSION 0 is the oldest. In the past, we
				 used the version number to
				 store the number of unigrams; you will
				 see logic that says vn > LMDMP_VERSIONNULL
			      */
				 
#define LMDMP_VERSION_TG_16BIT (-1) /**< VERSION 1 (value -1) is the simplest DMP file,
				     which is trigram or lower and uses 16 bits in
				     bigrams and trigrams.*/

#define LMDMP_VERSION_TG_16BIT_V2 (-2) /**< VERSION 2 (value -2) means a legacy VERSION 1 DMP file
					which has log_bg_seg_sz != 9*/

#define LMDMP_VERSION_TG_32BIT (-3) /**< VERSION 3 (value -3) is the 32 bit
				     extension of VERSION 1, in which
				     the bigrams and trigrams are
				     represented by 32 bit data
				     structures */

#define LMTXT_VERSION         1000 /**< VERSION 1000 is the text-based LM */
#define LMFST_VERSION         1001 /**< VERSION 1001 is the FST-based LM */
#define LMFORCED_TXT32VERSION 1002 /**< VERSION 1002 is the internal version of the
                                      text-based LM. The difference between
                                      1002 and 1000 is that 1002 will assume the
                                      LM is 32 bits.  This fact is used in
                                      lm_is32bits(lm)
                                   */


#define NO_WORD	(-1)	/**< Sentinel value -1; presumably marks the absence of a word id (confirm against callers) */

#include "s3types.h"
#include "lmclass.h"
#include "dict.h"

/*
 * ARCHAN 20050503: comment copied from Sphinx 2
 * Bigram probs and bo-wts, and trigram probs are kept in separate tables
 * rather than within the bigram_t and trigram_t structures.  These tables
 * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h).
 * The following tree structure is used to construct these tables of unique
 * values.  Whenever a new value is read from the LM file, the sorted tree
 * structure is searched to see if the value already exists, and inserted
 * if not found.
 */

/** \file lm.h
    \brief Language model 

    This is the header file for language model support in Sphinx 3. 
    Sphinx 3 supports language model in 4 formats. The four formats are

    ARPA format: First appeared in Sphinx 2. It was ported to Sphinx 3 in
    3.X (X=6).

    DMP: Used by Sphinx 3 slow and fast, and later also by Sphinx 3.X
    (X>4).

    DMP32: With this format we start to break the limit of 65535 words.
    This is the first LM file format in Sphinx 3.X that can
    hold 4 billion words in the language model.
     
    FST: In AT&T format, we start to support in 3.X (X=6).

    As of 20060302,
    we can only read and use the ARPA and DMP-based formats in the decoder.
    We can write the ARPA, DMP, DMP32 and FST file formats.
*/
  
/** \union lmlog_t
    \brief Log quantities represented in either floating or integer format 

    Both members share the same storage (this is a union), so only the
    member that was written last holds a meaningful value.
*/
typedef union {
    float32 f; /**< The floating point representation */
    int32 l;   /**< The integer representation */
} lmlog_t;



/** \struct sorted_entry_t
    \brief Single node of the sorted tree of unique values built while reading an LM

    Children are referenced by index into the enclosing entry array
    (see sorted_list_t) rather than by pointer.
*/
  
typedef struct sorted_entry_s {
    lmlog_t val;		/**< Value being kept in this node */
    uint32 lower;	/**< Index of another entry.  All descendants down
			   this path have their val < this node's val.
			   0 => no child exists (0 is the root index) */
    uint32 higher;	/**< Index of another entry.  All descendants down
			   this path have their val > this node's val.
			   0 => no child exists (0 is the root index) */
} sorted_entry_t;

/** \struct sorted_list_t
 *
 * \brief The sorted list used in LM reading.  list is a (64K long) array.  The first entry is the root of the tree and is created during initialization.
 */
typedef struct {
    sorted_entry_t *list; /**< Beginning of the list  */
    int32 free;		/**< First free element in list */
} sorted_list_t;

/** \struct ug_t
 * \brief A unigram structure
 */
typedef struct {
    s3wid_t dictwid;	/**< Dictionary word id, or BAD_S3WID if unknown.  However, the LM
                           module merely sets this field to BAD_S3WID.  It is up to the
                           application to fill in this field (HACK!!), so that this
                           module can be independent of a dictionary. */
    lmlog_t prob;       /**< Unigram probability */
    lmlog_t bowt;       /**< Unigram backoff weight */
    int32 firstbg;	/**< 1st bigram entry on disk */
} ug_t;

/** \struct bg_t
 * \brief A bigram structure (16-bit fields)
 */

typedef struct {
    s3lmwid_t wid;	/**< LM wid (index into lm_t.ug) */
    uint16 probid;      /**< Index into array of actual bigram probs */
    uint16 bowtid;      /**< Index into array of actual bigram backoff wts */
    uint16 firsttg;     /**< 1st trigram entry on disk (see tg_segbase below) */
} bg_t;


/** \struct bg32_t 
 * \brief A 32-bit version of the bigram structure bg_t.
 */
typedef struct {
    s3lmwid32_t wid;	/**< LM wid (index into lm_t.ug) */
    uint32 probid;      /**< Index into array of actual bigram probs */
    uint32 bowtid;      /**< Index into array of actual bigram backoff wts */
    uint32 firsttg;     /**< 1st trigram entry on disk (see tg_segbase below) */
} bg32_t;


/** \struct tg_t
 * \brief A trigram structure (16-bit fields)
 */

typedef struct {
    s3lmwid_t wid;	/**< LM wid (index into lm_t.ug) */
    uint16 probid;      /**< Index into array of actual trigram probs */
} tg_t;


/** \struct tg32_t
 * \brief A 32-bit version of tg_t 
 */

typedef struct {
    s3lmwid32_t wid;	/**< LM wid (index into lm_t.ug) */
    uint32 probid;      /**< Index into array of actual trigram probs */
} tg32_t;


/** \struct membg_t
 *  \brief Management of in-memory bigrams.  Not used if all bigrams are in memory.
 */
typedef struct {
    bg_t *bg;		/**< Bigrams for a specific unigram; see lm_t.membg */
    int32 used;		/**< Whether used since the last lm_reset.  If not used, bg is
                           freed at the next lm_reset */
} membg_t;

/** \struct membg32_t
 *
 * \brief A 32-bit version of membg_t
 */
typedef struct {
    bg32_t *bg32;		/**< Bigrams for a specific unigram (32-bit counterpart of membg_t.bg) */
    int32 used;		/**< Whether used since the last lm_reset.  If not used, bg32 is
                           freed at the next lm_reset */
} membg32_t;


/**
 * \struct tginfo_t
 * \brief Trigram cache that speeds up locating the trigrams for a given bigram (w_1,w_2)
 *
 * The following trigram information cache eliminates most traversals of the 1g->2g->3g
 * tree to locate trigrams for a given bigram (w1,w2).  The organization is optimized
 * for locality of access.  All bigrams (*,w2) for a given w2, for which trigrams have
 * been accessed "recently", form a linear linked list, pointed to by lm_t.tginfo[w2].
 * If disk-based, all trigrams for the given bg are loaded upon request.  Cached info (and
 * tg if disk-based) is freed at lm_reset if not used since the last such reset.
 */
typedef struct tginfo_s {
    s3lmwid_t w1;		/**< w1 component of bigram w1,w2.  All bigrams with
				   the same w2 are linked together. */
    int32 n_tg;			/**< #tg for parent bigram w1,w2 */
    tg_t *tg;			/**< Trigrams for w1,w2 */
    int32 bowt;			/**< tg bowt for w1,w2 */
    int32 used;			/**< Whether used since the last lm_reset */
    struct tginfo_s *next;	/**< Next w1 with the same parent w2 */
} tginfo_t;

/**
 * \struct tginfo32_t
 * \brief 32-bit version of tginfo_t
 *
 */
typedef struct tginfo32_s {
    s3lmwid32_t w1;		/**< w1 component of bigram w1,w2.  All bigrams with
				   the same w2 are linked together. */
    int32 n_tg;			/**< #tg for parent bigram w1,w2 */
    tg32_t *tg32;			/**< Trigrams for w1,w2 */
    int32 bowt;			/**< tg bowt for w1,w2 */
    int32 used;			/**< Whether used since the last lm_reset */
    struct tginfo32_s *next;	/**< Next w1 with the same parent w2 */
} tginfo32_t;


/**
 * \struct lm_tgcache_entry_t
 * \brief Entry in a fast and dirty cache for trigram lookups.  See lm_t.tgcache.
 */
typedef struct {
    s3lmwid_t lwid[3];		/**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */
    int32 lscr;			/**< LM score for above trigram */
} lm_tgcache_entry_t;


/**
 * \struct lm_tgcache_entry32_t
 * \brief 32-bit version of lm_tgcache_entry_t
 */
typedef struct {
    s3lmwid32_t lwid[3];		/**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */
    int32 lscr;			/**< LM score for above trigram */
} lm_tgcache_entry32_t;



/* 
 * A note on lm/dict/dict2lm.   -ARCHAN 20050616
 * 
 * In older versions of sphinx3 (<s3.4), dict2lm is a separate object
 * from lm and dict.  A kb actually owns a dict2lm so the programmer will
 * read the lm.  This separates the initialization of lm and dict2lm, and
 * it makes a lot of sense if there is **only one** lm and **only one**
 * dict2lm. 
 * 
 * However, when multiple LMs and switching of them is required.
 * Then, the problem of the above architecture starts to show up.  For
 * example, 
 *  lmset=lm_read_ctl ();
 *  for(i=0;i<kb->n_lm;i++){
 *   dict2lmwid[i]=wid_dict_lm_map
 *  }
 * At the same time, one will also have an array of lms (lmset[i]) for 
 * corresponding dict2lm[i]!
 *
 * Of course, having multiple arrays of things will someday cause
 * problems.
 *
 * The resolution is that we observed that the dict2lm map mostly
 * changes when the lm needs to change. Also, the
 * dictionary pronunciation itself seldom changes. That is partially
 * caused by the fact that we don't have much research on that. So in the
 * end, that is why it makes sense to let the lm own a dict2lm.
 * 
 * What if we also allow the dictionary to change? That is a tough
 * question.  In that case, perhaps we should still keep an inventory of sets
 * of lm and dict2lm and allow the lm to store a pointer to a dict2lm.  Once
 * there are changes in dict, the programmer will be responsible for updating
 * dict2lm. (Storing pointers will allow programmers not to update
 * everything but just the lms corresponding to a particular dict.)  I
 * guess in that case it will be a sign that we need a wrapper that controls
 * both lm and dict together.
 */

/*
 * Comments by RKM
 * To conserve space, bg/tg probs/ptrs kept in many tables.  Since the number of
 * distinct prob values << #bg/#tg, these table indices can be easily fit into
 * 16 bits.  bgprob and bgbowt are such indices.  The firsttg entry for a bigram
 * is harder.  It is supposed to be the index of the first trigram entry for each
 * bigram.  But #tg can be >> 2^16.  Hence the following segmentation scheme:
 * Partition bigrams into segments of lm_t.bg_seg_sz consecutive entries, such that
 * #trigrams in each segment <= 2**16 (the corresponding trigram segment).  The
 * bigram_t.firsttg value is then a 16-bit relative index within the trigram
 * segment.  A separate table--lm_t.tg_segbase--has the absolute index of the
 * 1st trigram for each segment.
 */

/* Default values for lm_t.log_bg_seg_sz and lm_t.bg_seg_sz, and the trigram cache size */
#define LOG2_BG_SEG_SZ  9	
#define BG_SEG_SZ       (1 << (LOG2_BG_SEG_SZ))
#define LM_TGCACHE_SIZE		100003	/* A prime no. (hopefully it IS one!) */

/* 20040211 ARCHAN: Yes! Indeed it is a prime */

/** \struct lm_t
 * \brief The language model.
 *
 * All unigrams are read into memory on initialization.
 * Bigrams and trigrams are read in on demand.
 *
 * The structure carries two parallel sets of bigram/trigram tables: the
 * 16-bit ones (bg, tg, membg, tginfo, tgcache) and their 32-bit
 * counterparts (bg32, tg32, membg32, tginfo32, tgcache32).  is32bits
 * records whether the current LM is 32 bits or not.
 */
typedef struct lm_s {
    char *name ;        /**< The name of the LM */
    int32 n_ug;         /**< #unigrams in LM */
    int32 n_bg;         /**< #bigrams in entire LM */
    int32 n_tg;         /**< #trigrams in entire LM */
    int32 max_ug;       /**< To which n_ug can grow with dynamic addition of words */

    int32 n_ng;           /**< N-gram order: n_ng=1 for a unigram LM, n_ng=2 for a bigram LM, and so on */
    
    char **wordstr;	/**< The LM word list (in unigram order) */
    
    
    uint32 log_bg_seg_sz;/**< See the big comment on bigram segmentation above */
    uint32 bg_seg_sz;    /**< #consecutive bigrams per segment (see the big comment above);
                            NOTE(review): presumably 1 << log_bg_seg_sz -- confirm */

    ug_t *ug;           /**< Unigrams */

    /* 20040225 ARCHAN : Data structure to maintain dictionary information */
    /* Data structure for dictionary to LM words look up mapping */
    /* 20060306 ARCHAN: Change this to a 32 bits data structure */
    s3lmwid32_t *dict2lmwid; /**< a mapping from dictionary word to LM word */
    s3lmwid32_t startlwid;	/**< S3_START_WORD id, if it exists */
    s3lmwid32_t finishlwid;	/**< S3_FINISH_WORD id, if it exists */

    bg_t *bg;		/**< NULL iff disk-based */
    tg_t *tg;		/**< NULL iff disk-based */
    membg_t *membg;	/**< membg[w1] = bigrams for lm wid w1 (used iff disk-based) */
    tginfo_t **tginfo;	/**< tginfo[w2] = fast trigram access info for bigrams (*,w2) */


    lm_tgcache_entry_t *tgcache; /**< <w0,w1,w2> hashed to an entry into
                                    this array.  Only the last trigram
                                    mapping to any given hash entry is
                                    kept in that entry.  (The cache
                                    doesn't have to be super-efficient.)
                                 */


    /**************************/
    /* 32-bit counterparts of the tables above */


    bg32_t *bg32;		/**< Bigram 32 bits, NULL iff disk-based */
    tg32_t *tg32;		/**< Trigram 32 bits, NULL iff disk-based */
    membg32_t *membg32;	/**< membg 32 bits: membg32[w1] = bigrams for lm wid w1 (used iff disk-based) */
    tginfo32_t **tginfo32;	/**< tginfo 32 bits: tginfo32[w2] = fast trigram access info for bigrams (*,w2) */

    lm_tgcache_entry32_t *tgcache32; /**< tgcache 32 bits */

    /**************************/
    
    lmlog_t *bgprob;    /**< Table of actual bigram probs */
    lmlog_t *tgprob;    /**< Table of actual trigram probs */
    lmlog_t *tgbowt;    /**< Table of actual trigram backoff weights */
    int32 *tg_segbase;  /**< tg_segbase[i>>lm_t.log_bg_seg_sz] = index of 1st
                           trigram for bigram segment (i>>lm_t.log_bg_seg_sz) */
    int32 n_bgprob;     /**< NOTE(review): presumably #entries in bgprob -- confirm */
    int32 n_tgprob;     /**< NOTE(review): presumably #entries in tgprob -- confirm */
    int32 n_tgbowt;     /**< NOTE(review): presumably #entries in tgbowt -- confirm */

    FILE *fp;           /**< NOTE(review): presumably the open LM file that bgoff/tgoff
                           index into when disk-based -- confirm */
    int32 byteswap;     /**< Whether this file is in the WRONG byte order */
    int32 bgoff;        /**< BG offsets into DMP file (used iff disk-based) */
    int32 tgoff;        /**< TG offsets into DMP file (used iff disk-based) */

    float32 lw;		/**< Language weight currently in effect for this LM */
    int32 wip;          /**< logs3(word insertion penalty) in effect for this LM */
    
    
    /* Statistics */
    int32 n_bg_fill;    /**< #bg fill operations */
    int32 n_bg_inmem;   /**< #bg in memory */
    int32 n_bg_score;   /**< #bg_score operations */
    int32 n_bg_bo;	/**< #bg_score ops backed off to ug */
    int32 n_tg_fill;	/**< Similar stats for trigrams */
    int32 n_tg_inmem;   /**< #tg in memory */
    int32 n_tg_score;   /**< #tg_score operations */
    int32 n_tg_bo;      /**< #tg_score ops backed off to bg */
    int32 n_tgcache_hit;  /**< #trigram cache hits.  NOTE(review): the original comment
                             added "ops backed off to bg", which looks copied from
                             n_tg_bo -- confirm */
    
    int32 access_type;	/**< Updated on every lm_{tg,bg,ug}_score call to reflect the kind of
                           n-gram accessed: 3 for 3-gram, 2 for 2-gram and 1 for 1-gram */


    int32 isLM_IN_MEMORY;  /**< Whether the LM is in memory.  It is a per-LM property, so potentially
                              the code could allow some models to be disk-based while others are not. */

    int32 dict_size;  /**< Only used in class-based LM, because class-based LM is addressed in 
                         the dictionary space. */
    hash_table_t *HT;		/**<  hash table for word-string->word-id map */

  
    /* Data structure that maintains the class information */
    lmclass_t **lmclass;   /**< LM classes for this LM */
    int32 n_lmclass;      /**< # LM classes */
    int32 *inclass_ugscore; /**< An array of inter-class unigram probability.
                               NOTE(review): the field name suggests "in-class" -- confirm */


    int32 inputenc ; /**< Input encoding method */
    int32 outputenc ; /**< Output encoding method */
    int32 version;  /**< The version number of the LM; in particular, this is the version
                       that was most recently read in.
                    */
    int32 is32bits; /**< Whether the current LM is 32 bits or not. Derived from version and n_ug */

    /* Arrays of unique bigram probs and bo-wts, and trigram probs */
    sorted_list_t sorted_prob2; /**< Temporary Variable: Sorted list */
    sorted_list_t sorted_bowt2; /**< Temporary Variable: Sorted list */
    sorted_list_t sorted_prob3; /**< Temporary Variable: Sorted list */
    int32 max_sorted_entries; /**< Temporary Variable: 2x the maximum size of the MAX_SORTED_ENTRIES */

    logmath_t *logmath; /**< NOTE(review): undocumented in the original; presumably the
                           log-math object passed to the lm_read* functions -- confirm */
} lm_t;



/** \struct lmset_t
    \brief Structure for multiple LMs; provides operations for addition/deletion/read.

    Structure for multiple, named LMs, started from s2.
*/
typedef struct lmset_s {
    lm_t **lmarray;  /**< 1 dimensional array of pointers to lm_t */
    lm_t *cur_lm; /**< TEMPORARY VARIABLE: The current LM */

    int32 cur_lm_idx; /**< TEMPORARY VARIABLE : The current LM index */
    int32 n_lm;       /**< number of LMs */
    int32 n_alloc_lm; /**< number of allocated LMs (NOTE(review): presumably the capacity
                         of lmarray, as opposed to the n_lm entries in use -- confirm) */
} lmset_t;

/**
 * Field accessors for lm_t; not meant for arbitrary use.
 * Each one expands to a plain member access on the given LM pointer,
 * with no NULL or bounds checking.
 */
#define lm_lmwid2dictwid(model, lwid)	((model)->ug[(lwid)].dictwid)
#define lm_n_ug(model)			((model)->n_ug)
#define lm_n_bg(model)			((model)->n_bg)
#define lm_n_tg(model)			((model)->n_tg)
#define lm_wordstr(model, lwid)		((model)->wordstr[(lwid)])
#define lm_startwid(model)		((model)->startlwid)
#define lm_finishwid(model)		((model)->finishlwid)
#define lm_access_type(model)		((model)->access_type)


/** \struct wordprob_t
    \brief Generic (word, probability) pair that could be used at any n-gram level 
*/
typedef struct {
    s3wid_t wid;	/**< NOTE: dictionary wid; may be BAD_S3WID if not available */
    int32 prob;         /**< The probability (NOTE(review): presumably an integer log-domain
                           score like the other LM scores -- confirm) */
} wordprob_t;
  

/** A wrapper function controlling the behavior of LM initialization.
 * 
 * (ARCHAN 20050617) lmset_init controls how the lmset, which is an
 * array of lm, is initialized by different command-line
 * arguments.  lmfile and lmctlfile are mutually exclusive.  Each
 * will invoke one reading function.
 * 
 * In the case that -lmfile is specified, a lmset with one single lm
 * (or lmset->n_lm=1) will be returned. The single lm's name will be
 * lmname.
 *
 * In the case that -lmctlfile is specified, a lmset with multiple lms
 * will be returned. The number of lms will depend on the number of
 * lms specified by -lmctlfile.  For the format, please read the
 * current format of -lmctlfile in lm.c
 *
 * ctl_lm is the equivalent of -ctl for lm.  When -ctl_lm is not
 * specified on the command line (ctl_lm is NULL), the lm with
 * name lmname will be used as the default lm.  If lmname is NULL, then
 * the first lm will be named as the "default".
 *
 * lmdumpdir is currently not used. It is there for backward
 * compatibility purposes. 
 *
 * lw, wip, uw are the language weight, word insertion penalty and
 * unigram weight. Their values are crucial to the computation of the
 * language model score. Therefore, the programmer is urged to
 * carefully set these three values and also be careful of their 
 * order. 
 * 
 * dict is assumed to be a pre-initialized dict_t structure which is
 * used in deriving the mapping between the dictionary words and the
 * lm words.
 *
 * ARCHAN 20050711 -lminmemory is the only global variable that
 * controls the code and we haven't explicitly specified it.  Currently,
 * if the LM is DMP, both -lminmemory=0 and -lminmemory=1 could be used. 
 * If the LM is txt-based, only -lminmemory=1 is accepted. (This will
 * be changed in future.)
 *
 *
 * ARCHAN 20050705: A survival guide for this part of the code.  Our
 * language model code is unnecessarily complicated, mainly because
 * the ways we specify class-based LMs and
 * multiple LMs are inter-dependent. For example, one could specify a
 * multiple LMs file (i.e. lmctlfile) and have no classes.  However,
 * if one would like to specify class information even with a single
 * LM, one needs to use the multiple LM file format (i.e. lmctlfile).
 *
 * This difficulty was well-observed in the period of Sphinx
 * 3.4-3.6. That might imply that a new LM format is needed if we
 * want to sustain this part of the development.
 *
 */
S3DECODER_EXPORT
lmset_t* lmset_init(const char* lmfile,  /**< The lm file name, lmfile and lmctlfile are mutually exclusive */
		    const char* lmctlfile, /**< The file that specifies multiple LMs and class information, lmfile and lmctlfile are mutually exclusive */
		    const char* ctl_lm,    /**< The control file that describes which lm to use for a particular utterance */
		    const char* lmname,    /**< The LM name to use if ctl_lm is not specified  */
		    const char* lmdumpdir, /**< Currently not used */
		    float32 lw,      /**< Language model weight */
		    float32 wip,     /**< Word insertion penalty */
		    float32 uw,      /**< Unigram weight */
		    dict_t *dict,     /**< A pre-initialized dict_t structure */
		    logmath_t *logmath /**< NOTE(review): undocumented in the original; presumably the log-math object used for the LM scores -- confirm */
    );


/* It is still a sore point: we have two interfaces for two different
   types of input.  Some of the code is still duplicated.  Changing
   one does not mean the other one will be changed.
*/

/** 
 *  Read a single LM into the lmset. 
 *  NOTE(review): the return value is undocumented in the original;
 *  presumably the lmset that holds the LM read from lmfile -- confirm.
 */
lmset_t* lmset_read_lm(const char *lmfile, /**< In: The LM file */
		       dict_t *dict,       /**< In: A pre-initialized dictionary */
		       const char *lmname, /**< In: The LM name */
		       float64 lw,         /**< In: The language weight */
		       float64 wip,        /**< In: The word insertion penalty */
		       float64 uw,          /**< In: The unigram weight */
		       const char *lmdumpdir, /**< In: LM dump dir */
		       logmath_t *logmath  /**< NOTE(review): undocumented in the original -- confirm its role */
    );

/**
 * Read the LM control file.  **Usually**, it also describes a class-based LM.
 * NOTE(review): the return value is undocumented in the original;
 * presumably the lmset holding every LM listed in ctlfile -- confirm.
 */

lmset_t* lmset_read_ctl(const char * ctlfile,/**< In: Control file name */
			dict_t* dict,  /**< In: Dictionary */
			float64 lw,	/**< In: Language weight */
			float64 wip,	/**< In: Word insertion penalty */
			float64 uw,    /**< In: Unigram weight */
			const char* lmdumpdir, /**< In: LMdumpdir */
			logmath_t *logmath /**< NOTE(review): undocumented in the original -- confirm its role */
    );	

/**
 * Get an LM by index. 
 * NOTE(review): behavior for an out-of-range lmidx is not documented
 * here -- check lm.c before relying on it.
 */
lm_t* lmset_get_lm_widx(lmset_t *lms,  /**< In: The set of LM */
			int32 lmidx    /**< In: LM index */
    );

/**
 * Get an LM by name.
 * @return a pointer to the LM with name lmname
 * NOTE(review): the result for an unknown name is not documented
 * here -- check lm.c before relying on it.
 */
lm_t* lmset_get_lm_wname(lmset_t *lms,  /**< In: The set of LM */
			 const char *lmname   /**< In: The LM name */
    );

/**
 * Set the current LM by index.
 * NOTE(review): presumably updates lmset_t.cur_lm and
 * lmset_t.cur_lm_idx -- confirm in lm.c.
 */
void lmset_set_curlm_widx(lmset_t *lms, /**< In: The set of LM */
			  int32 lmidx   /**< In: LM index */
    );

/**
 * Set the current LM by name.
 * NOTE(review): presumably updates lmset_t.cur_lm and
 * lmset_t.cur_lm_idx -- confirm in lm.c.
 */
S3DECODER_EXPORT
void lmset_set_curlm_wname(lmset_t *lms, /**< In: The set of LM */
			   const char *lmname   /**< In: The LM name */
    );
  
/**
 * Convert an LM name to its index in the set.
 * NOTE(review): the value returned for an unknown name is not
 * documented here -- check lm.c before relying on it.
 */
int32 lmset_name_to_idx(lmset_t *lms, /**< In: The set of LM */
			const char *lmname /**< In: The LM name */
    );

/**
 * Convert an LM index to its name.
 * @return a pointer to the name string.  No memory is allocated, so
 * the caller must not free the result.
 */

char* lmset_idx_to_name(lmset_t *lms, /**< In: The set of LM */
			int32 lmidx /**< In: LM index */
    );


/** 
 * Add a new lm into the lmset. Notice that lms->n_lm will be incremented by 1.
 */
  
void lmset_add_lm(lmset_t *lms,  /**< In/Out : The set of LM */
		  lm_t *lm,      /**< In : The input LM */
		  const char* lmname /**< In: The lm name */
    );

/**
 * Delete the LM with name lmname. Notice that lms->n_lm will be decremented by 1.
 */
  
void lmset_delete_lm(lmset_t *lms, /**< In/Out : The set of LM */
		     const char *lmname /**< In: The lm name */
    );

/**
 * Free the lmset data structure.
 * NOTE(review): whether the contained lm_t objects are freed as well is
 * not stated here -- confirm in lm.c.
 */
S3DECODER_EXPORT
void lmset_free(lmset_t *lms /**< In: The set of LM */
    );

/**
 * Return trigram followers for the given two words.  Both w1 and w2 must be valid.
 * Return value: #trigrams in returned list.
 */
int32 lm_tglist (lm_t *lmp,	/**< In: LM being queried */
		 s3lmwid32_t w1,	/**< In: LM word id of the first of a 2-word history */
		 s3lmwid32_t w2,	/**< In: LM word id of the second of the 2-word history */
		 tg_t **tg,	/**< Out: *tg = array of trigrams for <w1,w2> */
		 int32 *bowt	/**< Out: *bowt = backoff-weight for <w1, w2> */
    );

/**
 * Same parameter layout as lm_tglist, but the trigram array handed back
 * is of type tg32_t.
 * NOTE(review): undocumented in the original; the return value is
 * presumably also #trigrams in the returned list -- confirm.
 */
int32 lm_tg32list (lm_t *lmp,	/**< In: LM being queried */
		   s3lmwid32_t w1,	/**< In: LM word id of the first of a 2-word history */
		   s3lmwid32_t w2,	/**< In: LM word id of the second of the 2-word history */
		   tg32_t **tg,	/**< Out: *tg = array of trigrams for <w1,w2> */
		   int32 *bowt	/**< Out: *bowt = backoff-weight for <w1, w2> */
    );

/**
 * Return the bigram followers for the given word w.
 * Return value: #bigrams in returned list.
 */
int32 lm_bglist (lm_t *lmp,	/**< In: LM being queried */
		 s3lmwid32_t w,	/**< In: LM word id of the 1-word history */
		 bg_t **bg,	/**< Out: *bg = array of bigrams for w */
		 int32 *bowt	/**< Out: *bowt = backoff-weight for w */
    );

/**
 * Same parameter layout as lm_bglist, but the bigram array handed back
 * is of type bg32_t.
 * NOTE(review): undocumented in the original; the return value is
 * presumably also #bigrams in the returned list -- confirm.
 */
int32 lm_bg32list (lm_t *lmp,	/**< In: LM being queried */
		   s3lmwid32_t w,	/**< In: LM word id of the 1-word history */
		   bg32_t **bg,	/**< Out: *bg = array of bigrams for w */
		   int32 *bowt	/**< Out: *bowt = backoff-weight for w */
    );


#if 0 /* Obsolete, and it conflicts with other code, so it is compiled out for now */
/*
 * Somewhat like lm_bglist, but fills up a wordprob_t array from the bigram list found, instead
 * of simply returning the bglist.  The wordprob array contains dictionary word IDs.  But note
 * that only the base IDs are entered; the caller is responsible for filling out the alternative
 * pronunciations.
 * Return value:  #entries filled in the wordprob array.
 */
int32 lm_bg_wordprob(lm_t *lm,		/**< In: LM being queried */
		     s3lmwid32_t w,	/**< In: LM word ID of the 1-word history */
		     int32 th,		/**< In: If a prob (logs3, langwt-ed) < th, ignore it */
		     wordprob_t *wp,	/**< In/Out: Array to be filled; caller must have
					   allocated this array */
		     int32 *bowt	/**< Out: *bowt = backoff-weight associated with w */
    );

#endif

/** Return the LM word ID for the given string wd, or BAD_LMWID(lm) if not available */
s3lmwid32_t lm_wid (lm_t *lm, const char *wd);

/**
   Set all pointers in the lm to NULL.
*/
void lm_null_struct(lm_t* lm /**< In/Out: the LM whose pointers are reset */
    );

/**
 * Like lm_bg_wordprob, but for unigrams: fills a wordprob_t array
 * instead of returning the unigram list.
 * Return value:  #entries filled in the wordprob array.
 */
int32 lm_ug_wordprob(lm_t *lm, /**< In: LM being queried */
		     dict_t *dict, /**< In : The dictionary */
		     int32 th, /**< In: NOTE(review): presumably a threshold below which probs are ignored, as documented for lm_bg_wordprob -- confirm */
		     wordprob_t *wp /**< In/out: Array to be filled */
    );

/** Return the unigrams in the LM through *ug.  Return value: #unigrams in returned list. */
int32 lm_uglist (lm_t *lmp,	/**< In: LM being queried */
		 ug_t **ug	/**< Out: *ug = unigram array */
    );
  


/* 20040227: This also accounts for the in-class probability of wid */
/** Return the unigram score for the given word */
int32 lm_ug_score (lm_t *lmp,  /**< In: LM being queried */
		   s3lmwid32_t lwid, /**< In: LM ID for the word */
		   s3wid_t wid     /**< In: Dict ID for the word */
    );

  
/**
   NOTE(review): undocumented in the original; by analogy with
   lm_bg_exists/lm_tg_exists this presumably reports whether a certain
   unigram exists -- confirm.
*/
int32 lm_ug_exists(lm_t* lm ,  /**< In: LM */
		   s3lmwid32_t lwid /**< In: LM ID for the word */
    );
  
/**
 * Return the bigram score for the given two-word sequence.  If lw1 is BAD_LMWID(lm), return
 * lm_ug_score (lw2).
 * 20040227: This also accounts for the in-class probability of w2. 
 */
int32 lm_bg_score (lm_t *lmp, /**< In: LM being queried */
		   s3lmwid32_t lw1, /**< In: LM word ID of the first word */
		   s3lmwid32_t lw2, /**< In: LM word ID of the second word */
		   s3wid_t w2 /**< In: NOTE(review): presumably the dict ID of the second word, as in lm_ug_score -- confirm */);


/**
   Whether a certain bigram <lw1,lw2> exists. 
*/
int32 lm_bg_exists (lm_t *lm,  /**< In: LM */
		    s3lmwid32_t lw1,  /**< In: LM word ID of the first word */
		    s3lmwid32_t lw2   /**< In: LM word ID of the second word */
    );

/**
 * Return the trigram score for the given three-word sequence.  If lw1 is BAD_LMWID(lm), return
 * lm_bg_score (lw2, lw3).  If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3).
 * 
 * 20040227: This also accounts for the in-class probability of w3. 
 */
int32 lm_tg_score (lm_t *lmp,  /**< In: LM being queried */
		   s3lmwid32_t lw1, /**< In: LM word ID of the first word */
		   s3lmwid32_t lw2, /**< In: LM word ID of the second word */
		   s3lmwid32_t lw3, /**< In: LM word ID of the third word */
		   s3wid_t w3 /**< In: NOTE(review): presumably the dict ID of the third word, as in lm_ug_score -- confirm */);


/**
   Whether a certain trigram <lw1,lw2,lw3> exists. 
*/
int32 lm_tg_exists (lm_t *lm,  /**< In: LM */
		    s3lmwid32_t lw1,  /**< In: LM word ID of the first word */
		    s3lmwid32_t lw2, /**< In: LM word ID of the second word */
		    s3lmwid32_t lw3 /**< In: LM word ID of the third word */
    );

/**
 * Set the language-weight and insertion penalty parameters for the LM, after revoking
 * any earlier set of such parameters.
 *
 * WARNING!! This function doesn't prevent underflow of values.  Make sure you call
 * safe lm2logs3 before it. 
 */
void lm_set_param (lm_t *lm,  /**< In: the LM */
		   float64 lw,  /**< In: the language weight */
		   float64 wip  /**< In: the word insertion penalty */
    );


/**
 * NOTE(review): undocumented in the original.  Presumably the function
 * counterpart of the LM_RAWSCORE macro, i.e. it removes the word
 * insertion penalty and language weight from score -- confirm in lm.c.
 */
S3DECODER_EXPORT
int32 lm_rawscore (lm_t *lm,  /**< In: the LM */
		   int32 score /**< In: the score to convert */
    );



/** LM cache related: reset the cache of the given LM.
 *  NOTE(review): exactly which caches and statistics are cleared is not
 *  stated here -- confirm in lm.c.
 */
S3DECODER_EXPORT
void lm_cache_reset (lm_t *lmp /**< In: the LM */
    );

/** Dump the LM cache statistics of the given LM */
S3DECODER_EXPORT
void lm_cache_stats_dump (lm_t *lmp /**< In: the LM */
    );

/** 
 * A simple version of reading in a LM 
 *
 * lm_read is a simple version of lm_read_advance.  It will assume
 * language weight, word insertion penalty and unigram weight to be
 * automatically applied.  There is also no class-based LM (so
 * ndict=0).  Format is set to NULL, so the program will determine
 * it automatically. 
 */
lm_t * lm_read ( 
    const char *file,	/**< In: LM file being read */
    const char *lmname,  /**< In: LM name */
    cmd_ln_t *config,   /**< In: NOTE(review): undocumented in the original; presumably the command-line configuration supplying lw/wip/uw -- confirm */
    logmath_t *logmath  /**< In: NOTE(review): undocumented in the original -- confirm its role */);

/**
 * Read an LM file.  It will automatically decide whether the file is
 * a DUMP file or a txt file, then call lm_read_txt or lm_read_dump
 * (non-public functions) correspondingly.  Currently the code is
 * not aware of OOVs.  
 *
 * lw, wip, uw and ndict are mainly used for recognition purposes.
 * When lm_read is used for other purposes, one could just use a dummy
 * setting.  The recommended one is lw=1.0, wip=0.1, uw=1.0 and
 * ndict=0. These are very useful when lm_read is just used for
 * reading the LM.  
 *
 * If applyweight is 0, lw, wip and uw will not be applied to the LM at all.
 * This will allow users to just call the LM routine without
 * initializing other modules (such as logs3_init).
 * 
 * If applyweight is 1, then logs3_init must be called before lm_read. 
 * This is usually the case when kb_init is called before the code. 
 *
 * fmt could now be either "TXT", "DMP", "TXT32" or just
 * NULL. If it is NULL, the LM format will be automatically
 * determined.  If it is specified as "TXT" or "DMP", the
 * corresponding lm reader will be called. In such a case, it is
 * important for the users to know what they are doing.
 * (Unfortunately, this is mostly not true. ) 
 * In the case of "TXT32", a text LM will be forced to 32bit mode. 
 *
 * ndict is the dictionary size of the application.  This is needed
 * because class-based LMs are addressed in the dictionary wid-space
 * instead of the lm wid-space. If a class-based LM is not used, just set
 * this to zero.
 *
 * Note: there are two defense mechanisms in lm_read_advance. 
 * First of all, if no fmt is specified, it will try to read
 * the lm in the order of DMP->TXT. Second, if the txt format
 * is specified but the LM is found to hit the 16bit legacy segments
 * limit, it will automatically switch to reading it as a TXT32 LM.
 *
 * @return pointer to LM structure created.
 */
lm_t *lm_read_advance (const char *file,	/**< In: LM file being read */
		       const char *lmname,   /**< In: LM name */
		       float64 lw,	/**< In: Language weight */
		       float64 wip,	/**< In: Word insertion penalty */
		       float64 uw,	/**< In: Unigram weight (interpolation with uniform distr.) */
		       int32 ndict,    /**< In: Number of dictionary entries.  We need that because
					  class-based LM is addressed in dictionary word ID space. 
				       */  
		       const char* fmt,       /**< In: file format of the LM; it is
					   now either "TXT", "DMP", "TXT32" or NULL.
					   If NULL, the file format is
					   automatically determined */
		       int32 applyweight,      /**< In: whether lw, wip and uw should be 
						 applied to the lm or not */
		       logmath_t *logmath /**< In: NOTE(review): undocumented in the original -- confirm its role */
    );

/**
 * Variant of lm_read_advance that takes one extra argument, lminmemory.
 * NOTE(review): undocumented in the original; the remaining parameters
 * are presumably interpreted exactly as in lm_read_advance -- confirm.
 */
S3DECODER_EXPORT
lm_t *lm_read_advance2(const char *file,	/**< In: LM file being read */
		       const char *lmname,   /**< In: LM name */
		       float64 lw,	/**< In: Language weight */
		       float64 wip,	/**< In: Word insertion penalty */
		       float64 uw,	/**< In: Unigram weight (interpolation with uniform distr.) */
		       int32 ndict,    /**< In: Number of dictionary entries.  We need that because
					  class-based LM is addressed in dictionary word ID space. 
				       */  
		       const char* fmt,       /**< In: file format of the LM; it is
					   now either "TXT", "DMP" or NULL.
					   If NULL, the file format is
					   automatically determined */
		       int32 applyweight,      /**< In: whether lw, wip and uw should be 
                                                  applied to the lm or not */
                       int lminmemory, /**< In: Whether LM is read into memory */
		       logmath_t *logmath /**< In: NOTE(review): undocumented in the original -- confirm its role */
    );
/**
   Simple writing of an LM file; the input and output encodings are
   assumed to be iso8859-1.  Calls lm_write_advance.  To convert
   encodings, please use lm_write_advance directly. 
*/
S3DECODER_EXPORT
int32 lm_write(lm_t *model, /**< In: the pointer to the LM we want to output */
	       const char *outputfile, /**< In: the output file name */
	       const char *filename, /**< In: the LM file name  */
	       const char *fmt   /**< In: LM file format, it is now either "TXT" or "DMP" */
    );
  
/**
   Writing of an LM file with advanced options such as encoding support. 
   Called by lm_write. 
     
   fmt could now be TXT, DMP, FST
     
   inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312.  
   Not every pair of conversions works.  
     
   Current input/output encodings support list. 
   0: iso8859-1
   1: gb2312-hex
   2: gb2312

   -: do nothing
   n: doesn't make sense or not compatible
   x: not supported yet
   y: supported

   i\o 0 1 2
   0 - n n
   1 n - y
   2 n x -

   When we have 4 encoding types, this table should be
   implemented as a data structure.

   This conversion table is copied from encoding.c; please take a
   look at the latest support in encoding.c
*/

int32 lm_write_advance(lm_t *model, /**< In: the pointer to the LM we want to output */
		       const char *outputfile, /**< In: the output file name */
		       const char *filename, /**< In: the LM file name  */
		       const char *fmt,   /**< In: LM file format, it is now either "TXT", "DMP", "FST" */
		       const char* inputenc, /**< In: Input encoding type */
		       char* outputenc /**< Out: Output encoding type.  NOTE(review): tagged "Out" and
					  declared non-const, yet the description above treats it as an
					  encoding name like inputenc -- confirm the direction */
    );

/* RAH, added code for freeing allocated memory 
 */
/**
   Deallocate the language model. 
*/
S3DECODER_EXPORT
void lm_free (lm_t *lm /**< In: the LM structure to deallocate */
    );

/**
   Add a word list to the LM. 
   For each word in the file, call lm_add_wordlist
   (NOTE(review): the per-word routine is presumably lm_add_word_to_ug,
   not lm_add_wordlist itself -- confirm in lm.c). 
   The file is assumed to have a format like this:
   <word1> 
   <word2>
   <word3>
   <word4>
     
   If the lmwid2dictid mapping is not to be updated, or the dictionary
   itself is not used in the context, just specify dict=NULL.
     
*/
int32 lm_add_wordlist(lm_t *lm, /**< In/Out: a modified LM structure */
		      dict_t *dict, /**< In: an initialized dictionary structure 
				       Used to update the lmwid2dictid mapping
				       (see the dict=NULL note above)
				    */
		      const char* filename /**< In: a file that contains a
					list of words one wants to
					add */
    );

/**
   Add a word to the LM. 

   Looks up the dictionary to see whether the word exists in it.
   The logic looks like wid.c's at this point.  

   (Incomplete!) Not fully tested for on-line
   recognition.
     
   We also avoid the addition of classes at this point because that
   could complicate things quite a lot. 
*/
int32 lm_add_word_to_ug(lm_t *lm, /**< In/Out: a modified LM structure */
			dict_t *dict, /**< In: an initialized dictionary structure 
					 Used to update lmwid2dictid mapping. 
				      */
			const char* newword /**< In: a pointer to the new word */
    );
/** 
    Get the ID of the class called name in the given LM. 
    NOTE(review): the value returned when no such class exists is not
    documented here -- check lm.c before relying on it.
*/
int32 lm_get_classid (lm_t *model, /**< In: LM being queried */
		      const char *name   /**< In: The name of the class */
    );

/**
 * Explicitly convert the structure from 16bit -> 32bit or from 32bit -> 16bit. 
 */
void lm_convert_structure(lm_t *model, /**< In: LM being used */
			  int32 is32bits /**< In: NOTE(review): presumably the target mode, non-zero for 32 bits -- confirm */
    );	

/**
   Check whether the model is operating in 32-bit mode.
*/
int32 lm_is32bits(lm_t* model);

/**
   Write a UG structure to fp.
*/
void ug_write(FILE* fp,  /**< A file pointer */
	      ug_t* ug   /**< A pointer to the ug_t structure */
    );
/**
   Write a BG structure to fp.
*/
void bg_write(FILE* fp, /**< A file pointer */
	      bg_t* bg  /**< A pointer to the bg_t structure */
    );

/**
   Write a BG (32 bits) structure to fp.
*/
void bg32_write(FILE* fp, /**< A file pointer */
		bg32_t* bg  /**< A pointer to the bg32_t structure */
    );

/**
   Write a TG structure to fp.
*/
  
void tg_write(FILE* fp, /**< A file pointer */
	      tg_t* tg  /**< A pointer to the tg_t structure */
    );

/**
   Write a TG (32 bits) structure to fp.
*/
  
void tg32_write(FILE* fp, /**< A file pointer */
		tg32_t* tg  /**< A pointer to the tg32_t structure */
    );


/**
   Convert the 16 bit bigram structure of lm to 32 bit
*/
void copy_bg_to_bg32(lm_t *lm /**< In/Out: LM */
    );

/**
   Convert the 32 bit bigram structure of lm to 16 bit
*/

void copy_bg32_to_bg(lm_t *lm /**< In/Out: LM */
    );

/**
   Convert the 16 bit trigram structure of lm to 32 bit
*/
void copy_tg_to_tg32(lm_t *lm /**< In/Out: LM */
    );

/**
   Convert the 32 bit trigram structure of lm to 16 bit
*/

void copy_tg32_to_tg(lm_t *lm /**< In/Out: LM */
    );

/**
   Swap 16 bits bigram
   (NOTE(review): "swap" presumably means byte-order swapping of the
   entry's fields, cf. lm_t.byteswap -- confirm)
*/
void swap_bg(bg_t* bg);
  

/**
   Swap 32 bits bigram (see the note on swap_bg)
*/
void swap_bg32(bg32_t* bg);

/**
   Swap 16 bits trigram (see the note on swap_bg)
*/
void swap_tg(tg_t* tg);
  

/**
   Swap 32 bits trigram (see the note on swap_bg)
*/
void swap_tg32(tg32_t* tg);

/*
 * find_bg / find_bg32 / find_tg / find_tg32
 * NOTE(review): undocumented in the original.  Presumably each searches
 * the n entries of the given bigram/trigram array for LM word id w and
 * returns the position found -- confirm the return convention in lm.c.
 */
int32 find_bg (bg_t *bg,  /**< In: The bigram */
	       int32 n, 
	       s3lmwid32_t w
    );

int32 find_bg32 (bg32_t *bg,  /**< In: The bigram */
		 int32 n, 
		 s3lmwid32_t w
    );


int32 find_tg (tg_t *tg, /**< In: The trigram */
	       int32 n, s3lmwid32_t w);

int32 find_tg32 (tg32_t *tg, /**< In: The trigram */
		 int32 n, s3lmwid32_t w);
  
/*
 * Macro versions of access functions.
 *
 * LM_TGPROB / LM_BGPROB: look up the .l member of the lm's tgprob/bgprob
 *     table entry selected by the n-gram's probid.
 * LM_UGPROB: the .l member of the unigram's own prob field (lm is unused).
 * LM_RAWSCORE: subtract lm->wip from score, then divide by lm->lw.
 * LM_DICTWID: the dictwid stored in the lm's unigram entry for lmwid.
 *
 * Every macro argument is parenthesized in the expansion so that
 * expression arguments (e.g. a conditional passed as score) keep their
 * intended grouping.
 */
#define LM_TGPROB(lm,tgptr)	((lm)->tgprob[(tgptr)->probid].l)
#define LM_BGPROB(lm,bgptr)	((lm)->bgprob[(bgptr)->probid].l)
#define LM_UGPROB(lm,ugptr)	((ugptr)->prob.l)
#define LM_RAWSCORE(lm,score)	(((score) - (lm)->wip) / ((lm)->lw))
#define LM_DICTWID(lm,lmwid)     ((lm)->ug[(lmwid)].dictwid)

/** 
    Create a new unigram table with n_ug entries.
    NOTE(review): ownership and initial contents of the returned table
    are not documented here -- confirm in lm.c.
*/
ug_t *NewUnigramTable (int32 n_ug /**< In: Number of unigrams */
    );


#if 0
{ /* Stop indent from complaining */
#endif
#ifdef __cplusplus
}
#endif

#endif