<!DOCTYPE HTML PUBLIC "-//w3c//DTD HTML 4.01//EN">
<html>
  <head>
    <title>Sphinx-3 s3.X Decoder (X=6)</title>
    <style type="text/css">
      body { margin-left: 6%; margin-right: 3%; background: white; }
      h1 { color: green; text-align: center; }
      h2 { margin-left: -6%; color: green; margin-top: 6em; }
      h3 { margin-left: -3%; margin-top: 1em; }
      h4,h5,h6 { margin-top: 1em; }
      pre { margin-left: 3%; font-family: monospace; }
      code { font-family: monospace; font-weight: bold; }
      div.wheatbox { background: wheat; padding: 0.5em; border: solid; border-width: thin; }
      div.silverbox { background: silver; padding: 0.5em; border: solid; border-width: thin; }
      div.endsec { background: wheat; border: solid; border-width: thin; }
    </style>
  </head>
  
  <BODY>
    <H1><U>Sphinx-3 s3.X Decoder (X=6)</U></H1>
    <center>
      Mosur K. Ravishankar (<em>aka</em> Ravi Mosur)<br>
      Sphinx Speech Group<br>
      School of Computer Science<br>
      Carnegie Mellon University<br>
      Pittsburgh, PA 15213<br>
    </center>
    
    
    <H2><U>Contents</U></H2>
    
    <div class="wheatbox">
      <UL>
	<LI><A HREF="#sec_intro">Introduction</A></LI>
	<LI><A HREF="#sec_decoverview">Overview of the s3.X Decoder</A>
          <UL>
	    <LI><A HREF="#sec_dec_input">Inputs</A></LI>
	    <LI><A HREF="#sec_dec_output">Outputs</A></LI>
          </UL>
	</LI>
        <LI><A HREF="#sec_compile">Compiling s3.X</A></LI>
        <LI><A HREF="#sec_exec">Running s3.X</A>
          <UL>
	    <LI><A HREF="#sec_args_overview">Configuration Arguments Overview</A></LI>
	    <LI><A HREF="#sec_dec_op">Decoder Operation</A></LI>
	    <LI><A HREF="#sec_dec_tune">Performance Tuning</A>
              <UL>
        	<LI><A HREF="#sec_tune_prune">Tuning the Pruning Behaviour</a></LI>
        	<LI><A HREF="#sec_tune_lw">Tuning Language Model Related Parameters</a></LI>
              </UL>
	    </LI>
            <LI><A HREF="#sec_dec_errors">Some Common Errors and Failure Modes</A></LI>
          </UL>
	</LI>
        <LI><A HREF="#sec_dict">Pronunciation Lexicon</A></LI>
        <LI><A HREF="#sec_am">Acoustic Model</A></LI>
        <LI><A HREF="#sec_lm">Language Model</A></LI>
        <LI><A HREF="#sec_ctl">Speech Input Control File</A></LI>
        <LI><A HREF="#sec_hypseg">Recognition Hypothesis Output</A></LI>
        <LI><A HREF="#sec_wordlat">Word Lattice Output</A></LI>
        <LI><A HREF="#sec_utilpgm">Other Utilities</A>
	  <UL>
	    <LI><A HREF="#sec_gausubvq">Gaussian Sub-Vector Quantization Utility</a></LI>
	  </UL>
	</LI>
        <LI><A HREF="#sec_src">Source Code</A></LI>
      </UL>
    </div>
    
    
    
    <H2><A NAME="sec_intro"><U>Introduction</U></A></H2>
    
    <P>Sphinx-3 is the successor to the Sphinx-II speech recognition
    system from Carnegie Mellon University.  It includes both an
    acoustic <em>trainer</em> and various <em>decoders</em>,
    <em>i.e.</em>, text recognition, phoneme recognition, N-best list
    generation, etc. In this document, "Sphinx-3" refers to any
    version of the Sphinx-3 decoder, and "s3.X" refers to the version
    available in this distribution. Notice that s3.X is in fact a
    branch from Sphinx-3, not a more recent release.
    </P>
    
    <P>The s3.X decoder is a recent implementation for speech-to-text
    recognition, its main goal being speed improvements over the
    original Sphinx-3 decoder.  It runs about 10 times faster than the
    latter on large vocabulary tasks.  The following is a brief
    summary of its main features and limitations:
    </P>

    <UL>
      <LI>5-10x real-time recognition time on large vocabulary tasks</LI>
      <LI>Limited to fully continuous acoustic models</LI>
      <LI>Limited to 3 or 5-state left-to-right HMM topologies</LI>
      <LI>Bigram or trigram language model</LI>
      <LI>Batch-mode operation on pre-recorded speech, as well as live operation</LI>
    </UL>
    
    <P> Starting with s3.5, the s3.X decoder also integrates the
    flat-lexicon decoder search and the tree decoder search by
    wrapping them under the same interface.  An implementation of
    finite-state transducer (FST) search is also available under the
    same interface.
    </P> 

    <P> All of the decoding routines are accessible through the
    executable <code> sphinx3_decode </code> using the <code>
    -op_mode </code> option (-op_mode 2: FST, -op_mode 3: Flat
    Lexicon Decoder, -op_mode 4: Tree Lexicon Decoder).  The original
    flat-lexicon decoder interface still exists for backward
    compatibility.
    </P> 
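    <P>For illustration, a hypothetical invocation that explicitly
    selects the tree-lexicon search might look like the following
    (the model, lexicon, LM, control file and output names are
    placeholders, not files shipped with the distribution):
    </P>
    <pre>
sphinx3_decode -op_mode 4 -hmm /path/to/acoustic_model_dir \
    -dict main.dict -fdict filler.dict -lm my.lm.DMP \
    -ctl test.ctl -cepdir /path/to/cepstra -hyp test.hyp</pre>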

    <P>This package contains the following programs:
    </P>
    <OL>
      <LI><code>sphinx3_decode</code>: The Sphinx-3 s3.2/s3.3/s3.X decoder, processing cepstra files</li>
      <LI><code>sphinx3_decode_anytopo</code>: The Sphinx-3 s3.0 decoder, processing cepstra files (retained for backward compatibility)</li>
      <LI><code>sphinx3_continuous</code>: The Sphinx-3 live-mode demo, ready for simple speak-and-decode applications</li>
      <LI><code>sphinx3_gausubvq</code>: Sub-vector clustered acoustic model building</LI>
      <LI><code>sphinx3_livedecode</code>: The Sphinx-3 s3.X decoder in live mode</LI>
      <LI><code>sphinx3_livepretend</code>: The Sphinx-3 s3.X decoder in batch mode</LI>      
      <LI><code>sphinx3_align</code>: The Sphinx-3 aligner</LI>
      <LI><code>sphinx3_allphone</code>: The Sphinx-3 phoneme recognizer</LI>
      <LI><code>sphinx3_astar</code>: The Sphinx-3 N-best generator</LI>
      <LI><code>sphinx3_dag</code>: The Sphinx-3 application for best-path searching</LI>
      <LI><code>lm_convert</code>: A program that converts a language model between the DMP (binary dump) and TXT formats</LI>

    </OL>
    <P>This distribution has been prepared for Unix platforms.  A port
    to MS Windows (MS Visual C++ 6.0 workspace and project files) is
    also provided.
    </P>

    <P>This document is a brief user's manual for the above programs.
    It is <em>not</em> meant to be a detailed description of the
    decoding algorithm, or an in-depth tutorial on speech recognition
    technology.  However, a set of Microsoft PowerPoint <a
    href="s3-2.ppt">slides</a> is available that gives additional
    information about the decoder.  Even though the slides refer to
    s3.2, keep in mind that the basic search structure remains the
    same in s3.X (where X=3 to 6). 
    </P>
    
    <P>The initial part of this document provides an overview of the
    decoder.  It is followed by descriptions of the main input and
    output databases; <em>i.e.</em>, the lexicon, language model, acoustic
    model, etc.
    </P>
    <div class="endsec">
      &curren;
      <a href="#sec_intro">Back to top of this section</a>
    </div>

    
    <H2><A NAME="sec_decoverview"><U>Overview of the s3.X Decoder</U></A></H2>
    
    <P>The s3.X decoder is based on the conventional <em>Viterbi
    search</em> algorithm and <em>beam search</em> heuristics.  It
    uses a <em>lexical-tree</em> search structure somewhat like the
    Sphinx-II decoder, but with some improvements for greater accuracy
    than the latter. It takes its input from pre-recorded speech in
    raw PCM format and writes its recognition results to output files.
    </P>
    
    <H3><A NAME="sec_dec_input"><U>Inputs</U></A></H3>
    
    <P>We first give a brief outline of the input and output
    characteristics of the decoder.  More detailed information is
    available in later sections.  The decoder needs the following
    inputs:
    </P>
    <UL>
      <LI><a href="#sec_dict"><em>Lexical model</em></a>: The
      lexical or pronunciation model contains pronunciations for all
      the words of interest to the decoder.  Like most modern speech
      recognition systems, Sphinx-3 uses <em>phonetic units</em> to
      build word pronunciations.  Currently, the pronunciation lexicon
      is almost entirely hand-crafted.
      </LI>
      <P></P>
    
      <LI><a href="#sec_am"><em>Acoustic model</em></a>: Sphinx
      uses acoustic models based on statistical <em>hidden Markov
      models</em> (HMMs).  The acoustic model is trained from acoustic
      training data using the Sphinx-3 trainer.  The trainer is
      capable of building acoustic models with a wide range of
      structures, such as <em>discrete</em>, <em>semi-continuous</em>,
      or <em>continuous</em>.  However, the s3.X decoder is only
      capable of handling continuous acoustic models.
      </LI>
      <P></P>
      
      <LI><a href="#sec_lm"><em>Language model (LM)</em></a>:
      Sphinx-3 uses a conventional backoff bigram or trigram language
      model.
      </LI>
      <P></P>
      
      <LI><a href="#sec_ctl"><em>Speech input specification</em></a>:
      This distribution contains several executables, three of which
      perform recognition. <code>sphinx3_livedecode</code> decodes live
      speech, that is, speech incoming from your audio
      card. <code>sphinx3_livepretend</code> decodes in batch mode using a
      <em>control file</em> that describes the input to be decoded
      into text. <code>sphinx3_decode</code> also uses a control file
      for batch mode processing. In the latter, the entire input to be
      processed must be available beforehand, <em>i.e.</em>, the raw
      audio samples must have been preprocessed into cepstrum
      files. Also note that the decoder cannot handle arbitrary
      lengths of speech input.  Each separate piece (or
      <em>utterance</em>) to be processed by the decoder must be no
      more than 300 sec. long.  Typically, one uses a
      <em>segmenter</em> to chop up a cepstrum stream into manageable
      segments of up to 20 or 30 sec. duration.
      </LI>
    </UL>
    
    
    
    <H3><A NAME="sec_dec_output"><U>Outputs</U></A></H3>
    
    <P>The decoder can produce two types of recognition output:
    </P>
    <UL>
      <LI><a href="#sec_hypseg"><em>Recognition hypothesis</em></a>: A
      single best recognition result (or <em>hypothesis</em>) for each
      utterance processed.  It is a linear word sequence, with
      additional attributes such as their time segmentation and
      scores.
      </LI>
      <P></P>
      
      <LI><a href="#sec_wordlat"><em>Word lattice</em></a>: A
      word-graph of all possible candidate words recognized during the
      decoding of an utterance, including other attributes such as
      their time segmentation and acoustic likelihood scores.
      </LI>
    </UL>
    
    <P>In addition, the decoder also produces a detailed log to
    stdout/stderr that can be useful in debugging, gathering
    statistics, etc.
    </P>
    <div class="endsec">
      &curren;
      <a href="#sec_decoverview">Back to top of this section</a>
    </div>
    
    
    
    <H2><A NAME="sec_compile"><u>Compiling s3.X</u></A></H2>
    
    <P>The current distribution has been set up for Unix platforms.
    The following steps are needed to compile the decoder:
    </P> 
 
    <P> For Users: </P> 
    <OL>
      <LI><code>./configure [--prefix=/my/install/directory]</code>:
      the argument is optional. If not given, it will install s3.X
      under /usr/local, provided you have the proper permissions. This
      step is only necessary the first time you compile s3.X.</LI>

      <LI><code>make clean</code>: This should remove any old object
      files.</LI> 

      <LI><code>make</code>: This compiles the libraries and example
      programs.</LI>

      <LI><code>make install</code>: This will install s3.X in the
      directory that you specified when you ran <code>configure</code>
      and also the provided models and documentation.</LI>
    </OL>
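    <P>Putting the steps together, a typical from-scratch build might
    look like this (the install prefix shown is only an
    illustration):
    </P>
    <pre>
./configure --prefix=$HOME/sphinx3
make clean
make
make install</pre>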
    
    <P>Note that the Makefiles are not foolproof; they do not eliminate
    the need for sometimes manually determining dependencies,
    especially upon updates to header files.  When in doubt, first
    clean out the compilation directories entirely by running
    <code>make distclean</code> and start over.
    </P>

    <P> For Developers: </P> 
    <OL> 
	<LI> The project can be bootstrapped from the SVN repository,
	which can be checked out with the command <code> svn co
	https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/sphinx3 </code> </LI> 
	<LI> One can then bootstrap the build by running the command <code> ./autogen.sh ; ./autogen.sh </code> </LI>
    </OL> 
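    <P>For example, a developer checkout and bootstrap might be done
    as follows (the <code>sphinx3</code> directory is the one created
    by the checkout); the build then proceeds with the
    <code>configure</code>/<code>make</code> steps listed above:
    </P>
    <pre>
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/sphinx3
cd sphinx3
./autogen.sh ; ./autogen.sh</pre>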


    <div class="endsec">
      &curren;
      <a href="#sec_compile">Back to top of this section</a>
    </div>
    
    
    
    <H2><A NAME="sec_exec"><u>Running s3.X</u></A></H2>
    
    <P>Running the decoder is simply a matter of invoking the binary
    (<em>i.e.</em>, <code>sphinx3_decode</code>, <code>sphinx3_livedecode</code> or
    <code>sphinx3_livepretend</code>), with a number of command line arguments
    specifying the various input files, as well as decoding
    configuration parameters. <code>sphinx3_decode</code> and
    <code>sphinx3_livepretend</code> require a control file, the directory
    where the audio files are available, and a file containing the
    configuration arguments. <code>sphinx3_livedecode</code>, which runs live,
    requires only the file with the arguments.
    </p>
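    <P>For example, assuming a control file <code>test.ctl</code>, a
    directory of audio files, and an argument file
    <code>args.cfg</code> (all three names are placeholders), the
    invocations would look roughly like:
    </P>
    <pre>
sphinx3_livepretend test.ctl /path/to/audio args.cfg
sphinx3_livedecode  args.cfg</pre>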

    <P>Invoking the binary without any argument produces a <a
    href="cmdhelp.txt">help message</a> with short descriptions of all
    the configuration arguments.
    </P>
    
    
    
    <H3><A name="sec_args_overview"><U>Configuration Arguments Overview</U></a></H3>
    
    <P>This section gives a brief overview of the main configuration
    arguments.  They are broken down into separate groups, based on
    whether they are the primary flags specifying input and output
    data, arguments for optional configuration, or for performance
    tuning.
    </P>
    
    <P>Note that not all the available configuration arguments are
    covered below.  There are a few additional and undocumented flags,
    intended mainly for debugging purposes.
    </P>
    
    <H4><A name="sec_flags_primary"><U>Primary Flags</U></a></H4>
    
    <P>Many of the flags have reasonable defaults.  The ones that a
    user minimally needs to provide are the input and output databases
    or files, which have been discussed <a
    href="#sec_decoverview">above</a>:
    </P>
    <table cellpadding="8">
      <tr>
	<td><UL><LI><code>-mdef</code></LI></UL></td>
	<td><a href="#sec_am">Model definition</a> input file</td>
      </tr>
      
      <tr>
	<td>
	  <UL>
	    <LI><code>-mean</code></LI>
	    <LI><code>-var</code></LI>
	    <LI><code>-mixw</code></LI>
	    <LI><code>-tmat</code></LI>
	    <LI><code>-subvq</code></LI>
	    <LI><code>-hmm</code></LI>
	  </UL>
	</td>

	<td><a href="#am_files">Acoustic model</a> files.  One could
	conveniently specify the acoustic model by simply specifying
	the -hmm option.  The default model file names for the
	components HMMs are <code> means </code>, <code> variances
	</code>, <code> mixture_weights </code>, <code>
	transition_matrices </code> and <code> mdef </code> (the model
	definition).
	</td>
      </tr>
      
      <tr>
	<td>
	  <UL>
	    <LI><code>-dict</code></LI>
	    <LI><code>-fdict</code></LI>
	  </UL>
	</td>
	<td><a href="#dict_main_filler">Main and filler lexicons</a></td>
      </tr>
      
      <tr>
	<td>
	<UL>
	<LI><code>-lm</code></LI>
	<LI><code>-lmctlfn</code></LI>
	<LI><code>-lminmemory</code></LI>
	<LI><code>-lmname</code></LI>
	</UL>
	</td>

	<td>The language model can be specified as a <a
	href="#lm_dumpfile">binary dump file</a>
	or a text file using <code> -lm </code>.  A set of class-based
	LMs can be specified with <code> -lmctlfn </code>.  By default,
	the LM is accessed mainly from disk through a caching
	mechanism; this can be switched to fully in-memory operation
	with the <code> -lminmemory </code> option.</td>
      </tr>
      
      <tr>
	<td>
	  <UL>
	    <LI><code>-fillpen</code></LI>
	    <LI><code>-fillprob</code></LI>
	    <LI><code>-silprob</code></LI>
	  </UL>
	</td>
	<td><a href="#lm_filler">Filler word</a> probabilities</td>
      </tr>
      
      <tr>
	<td><UL>
	<LI><code>-hypseg</code></LI>
	<LI><code>-hypsegfmt</code></LI>
	<LI><code>-hypsegscore_unscale</code></LI>
	</UL></td>
	<td>Output <a href="#sec_hypseg">hypothesis file</a> with detailed scores and timing.  <code> -hypsegfmt </code> can be used to select the Sphinx-2, Sphinx-3 or NIST CTM output format.</td>
      </tr>

      <tr>
	<td><UL><LI><code>-hyp</code></LI></UL></td>
	<td>Output <a href="#sec_hypseg">hypothesis file</a> without detailed scores and timing</td>
      </tr>

    </table>
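    <P>As a sketch, a minimal argument file covering the primary
    flags might contain one flag and its value per line, e.g. (all
    paths and file names below are placeholders):
    </P>
    <pre>
-hmm    /path/to/acoustic_model_dir
-dict   main.dict
-fdict  filler.dict
-lm     my.lm.DMP
-ctl    test.ctl
-cepdir /path/to/cepstra
-hyp    test.hyp</pre>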
    
    
    <H4><A name="sec_flags_config"><U>Additional Configuration Flags</U></a></H4>
    
    <P>It may often be necessary to provide additional parameters to
    obtain the right decoder configuration:
    </P>
    <table cellpadding="8">
      <tr>
	<td>
	  <UL>
	    <LI><code>-cmn</code></LI>
	    <LI><code>-agc</code></LI>
	    <LI><code>-varnorm</code></LI>
	    <LI><code>-lowerf</code></LI>
	    <LI><code>-upperf</code></LI>
	    <LI><code>-nfilt</code></LI>
	    <LI><code>-samprate</code></LI>
	  </UL>
	</td>
	<td><a href="#am_feature">Feature type</a> configuration</td>
      </tr>
      
      <tr>
	<td><UL><LI><code>-cepdir</code></LI></UL></td>
	<td>Directory prefix for cepstrum files specified in the
	  <a href="#sec_ctl">control file</a>, ignored in <code>sphinx3_livedecode</code> and <code>sphinx3_livepretend</code></td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-ctl</code></LI>
	    <LI><code>-ctl_lm</code></LI>
	    <LI><code>-ctl_mllr</code></LI>
	  </UL>
	</td>
	<td>Specify the control file, and companion control files
	giving the LM and the MLLR transform to be used for each
	utterance. </td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-mllr</code></LI>
	    <LI><code>-cb2mllr</code></LI>
	  </UL>
	</td>
	<td> The MLLR regression matrix and the senone-to-regression-matrix
	mapping </td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-bestpath</code></LI>
	    <LI><code>-bestpathlw</code></LI>
	  </UL>
	</td>
	<td> Applicable to mode 3 (flat-lexicon) and mode 4
	(tree-lexicon) search.  These control the second-pass
	best-path search over the word lattice. </td>
      </tr>
      
      <tr>
	<td>
	  <UL>
	    <LI><code>-ctloffset</code></LI>
	    <LI><code>-ctlcount</code></LI>
	  </UL>
	</td>
	<td>Selecting a portion of the <a href="#sec_ctl">control
	file</a> to be processed</td>
      </tr>
      
      <tr>
	<td>
	  <UL>
	    <LI><code>-outlatdir</code></LI>
	    <LI><code>-latext</code></LI>
	  </UL>
	</td>
	<td>Directory, file-extension for <a href="#sec_wordlat">word
	lattices</a> output</td>
      </tr>
    </table>

    
    
    <H4><A name="sec_flags_tune"><u>Performance Tuning Flags</u></a></H4>

    <P>In yet other cases, it may be necessary to tune the following
    parameters to obtain the optimal computational efficiency or
    recognition accuracy:
    </P>
    <table cellpadding="8">
      <tr>
	<td>
	  <UL>
	    <LI><code>-beam</code></LI>
	    <LI><code>-pbeam</code></LI>
	    <LI><code>-wbeam</code></LI>
	    <LI><code>-subvqbeam</code></LI>
	  </UL>
	</td>
	<td><a href="#sec_dec_prune">Beam pruning</a> parameters</td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-maxwpf</code></LI>
	    <LI><code>-maxhistpf</code></LI>
	    <LI><code>-maxhmmpf</code></LI>
	  </UL>
	</td>
	<td><a href="#sec_dec_prune">Absolute pruning</a> parameters</td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-ci_pbeam</code></LI>
	    <LI><code>-max_cdsenpf</code></LI>
	    <LI><code>-ds</code></LI>
	  </UL>
	</td>
	<td><a href="#sec_gmm_compute">Fast GMM Computation</a> parameters</td>
      </tr>

      <tr>
	<td>
	  <UL>
	    <LI><code>-lw</code></LI>
	    <LI><code>-wip</code></LI>
	  </UL>
	</td>
	<td><a href="#lm_lw_wip">Language weight, word insertion penalty</a></td>
      </tr>
    
      <tr>
	<td><UL><LI><code>-Nlextree</code></LI></UL></td>
	<td>Number of lexical tree instances</td>
      </tr>
    </table>
    
    
    
    <H3><A name="sec_dec_op"><U>Decoder Operation</U></a></H3>
    
    <P>This section is a bit of a mish-mash; its contents probably
    belong in an FAQ section.  But, hopefully, through this section a
    newcomer to Sphinx can get an idea of the structure, capabilities,
    and limitations of the s3.X decoder.
    </P>



    <H4><A name="sec_dec_init"><U>Initialization</U></a></H4>
    
    <P>The decoder is configured during the initialization step, and
    the configuration holds for the entire run.  This means, for
    example, that the decoder does not dynamically reconfigure the
    acoustic models to <em>adapt</em> to the input.  To choose another
    example, there is no mechanism in this decoder to switch language
    models from utterance to utterance, unlike in Sphinx-II.
    The main initialization steps are outlined below.
    </P>

    <P><b>Log-Base Initialization.</b> Sphinx performs all likelihood
    computations in the log-domain.  Furthermore, for computational
    efficiency, the <em>base</em> of the logarithm is chosen such that
    the likelihoods can be maintained as 32-bit integer values.  Thus,
    all the scores reported by the decoder are <em>log-likelihood</em>
    values in this peculiar log-base.  The default base is typically
    1.0003, and can be changed using the <code>-logbase</code>
    configuration argument.  The main reason for modifying the
    log-base would be to control the length (duration) of an input
    utterance before the accumulated log-likelihood values overflow
    the 32-bit representation, causing the decoder to fail
    catastrophically.  The log-base can be changed over a wide range
    without affecting the recognition.
    </P>
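    <P>For example, with the default log-base of 1.0003, a raw
    likelihood of 10<sup>-6</sup> corresponds to a log-likelihood of
    roughly log(10<sup>-6</sup>)/log(1.0003) &asymp; -46,000, which
    fits comfortably within a 32-bit integer.
    </P>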

    <P><b>Models Initialization.</b> The lexical, acoustic, and
    language models specified via the configuration arguments are
    loaded during initialization.  This set of models is used to
    decode all the utterances in the input.  (The language model is
    actually only partly loaded, since s3.X uses a <a
    href="#lm_dumpfile">disk-based LM</a> strategy.)
    </P>

    <P><b>Effective Vocabulary.</b> After the models are loaded,
    the <em>effective vocabulary</em> is determined.  It is the set of
    words that the decoder is capable of recognizing.  Recall that the
    decoder is initialized with three sources of words: the <a
    href="#dict_main_filler">main and filler lexicon</a> files, and
    the <a href="#sec_lm">language model</a>.  The effective
    vocabulary is determined from them as follows:
    </P>
    <OL>
      <LI>Find the intersection of the words in the LM and the main
      pronunciation lexicon</LI>

      <LI>Include all the alternative pronunciations to the set
      derived above (using the main lexicon)</LI>

      <LI>Include all the filler words from the filler lexicon, but
      excluding the distinguished beginning and end of sentence words:
      <code>&lt;s&gt;</code> and <code>&lt;/s&gt;</code>.</LI>
    </OL>
    <P>The effective vocabulary remains in effect throughout the batch
    run.  It is not possible to add to or remove from this vocabulary
    dynamically, unlike in the Sphinx-II system.
    </P>

    <P><b>Lexical Tree Construction.</b> The decoder constructs
    <em>lexical trees</em> from the effective vocabulary described
    above.  Separate trees are constructed for words in the <a
    href="#dict_main_filler">main and filler lexicons</a>.
    Furthermore, several copies may be instantiated for the two,
    depending on the <code>-Nlextree</code> configuration argument.
    Further details of the lexical tree construction are available on
    the PowerPoint <a href="s3-2.ppt">slides</a>.
    </P>



    <H4><A name="sec_ctl_process"><U>Control File Processing</U></a></H4>

    <P>Following initialization, <code>sphinx3_decode</code> and
    <code>sphinx3_livepretend</code> process the entries in the control file
    sequentially, one at a time.  It is possible to process a
    contiguous subset of the control file, using the
    <code>-ctloffset</code> and <code>-ctlcount</code> flags, as
    mentioned earlier.  There is no learning or <em>adaptation</em>
    capability as decoding progresses.  Since <code>sphinx3_livepretend</code>
    behaves as if the files were being spoken at the time of
    processing, rearranging the order of the entries in the control
    file may affect the individual results, but this change may be
    imperceptible if the environment in which the files were recorded
    remains constant. The order of entries in the control file does
    not affect <code>sphinx3_decode</code>.
    </P>



    <H4><A name="sec_dec_prune"><U>Pruning</U></a></H4>

    <P>Each entry in the control file, or utterance, is processed
    using the given input models, and using the <em>Viterbi search
    algorithm</em>.  In order to constrain the active search space to
    computationally manageable limits, <em>pruning</em> is employed,
    which means that the less promising hypotheses are continually
    discarded during the recognition process.  There are two kinds of
    pruning in s3.X, <em>beam pruning</em> and <em>absolute
    pruning</em>.
    </P>

    <P><b>Beam Pruning.</b> Each utterance is processed in a
    <em>time-synchronous</em> manner, one frame at a time.  At each
    frame the decoder has a number of currently <em>active</em> HMMs
    to match with the next frame of input speech.  But it first
    discards or deactivates those whose state likelihoods are below
    some <em>threshold</em>, relative to the best HMM state likelihood at
    that time.  The threshold value is obtained by
    <em>multiplying</em> the best state likelihood by a fixed
    <em>beamwidth</em>.  The beamwidth is a value between 0 and 1, the
    former permitting all HMMs to survive, and the latter permitting
    only the best scoring HMMs to survive.
    </P>
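    <P>For example, with <code>-beam 1e-60</code> (a value used in
    the tuning recipe later in this document), an HMM remains active
    only if its best state likelihood is within a factor of
    10<sup>-60</sup> of the best overall state likelihood in that
    frame.
    </P>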
    
    <P>Similar beam pruning is also used in a number of other
    situations in the decoder, e.g., to determine the candidate words
    recognized at any time, or to determine the component densities in
    a mixture Gaussian that are closest to a given speech feature
    vector.  The various beamwidths have to be determined empirically
    and are set using <a href="#sec_flags_tune">configuration
    arguments</a>.
    </P>

    <P><b>Absolute Pruning.</b> Even with beam pruning, the number of
    active entities can sometimes become computationally overwhelming.
    If there are a large number of HMMs that fall within the pruning
    threshold, the decoder will keep all of them active.  However,
    when the number of active HMMs grows beyond certain limits, the
    chances of detecting the correct word among the many candidates
    are considerably reduced.  Such situations can occur, for example,
    if the input speech is noisy or quite mismatched to the acoustic
    models.  In such cases, there is no point in allowing the active
    search space to grow to arbitrary extents.  It can be contained
    using pruning parameters that limit the <em>absolute number</em>
    of active entities at any instant.  These parameters are also
    determined empirically, and set using <a
    href="#sec_flags_tune">configuration arguments</a>.
    </P>

    <H4><A name="sec_gmm_compute"><U>(After s3.4) Fast GMM
    Computation</U></a></H4>

    <p> The computation of Gaussian likelihoods can be one of the
    dominant factors in the overall decoding time.  Tuning the
    following parameters controls the amount of time spent on GMM
    computation.  </p>

    <ul>
      <LI><code>-ci_pbeam</code>: Enable a two-pass computation in
      which the CI models are computed first and the CD models are
      computed afterwards.  If this beam is used, only those CD models
      whose corresponding CI models fall within the beam (relative to
      the best CI score) are computed. </LI>
      <LI><code>-maxcdsenpf</code>: Similar to <code> -ci_pbeam
      </code>, but the cutoff is decided by an absolute number of
      senones computed. </LI>
      <LI><code>-ds</code>: Enable frame down-sampling: GMM scores are
      computed for only one out of every N frames. </LI>
    </ul>

    <H4><U>Output Generation</U></H4>

    <P>During recognition, the decoder builds an internal
    <em>backpointer table</em> data structure, from which the final
    outputs are generated.  This table records all the candidate words
    recognized during decoding, and their attributes such as their
    time segmentation, acoustic and LM likelihoods, as well as their
    predecessor entries in the table.  When an utterance has been
    fully processed, the best <a href="#sec_hypseg">recognition
    hypothesis</a> is extracted from this table.  Optionally, the
    table is also converted into a <a
    href="#sec_wordlat">word-lattice</a> and written out to a file.
    </P>
    
    <P>More information on the backpointer table is available in the
    PowerPoint <a href="s3-2.ppt">slides</a>.
    </P>


    <H4><U>Miscellaneous Issues</U></H4>

    <P><b>Role of <code>&lt;s&gt;</code> and
    <code>&lt;/s&gt;</code>.</b> The distinguished
    <em>beginning-of-sentence</em> and <em>end-of-sentence</em> tokens
    <code>&lt;s&gt;</code> and <code>&lt;/s&gt;</code> are not in the
    effective vocabulary, and no part of the input speech is decoded
    into either of them.  They are merely anchors at the ends of each
    utterance, and provide context for the LM.  This is in contrast to
    earlier versions of Sphinx, which required some silence at either
    end of each speech utterance, to be decoded into these tokens.
    </P>

    
    
    <H3><A name="sec_dec_tune"><U>Performance Tuning</U></a></H3>
    
    <P>To obtain the best recognition performance, it is necessary to
    select the appropriate front-end and feature type computation,
    train the various models, as well as tune the decoder
    configuration parameters.  This section deals with the last issue.
    There are mainly two groups of parameters to be tuned, pertaining
    to <a href="#sec_tune_prune">pruning</a> and <a
    href="#sec_tune_lw">LM</a>.  Unfortunately, there are no automatic
    methods for determining the values of these parameters; it is
    necessary to derive them by trial and error.  Additionally, the
    following points should be kept in mind with regard to the pruning
    parameters:
    </P>
    <UL>
      <LI>The pruning parameters need to be tuned whenever the
      acoustic model is changed.</LI> <P></P>

      <LI>Changing the LM related parameters affects the effective
      pruning behaviour.  The pruning parameters ought to be re-tuned
      after the former have been tuned (although this step is often
      skipped in practice).</LI> <P></P>

      <LI>For computational efficiency, the beamwidth parameter values
      should be as narrow as possible (values closer to 1.0 are
      narrower), and the absolute pruning parameter values should be
      as small as possible.</LI> <P></P>

      <LI>But, for recognition accuracy, the pruning parameters should
      be as relaxed as possible.  (However, relaxing the beamwidth
      parameters too much can actually <em>worsen</em> recognition
      accuracy.  The reasons for such perverse behaviour are not quite
      understood.)</LI>
    </UL>



    <H4><A name="sec_tune_prune"><U>Tuning the Pruning Behaviour</U></a></H4>

    <P>The pruning parameters are the following:</P>
    <UL>
      <LI><code>-beam</code>: Determines which HMMs remain active at
      any given point (frame) during recognition.  (Based on the best
      state score within each HMM.)
      </LI>

      <LI><code>-pbeam</code>: Determines which active HMM can
      transition to its successor in the lexical tree at any point.
      (Based on the exit state score of the source HMM.)
      </LI>

      <LI><code>-wbeam</code>: Determines which words are recognized
      at any frame during decoding.  (Based on the exit state scores
      of leaf HMMs in the lexical trees.)</LI>

      <LI><code>-maxhmmpf</code>: Determines the number of HMMs
      (approx.) that can remain active at any frame.</LI>

      <LI><code>-maxwpf</code>: Controls the number of distinct words
      recognized at any given frame.</LI>

      <LI><code>-maxhistpf</code>: Controls the number of distinct
      word histories recorded in the backpointer table at any given
      frame.</LI>

      <LI><code>-subvqbeam</code>: For each <a
      href="#sec_am">senone</a> and its underlying acoustic model,
      determines its active mixture components at any frame.</LI>
    </UL>
    
    
    <P>In order to determine the pruning parameter values empirically,
    it is first necessary to obtain a <em>test set</em>,
    <em>i.e.</em>, a collection of test sentences not used in any
    training data.  The test set should be sufficiently large to
    ensure statistically reliable results.  For example, a
    large-vocabulary task might require a test set that includes a
    half-hour of speech, or more.
    </P>
    
    <P>It is difficult to tune a handful of parameters simultaneously,
    especially when the input models are completely new.  The
    following steps may be followed to deal with this complex problem.
    </P>
    <OL>
      <LI>To begin with, set the absolute pruning parameters to large
      values, making them essentially ineffective.  Set both
      <code>-beam</code> and <code>-pbeam</code> to
      <code>1e-60</code>, and <code>-wbeam</code> to
      <code>1e-30</code>.  Set <code>-subvqbeam</code> to a small
      value (e.g., the same as <code>-beam</code>).  Run the decoder
      on the chosen test set and obtain accuracy results.  (Use
      default values for the <a href="#sec_tune_lw">LM related
      parameters</a> when tuning the pruning parameters for the first
      time.)
      </LI>
      <P></P>
    
      <LI>Repeat the decoder runs, varying <code>-beam</code> up and
      down, until the setting for best accuracy is identified.  (Keep
      <code>-pbeam</code> the same as <code>-beam</code> every time.)
      </LI>
      <P></P>
    
      <LI>Now vary <code>-wbeam</code> up and down and identify its
      best possible setting (keeping <code>-beam</code> and
      <code>-pbeam</code> fixed at their most recently obtained
      value).
      </LI>
      <P></P>
    
      <LI>Repeat the above two steps, alternately optimizing
      <code>-beam</code> and <code>-wbeam</code>, until convergence.
      Note that during these iterations <code>-pbeam</code> should
      always be the same as <code>-beam</code>.  (This step can be
      omitted if the accuracy attained after the first iteration is
      acceptable.)
      </LI>
      <P></P>
    
      <LI>Gradually increase <code>-subvqbeam</code> (<em>i.e.</em>,
      towards 1.0 for a narrower setting), stopping when recognition
      accuracy begins to drop noticeably.  Values near the default are
      reasonable.  (This step is needed only if a <a
      href="#am_subvq">sub-vector quantized</a> model is available for
      speeding up acoustic model evaluation.)
      </LI>
      <P></P>
    
      <LI>Now gradually increase <code>-pbeam</code> (<em>i.e.</em>,
      towards 1.0), stopping when recognition accuracy begins to drop
      noticeably.  (This step is optional; it mainly optimizes the
      computational effort a little more.)
      </LI>
      <P></P>
    
      <LI>Reduce <code>-maxhmmpf</code> gradually until accuracy
      begins to be affected.  Repeat the process with
      <code>-maxwpf</code>, and then with <code>-maxhistpf</code>.
      (However, in some situations, especially when the vocabulary
      size is small, it may not be necessary to tune these absolute
      pruning parameters.)
      </LI>
    </OL>
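    <P>As a concrete starting point for step 1 above, the relevant
    lines of the argument file might look like the following sketch;
    the large absolute-pruning values are arbitrary placeholders
    whose only purpose is to make absolute pruning ineffective:
    </P>
    <pre>
-beam      1e-60
-pbeam     1e-60
-wbeam     1e-30
-subvqbeam 1e-60
-maxhmmpf  100000
-maxwpf    100000
-maxhistpf 100000</pre>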

    <P>In practice, it may not always be possible to follow the above
    steps strictly.  For example, considerations of computational cost
    might dictate that the absolute pruning parameters or the
    <code>-subvqbeam</code> parameter be tuned earlier in the
    sequence.
    </P>



    <H4><A name="sec_tune_lw"><U>Tuning Language Model Related
    Parameters</U></a></H4>

    <P>The parameters to be tuned are the following:</P>
    <UL>
      <LI><code>-lw</code>:  The <a href="#lm_lw_wip">language weight</a>.
      </LI>

      <LI><code>-wip</code>: The <a href="#lm_lw_wip">word insertion
      penalty</a>.
      </LI>
    </UL>
    <P>Like the pruning parameters, the above two are tuned on a test
    set.  Since the decoder is much more sensitive to the language
    weight, that is typically tuned first, using the default word
    insertion penalty.  The latter is then tuned.  It is usually not
    necessary to repeat the process.
    </P>
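    <P>In practice this amounts to a simple sweep: decode the test
    set several times with the language weight varied while
    <code>-wip</code> is held at its default, then repeat for
    <code>-wip</code> with the best <code>-lw</code> fixed.  The
    values below are purely illustrative placeholders:
    </P>
    <pre>
sphinx3_decode ... -lw  8.0
sphinx3_decode ... -lw  9.0
sphinx3_decode ... -lw 10.0</pre>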



    
    <H3><A name="sec_dec_errors"><U>Some Common Errors and Failure
    Modes</U></a></H3> 
    <P>To be completed.</P>
    <div class="endsec">
      &curren;
      <a href="#sec_exec">Back to top of this section</a>
    </div>
    

    
    
    <H2><a name="sec_dict"><U>Pronunciation Lexicon</U></a></H2>
    <div class="wheatbox">
      <UL>
	<LI><A HREF="#dict_struct">Lexicon Structure</A>
	  <UL>
	    <LI><A HREF="#dict_multipron">Multiple Pronunciations</a></LI>
	    <LI><A HREF="#dict_compwd">Compound Words</a></LI>
	  </UL>
	</LI>
	<LI><A HREF="#dict_main_filler">Main and Filler Lexicons</A></LI>
      </UL>
    </div>

    <H3><a name="dict_struct"><U>Lexicon Structure</U></a></H3>

    <P>A pronunciation lexicon (or dictionary) file specifies word
    pronunciations.  In Sphinx, pronunciations are specified as a
    linear sequence of <em>phonemes</em>.  Each line in the file
    contains one pronunciation specification, except that any line
    that begins with a "#" character <u>in the first column</u> is
    treated as a comment and is ignored.  Example dictionary for
    digits:
    </P>
    <pre>
ZERO               Z IH R OW
ONE                W AH N
TWO                T UW
THREE              TH R IY
FOUR               F AO R
FIVE               F AY V
SIX                S IH K S
SEVEN              S EH V AX N
EIGHT              EY TD
NINE               N AY N</pre>

    <P>The lexicon is completely <em>case-insensitive</em>
    (unfortunately).  For example, it's not possible to have two
    different entries <code>Brown</code> and <code>brown</code> in the
    dictionary.
    </P>
    
    <H4><a name="dict_multipron"><U>Multiple Pronunciations</U></a></H4>
    
    <P>A word may have more than one pronunciation, each one on a
    separate line.  They are distinguished by a unique parenthesized
    suffix for the word string.  For example:
    </P>

    <pre>
ACTUALLY          AE K CH AX W AX L IY
ACTUALLY(2nd)       AE K SH AX L IY
ACTUALLY(3rd)       AE K SH L IY</pre>

    <P>If a word has more than one pronunciation, its first appearance
    must be the unparenthesized form.  For the rest, the parenthesized
    suffix may be any string, as long as it is unique for that word.
    There is no other significance to the order of the alternatives;
    each one is considered to be equally likely.
    </P>
    
    <H4><a name="dict_compwd"><U>Compound Words</U></a></H4>
    
    <P>In Sphinx-3, the lexicon may also contain <em>compound
    words</em>.  A compound word is usually a short phrase whose
    pronunciation happens to differ significantly from the mere
    concatenation of the pronunciations of its constituent words.
    Compound word tokens are formed by concatenating the component
    word strings with an underscore character; e.g.:
    </P>

    <pre>WANT_TO           W AA N AX</pre>

    <P>(The s3.X decoder, however, treats a compound word as just
    another word in the language, and does not do anything special
    with it.)
    </P>
    
    
    <H3><a name="dict_main_filler"><U>Main and Filler Lexicons</U></a></H3>
    
    <P>The Sphinx-3 decoders actually need two separate lexicons: a
    "regular" one containing the words in the language of interest,
    and also a <em>filler</em> or <em>noise</em> lexicon.  The latter
    defines "words" not in the language.  More specifically, it
    defines legal "words" that do not appear in the language model
    used by the decoder, but are nevertheless encountered in normal
    speech.  This lexicon must include the <em>silence word</em>
    <code>&lt;sil&gt;</code>, as well as the special
    <em>beginning-of-sentence</em> and <em>end-of-sentence</em> tokens
    <code>&lt;s&gt;</code>, and <code>&lt;/s&gt;</code>, respectively.
    All of them usually have the silence-phone <code>SIL</code> as
    their pronunciation.  In addition, this lexicon may also contain
    "pronunciations" for other noise event words such as breath noise,
    "UM" and "UH" sounds made during spontaneous speech, etc.
    </P>
    <div class="endsec">
      &curren;
      <a href="#sec_dict">Back to top of this section</a>
    </div>
    
    

    <H2><a name="sec_am"><U>Acoustic Model</U></a></H2>
    <div class="wheatbox">
      <UL>
	<LI><A HREF="#am_intro">Introduction</A></LI>
	<LI><A HREF="#am_feature">Acoustic Features Computation</A></LI>
	<LI><A HREF="#am_training">Acoustic Model Training</A></LI>
	<LI><A HREF="#am_struct">Model Structures</A></LI>
	<LI><A HREF="#am_subvq">Sub-Vector Quantized Models</A></LI>
	<LI><A HREF="#am_files">Model Files</A></LI>
      </UL>
    </div>
    

    <H3><a name="am_intro"><U>Introduction</U></a></H3>

    <P>Sphinx-3 is based on <em>subphonetic acoustic models</em>.
    First, the basic sounds in the language are classified into
    phonemes or <em>phones</em>.  There are roughly 50 phones in the
    English language.  For example, here is a pronunciation for the
    word <code>LANDSAT</code>:
    </P>
    <pre>L  AE  N  D  S  AE  TD</pre>
    
    <P>Phones are then further refined into context-dependent
    <em>triphones</em>, <em>i.e.</em>, phones occurring in given left
    and right phonetic contexts.  The reason is that the same phone
    within different contexts can have widely different acoustic
    manifestations, requiring separate acoustic models.  For example,
    the two occurrences of the <code>AE</code> phone above have
    different contexts, only the first of which is nasal.
    </P>
    
    <P>In contrast to triphones, a phone considered without any
    specific context is referred to as a <em>context-independent</em>
    phone or <em>basephone</em>.  Note also that context-dependency
    gives rise to the notion of <em>cross-word</em> triphones.  That
    is, the left context for the leftmost basephone of a word depends
    on what was the previous word spoken.
    </P>
    
    <P>Phones are also distinguished according to their position
    within the word: beginning, end, internal, or single (abbreviated
    <code>b</code>, <code>e</code>, <code>i</code> and <code>s</code>,
    respectively).  For example, in the word <code>MINIMUM</code> with
    the following pronunciation:
    </P>
    <pre>M IH N AX M AX M</pre> 

    <P>the three occurrences of the phone <code>M</code> have three
    different position attributes.  The <code>s</code> attribute
    applies if a word has just a single phone as its pronunciation.
    </P>
    
    <P>For most applications, one builds acoustic models for
    triphones, qualified by the four position attributes.  (This
    provides far greater modelling detail and accuracy than if one
    relies on just basephone models.)  Each triphone is modelled by a
    <em>hidden Markov model</em> or <em>HMM</em>.  Typically, 3 or 5
    state HMMs are used, where each state has a statistical model for
    its underlying acoustics.  But if we have 50 basephones, with 4
    position qualifiers and 3-state HMMs, we end up with a total of
    50<sup>3</sup>*4*3 distinct HMM states!  Such a model set would be
    too large and impractical to train.  To keep things manageable,
    HMM states are <em>clustered</em> into a much smaller number of
    groups.  Each such group is called a <em>senone</em> (in Sphinx
    terminology), and all the states mapped into one senone share the
    same underlying statistical model.  (The clustering of HMM states
    into senones is described in Mei-Yuh Hwang's PhD Thesis.)
    </P>
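    <P>(With 50 basephones, that works out to 50<sup>3</sup> =
    125,000 possible triphones, and 125,000 &times; 4 positions
    &times; 3 states = 1,500,000 distinct HMM states, which
    illustrates why clustering states into senones is necessary.)
    </P>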
    
    <P>Each triphone also has a <em>state transition probability
    matrix</em> that defines the topology of its HMM.  Once again, to
    conserve resources, there is a considerable amount of sharing.
    Typically, there is one such matrix per basephone, and all
    triphones derived from the same parent basephone share its state
    transition matrix.
    </P>
    
    <P>The information regarding triphones and mapping from triphone
    states to senones and transition matrices is captured in a
    <em>model definition</em>, or <em>mdef</em> input file.
    </P>
    
    
    
    <H3><a name="am_feature"><U>Acoustic Features Computation</U></a></H3>
    
    <P>For various reasons, it is undesirable to build acoustic models
    directly in terms of the raw audio samples.  Instead, the audio is
    processed to extract a vector of relevant features.  All acoustic
    modelling is carried out in terms of such feature vectors.
    </P>
    
    <P>In Sphinx, feature vector computation is a two-stage process.
    An off-line <a href="./s3_fe_spec.pdf"><em>front-end</em></a>
    module is first responsible for processing the raw audio sample
    stream into a <em>cepstral</em> stream, which can then be input to
    the Sphinx software.  The input audio stream consists of 16-bit
    samples, at a sampling rate of 8 or 16 KHz depending on whether
    the input is narrow or wide-band speech.  The input is windowed,
    resulting in <em>frames</em> of duration 25.625 ms. The number of
    samples in a frame depends on the sampling rate. The output is a
    stream of 13-dimensional real-valued <em>cepstrum
    vectors</em>. The frames overlap, thus resulting in a rate of 100
    vectors/sec.
    </P>
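    <P>As a rough illustration of the framing arithmetic above, the
    following Python fragment (an illustrative sketch only, not the
    actual front-end code) computes the per-frame sample counts,
    assuming the standard 10 ms frame shift implied by the 100
    vectors/sec rate:
    </P>
    <pre>
# Illustration only: samples per analysis window and per frame shift.
def frame_sizes(sampling_rate_hz, window_ms=25.625, shift_ms=10.0):
    """Return (samples per window, samples per shift) for a given rate."""
    return (int(sampling_rate_hz * window_ms / 1000.0),
            int(sampling_rate_hz * shift_ms / 1000.0))

print(frame_sizes(16000))   # wide-band:   (410, 160) -> 100 frames/sec
print(frame_sizes(8000))    # narrow-band: (205, 80)  -> 100 frames/sec</pre>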
    
    <P>In the second stage, the Sphinx software (both trainer and
    decoder) internally converts the stream of cepstrum vectors into a
    <em>feature stream</em>.  This process consists of the following
    steps:
    </P>
    <OL>
      <LI>An optional <em>cepstrum mean-normalization</em> (CMN) step,
      which itself includes an optional <em>variance
      normalization</em> (VN) step.
      </LI>
      <P></P>

      <LI>An optional <em>automatic gain control</em> (AGC) step, in
      which the signal power component of the cepstral vectors is
      normalized.
      </LI>
      <P></P>

      <LI><em>Feature vector generation</em>.  The final speech
      feature vector is created by typically augmenting the cepstrum
      vector (after CMN and AGC, if any) with one or more time
      derivatives.  In s3.X, the feature vector in each frame is
      computed by concatenating the first and second derivatives to the
      cepstrum vector, giving a 39-dimensional vector (a sketch of this
      computation follows the list):
	<div class="silverbox">
	  <center><IMG ALT="* " SRC="./s3/feat.gif"></center>
	</div>
      </LI>
    </OL>
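    <P>The cepstrum-to-feature conversion described above can be
    sketched in Python as follows (an illustration only, not the code
    in <code>feat.c</code> or <code>cmn.c</code>; the actual front-end
    may use wider delta windows and streaming variants of CMN):
    </P>
    <pre>
import numpy as np

def cmn(cepstra):
    """Cepstral mean normalization: subtract the per-utterance mean."""
    return cepstra - cepstra.mean(axis=0)

def deltas(x):
    """Time derivative, approximated here by a symmetric difference."""
    padded = np.pad(x, ((1, 1), (0, 0)), mode='edge')
    return (padded[2:] - padded[:-2]) / 2.0

def make_feature_stream(cepstra):
    """13-dim cepstra -> 39-dim features: [cep, delta, delta-delta]."""
    c = cmn(cepstra)
    d = deltas(c)
    return np.hstack([c, d, deltas(d)])

cep = np.random.randn(14, 13)          # 14 frames of 13-dim cepstra
print(make_feature_stream(cep).shape)  # (14, 39)</pre>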
    
    
    
    <H3><a name="am_training"><U>Acoustic Model Training</U></a></H3>
    
    <P>This refers to the computation of a (statistical) model for
    each senone in the model set.  As a <u>very rough approximation</u>,
    this process can be described by the following <u>conceptual</u>
    steps:
    </P>
    <OL>
      <LI>Obtain a corpus of training data.  This may include
      thousands of sentences (or <em>utterances</em>, in Sphinx
      jargon), consisting of the spoken text and corresponding audio
      sample stream.
      </LI>
      <P></P>
      
      <LI>For each utterance, convert the audio data to a stream of
      feature vectors as described above.
      </LI>
      <P></P>

      <LI>For each utterance, convert the text into a linear sequence
      of triphone HMMs using the <a href="#sec_dict">pronunciation
      lexicon</a>.  (This is usually called the <em>sentence
      HMM</em>.)
      </LI>
      <P></P>

      <LI>For each utterance, find the best <em>state sequence</em> or
      <em>state alignment</em> through the sentence HMM, for the
      corresponding feature vector sequence.  For example, the figure
      below shows a single HMM with 3 states (using senones 0, 1, 2),
      and an utterance of 14-frames of feature vectors.  The figure
      also shows a sample HMM-state (senone) sequence: each feature
      frame is labelled with a senone ID.
	<div class="silverbox">
	  <center><IMG ALT="* " SRC="./s3/falign.gif"></center>
	</div>
      The best state sequence is one with the <em>smallest
      mismatch</em> between the input feature vectors and the labelled
      senones' underlying statistical models.
      </LI>
      <P></P>

      <LI>For each senone, gather all the frames in the training
      corpus that mapped to that senone in the above step, and build a
      suitable statistical model for the corresponding collection of
      feature vectors.
      </LI>
    </OL>
    <P>Note that there is a circularity in the above description.  We
    wish to train the senone models, but in the penultimate step, we
    need the senone models to compute the best possible state
    alignment.  This circularity is resolved by using the iterative
    <em>Baum-Welch</em> or <em>forward-backward</em> training
    algorithm.  The algorithm begins with some initial set of models,
    which could be completely flat, for the senones.  It then repeats
    the last two steps several times.  Each iteration uses the model
    computed at the end of the previous iteration.
    </P>
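    <P>The iterative structure can be sketched as follows (a toy
    Python illustration only, not the Sphinx-3 trainer: each senone is
    reduced to a single mean vector, and the alignment step hard-assigns
    each frame to the nearest of that utterance's senones, whereas the
    real trainer uses forward-backward expected counts and full
    mixture-Gaussian senone models):
    </P>
    <pre>
import numpy as np

def train(utterances, num_senones, dim, iterations=5):
    # utterances: list of (feature_matrix, list_of_senone_ids) pairs
    means = np.zeros((num_senones, dim))                  # flat initial models
    for _ in range(iterations):
        sums = np.zeros((num_senones, dim))
        counts = np.zeros(num_senones)
        for feats, senones in utterances:
            # "alignment": assign each frame to the closest allowed senone
            dists = np.linalg.norm(feats[:, None, :] - means[senones], axis=2)
            assign = np.asarray(senones)[dists.argmin(axis=1)]
            for frame, s in zip(feats, assign):
                sums[s] += frame
                counts[s] += 1
        used = counts > 0
        means[used] = sums[used][:] / counts[used][:, None]   # re-estimation
    return means

# Example: two "utterances" of 39-dim features, 4 senones in all
utts = [(np.random.randn(14, 39), [0, 1, 2]),
        (np.random.randn(20, 39), [1, 2, 3])]
print(train(utts, num_senones=4, dim=39).shape)           # (4, 39)</pre>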
    
    <P>Although not mentioned above, the HMM state-transition
    probability matrices are also trained from the state alignments.
    Acoustic modelling is described in greater detail in the Sphinx-3
    trainer module.
    </P>
    
    
    <H3><a name="am_struct"><U>Model Structures</U></a></H3>
    
    <P>The acoustic models trained as described above can be of
    different degrees of sophistication.  Two forms are commonly used:
    </P>
    <UL>
      <LI><em>continuous</em>, and
      <LI><em>semi-continuous</em> or <em>tied-mixture</em>.
    </UL>
    <P>In a continuous model, each senone has its own, private
    <em>mixture-Gaussian</em> distribution that describes the
    statistics of its underlying speech feature space.  In a
    semi-continuous model, all the senones share a single
    <em>codebook</em> of Gaussian distributions, but each senone has
    its own set of <em>mixture weights</em> applied to the codebook
    components.  Sphinx-3 supports both models, and other,
    intermediate degrees of state-tying as well.  (The s3.X decoder,
    however, can only handle continuous density acoustic models.)
    </P>
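    <P>The difference between the two forms can be seen in a small
    sketch of how one senone's likelihood is computed for one frame
    (an illustration only, using diagonal-covariance Gaussians; the
    actual evaluation in <code>cont_mgau.c</code> works in the log
    domain and with various fast approximations):
    </P>
    <pre>
import numpy as np

def diag_gaussian(x, mean, var):
    """Density of a single diagonal-covariance Gaussian at frame x."""
    return np.exp(-0.5 * np.sum((x - mean) ** 2 / var)) / \
           np.sqrt(np.prod(2.0 * np.pi * var))

def senone_likelihood(x, codebook_means, codebook_vars, mixture_weights):
    """Mixture-Gaussian likelihood of one senone for one frame.
    Continuous models:      the codebook is private to the senone.
    Semi-continuous models: the codebook is shared by all senones, and
                            only the mixture weights differ per senone."""
    return sum(w * diag_gaussian(x, m, v)
               for w, m, v in zip(mixture_weights, codebook_means, codebook_vars))

# Example: a 39-dim frame against an 8-component mixture
x = np.random.randn(39)
means, variances = np.random.randn(8, 39), np.ones((8, 39))
print(senone_likelihood(x, means, variances, np.full(8, 1.0 / 8)))</pre>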
    
    <P>Similarly, Sphinx-3 in general supports "arbitrary" HMM
    topologies, unlike Sphinx-II, which is restricted to a specific
    5-state topology.  However, for efficiency's sake, the s3.X
    decoder is hardwired to deal with only two types of HMM
    topologies: 3-state and 5-state, described briefly in <a
    href="../src/libs3decoder/hmm.h">hmm.h</a>.
    </P>


    <H3><a name="am_subvq"><U>Sub-Vector Quantized Models</U></a></H3>
    
    <P>Continuous density acoustic models are computationally
    expensive to deal with, since they can contain hundreds of
    thousands of Gaussian densities that must be evaluated in each
    frame.  To reduce this cost, one can use an approximate model that
    efficiently identifies the top scoring candidate densities in each
    Gaussian mixture in any given frame.  The remaining densities can
    be ignored during that frame.
    </P>
    
    <P>In Sphinx-3, such an approximate model is built by
    <em>sub-vector quantizing</em> the acoustic model densities.  The
    utility that performs this conversion is included in this
    distribution and is called <code>gausubvq</code>, which stands for
    Gaussian Sub-Vector Quantization.
    </P>
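    <P>The idea behind the approximation can be sketched as follows (a
    conceptual Python illustration only, not the <code>gausubvq</code>
    file format or the decoder's shortlisting code): the feature space
    is split into sub-vectors, each sub-vector gets a small codebook,
    every full-space Gaussian is mapped to one codeword per sub-vector,
    and in each frame only the codewords are scored exactly.
    </P>
    <pre>
import numpy as np

def approx_gaussian_scores(frame, subvec_slices, codebooks, gauss_to_codeword):
    """frame: feature vector; subvec_slices: slices partitioning it;
    codebooks: per sub-vector, an array of codeword means (unit variance
    assumed here); gauss_to_codeword: (num_gauss, num_subvecs) indices."""
    codeword_scores = []
    for sl, cb in zip(subvec_slices, codebooks):
        diffs = cb - frame[sl]                   # score every codeword once
        codeword_scores.append(-0.5 * np.sum(diffs ** 2, axis=1))
    # approximate each Gaussian by the sum of its codewords' scores;
    # only the top-scoring Gaussians are then evaluated exactly
    return sum(scores[gauss_to_codeword[:, j]]
               for j, scores in enumerate(codeword_scores))</pre>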
    
    <P>Note that if the original model consists of mixture Gaussians
    that only contain a few component densities (say, 4 or fewer per
    mixture), a sub-vector quantized model may not be effective in
    reducing the computational load.
    </P>


    
    <H3><a name="am_files"><U>Model Files</U></a></H3>
    
    <P>An acoustic model is represented by the following collection of
    files:
    </P>
    <UL>
      <LI>A <em>model definition</em> (or <em>mdef</em>) file.  It
      defines the set of basephone and triphone HMMs, the mapping of
      each HMM state to a senone, and the mapping of each HMM to a
      state transition matrix.
      </LI>
      <LI>Gaussian <em>mean</em> and <em>variance</em> (or
      <em>mean</em> and <em>var</em>) files.  These files contain all
      the Gaussian codebooks in the model.  The Gaussian means and
      corresponding variance vectors are separated into the two files.
      </LI>
      <LI>A <em>mixture weights</em> (or <em>mixw</em>) file
      containing the Gaussian mixture weights for all the senones in
      the model.
      </LI>
      <LI>A <em>state transition matrix</em> (or <em>tmat</em>) file
      containing all the HMM state transition topologies and their
      transition probabilities in the model.
      </LI>
      <LI>An optional <em>sub-vector quantized model</em> (or
      <em>subvq</em>) file containing an approximation of the acoustic
      model, for efficient evaluation.
      </LI>
    </UL>
    
    <P>The <em>mean</em>, <em>var</em>, <em>mixw</em>, and
    <em>tmat</em> files are produced by the Sphinx-3 trainer, and
    their file formats should be documented there.
    </P>
    <div class="endsec">
      &curren
      <a href="#sec_am">Back to top of this section</a>
    </div>
    
    

    <H2><a name="sec_lm"><U>Language Model</U></a></H2>
    <div class="wheatbox">
      <UL>
	<LI><A HREF="#lm_intro">Introduction</a></LI> 

        <LI><A HREF="#lm_ngrams">Unigrams, Bigrams, Trigrams, LM
        Vocabulary</a></LI>

	<LI><A HREF="#lm_pron_case">Pronunciation and Case
	Considerations</a></LI>

	<LI><A HREF="#lm_dumpfile">Binary LM File</A></LI>

	<LI><A HREF="#lm_filler">Silence and Filler Words</a></LI>

	<LI><A HREF="#lm_lw_wip">Language Weight and Insertion
	Penalty</A></LI>
      </UL>
    </div>
    

    <H3><a name="lm_intro"><U>Introduction</U></a></H3>

    <P>The main language model (LM) used by the Sphinx decoder is a
    conventional bigram or trigram backoff language model.  The <a
    href="http://www.speech.cs.cmu.edu/SLM_info.html"><em>CMU-Cambridge
    SLM toolkit</em></a> is capable of generating such a model from LM
    training data.  Its output is an ASCII text file.  But a large
    text LM file can be very slow to load into memory.  To speed up
    this process, the LM must be compiled into a <a
    href="#lm_dumpfile">binary form</a>.  The code to convert from an
    ASCII text file to the binary format is available at <a
    href="http://www.sourceforge.net/projects/cmusphinx">SourceForge</a>
    in the CVS tree, in a module named <em>share</em>.
    </P>
    
    
    <H3><a name="lm_ngrams"><U>Unigrams, Bigrams, Trigrams, LM
    Vocabulary</U></a></H3>

    <P>A trigram LM primarily consists of the following:</P>
    <UL>
      <LI><em>Unigrams:</em> The entire set of words in this LM, and
      their individual probabilities of occurrence in the language.
      The unigrams must include the special
      <em>beginning-of-sentence</em> and <em>end-of-sentence</em>
      tokens: <code>&lt;s&gt</code>, and <code>&lt;/s&gt</code>
      respectively.</LI> <P></P>
    
      <LI><em>Bigrams:</em> A <em>bigram</em> is mathematically
      <em>P(word2 | word1)</em>.  That is, the <em>conditional
      probability</em> that <em>word2</em> immediately follows
      <em>word1</em> in the language.  An LM typically contains this
      information for some subset of the possible word pairs.  That
      is, not all possible <em>word1 word2</em> pairs need be covered
      by the bigrams.</LI> <P></P>
    
      <LI><em>Trigrams:</em> Similar to a bigram, a <em>trigram</em>
      is <em>P(word3 | word1, word2)</em>, or the conditional
      probability that <em>word3</em> immediately follows a <em>word1
      word2</em> sequence in the language.  Not all possible 3-word
      combinations need be covered by the trigrams.</LI> <P></P>
    </UL>
    <P>The <em>vocabulary</em> of the LM is the set of words covered
    by the unigrams.</P> 

    <P>The LM probability of an entire sentence is the product of the
    individual word probabilities.  For example, the LM probability of
    the sentence <code>"HOW ARE YOU"</code> is:</P>

    <pre>
P(HOW | &lt;s&gt) * 
P(ARE | &lt;s&gt, HOW) * 
P(YOU | HOW, ARE) * 
P(&lt;/s&gt | ARE, YOU)</pre>
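    <P>A minimal Python sketch of this product (an illustration only;
    a real LM also backs off to bigrams and unigrams when a trigram is
    missing, and works in the log domain):
    </P>
    <pre>
import math

def sentence_logprob(words, trigram_prob):
    """Log-probability of a word sequence under a trigram LM.
    trigram_prob(w, h1, h2) is assumed to return P(w | h1, h2)."""
    words = ['&lt;s&gt;'] + words + ['&lt;/s&gt;']
    logp = 0.0
    for i in range(1, len(words)):
        h1 = words[i - 2] if i >= 2 else None
        h2 = words[i - 1]
        logp += math.log(trigram_prob(words[i], h1, h2))
    return logp

# sentence_logprob(['HOW', 'ARE', 'YOU'], my_lm) sums
#   log P(HOW|&lt;s&gt;) + log P(ARE|&lt;s&gt;,HOW)
# + log P(YOU|HOW,ARE) + log P(&lt;/s&gt;|ARE,YOU)</pre>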


    
    <H3><a name="lm_pron_case"><U>Pronunciation and Case
    Considerations</U></a></H3>

    <P>In Sphinx, the LM cannot distinguish between different
    pronunciations of the same word.  For example, even though the
    lexicon might contain two different pronunciation entries for the
    word <code>READ</code> (present and past tense forms), the
    language model cannot distinguish between the two.  Both
    pronunciations would inherit the same probability from the
    language model.
    </P>
    
    <P>Secondly, the LM is <em>case-insensitive</em>.  For example, it
    cannot contain two different tokens <code>READ</code> and
    <code>read</code>.
    </P>

    <P>The reasons for the above restrictions are historical.  Precise
    pronunciation and case information has rarely been present in LM
    training data.  It would certainly be desirable to do away with
    the restrictions at some time in the future.
    </P>

    
    
    <H3><a name="lm_dumpfile"><U>Binary LM File</U></a></H3>
    
    <P>The binary LM file (also referred to as the LM <em>dump</em>
    file) is more or less a disk image of the LM data structure
    constructed in memory.  This data structure was originally
    designed during the Sphinx-II days, when efficient memory usage
    was the focus.  In Sphinx-3, however, memory usage is no longer an
    issue since the binary file enables the decoder to use a
    <em>disk-based LM</em> strategy.  That is, the LM binary file is
    no longer read entirely into memory.  Rather, the portions
    required during decoding are read in on demand, and cached.  For
    large vocabulary recognition, the memory resident portion is
    typically about 10-20% of the bigrams, and 5-10% of the trigrams.
    </P>

    <P>Since the decoder uses a <a href="#lm_dumpfile">disk-based
    LM</a>, it is necessary to have efficient access to the binary LM
    file.  Thus, network access to an LM file at a remote location is
    not recommended.  It is desirable to have the LM file be resident
    on the local machine.
    </P>
      
    <P>The binary dump file can be created from the ASCII form using
    the <code>lm3g2dmp</code> utility, which is part of the Sphinx-II
    distribution, and also available as standalone code, as mentioned
    before.  (The header of the dump file itself contains a brief
    description of the file format.)
    </P>
    

    
    <H3><a name="lm_filler"><U>Silence and Filler Words</U></a></H3>

    <P>Language models typically do not cover acoustically significant
    events such as silence, breath-noise, <em>UM</em> or <em>UH</em>
    sounds made by a person hunting for the right phrase, etc.  These
    are known generally as <em>filler words</em>, and are excluded
    from the LM vocabulary.  The reason is that a language model
    training corpus, which is simply a lot of text, usually does not
    include such information.
    </P>
    
    <P>Since the main trigram LM ignores silence and filler words,
    their "language model probability" has to be specified in a
    separate file, called the <em>filler penalty file</em>.  The
    format of this file is very straightforward; each line contains
    one word and its probability, as in the following example:
    </P>
    <pre>
++UH++      0.10792
++UM++      0.00866
++BREATH++  0.00147</pre>

    <P>The filler penalty file is not required.  If it <em>is</em>
    present, it does not have to contain entries for every filler
    word.  The decoder allows a default value to be specified for
    filler word probabilities (through the <code>-fillprob</code>
    configuration argument), and a default silence word probability
    (through the <code>-silprob</code> argument).
    </P>
    
    <P>Like the main trigram LM, filler and silence word probabilities
    are obtained from appropriate training data.  However, training
    them is considerably easier since they are merely unigram
    probabilities.
    </P>

    <P>Filler words are invisible or <em>transparent</em> to the
    trigram language model.  For example, the LM probability of the
    sentence <code>"HAVE CAR &lt;sil&gt WILL TRAVEL"</code> is:</P>
    <pre>
P(HAVE | &lt;s&gt) * 
P(CAR | &lt;s&gt, HAVE) * 
P(&lt;sil&gt) *
P(WILL | HAVE, CAR) * 
P(TRAVEL | CAR, WILL) * 
P(&lt;/s&gt | WILL, TRAVEL)</pre>


    
    <H3><a name="lm_lw_wip"><U>Language Weight and Word Insertion
    Penalty</U></a></H3>
    
    <P>During recognition the decoder combines both acoustic
    likelihoods and language model probabilities into a single score
    in order to compare various hypotheses.  This combination of the
    two is not just a straightforward product.  In order to obtain
    optimal recognition accuracy, it is usually necessary to
    <em>exponentiate</em> the language model probability using a
    <em>language weight</em> before combining the result with the
    acoustic likelihood.  (Since likelihood computations are actually
    carried out in the log-domain in the Sphinx decoder, the LM weight
    becomes a multiplicative factor applied to LM log-probabilities.)
    </P>
    
    <P>The language weight parameter is typically obtained through
    trial and error.  In the case of Sphinx, the optimum value for
    this parameter has usually ranged between 6 and 13, depending on
    the task at hand.
    </P>
    
    <P>Similarly, though with lesser impact, it has also been found
    useful to include a <em>word insertion penalty</em> parameter
    which is a fixed penalty for each new word hypothesized by the
    decoder.  It is effectively another multiplicative factor in the
    language model probability computation (before the application of
    the language weight).  This parameter has usually ranged between
    0.2 and 0.7, depending on the task.
    </P>
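    <P>In the log domain, the combination described above amounts to
    something like the following (a conceptual Python sketch only, not
    the decoder's code; the decoder uses its own logbase, and the exact
    point at which the insertion penalty is applied may differ):
    </P>
    <pre>
import math

def hypothesis_score(acoustic_logprob, word_lm_probs,
                     language_weight=9.5, word_insertion_penalty=0.7):
    """Combined score: acoustic log-likelihood plus language-weighted LM
    log-probability, with a fixed penalty per hypothesized word."""
    lm_logprob = sum(math.log(p) for p in word_lm_probs)
    penalized = lm_logprob + len(word_lm_probs) * math.log(word_insertion_penalty)
    return acoustic_logprob + language_weight * penalized</pre>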
    <div class="endsec">
      &curren
      <a href="#sec_lm">Back to top of this section</a>
    </div>


    
    <H2><a name="sec_ctl"><U>Speech Input Control File</U></a></H2>
    
    <P>The Sphinx-3 decoder processes entries listed in a <em>control
    file</em>.  Each line in the control file identifies a separate
    <em>utterance</em>.  A line has the following format (the brackets
    indicate a group of fields that is optional):
    </P>
    <pre>
AudioFile [ StartFrame EndFrame UttID ]</pre>

    <P><em>AudioFile</em> is the speech input file. In this
    distribution of s3.X, this file is in raw audio format. In all
    other versions of Sphinx-3, this file contains cepstrum data.  The
    filename extension should be omitted from the specification.  If
    this is the only field in the line, the entire file is processed
    as one utterance.  In this case, an <em>utterance ID</em> string
    is automatically derived from the filename, by stripping any
    leading directory name components from it.  For example, if the
    control file contains the following entries:
    </P>
    <pre>
/net/alf20/usr/rkm/SHARED/cep/nov94/h1_et_94/4t0/4t0c0201
/net/alf20/usr/rkm/SHARED/cep/nov94/h1_et_94/4t0/4t0c0202
/net/alf20/usr/rkm/SHARED/cep/nov94/h1_et_94/4t0/4t0c0203</pre>

    <P>three utterances are processed, with IDs <code>4t0c0201</code>,
    <code>4t0c0202</code>, and <code>4t0c0203</code>, respectively.
    </P>
    
    <P>If, on the other hand, a control file entry includes the
    <em>StartFrame</em> and <em>EndFrame</em> fields, only that
    portion of the cepstrum file is processed.  This form of the
    control file is frequently used if the speech input can be
    arbitrarily long, such as an entire TV news show.  There is one
    big cepstrum file, but it is processed in smaller chunks or
    segments.  In this case, the final <em>UttID</em> field is the
    utterance ID string for the entry.
    </P>
    
    <P>The utterance ID associated with a control file entry is used
    to identify all the output from the decoder for that utterance.
    For example, if the decoder is used to generate <a
    href="#sec_wordlat">word lattice</a> files, they are named using
    the utterance ID.  Hence, each ID, whether automatically derived
    or explicitly specified, should be unique over the entire control
    file.
    </P>

    <P>Any line in the control file beginning with a <code>#</code>
    character is a comment line, and is ignored.
    </P>
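    <P>A control file entry can be interpreted along the following
    lines (a Python sketch only, not the parser in
    <code>corpus.c</code>):
    </P>
    <pre>
import os

def parse_ctl_line(line):
    """Return (file, start_frame, end_frame, uttid) for one control-file
    entry, or None for comment/blank lines."""
    line = line.strip()
    if not line or line.startswith('#'):
        return None
    fields = line.split()
    if len(fields) == 1:
        # whole file as one utterance; ID derived from the filename
        return fields[0], None, None, os.path.basename(fields[0])
    return fields[0], int(fields[1]), int(fields[2]), fields[3]

print(parse_ctl_line("/net/alf20/usr/rkm/SHARED/cep/nov94/h1_et_94/4t0/4t0c0201"))
# ('/net/.../4t0c0201', None, None, '4t0c0201')</pre>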
    <div class="endsec">
      &curren
      <a href="#sec_ctl">Back to top of this section</a>
    </div>
    
    <H2><a name="sec_hypseg"><U>Recognition Hypothesis Output</U></a></H2>
    
    <P>The Sphinx-3 decoder produces a single recognition
    <em>hypothesis</em> for each utterance it processes.  The
    hypotheses for all the utterances processed in a single run are
    written to a single output file, one line per utterance.  The line
    format is as follows:
    </P>

    <pre><em>u</em> S <em>s</em> T <em>t</em> A <em>a</em> L <em>l</em> <em>sf</em> <em>wa</em> <em>wl</em> <em>wd</em> <em>sf</em> <em>wa</em> <em>wl</em> <em>wd</em> ... <em>nf</em></pre>

    <P>The <code>S</code>, <code>T</code>, <code>A</code>, and
    <code>L</code> fields are keywords and appear in the output as
    shown.  The remaining fields are briefly described below:
    </P>
    <UL>
      <LI><em>u</em>: the utterance ID

      <LI><em>s</em>: an acoustic score scaling factor applied during
      acoustic likelihood computation (this field is 0 in the s3.X
      decoder output)

      <LI><em>t</em>: the total score for this hypothesis

      <LI><em>a</em>: the total acoustic score for this hypothesis

      <LI><em>l</em>: the total language model score for this hypothesis
    </UL>
    <P>The <em>l score</em> field is followed by groups of four fields,
    one group for each successive word in the output hypothesis.  The
    four fields are:
    </P>
    <UL>
      <LI><em>sf</em>: Start frame for the word (its end frame is just
      before the start frame of the next word)

      <LI><em>wa</em>: Acoustic score for the word

      <LI><em>wl</em>: LM score for the word

      <LI><em>wd</em>: The word string itself.
    </UL>
    <P>The final field, <em>nf</em>, in each hypothesis line is the
    total number of frames in the utterance.</P>
    
    <P>Note that all scores are <em>log-likelihood</em> values in the
    peculiar logbase used by the decoder.  Secondly, the acoustic
    scores are <em>scaled</em> values; in each frame, the acoustic
    scores of all active <a href="#sec_am">senones</a> are scaled such
    that the best senone has a log-likelihood of 0.  Finally, the
    language model scores reported include the <a
    href="#lm_lw_wip"><em>language weight</em> and <em>word-insertion
    penalty</em></a> parameters.
    </P>
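    <P>The hypothesis line format described above can be unpacked
    along these lines (a Python sketch only, not the decoder's own
    reader; scores are assumed to be integers in the decoder's
    logbase):
    </P>
    <pre>
def parse_hypseg_line(line):
    """Split one hypothesis line into header fields and per-word entries."""
    t = line.split()
    assert t[1] == 'S' and t[3] == 'T' and t[5] == 'A' and t[7] == 'L'
    scale, total, acoustic, lm = (int(t[i]) for i in (2, 4, 6, 8))
    body, nframes = t[9:-1], int(t[-1])
    words = [dict(start_frame=int(body[i]), acoustic_score=int(body[i + 1]),
                  lm_score=int(body[i + 2]), word=body[i + 3])
             for i in range(0, len(body), 4)]
    return dict(uttid=t[0], scale=scale, total=total, acoustic=acoustic,
                lm=lm, words=words, nframes=nframes)</pre>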
    
    <P>Here is an <a href="./s3/hypseg.txt">example hypothesis file</a>
    for three utterances.
    </P>
    <div class="endsec">
      &curren
      <a href="#sec_hypseg">Back to top of this section</a>
    </div>
    

    
    <H2><a name="sec_wordlat"><U>Word Lattice Output</U></a></H2>
    <div class="wheatbox">
      <UL>
	<LI><A HREF="#wordlat_overview">Word Lattice Overview</A></LI>
	<LI><A HREF="#wordlat_format">Word Lattice File Format</A></LI>
      </UL>
    </div>
    

    <h3><a name="wordlat_overview"><U>Word Lattice Overview</U></a></H3>
    
    <P>During recognition the decoder maintains not just the single
    best hypothesis, but also a number of alternatives or candidates.
    For example, <code>REED</code> is a perfectly reasonable
    alternative to <code>READ</code>.  The alternatives are useful in
    many ways: for instance, in N-best list generation.  To facilitate
    such <em>post-processing</em>, the decoder can optionally produce
    a <em>word lattice</em> output for each input utterance.  This
    output records all the candidate words recognized by the decoder
    at any point in time, and their main attributes such as time
    segmentation and acoustic likelihood scores.
    </P>
    
    <P>The term "lattice" is used somewhat loosely.  The word-lattice
    is really a <em>directed acyclic graph</em> or <em>DAG</em>.  Each
    node of the DAG denotes a word instance that begins at a
    particular frame within the utterance.  That is, it is a unique
    <code>&lt;word,start-time&gt</code> pair.  (However, there could
    be a number of end-times for this word instance.  One of the
    features of a time-synchronous Viterbi search using beam pruning
    is that word candidates hypothesized by the decoder have a
    well-defined start-time, but a fuzzy range of end-times.  This is
    because the start-time is primarily determined by <em>Viterbi
    pruning</em>, while the possible end-times are determined by beam
    pruning.)
    </P>
    
    <P>There is a directed edge between two nodes in the DAG if the
    start-time of the destination node immediately follows one of the
    end times of the source node.  That is, the two nodes can be
    adjacent in time.  Thus, the edge determines one possible
    segmentation for the source node: beginning at the source's
    start-time and ending one frame before the destination's
    start-time.  The edge also contains an acoustic likelihood for
    this particular segmentation of the source node.
    </P>
    
    <P><em>Note:</em> The beginning and end of sentence tokens,
    <code>&lt;s&gt</code> and <code>&lt;/s&gt</code>, are not decoded
    as part of an utterance by the s3.X decoder.  However, they have
    to be included in the word lattice file, for compatibility with
    the older Sphinx-3 decoder software.  They are assigned 1-frame
    segmentations, with log-likelihood scores of 0.  To accommodate
    them, the segmentations of adjacent nodes have to be "fudged" by 1
    frame.
    </P>



    <h3><a name="wordlat_format"><U>Word Lattice File Format</U></a></H3>

    <P>A word lattice file essentially contains the above information
    regarding the nodes and edges in the DAG.  It is structured in
    several sections, as follows:
    </P>
    <OL>
      <LI>A comment section, listing important configuration arguments
      as comments</LI>
     
      <LI><code>Frames</code> section, specifying the number of frames
      in utterance</LI>

      <LI><code>Nodes</code> section, listing the nodes in the DAG</LI>

      <LI><code>Initial</code> and <code>Final</code> nodes (for
      <code>&lt;s&gt</code> and <code>&lt;/s&gt</code>,
      respectively)</LI>

      <LI><code>BestSegAscr</code> section, a historical remnant now
      essentially empty</LI>

      <LI><code>Edges</code> section, listing the edges in the DAG</LI>
    </OL>
    
    <P>The file is formatted as follows.  Note that any line in the
    file that begins with the <code>#</code> character in the first
    column is considered to be a comment.
    </P>
    <pre>
# getcwd: &lt;current-working-directory&gt
# -logbase &lt;logbase-in-effect&gt
# -dict &lt;main lexicon&gt
# -fdict &lt;filler lexicon&gt
# ... (other arguments, written out as comment lines)
#
Frames &lt;number-of-frames-in-utterance&gt
#
Nodes &lt;number-of-nodes-in-DAG&gt (NODEID WORD STARTFRAME FIRST-ENDFRAME LAST-ENDFRAME)
&lt;Node-ID&gt &lt;Word-String&gt &lt;Start-Time&gt &lt;Earliest-End-time&gt &lt;Latest-End-Time&gt
&lt;Node-ID&gt &lt;Word-String&gt &lt;Start-Time&gt &lt;Earliest-End-time&gt &lt;Latest-End-Time&gt
&lt;Node-ID&gt &lt;Word-String&gt &lt;Start-Time&gt &lt;Earliest-End-time&gt &lt;Latest-End-Time&gt
... (for all nodes in DAG)
#
Initial &lt;Initial-Node-ID&gt
Final &lt;Final-Node-ID&gt
#
BestSegAscr 0 (NODEID ENDFRAME ASCORE)
#
Edges (FROM-NODEID TO-NODEID ASCORE)
&lt;Source-Node-ID&gt &lt;Destination-Node-ID&gt &lt;Acoustic Score&gt
&lt;Source-Node-ID&gt &lt;Destination-Node-ID&gt &lt;Acoustic Score&gt
&lt;Source-Node-ID&gt &lt;Destination-Node-ID&gt &lt;Acoustic Score&gt
... (for all edges in DAG)
End</pre>
    
    <P>Note that the <em>node-ID</em> values for DAG nodes are
    assigned sequentially, starting from 0.  Furthermore, they are
    sorted in <em>descending order</em> of their
    <em>earliest-end-time</em> attribute.
    </P>
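    <P>A minimal reader for the <code>Nodes</code> and
    <code>Edges</code> sections might look like this (a Python sketch
    only; it skips the comment, <code>Frames</code>,
    <code>Initial</code>/<code>Final</code> and <code>BestSegAscr</code>
    lines and does no error checking):
    </P>
    <pre>
def read_lattice(path):
    """Collect DAG nodes and edges from a word lattice file."""
    nodes, edges, section = {}, [], None
    keywords = ('Frames', 'Nodes', 'Initial', 'Final', 'BestSegAscr',
                'Edges', 'End')
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split()
            if fields[0] in keywords:
                section = fields[0]
            elif section == 'Nodes':
                node_id, word, start, first_end, last_end = fields[:5]
                nodes[int(node_id)] = (word, int(start),
                                       int(first_end), int(last_end))
            elif section == 'Edges':
                src, dst, ascore = fields[:3]
                edges.append((int(src), int(dst), int(ascore)))
    return nodes, edges</pre>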
    <P>Here is an <a href="4t0c020c.lat">example word lattice</a> file.</P>
    <div class="endsec">
      &curren
      <a href="#sec_wordlat">Back to top of this section</a>
    </div>


    
    <H2><a name="sec_utilpgm"><U>Other Utilities</U></a></H2>
    
    <P>In addition to the s3.X decoders, <code>sphinx3_decode</code>,
    <code>sphinx3_livedecode</code> and <code>sphinx3_livepretend</code>,
    this distribution also provides other utility programs.</P>

    <H4><a name="sec_gausubvq"><U>Gaussian Sub-Vector Quantization
    Utility</U></A></H4>
    
    <P></P>
    <div class="endsec">
      &curren
      <a href="#sec_utilpgm">Back to top of this section</a>
    </div>

    
    <H2><a name="sec_src"><U>Source Code</U></a></H2>
    
    <P>In alphabetical order:</P>
    <table>

      <tr>
	<td><a href="../src/libs3decoder/libam/adaptor.c"><code>adaptor.c</code></a></td>
	<td>A wrapper of adaptation routines. </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/approx_cont_mgau.c"><code>approx_cont_mgau.c</code></a></td>
	<td>Fast Gaussian Distribution Computation</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcep_feat/agc.c"><code>agc.c</code></a></td>
	<td>Automatic gain control (on signal energy)</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libsearch/ascr.c"><code>ascr.c</code></a></td>
	<td>Senone acoustic scores</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libcommon/bio.c"><code>bio.c</code></a></td>
	<td>Binary file I/O support</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcommon/blkarray.c"><code>blkarray.c</code></a></td>
	<td>Block array used in FST search</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/cb2mllr.c"><code>cb2mllr.c</code></a></td>
	<td>Codebook to MLLR mapping</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libep/classify.c"><code>classify.c</code></a></td>
	<td>GMM Classifier. </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcep_feat/cmn.c"><code>cmn.c</code></a></td>
	<td>Cepstral mean normalization and variance normalization</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libdict/cmu6_lts_rules.c"><code>cmn6_lts_rules.c</code></a></td>
	<td>Letter-to-sound rules from flite.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libdict/ctxt_table.c"><code>ctxt_table.c</code></a></td>
	<td>Context table for search functions. </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libconfidence/confidence.c"><code>confidence.c</code></a></td>
	<td>Word-lattice based word-based confidence scoring. </td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libcommon/corpus.c"><code>corpus.c</code></a></td>
	<td>Control file processing</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcommon/encoding.c"><code>encoding.c</code></a></td>
	<td>Handles text-encoding issues.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/cont_mgau.c"><code>cont_mgau.c</code></a></td>
	<td>Mixture Gaussians (acoustic model)</td>
      </tr>

      <tr>
	<td><a
	href="../src/programs/decode.c"><code>decode.c</code></a></td>
	<td>Main file for <code>sphinx3_decode</code></td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libdict/dict.c"><code>dict.c</code></a></td>
	<td>Pronunciation lexicon</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libdict/dict2pid.c"><code>dict2pid.c</code></a></td>
	<td>Generation of triphones for the pronunciation dictionary</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libep/endptr.c"><code>endptr.c</code></a></td>
	<td>Voting-based end-pointer.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/fast_algo_struct.c"><code>fast_algo_struct.c</code></a></td>
	<td>Structures used by the fast algorithms.</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libfeat/feat.c"><code>feat.c</code></a></td>
	<td>Feature vectors computation</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libdict/fillpen.c"><code>fillpen.c</code></a></td>
	<td>Filler word probabilities</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libsearch/flat_fwd.c"><code>flat_fwd.c</code></a></td>
	<td>Implementation of the flat-lexicon decoder.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libsearch/fsg_history.c"><code>fsg_history.c</code></a></td>
	<td>The history table used in the FSG search </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libsearch/fsg_lextree.c"><code>fsg_lextree.c</code></a></td>
	<td>FSG lexical tree (actually a flat structure at this point).</td>
      </tr>
      
      <tr>
	<td><a href="../src/programs/gausubvq.c"><code>gausubvq.c</code></a></td>
	<td>Standalone acoustic model sub-vector quantizer</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decode/libam/gs.c"><code>gs.c</code></a></td>
	<td>Gaussian selector by Bochierri</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/hmm.c"><code>hmm.c</code></a></td>
	<td>HMM evaluation</td>
      </tr>
      
      <tr>
	<td><a href="../include/hyp.h"><code>hyp.h</code></a></td>
	<td>Recognition hypotheses data type</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/interp.c"><code>interp.c</code></a></td>
	<td>Interpolation of acoustic models</td>
      </tr>
      
      <tr>
	<td><a href="../include/kb.h"><code>kb.h</code></a></td>
	<td>All knowledge bases, search structures used by decoder</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libsearch/kbcore.c"><code>kbcore.c</code></a></td>
	<td>Collection of core knowledge bases</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/kdtree.c"><code>kdtree.c</code></a></td>
	<td>KD-tree support in semi-continuous HMM</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/lextree.c"><code>lextree.c</code></a></td>
	<td>Lexical search tree</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libAPI/live_decode_args.c"><code>live_decode_args.c</code></a></td>
	<td>Argument definitions for the live decoder.</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libAPI/live_decode_API.c"><code>live_decode_API.c</code></a></td>
	<td>Live decoder functions</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libdict/lts.c"><code>lts.c</code></a></td>
	<td>Letter to sound rules. </td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/liblm/lm.c"><code>lm.c</code></a></td>
	<td>Trigram language model, top-level controller module</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/liblm/lm_3g.c"><code>lm_3g.c</code></a></td>
	<td>Trigram language model, TXT file driver</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/liblm/lm_3g_dmp.c"><code>lm_3g_dmp.c</code></a></td>
	<td>Trigram language model, DMP file driver</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/liblm/lm_attfsm.c"><code>lm_attfsm.c</code></a></td>
	<td>Trigram language model, ATT-FSM file format driver</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/liblm/lm_class.c"><code>lm_class.c</code></a></td>
	<td>Handling of class-based LM </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/liblm/lmset.c"><code>lmset.c</code></a></td>
	<td>Handling of a set of LMs </td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libcommon/logs3.c"><code>logs3.c</code></a></td>
	<td>Support for log-likelihood operations</td>
      </tr>

      <tr>
	<td><a
	href="../src/programs/main_live_example.c"><code>main_live_example.c</code></a></td>
	<td>Main file for <code>s3livedecode</code> showing use of
	<code>live_decode_API.h</code></td>
      </tr>

      <tr>
	<td><a
	href="../src/programs/main_live_pretend.c"><code>main_live_pretend.c</code></a></td>
	<td>Main file for <code>s3livepretend</code> showing use of
	<code>live_decode_API.h</code></td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/mdef.c"><code>mdef.c</code></a></td>
	<td>Acoustic model definition</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcommon/misc.c"><code>misc.c</code></a></td>
	<td>Miscellaneous routines used in Sphinx 3</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/mllr.c"><code>mllr.c</code></a></td>
	<td>Transformation of Gaussian means based on a linear regression matrix (MLLR).</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/ms_gauden.c"><code>ms_gauden.c</code></a></td>
	<td>Multi-stream Gaussian computation. (Adapted from s3.0) </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/ms_mllr.c"><code>ms_mllr.c</code></a></td>
	<td>Multi-stream MLLR. (Adapted from s3.0) </td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/ms_senone.c"><code>ms_senone.c</code></a></td>
	<td>Multi-stream Senone computation. (Adapted from s3.0) </td>
      </tr>

      <tr>
	<td><a href="../include/s2_semi_mgau.c"><code>s2_semi_mgau.c</code></a></td>
	<td>Sphinx 2 semi-continuous HMM computation. </td>
      </tr>

      <tr>
	<td><a href="../include/s3types.h"><code>s3types.h</code></a></td>
	<td>Various data types, for ease of modification</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libcommon/stat.c"><code>stat.c</code></a></td>
	<td>Statistics of decoding.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/subvq.c"><code>subvq.c</code></a></td>
	<td>Sub-vector quantized acoustic model</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libam/tmat.c"><code>tmat.c</code></a></td>
	<td>HMM transition matrices (topology definition)</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libcommon/vector.c"><code>vector.c</code></a></td>
	<td>Vector operations, quantization, etc.</td>
      </tr>

      <tr>
	<td><a href="../src/libs3decoder/libsearch/vithist.c"><code>vithist.c</code></a></td>
	<td>Backpointer table (Viterbi history)</td>
      </tr>
      
      <tr>
	<td><a href="../src/libs3decoder/libdict/wid.c"><code>wid.c</code></a></td>
	<td>Mapping between LM and lexicon word IDs</td>
      </tr>
    </table>
    <P></P>
    <div class="endsec">
      &curren
      <a href="#sec_src">Back to top of this section</a>
    </div>
    <P></P>
    
    <H2></H2><!-- Just to provide some space -->

    <address>Maintained by <a href="mailto:egouvea+sourceforge@cs.cmu.edu">Evandro B. Gouv&ecirc;a</a> and <a href="mailto:archan+sourceforge@cs.cmu.edu">Arthur Chan</a></address>
    
    <!-- Created: Sun Feb 22 14:03:14 EST 1998 -->
    <!-- hhmts start -->
Last modified: Thu Jul 22 09:35:27 EDT 2004
<!-- hhmts end -->
  </body>
</html>