/usr/include/googlepinyin/dictbuilder.h is in libgooglepinyin0-dev 0.1.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
#define PINYINIME_INCLUDE_DICTBUILDER_H__
#include <stdlib.h>
#include "./utf16char.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./spellingtable.h"
#include "./spellingtrie.h"
#include "./splparser.h"
namespace ime_pinyin {
#ifdef ___BUILD_MODEL___
#define ___DO_STATISTICS___
class DictTrie;
// Builds a dictionary trie from a raw lemma file; build_dict() is the only
// public entry point besides the constructor and destructor.
// This class is only declared when ___BUILD_MODEL___ is defined.
// NOTE(review): the class holds several raw pointers and declares neither a
// copy constructor nor an assignment operator; presumably instances are not
// meant to be copied -- confirm against the implementation.
class DictBuilder {
private:
// The raw lemma array buffer.
LemmaEntry *lemma_arr_;
// Number of entries in lemma_arr_ -- presumably; confirm in the
// implementation.
size_t lemma_num_;
// Used to store all possible single char items.
// Two items may have the same Hanzi while their spelling ids are different.
SingleCharItem *scis_;
// Number of entries in scis_ -- presumably; confirm in the implementation.
size_t scis_num_;
// In the tree, root's level is -1.
// Lemma nodes for the root and for level 0.
LmaNodeLE0 *lma_nodes_le0_;
// Lemma nodes for layers whose levels are deeper than 0.
LmaNodeGE1 *lma_nodes_ge1_;
// Number of used lemma nodes in lma_nodes_le0_ and lma_nodes_ge1_
// respectively.
size_t lma_nds_used_num_le0_;
size_t lma_nds_used_num_ge1_;
// Used to store the ids of homophones.
LemmaIdType *homo_idx_buf_;
// Number of homophones each of which contains only one Chinese character.
size_t homo_idx_num_eq1_;
// Number of homophones each of which contains more than one character.
size_t homo_idx_num_gt1_;
// The items with the highest scores.
LemmaEntry *top_lmas_;
size_t top_lmas_num_;
// Spelling table and spelling parser objects.
// NOTE(review): who allocates and frees them is not visible in this header.
SpellingTable *spl_table_;
SpellingParser *spl_parser_;
#ifdef ___DO_STATISTICS___
// Statistics counters. Each array below has kMaxLemmaSize slots.
// NOTE(review): the slots are presumably indexed by tree level / lemma
// length, and the exact meaning of each counter is inferred from its name
// only -- confirm against the implementation before relying on them.
size_t max_sonbuf_len_[kMaxLemmaSize];
size_t max_homobuf_len_[kMaxLemmaSize];
size_t total_son_num_[kMaxLemmaSize];
size_t total_node_hasson_[kMaxLemmaSize];
size_t total_sonbuf_num_[kMaxLemmaSize];
size_t total_sonbuf_allnoson_[kMaxLemmaSize];
size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
size_t total_homo_num_[kMaxLemmaSize];
size_t sonbufs_num1_; // Number of son buffers with only 1 son.
size_t sonbufs_numgt1_; // Number of son buffers with more than 1 son.
size_t total_lma_node_num_;
// Presumably reset and print the statistics counters above; the bodies are
// not part of this header.
void stat_init();
void stat_print();
#endif
public:
DictBuilder();
~DictBuilder();
// Build dictionary trie from the file fn_raw. File fn_validhzs provides
// valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
// included.
// dict_trie is the DictTrie object handed in by the caller.
// NOTE(review): the return value presumably reports success (true) or
// failure (false) -- confirm against the implementation.
bool build_dict(const char* fn_raw, const char* fn_validhzs,
DictTrie *dict_trie);
private:
// Fill in the buffer with id. The caller guarantees that the parameters are
// valid.
void id_to_charbuf(unsigned char *buf, LemmaIdType id);
// Update the offset of sons for a node.
void set_son_offset(LmaNodeGE1 *node, size_t offset);
// Update the offset of the homophones' ids for a node.
void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
// Format a spelling string.
// NOTE(review): spl_str is a non-const pointer, so the string is presumably
// rewritten in place -- confirm against the implementation.
void format_spelling_str(char *spl_str);
// Sort the lemma_arr by the Hanzi string, and give each unique item an id.
// The reason for sorting the lemma list by Hanzi string is to be able to
// find items starting with a given prefix string when doing prediction.
// Actually, the single char items are in another order, for example,
// in spelling id order, etc.
// Return value is the next un-allocated idx available.
LemmaIdType sort_lemmas_by_hz();
// Build the SingleCharItem list, and fill the hanzi_scis_ids in the
// lemma buffer lemma_arr_.
// This function should be called after the lemma array is ready.
// Return the number of unique SingleCharItem elements.
size_t build_scis();
// Construct a subtree using a subset of the spelling array (from
// item_start to item_end).
// parent is the parent node whose information is updated as needed;
// it can be a member of LmaNodeLE0 or LmaNodeGE1, which is why it is passed
// as void*.
// NOTE(review): whether item_end is inclusive is not stated here -- confirm
// against the implementation.
bool construct_subset(void* parent, LemmaEntry* lemma_arr,
size_t item_start, size_t item_end, size_t level);
// Read valid Chinese Hanzis from the given file.
// num is used to return the number of chars.
// The returned buffer is sorted and the caller needs to free it.
char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
// Read a raw dictionary. max_item is the maximum number of items. If there
// are more items in the dictionary, only the first max_item will be read.
// Returned value is the number of items successfully read from the file.
size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
size_t max_item);
// Try to find if a character is in the hzs buffer of hzs_len characters.
bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
// Try to find if all characters in str are in the hzs buffer.
bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
const char16 *str, size_t str_len);
// Get the lemmas with the highest scores.
void get_top_lemmas();
// Allocate the resources needed to build the dictionary.
// lma_num is the number of items to be loaded.
bool alloc_resource(size_t lma_num);
// Free the resources.
void free_resource();
};
#endif // ___BUILD_MODEL___
}
#endif // PINYINIME_INCLUDE_DICTBUILDER_H__