/usr/include/googlepinyin/dicttrie.h is in libgooglepinyin0-dev 0.1.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
#define PINYINIME_INCLUDE_DICTTRIE_H__
#include <stdlib.h>
#include "./atomdictbase.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./searchutility.h"
namespace ime_pinyin {
class DictTrie : AtomDictBase {
private:
struct ParsingMark {
size_t node_offset:24;
size_t node_num:8; // Number of nodes with the spelling id given
// by spl_id. If spl_id is a Shengmu, for nodes
// in the first layer of DictTrie, it equals
// SpellingTrie::shm2full_num(); but for those
// nodes which are not in the first layer,
// node_num < SpellingTrie::shm2full_num().
// For a full spelling id, node_num = 1.
};
// Used to indicate an extended milestone.
// An extended milestone is used to mark a partial match in the dictionary
// trie to speed up further potential extending.
// For example, when the user inputs "w", a milestone is created to mark the
// partial match status, so that when the user inputs another character 'm',
// it will be faster to extend the search space based on this milestone.
//
// For the partial match status of "wm", there can be more than one sub
// milestone; for example, "wm" can be matched to "wanm", "wom", etc., so
// there may be more than one parsing mark used to mark these partial matches.
// A milestone records the starting position in the mark list and the number
// of marks.
struct MileStone {
uint16 mark_start;
uint16 mark_num;
};
DictList* dict_list_;
const SpellingTrie *spl_trie_;
LmaNodeLE0* root_; // Nodes for root and the first layer.
LmaNodeGE1* nodes_ge1_; // Nodes for other layers.
// A quick index from spelling id to the LmaNodeLE0 node buffer, i.e.
// to the root_ buffer.
// Index length:
// SpellingTrie::get_instance().get_spelling_num() + 1. The last entry is used
// to get the end.
// Shengmu ids are not indexed because they will be converted into the
// corresponding full ids.
// So, given an id splid, its sons start at:
// root_[splid_le0_index_[splid - kFullSplIdStart]]
uint16 *splid_le0_index_;
size_t lma_node_num_le0_;
size_t lma_node_num_ge1_;
// The first part is for homophones, and the last top_lmas_num_ items are
// lemmas with the highest scores.
unsigned char *lma_idx_buf_;
size_t lma_idx_buf_len_; // The total size of lma_idx_buf_ in bytes.
size_t total_lma_num_; // Total number of lemmas in this dictionary.
size_t top_lmas_num_; // Number of lemmas with the highest scores.
// Parsing mark list used to mark the detailed extended statuses.
ParsingMark *parsing_marks_;
// The position of the next available mark.
uint16 parsing_marks_pos_;
// Milestone list used to mark the extended status.
MileStone *mile_stones_;
// The position of the next available milestone. We use positions (except 0)
// as handles.
MileStoneHandle mile_stones_pos_;
// Get the offset of sons for a node.
inline size_t get_son_offset(const LmaNodeGE1 *node);
// Get the offset of homophone ids for a node.
inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);
// Get the lemma id by the offset.
inline LemmaIdType get_lemma_id(size_t id_offset);
void free_resource(bool free_dict_list);
bool load_dict(FILE *fp);
// Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
// them into the lpi_items buffer.
// This function is called by the search engine.
size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
LmaNodeLE0 *node);
// Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
// them into the lpi_items buffer.
// This function is called by inner functions extend_dict0(), extend_dict1()
// and extend_dict2().
size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
size_t homo_buf_off, LmaNodeGE1 *node,
uint16 lma_len);
// Extend in the trie from level 0.
MileStoneHandle extend_dict0(MileStoneHandle from_handle,
const DictExtPara *dep, LmaPsbItem *lpi_items,
size_t lpi_max, size_t *lpi_num);
// Extend in the trie from level 1.
MileStoneHandle extend_dict1(MileStoneHandle from_handle,
const DictExtPara *dep, LmaPsbItem *lpi_items,
size_t lpi_max, size_t *lpi_num);
// Extend in the trie from level 2.
MileStoneHandle extend_dict2(MileStoneHandle from_handle,
const DictExtPara *dep, LmaPsbItem *lpi_items,
size_t lpi_max, size_t *lpi_num);
// Try to extend the given spelling id buffer, and return true if the given
// id_lemma can be successfully reached.
// The given spelling ids are all valid full ids.
bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
#ifdef ___BUILD_MODEL___
bool save_dict(FILE *fp);
#endif // ___BUILD_MODEL___
static const int kMaxMileStone = 100;
static const int kMaxParsingMark = 600;
static const MileStoneHandle kFirstValidMileStoneHandle = 1;
friend class DictParser;
friend class DictBuilder;
public:
DictTrie();
~DictTrie();
#ifdef ___BUILD_MODEL___
// Construct the tree from the file fn_raw.
// fn_validhzs provides the valid hanzi list. If fn_validhzs is
// NULL, only characters in GB2312 will be included.
bool build_dict(const char *fn_raw, const char *fn_validhzs);
// Save the binary dictionary.
// Actually, the SpellingTrie/DictList instances will also be saved.
bool save_dict(const char *filename);
#endif // ___BUILD_MODEL___
void convert_to_hanzis(char16 *str, uint16 str_len);
void convert_to_scis_ids(char16 *str, uint16 str_len);
// Load a binary dictionary.
// The SpellingTrie instance and the DictList will also be loaded.
bool load_dict(const char *filename, LemmaIdType start_id,
LemmaIdType end_id);
bool load_dict_fd(int sys_fd, long start_offset, long length,
LemmaIdType start_id, LemmaIdType end_id);
bool close_dict() {return true;}
size_t number_of_lemmas() {return 0;}
void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
MileStoneHandle extend_dict(MileStoneHandle from_handle,
const DictExtPara *dep,
LmaPsbItem *lpi_items,
size_t lpi_max, size_t *lpi_num);
size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
LmaPsbItem *lpi_items, size_t lpi_max);
uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
uint16 splids_max, bool arg_valid);
size_t predict(const char16 *last_hzs, uint16 hzs_len,
NPredictItem *npre_items, size_t npre_max,
size_t b4_used);
LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
uint16 lemma_len, uint16 count) {return 0;}
LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
bool selected) {return 0;}
LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
uint16 lemma_len) {return 0;}
LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}
LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
uint16 lemma_len) {return 0;}
bool remove_lemma(LemmaIdType lemma_id) {return false;}
size_t get_total_lemma_count() {return 0;}
void set_total_lemma_count_of_others(size_t count);
void flush_cache() {}
LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
// Fill the lemmas with the highest scores into the prediction buffer.
// his_len is the history length to fill in the prediction buffer.
size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
size_t npre_max, size_t b4_used);
};
}
#endif // PINYINIME_INCLUDE_DICTTRIE_H__
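The milestone and parsing-mark comments inside the class above describe how a partial match is bookmarked: a MileStone owns the slice [mark_start, mark_start + mark_num) of the parsing-mark list, and splid_le0_index_ maps a full spelling id to a range of first-layer nodes, with the extra last index entry marking the end. Below is a minimal, self-contained sketch of that bookkeeping only; the struct layouts are simplified (no bit-fields), and every value, including kFullSplIdStart, is a made-up placeholder rather than data taken from the library.

#include <cstdio>

// Simplified stand-ins for DictTrie::ParsingMark and DictTrie::MileStone
// (the real members are bit-fields private to DictTrie).
struct ParsingMark { unsigned node_offset; unsigned node_num; };
struct MileStone { unsigned short mark_start; unsigned short mark_num; };

int main() {
  // Two hypothetical partial matches for the input "wm",
  // e.g. one continuing as "wan m..." and one as "wo m...".
  ParsingMark parsing_marks[] = { {10, 3}, {42, 1} };

  // The milestone only records where its marks start and how many there are.
  MileStone ms = {0, 2};
  for (unsigned i = ms.mark_start; i < ms.mark_start + ms.mark_num; ++i)
    std::printf("mark %u: node_offset=%u, node_num=%u\n",
                i, parsing_marks[i].node_offset, parsing_marks[i].node_num);

  // First-layer lookup sketch: for a full spelling id splid, the sons occupy
  // root_[idx[splid - kFullSplIdStart]] .. root_[idx[splid - kFullSplIdStart + 1]] - 1,
  // where the last index entry only marks the end. All numbers are placeholders.
  unsigned short splid_le0_index[] = {0, 4, 9, 15};
  const unsigned kFullSplIdStart = 30;  // placeholder, not the real constant value
  unsigned splid = 31;                  // placeholder full spelling id
  unsigned begin = splid_le0_index[splid - kFullSplIdStart];
  unsigned end = splid_le0_index[splid - kFullSplIdStart + 1];
  std::printf("sons of splid %u: root_[%u] .. root_[%u]\n", splid, begin, end - 1);
  return 0;
}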
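For the public interface, here is a minimal usage sketch under stated assumptions: char16, uint16, LmaPsbItem and kMaxLemmaSize come from the headers included by dicttrie.h; the dictionary path, the lemma id range passed to load_dict(), and the spelling ids passed to get_lpis() are placeholders, since in a real engine the path points at the engine's data file and the spelling ids come from the spelling parser.

#include <cstdio>
#include <googlepinyin/dicttrie.h>

using namespace ime_pinyin;

int main() {
  DictTrie dict;

  // Placeholder path and lemma id range; the embedding engine supplies these.
  if (!dict.load_dict("/path/to/dict_pinyin.dat", 1, 500000))
    return 1;

  // Full spelling ids would normally come from the spelling parser;
  // the zeros below are placeholders only.
  uint16 splids[2] = {0, 0};
  LmaPsbItem lpis[32];
  size_t num = dict.get_lpis(splids, 2, lpis, 32);

  // Turn each candidate lemma id back into its UTF-16 hanzi string.
  for (size_t i = 0; i < num; ++i) {
    char16 hanzis[kMaxLemmaSize + 1];
    uint16 len = dict.get_lemma_str(lpis[i].id, hanzis, kMaxLemmaSize + 1);
    std::printf("candidate %zu: lemma id %u, %u hanzi\n",
                i, (unsigned)lpis[i].id, (unsigned)len);
  }
  return 0;
}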