// /usr/include/kytea/string-util.h is in libkytea-dev 0.4.6+dfsg-2.
// This file is owned by root:root, with mode 0o644.
// The actual contents of the file can be viewed below.
/*
* Copyright 2009, KyTea Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef STRING_UTIL_H__
#define STRING_UTIL_H__
#include <kytea/kytea-struct.h>
// #include <kytea/kytea-string.h>
// #include <iostream>
#include <sstream>
// #include <vector>
// #include <cstdlib>
namespace kytea {
/**
 * Abstract base class for turning std::strings into the internal
 * KyteaChar / KyteaString representation and back.
 *
 * It declares the per-character and per-string mappings, character-type
 * lookup, encoding identification, (un)serialization and normalization
 * hooks as pure virtual functions; the subclasses declared in this header
 * supply them. A few non-virtual helpers are built on top of those hooks.
 *
 * The declaration order of the virtual functions and of the data member
 * is kept as-is on purpose: changing it would alter the class layout seen
 * by already-compiled code.
 */
class StringUtil {

public:

    // Types of characters, stored as one-letter codes.
    typedef char CharType;
    static const CharType KANJI    = 'K';
    static const CharType KATAKANA = 'T';
    static const CharType HIRAGANA = 'H';
    static const CharType ROMAJI   = 'R';
    static const CharType DIGIT    = 'D';
    static const CharType OTHER    = 'O';

    // Types of encodings, stored as one-letter codes.
    typedef char Encoding;
    static const Encoding ENCODING_UTF8 = 'W';
    static const Encoding ENCODING_EUC  = 'E';
    static const Encoding ENCODING_SJIS = 'S';

    // A map that normalizes characters to a single representation.
    // Initialized to NULL by the constructor and deleted by the destructor.
    // NOTE(review): this is an owning raw pointer and no copy constructor or
    // copy assignment is declared, so copying an object whose map is set
    // would leave two objects deleting the same map -- confirm that callers
    // only handle these objects through pointers/references.
    GenericMap<KyteaChar,KyteaChar> * normMap_;

    // Start without a normalization map.
    StringUtil() : normMap_(NULL) { }

    // Release the normalization map, if any.
    virtual ~StringUtil() {
        // delete on a null pointer is a no-op, so no guard is required
        delete normMap_;
    }

    // Map a std::string holding one character to a KyteaChar.
    // NOTE(review): `add` presumably controls whether unseen characters are
    // registered -- the implementation is not visible here, confirm.
    virtual KyteaChar mapChar(const std::string & str, bool add = true) = 0;

    // Return the std::string form of a single KyteaChar.
    virtual std::string showChar(KyteaChar c) = 0;

    // Return the std::string form of a whole KyteaString by concatenating
    // showChar() of each of its characters, in order.
    std::string showString(const KyteaString & c) {
        std::string shown;
        for(unsigned pos = 0; pos < c.length(); ++pos)
            shown += showChar(c[pos]);
        return shown;
    }

    // Map an unparsed std::string to a KyteaString.
    virtual KyteaString mapString(const std::string & str) = 0;

    // Get the type of a character, given either as a std::string or as a
    // KyteaChar.
    virtual CharType findType(const std::string & str) = 0;
    virtual CharType findType(KyteaChar c) = 0;

    // Return the encoding provided by this util, as a code and as a name.
    virtual Encoding getEncoding() = 0;
    virtual const char* getEncodingString() = 0;

    // Transform to or from a character std::string.
    virtual void unserialize(const std::string & str) = 0;
    virtual std::string serialize() const = 0;

    // Normalization functions.
    virtual GenericMap<KyteaChar,KyteaChar> * getNormMap() = 0;
    KyteaString normalize(const KyteaString & str);

    // Check that these are equal by serializing them.
    void checkEqual(const StringUtil & rhs) const;

    // Parse an integer or float.
    int parseInt(const char* str);
    double parseFloat(const char* str);

    // Get a std::string of character types: one CharType code per character
    // of `str`, produced by findType(), in order.
    std::string getTypeString(const KyteaString& str) {
        std::string types;
        for(unsigned pos = 0; pos < str.length(); ++pos)
            types += findType(str[pos]);
        return types;
    }

};
/**
 * A class for parsing UTF8.
 *
 * Implements the StringUtil interface and keeps three containers of its
 * own (character ids, character names, character types). Apart from the
 * small inline members below, the member functions are defined out of line.
 */
class StringUtilUtf8 : public StringUtil {

private:

    // Bit masks. maskrN has the low N bits set; maskl1..maskl5 are
    // initialized from 1<<7, 3<<6, 7<<5, 15<<4 and 31<<3.
    // NOTE(review): those last five initializers do not fit in a signed
    // char, so the stored value depends on the platform's char conversion.
    // The types are deliberately left alone because code outside this
    // header relies on them -- confirm before changing.
    static const char maskr6 = 63;
    static const char maskr5 = 31;
    static const char maskr4 = 15;
    static const char maskr3 = 7;
    static const char maskl1 = 1 << 7;
    static const char maskl2 = 3 << 6;
    static const char maskl3 = 7 << 5;
    static const char maskl4 = 15 << 4;
    static const char maskl5 = 31 << 3;

    // variables
    StringCharMap charIds_;
    std::vector<std::string> charNames_;
    std::vector<CharType> charTypes_;

public:

    StringUtilUtf8();
    ~StringUtilUtf8() { }

    // map a std::string to a character
    KyteaChar mapChar(const std::string & str, bool add = true);
    std::string showChar(KyteaChar c);

    CharType findType(KyteaChar c);

    GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // Returns true unless the two highest bits of `val` are exactly "10",
    // i.e. true when `val` does not look like 10xxxxxx.
    bool badu(char val) {
        const int flipped = val ^ maskl1;   // clears the top bit when it was set
        return (flipped & maskl2) != 0;
    }

    KyteaString mapString(const std::string & str);

    // find the type of a unicode character
    CharType findType(const std::string & str);

    // This util always reports the UTF-8 encoding code and the name "utf8".
    Encoding getEncoding() { return ENCODING_UTF8; }
    const char* getEncodingString() { return "utf8"; }

    // Read-only access to the stored character names.
    const std::vector<std::string> & getCharNames() { return charNames_; }

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};
/**
 * StringUtil implementation that is, judging by its name, the EUC variant
 * (presumably paired with ENCODING_EUC -- getEncoding() is defined out of
 * line, confirm there). Every member function except the empty constructor
 * and destructor is only declared here.
 */
class StringUtilEuc : public StringUtil {

private:

    // Bit masks used by the out-of-line implementation.
    static const char maskl1 = 1 << 7;
    static const KyteaChar mask3len = 1 << 14;

public:

    StringUtilEuc() { }
    ~StringUtilEuc() { }

    // map a std::string to a character, and a character back to a std::string
    KyteaChar mapChar(const std::string & str, bool add = true);
    std::string showChar(KyteaChar c);

    GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // map an unparsed std::string to a KyteaString
    KyteaString mapString(const std::string & str);

    // get the type of a character
    CharType findType(const std::string & str);
    CharType findType(KyteaChar c);

    // return the encoding provided by this util
    Encoding getEncoding();
    const char* getEncodingString();

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};
/**
 * StringUtil implementation that is, judging by its name, the Shift-JIS
 * variant (presumably paired with ENCODING_SJIS -- getEncoding() is defined
 * out of line, confirm there). Every member function except the empty
 * constructor and destructor is only declared here.
 */
class StringUtilSjis : public StringUtil {

private:

    // Bit masks used by the out-of-line implementation.
    static const char maskl1 = 1 << 7;
    static const KyteaChar mask3len = 1 << 14;

public:

    StringUtilSjis() { }
    ~StringUtilSjis() { }

    // map a std::string to a character
    KyteaChar mapChar(const std::string & str, bool add = true);

    GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // show a character as a std::string
    std::string showChar(KyteaChar c);

    // map an unparsed std::string to a KyteaString
    KyteaString mapString(const std::string & str);

    // get the type of a character
    CharType findType(const std::string & str);
    CharType findType(KyteaChar c);

    // return the encoding provided by this util
    Encoding getEncoding();
    const char* getEncodingString();

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};
}
#endif
// end of string-util.h