This file is indexed.

/usr/include/kytea/string-util.h is in libkytea-dev 0.4.6+dfsg-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/*
* Copyright 2009, KyTea Development Team
* 
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* 
*     http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STRING_UTIL_H__
#define STRING_UTIL_H__

#include <kytea/kytea-struct.h>
// #include <kytea/kytea-string.h>
// #include <iostream>
#include <sstream>
// #include <vector>
// #include <cstdlib>

namespace kytea {

/**
 * Abstract base class for turning encoded std::strings into the internal
 * representation (KyteaChar / KyteaString) and back.
 *
 * Concrete subclasses supply the encoding-specific pieces through the pure
 * virtual functions declared below.
 *
 * Ownership: the destructor deletes normMap_, so each object is the sole
 * owner of whatever that pointer refers to. Copying is disabled for that
 * reason (see the private section at the bottom of the class).
 */
class StringUtil {

public:

    // types of characters; each type is identified by a one-letter code
    typedef char CharType;
    const static CharType KANJI    = 'K';
    const static CharType KATAKANA = 'T';
    const static CharType HIRAGANA = 'H';
    const static CharType ROMAJI   = 'R';
    const static CharType DIGIT    = 'D';
    const static CharType OTHER    = 'O';

    // types of encodings; each encoding is identified by a one-letter code
    typedef char Encoding;
    const static Encoding ENCODING_UTF8    = 'W';
    const static Encoding ENCODING_EUC     = 'E';
    const static Encoding ENCODING_SJIS    = 'S';

    // A map that normalizes characters to a single representation.
    // Starts out NULL and is deleted by the destructor.
    GenericMap<KyteaChar,KyteaChar> * normMap_;

public:

    // Start with no normalization map.
    StringUtil() : normMap_(NULL) { }

    // Release the normalization map, if any (deleting NULL is a no-op).
    virtual ~StringUtil() {
        delete normMap_;
    }

    // Map a std::string holding one character to its KyteaChar.
    // NOTE(review): `add` presumably controls whether a previously unseen
    // character gets registered -- confirm against the implementations.
    virtual KyteaChar mapChar(const std::string & str, bool add = true) = 0;

    // Return the std::string form of a single KyteaChar.
    virtual std::string showChar(KyteaChar c) = 0;

    // Return the std::string form of a KyteaString, built by concatenating
    // showChar() of every character in order.
    std::string showString(const KyteaString & c) {
        std::ostringstream buff;
        for(unsigned i = 0; i < c.length(); i++)
            buff << showChar(c[i]);
        return buff.str();
    }

    // map an unparsed std::string to a KyteaString
    virtual KyteaString mapString(const std::string & str) = 0;

    // get the type of a character, given either as an encoded std::string
    // or as a KyteaChar
    virtual CharType findType(const std::string & str) = 0;
    virtual CharType findType(KyteaChar c) = 0;

    // return the encoding provided by this util, as a code and as a name
    virtual Encoding getEncoding() = 0;
    virtual const char* getEncodingString() = 0;
    
    // transform to or from a character std::string
    virtual void unserialize(const std::string & str) = 0;
    virtual std::string serialize() const = 0;
    
    // normalization functions
    virtual GenericMap<KyteaChar,KyteaChar> * getNormMap() = 0;
    KyteaString normalize(const KyteaString & str);

    // Check that these are equal by serializing them
    void checkEqual(const StringUtil & rhs) const;

    // parse an integer or float
    int parseInt(const char* str);
    double parseFloat(const char* str);


    // Get a std::string of character types: one CharType code, as returned
    // by findType(), for every character of str, in order.
    std::string getTypeString(const KyteaString& str) {
        std::ostringstream buff;
        for(unsigned i = 0; i < str.length(); i++)
            buff << findType(str[i]);
        return buff.str();
    }

private:

    // Not copyable: the destructor deletes normMap_, so a member-wise copy
    // would leave two objects deleting the same map. Both operations are
    // declared but intentionally never defined.
    StringUtil(const StringUtil &);
    StringUtil & operator=(const StringUtil &);

};

/**
 * StringUtil implementation for parsing UTF-8.
 *
 * Reports ENCODING_UTF8 / "utf8" as its encoding. All other operations are
 * declared here and defined out of line.
 */
class StringUtilUtf8 : public StringUtil {

private:
    
    // Bit masks for picking apart the bytes of a UTF-8 sequence:
    // maskrN has the N low bits set, masklN has the N high bits of a byte set.
    const static char maskr6 = 63, maskr5 = 31, maskr4 = 15, maskr3 = 7,
                      maskl1 = 1 << 7, maskl2 = 3 << 6, maskl3 = 7 << 5, 
                      maskl4 = 15 << 4, maskl5 = 31 << 3;

    // variables
    // NOTE(review): these look like the string -> id table and the per-id
    // name and type tables -- confirm against the out-of-line definitions.
    StringCharMap charIds_;
    std::vector<std::string> charNames_;
    std::vector<CharType> charTypes_;

public:

    StringUtilUtf8();

    ~StringUtilUtf8() { }
    
    // map a std::string to a character
    KyteaChar mapChar(const std::string & str, bool add = true);
    std::string showChar(KyteaChar c);

    // get the type of an already-mapped character
    CharType findType(KyteaChar c);

    GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // True unless the top two bits of val are exactly 10, i.e. unless val
    // has the form of a UTF-8 continuation byte. Reads only the static
    // masks, so it is const and usable on const objects.
    bool badu(char val) const { return ((val ^ maskl1) & maskl2) != 0; }

    // map an unparsed std::string to a KyteaString
    KyteaString mapString(const std::string & str);

    // find the type of a unicode character
    CharType findType(const std::string & str);

    Encoding getEncoding() { return ENCODING_UTF8; }
    const char* getEncodingString() { return "utf8"; }

    // Read-only access to the stored character names. Const-qualified so it
    // can be called through a const StringUtilUtf8 reference.
    const std::vector<std::string> & getCharNames() const { return charNames_; }

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};

/**
 * StringUtil implementation for EUC-encoded text (judging by the class name
 * and the ENCODING_EUC constant in the base class -- the encoding accessors
 * are defined out of line).
 *
 * Every member function below overrides the StringUtil function of the same
 * signature; the bodies are defined out of line.
 */
class StringUtilEuc : public StringUtil {

private:

    // Only bit 7 set: the top bit of an 8-bit byte.
    static const char maskl1 = 1 << 7;
    // Only bit 14 set. How it is applied is not visible in this header.
    static const KyteaChar mask3len = 1 << 14;

public:

    StringUtilEuc() { }
    virtual ~StringUtilEuc() { }

    // Conversion between a one-character std::string and a KyteaChar.
    virtual KyteaChar mapChar(const std::string & str, bool add = true);
    virtual std::string showChar(KyteaChar c);

    // Accessor for the character-normalization map.
    virtual GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // Convert an unparsed std::string into a KyteaString.
    virtual KyteaString mapString(const std::string & str);

    // Character type lookup, by encoded string or by KyteaChar.
    virtual CharType findType(const std::string & str);
    virtual CharType findType(KyteaChar c);

    // The encoding this util provides, as a code and as a name.
    virtual Encoding getEncoding();
    virtual const char* getEncodingString();

    // Conversion to and from a serialized std::string.
    virtual void unserialize(const std::string & str);
    virtual std::string serialize() const;

};

/**
 * StringUtil implementation for Shift-JIS-encoded text (judging by the class
 * name and the ENCODING_SJIS constant in the base class -- the encoding
 * accessors are defined out of line).
 *
 * Every member function below overrides the StringUtil function of the same
 * signature; the bodies are defined out of line.
 */
class StringUtilSjis : public StringUtil {

private:

    // Only bit 7 set: the top bit of an 8-bit byte.
    static const char maskl1 = 1 << 7;
    // Only bit 14 set. How it is applied is not visible in this header.
    static const KyteaChar mask3len = 1 << 14;

public:

    StringUtilSjis() { }
    virtual ~StringUtilSjis() { }

    // Map a one-character std::string to its KyteaChar.
    virtual KyteaChar mapChar(const std::string & str, bool add = true);

    // Accessor for the character-normalization map.
    virtual GenericMap<KyteaChar,KyteaChar> * getNormMap();

    // Return the std::string form of a single KyteaChar.
    virtual std::string showChar(KyteaChar c);

    // Convert an unparsed std::string into a KyteaString.
    virtual KyteaString mapString(const std::string & str);

    // Character type lookup, by encoded string or by KyteaChar.
    virtual CharType findType(const std::string & str);
    virtual CharType findType(KyteaChar c);

    // The encoding this util provides, as a code and as a name.
    virtual Encoding getEncoding();
    virtual const char* getEncodingString();

    // Conversion to and from a serialized std::string.
    virtual void unserialize(const std::string & str);
    virtual std::string serialize() const;

};



}

#endif