/usr/include/phonenumbers/utf/unicodetext.h is in libphonenumber6-dev 6.3~svn698-3+b1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: Jim Meehan
#ifndef UTIL_UTF8_UNICODETEXT_H__
#define UTIL_UTF8_UNICODETEXT_H__
#include <iterator>
#include <string>
#include <utility>
#include "phonenumbers/base/basictypes.h"
namespace i18n {
namespace phonenumbers {
using std::string;
using std::bidirectional_iterator_tag;
using std::pair;
// ***************************** UnicodeText **************************
//
// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has default, copy, and assignment constructors.
// Data can be appended to it from another UnicodeText, from
// iterators, or from a single codepoint.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// bidirectional, read-only iterator. It becomes invalid if the text
// is changed.
//
// There are methods for appending and retrieving UTF-8 data directly.
// The 'utf8_data' method returns a const char* that contains the
// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up
// to 4 bytes of UTF-8 data in a char array and returns the number of
// bytes that it stored.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab. It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.) The function UniLib::IsInterchangeValid
// can be used as a test for this property.
//
// UnicodeTexts are safe. Every method that constructs or modifies a
// UnicodeText tests for interchange-validity, and will substitute a
// space for the invalid data. Such cases are reported via
// LOG(WARNING).
//
// MEMORY MANAGEMENT: copy, take ownership, or point to
//
// A UnicodeText is either an "owner", meaning that it owns the memory
// for the data buffer and will free it when the UnicodeText is
// destroyed, or it is an "alias", meaning that it does not.
//
// There are three methods for storing UTF-8 data in a UnicodeText:
//
// CopyUTF8(buffer, len) copies buffer.
//
// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// All three methods perform a validity check on the buffer. There are
// private, "unsafe" versions of these functions that bypass the
// validity check. They are used internally and by friend-functions
// that are handling UTF-8 data that has already been validated.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators or the fast scanners that are
// based on UTF-8 state tables. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// The semantics of an alias might be described as "copy on write or
// repair." The source data is never modified. If push_back() or
// append() is called on an alias, a copy of the data will be created,
// and the UnicodeText will become an owner. If clear() is called on
// an alias, it becomes an (empty) owner.
//
// The copy constructor and the assignment operator produce an owner.
// That is, after direct initialization ("UnicodeText x(y);") or copy
// initialization ("UnicodeText x = y;") x will be an owner, even if y
// was an alias. The assignment operator ("x = y;") also produces an
// owner unless x and y are the same object and y is an alias.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.
//
// UTILITIES
//
// The interfaces in util/utf8/public/textutils.h provide higher-level
// utilities for dealing with UnicodeTexts, including routines for
// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
// strings, creating strings from UnicodeTexts, normalizing text for
// efficient matching or display, and others.
class UnicodeText {
 public:
  class const_iterator;

  typedef char32 value_type;

  // Constructors. These always produce owners.
  UnicodeText();  // Create an empty text.
  UnicodeText(const UnicodeText& src);  // copy constructor
  // Construct a substring (copies the data).
  UnicodeText(const const_iterator& first, const const_iterator& last);

  // Assignment operator. This copies the data and produces an owner
  // unless this == &src, e.g., "x = x;", which is a no-op.
  UnicodeText& operator=(const UnicodeText& src);

  // x.Copy(y) copies the data from y into x.
  UnicodeText& Copy(const UnicodeText& src);
  // Alternative spelling of Copy(); forwards straight to it.
  inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }

  // x.PointTo(y) changes x so that it points to y's data.
  // It does not copy y or take ownership of y's data.
  UnicodeText& PointTo(const UnicodeText& src);
  UnicodeText& PointTo(const const_iterator& first,
                       const const_iterator& last);

  ~UnicodeText();

  void clear();  // Clear text.

  // Test if text is empty, i.e. it holds zero bytes of UTF-8 data.
  // This is a pure query, so it is const and can be called through a
  // const UnicodeText&, just like size() and utf8_length().
  bool empty() const { return repr_.size_ == 0; }

  // Add a codepoint to the end of the text.
  // If the codepoint is not interchange-valid, add a space instead
  // and log a warning.
  void push_back(char32 codepoint);

  // Generic appending operation.
  // iterator_traits<ForwardIterator>::value_type must be implicitly
  // convertible to char32. Typical uses of this method might include:
  //     char32 chars[] = {0x1, 0x2, ...};
  //     vector<char32> more_chars = ...;
  //     utext.append(chars, chars+arraysize(chars));
  //     utext.append(more_chars.begin(), more_chars.end());
  // Each element of [first, last) is handed to push_back() in order.
  // Returns *this so that calls can be chained.
  template<typename ForwardIterator>
  UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
    while (first != last) { push_back(*first++); }
    return *this;
  }

  // A specialization of the generic append() method.
  UnicodeText& append(const const_iterator& first, const const_iterator& last);

  // An optimization of append(source.begin(), source.end()).
  UnicodeText& append(const UnicodeText& source);

  int size() const;  // the number of Unicode characters (codepoints)

  friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
  friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);

  // Bidirectional, read-only iterator over the codepoints of a
  // UnicodeText. Internally it is a pointer (it_) into the UTF-8 bytes.
  class const_iterator {
    typedef const_iterator CI;
   public:
    typedef bidirectional_iterator_tag iterator_category;
    typedef char32 value_type;
    typedef ptrdiff_t difference_type;
    typedef void pointer;  // (Not needed.)
    typedef const char32 reference;  // (Needed for const_reverse_iterator)

    // Iterators are default-constructible.
    const_iterator();
    // It's safe to make multiple passes over a UnicodeText.
    const_iterator(const const_iterator& other);
    const_iterator& operator=(const const_iterator& other);

    char32 operator*() const;  // Dereference

    const_iterator& operator++();  // Advance (++iter)
    const_iterator operator++(int) {  // (iter++)
      // Post-increment: hand back the position held before advancing.
      const_iterator result(*this);
      ++*this;
      return result;
    }

    const_iterator& operator--();  // Retreat (--iter)
    const_iterator operator--(int) {  // (iter--)
      // Post-decrement: hand back the position held before retreating.
      const_iterator result(*this);
      --*this;
      return result;
    }

    // We love relational operators.
    // Equality compares the underlying byte pointers; the remaining
    // operators are all expressed in terms of == and <.
    friend bool operator==(const CI& lhs, const CI& rhs) {
      return lhs.it_ == rhs.it_; }
    friend bool operator!=(const CI& lhs, const CI& rhs) {
      return !(lhs == rhs); }
    friend bool operator<(const CI& lhs, const CI& rhs);
    friend bool operator>(const CI& lhs, const CI& rhs) {
      return rhs < lhs; }
    friend bool operator<=(const CI& lhs, const CI& rhs) {
      return !(rhs < lhs); }
    friend bool operator>=(const CI& lhs, const CI& rhs) {
      return !(lhs < rhs); }

    friend difference_type distance(const CI& first, const CI& last);

    // UTF-8-specific methods

    // Store the UTF-8 encoding of the current codepoint into buf,
    // which must be at least 4 bytes long. Return the number of
    // bytes written.
    int get_utf8(char* buf) const;
    // Return the iterator's pointer into the UTF-8 data.
    const char* utf8_data() const { return it_; }

    string DebugString() const;

   private:
    friend class UnicodeText;
    friend class UnicodeTextUtils;
    friend class UTF8StateTableProperty;
    // Wraps a raw pointer into the UTF-8 data; private, so only the
    // friend classes above can build an iterator from a raw pointer.
    explicit const_iterator(const char* it) : it_(it) {}

    const char* it_;
  };

  const_iterator begin() const;
  const_iterator end() const;

  // Reverse iterator that additionally exposes the UTF-8 accessors of
  // const_iterator.
  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
   public:
    const_reverse_iterator(const_iterator it) :
        std::reverse_iterator<const_iterator>(it) {}
    // Return the pointer into the UTF-8 data for the current codepoint.
    const char* utf8_data() const {
      // base() sits one position past the codepoint this reverse
      // iterator designates, so step a copy of it back first.
      const_iterator tmp_it = base();
      return (--tmp_it).utf8_data();
    }
    // Store the UTF-8 encoding of the current codepoint into buf and
    // return the number of bytes written (see const_iterator::get_utf8).
    int get_utf8(char* buf) const {
      const_iterator tmp_it = base();
      return (--tmp_it).get_utf8(buf);
    }
  };
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(end());
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(begin());
  }

  // Substring searching. Returns the beginning of the first
  // occurrence of "look", or end() if not found.
  const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
  // Equivalent to find(look, begin())
  const_iterator find(const UnicodeText& look) const;

  // Returns whether this contains the character U+FFFD. This can
  // occur, for example, if the input to Encodings::Decode() had byte
  // sequences that were invalid in the source encoding.
  bool HasReplacementChar() const;

  // UTF-8-specific methods
  //
  // Return the data, length, and capacity of UTF-8-encoded version of
  // the text. Length and capacity are measured in bytes.
  const char* utf8_data() const { return repr_.data_; }
  int utf8_length() const { return repr_.size_; }
  int utf8_capacity() const { return repr_.capacity_; }

  // Return the UTF-8 data as a string.
  static string UTF8Substring(const const_iterator& first,
                              const const_iterator& last);

  // There are three methods for initializing a UnicodeText from UTF-8
  // data. They vary in details of memory management. In all cases,
  // the data is tested for interchange-validity. If it is not
  // interchange-valid, a LOG(WARNING) is issued, and each
  // structurally invalid byte and each interchange-invalid codepoint
  // is replaced with a space.

  // x.CopyUTF8(buf, len) copies buf into x.
  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);

  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
  // buf. buf is not copied.
  UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
                                   int byte_length,
                                   int byte_capacity);

  // x.PointToUTF8(buf,len) changes x so that it points to buf
  // ("becomes an alias"). It does not take ownership or copy buf.
  // If the buffer is not valid, this has the same effect as
  // CopyUTF8(utf8_buffer, byte_length).
  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);

  // Occasionally it is necessary to use functions that operate on the
  // pointer returned by utf8_data(). MakeIterator(p) provides a way
  // to get back to the UnicodeText level. It uses CHECK to ensure
  // that p is a pointer within this object's UTF-8 data, and that it
  // points to the beginning of a character.
  const_iterator MakeIterator(const char* p) const;

  string DebugString() const;

 private:
  friend class const_iterator;
  friend class UnicodeTextUtils;

  class Repr {  // A byte-string.
   public:
    char* data_;    // The bytes; exposed through utf8_data().
    int size_;      // Exposed through utf8_length().
    int capacity_;  // Exposed through utf8_capacity().
    bool ours_;     // Do we own data_?

    // Starts out empty: no buffer, and flagged as owning.
    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
    // Releases the buffer with delete[] only when this Repr owns it.
    ~Repr() { if (ours_) delete[] data_; }

    void clear();
    void reserve(int capacity);
    void resize(int size);

    void append(const char* bytes, int byte_length);
    void Copy(const char* data, int size);
    void TakeOwnershipOf(char* data, int size, int capacity);
    void PointTo(const char* data, int size);

    string DebugString() const;

   private:
    // Copy operations are declared private, so code outside Repr
    // cannot copy or assign one.
    Repr& operator=(const Repr&);
    Repr(const Repr& other);
  };

  Repr repr_;

  // UTF-8-specific private methods.
  // These routines do not perform a validity check when compiled
  // in opt mode.
  // It is an error to call these methods with UTF-8 data that
  // is not interchange-valid.
  //
  UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
  UnicodeText& UnsafeTakeOwnershipOfUTF8(
      char* utf8_buffer, int byte_length, int byte_capacity);
  UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
  UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
  const_iterator UnsafeFind(const UnicodeText& look,
                            const_iterator start_pos) const;
};
// Equality of two UnicodeTexts. Only declared here; the definition is
// not part of this header.
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);

// Inequality is the exact negation of operator== above.
inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
  const bool texts_match = (lhs == rhs);
  return !texts_match;
}
// UnicodeTextRange is a pair of iterators, useful for specifying text
// segments. If the iterators are ==, the segment is empty.
typedef pair<UnicodeText::const_iterator,
UnicodeText::const_iterator> UnicodeTextRange;
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
return r.first == r.second;
}
// *************************** Utilities *************************
// A factory function for creating a UnicodeText from a buffer of
// UTF-8 data. Ownership of the buffer is handed over to a UnicodeText
// (an "owner"), so the caller must not free or reuse utf8_buffer after
// the call.
//
// Each byte that is structurally invalid will be replaced with a
// space. Each codepoint that is interchange-invalid will also be
// replaced with a space, even if the codepoint was represented with a
// multibyte sequence in the UTF-8 data.
//
// Parameters:
//   utf8_buffer   - the UTF-8 data; ownership is transferred.
//   byte_length   - length of the data, in bytes.
//   byte_capacity - capacity of the buffer, in bytes.
// Returns the resulting UnicodeText by value.
//
// NOTE(review): TakeOwnershipOfUTF8() returns a UnicodeText&, so the
// value returned here is copy-constructed from the temporary. The class
// comments say the copy constructor copies the data and produces an
// owner, so the caller presumably receives an owner holding a fresh
// copy, while the original buffer is released together with the
// temporary -- confirm against the implementation.
// NOTE(review): Repr's destructor releases owned data with delete[],
// which suggests utf8_buffer is expected to come from new char[] --
// confirm.
//
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
char* utf8_buffer, int byte_length, int byte_capacity) {
return UnicodeText().TakeOwnershipOfUTF8(
utf8_buffer, byte_length, byte_capacity);
}
// A factory function for creating a UnicodeText from a buffer of
// UTF-8 data. The new UnicodeText does not take ownership of the
// caller's buffer: a temporary UnicodeText is pointed at the buffer
// (an "alias") and the return value is produced from that temporary.
//
// Parameters:
//   utf8_buffer - the UTF-8 data; remains owned by the caller.
//   byte_length - length of the data, in bytes.
// Returns the resulting UnicodeText by value.
//
// NOTE(review): PointToUTF8() returns a UnicodeText&, so the value
// returned here is copy-constructed from the temporary. The class
// comments say the copy constructor always produces an owner, even when
// the source is an alias, so the object handed back appears to be an
// owner holding its own copy of the data rather than an alias --
// confirm before relying on aliasing (or on the absence of a copy).
//
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
const char* utf8_buffer, int byte_length) {
return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
}
// Create a UnicodeText from a UTF-8 string or buffer.
//
// utf8_buf and len describe the UTF-8 bytes; do_copy selects how the
// memory is managed:
//
//   do_copy == true:  a copy of the bytes is made. The copy is owned by
//     the resulting UnicodeText object and will be freed when the object
//     is destroyed. This UnicodeText object is referred to as an
//     "owner."
//
//   do_copy == false: no copy is made. The resulting UnicodeText object
//     does NOT take ownership of the bytes; in this case, the lifetime
//     of the UnicodeText object must not exceed the lifetime of the
//     buffer. This UnicodeText object is referred to as an "alias."
//     This is intended to match MakeUnicodeTextWithoutAcceptingOwnership.
//
// If the input does not contain valid UTF-8, then a copy is made (as if
// do_copy were true) and coerced to valid UTF-8 by replacing each
// invalid byte with a space.
//
// NOTE(review): the local is returned by value, and the class comments
// say the copy constructor produces an owner; whether an alias survives
// the return presumably depends on the compiler eliding that copy --
// confirm.
//
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
                                     bool do_copy) {
  UnicodeText text;
  if (!do_copy) {
    // Alias the caller's bytes.
    text.PointToUTF8(utf8_buf, len);
  } else {
    // Keep a private copy of the bytes.
    text.CopyUTF8(utf8_buf, len);
  }
  return text;
}
// String flavour of UTF8ToUnicodeText(buffer, length, do_copy): passes
// the string's bytes and size on to the buffer-based overload.
inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
  const char* const bytes = utf_string.data();
  const int byte_count = utf_string.size();
  return UTF8ToUnicodeText(bytes, byte_count, do_copy);
}
// Convenience overload: same as the three-argument form with do_copy
// set to true, i.e. the bytes are copied.
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
  const bool copy_bytes = true;
  return UTF8ToUnicodeText(utf8_buf, len, copy_bytes);
}
// Convenience overload: same as the (string, do_copy) form with do_copy
// set to true, i.e. the string's bytes are copied.
inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
  const bool copy_bytes = true;
  return UTF8ToUnicodeText(utf8_string, copy_bytes);
}
// Return a string containing the UTF-8 encoded version of all the
// Unicode characters in t. The string is built from t.utf8_data() and
// t.utf8_length(), so it holds its own copy of the bytes.
inline string UnicodeTextToUTF8(const UnicodeText& t) {
  const char* const bytes = t.utf8_data();
  const int byte_count = t.utf8_length();
  return string(bytes, byte_count);
}
} // namespace phonenumbers
} // namespace i18n
#endif // UTIL_UTF8_UNICODETEXT_H__
|