/usr/share/ada/adainclude/xmlada/unicode-ces-utf8.ads is in libxmlada5-dev 4.4.2014-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
------------------------------------------------------------------------------
-- XML/Ada - An XML suite for Ada95 --
-- --
-- Copyright (C) 2001-2014, AdaCore --
-- --
-- This library is free software; you can redistribute it and/or modify it --
-- under terms of the GNU General Public License as published by the Free --
-- Software Foundation; either version 3, or (at your option) any later --
-- version. This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- --
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- You should have received a copy of the GNU General Public License along --
-- with this program; see the file COPYING3. If not, see --
-- <http://www.gnu.org/licenses/>. --
-- --
------------------------------------------------------------------------------
-- This package provides support for Utf8 encoding of characters.
--
-- Characters whose code is less than 128 are encoded as is in the
-- Utf8_String. As a result, such a string is compatible with a standard
-- String whose characters are all standard ASCII (and contains no
-- extended ASCII characters).
-- In that, one of the beauties of UTF-8 (and UTF-16) is that there is no
-- overlap, as opposed to what happens with other encodings. If you search
-- for an ASCII character within a Utf8_String, using the standard
-- string or array manipulation functions, you will only find that character,
-- and not part of a longer sequence that encodes another character.
-- As a result, all the standard string-manipulation functions will work
-- as is (note however that the 'Length attribute doesn't represent the
-- number of characters in the string, but the number of bytes).
--
-- However, since characters can be encoded on one to six bytes, this means
-- that traversing a string is not as efficient as with other encodings.
--
-- Also, this encoding is not subject to byte-ordering constraints, since this
-- is only a sequence of bytes. It is self-synchronizing, in that you can
-- start anywhere in the string and find a synchronization point easily.
with Unicode.CES.Utf32;
with Unicode.CCS;
with Unchecked_Deallocation;
package Unicode.CES.Utf8 is
-- Specification of the UTF-8 character encoding scheme: encoding and
-- decoding of single characters, traversal of UTF-8 strings, conversion
-- to and from UTF-32, and the Encoding_Scheme descriptor for UTF-8.
-- Names such as Unicode_Char, Byte_Sequence, Byte_Order, Encoding_Scheme,
-- Encode_Function, Utf8_All, Invalid_Encoding and Incomplete_Encoding are
-- not declared in this file (presumably they come from the parent
-- packages Unicode and Unicode.CES -- confirm there).
-----------
-- Types --
-----------
subtype Utf8_String is String;
type Utf8_String_Access is access all Utf8_String;
-- Utf8_String is a UTF-8 encoded string. It is a plain subtype of String,
-- so any String can be passed where a Utf8_String is expected; no validity
-- check of the encoding is implied by the subtype itself.
-- Utf8_String_Access is a general access type ("access all") designating
-- such a string. See Free at the end of this package.
-------------------------------------------
-- Conversion to and from byte sequences --
-------------------------------------------
procedure Encode
(Char : Unicode_Char;
Output : in out Byte_Sequence;
Index : in out Natural);
-- Store into Output the byte sequence that represents Char in the Utf8
-- character encoding, and update Index accordingly.
-- There must remain at least 6 characters in Output if you want to avoid
-- Constraint_Errors.
-- NOTE(review): whether Index designates the last byte already written or
-- the first free position is defined by the body, not by this spec --
-- confirm before relying on it.
procedure Read
(Str : Utf8_String;
Index : in out Positive;
Char : out Unicode_Char);
-- Return in Char the character starting at location Index in Str, and
-- move Index to the beginning of the next location.
-- Invalid_Encoding is raised if no valid byte sequence starts at Index.
-- Incomplete_Encoding is raised if there are not enough characters for
-- a valid encoding.
function Width (Char : Unicode_Char) return Natural;
pragma Inline (Width);
-- Return the number of bytes occupied by the Utf8 representation of Char
function Length (Str : Utf8_String) return Natural;
-- Return the number of characters in Str. This is not the same as
-- Str'Length, which is the number of bytes.
function Utf8_Length (Str : Utf8_String) return Natural renames Length;
-- Return the number of characters in Str.
-- This is a renaming of Length, so both names denote the same function.
function Utf8_Next_Char
(Str : Utf8_String; Index : Natural) return Natural;
pragma Inline (Utf8_Next_Char);
-- Return the location of the next character in Str.
-- Index must point to the beginning of a character; when that is not
-- known to be the case, use Utf8_Find_Next_Char instead.
function Utf8_Prev_Char
(Str : Utf8_String; Index : Natural) return Natural;
pragma Inline (Utf8_Prev_Char);
-- Return the start index of the rightmost UTF-8 sequence starting
-- strictly before Index.
-- If Index is the start index of a UTF-8 sequence, this returns the
-- start index of the previous UTF-8 sequence.
-- If Index falls in the middle of a UTF-8 sequence, this returns the
-- start index of that sequence.
procedure Utf8_Get_Char
(Str : Utf8_String; Index : in out Positive; Char : out Unicode_Char);
pragma Inline (Utf8_Get_Char);
-- Similar to Read, but sets Char to Unicode_Char'Last in case of
-- invalid encoding.
-- NOTE(review): the value left in Index after an invalid encoding is not
-- specified here -- check the body.
function Utf8_Find_Next_Char
(Str : Utf8_String; Index : Natural) return Natural;
pragma Inline (Utf8_Find_Next_Char);
-- Find the start of the next UTF8 character in the string after Index.
-- Index does not have to be at the beginning of a UTF8 character.
-- If you know you are at the beginning of a UTF8 character, it is more
-- efficient to use Utf8_Next_Char.
-------------------------------------------
-- Conversion to and from Utf32-encoding --
-------------------------------------------
function From_Utf32 (Str : Unicode.CES.Utf32.Utf32_LE_String)
return Utf8_String;
-- Return a new Utf8-encoded string, from a Utf32-encoded string.
-- The parameter type is the little-endian Utf32 string type.
function To_Utf32 (Str : Utf8_String)
return Unicode.CES.Utf32.Utf32_LE_String;
-- Return a new Utf32-encoded string, from a Utf8-encoded string.
-- The result type is the little-endian Utf32 string type.
---------------------------
-- Byte order conversion --
---------------------------
function To_Unicode_LE
(Str : Utf8_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf8_String;
-- Convert Str (whose character set is Cs) to a Unicode
-- little-endian byte-sequence.
-- If Str contains a BOM that indicates an encoding other than Utf8,
-- Invalid_Encoding is raised.
-- Order is irrelevant for utf8, but is kept for interface compatibility
-- with other similar functions.
-- Cs defaults to the Unicode character set.
function To_CS
(Str : Utf8_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf8_String;
-- Convert Str (Unicode) to another character set, given by Cs.
-- The profile mirrors To_Unicode_LE, including the Order parameter.
-- NOTE(review): Order is presumably irrelevant here as well -- confirm.
---------------------
-- Encoding Scheme --
---------------------
Utf8_Encoding : constant Encoding_Scheme :=
(BOM => Utf8_All,
Read => Read'Access,
Width => Width'Access,
Encode => Encode_Function'(Encode'Access),
Length => Length'Access);
-- Descriptor for the UTF-8 encoding scheme: it associates the Utf8_All
-- BOM value with the Read, Width, Encode and Length subprograms declared
-- above, through access-to-subprogram values.
-- Encode'Access is wrapped in a qualified expression of type
-- Encode_Function (presumably to state the expected access type
-- explicitly -- confirm against the declaration of Encoding_Scheme).
------------------
-- Deallocation --
------------------
procedure Free is new Unchecked_Deallocation
(Utf8_String, Utf8_String_Access);
-- Free the memory occupied by a utf8-encoded string.
-- This is an instance of Unchecked_Deallocation, so it must only be
-- applied to strings created by an allocator ("new"): Utf8_String_Access
-- is an "access all" type and may also designate aliased objects, which
-- must not be passed to Free.
end Unicode.CES.Utf8;
|