/usr/lib/ocaml/netstring/netulex.mli is in libocamlnet-ocaml-dev 4.0.4-1build3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 | (* $Id: netulex.mli 1003 2006-09-24 15:17:15Z gerd $
* ----------------------------------------------------------------------
* PXP: The polymorphic XML parser for Objective Caml.
* Copyright by Gerd Stolpmann. See LICENSE for details.
*)
(** Support module for Alain Frisch's [ulex] lexer generator
*
* The sub module [ULB] is a Unicode-based lexing buffer that
* reads encoded strings and makes them available to the lexer
* as both Unicode arrays and UTF-8 strings.
*
* The sub module [Ulexing] is a replacement for the module
* in [ulex] with the same name. It uses [ULB] to represent
* the main lexing buffer. It is much faster than the original
* [Ulexing] implementation when the scanned text is UTF-8
* encoded and [Ulexing.utf8_lexeme] is frequently called to
* get the lexeme strings. Furthermore, it can process input
* data of all encodings available to [Netconversion]. It is,
* however, no drop-in replacement as it has a different
* signature.
*
* To enable this version of [Ulexing], simply put an
* [open Netulex] before using the [ulex] lexers.
*
* Note that the tutorial has been moved to {!Netulex_tut}.
*)
(** {1:modules Modules} *)
module ULB : sig
(** This module provides the [unicode_lexbuf] record with
* access functions. In this record, the data is available
* in two forms: As an array of Unicode code points
* [ulb_chars], and as string of encoded chars [ulb_rawbuf].
* Both buffers are synchronised by [ulb_chars_pos]. This
* array stores where every character of [ulb_chars] can be
* found in [ulb_rawbuf].
*)
type unicode_lexbuf =
private
{ mutable ulb_encoding : Netconversion.encoding;
(** The character encoding of [ulb_rawbuf] *)
mutable ulb_encoding_start : int;
(** The first character position to which [ulb_encoding]
* applies (the encoding of earlier positions is
* lost)
*)
mutable ulb_rawbuf : string;
(** The encoded string to analyse *)
mutable ulb_rawbuf_len : int;
(** The filled part of [ulb_rawbuf] *)
mutable ulb_rawbuf_end : int;
(** The analysed part of [ulb_rawbuf]. We have always
* [ulb_rawbuf_end <= ulb_rawbuf_len]. The analysed part
* may be shorter than the filled part because there is
* not enough space in [ulb_chars], or because the filled
* part ends with an incomplete multi-byte character
*)
mutable ulb_rawbuf_const : bool;
(** Whether [ulb_rawbuf] is considered as a constant. If
* [true], it is never blitted.
*)
mutable ulb_chars : int array;
(** The analysed part of [ulb_rawbuf] as array of Unicode
* code points. Only the positions 0 to [ulb_chars_len-1]
* of the array are filled.
*)
mutable ulb_chars_pos : int array;
(** For every analysed character this array stores the
* byte position where the character begins in [ulb_rawbuf].
* In addition, the array contains at [ulb_chars_len] the
* value of [ulb_rawbuf_end].
*
* This array is one element longer than [ulb_chars].
*)
mutable ulb_chars_len : int;
(** The filled part of [ulb_chars] *)
mutable ulb_eof : bool;
(** Whether EOF has been seen *)
mutable ulb_refill : string -> int -> int -> int;
(** The refill function *)
mutable ulb_enc_change_hook : unicode_lexbuf -> unit;
(** This function is called when the encoding changes *)
mutable ulb_cursor : Netconversion.cursor;
(** Internally used by the implementation *)
}
val from_function :
?raw_size:int ->
?char_size:int ->
?enc_change_hook:(unicode_lexbuf -> unit) ->
refill:(string -> int -> int -> int) ->
Netconversion.encoding ->
unicode_lexbuf
(** Creates a [unicode_lexbuf] to analyse strings of the
* passed [encoding] coming from the [refill] function.
*
* @param raw_size The initial size for [ulb_rawbuf]. Defaults to 512
* @param char_size The initial size for [ulb_chars]. Defaults to 256
* @param enc_change_hook This function is called when the encoding
* is changed, either by this module, or by the user calling
* [set_encoding].
* @param refill This function is called with arguments [ulb_rawbuf],
* [ulb_rawbuf_len], and [l], where
* [l = String.length ulb_rawbuf - ulb_rawbuf_len] is the free
* space in the buffer. The function should fill new bytes into
* this substring, and return the number of added bytes. The
* return value 0 signals EOF.
*)
val from_in_obj_channel :
?raw_size:int ->
?char_size:int ->
?enc_change_hook:(unicode_lexbuf -> unit) ->
Netconversion.encoding ->
Netchannels.in_obj_channel ->
unicode_lexbuf
(** Creates a [unicode_lexbuf] to analyse strings of the
* passed [encoding] coming from the object channel.
*
* @param raw_size The initial size for [ulb_rawbuf]. Defaults to 512
* @param char_size The initial size for [ulb_chars]. Defaults to 256
* @param enc_change_hook This function is called when the encoding
* is changed, either by this module, or by the user calling
* [set_encoding].
*)
val from_string :
?enc_change_hook:(unicode_lexbuf -> unit) ->
Netconversion.encoding -> string -> unicode_lexbuf
(** Creates a [unicode_lexbuf] analysing the passed string encoded in
* the passed encoding. This function copies the input string.
*
* @param enc_change_hook This function is called when the encoding
* is changed, either by this module, or by the user calling
* [set_encoding]
*)
val from_string_inplace :
?enc_change_hook:(unicode_lexbuf -> unit) ->
Netconversion.encoding -> string -> unicode_lexbuf
(** Creates a [unicode_lexbuf] analysing the passed string encoded in
* the passed encoding. This function does not copy the input string,
* but uses it directly as [ulb_rawbuf]. The string is not modified by [ULB],
* but the caller must ensure that other program parts do not
* modify it either.
*
* @param enc_change_hook This function is called when the encoding
* is changed, either by this module, or by the user calling
* [set_encoding]
*)
val delete :
int -> unicode_lexbuf -> unit
(** Deletes the number of characters from [unicode_lexbuf].
* These characters
* are removed from the beginning of the buffer, i.e.
* [ulb_chars.(n)] becomes the new first character of the
* buffer. All three buffers [ulb_rawbuf], [ulb_chars], and
* [ulb_chars_pos] are blitted as necessary.
*
* When the buffer is already at EOF, the function fails.
*
* For efficiency, it should be tried to call [delete] as seldom as
* possible. Its speed is linear to the number of characters to move.
*)
val refill :
unicode_lexbuf ->
unit
(** Tries to add characters to the [unicode_lexbuf] by calling the
* [ulb_refill] function. When the buffer is already at EOF, the
* exception [End_of_file] is raised, and the buffer is not modified.
* Otherwise, the [ulb_refill] function is called to
* add new characters. If necessary, [ulb_rawbuf], [ulb_chars], and
* [ulb_chars_pos] are enlarged such that it is ensured that either
* at least one new character is added, or that EOF is found for
* the first time
* In the latter case, [ulb_eof] is set to [true] (and the next call
* of [refill_unicode_lexbuf] will raise [End_of_file]).
*)
val set_encoding :
Netconversion.encoding -> unicode_lexbuf -> unit
(** Sets the [encoding] to the passed value. This only affects future
* [refill] calls. The hook [enc_change_hook] is invoked when defined.
*)
val close :
unicode_lexbuf -> unit
(** Sets [ulb_eof] of the [unicode_lexbuf]. The rest of the buffer
* is not modified
*)
val utf8_sub_string : int -> int -> unicode_lexbuf -> string
(** The two [int] arguments are the position and length of a sub
* string of the lexbuf that is returned as UTF8 string. Position
* and length are given as character multiples, not byte multiples.
*)
val utf8_sub_string_length : int -> int -> unicode_lexbuf -> int
(** Returns [String.length(utf8_sub_string args)]. Tries not to
* allocate the UTF-8 string.
*)
end (* module ULB *)
module Ulexing : sig
(** This is a lexing buffer for [ulex]. *)
type lexbuf
exception Error
(** Lexical error *)
val from_ulb_lexbuf : ULB.unicode_lexbuf -> lexbuf
(** Creates a new [lexbuf] from the [unicode_lexbuf]. After that,
* the [unicode_lexbuf] must no longer be modified.
*)
val lexeme_start: lexbuf -> int
(** The character position of the start of the lexeme *)
val lexeme_end: lexbuf -> int
(** The character position of the end of the lexeme *)
val lexeme_length: lexbuf -> int
(** The length of the lexeme in characters *)
val lexeme: lexbuf -> int array
(** Returns the lexeme as array of Unicode code points *)
val lexeme_char: lexbuf -> int -> int
(** Returns the code point of a certain character of the
* lexeme
*)
val sub_lexeme: lexbuf -> int -> int -> int array
(** Returns a substring of the lexeme as array of Unicode
* code points. The first [int] is the characater position
* where to start, the second [int] is the number of
* characters.
*)
val utf8_lexeme: lexbuf -> string
(** Returns the lexeme as UTF-8 encoded string *)
val utf8_sub_lexeme: lexbuf -> int -> int -> string
(** Returns a substring of the lexeme as UTF-8 encoded
* string. The first [int] is the characater position
* where to start, the second [int] is the number of
* characters.
*)
val utf8_sub_lexeme_length: lexbuf -> int -> int -> int
(** Same as
* String.length(utf8_sub_lexeme args), i.e. returns
* the number of bytes a certain sub lexeme will have
* when encoded as UTF-8 string.
*)
(**/**)
(* "Internal" interface. This must match ulex's ones. *)
val start: lexbuf -> unit
val next: lexbuf -> int
val mark: lexbuf -> int -> unit
val backtrack: lexbuf -> int
end
|