/usr/share/doc/libsmlnj-smlnj/HTML/html-lex is in libsmlnj-smlnj 110.78-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
(* html-lex
*
* COPYRIGHT (c) 1995 AT&T Bell Laboratories.
* COPYRIGHT (c) 1996 AT&T Research.
*
* A scanner for HTML.
*
* TODO:
* Recognize the DOCTYPE element
* <!DOCTYPE HTML PUBLIC "...">
* Clean up the scanning of start tags (do we need Err?).
* Whitespace in PRE elements should be preserved, but how?
*)
structure T = Tokens

(* Elems supplies startTag and endTag, which the rules call to turn the raw
 * text of a tag into an optional token.
 *)
structure Elems = HTMLElementsFn (
    structure Tokens = Tokens
    structure Err = Err
    structure HTMLAttrs = HTMLAttrs)

(* positions handed to the token constructors are plain integers *)
type pos = int
type svalue = T.svalue

(* The lexer argument: an error callback taking a message and two integers,
 * paired with an optional string.  NOTE(review): the second component is
 * presumably the name of the file being scanned -- confirm against callers.
 *)
type arg = ((string * int * int) -> unit) * string option

type ('a, 'b) token = ('a, 'b) T.token

(* the type returned by every rule action and by eof *)
type lexresult = (svalue, pos) token

(* end of input: ignores its argument and yields EOF with both positions 0 *)
fun eof _ = T.EOF (0, 0)
(* Accumulator for text that arrives in several pieces; the most recently
 * added piece sits at the head of the list.
 *)
val buffer = ref ([] : string list)

(* push one more piece onto the accumulator *)
fun addStr piece = buffer := piece :: !buffer

(* Return the pieces joined in the order they were added, and leave the
 * accumulator empty.
 *)
fun getStr () = let
      val joined = String.concat (List.rev (!buffer))
      in
        buffer := [];
        joined
      end
%%
%s COM1 COM2 STAG;

%header (functor HTMLLexFn (
    structure Tokens : HTML_TOKENS
    structure Err : HTML_ERROR
    structure HTMLAttrs : HTML_ATTRS));

%arg (errorFn, file);

%full
%count

alpha = [A-Za-z];
digit = [0-9];
namechar = [-A-Za-z0-9.];
tag = ({alpha}{namechar}*);
ws = [\ \t];
%%
<INITIAL>"<"{tag}
    => ((* Opening of a start tag: remember the matched text and switch to
         * STAG, where the rest of the tag is collected piece by piece.
         *
         * The buffer is reset here instead of appended to.  It is a single
         * ref in the user declarations and is otherwise emptied only by
         * getStr at the closing bracket, so pieces left behind by a scan that
         * stopped inside a tag would be prepended to this tag.
         *)
        buffer := [yytext];
        YYBEGIN STAG;
        continue ());
<STAG>">"
    => (let
          val line = !yylineno
        in
          (* the closing bracket completes the tag text held in the buffer *)
          addStr yytext;
          YYBEGIN INITIAL;
          (* startTag decides whether the collected text yields a token; when
           * it yields none, scanning simply resumes
           *)
          case Elems.startTag file (getStr (), line, line)
           of SOME tok => tok
            | NONE => continue ()
        end);
<STAG>\n
    => ((* a line break inside a tag is recorded as one blank *)
        addStr " ";
        continue ());
<STAG>{ws}+
    => ((* runs of blanks and tabs are kept as written *)
        addStr yytext;
        continue ());
<STAG>{namechar}+
    => ((* attribute names and unquoted values *)
        addStr yytext;
        continue ());
<STAG>"="
    => (addStr yytext;
        continue ());
<STAG>"\""[^\"\n]*"\""
    => ((* double-quoted value that closes on the same line *)
        addStr yytext;
        continue ());
<STAG>"'"[^'\n]*"'"
    => ((* single-quoted value that closes on the same line *)
        addStr yytext;
        continue ());
<STAG>.
    => ((* any other character is collected unchanged *)
        addStr yytext;
        continue ());
<INITIAL>"</"{tag}{ws}*">"
    => (let
          val line = !yylineno
        in
          (* endTag decides whether this text yields a token; when it yields
           * none, scanning simply resumes
           *)
          case Elems.endTag file (yytext, line, line)
           of SOME tok => tok
            | NONE => continue ()
        end);
<INITIAL>"<!--"
    => ((* comment opener: COM1 is the state for text inside a comment *)
        YYBEGIN COM1;
        continue ());
<COM1>"--"
    => ((* two dashes end the comment text: COM2 then waits for the closing
         * bracket or for another two dashes
         *)
        YYBEGIN COM2;
        continue ());
<COM1>\n
    => (continue ());
<COM1>.
    => ((* comment text is discarded *)
        continue ());
<COM2>"--"
    => ((* two more dashes open a further stretch of comment text *)
        YYBEGIN COM1;
        continue ());
<COM2>">"
    => ((* end of the whole comment *)
        YYBEGIN INITIAL;
        continue ());
<COM2>\n
    => (continue ());
<COM2>{ws}
    => (continue ());
<COM2>.
    => (let
          val line = !yylineno
        in
          (* anything else at this point is reported, and scanning resumes in
           * the INITIAL state
           *)
          errorFn ("bad comment syntax", line, line + 1);
          YYBEGIN INITIAL;
          continue ()
        end);
<INITIAL>"&#"{alpha}+";"
    => ((** At some point, we should support &#SPACE; and &#TAB; **)
        (* until then a named character reference produces no token *)
        continue ());
<INITIAL>"&#"{digit}+";"
    => (let
          val line = !yylineno
        in
          (* numeric character reference, passed on with its full text *)
          T.CHAR_REF (yytext, line, line)
        end);
<INITIAL>"&"{tag}";"
    => (let
          val line = !yylineno
        in
          (* entity reference, passed on with its full text *)
          T.ENTITY_REF (yytext, line, line)
        end);
<INITIAL>"\n"
    => ((* a newline matched by this rule produces no token *)
        continue ());
<INITIAL>{ws}
    => ((* a blank or tab matched by this rule produces no token *)
        continue ());
<INITIAL>[^<]+
    => (let
          val line = !yylineno
        in
          (* a run of characters up to the next opening angle bracket *)
          T.PCDATA (yytext, line, line)
        end);
<INITIAL>.
    => (let
          val line = !yylineno
          val shown = Char.toString (String.sub (yytext, 0))
          val msg = concat ["bogus character #\"", shown, "\" in PCDATA\n"]
        in
          (* fallback when no other INITIAL rule matches: report the
           * character and carry on
           *)
          errorFn (msg, line, line + 1);
          continue ()
        end);
|