/usr/share/ada/adainclude/opentoken/html_lexer.ads is in libopentoken4-dev 5.0a-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | -------------------------------------------------------------------------------
--
-- Copyright (C) 2009, 2010, 2013 Stephen Leake
-- Copyright (C) 1999, 2000 Christoph Karl Walter Grein
--
-- This file is part of the OpenToken package.
--
-- The OpenToken package is free software; you can redistribute it and/or
-- modify it under the terms of the GNU General Public License as published
-- by the Free Software Foundation; either version 3, or (at your option)
-- any later version. The OpenToken package is distributed in the hope that
-- it will be useful, but WITHOUT ANY WARRANTY; without even the implied
-- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details. You should have received
-- a copy of the GNU General Public License distributed with the OpenToken
-- package; see file GPL.txt. If not, write to the Free Software Foundation,
-- 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
--
-- As a special exception, if other files instantiate generics from
-- this unit, or you link this unit with other files to produce an
-- executable, this unit does not by itself cause the resulting
-- executable to be covered by the GNU General Public License. This
-- exception does not however invalidate any other reasons why the
-- executable file might be covered by the GNU Public License.
-------------------------------------------------------------------------------
with Ada.Strings.Unbounded;
with OpenToken.Token.Enumerated.Analyzer;
package HTML_Lexer is
----------------------------------------------------------------------
-- Utilities for a lexical analyser for the HTML language.
--
-- See the child packages HTML_Lexer.Task_Safe, .Task_Unsafe for
-- the actual lexers.
--
----------------------------------------------------------------------
type Token_Name is
(
-- Syntax error
Bad_Token,
-- Comments <!-- anything -->
Comment,
Whitespace,
-- Document Type Declaration <!DOCTYPE attributes>
Document_Type, -- treated as a comment
-- Tag delimiters
Start_Tag_Opener, -- <
End_Tag_Opener, -- </
Tag_Closer, -- >
-- Tags (without delimiters), not all tags may have attributes
Anchor, -- <A attributes>
HTML, -- <HTML attributes>
HTML_Body, -- <BODY attributes>
Head, -- <HEAD attributes>
Heading_1, -- <H1 attributes>
Image, -- <IMG attributes>
Meta, -- <META attributes>
Pre, -- <pre> ... </pre>, treated as a comment
Title, -- <TITLE attributes>
-- Attributes (Attribute=value)
Content, -- CONTENT
Hyper_Reference, -- HREF
Name, -- NAME
Link_Type, -- TYPE
Source, -- SRC
-- The assignment character in attributes
Assignment, -- =
-- Values (the right side of assignments)
Value, -- unquoted
String, -- "quoted"
-- Running text and entities like &
Text,
Entity,
End_Of_File);
type HTML_Token is private;
function Name (Token : in HTML_Token) return Token_Name;
function Lexeme (Token : in HTML_Token) return Standard.String;
function Line (Token : in HTML_Token) return Natural;
function Column (Token : in HTML_Token) return Natural;
private
type HTML_Token is record
Name : Token_Name;
Lexeme : Ada.Strings.Unbounded.Unbounded_String;
Line : Natural;
Column : Natural;
end record;
-- Visible for children
package Master_Token is new OpenToken.Token.Enumerated (Token_Name, Token_Name'Image, Token_Name'Width);
package Tokenizer is new Master_Token.Analyzer (Token_Name'First, Token_Name'Last);
-----------------------------------------------------------------------
-- HTML syntax is very different from Ada or Java syntax. This is an
-- abbreviated excerpt of the HTML 4.0 Reference.
--
-- Elements are the structures that describe parts of an HTML
-- document ... An element has three parts: a start tag, content,
-- and an end tag. A tag is special text--"markup"--that is
-- delimited by "<" and ">". An end tag includes a "/" after the
-- "<" ... The start and end tags surround the content of the
--- element:
-- <EM>This is emphasized text</EM>
-- ... An element's attributes define various properties for the
-- element ...
--
-- <IMG SRC="wdglogo.gif" ALT="Web Design Group">
--
-- An attribute is included in the start tag only--never the end
-- tag--and takes the form Attribute-name="Attribute-value".
--
-- Thus the text between tokens is arbitrary and need not be analysed.
-- In fact the whole text between tags is treated as a token of its
-- own (entities excepted).
-- Inside tags, however, we want to analyse for tag names, attribute
-- names and attribute values.
-- Thus we have to analyse the HTML document after an opening "<" and
-- stop after a closing ">".
--
-- So the idea is the following:
-- We split the syntax into two parts: A text and a tag syntax.
-- The lexer starts with the text syntax, and every time a tag opener
-- or closer is hit, we change the syntax to the appropriate one.
--
-- When defining the syntaxes, the following has to be taken into
-- account:
-- Since the syntax has to contain all token names, unused (and hence
-- in this syntax illegal) names use the Nothing recognizer. In order
-- to return them as Bad_Token, this name has to come first in the
-- sequence of names.
-- Since Document_Type and Comment both use the Bracketed_Comment
-- recognizer with the same opening string "<!", Comment has to come
-- first in the sequence of names.
--
-- If Document_Type is to be analyzed further like other tags, the
-- same trick with switching syntaxes can be applied.
-----------------------------------------------------------------------
function Text_Syntax return Tokenizer.Syntax;
function Tag_Syntax return Tokenizer.Syntax;
-- These must be functions, not constants, because they contain
-- pointers, and we don't have a deep copy defined.
end HTML_Lexer;
|