/usr/lib/ruby/1.8/samizdat/sanitize.rb is in libsamizdat-ruby1.8 0.6.2-2ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Samizdat HTML validation
#
# Copyright (c) 2002-2009 Dmitry Borodaenko <angdraug@debian.org>
#
# This program is free software.
# You can distribute/modify this program under the terms of
# the GNU General Public License version 3 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0
require 'rexml/document'
# use (") instead of (') in XML attributes, escape both of them
#
module REXML
  class Attribute
    # Serialise the attribute as <tt>name="value"</tt>.
    #
    # The value is always wrapped in double quotes, and both kinds of quote
    # characters inside it are replaced with their XML entities (&quot; and
    # &apos;), so the result can be embedded safely regardless of which
    # quoting style the surrounding markup uses.
    #
    # Returns a String.
    #
    def to_string
      escaped = to_s().gsub(/"/, '&quot;').gsub(/'/, '&apos;')
      "#{@expanded_name}=\"#{escaped}\""
    end
  end
end # module REXML
module Samizdat

  # Raised by Sanitize when markup cannot be tidied or parsed, when the
  # tidied document has no body, or when no Tidy backend is available.
  #
  class SanitizeError < RuntimeError; end

  # Whitelist-based HTML sanitizer.
  #
  # Markup is first normalised to XHTML by HTML Tidy (through a pipe to the
  # tidy binary, or through the Ruby/Tidy library wrapper), then parsed with
  # REXML and compared against a filter Hash:
  #
  # * every key not starting with "_" is an allowed element name, mapped to a
  #   Hash of attribute name => matcher for that element (or nil);
  # * "_common" maps attribute names to matchers valid on every element;
  # * "_css" (optional) is the collection of CSS property names allowed
  #   inside style="" attributes.
  #
  # A matcher is compared with the attribute value using <tt>===</tt>.
  #
  class Sanitize

    begin
      FORMATTER = REXML::Formatters::Default.new(true)   # enable IE hack
    rescue LoadError, NameError
      # backwards compatibility for Ruby versions without REXML::Formatters
      #
      class LegacyFormatter
        # Append _node_ to _output_ using the node's own writer; nodes that
        # cannot write themselves are skipped.
        #
        def write(node, output)
          return unless node.respond_to?(:write)
          node.write(output, -1, false, true)
        end
      end

      FORMATTER = LegacyFormatter.new
    end

    # _xhtml_ is expected to be loaded from xhtml.yaml.
    #
    # _tidypath_ may point to a binary or library. If it's a library (detected by
    # .so in the file name), Ruby/Tidy DL-based wrapper library will be used. If
    # it's a binary, pipe will be used to filter HTML through it.
    #
    def initialize(xhtml, tidypath=nil)
      @xhtml = xhtml
      set_tidy(tidypath)
    end

    # default filter used by #sanitize and #sanitize_element
    #
    attr_reader :xhtml

    # A single CSS declaration: property name, colon, then a value made of
    # bare words/numbers, #hex colours and percentages, optionally separated
    # by whitespace. The property name is captured in group 1.
    #
    # The value part is matched one character (or one "digit%" / "#hex-digit"
    # pair) at a time instead of with nested repetitions: it accepts the same
    # strings as a "one or more tokens" pattern, but cannot backtrack
    # exponentially on a hostile style value.
    #
    CSS = %r{
      \A\s*
      ([-a-z0-9]+) : \s*
      (?: [0-9]% | [-./a-z0-9] | \#[0-9a-f] )
      (?: [0-9]% | [-./a-z0-9] | \#[0-9a-f] | \s )*
      \z
    }xi.freeze

    # Check every ';'-separated declaration of _style_.
    #
    # Returns true only if each declaration matches CSS and its property name
    # is included in _css_; returns false on the first offending declaration.
    #
    def check_style(css, style)
      style.split(';').each do |s|
        declaration = CSS.match(s)
        return false unless declaration and css.include?(declaration[1])
      end
      true
    end

    # compare elements and attributes with xhtml.yaml
    #
    # Removes _xml_ from its document when its name is not a key of _filter_
    # (or starts with "_", which is reserved for filter metadata). Otherwise
    # removes every attribute whose value is not accepted by the matching
    # entry in _filter_, removes style="" attributes rejected by #check_style,
    # and descends into child elements with the same _filter_.
    #
    def sanitize_element(xml, filter=@xhtml)
      if xml.name =~ /\A_/ or not filter.key?(xml.name)
        # doesn't work without xpath
        xml.document.delete_element(xml.xpath)
        return
      end

      if xml.has_attributes?
        attrs = (filter['_common'] or {}).merge((filter[xml.name] or {}))

        # take a snapshot first: attributes are deleted while we walk them
        attributes = []
        xml.attributes.each_attribute {|a| attributes << a }

        attributes.each do |a|
          xml.delete_attribute(a.name) unless attrs[a.name] === a.to_s

          if 'style' == a.name and filter['_css']
            # sanitize CSS in style="" attributes
            xml.delete_attribute(a.name) unless
              check_style(filter['_css'], a.value)
          end
        end
      end

      if xml.has_elements?   # recurse, keeping the caller's filter
        xml.elements.each {|e| sanitize_element(e, filter) }
      end
    end

    # filter HTML through Tidy
    #
    def tidy(html)
      @tidy_binary ? tidy_pipe(html) : tidy_dl(html)
    end

    # return sanitized HTML
    #
    # _html_ is run through #tidy, parsed, and everything inside its body is
    # checked against _filter_. Returns the serialized children of the body.
    #
    # Raises SanitizeError if Tidy returns nothing, if the result is not
    # well-formed XML, or if it contains no html/body element.
    #
    def sanitize(html, filter=@xhtml)
      html = tidy(html)
      (html.nil? or html.empty?) and raise SanitizeError,
        "Invalid HTML detected"

      begin
        root = REXML::Document.new(html).root
        body = root && root.elements['//html/body']
      rescue REXML::ParseException => e
        raise SanitizeError, "Invalid XHTML detected: " +
          e.continued_exception.to_s.gsub(/\n.*/, '')
      end

      unless body
        raise SanitizeError, "Invalid XHTML detected: no body element found"
      end

      # Only the children of body are written out, so they are what gets
      # checked; this does not depend on 'body' itself being in the filter.
      body.elements.each {|e| sanitize_element(e, filter) }

      output = ''.dup
      body.each {|child| FORMATTER.write(child, output) }
      output
    end

    private

    SO_PATH_PATTERN = /\.so(?:\..+)?\z/.freeze

    # locations probed when no explicit tidy path is given, in order
    #
    TIDY_SEARCH_PATHS = [
      '/usr/bin/tidy',
      '/usr/local/bin/tidy',
      '/usr/lib/libtidy.so',
      '/usr/local/lib/libtidy.so'
    ].freeze

    # command line options for the tidy binary (see also #tidy_dl)
    #
    TIDY_PIPE_OPTIONS = %w[
      --quiet yes
      --show-warnings no
      --show-errors 1
      --output-xhtml yes
      --literal-attributes yes
      --preserve-entities yes
      --tidy-mark no
      --wrap 0
      --char-encoding utf8
    ].freeze

    # true-ish when _path_ looks like a shared library and is readable
    #
    def is_so?(path)
      path and path =~ SO_PATH_PATTERN and File.readable?(path)
    end

    # Select the Tidy backend.
    #
    # When _tidypath_ is nil, the first existing entry of TIDY_SEARCH_PATHS is
    # used. A shared library loads Ruby/Tidy; an executable enables the pipe
    # backend (@tidy_binary). If nothing usable is found, @tidy_binary stays
    # nil and no library is loaded.
    #
    def set_tidy(tidypath)
      tidypath ||= TIDY_SEARCH_PATHS.find {|path| File.exist?(path) }
      @tidy_binary = nil

      if is_so?(tidypath)
        require 'tidy'
        # workaround for memory leak in Tidy.path=
        if not defined?(@@tidysopath) or tidypath != @@tidysopath
          Tidy.path = @@tidysopath = tidypath
        end
      elsif tidypath and File.executable?(tidypath)
        @tidy_binary = tidypath
        require 'open3'
      end
    end

    # Clear the taint flag on _string_ where the running Ruby still has one
    # (the taint API is absent from newer Ruby versions).
    #
    def untainted(string)
      string.respond_to?(:untaint) ? string.untaint : string
    end

    # Filter _html_ through the Ruby/Tidy library wrapper.
    #
    # Raises SanitizeError when the Tidy library has not been loaded.
    #
    def tidy_dl(html)
      unless defined?(Tidy)
        raise SanitizeError, "Tidy is not available"
      end

      xml = Tidy.open(:quiet => true,
                      :show_warnings => false,
                      :show_errors => 1,
                      :output_xhtml => true,
                      :literal_attributes => true,
                      :preserve_entities => true,
                      :tidy_mark => false,
                      :wrap => 0,
                      :char_encoding => 'utf8'
      ) {|tidy| tidy.clean(untainted(html.to_s)) }

      # the output is derived from user input, mark it accordingly
      xml.respond_to?(:taint) ? xml.taint : xml
    end

    # Filter _html_ through the tidy binary.
    #
    # Returns whatever tidy printed on stdout. Raises SanitizeError if tidy
    # printed anything on stderr.
    #
    def tidy_pipe(html)
      errors = xhtml = nil

      # Passing the options as separate arguments runs the binary directly,
      # so the path is never interpreted by a shell. The block form closes
      # all pipes even if an exception is raised.
      Open3.popen3(@tidy_binary, *TIDY_PIPE_OPTIONS) do |stdin, stdout, stderr|
        stdin.write(untainted(html.to_s))
        stdin.close

        # Drain stdout on its own thread: reading stderr to the end first
        # could block forever if tidy is itself blocked writing a large
        # document to a full stdout pipe.
        reader = Thread.new { stdout.read }
        errors = stderr.read
        xhtml = reader.value
      end

      errors.nil? or errors.empty? or raise SanitizeError,
        "Invalid HTML detected: " + errors

      xhtml
    end
  end
end # module Samizdat
|