/usr/share/tdiary/bayes.rb is in tdiary-contrib 3.2.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright (C) 2007, KURODA Hiraku <hiraku@hinet.mydns.jp>
# You can redistribute it and/or modify it under GPL2. 
require "pstore"
module Bayes
	# Per-encoding character classes and the token regexps built from them.
	# Each nested module (EUC, UTF8) defines KCODE, KATAKANA, BAR, KANJI,
	# RE_KATAKANA and RE_KANJI, then calls CHARSET.setup_re(self), which adds
	# RE_MESSAGE_TOKEN to that module.
	module CHARSET
		# Defines m::RE_MESSAGE_TOKEN as the union of m::RE_KATAKANA,
		# m::RE_KANJI and /[a-zA-Z]+/.
		#
		# $KCODE is switched to m::KCODE while the union is built and is put
		# back to its previous value afterwards.
		# NOTE(review): presumably this makes Regexp.union produce a pattern in
		# the module's encoding under Ruby 1.8 -- confirm on the target Ruby.
		#
		# m:: module providing KCODE, RE_KATAKANA and RE_KANJI.
		def self.setup_re(m)
			o = $KCODE
			$KCODE = m::KCODE
			m.const_set(:RE_MESSAGE_TOKEN, Regexp.union(m::RE_KATAKANA, m::RE_KANJI, /[a-zA-Z]+/))
			$KCODE=o
		end
		# Character classes written as raw two-byte sequences and compiled
		# with the /e regexp flag.
		# NOTE(review): the byte values look like EUC-JP code ranges for
		# katakana, the long-vowel mark and kanji -- confirm against a code table.
		module EUC
			KCODE = "e"
			# Each range constant is "<first>-<last>", ready to be placed inside [...].
			KATAKANA = "\xa5\xa2-\xa5\xf3"
			BAR = "\xa1\xbc"
			KANJI = "\xb0\xa1-\xfc\xfe"
			# A token is a run of two or more characters from the class ({2,}).
			# The /o flag interpolates the constants only once.
			RE_KATAKANA = /[#{KATAKANA}#{BAR}]{2,}/eo
			RE_KANJI = /[#{KANJI}]{2,}/eo
			CHARSET.setup_re(self)
		end
		# Character classes built from Unicode code points and compiled with
		# the /u regexp flag.
		module UTF8
			KCODE = "u"
			# Returns the string for code point c, encoded via Array#pack("U").
			def self.c2u(c)
				[c].pack("U")
			end
			# Returns "<a>-<b>" for code points a and b, i.e. a range usable
			# inside a regexp character class.
			def self.utf_range(a, b)
				"#{c2u(a)}-#{c2u(b)}"
			end
			# Code points 0x30a0 through 0x30ff.
			KATAKANA = utf_range(0x30a0, 0x30ff)
			# Code point 0x30fc; it already lies inside the KATAKANA range above.
			BAR = c2u(0x30fc)
			# Code points 0x4e00 through 0x9faf.
			KANJI = utf_range(0x4e00, 0x9faf)
			# A token is a run of two or more characters from the class ({2,}).
			# The /o flag interpolates the constants only once.
			RE_KATAKANA = /[#{KATAKANA}#{BAR}]{2,}/uo
			RE_KANJI = /[#{KANJI}]{2,}/uo
			CHARSET.setup_re(self)
		end
	end
	# Array of tokens harvested from host names, URLs, mail addresses and
	# message text. Every add_* method appends to the receiver and returns it.
	# When a prefix is given, each appended token becomes "<prefix> <token>".
	class TokenList < Array
		attr_reader :charset

		# charset:: module supplying RE_MESSAGE_TOKEN. When omitted,
		#           CHARSET::EUC is used if $KCODE starts with "e" (any case),
		#           otherwise CHARSET::UTF8.
		def initialize(charset=nil)
			@charset = charset || (/^e/i === $KCODE ? CHARSET::EUC : CHARSET::UTF8)
		end

		# Array#concat stays reachable under its original behaviour.
		alias _concat concat

		# Appends all elements of array, tagging each with prefix when given.
		def concat(array, prefix=nil)
			return _concat(array) unless prefix
			_concat(array.map { |entry| "#{prefix} #{entry.to_s}" })
		end

		# Array#push stays reachable under its original behaviour.
		alias _push push

		# Appends item, tagged with prefix when given.
		def push(item, prefix=nil)
			_push(prefix ? "#{prefix} #{item.to_s}" : item)
		end

		# Appends host plus progressively smaller pieces of it.
		#
		# A dotted-quad address yields itself and each shorter leading part
		# ("1.2.3.4", "1.2.3", "1.2", "1"). Any other host yields itself and
		# then, for every "-", "_" or "." separator from the left, the piece
		# before the separator and the remainder after it.
		def add_host(host, prefix=nil)
			if /^(?:\d{1,3}\.){3}\d{1,3}$/ =~ host
				remaining = host
				until remaining.size == 0
					push(remaining, prefix)
					remaining = remaining[/^(.*?)\.?\d+$/, 1]
				end
				return self
			end

			push(host, prefix)
			tail = host
			while (pieces = /^(.*?)[-_.](.*)$/.match(tail))
				head, tail = pieces[1], pieces[2]
				push(head, prefix)
				push(tail, prefix)
			end
			self
		end

		# Appends tokens for an http, https or ftp URL: the host tokens (see
		# add_host) and, when the path is non-empty, the path followed by the
		# pieces obtained by splitting it at "-", "_", "." or "/" from the right.
		# A URL that does not match the pattern adds nothing.
		def add_url(url, prefix=nil)
			parts = %r[^(?:https?|ftp)://(.*?)(?::\d+)?/(.*?)\/?(\?.*)?$].match(url)
			return self unless parts

			add_host(parts[1], prefix)
			path = parts[2]
			return self if path.size == 0

			push(path, prefix)
			splitter = %r[^(.*)[-_./](.*?)$]
			while (cut = splitter.match(path))
				path = cut[1]
				push(cut[2], prefix)
				push(path, prefix)
			end
			self
		end

		# Appends every match of the charset's RE_MESSAGE_TOKEN in message.
		def add_message(message, prefix=nil)
			words = message.scan(@charset::RE_MESSAGE_TOKEN)
			concat(words, prefix)
			self
		end

		# Appends the whole address, then (when the part before "@" is
		# non-empty) that part and the host tokens of the part after it.
		def add_mail_addr(addr, prefix=nil)
			push(addr, prefix)
			local, domain = addr.split(/@/)
			return self if local.nil? || local.empty?

			push(local, prefix)
			add_host(domain || "", prefix)
		end
	end
	# Behaviour shared by the Bayesian filters: the spam/ham corpora, PStore
	# persistence, token lookup and the combined spam estimate.
	#
	# A subclass must provide a nested Corpus class (constructed without
	# arguments) and a score(token) method.
	class FilterBase
		attr_reader :spam, :ham, :db_name, :charset

		# db_name:: optional path of a PStore file. When the file exists, its
		#           "spam", "ham" and "charset" roots replace the defaults;
		#           a root missing from the file leaves the default in place.
		# charset:: value kept in @charset and written back by save.
		def initialize(db_name=nil, charset=nil)
			@spam = self.class::Corpus.new
			@ham = self.class::Corpus.new
			@charset = charset
			@db_name = db_name
			return unless db_name && File.exist?(db_name)

			# Read-only transaction: loading never modifies the file.
			PStore.new(db_name).transaction(true) do |db|
				@spam = db["spam"] || @spam
				@ham = db["ham"] || @ham
				@charset = db["charset"] || @charset
			end
		end

		# Writes the corpora and charset to a PStore file.
		#
		# db_name:: target path; defaults to the path given at construction.
		#           The first path ever supplied is remembered as db_name, and
		#           an explicit path is always the one written to.
		#
		# When a block is given it is called with the open PStore inside the
		# same transaction, so extra roots can be stored. Returns nil without
		# writing when no path is known.
		def save(db_name=nil)
			db_name ||= @db_name
			@db_name ||= db_name
			return unless db_name

			PStore.new(db_name).transaction do |db|
				db["spam"] = @spam
				db["ham"] = @ham
				db["charset"] = @charset
				yield(db) if block_given?
			end
		end

		# Shorthand for score(token).
		def [](token)
			score(token)
		end

		# Combined spam probability of tokens.
		#
		# Each distinct token is scored and tokens without a score are dropped.
		# The +take+ scores farthest from 0.5 are then combined as
		# prod(p) / (prod(p) + prod(1 - p)).
		#
		# Returns nil when no token has a score, or when the selected scores
		# contain both 1.0 and 0.0 (the formula would divide zero by zero).
		def estimate(tokens, take=15)
			scores = tokens.uniq.map { |token| score(token) }.compact
			# Sort on the absolute distance from 0.5, largest first.
			scores = scores.sort_by { |p| -(0.5 - p).abs }[0...take]
			return nil if scores.empty? || (scores.include?(1.0) && scores.include?(0.0))

			spam_prod = scores.inject(1.0) { |acc, p| acc * p }
			ham_prod = scores.inject(1.0) { |acc, p| acc * (1 - p) }
			spam_prod / (spam_prod + ham_prod)
		end
	end

	# Filter whose corpora hold per-token weights normalised by message
	# length.
	class PlainBayes < FilterBase
		# Token => accumulated weight; unseen tokens read as 0.0.
		class Corpus < Hash
			def initialize
				super(0.0)
			end

			# Adds one message: every token occurrence contributes
			# 1 / (number of tokens in src).
			def <<(src)
				s = src.size.to_f
				src.each do |i|
					self[i] += 1/s
				end
			end
		end

		# Share of the token's total weight that sits in the spam corpus,
		# or nil when the token is in neither corpus.
		def score(token)
			return nil unless @spam.include?(token) || @ham.include?(token)
			s = @spam[token]
			h = @ham[token]
			s/(s+h)
		end
	end

	# Filter whose corpora hold raw token counts plus a message count.
	class PaulGraham < FilterBase
		# Token => occurrence count; unseen tokens read as 0.
		class Corpus < Hash
			# Number of messages added with <<.
			attr_reader :count

			def initialize
				super(0)
				@count = 0
			end

			# Adds one message: bumps count and each token's occurrence count.
			def <<(src)
				@count += 1
				src.each do |i|
					self[i] += 1
				end
			end
		end

		# Spam probability of a single token, clamped to 0.01..0.99.
		# A token present in neither corpus scores 0.4. Ham occurrences are
		# counted double, and each per-corpus ratio is capped at 1.0.
		def score(token)
			return 0.4 unless @spam.include?(token) || @ham.include?(token)
			g = @ham.count==0 ? 0.0 : [1.0, 2*@ham[token]/@ham.count.to_f].min
			b = @spam.count==0 ? 0.0 : [1.0, @spam[token]/@spam.count.to_f].min
			[0.01, [0.99, b/(g+b)].min].max
		end
	end
end
 |