This file is indexed.

/usr/share/skktools/convert2skk/chasen2skk.rb is in skktools 1.3.2-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/ruby1.8 -Ke
## Copyright (C) 2005 MITA Yuusuke <clefs@mail.goo.ne.jp>
##
## Author: MITA Yuusuke <clefs@mail.goo.ne.jp>
## Maintainer: SKK Development Team <skk@ring.gr.jp>
## Version: $Id: chasen2skk.rb,v 1.4 2006/01/04 10:35:06 skk-cvs Exp $
## Keywords: japanese, dictionary
## Last Modified: $Date: 2006/01/04 10:35:06 $
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program, see the file COPYING.  If not, write to the
## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston,
## MA 02110-1301, USA.
##
### Commentary:
##
### Instruction:
##
## This script tries to extract SKK pairs from the output of ChaSen.
##
## % chasen | chasen2skk.rb
## or
## % mecab -Ochasen | chasen2skk.rb
##
##
## skkdictools.rb required.
##
## TODO: pick up compound-verbs, eg. 「舞い散る」
## 舞い    マイ    舞う    動詞-自立       五段・ワ行促音便        連用形
## 散る    チル    散る    動詞-自立       五段・ラ行      基本形
##
require 'jcode'
require 'kconv'
require 'skkdictools'

#require 'cgi'
#require 'socket'
#require 'timeout'

require 'optparse'
opt = OptionParser.new

# Option defaults.  Each variable below is captured by the option handlers
# and read later by the main loop.
#
# The length limits are stored doubled (VAL * 2) and compared against
# String#size in the main loop -- presumably because size counts bytes here
# and every EUC-JP character of interest occupies two bytes (TODO confirm).
katakana_words = false      # -k: also emit katakana-only words
#katakana_majiri = false
#append_goohits = false
keyword = ""                # -w: only emit words containing this string
#fetch_from_goo = false
append_notes = false        # -n: pass a grammatical note to print_pair
allow_noun_chains = true    # -N clears this
#allow_verb_chains = true
handle_prefix = true        # -P clears this
min_length = 2 * 2
max_length = 100 * 2

# -g might be a bad idea; better eliminate pairs already in SKK-JISYO.L first
#opt.on('-g', 'append goo hit numbers') { append_goohits = true }
opt.on('-k', '--extract-katakana', 'extract katakana words (if WORD not given)') { katakana_words = true }
#opt.on('-K', 'extract words containing katakana') { katakana_majiri = true }
# DecimalInteger makes OptionParser reject non-numeric values instead of
# letting String#to_i silently turn them into 0.
opt.on('-m VAL', '--min-length=VAL', OptionParser::DecimalInteger,
       'ignore words less than VAL letters') { |v| min_length = v * 2 }
opt.on('-M VAL', '--max-length=VAL', OptionParser::DecimalInteger,
       'ignore words more than VAL letters') { |v| max_length = v * 2 }
opt.on('-n', '--append-notes', 'append grammatical notes') { append_notes = true }
opt.on('-N', '--disallow-noun-chains', 'disallow noun chains containing hiragana') { allow_noun_chains = false }
opt.on('-P', '--ignore-prefixes', 'don\'t take prefixes into consideration') { handle_prefix = false }
opt.on('-w WORD', '--extract-word=WORD', 'extract pairs containing WORD') { |v| keyword = v }
#opt.on('-W WORD', 'query goo and extract pairs containing WORD') { |v| keyword = v; fetch_from_goo = true }

# Parse and strip the options from ARGV.  ParseError is the common parent of
# InvalidOption, MissingArgument, InvalidArgument etc., so every malformed
# command line ends in the same short hint rather than a backtrace.  The
# diagnostic goes to stderr so it cannot end up in redirected output.
begin
  opt.parse!(ARGV)
rescue OptionParser::ParseError => e
  $stderr.print "#{$0}: #{e.message}\n'#{$0} -h' for help.\n"
  exit 1
end

#keyword_pat = Regexp.compile("[°¡-ô¦]*#{keyword}[°¡-ô¦]*")

# State of the candidate pair currently being assembled by the main loop:
#   key / word  - accumulated reading and surface form
#   last_part   - part-of-speech field of the most recently appended token
#   count       - number of tokens appended in the run-extending branch
#   poisoned    - set when the run contains a token the loop flags as unusable
#   terminate   - set to force the current run to be flushed
# (An earlier revision also tracked last_key / last_word.)
terminate = false
poisoned = false
last_part = ""
word = ""
key = ""
count = 0

# Main loop.  Kernel#gets stores each input line in $_; the line is split on
# TAB into at most five fields: surface form (midasi), reading (yomi), base
# form (root, unused below), part of speech (part) and the remainder (conj,
# only referenced by commented-out code).
#
# Consecutive tokens that can extend a run are concatenated into `word`
# (surface) and `key` (reading converted with to_hiragana).  When a token
# that cannot extend the run arrives, the accumulated pair is filtered,
# possibly emitted through print_pair, and the state is reset.
# to_hiragana and print_pair are not defined in this file -- presumably they
# come from skkdictools (TODO confirm).
#
# NOTE(review): the non-ASCII regexp literals are shown here as mis-decoded
# EUC-JP bytes.  They presumably stand for: the kanji range 亜-熙, katakana
# ァ-ン plus ヴ, the long-vowel mark ー, the iteration mark 々, hiragana
# ぁ-ん, and the part-of-speech tags 名詞 (noun), 非自立 (non-independent),
# 接頭詞 (prefix), 接続詞 (conjunction), 副詞 (adverb, unless followed by
# 可), 接尾 (suffix) and 未知語 (unknown word).  Verify against a correctly
# decoded copy before relying on this.
while gets
  midasi, yomi, root, part, conj = $_.split("	", 5)
  #if midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼]+$/ || terminate
  # Flush branch: taken when `terminate` is set, or when the surface form is
  # not made purely of the first character class AND the noun-chain
  # exception does not apply.  The exception lets a token extend the run
  # when noun chains are allowed, `part` matches the noun tag but not the
  # non-independent tag, and the surface form consists only of characters in
  # the second (wider, hiragana-including) class.
  if (midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+$/ &&
      (!allow_noun_chains || part !~ /̾»ì/ || part =~ /Èó¼«Î©/ ||
      midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹¤¡-¤ó]+$/ )) || terminate
  #if (midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼]+$/ && conj !~ /Ï¢ÍÑ·Á/) || terminate
    #next if count < 1
    # Nothing has been counted yet, so there is no pair to emit.
    if count < 1
      next if !handle_prefix
      if part =~ /ÀÜƬ»ì/
	# kludge - keep the prefix without increasing count, so that it is
	# prepended to whatever run follows (cf. 「ご立派」「お味噌」)
	key = yomi.to_hiragana
	word = midasi
	last_part = part
      #elsif part =~ /¼«Î©/ && conj =~ /Ï¢ÍÑ·Á/
      #  hogehoge
      else
	# not a prefix: drop any prefix remembered from an earlier line
	key = word = last_part = ""
      end
      next
    end

    # A run is pending (count >= 1).  Decide whether the current token is
    # still appended to it before the pair is emitted.
    if midasi =~ /^[^°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+$/ && !terminate
      # nothing - the token contains none of the run characters
    else
      if part =~ /Àܳ»ì|ÀÜƬ»ì|Éû»ì[^²Ä]/
	# nothing - decline some parts of speech
      elsif midasi =~ /ʤÓ|µÚ¤Ó/
	# nothing - (HACK) decline conjunctions that ChaSen overlooks
      elsif midasi =~ /^[¤¡-¤ó]+[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+/
	# nothing - this applies to quasi-words such as:
	# に関する        ニカンスル      に関する        助詞-格助詞-連語
      else
	key += yomi.to_hiragana
	word += midasi
	last_part = part
	# asayaKify here?
      end
    end

    # Output filters, checked in order; the first one that matches
    # suppresses the pair.
    if word =~ /^[¤¡-¤ó¡¼]+$/
      # nothing - word consists only of the hiragana class
    elsif !katakana_words && word =~ /^[¥¡-¥ó¥ô¡¼]+$/
      # nothing - katakana-only word and -k was not given
    elsif !keyword.empty? && !word.include?(keyword)
      # nothing - -w WORD was given and the word does not contain it
    elsif poisoned || word.size < min_length || word.size > max_length
      # nothing - run was flagged as poisoned, or size is outside the limits
    else
      # chomp strips a trailing newline in case `part` was the last field
      # on its input line.
      print_pair(key, word, nil, append_notes ? "<autogen>,#{last_part.chomp}" : nil)
    end

    # Reset all state for the next run.
    key = word = last_part = ""
    poisoned = terminate = false
    count = 0

  else
    # Run-extending branch.
    if count > 0 && part =~ /Àܳ»ì|ÀÜƬ»ì|Éû»ì[^²Ä]/
      # This part of speech must not continue an existing run.  `redo`
      # restarts the loop body without calling gets again, so the same $_ is
      # re-split with terminate set, which forces the flush branch above.
      terminate = true
      redo
    elsif count == 0 && part =~ /ÀÜÈø/
      # avoid generating 「回大会」 from 「第３回大会」
      # 回      カイ    回      名詞-接尾-助数詞
      key = word = last_part = ""
      next
    end
    count += 1
    key += yomi.to_hiragana
    word += midasi
    last_part = part
    # A run containing a token with this tag is never emitted (see the
    # `poisoned` check in the output filters).
    poisoned = true if part =~ /̤Ãθì/
  end
end