/usr/share/skktools/convert2skk/chasen2skk.rb

#!/usr/bin/ruby -Ke
# -*- coding: euc-jp -*-
## Copyright (C) 2005 MITA Yuusuke <clefs@mail.goo.ne.jp>
##
## Author: MITA Yuusuke <clefs@mail.goo.ne.jp>
## Maintainer: SKK Development Team <skk@ring.gr.jp>
## Version: $Id: chasen2skk.rb,v 1.5 2013/05/26 09:47:48 skk-cvs Exp $
## Keywords: japanese, dictionary
## Last Modified: $Date: 2013/05/26 09:47:48 $
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program, see the file COPYING.  If not, write to the
## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston,
## MA 02110-1301, USA.
##
### Commentary:
##
### Instruction:
##
## This script tries to extract SKK pairs from the output of ChaSen.
##
## % chasen | chasen2skk.rb
## or
## % mecab -Ochasen | chasen2skk.rb
##
##
## skkdictools.rb required.
##
## TODO: pick up compound-verbs, eg. ¡ÖÉñ¤¤»¶¤ë¡×
## Éñ¤¤    ¥Þ¥¤    Éñ¤¦    Æ°»ì-¼«Î©       ¸ÞÃÊ¡¦¥ï¹ÔÂ¥²»ÊØ        Ï¢ÍÑ·Á
## »¶¤ë    ¥Á¥ë    »¶¤ë    Æ°»ì-¼«Î©       ¸ÞÃÊ¡¦¥é¹Ô      ´ðËÜ·Á
##
require 'jcode' if RUBY_VERSION.to_f < 1.9
require 'kconv'
require 'skkdictools'

#require 'cgi'
#require 'socket'
#require 'timeout'

require 'optparse'
opt = OptionParser.new

# Option defaults.  Entries that are commented out belong to options that
# are currently disabled (see the commented-out opt.on calls below); they
# are kept for reference only.
katakana_words = false
#katakana_majiri = false
#append_goohits = false
keyword = ""
#fetch_from_goo = false
append_notes = false
allow_noun_chains = true
#allow_verb_chains = true
handle_prefix = true
# The length limits are stored doubled (letters * 2); -m / -M apply the same
# doubling to the user-supplied value.
# NOTE(review): presumably because one EUC-JP letter occupies two bytes --
# confirm against the length comparison in the main loop.
min_length = 2 * 2
max_length = 100 * 2

# -g might be a bad idea; better eliminate pairs already in SKK-JISYO.L first
#opt.on('-g', 'append goo hit numbers') { append_goohits = true }
opt.on('-k', '--extract-katakana', 'extract katakana words (if WORD not given)') { katakana_words = true }
#opt.on('-K', 'extract words containing katakana') { katakana_majiri = true }
opt.on('-m VAL', '--min-length=VAL', 'ignore words less than VAL letters') { |v| min_length = v.to_i * 2 }
opt.on('-M VAL', '--max-length=VAL', 'ignore words more than VAL letters') { |v| max_length = v.to_i * 2 }
opt.on('-n', '--append-notes', 'append grammatical notes') { append_notes = true }
opt.on('-N', '--disallow-noun-chains', 'disallow noun chains containing hiragana') { allow_noun_chains = false }
opt.on('-P', '--ignore-prefixes', 'don\'t take prefixes into consideration') { handle_prefix = false }
opt.on('-w WORD', '--extract-word=WORD', 'extract pairs containing WORD') { |v| keyword = v }
#opt.on('-W WORD', 'query goo and extract pairs containing WORD') { |v| keyword = v; fetch_from_goo = true }

# Consume the recognised options from ARGV; whatever remains is left for
# Kernel#gets to treat as input file names.
begin
  opt.parse!(ARGV)
rescue OptionParser::ParseError
  # ParseError is the common superclass of InvalidOption, MissingArgument,
  # InvalidArgument and AmbiguousOption.  Rescuing it (rather than only
  # InvalidOption) makes e.g. a bare `-m` print the hint below instead of
  # aborting with an uncaught-exception backtrace.
  print "'#{$0} -h' for help.\n"
  exit 1
end

#keyword_pat = Regexp.compile("[°¡-ô¦]*#{keyword}[°¡-ô¦]*")

# --- Extraction state ------------------------------------------------------
# count     : number of morphemes accumulated into the current candidate
#             (a leading prefix kept by the kludge below is NOT counted).
# key       : hiragana reading accumulated so far.
# word      : surface form accumulated so far.
# last_part : part-of-speech field of the most recently appended morpheme;
#             emitted as a note when --append-notes is given.
# poisoned  : true once the candidate contains a morpheme whose part of
#             speech marks it as an unknown word; such candidates are
#             never printed.
# terminate : true while a line is being re-processed (via `redo`) in order
#             to close the current candidate.
#
# NOTE: `key = word = last_part = ""` binds all three names to the SAME
# String object.  That is safe only because they are extended with `+=`
# (which builds a new String); do not switch to in-place `<<`.
count = 0
#key = word = last_key = last_word = last_part = ""
key = word = last_part = ""
poisoned = terminate = false

# Each input line is split on TAB into at most five fields:
#   surface, reading, base form, part of speech, rest (conjugation info).
while gets
  midasi, yomi, root, part, conj = $_.split("\t", 5)
  #if midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼]+$/ || terminate
  # This line CLOSES the current candidate when either
  #  * its surface is not written purely in kanji/katakana (incl. the long
  #    vowel mark and the kanji iteration mark) and it does not qualify as a
  #    noun-chain member (noun chains enabled, part is a noun, not a
  #    dependent one, surface made of kanji/katakana/hiragana only), or
  #  * `terminate` was raised for it by the accumulating branch below.
  if (midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+$/ &&
      (!allow_noun_chains || part !~ /Ì¾»ì/ || part =~ /Èó¼«Î©/ ||
      midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹¤¡-¤ó]+$/ )) || terminate
  #if (midasi !~ /^[°¡-ô¦¥¡-¥ó¥ô¡¼]+$/ && conj !~ /Ï¢ÍÑ·Á/) || terminate
    #next if count < 1
    if count < 1
      # Nothing has been accumulated yet, so there is nothing to flush.
      next if !handle_prefix
      if part =~ /ÀÜÆ¬»ì/
        # kludge - keep prefix w/o increasing count (cf.¡Ö¤´Î©ÇÉ¡×¡Ö¤ªÌ£Á¹¡×)
        key = yomi.to_hiragana
        word = midasi
        last_part = part
      #elsif part =~ /¼«Î©/ && conj =~ /Ï¢ÍÑ·Á/
      #  hogehoge
      else
        # Any other line discards a previously kept prefix.
        key = word = last_part = ""
      end
      next
    end

    # Decide whether the closing morpheme itself is appended to the candidate.
    if midasi =~ /^[^°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+$/ && !terminate
      # nothing - surface contains no kanji/katakana at all
    else
      if part =~ /ÀÜÂ³»ì|ÀÜÆ¬»ì|Éû»ì[^²Ä]/
        # nothing - decline some parts (conjunction, prefix, adverb)
      elsif midasi =~ /ÊÂ¤Ó|µÚ¤Ó/
        # nothing - (HACK) decline conjunctions that ChaSen overlooks
      elsif midasi =~ /^[¤¡-¤ó]+[°¡-ô¦¥¡-¥ó¥ô¡¼¡¹]+/
        # nothing - this applies to quasi-words such as:
        # ¤Ë´Ø¤¹¤ë        ¥Ë¥«¥ó¥¹¥ë      ¤Ë´Ø¤¹¤ë        ½õ»ì-³Ê½õ»ì-Ï¢¸ì
      else
        key += yomi.to_hiragana
        word += midasi
        last_part = part
        # asayaKify here?
      end
    end

    # Emit the finished candidate unless one of the filters rejects it.
    if word =~ /^[¤¡-¤ó¡¼]+$/
      # nothing - hiragana-only word
    elsif !katakana_words && word =~ /^[¥¡-¥ó¥ô¡¼]+$/
      # nothing - katakana-only word and -k was not given
    elsif !keyword.empty? && !word.include?(keyword)
      # nothing - -w WORD given but not contained in this word
    elsif poisoned || word.bytesize < min_length || word.bytesize > max_length
      # nothing - contains an unknown word, or is too short / too long.
      # min_length / max_length hold "letters * 2", i.e. a byte count for
      # 2-byte EUC-JP characters.  String#size counts characters on
      # Ruby 1.9+, which would silently double both limits, so the byte
      # length is compared explicitly (identical to #size on Ruby 1.8.7).
    else
      print_pair(key, word, nil, append_notes ? "<autogen>,#{last_part.chomp}" : nil)
    end

    # Start over with an empty candidate.
    key = word = last_part = ""
    poisoned = terminate = false
    count = 0

  else
    # This line EXTENDS (or starts) the current candidate.
    if count > 0 && part =~ /ÀÜÂ³»ì|ÀÜÆ¬»ì|Éû»ì[^²Ä]/
      # A conjunction/prefix/adverb in the middle of a chain ends it:
      # re-run this same line through the closing branch above.
      terminate = true
      redo
    elsif count == 0 && part =~ /ÀÜÈø/
      # avoid generating ¡Ö²óÂç²ñ¡× from ¡ÖÂè£³²óÂç²ñ¡×
      # ²ó      ¥«¥¤    ²ó      Ì¾»ì-ÀÜÈø-½õ¿ô»ì
      # (i.e. never start a candidate with a suffix)
      key = word = last_part = ""
      next
    end
    count += 1
    key += yomi.to_hiragana
    word += midasi
    last_part = part
    # An unknown word taints the whole candidate.
    poisoned = true if part =~ /Ì¤ÃÎ¸ì/
  end
end
skktools 1.3.3+0.20150901-1 / usr / share / skktools / convert2skk / chasen2skk.rb