#!/usr/bin/ruby
#----------------------------------------------------------------------------
#Copyright (C) 2003 mita-K, NAKAUE.T (Meister), Gimite 
#
# mita-Kñ饤֥
#
#      Original works by mita-K
#      Extended by Gimite/Meister
#
#  2003.06.06                 ˥饹(Meister)
#                             Ūʸɤ˰¸ʤ褦ˡ
#                             ʸ(SJIS)˰¸
#  2003.06.22                 GimiteǤ(Meister)
#                             ννñʸե륿ѹ
#                             ߤμǤʸĴ٤뤿ᡢ
#                             虜虜ñʤȤ
#                             ƱΤ¾θ˴ޤޤƤ
#  2003.06.23                 GimiteǤεǽɲðܿ(Gimite)
#                             Ҥ餬ʸ򤸤θۤȤФǤʤ
#                             ñлζ§ɲ
#                             ñлθΥߤνɲ
#                             Ҥ餬ʤ2ʸʾϢ³̵ϸ§Ȥоݳ
#                             checkWordCand()Ϥprestrpoststr󤫤ʸѹ
#                             äݤΤθФ򶯲
#                             EUCѤ˽񤭴Τǡɬפ˱ᤷƤ
#
#----------------------------------------------------------------------------
# Select EUC-JP ('E') as the interpreter-wide multibyte encoding (Ruby 1.8
# $KCODE). The /e regexp literals and the split(//e) calls in this file depend
# on EUC-JP character boundaries.
# NOTE(review): Ruby 1.9+ ignores assignments to $KCODE (with a warning), so
# this line only has an effect on a 1.8 interpreter.
$KCODE='E'
#----------------------------------------------------------------------------
# ʸ椫ñ(餷ʸ)õФ
class WordExtractor

  # 󥹥ȥ饯
  # WordExtractor(ñꥹȤݻĹ,ñɲûΥХå)
  # Builds an extractor whose candidate history starts out empty.
  #
  # candlistlength - number of slots kept in @candList. renewCandList and
  #                  renewCandList2 drop the oldest slot and append a new one
  #                  on every call, so this is the history depth.
  # onaddword      - optional object responding to #call; extractWords invokes
  #                  it once per word it returns. nil disables the callback.
  def initialize(candlistlength=7,onaddword=nil)
    # Give every slot its own array. Array.new(n, []) would put one shared
    # array into all slots, so an in-place change made through #candList
    # would show up in every slot at once.
    @candList=Array.new(candlistlength) { [] }
    @onAddWord=onaddword
  end


  # Returns the raw candidate history: the very array held in @candList
  # (one inner array per slot), not a copy.
  def candList
    @candList
  end


  # ñΥꥹȤ֤
  # Returns every candidate currently held in @candList as one flat array,
  # with nil entries dropped and duplicates removed (first occurrence wins).
  # The result is a new array; @candList itself is not modified.
  def getCandList
    merged=@candList.flatten
    merged.compact!
    merged.uniq
  end


  # ñȤŬڤɤȽꤹ
  # äݤʤɤˤŬѤʤ
  # Ŭnil֤
  # First-stage candidate filter. Returns word unchanged when it passes every
  # check below, or nil when any check rejects it.
  # checkWordCand skips this filter for candidates whose surrounding text
  # matches one of its context rules.
  #
  # NOTE(review): every pattern carries the /e (EUC-JP) flag, and the
  # non-ASCII characters inside the literals look damaged by a charset
  # conversion (several character classes show only "-", and the last two
  # patterns are now identical). The per-line notes describe the presumed
  # intent; restore the literals from an EUC-JP copy before relying on them.
  def wordFilter1(word)
    # nil (or false) input is rejected outright.
    return nil if !word
    # Reject a candidate that is exactly one character long.
    return nil if word=~/^.$/e
    # Presumably: reject a candidate made only of characters from one range
    # (hiragana?) -- TODO confirm against an undamaged source.
    return nil if word=~/^[-]+$/e
    # Reject unless the candidate contains two consecutive characters from
    # outside that range.
    return nil if !(word=~/[^-][^-]/e)
    # Presumably: reject when a character outside the range is followed by a
    # trailing particle-like character.
    return nil if word=~/[^-][ΤȤ]$/e
    # Presumably: reject when a specific character occurs anywhere except at
    # the head (the trailing literal is missing from both patterns as shown).
    return nil if word=~/^.+/e
    return nil if word=~/^.+/e

    return word
  end


  # ñȤŬڤɤȽꤹ
  # äݤʤɤˤŬѤ
  # Ŭnil֤
  # Second-stage candidate filter. Returns word unchanged when it passes
  # every check below, or nil when any check rejects it.
  # checkWordCand applies this filter to every candidate, including those
  # that skipped wordFilter1.
  #
  # NOTE(review): all patterns carry the /e (EUC-JP) flag, and the non-ASCII
  # characters inside many literals look damaged by a charset conversion.
  # Notes marked "presumably" describe the apparent intent only; restore the
  # literals from an EUC-JP copy before relying on them.
  def wordFilter2(word)
    # nil (or false) input is rejected outright.
    return nil if !word
    # Reject the empty string and anything with leading/trailing whitespace
    # (the character classes may originally have held more than the ASCII
    # space -- TODO confirm).
    return nil if word=~/^$/e
    return nil if word=~/^[ ]/e
    return nil if word=~/[ ]$/e
    # Presumably: reject a single character from a given range.
    return nil if word=~/^[--]$/e
    # Presumably: reject a two-character candidate drawn from a given set.
    return nil if word=~/^[-󡼡][-󡼡]$/e
    # Reject candidates made only of ASCII digits and punctuation.
    return nil if word=~/^[-.\/+*:;,~_|&'"`()0-9]+$/e
    # Reject candidates containing any of the listed symbol characters
    # (presumably multibyte brackets/punctuation), or ASCII parentheses.
    return nil if word=~/[ʡˡġݡ]/e
    return nil if word=~/[֡סء١ڡ̡ۡ]/e
    return nil if word=~/[΢̧͡]/e
    return nil if word=~/[()]/e
    # Reject candidates that start with a character a word should not begin
    # with (an ASCII comma, plus sets that are presumably small kana and
    # similar), or with an in-range character followed by an out-of-range one.
    return nil if word=~/^[,]/e
    return nil if word=~/^[󤡤]/e
    return nil if word=~/^[ݥ󥡥å]/e
    return nil if word=~/^[-][^-]/e
    # Reject candidates containing an HTML character reference (&name; / &#N;).
    return nil if word=~/&[#a-zA-Z0-9]+;/e

    return word
  end


  # ñȤŬڤɤȽꤹ
  # ʸ⻲ͤˤ
  # Ŭnil֤
  def checkWordCand(word,prestr='',poststr='')
    prestr='' if prestr==nil
    poststr='' if prestr==nil
    word=word.clone

    # ʲ2ƤϤޤäϰζ§Ƚ
    if (prestr=~/[ʡˡ]$/e || prestr=='') \
     &&(poststr=~/^[Ϥ]([^-]|$)/e) \
     &&((word+poststr[0..0])!~/(Ǥ||ˤ|Τ)$/e) \
     &&(word=~/^[-]+$/ || word=~/^[^-]/) \
     &&(word.length()>=6)
      # äݤΡ{orʸƬ}[Ϥ]{Ҥ餬ʰʳ}
      # ʤ֤Ǥϡס֤ס֤ˤϡס֤ΤפȤʤΤ
      # Ruby 1.6ˤjlengthʤΤŬƨƤ
    elsif (prestr=~/[]$/e)&&(poststr=='')
      # ʸΡ֡פ֡
    else
      word=wordFilter1(word)
    end
    return wordFilter2(word)
  end
  
  
  # ʸñȤɲä٤Ƚꤹ
  # ɲä٤ñwordȤϰۤʤˤޤnilŬˤ֤
  # Final check applied by extractWords before a candidate is accepted.
  # Returns the word to register, which may be shorter than the input because
  # trailing suffixes are stripped first, or nil when it is rejected.
  #
  # NOTE(review): all patterns carry the /e (EUC-JP) flag, and the non-ASCII
  # characters inside the literals look damaged by a charset conversion
  # (several alternations now contain empty branches, and the last pattern
  # shows as a bare "$"). Restore the literals from an EUC-JP copy before
  # relying on this method.
  def checkWord(word)
    # Repeatedly strip one trailing suffix: each pattern captures the part
    # before the suffix in group 1, and the loop continues until none match.
    while word=~/^(.+)(Ȥ|ʤ|Ǥ?|Τ褦|||ä)$/e \
        || word=~/^(.+)(Ȥ||?|ޤ|ʤ|Ǥ?|Τ褦)$/e \
        || word=~/^(.+)(|?|ˤ|?||ޤ|Ǥ?)$/e \
        || word=~/^([^-]+)[Ȥˤ]$/e
      word= $1
    end
    # Re-run two length-based rejections also found in wordFilter2, since
    # stripping may have produced a string that now matches them.
    return nil if word=~/^[--]$/e || word=~/^[-󡼡][-󡼡]$/e
    # Reject words containing any of the listed substrings (presumably
    # function words / inflection fragments -- TODO confirm).
    return nil if word=~/ʤ|ä|ä|Ƥ||||Ȥ||||줿|ޤ/e
    return nil if word=~/||Τ||ˤ||ʤ|||||/e
    return nil if word=~/|Τ||||줬||||||/e
    return nil if word=~/|||||Τ||ʤ|Ǥ|ʤ|Ǥ|Ȥ/e
    return nil if word=~/褦|/e
    # Reject when an out-of-range character is followed by one of the listed
    # characters anywhere in the word.
    return nil if word=~/[^-][Ǥˤ]/e
    # Presumably: reject words ending in a specific character (the literal
    # before "$" is missing as shown).
    return nil if word=~/$/e
    return word
  end


  # ʸ󤫤ñ
  # ˥ޥХʸ(ܸʸ)Ѥ
  # 󥰥Хʸ򿩤碌ƤפʤϤ
  def extractCands(s)
    result=[]

    ss=s.split(//e)

    # ѿϢ³礹
    (ss.size-2).downto(0) {|i| ss[i]+=ss.delete_at(i+1) if (ss[i]=~/[-_0-9a-zA-Z]$/e)&&(ss[i+1]=~/[-_0-9a-zA-Z]$/e)}
    # ʤϢ³礹
    (ss.size-2).downto(0) {|i| ss[i]+=ss.delete_at(i+1) if (ss[i]=~/[ݥ-]$/e)&&(ss[i+1]=~/[ݥ-]$/e)}

    for i in 0..(ss.size-1)
      for j in i..(ss.size-1)
        cand=checkWordCand(ss[i..j].join,ss[0...i].join,ss[j+1...ss.size()].join)
        result << cand if cand!=nil
      end
    end
#    dprint("ñ", result)

    return result
  end


  # ñꥹ޴طˤΤñꥹȤŬ
  # Removes every entry of wordcand that is a substring of another entry,
  # keeping the longer one. Of two equal strings, the later one is kept.
  #
  # wordcand is modified in place (entries are set to nil, then compacted)
  # and the same array object is returned.
  def optimizeWordList(wordcand)
    last=wordcand.length-1
    0.upto(last-1) do |a|
      next unless wordcand[a]
      (a+1).upto(last) do |b|
        next unless wordcand[b]
        if wordcand[b].index(wordcand[a])
          # Entry a is contained in entry b: drop a and stop comparing it.
          wordcand[a]=nil
          break
        elsif wordcand[a].index(wordcand[b])
          # Entry b is contained in entry a: drop b, keep scanning with a.
          wordcand[b]=nil
        end
      end
    end
    wordcand.compact!

    wordcand
  end


  # ʸǻȤƤñ
  # Finds which of the remembered candidates are used in line and returns
  # them merged into words.
  #
  # line  - text to search.
  # words - words already known; the result is words | newly accepted words,
  #         so order is preserved and duplicates are dropped. The argument
  #         array itself is not modified.
  #
  # Steps: keep only candidates that occur in line, drop candidates contained
  # in a longer surviving candidate (optimizeWordList), then pass each
  # survivor through checkWord. If a callback was given to the constructor it
  # is called once for every word in the returned array, including words that
  # were passed in.
  def extractWords(line,words=[])
    present=getCandList.select {|cand| line.index(cand)}
    survivors=optimizeWordList(present)

    accepted=[]
    survivors.each do |cand|
      checked=checkWord(cand)
      accepted << checked if checked
    end

    words=words|accepted
    words.each {|w| @onAddWord.call(w)} if @onAddWord

    words
  end


  # ¿Хʸޤϥ󥰥ХʸʤʸڤФ
  # $KCODEŬڤꤵƤʤФʤʤ
  # 0,2,4ܤ󥰥Хʸʸ
  # Splits s into alternating runs of single-byte and multi-byte characters.
  #
  # Even-indexed elements of the result are single-byte runs and odd-indexed
  # elements are multi-byte runs. When s starts with a multi-byte character,
  # element 0 is therefore the empty string. An empty s yields [].
  #
  # A character counts as single-byte when its String#size is 1, so this
  # depends on size being a byte count and on split(//e) cutting at EUC-JP
  # character boundaries (Ruby 1.8 with $KCODE set).
  def splitByCharType(s)
    runs=[]
    current=''
    in_single=true

    s.split(//e).each do |ch|
      single=(ch.size==1)
      unless single==in_single
        # Character type changed: close the current run and start a new one.
        runs << current
        current=''
      end
      current+=ch
      in_single=single
    end
    runs << current if current.size>0

    runs
  end


  # ñΥꥹȤ򹹿
  # Extracts candidates from line and pushes them into the history as the
  # newest slot, discarding the oldest slot. Returns @candList.
  #
  # line is first cut into single-byte / multi-byte runs by splitByCharType.
  # Even-indexed (single-byte) runs are split on whitespace, and each token
  # that checkWordCand accepts is stored as-is. Odd-indexed (multi-byte) runs
  # are handed to extractCands.
  def renewCandList(line)
    fresh=[]
    splitByCharType(line).each_with_index do |run,pos|
      if (pos%2)!=0
        fresh.concat(extractCands(run))
      else
        run.split(' ').each do |token|
          fresh << token unless checkWordCand(token).nil?
        end
      end
    end

    @candList.shift
    @candList << fresh
  end


  # ñΥꥹȤ򹹿
  # (󥰥ХʸλʬΥԤʤС)
  # Variant of renewCandList that skips the single-byte / multi-byte split:
  # the whole line goes straight to extractCands. The oldest history slot is
  # discarded, the new candidates become the newest slot, and @candList is
  # returned.
  def renewCandList2(line)
    @candList.shift
    @candList.push(extractCands(line))
  end


  # ññꥹȹ1ʬ
  # Processes one line of input: first extracts the words of line using the
  # candidates remembered from earlier lines, then updates the candidate
  # history from this line. Returns the words found by extractWords.
  def processLine(line)
    found=extractWords(line)
    # Updated only after extraction, so candidates from this line are not
    # matched against the line they came from.
    renewCandList(line)
    found
  end


  #ǥХå
  # Debug helper: prints "<caption>: <obj.inspect>" followed by a newline,
  # after converting the text to Shift_JIS with Kconv.tosjis.
  #
  # caption - label text.
  # obj     - any object; its #inspect output is printed.
  def dprint(caption, obj)
    # Loaded lazily: nothing else in this file requires kconv, so without
    # this the helper fails with NameError on Kconv unless the caller
    # happened to load it.
    require 'kconv'
    print(Kconv.tosjis("#{caption}: #{obj.inspect}"), "\n")
  end

end
#----------------------------------------------------------------------------
=begin
# 
wordextractor=WordExtractor.new
p word=wordextractor.processLine('̾ͤǤ')
=end

