# encoding: utf-8
#Copyright (C) 2012 J.r0ck <j69@ar156.dip.jp>

#
# 人工無能 TwitAngela4
# 文章類似度比較用Trigram
#

module TwAngela

  # Character n-gram (trigram by default) based text similarity scorer.
  #
  # The source text is split into overlapping n-grams of characters, and each
  # n-gram is counted in the text it is compared against.
  class Trigram

    # Value returned by #analyze when a score cannot be computed (a text is
    # nil, or is shorter than one n-gram once normalised).
    NOT_COMPARABLE = 10

    # Characters dropped before comparison: ASCII space, full-width
    # (ideographic, U+3000) space, newline and tab.
    WHITESPACE = /[ \u3000\n\t]+/

    # @param src [String, nil] the reference text
    # @param part_len [Integer] number of characters per n-gram
    def initialize(src, part_len = 3)
      @src = src
      @part_len = part_len
    end

    # Scores how strongly +other+ resembles the reference text.
    #
    # Both texts are normalised the same way (lower-cased, whitespace
    # removed) so that the comparison is symmetric with respect to case and
    # spacing. For every n-gram of the reference text the number of
    # non-overlapping occurrences in +other+ is counted; the total is divided
    # by the number of n-grams. The result can exceed 1.0 when n-grams occur
    # repeatedly in +other+.
    #
    # @param other [String, nil] the text to compare against
    # @return [Float, Integer] the score, or NOT_COMPARABLE (10) when either
    #   text is nil or too short to contain a single n-gram
    def analyze(other)
      return NOT_COMPARABLE if @src.nil? || other.nil?

      grams = to_trigram(@src, @part_len)
      haystack = normalize(other)
      # Checking after normalisation avoids dividing by zero when the source
      # consists mostly of whitespace.
      return NOT_COMPARABLE if grams.empty? || haystack.length < @part_len

      hits = grams.inject(0.0) { |total, gram| total + haystack.scan(gram.join).size }
      hits / grams.length
    end

    # Splits +s+ into overlapping character n-grams after normalisation.
    #
    # @param s [String] the text to split
    # @param part_len [Integer] number of characters per n-gram
    # @return [Array<Array<String>>] one array of single-character strings
    #   per n-gram; empty when the normalised text is shorter than +part_len+
    def to_trigram(s, part_len = 3)
      normalize(s).chars.each_cons(part_len).to_a
    end

    private

    # Lower-cases +text+ and removes the characters matched by WHITESPACE.
    def normalize(text)
      text.downcase.gsub(WHITESPACE, '')
    end

  end
end
