/*!
  \file
  \brief ルビ位置のパース

  \author Satofumi KAMIMURA

  $Id: rubi_parse.cpp 1825 2010-05-15 10:50:09Z satofumi $

  \todo Make sure a segmentation fault cannot occur even when malformed strings are passed in
*/

#include "rubi_parse.h"
#include "Utf8.h"

#include <cstdio>

using namespace qrk;
using namespace std;


namespace
{
    // Forward declaration of the recursive matcher defined further down.
    // It is needed because register_rubi() calls parse() and parse() in turn
    // calls register_rubi(), so one of the two has to be declared first.
    extern bool parse(vector<rubi_t>& rubi_positions,
                      const char* kanji_text, size_t kanji_offset,
                      const char* kana_text, size_t kana_offset);


    // Tells whether a character code is treated as kana by the parser.
    //
    // `code` is compared against the inclusive range 0xe38181 .. 0xe383b6,
    // i.e. the UTF-8 byte sequences of U+3041 (hiragana small A) and
    // U+30F6 (katakana small KE) packed into a single integer.
    // Returns true for codes inside the range and false for everything else.
    bool isKana(unsigned long code)
    {
        const unsigned long first_kana = 0xe38181;
        const unsigned long last_kana = 0xe383b6;

        if (code < first_kana) {
            return false;
        }
        return code <= last_kana;
    }


    // Packs the four position values into a rubi_t.
    //
    // kanji_first / kanji_size : start and length of the annotated text span
    // rubi_first  / rubi_size  : start and length of the matching reading span
    // Returns the populated rubi_t by value; only these four fields are set.
    rubi_t create_rubi(size_t kanji_first, size_t kanji_size,
                       size_t rubi_first, size_t rubi_size)
    {
        rubi_t entry;

        // Reading span.
        entry.rubi_first = rubi_first;
        entry.rubi_size = rubi_size;

        // Annotated text span.
        entry.kanji_first = kanji_first;
        entry.kanji_size = kanji_size;

        return entry;
    }


    // Records one ruby candidate and then parses whatever follows it.
    //
    // The candidate maps kanji[same_size, kana_index) onto
    // kana[same_size, same_size + rubi_size). The entry is appended to
    // rubi_positions with both starts shifted by kanji_offset / kana_offset.
    // The character right after each span is skipped on both sides, and the
    // remaining tails are handed to parse() with the offsets advanced
    // accordingly.
    //
    // Returns the result of that recursive parse(). The entry appended here
    // is NOT removed on failure; the caller is expected to do that.
    bool register_rubi(vector<rubi_t>& rubi_positions,
                       Utf8& kanji, Utf8& kana,
                       size_t kanji_offset, size_t kana_offset,
                       size_t kana_index,
                       size_t same_size, size_t rubi_size)
    {
        const rubi_t candidate = create_rubi(kanji_offset + same_size,
                                             kana_index - same_size,
                                             kana_offset + same_size,
                                             rubi_size);
        rubi_positions.push_back(candidate);

        // Number of leading characters consumed on each side, including the
        // one character that follows the span.
        const size_t kanji_consumed = kana_index + 1;
        const size_t kana_consumed = same_size + rubi_size + 1;

        const std::string kanji_rest =
            kanji.substr(kanji_consumed,
                         kanji.size() - kanji_consumed).toStdString();
        const std::string kana_rest =
            kana.substr(kana_consumed,
                        kana.size() - kana_consumed).toStdString();

        const size_t rest_kanji_offset = kanji_offset + kanji_consumed;
        const size_t rest_kana_offset = kana_offset + kana_consumed;
        return parse(rubi_positions,
                     kanji_rest.c_str(), rest_kanji_offset,
                     kana_rest.c_str(), rest_kana_offset);
    }


    // Matches a kanji-mixed text against its kana-only reading and appends
    // one rubi_t per run of non-kana characters to rubi_positions.
    //
    // rubi_positions : output list; entries are appended, and any entry added
    //                  for a candidate split that later fails is removed again
    // kanji_text     : text that may contain kanji (the annotated text)
    // kanji_offset   : value added to every kanji position stored in an entry
    // kana_text      : reading of kanji_text
    // kana_offset    : value added to every reading position stored in an entry
    //
    // Positions and sizes are indices as counted by Utf8 (presumably
    // characters rather than bytes -- confirm against Utf8).
    //
    // Returns true when the whole text and the whole reading could be aligned,
    // false otherwise. On a false return the function leaves rubi_positions
    // with the same number of entries it had on entry.
    //
    // NOTE: characters that are neither kana nor kanji (e.g. 1-byte
    // characters) get no special treatment; isKana() is false for them, so
    // they are handled like kanji.
    bool parse(vector<rubi_t>& rubi_positions,
               const char* kanji_text, size_t kanji_offset,
               const char* kana_text, size_t kana_offset)
    {
        Utf8 kanji(kanji_text);
        Utf8 kana(kana_text);

        // A reading can never be shorter than the text it belongs to.
        if (kana.size() < kanji.size()) {
            return false;
        }

        // Strip the leading characters that are identical in both strings.
        // The guard above guarantees kanji.size() <= kana.size(), so bounding
        // the scan by kanji.size() keeps both ch() calls in range.
        size_t same_size = 0;
        while ((same_size < kanji.size()) &&
               (kanji.ch(same_size) == kana.ch(same_size))) {
            ++same_size;
        }

        if (same_size == kanji.size()) {
            // The text is used up. This is a match only when the reading is
            // used up as well; accepting leftover reading characters here
            // would make the first candidate split always succeed and the
            // backtracking loop below could never try a longer reading.
            return same_size == kana.size();
        }

        // The first differing character is kana: the two strings disagree.
        if (isKana(kanji.ch(same_size))) {
            return false;
        }

        // Find the first kana that follows the run of non-kana characters.
        size_t kana_index = same_size + 1;
        while ((kana_index < kanji.size()) && !isKana(kanji.ch(kana_index))) {
            ++kana_index;
        }

        if (kana_index == kanji.size()) {
            // No kana follows: everything left in the reading belongs to
            // everything left in the text.
            rubi_positions.
                push_back(create_rubi(kanji_offset + same_size,
                                      kanji.size() - same_size,
                                      kana_offset + same_size,
                                      kana.size() - same_size));
            return true;
        }

        const unsigned long found_kana = kanji.ch(kana_index);

        // Try every occurrence of that kana in the reading as the end of the
        // ruby, shortest reading first. The ruby is at least one character
        // long, hence the scan starts at same_size + 1.
        for (size_t i = same_size + 1; i < kana.size(); ++i) {
            if (kana.ch(i) != found_kana) {
                continue;
            }

            const size_t rubi_size = i - same_size;
            if (register_rubi(rubi_positions,
                              kanji, kana,
                              kanji_offset, kana_offset,
                              kana_index,
                              same_size, rubi_size)) {
                return true;
            }

            // The remainder did not match with this split; drop the entry
            // register_rubi() appended and try a longer reading.
            rubi_positions.pop_back();
        }

        return false;
    }
}


/*!
  \brief Computes ruby positions for a text and its kana-only reading

  Runs the file-local parse() on both strings, starting at offset 0 for each.

  \param[in,out] rubi_positions List that receives the detected ruby entries
  \param[in] text Text to annotate
  \param[in] kana_only Reading of \a text

  \return Result of parse() for the two strings
*/
bool qrk::rubi_parse(std::vector<rubi_t>& rubi_positions,
                     const char* text, const char* kana_only)
{
    // Both strings are parsed from their very first character.
    const size_t start_offset = 0;

    const bool matched = parse(rubi_positions,
                               text, start_offset,
                               kana_only, start_offset);
    return matched;
}
