﻿using System;
using System.Collections.Generic;
using System.Text;
using ChaKi.Entity.Corpora;
using System.IO;
using System.Xml;
using Iesi.Collections.Generic;
using ChaKi.Entity.Corpora.Annotations;

namespace ChaKi.Service.Readers
{
    public abstract class CabochaReader
    {
        /// <summary>
        /// Corpus that receives everything this reader produces. It is assigned in the
        /// constructor; the read methods add sentences, segments and links to it.
        /// </summary>
        protected Corpus m_Corpus;

        /// <summary>
        /// Tag set built in the constructor. It holds the "Bunsetsu" segment tag and the
        /// "A", "O" and "D" link tags that the read methods look up.
        /// </summary>
        protected TagSet m_TagSet;

        /// <summary>
        /// Converts one morpheme line of the input into a <c>Lexeme</c>.
        /// The read methods call this for every non-blank line that starts with neither
        /// "*" nor "EOS".
        /// </summary>
        /// <param name="s">The raw input line.</param>
        /// <param name="lb">The lexicon builder created for the current read operation.</param>
        /// <returns>
        /// The lexeme for the line. When null is returned the read methods skip the line;
        /// when an exception is thrown they report it on the console and skip the line.
        /// </returns>
        public abstract Lexeme AddLexeme(string s, LexiconBuilder lb);

        /// <summary>
        /// Creates a reader that stores its results in <paramref name="corpus"/> and
        /// prepares the tag set used for CaboCha data.
        /// </summary>
        /// <param name="corpus">Corpus that the read methods populate.</param>
        public CabochaReader(Corpus corpus)
        {
            m_Corpus = corpus;

            // Define up front the tag set that CaboCha data requires:
            // one segment tag for bunsetsu and one link tag per dependency type.
            TagSet cabochaTags = new TagSet("CabochaTagSet");
            cabochaTags.Version = new TagSetVersion("1", 0);
            cabochaTags.AddTag(new Tag(Tag.SEGMENT, "Bunsetsu"));
            foreach (string linkName in new string[] { "A", "O", "D" })
            {
                cabochaTags.AddTag(new Tag(Tag.LINK, linkName));
            }
            m_TagSet = cabochaTags;
        }

        /// <summary>
        /// Reads CaboCha-format data from a file into the corpus. Bunsetsu information
        /// is stored on each sentence through <c>Sentence.AddBunsetsu</c>.
        /// </summary>
        /// <remarks>
        /// Lines are classified by prefix: a line starting with "*" opens a bunsetsu,
        /// a line starting with "EOS" ends the current sentence, and any other
        /// non-blank line is handed to <see cref="AddLexeme"/>. Lines that fail to
        /// parse are reported on the console and skipped. If the input ends without a
        /// final "EOS", the sentence collected so far is still added to the corpus so
        /// that the document text and the stored sentences stay in step.
        /// </remarks>
        /// <param name="path">Path of the file to read.</param>
        /// <param name="encoding">Encoding name passed to <c>Encoding.GetEncoding</c>.</param>
        /// <returns>
        /// A new document whose <c>Text</c> is the concatenation of the surface forms
        /// of all words read.
        /// </returns>
        public Document ReadFromFile(string path, string encoding)
        {
            Document newdoc = new Document();

            // Current character position within the document text.
            int charPos = 0;

            // NOTE(review): unlike ReadFromFileSLA, this builder is never copied to
            // m_Corpus.Lex here - confirm that this is intentional.
            LexiconBuilder lb = new LexiconBuilder();

            using (TextReader streamReader = new StreamReader(path, Encoding.GetEncoding(encoding)))
            {
                int n = 0;
                string s;
                Sentence sen = new Sentence(newdoc);
                Bunsetsu currentBunsetsu = null;            // bunsetsu read most recently
                StringBuilder sb = new StringBuilder();     // plain text of the whole document
                bool pending = false;                       // true while 'sen' holds content not yet stored

                while ((s = streamReader.ReadLine()) != null)
                {
                    if (s.StartsWith("*", StringComparison.Ordinal))
                    {
                        // Start of a bunsetsu.
                        try
                        {
                            currentBunsetsu = sen.AddBunsetsu(s);
                            pending = true;
                        }
                        catch (Exception)
                        {
                            Console.WriteLine(string.Format("Bunsetsu parse error: {0}", s));
                        }
                    }
                    else if (s.StartsWith("EOS", StringComparison.Ordinal))
                    {
                        // End of the sentence.
                        // NOTE(review): this is a prefix test, so a morpheme line whose
                        // surface begins with "EOS" also ends the sentence - confirm.
                        this.CloseSentence(sen, charPos);
                        if (++n % 1000 == 0)
                        {
                            Console.Write("> {0}\r", n);
                        }

                        // Prepare a fresh sentence for the words that follow.
                        sen = new Sentence(newdoc);
                        sen.StartChar = charPos;
                        currentBunsetsu = null;
                        pending = false;
                    }
                    else if (s.Trim().Length > 0)
                    {
                        Lexeme m = null;
                        try
                        {
                            m = this.AddLexeme(s, lb);
                        }
                        catch (Exception)
                        {
                            Console.WriteLine(string.Format("Lexeme parse error: {0}", s));
                        }
                        if (m != null)
                        {
                            Word w = sen.AddWord(m);
                            w.StartChar = charPos;
                            w.EndChar = charPos + w.CharLength;
                            w.Bunsetsu = currentBunsetsu;   // null when the input is ChaSen output
                            // Japanese: the plain text is reproduced without delimiters.
                            sb.Append(m.Surface);
                            //TODO: English: words must be separated by a delimiter to reproduce the plain text.

                            charPos += m.CharLength;
                            pending = true;
                        }
                    }
                }

                // The input ended without a terminating EOS line: store what was
                // collected instead of silently dropping it.
                if (pending)
                {
                    this.CloseSentence(sen, charPos);
                    n++;
                }

                newdoc.Text = sb.ToString();
                Console.Write("> {0} Sentences Found.\r", n);
            }
            return newdoc;
        }

        /// <summary>
        /// Finishes a sentence for <see cref="ReadFromFile"/>: runs the sentence's
        /// bunsetsu check, records its end position and adds it to the corpus.
        /// </summary>
        /// <param name="sen">The sentence being completed.</param>
        /// <param name="endChar">Character position at which the sentence ends.</param>
        private void CloseSentence(Sentence sen, int endChar)
        {
            // Adds a default bunsetsu and makes the dependency structure consistent.
            sen.CheckBunsetsus();
            sen.EndChar = endChar;
            m_Corpus.AddSentence(sen);
        }

        /// <summary>
        /// Reads CaboCha-format data from a file into the corpus. Bunsetsu information
        /// is stored as Segment and Link objects.
        /// </summary>
        /// <remarks>
        /// Lines are classified by prefix: a line starting with "*" opens a bunsetsu,
        /// a line starting with "EOS" ends the current sentence, and any other
        /// non-blank line is handed to <see cref="AddLexeme"/>. Lines that fail to
        /// parse are reported on the console and skipped. After the file is read,
        /// every collected bunsetsu becomes a Segment, every resolvable dependency
        /// becomes a Link, and the working lexicon is copied into the corpus lexicon.
        /// If the input ends without a final "EOS", the sentence collected so far is
        /// still added to the corpus; otherwise its segments would reference a
        /// sentence that was never stored.
        /// </remarks>
        /// <param name="path">Path of the file to read.</param>
        /// <param name="encoding">Encoding name passed to <c>Encoding.GetEncoding</c>.</param>
        /// <returns>
        /// A new document whose <c>Text</c> is the concatenation of the surface forms
        /// of all words read.
        /// </returns>
        public Document ReadFromFileSLA(string path, string encoding)
        {
            Document newdoc = new Document();

            // Current character position within the document text.
            int charPos = 0;

            // Working lexicon, copied into the corpus lexicon at the end.
            LexiconBuilder lb = new LexiconBuilder();

            // Temporary list of every bunsetsu read from the file.
            CabochaBunsetsuList bunsetsuList = new CabochaBunsetsuList();

            // Restart document-tag numbering for this read.
            DocumentTag.UniqueID = 0;

            int n = 0;
            using (TextReader streamReader = new StreamReader(path, Encoding.GetEncoding(encoding)))
            {
                string s;
                Sentence sen = new Sentence(newdoc);
                CabochaBunsetsu currentBunsetsu = null;     // bunsetsu read most recently
                CabochaBunsetsu terminalBunsetsu = null;    // bunsetsu of the current sentence whose head is -1
                StringBuilder sb = new StringBuilder();     // plain text of the whole document
                bool pending = false;                       // true while 'sen' holds content not yet stored

                while ((s = streamReader.ReadLine()) != null)
                {
                    if (s.StartsWith("*", StringComparison.Ordinal))
                    {
                        // Start of a bunsetsu.
                        try
                        {
                            CabochaBunsetsu buns = this.ParseBunsetsu(sen, charPos, s);
                            currentBunsetsu = buns;
                            if (buns.DependsTo == -1)
                            {
                                terminalBunsetsu = buns;
                            }
                            bunsetsuList.Add(buns);
                            pending = true;
                        }
                        catch (Exception)
                        {
                            Console.WriteLine(string.Format("Bunsetsu parse error: {0}", s));
                        }
                    }
                    else if (s.StartsWith("EOS", StringComparison.Ordinal))
                    {
                        // End of the sentence.
                        // NOTE(review): this is a prefix test, so a morpheme line whose
                        // surface begins with "EOS" also ends the sentence - confirm.
                        this.CloseSentenceSLA(sen, charPos, currentBunsetsu, terminalBunsetsu, bunsetsuList);
                        if (++n % 1000 == 0)
                        {
                            Console.Write("> {0}\r", n);
                        }

                        // Prepare a fresh sentence for the words that follow.
                        sen = new Sentence(newdoc);
                        sen.StartChar = charPos;
                        currentBunsetsu = null;
                        terminalBunsetsu = null;
                        pending = false;
                    }
                    else if (s.Trim().Length > 0)
                    {
                        Lexeme m = null;
                        try
                        {
                            m = this.AddLexeme(s, lb);
                        }
                        catch (Exception)
                        {
                            Console.WriteLine(string.Format("Lexeme parse error: {0}", s));
                        }
                        if (m != null)
                        {
                            Word w = sen.AddWord(m);
                            w.StartChar = charPos;
                            w.EndChar = charPos + w.CharLength;
                            if (currentBunsetsu != null)   // null when the input is ChaSen output
                            {
                                currentBunsetsu.AddWord(w);
                            }
                            // Japanese: the plain text is reproduced without delimiters.
                            sb.Append(m.Surface);
                            //TODO: English: words must be separated by a delimiter to reproduce the plain text.

                            charPos += m.CharLength;
                            pending = true;
                        }
                    }
                }

                // The input ended without a terminating EOS line: store what was
                // collected so the segments created below refer to a stored sentence.
                if (pending)
                {
                    this.CloseSentenceSLA(sen, charPos, currentBunsetsu, terminalBunsetsu, bunsetsuList);
                    n++;
                }

                newdoc.Text = sb.ToString();
                Console.Write("> {0} Sentences Found.\r", n);
            }

            // Register every bunsetsu in the corpus as a Segment.
            Console.WriteLine("\nChecking Segments (Count={0})", bunsetsuList.Count);
            Tag bunsetsuTag = m_TagSet.FindTag(Tag.SEGMENT, "Bunsetsu");
            n = 0;
            foreach (CabochaBunsetsu buns in bunsetsuList.Values)
            {
                if (++n % 100 == 0)
                {
                    Console.Write("> {0}\r", n);
                }
                Segment seg = new Segment();
                seg.StartChar = buns.StartPos;
                seg.EndChar = buns.EndPos;
                seg.Tag = bunsetsuTag;
                seg.Sentence = buns.Sen;
                m_Corpus.AddSegment(seg);
                buns.Seg = seg;
            }
            Console.WriteLine("> {0}", bunsetsuList.Count);

            // Register every dependency whose head can be found as a Link.
            Console.WriteLine("Checking Links (Count={0})", bunsetsuList.Count);
            n = 0;
            foreach (CabochaBunsetsu buns in bunsetsuList.Values)
            {
                if (++n % 100 == 0)
                {
                    Console.Write("> {0}\r", n);
                }
                CabochaBunsetsu depBunsetsu = bunsetsuList.Find(buns.Sen, buns.DependsTo);
                if (depBunsetsu != null)
                {
                    Link link = new Link();
                    link.From = buns.Seg;
                    link.To = depBunsetsu.Seg;
                    link.FromSentence = buns.Sen;
                    link.ToSentence = buns.Sen;
                    // NOTE(review): only "A", "O" and "D" are defined in m_TagSet;
                    // verify what FindTag yields for any other dependency type.
                    link.Tag = m_TagSet.FindTag(Tag.LINK, buns.DependsAs);
                    m_Corpus.AddLink(link);
                }
            }
            Console.WriteLine("> {0}", bunsetsuList.Count);

            // Copy the working lexicon into the corpus lexicon.
            lb.CopyToCorpusLexicon(m_Corpus.Lex);

            return newdoc;
        }

        /// <summary>
        /// Finishes a sentence for <see cref="ReadFromFileSLA"/>: makes sure it has at
        /// least one bunsetsu, appends the terminal dummy bunsetsu, attaches the root
        /// bunsetsu to that dummy, and adds the sentence to the corpus.
        /// </summary>
        /// <param name="sen">The sentence being completed.</param>
        /// <param name="charPos">Character position at which the sentence ends.</param>
        /// <param name="currentBunsetsu">Last bunsetsu read in the sentence, or null if there was none.</param>
        /// <param name="terminalBunsetsu">Bunsetsu of the sentence whose head is -1, or null.</param>
        /// <param name="bunsetsuList">List that receives the bunsetsu created here.</param>
        private void CloseSentenceSLA(Sentence sen, int charPos, CabochaBunsetsu currentBunsetsu,
            CabochaBunsetsu terminalBunsetsu, CabochaBunsetsuList bunsetsuList)
        {
            if (currentBunsetsu == null)
            {
                // Add a default bunsetsu spanning the sentence (for ChaSen/MeCab input,
                // which carries no bunsetsu lines).
                CabochaBunsetsu buns = new CabochaBunsetsu(sen, sen.StartChar, 0, String.Empty, -1);
                buns.EndPos = charPos;
                bunsetsuList.Add(buns);
                currentBunsetsu = buns;
                terminalBunsetsu = buns;
            }

            // Append the terminal dummy bunsetsu.
            CabochaBunsetsu dummy = new CabochaBunsetsu(sen, charPos, currentBunsetsu.BunsetsuPos + 1, String.Empty, -1);
            bunsetsuList.Add(dummy);

            // Make the bunsetsu whose head was -1 depend on the dummy.
            if (terminalBunsetsu != null)
            {
                terminalBunsetsu.DependsTo = dummy.BunsetsuPos;
                terminalBunsetsu.DependsAs = "D";
            }

            sen.EndChar = charPos;
            m_Corpus.AddSentence(sen);
        }

        /// <summary>
        /// Parses a bunsetsu header line of the form "* 0 -1D 0/0 0.00000000".
        /// </summary>
        /// <remarks>
        /// The line is split on single spaces. Field 1 is the bunsetsu index. Field 2
        /// is the head index immediately followed by the dependency type (e.g. "-1D").
        /// Numbers are parsed with the invariant culture so that "-1" is accepted
        /// regardless of the current culture's negative sign.
        /// </remarks>
        /// <param name="sen">Sentence the bunsetsu belongs to.</param>
        /// <param name="charPos">Character position at which the bunsetsu starts.</param>
        /// <param name="s">The header line.</param>
        /// <returns>A new <c>CabochaBunsetsu</c> built from the parsed values.</returns>
        /// <exception cref="InvalidDataException">
        /// The line has fewer than three fields, an index is not a valid integer, the
        /// bunsetsu index is negative, or the dependency type is missing.
        /// </exception>
        private CabochaBunsetsu ParseBunsetsu(Sentence sen, int charPos, string s)
        {
            char[] fieldSeparator = new char[] { ' ' };
            char[] numberChars = new char[] { '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };

            string[] fields = s.Split(fieldSeparator);
            if (fields.Length < 3)
            {
                throw new InvalidDataException("Bunsetsu line has fewer than three fields: " + s);
            }

            int bunsetsuPos;
            if (!Int32.TryParse(fields[1], System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out bunsetsuPos)
                || bunsetsuPos < 0)
            {
                throw new InvalidDataException("Invalid bunsetsu index: " + s);
            }

            // The head index is everything up to the last numeric character; the
            // dependency type is whatever follows it and must not be empty.
            string dependency = fields[2];
            int lastNumeric = dependency.LastIndexOfAny(numberChars);
            if (lastNumeric < 0 || lastNumeric == dependency.Length - 1)
            {
                throw new InvalidDataException("Invalid dependency field: " + s);
            }

            int depBunsetsuId;
            if (!Int32.TryParse(dependency.Substring(0, lastNumeric + 1), System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out depBunsetsuId))
            {
                throw new InvalidDataException("Invalid head index: " + s);
            }
            string depType = dependency.Substring(lastNumeric + 1);

            return new CabochaBunsetsu(sen, charPos, bunsetsuPos, depType, depBunsetsuId);
        }

        /// <summary>
        /// Parses a document description string and registers each element found in
        /// it as a <c>DocumentTag</c>.
        /// </summary>
        /// <remarks>
        /// The text before the first '&lt;' is treated as a file path; the remainder
        /// is treated as a sequence of XML elements. Both are wrapped in a synthetic
        /// "Root" element and read with an <c>XmlReader</c>. The file-path text is
        /// XML-escaped first so that characters such as '&amp;' do not break parsing.
        /// Only element nodes produce tags. A "Bib_ID" element always becomes the
        /// primary tag of the set; "FilePath" becomes primary only when none is set yet.
        /// </remarks>
        /// <param name="documentTags">Tag sets by document ID; an entry is created for <paramref name="id"/> if absent.</param>
        /// <param name="id">ID of the document the tags belong to.</param>
        /// <param name="xmlstr">File path optionally followed by XML elements.</param>
        private void AddDocumentTags(Dictionary<int, DocumentTagSet> documentTags, int id, string xmlstr)
        {
            DocumentTagSet list;
            if (!documentTags.TryGetValue(id, out list))
            {
                list = new DocumentTagSet();
                documentTags.Add(id, list);
            }

            // Split the input into the leading file path and the trailing XML part.
            int sp = xmlstr.IndexOf('<');
            string filePath = (sp >= 0) ? xmlstr.Substring(0, sp) : xmlstr;
            string tagXml = (sp >= 0) ? xmlstr.Substring(sp) : string.Empty;
            string s = string.Format("<Root><FilePath>{0}</FilePath>{1}</Root>",
                System.Security.SecurityElement.Escape(filePath), tagXml);

            using (TextReader trdr = new StringReader(s))
            using (XmlReader xrdr = XmlReader.Create(trdr))
            {
                while (!xrdr.EOF)
                {
                    // Skip everything that is not a tag element (the Root wrapper,
                    // end elements, stray text or whitespace).
                    if (xrdr.NodeType != XmlNodeType.Element || xrdr.Name.Equals("Root"))
                    {
                        xrdr.Read();
                        continue;
                    }

                    bool isEmptyElement = xrdr.IsEmptyElement;
                    DocumentTag dt = new DocumentTag();
                    dt.ID = DocumentTag.UniqueID++;
                    dt.Tag = xrdr.Name;
                    // ReadString leaves the reader on the node following the text, so
                    // the loop must not advance again here. An empty element is the
                    // exception: ReadString does not move off it.
                    dt.Description = xrdr.ReadString();
                    if (isEmptyElement)
                    {
                        xrdr.Read();
                    }

                    if (dt.Tag.Equals("FilePath"))
                    {
                        dt.Description = dt.Description.Replace(":", @"/");
                        dt.Description = dt.Description.Replace("//", @":/");
                        if (list.Primary == null)
                        {
                            list.Primary = dt;              // without a Bib_ID, FilePath is used for the Document Filter
                        }
                    }
                    else if (dt.Tag.Equals("Bib_ID"))       // the Document Filter gives this tag priority
                    {
                        list.Primary = dt;
                    }
                    list.Set.Add(dt);
                    m_Corpus.DocumentTags.Add(dt);
                }
            }
        }
    }
}
