﻿using System;
using System.Collections.Generic;
using System.Text;
using ChaKi.Entity.Corpora;

namespace ChaKi.Service.Readers
{
    /// <summary>
    /// Accumulates lexicon entries (lexemes, parts of speech, conjugation types and
    /// conjugation forms) while a source such as CaboCha/ChaSen or MeCab output is read,
    /// de-duplicating them through keyed lookup tables. The collected tables can then be
    /// copied into a <see cref="Lexicon"/> with <see cref="CopyToCorpusLexicon"/>.
    /// </summary>
    public class LexiconBuilder
    {
        // Name of the conjugation form that marks a base-form entry.
        private const string BASE_TAG = "基本形";

        // Number of elements AddEntry(string[]) reads from its argument:
        // surface, reading, pronunciation, base form, part of speech, CType, CForm.
        private const int PROPERTY_COUNT = 7;

        /// <summary>
        /// Lookup table used to add lexemes while reading a source such as CaboCha output.
        /// Entries are keyed by <c>Lexeme.ToString()</c>.
        /// </summary>
        public SortedList<string, Lexeme> KeyedEntries { get; set; }

        /// <summary>Parts of speech seen so far, keyed by name.</summary>
        public Dictionary<string, PartOfSpeech> KeyedPartsOfSpeech { get; set; }

        /// <summary>Conjugation types seen so far, keyed by name.</summary>
        public Dictionary<string, CType> KeyedCTypes { get; set; }

        /// <summary>Conjugation forms seen so far, keyed by name.</summary>
        public Dictionary<string, CForm> KeyedCForms { get; set; }

        /// <summary>
        /// Lookup table for base-form search, keyed by <c>Lexeme.ToString2()</c>
        /// (a key built without Reading and Pronunciation). Only entries whose
        /// conjugation form is the base form are stored here.
        /// </summary>
        public SortedList<string, Lexeme> KeyedEntries2 { get; set; }

        private static readonly char[] TabSeparator = new char[] { '\t' };
        private static readonly char[] CommaSeparator = new char[] { ',' };

        /// <summary>
        /// Creates a builder with all lookup tables empty.
        /// </summary>
        public LexiconBuilder()
        {
            this.KeyedEntries = new SortedList<string, Lexeme>();
            this.KeyedEntries2 = new SortedList<string, Lexeme>();
            this.KeyedPartsOfSpeech = new Dictionary<string, PartOfSpeech>();
            this.KeyedCTypes = new Dictionary<string, CType>();
            this.KeyedCForms = new Dictionary<string, CForm>();
        }

        /// <summary>
        /// Adds a word to the lexicon. If the same word is already registered,
        /// that existing entry is returned instead. In both cases the returned
        /// entry's Frequency is incremented by one.
        /// </summary>
        /// <param name="props">
        /// Word properties, at least seven elements in this order:
        /// [0] surface, [1] reading, [2] pronunciation, [3] base form,
        /// [4] part of speech, [5] conjugation type, [6] conjugation form.
        /// </param>
        /// <returns>The registered (new or pre-existing) lexeme for the word.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="props"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="props"/> has fewer than seven elements.</exception>
        public Lexeme AddEntry(string[] props)
        {
            // Validate before touching any table so that malformed input cannot leave
            // half-registered parts of speech / conjugation types behind.
            if (props == null)
            {
                throw new ArgumentNullException("props");
            }
            if (props.Length < PROPERTY_COUNT)
            {
                throw new ArgumentException("props must contain at least 7 elements (surface, reading, pronunciation, base form, part of speech, ctype, cform).", "props");
            }

            Lexeme m = new Lexeme();
            m.Surface = props[0];
            m.Reading = props[1];
            m.Pronunciation = props[2];
            m.BaseLexeme = m;   // a word is its own base form unless resolved otherwise below
            m.PartOfSpeech = GetPartOfSpeech(props[4]);
            m.CType = GetCType(props[5]);
            m.CForm = GetCForm(props[6]);

            // A conjugated (non-base) form points at its base-form entry,
            // which is looked up and registered on demand.
            if (props[6].Length > 0 && !props[6].Equals(BASE_TAG))
            {
                m.BaseLexeme = FindOrAddBaseLexeme(props);
            }

            // Keys are computed only after BaseLexeme has been settled.
            string key = m.ToString();
            string key2 = m.ToString2();
            Lexeme lex;
            if (!this.KeyedEntries.TryGetValue(key, out lex))
            {
                if (this.KeyedEntries2.TryGetValue(key2, out lex))
                {
                    // An entry that matches apart from Reading/Pronunciation already
                    // exists; it is enough to update its Reading and Pronunciation.
                    lex.Reading = m.Reading;
                    lex.Pronunciation = m.Pronunciation;
                }
                else
                {
                    this.KeyedEntries.Add(key, m);
                    if (m.CForm.Name.Equals(BASE_TAG))
                    {
                        // Only base-form entries take part in the base-form search table.
                        this.KeyedEntries2.Add(key2, m);
                    }
                    lex = m;
                    lex.Frequency = 0;
                }
            }
            lex.Frequency++;
            return lex;
        }

        /// <summary>
        /// Finds the base-form entry for the word described by <paramref name="props"/>,
        /// registering a new one (with empty Reading and Pronunciation) if none exists.
        /// </summary>
        /// <param name="props">Validated word properties as passed to <see cref="AddEntry(string[])"/>.</param>
        /// <returns>The existing or newly registered base-form lexeme.</returns>
        private Lexeme FindOrAddBaseLexeme(string[] props)
        {
            Lexeme candidate = new Lexeme();
            candidate.Surface = props[3];
            candidate.Reading = string.Empty;
            candidate.Pronunciation = string.Empty;
            candidate.BaseLexeme = candidate;
            candidate.PartOfSpeech = GetPartOfSpeech(props[4]);
            candidate.CType = GetCType(props[5]);
            candidate.CForm = GetCForm(BASE_TAG);

            string fullKey = candidate.ToString();
            string looseKey = candidate.ToString2();

            Lexeme existing;
            // First search with the key that includes Reading and Pronunciation.
            if (this.KeyedEntries.TryGetValue(fullKey, out existing))
            {
                return existing;
            }
            // Otherwise search with the key that excludes Reading and Pronunciation.
            if (this.KeyedEntries2.TryGetValue(looseKey, out existing))
            {
                return existing;
            }

            this.KeyedEntries.Add(fullKey, candidate);
            this.KeyedEntries2.Add(looseKey, candidate);
            return candidate;
        }

        /// <summary>
        /// Adds a word given only by its surface form. If the same word is already
        /// registered, that existing entry is returned instead.
        /// </summary>
        /// <param name="plaintext">Surface form of a single word; all other properties are left empty.</param>
        /// <returns>The registered (new or pre-existing) lexeme for the word.</returns>
        public Lexeme AddEntry(string plaintext)
        {
            string[] props = new string[] { plaintext, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty };
            return AddEntry(props);
        }

        /// <summary>
        /// Adds a word given as one line of ChaSen/CaboCha formatted data. If the same
        /// word is already registered, that existing entry is returned instead.
        /// </summary>
        /// <param name="s">One word in ChaSen/CaboCha (tab-separated) format.</param>
        /// <returns>The registered (new or pre-existing) lexeme for the word.</returns>
        /// <exception cref="ArgumentException"><paramref name="s"/> is null or has fewer than six tab-separated fields.</exception>
        public Lexeme AddEntryChasen(string s)
        {
            string[] props = SplitChasenFormat(s);
            return AddEntry(props);
        }

        /// <summary>
        /// Adds a word given as one line of MeCab formatted data. If the same word is
        /// already registered, that existing entry is returned instead.
        /// </summary>
        /// <param name="s">One word in MeCab format (surface, a tab, then comma-separated features).</param>
        /// <returns>The registered (new or pre-existing) lexeme for the word.</returns>
        /// <exception cref="ArgumentException"><paramref name="s"/> is null or does not contain a tab-separated feature field.</exception>
        public Lexeme AddEntryMecab(string s)
        {
            string[] props = SplitMecabFormat(s);
            return AddEntry(props);
        }

        /// <summary>
        /// Splits one tab-separated ChaSen line into the seven-element property array
        /// expected by <see cref="AddEntry(string[])"/>.
        /// </summary>
        /// <param name="s">Tab-separated line with at least six fields.</param>
        /// <returns>
        /// { field0, field1, field1, field2, field3, field4, field5 } -- the second input
        /// field is used for both the reading and the pronunciation slot.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="s"/> has fewer than six tab-separated fields.</exception>
        public static string[] SplitChasenFormat(string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            string[] tags = s.Split(TabSeparator);
            if (tags.Length < 6)
            {
                throw new ArgumentException("ChaSen format requires at least 6 tab-separated fields.", "s");
            }
            return new string[] { tags[0], tags[1], tags[1], tags[2], tags[3], tags[4], tags[5] };
        }

        /// <summary>
        /// Splits one MeCab line (surface, tab, comma-separated features) into the
        /// seven-element property array expected by <see cref="AddEntry(string[])"/>.
        /// </summary>
        /// <param name="s">Line with at least two tab-separated fields; the second holds the features.</param>
        /// <returns>
        /// { surface, feature7, feature8, feature6, POS, feature4, feature5 }, where POS is
        /// the leading non-empty run of features 0-3 joined with "-". Every "*" in the
        /// feature field is removed before splitting.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="s"/> has no tab-separated feature field, or fewer than nine
        /// features remain after padding.
        /// </exception>
        public static string[] SplitMecabFormat(string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            string[] tags = s.Split(TabSeparator);
            if (tags.Length < 2)
            {
                throw new ArgumentException("MeCab format requires a surface and a feature field separated by a tab.", "s");
            }

            // Drop "*" placeholders and pad with four empty trailing fields so that
            // shorter feature lists still provide indices up to 8.
            string padded = tags[1].Replace("*", "") + ",,,,";
            string[] subtags = padded.Split(CommaSeparator);
            if (subtags.Length < 9)
            {
                throw new ArgumentException("MeCab feature field has too few comma-separated values.", "s");
            }

            // Join the part-of-speech hierarchy (up to four levels), stopping at the first empty level.
            StringBuilder postag = new StringBuilder();
            for (int level = 0; level < 4; level++)
            {
                string part = subtags[level];
                if (part.Length == 0)
                {
                    break;
                }
                if (level > 0)
                {
                    postag.Append("-");
                }
                postag.Append(part);
            }
            return new string[] { tags[0], subtags[7], subtags[8], subtags[6], postag.ToString(), subtags[4], subtags[5] };
        }


        /// <summary>
        /// Returns the part of speech registered under <paramref name="name"/>,
        /// creating and registering it first if it is not yet known.
        /// </summary>
        /// <param name="name">Part-of-speech name used as the lookup key.</param>
        /// <returns>The shared <see cref="PartOfSpeech"/> instance for that name.</returns>
        public PartOfSpeech GetPartOfSpeech(string name)
        {
            PartOfSpeech pos;
            if (this.KeyedPartsOfSpeech.TryGetValue(name, out pos))
            {
                return pos;
            }
            pos = new PartOfSpeech(name);
            this.KeyedPartsOfSpeech.Add(name, pos);
            return pos;
        }

        /// <summary>
        /// Returns the conjugation type registered under <paramref name="name"/>,
        /// creating and registering it first if it is not yet known.
        /// </summary>
        /// <param name="name">Conjugation-type name used as the lookup key.</param>
        /// <returns>The shared <see cref="CType"/> instance for that name.</returns>
        public CType GetCType(string name)
        {
            CType ctype;
            if (this.KeyedCTypes.TryGetValue(name, out ctype))
            {
                return ctype;
            }
            ctype = new CType(name);
            this.KeyedCTypes.Add(name, ctype);
            return ctype;
        }

        /// <summary>
        /// Returns the conjugation form registered under <paramref name="name"/>,
        /// creating and registering it first if it is not yet known.
        /// </summary>
        /// <param name="name">Conjugation-form name used as the lookup key.</param>
        /// <returns>The shared <see cref="CForm"/> instance for that name.</returns>
        public CForm GetCForm(string name)
        {
            CForm cform;
            if (this.KeyedCForms.TryGetValue(name, out cform))
            {
                return cform;
            }
            cform = new CForm(name);
            this.KeyedCForms.Add(name, cform);
            return cform;
        }

        /// <summary>
        /// Appends everything collected so far (lexemes, parts of speech, conjugation
        /// types and conjugation forms) to the corresponding collections of
        /// <paramref name="lexicon"/>.
        /// </summary>
        /// <param name="lexicon">Destination lexicon that receives the entries.</param>
        public void CopyToCorpusLexicon(Lexicon lexicon)
        {
            foreach (Lexeme lex in KeyedEntries.Values)
            {
                lexicon.Entries.Add(lex);
            }
            foreach (PartOfSpeech pos in KeyedPartsOfSpeech.Values)
            {
                lexicon.PartsOfSpeech.Add(pos);
            }
            foreach (CType ctype in KeyedCTypes.Values)
            {
                lexicon.CTypes.Add(ctype);
            }
            foreach (CForm cform in KeyedCForms.Values)
            {
                lexicon.CForms.Add(cform);
            }
        }
    }
}
