// charcode.cpp ... implementation file
#include "charcode.h"
#include "sfmt.h"                       // sgc::sfmt
#include <algorithm>                    // std::min
#include <assert.h>                     // assert()


// Half-width (ASCII) character ranges.
// Every *_END value is exclusive: it is one past the last character of the range.
#define ALPHA_UPPER_BEGIN       (L'A')
#define ALPHA_UPPER_END         (L'Z' + 1)
#define ALPHA_LOWER_BEGIN       (L'a')
#define ALPHA_LOWER_END         (L'z' + 1)
#define NUMBER_BEGIN            (L'0')
#define NUMBER_END              (L'9' + 1)

// Full-width character ranges.
// These are written as explicit code points instead of full-width character
// literals so that the values do not depend on the encoding the source file
// is saved or compiled in.
#define WIDE_ALPHA_UPPER_BEGIN  (0xff21)        // U+FF21 FULLWIDTH LATIN CAPITAL LETTER A
#define WIDE_ALPHA_UPPER_END    (0xff3a + 1)    // U+FF3A FULLWIDTH LATIN CAPITAL LETTER Z
#define WIDE_ALPHA_LOWER_BEGIN  (0xff41)        // U+FF41 FULLWIDTH LATIN SMALL LETTER A
#define WIDE_ALPHA_LOWER_END    (0xff5a + 1)    // U+FF5A FULLWIDTH LATIN SMALL LETTER Z
#define WIDE_NUMBER_BEGIN       (0xff10)        // U+FF10 FULLWIDTH DIGIT ZERO
#define WIDE_NUMBER_END         (0xff19 + 1)    // U+FF19 FULLWIDTH DIGIT NINE
/*
#define WIDE_SYMBOL_BEGIN       (0x3000)
#define WIDE_SYMBOL_END         (0x3038)
*/

// Japanese
// Code point ranges consumed by init(); each *_END is exclusive (one past the last code point).
#define JAPANESE_HIRAGANA_BEGIN     (0x3041)
#define JAPANESE_HIRAGANA_END       (0x309f)
// KATAKANA1 (0x309b, 0x309c) lies inside the hiragana range; init() fills it after
// the hiragana range, so those two code points end up classified as katakana.
#define JAPANESE_KATAKANA1_BEGIN    (0x309b)
#define JAPANESE_KATAKANA1_END      (0x309d)
#define JAPANESE_KATAKANA2_BEGIN    (0x30a1)
#define JAPANESE_KATAKANA2_END      (0x30ff)
#define JAPANESE_KANJI1_BEGIN       (0x3400)
#define JAPANESE_KANJI1_END         (0xa000)
#define JAPANESE_KANJI2_BEGIN       (0xf900)
#define JAPANESE_KANJI2_END         (0xfa2e)
#define JAPANESE_SYMBOL1_BEGIN      (0x3220)
#define JAPANESE_SYMBOL1_END        (0x3380)
// NOTE(review): SYMBOL2 overlaps KANJI1 on 0x3400-0x4dff, and init() fills the symbol
// ranges after the kanji ranges, so that span ends up classified as symbol rather than
// kanji. Confirm whether SYMBOL2_END was meant to be 0x3400.
#define JAPANESE_SYMBOL2_BEGIN      (0x33e0)
#define JAPANESE_SYMBOL2_END        (0x4e00)

// Code point ranges for non-Japanese scripts, consumed by init().
// Each *_END is exclusive (one past the last code point), matching the
// half-open [begin, end) convention of _fill_range().

// Hangul
#define HANGEUL1_BEGIN  (0x1100)
#define HANGEUL1_END    (0x11fa)
#define HANGEUL2_BEGIN  (0x3131)
#define HANGEUL2_END    (0x318f)
#define HANGEUL3_BEGIN  (0xac00)
#define HANGEUL3_END    (0xd7a4)

// Cyrillic
#define CYRILLIC_BEGIN  (0x0401)
#define CYRILLIC_END    (0x04fa)

// Armenian
#define ARMENIAN_BEGIN  (0x0531)
#define ARMENIAN_END    (0x058a)

// Hebrew (the original comment carried a parenthetical question that is unreadable in this copy)
#define HEBREW_BEGIN    (0x0591)
#define HEBREW_END      (0x05f5)

// Arabic
#define ARABIC_BEGIN        (0x060c)
#define ARABIC_END          (0x06ee)
#define ARABIC_NUMBER_BEGIN (0x06f0)
// The digits run U+06F0..U+06F9 inclusive, so the exclusive end is 0x06fa.
// (It was 0x06f9, which left the digit nine unclassified.)
#define ARABIC_NUMBER_END   (0x06fa)

// Devanagari
#define DEVANAGARI_BEGIN        (0x0901)
#define DEVANAGARI_END          (0x0966)
#define DEVANAGARI_NUMBER_BEGIN (0x0966)
// NOTE(review): the Devanagari digits are U+0966..U+096F; this exclusive end also
// includes U+0970 in the "number" category. Confirm whether that is intended.
#define DEVANAGARI_NUMBER_END   (0x0971)


// Character sets that init() classifies one character at a time via _fill_str().
// Whitespace, underscore, ASCII brackets, and the remaining ASCII punctuation:
#define STR_SPACE           L" \t\r\n"
#define STR_UNDERSCORE      L"_"
#define STR_BRACKET         L"()[]{}<>"
#define STR_SPECIAL         L"#$%&~^+-*/@`;:,.!?|\"\'\\"

// NOTE(review): by its name this should list full-width bracket characters, but the
// literal as it appears in this copy consists of ASCII letters and "[]" -- it looks
// like a multi-byte literal whose encoding was lost. As written, init() would
// reclassify those ASCII characters as CC_WIDE_BRACKET, because this fill runs after
// the ASCII fills. Restore from a correctly encoded copy (ideally as \x escapes).
#define STR_WIDE_BRACKET    L"ij[]klyzopstuvwx"

// NOTE(review): same concern as above -- by its name this should list Japanese
// symbol characters, but the literal in this copy is plain ASCII capitals, which
// init() would reclassify as CC_JAPANESE_SYMBOL. Restore from a correctly encoded copy.
#define STR_JAPANESE_SYMBOL L"ABSXTUVY"


namespace
{
	// NUL-terminated line-break strings, one per explicit line-break kind.
	const wchar_t g_lfstr_CR  [] = {sgc::charcode::wCR , sgc::charcode::wNUL};                      // CR (Macintosh style)
	const wchar_t g_lfstr_LF  [] = {sgc::charcode::wLF , sgc::charcode::wNUL};                      // LF (Unix style)
	const wchar_t g_lfstr_CRLF[] = {sgc::charcode::wCR , sgc::charcode::wLF, sgc::charcode::wNUL};  // CR-LF (Windows style)
	const wchar_t g_lfstr_NEL [] = {sgc::charcode::wNEL, sgc::charcode::wNUL};                      // Next Line
	const wchar_t g_lfstr_LS  [] = {sgc::charcode::wLS , sgc::charcode::wNUL};                      // Line Separator
	const wchar_t g_lfstr_PS  [] = {sgc::charcode::wPS , sgc::charcode::wNUL};                      // Paragraph Separator

	// Line-break kind used when a caller asks for the default one.
	sgc::charcode::linefeed_t g_lf_default = sgc::charcode::CL_CRLF;

	// Classification table: one category per wchar_t value, filled by init().
	sgc::charcode::categorize_t g_table[WCHAR_MAX + 1];

	// Table fill helpers (marked inline just in case).

	// Sets every table entry in the half-open range [begin, end) to val.
	inline void _fill_range(const int begin, const int end, const sgc::charcode::categorize_t val)
	{
		for(int code = begin; code < end; ++code)
		{
			g_table[code] = val;
		}
	}

	// Sets the table entry of every character in the NUL-terminated string str to val.
	inline void _fill_str(const wchar_t *str, const sgc::charcode::categorize_t val)
	{
		for(const wchar_t *cursor = str; *cursor != sgc::charcode::wNUL; ++cursor)
		{
			g_table[*cursor] = val;
		}
	}
}


_SGC_BEGIN                              // namespace sgc {
_CHARCODE_BEGIN                         // namespace charcode {


// Builds the character classification table.
// Every entry is first set to CC_ETC and then overwritten by the fills below.
// Where ranges or character sets overlap, the later fill wins, so the order of
// the calls is significant.
void init(void)
{
	// Reset the whole table to CC_ETC
	_fill_range(WCHAR_MIN, WCHAR_MAX + 1, CC_ETC);

	// Whitespace
	_fill_str(STR_SPACE, CC_SPACE);

	// Alphabet
	_fill_range(ALPHA_UPPER_BEGIN, ALPHA_UPPER_END, CC_ALPHA_UPPER);
	_fill_range(ALPHA_LOWER_BEGIN, ALPHA_LOWER_END, CC_ALPHA_LOWER);

	// Digits
	_fill_range(NUMBER_BEGIN, NUMBER_END, CC_NUMBER);

	// Underscore
	_fill_str(STR_UNDERSCORE, CC_UNDERSCORE);

	// Brackets
	_fill_str(STR_BRACKET, CC_BRACKET);

	// Special characters (the ASCII punctuation listed in STR_SPECIAL)
	_fill_str(STR_SPECIAL, CC_SPECIAL);


	// Full-width alphabet
	_fill_range(WIDE_ALPHA_UPPER_BEGIN, WIDE_ALPHA_UPPER_END, CC_WIDE_ALPHA_UPPER);
	_fill_range(WIDE_ALPHA_LOWER_BEGIN, WIDE_ALPHA_LOWER_END, CC_WIDE_ALPHA_LOWER);

	// Full-width digits
	_fill_range(WIDE_NUMBER_BEGIN, WIDE_NUMBER_END, CC_WIDE_NUMBER);

	// Full-width brackets
	_fill_str(STR_WIDE_BRACKET, CC_WIDE_BRACKET);


	// Japanese
	// (katakana is filled after hiragana, and symbols after kanji, so the later
	// category wins on the code points where those ranges overlap)
	_fill_range(JAPANESE_HIRAGANA_BEGIN , JAPANESE_HIRAGANA_END , CC_JAPANESE_HIRAGANA);
	_fill_range(JAPANESE_KATAKANA1_BEGIN, JAPANESE_KATAKANA1_END, CC_JAPANESE_KATAKANA);
	_fill_range(JAPANESE_KATAKANA2_BEGIN, JAPANESE_KATAKANA2_END, CC_JAPANESE_KATAKANA);
	_fill_range(JAPANESE_KANJI1_BEGIN   , JAPANESE_KANJI1_END   , CC_JAPANESE_KANJI);
	_fill_range(JAPANESE_KANJI2_BEGIN   , JAPANESE_KANJI2_END   , CC_JAPANESE_KANJI);
	_fill_range(JAPANESE_SYMBOL1_BEGIN  , JAPANESE_SYMBOL1_END  , CC_JAPANESE_SYMBOL);
	_fill_range(JAPANESE_SYMBOL2_BEGIN  , JAPANESE_SYMBOL2_END  , CC_JAPANESE_SYMBOL);
	_fill_str(STR_JAPANESE_SYMBOL, CC_JAPANESE_SYMBOL);


	// Hangul
	_fill_range(HANGEUL1_BEGIN, HANGEUL1_END, CC_HANGEUL);
	_fill_range(HANGEUL2_BEGIN, HANGEUL2_END, CC_HANGEUL);
	_fill_range(HANGEUL3_BEGIN, HANGEUL3_END, CC_HANGEUL);

	// Cyrillic
	_fill_range(CYRILLIC_BEGIN, CYRILLIC_END, CC_CYRILLIC);

	// Armenian
	_fill_range(ARMENIAN_BEGIN, ARMENIAN_END, CC_ARMENIAN);

	// Hebrew
	_fill_range(HEBREW_BEGIN, HEBREW_END, CC_HEBREW);

	// Arabic
	_fill_range(ARABIC_BEGIN       , ARABIC_END       , CC_ARABIC);
	_fill_range(ARABIC_NUMBER_BEGIN, ARABIC_NUMBER_END, CC_ARABIC_NUMBER);

	// Devanagari
	_fill_range(DEVANAGARI_BEGIN       , DEVANAGARI_END       , CC_DEVANAGARI);
	_fill_range(DEVANAGARI_NUMBER_BEGIN, DEVANAGARI_NUMBER_END, CC_DEVANAGARI_NUMBER);
}

// Classifies a character.
// Returns the entry stored for wchar in the classification table; the table is
// populated by init().
categorize_t categorize(const wchar_t wchar)
{
	const categorize_t &entry = g_table[wchar];
	return entry;
}


// Searches for the first line break in a string.
//   wstr : characters to scan (only the first `size` elements are read)
//   size : number of characters in wstr
//   type : receives the kind of line break found, or CL_DEFAULT when none is found
//   skip : receives the length of the line break in characters, or 0 when none is found
// Returns the index of the first character of the line break, or
// charcode::POS_INVALID when the string contains no line break.
size_t find_linefeed(const wchar_t *wstr, const size_t size, linefeed_t &type, size_t &skip)
{
	// Length of each line-break sequence, indexed by linefeed_t value.
	// NOTE(review): assumes the enumerators are numbered in the order
	// CL_DEFAULT, CL_CR, CL_LF, CL_CRLF, CL_NEL, CL_LS, CL_PS -- confirm against charcode.h.
	static const size_t skip_array[] = {0, 1, 1, 2, 1, 1, 1};

	for(size_t i = 0; i < size; i++)
	{
		switch(wstr[i])
		{
		case wCR:
			// A CR immediately followed by LF is a single CR-LF line break.
			type = (i + 1 < size && wstr[i + 1] == wLF) ? CL_CRLF : CL_CR;
			break;

		case wLF:  type = CL_LF;  break;
		case wNEL: type = CL_NEL; break;
		case wLS:  type = CL_LS;  break;
		case wPS:  type = CL_PS;  break;

		default:
			// Not a line-break character; keep scanning.
			continue;
		}

		skip = skip_array[type];
		return i;
	}

	// No line break found. Both out-parameters are given defined values so that
	// callers never read an unset `skip`.
	type = CL_DEFAULT;
	skip = 0;
	return charcode::POS_INVALID;
}

// Sets the default line-break kind.
// CL_DEFAULT itself cannot be the default: in that case nothing is changed and
// false is returned. Otherwise the new default is stored and true is returned.
bool set_linefeed_default(const linefeed_t type)
{
	const bool acceptable = (type != CL_DEFAULT);
	if(acceptable)
	{
		g_lf_default = type;
	}
	return acceptable;
}

// Returns the line-break kind currently configured as the default.
linefeed_t get_linefeed_default(void)
{
	const linefeed_t current = g_lf_default;
	return current;
}

// s̎擾
const wchar_t *get_linefeed_string(const linefeed_t type)
{
	switch(type)
	{
	case CL_CR:   return g_lfstr_CR;        // CRiMacintosh`j
	case CL_LF:   return g_lfstr_LF;        // LFiUnix`j
	case CL_CRLF: return g_lfstr_CRLF;      // CR-LFiWindows`j
	case CL_NEL:  return g_lfstr_NEL;       // NextLine
	case CL_LS:   return g_lfstr_LS;        // LineSeparator
	case CL_PS:   return g_lfstr_PS;        // ParagraphSeparator
	}

	// CL_DEFAULT
	const linefeed_t type_default = get_linefeed_default();
	assert(type_default != CL_DEFAULT);
	return get_linefeed_string(type_default);
}


// UTF-8 utilities
_UTF8_BEGIN                             // namespace utf8 {

namespace
{
	// Counts the consecutive 1 bits of a byte, starting from the most significant bit.
	// Returns a value from 0 (MSB clear) to 8 (all bits set).
	int count_bits(const unsigned char byte)
	{
		int count = 0;
		// Walk the mask down from the MSB and stop at the first 0 bit.
		for(unsigned int mask = 0x80u; mask != 0u && (byte & mask) != 0u; mask >>= 1)
		{
			++count;
		}
		return count;
	}
}

// UTF-8𓝌vIɌo
// UTF-8̃rbgp^[͈ȉ̂ꂩƂȂB
//   0xxxxxxx
//   110yyyyx 10xxxxxx
//   1110yyyy 10yxxxxx 10xxxxxx
//   11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
//   111110yy 10yyyxxx 10xxxxxx 10xxxxxx 10xxxxxx
//   1111110y 10yyyyxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// L̂ǂɂĂ͂܂Ȃrbgp^[ꂽUTF-8ł͂ȂƔfł̂
// ȉ̎菇ɏ]UTF-8̉\OĂ䂭B
//   1. eLXg烉_Ƀf[^1byteo
//   2. oÕf[^Lrbgp^[OĂ΁A̎_UTF-8łȂƔf
//   3. 1,2 statistics_option JԂ
//   4. L菇1`3̒MSB1̃f[^xȂASCIIiUTF-8łȂjƔfB
//   5. L菇1`4SăpXUTF-8ƔfB
// ͈ȉ̒ʂ
//   E莞Ԃ̓eLXg̃TCYƂ͖֌W O(1)
//   Estatistics_option̒l傫قǌoxオ邪AxƂ̃g[hItƂȂ
//   EASCII啔߂傫ȃt@CASCIIƌFĂ܂\
bool detect(const char *text, const size_t size, const unsigned int statistics_option /* = 128 */)
{
	if(size == 0) { return false; }

	const uint32_t upper_bound = static_cast<uint32_t>(size - 1);
	bool msb_flag = false;

	sfmt random;

	for(size_t i = 0; i < statistics_option; i++)
	{
		// eLXg1byte_ɔoAMSBAėĂrbg̐𐔂
		const uint32_t offset = random(upper_bound);
		const uint8_t  data   = text[offset];
		const int      count  = count_bits(data);

		switch(count)
		{
		case 0:
			// ̃f[^MSB1ȂUTF-8łȂ
			if(offset < size - 1)
			{
				const int count2 = count_bits(text[offset + 1]);
				if(count2 == 1) { return false; }
			}
			break;

		case 1:
			// Õf[^MSB0ȂUTF-8łȂ
			if(offset > 0)
			{
				const int count2 = count_bits(text[offset - 1]);
				if(count2 == 0) { return false; }
			}
			msb_flag = true;
			break;

		case 7:
		case 8:
			// MSB7bitȏ1A邱Ƃ͂Ȃ
			return false;

		default:
			// MSBcount-11AĂȂUTF-8łȂ
			if(offset + count > size) { return false; }
			for(int j = 1; j < count; j++)
			{
				const int count2 = count_bits(text[offset + j]);
				if(count2 != 1) { return false; }
			}
			msb_flag = true;
			break;
		}
	}
	// MSBxĂȂUTF-8łȂ(ASCII)
	return msb_flag;
}

_UTF8_END                               // }

_CHARCODE_END                           // }
_SGC_END                                // }
