/** 
 * @file  codepage_detect.cpp
 *
 * @brief Deducing codepage from file contents, when we can
 *
 */
// RCS ID line follows -- this is updated by CVS
// $Id: codepage_detect.cpp,v 1.3 2005/08/22 05:48:04 jtuc Exp $

#include "StdAfx.h"
#include <shlwapi.h>
#include "codepage_detect.h"
#include "unicoder.h"
#include "codepage.h"
#include "charsets.h"
#include "markdown.h"

// Debug builds only: route `new` through DEBUG_NEW and keep this file's name
// in THIS_FILE (presumably the MFC debug-allocation convention -- confirm).
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

/**
 * @brief Filter a codepage number through isCodepageSupported().
 * @param [in] cp Codepage number to check.
 * @return @p cp when it is nonzero and isCodepageSupported() accepts it,
 * otherwise 0.
 */
static unsigned ValidCodepage(unsigned cp)
{
	if (cp == 0)
		return 0;
	if (!isCodepageSupported(cp))
		return 0;
	return cp;
}

/**
 * @brief Look for a charset declaration in the <meta> tags of an HTML buffer.
 *
 * Only <meta http-equiv="content-type" content="..."> tags are considered.
 * The content attribute is split into ';'-separated "key=value" pairs and
 * the first key that starts with "charset" decides the result.
 *
 * @param [in] src Start of the buffer to scan.
 * @param [in] len Number of bytes in the buffer.
 * @param [in] defcodepage Codepage returned when nothing usable is found.
 * @return Codepage of the declared charset when the charsets module can
 * resolve it (by name, or by number as a fallback), else @p defcodepage.
 */
static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
{
	CMarkdown markdown(src, src + len, CMarkdown::Html);
	// <html> and <head> are optional, so go straight for the <meta> tags
	// instead of descending through them.
	while (markdown.Move("meta"))
	{
		CMarkdown::String http_equiv = markdown.GetAttribute("http-equiv");
		if (!http_equiv.A || lstrcmpiA(http_equiv.A, "content-type") != 0)
			continue;

		CMarkdown::String content = markdown.GetAttribute("content");
		char *pchKey = content.A;
		if (!pchKey)
			continue;

		for (;;)
		{
			// Skip separators in front of the key, then measure the key.
			pchKey += strspn(pchKey, "; \t\r\n");
			int cchKey = strcspn(pchKey, ";=");
			if (cchKey == 0)
				break;

			// The value follows the key after any '=' and whitespace.
			char *pchValue = pchKey + cchKey;
			pchValue += strspn(pchValue, "= \t\r\n");
			int cchValue = strcspn(pchValue, "; \t\r\n");

			// Accept "charset" either as the whole key or followed by whitespace.
			bool isCharsetKey = false;
			if (cchKey >= 7 && memicmp(pchKey, "charset", 7) == 0)
				isCharsetKey = (cchKey == 7 || strchr(" \t\r\n", pchKey[7]));

			if (!isCharsetKey)
			{
				pchKey = pchValue + cchValue;
				continue;
			}

			// Terminate the value in place so it can be used as a C string.
			pchValue[cchValue] = '\0';

			// Try it as an encoding name first, then as a codepage number.
			unsigned encodingId = GetEncodingIdFromName(pchValue);
			if (encodingId == 0)
			{
				unsigned codepage = atoi(pchValue);
				if (codepage != 0)
					encodingId = GetEncodingIdFromCodePage(codepage);
			}
			if (encodingId == 0)
				return defcodepage;
			return GetEncodingCodePageFromId(encodingId);
		}
	}
	return defcodepage;
}

/**
 * @brief Parser for XML files to find encoding information
 */
static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
{
	CMarkdown xml(src, src + len);
	if (xml.Move("?xml"))
	{
		CMarkdown::String encoding = xml.GetAttribute("encoding");
		if (encoding.A)
		{
			// Is it an encoding name known to charsets module ?
			unsigned encodingId = GetEncodingIdFromName(encoding.A);
			if (encodingId == 0)
			{
				if (unsigned codepage = atoi(encoding.A))
				{
					encodingId = GetEncodingIdFromCodePage(codepage);
				}
			}
			if (encodingId)
			{
				return GetEncodingCodePageFromId(encodingId);
			}
		}
	}
	return defcodepage;
}

/**
 * @brief Parser for rc files to find encoding information
 *
 * Scans the buffer line by line for the first line that begins with
 * "#pragma code_page(<number>)".
 *
 * @param [in] src Start of the buffer to scan; need not be zero-terminated.
 * @param [in] len Number of bytes in the buffer.
 * @param [in] defcodepage Codepage used when no pragma is found.
 * @return The codepage from the pragma (or @p defcodepage when there is
 * none) filtered through ValidCodepage(), i.e. 0 if it is not supported.
 * @note sscanf() requires first argument to be zero-terminated so we must
 * copy lines to temporary buffer. Only the bytes of the current line are
 * copied, so nothing beyond src + len is ever read.
 */
static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
{
	unsigned cp = defcodepage;
	char line[80];
	while (len)
	{
		// Skip the line terminator(s) left over from the previous line.
		while (len && (*src == '\r' || *src == '\n'))
		{
			++src;
			--len;
		}
		// Find the end of the current line.
		const char *base = src;
		while (len && *src != '\r' && *src != '\n')
		{
			++src;
			--len;
		}
		size_t cch = static_cast<size_t>(src - base);
		if (cch == 0)
			continue; // only terminators were left; len is 0 and the loop ends
		// Overlong lines are truncated; the pragma sits at the line start.
		if (cch > sizeof line - 1)
			cch = sizeof line - 1;
		memcpy(line, base, cch);
		line[cch] = '\0';
		// %d needs an int target. The line is examined even when it is the
		// last one in the buffer and has no trailing newline.
		int parsed = 0;
		if (sscanf(line, "#pragma code_page(%d)", &parsed) == 1)
		{
			cp = parsed;
			break;
		}
	}
	return ValidCodepage(cp);
}

/**
 * @brief Guess the codepage of an in-memory buffer.
 *
 * @param [in] ext File extension (with leading dot), selects the parser.
 * @param [in] src Start of the buffer.
 * @param [in] len Number of bytes in the buffer.
 * @param [in] guessEncodingType Bit 1 (value 2) enables detection through
 * the MLang converter, with the upper 16 bits passed on as its autodetect
 * type; bit 0 (value 1) enables the .rc / .htm / .html / .xml / .xsl
 * parsers, which look at no more than the first 65536 bytes.
 * @return The guessed codepage; starts from getDefaultCodepage() and is
 * replaced by whatever the enabled detectors return.
 */
unsigned GuessEncoding_from_bytes(LPCTSTR ext, const char *src, size_t len, int guessEncodingType)
{
	const bool useMLang = (guessEncodingType & 2) != 0;
	const bool useParsers = (guessEncodingType & 1) != 0;

	unsigned cp = getDefaultCodepage();
	if (useMLang)
	{
		ucr::IExconverterPtr pexconv(ucr::createConverterMLang());
		if (pexconv && src != NULL)
		{
			int autodetectType = (unsigned int)guessEncodingType >> 16;
			cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
		}
	}
	if (!useParsers)
		return cp;

	// The parsers only ever look at the head of the buffer.
	if (len > 65536)
		len = 65536;

	if (lstrcmpi(ext, _T(".rc")) == 0)
		return demoGuessEncoding_rc(src, len, cp);
	if (lstrcmpi(ext, _T(".htm")) == 0 || lstrcmpi(ext, _T(".html")) == 0)
		return demoGuessEncoding_html(src, len, cp);
	if (lstrcmpi(ext, _T(".xml")) == 0 || lstrcmpi(ext, _T(".xsl")) == 0)
		return demoGuessEncoding_xml(src, len, cp);
	return cp;
}

/**
 * @brief Guess the codepage of a buffer given as an array of pointers.
 *
 * The buffer is taken to start at data[0] and to end at data[count].
 *
 * @param [in] ext File extension (with leading dot), selects the parser.
 * @param [in] data Pointer array delimiting the buffer; may be NULL.
 * @param [in] count Index of the entry in @p data used as the buffer end.
 * @param [out] codepage Receives the guessed codepage on success only.
 * @param [in] guessEncodingType Passed through to the buffer overload.
 * @return true if a nonzero codepage was guessed and stored in @p codepage,
 * false if @p data is NULL or the guess came back as 0.
 */
bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, int *codepage, int guessEncodingType)
{
	if (!data)
		return false;

	const char *first = data[0];
	size_t cb = data[count] - first;
	unsigned guessed = GuessEncoding_from_bytes(ext, first, cb, guessEncodingType);
	if (guessed == 0)
		return false;

	*codepage = guessed;
	return true;
}

/**
 * @brief Determine the unicoding and codepage for a file on disk.
 *
 * Loads up to 65536 bytes of the file into a CMarkdown::FileImage and maps
 * its nByteOrder value: 8+2+0 gives UCS-2 LE (codepage 1200), 8+2+1 gives
 * UCS-2 BE (1201) and 8 gives UTF-8 (65001). When nByteOrder is 1 and
 * guessing is enabled, the contents are examined by
 * GuessEncoding_from_bytes() instead.
 *
 * @param [in] filepath Path of the file to examine.
 * @param [out] unicoding Receives the ucr unicoding value (ucr::NONE unless
 * one of the nByteOrder values above matched).
 * @param [out] codepage Receives the codepage (getDefaultCodepage() unless
 * something more specific was found).
 * @param [in] guessEncodingType 0 disables content-based guessing; any other
 * value is passed on to GuessEncoding_from_bytes().
 */
void GuessCodepageEncoding(LPCTSTR filepath, int *unicoding, int *codepage, int guessEncodingType)
{
	CMarkdown::FileImage fi(filepath, 65536);
	*unicoding = ucr::NONE;
	*codepage = getDefaultCodepage();

	if (fi.nByteOrder == 8 + 2 + 0)
	{
		*unicoding = ucr::UCS2LE;
		*codepage = 1200;
	}
	else if (fi.nByteOrder == 8 + 2 + 1)
	{
		*unicoding = ucr::UCS2BE;
		*codepage = 1201;
	}
	else if (fi.nByteOrder == 8)
	{
		*unicoding = ucr::UTF8;
		*codepage = 65001;
	}

	// Content-based guessing only applies to nByteOrder 1, and only on request.
	if (guessEncodingType == 0 || fi.nByteOrder != 1)
		return;

	LPCTSTR ext = PathFindExtension(filepath);
	unsigned guessed = GuessEncoding_from_bytes(ext, (char *)fi.pImage, fi.cbImage, guessEncodingType);
	if (guessed != 0)
		*codepage = guessed;
	else
		*codepage = getDefaultCodepage();
}
