// Copyright (C) 2014  Mocchi (mocchi_2003@yahoo.co.jp)
// License: Boost Software License   See LICENSE.txt for the full license.

#include "rapidjson/document.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/prettywriter.h"

#include "unf/normalizer.hh"

#include "zlib.h"

#include <stdint.h>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <list>
#include <map>
#include <new>
#include <set>
#include <string>
#include <vector>

// Raw byte buffer; used for file lines, stream payloads and UTF-8 output.
typedef std::vector<uint8_t> Data;

// Payload of one PDF stream object together with what is known about its filter.
struct StreamData{
	Data data;      // stream bytes
	enum{
		FlateDecode, Plane, Unknown   // NOTE(review): "Plane" presumably means "plain" (no filter) - confirm
	}filter;        // filter recorded for this stream; starts as Unknown
	bool decoded;   // starts false; set by the reader once `data` holds usable bytes
	StreamData()
		: filter(Unknown)
		, decoded(false)
	{
	}
};

// All streams of a document; an object's "stream" member is an index into this.
typedef std::vector<StreamData> Streams;
// Tokens that have been read ahead but not yet handed to the consumer.
typedef std::deque<std::string> PrefetchQueue;

// Writes the lowest `num_digit` hexadecimal digits of `value` into `outbuf`
// as upper-case characters, most significant digit first, zero padded.
// `outbuf` must have room for at least `num_digit` characters; no NUL
// terminator is written.
void PDF_WriteHexString(uint32_t value, int num_digit, char *outbuf){
	static const char kHexDigits[] = "0123456789ABCDEF";
	// Fill from the rightmost position backwards, peeling off one nibble per step.
	for (int idx = num_digit - 1; idx >= 0; --idx){
		outbuf[idx] = kHexDigits[value % 16];
		value >>= 4;
	}
}

// One entry of a character-code -> text table.
//   first  : the character code as an upper-case hex string (the sort/lookup key)
//   second : the bytes the code maps to (filled by the table builders)
struct UnicodeTableItem{
	std::string first;
	Data second;

	UnicodeTableItem(){}

	// Key is `value` rendered as exactly `num_digit` hex digits.
	UnicodeTableItem(uint32_t value, int num_digit){
		first.resize(num_digit);
		PDF_WriteHexString(value, num_digit, &first[0]);
	}

	// Key is the single character `c`.
	UnicodeTableItem(char c) : first(1, c){
	}

	UnicodeTableItem(const UnicodeTableItem &rhs)
		: first(rhs.first)
		, second(rhs.second)
	{
	}

	UnicodeTableItem &operator =(const UnicodeTableItem &rhs){
		first = rhs.first;
		second = rhs.second;
		return *this;
	}

	// Items order by key only; the mapped bytes do not take part.
	bool operator <(const UnicodeTableItem &rhs) const{
		return first.compare(rhs.first) < 0;
	}
};

// Swap member-wise so that no string/vector contents get copied.
template <> void std::swap<UnicodeTableItem>(UnicodeTableItem &lhs, UnicodeTableItem &rhs){
	lhs.first.swap(rhs.first);
	lhs.second.swap(rhs.second);
}

// A code -> text table. The builders in this file insert items at their
// std::lower_bound position (or append them in ascending code order), and
// lookups use std::lower_bound on UnicodeTableItem::first.
typedef std::vector<UnicodeTableItem> UnicodeTable;
// Owner of every table that gets built. The builders return a pointer to
// utc.back(); std::list keeps that pointer valid when more tables are added.
typedef std::list<UnicodeTable> UnicodeTablesContainer;

// typedef std::pair<UnicodeTableItem *, int> UnicodeTableRef;
// Non-owning map from a name (string key) to a table held in a UnicodeTablesContainer.
typedef std::map<std::string, UnicodeTable *> UnicodeTables;

// Finds the shortest prefix of the hex text at `val` that is a key of `tbl`.
//
// The prefix is grown one (upper-cased) character at a time and looked up with
// std::lower_bound, so `tbl` must be sorted by UnicodeTableItem::first.
//
// On success returns the index of the matching item and leaves `val` pointing
// just past the consumed characters. On failure (null/empty input, or no key
// matches before the text ends) returns -1; `val` is set to 0 unless the input
// was already null/empty, in which case it is left untouched.
//
// The search range is only ever tightened from below. The previous version also
// tightened the upper bound to "prefix + 1" rendered with the prefix's width,
// which wraps to "00.." for an all-'F' prefix (and is garbage for non-hex or
// very long prefixes), collapsing the range and hiding existing keys.
int PDF_FindUnicodeTableItemAndForwardPtr(UnicodeTable &tbl, const char *&val){
	if (!val || !val[0]) return -1;
	// Go through unsigned char: passing a negative char to toupper is undefined.
	UnicodeTableItem uti(static_cast<char>(std::toupper(static_cast<unsigned char>(val[0]))));
	++val;
	UnicodeTable::iterator iter = tbl.begin();
	for(;; ++val){
		// Every key that extends the current prefix sorts at or after this position.
		iter = std::lower_bound(iter, tbl.end(), uti);
		if (iter == tbl.end()) break;
		if (iter->first == uti.first) return static_cast<int>(iter - tbl.begin());
		if (*val == 0) break;
		uti.first += static_cast<char>(std::toupper(static_cast<unsigned char>(*val)));
	}
	val = 0;
	return -1;
}

// A string enclosed in () may contain '\0' bytes, so to learn the length of
// such a string, scan up to the closing ')' instead of relying on strlen.
//
// Owns malloc'ed, NUL-terminated copies of strings and frees them all in the
// destructor. Pointers returned by Add() stay valid for the pool's lifetime.
struct StringPool{
	std::vector<char *> strs;
	StringPool(){
	}
	~StringPool(){
		for (size_t i = 0; i < strs.size(); ++i){
			std::free(strs[i]);
		}
	}
	// Copies `str` (embedded '\0' bytes included) into the pool and returns the
	// copy, which is terminated by one extra '\0'.
	// Throws std::bad_alloc when memory cannot be obtained.
	const char *Add(const std::string &str){
		// Reserve the slot first: if push_back throws there is nothing to leak yet.
		strs.push_back(static_cast<char *>(0));
		char *s = static_cast<char *>(std::malloc(str.size()+1));
		if (!s){
			strs.pop_back();
			throw std::bad_alloc();
		}
		std::memcpy(s, str.data(), str.size());
		s[str.size()] = '\0';
		strs.back() = s;
		return s;
	}
private:
	// Not copyable: a copy would free every buffer a second time.
	StringPool(const StringPool &);
	StringPool &operator =(const StringPool &);
};

// What the object parser expects the next token to be. main() starts in
// Neutral and stores a state next to every token on its token stack.
enum TokenState{
	Neutral, SomeValue, DictKey, DictValue, ArrayItem
};
// Stack of (token text, parser state attached to that token).
typedef std::vector<std::pair<std::string, TokenState> > TokenStack;

// Unicode normalization applied by ON_UNF_EncodeUTF8: the first four select the
// matching UNF::Normalizer form, UNT_NONCONVERT skips normalization.
enum Unicode_Normalize_Type{
	UNT_NFD, UNT_NFC, UNT_NFKD, UNT_NFKC, UNT_NONCONVERT
};

bool PDF_GetNextLine(FILE *fp, Data &line){
	if (!fp) return false;
	bool parenthesis = false;
	line.clear();
	if (std::feof(fp)) return false;
	int c;
	while(!std::feof(fp)){
		c = std::fgetc(fp);
		if (c == '(') parenthesis = true;
		if (parenthesis || (c != 0x0d && c != 0x0a)){
			line.push_back(static_cast<uint8_t>(c));
			if (c == ')') parenthesis = false;
		}
		else break;
	}
	if (c == 0x0d) if ((c = std::fgetc(fp)) != 0x0a) std::ungetc(c, fp);
	return true;
}

// Half-open byte range [startpos, endpos) of a token inside a line buffer.
// A default-constructed Token sits at the start of the line.
struct Token{
	size_t startpos;
	size_t endpos;
	Token()
		: startpos(0)
		, endpos(0)
	{
	}
};

// Extracts the token that follows `token.endpos` in `line`.
//
// Leading whitespace is skipped. On success `token` is set to the half-open
// range [startpos, endpos) of the token and true is returned. Returns false,
// leaving `token` untouched, when nothing but whitespace remains - including
// when `token.endpos` already lies beyond the end of `line` (a stale cursor
// from a longer, earlier line; the previous `==` test let that case through
// and produced an out-of-range token).
//
// A token ends at whitespace or at a delimiter boundary, decided from the
// current character `c` and the one or two preceding it (`p`, `pp`):
// runs of '<' or '>' stay together (a third '>' is split off), '/' and '\\'
// start a new token, '[' and ']' are always single tokens, and unescaped
// '(' / ')' are single tokens.
bool PDF_GetNextToken(Data &line, Token &token){
	size_t pos = token.endpos;
	int16_t c, p = -1, pp = -1;
	while(pos < line.size() && std::isspace(line[pos])) ++pos;
	if (pos >= line.size()) return false;

	token.startpos = pos;
	bool sharp_hex = false;
	for(; pos < line.size(); ++pos, pp = p, p = c){
		c = line[pos];
		if (std::isspace(c)) break;
		// The first character always belongs to the token.
		if (p == -1) continue;
		// "#xx" escape: swallow '#' and the hex digits that follow it.
		if (c == '#'){ sharp_hex = true; continue; }
		if (sharp_hex){
			if (std::isdigit(c)) continue;
			char hexc[] = {"ABCDEFabcdef"};
			bool is_hexc = false;
			for (int i = 0; i < 12; ++i){
				if (c != hexc[i]) continue;
				is_hexc = true; break;
			}
			if (is_hexc) continue;
			// First non-hex character after the escape: decode the two bytes in
			// front of it and use the result as the "previous character" for the
			// delimiter checks below.
			char hb[3] = {'0', '0', '\0'};
			if (pos >= 2){
				hb[0] = line[pos - 2];
				hb[1] = line[pos - 1];
			}
			p = static_cast<int16_t>(std::strtoul(hb, 0, 16));
			sharp_hex = false;
		}
		if (p != '<' && c == '<') break;
		if (p == '<' && c != '<') break;
		if (p != '>' && c == '>') break;
		if (p == '>' && c != '>') break;
		if (p != '/' && c == '/') break;
		if (p != '\\' && c == '\\') break;
		if (c == '[') break;
		if (p == '[') break;
		if (c == ']') break;
		if (p == ']') break;
		if (p != '\\' && c == '(') break;
		if (p == '(') break;
		if (p != '\\' && c == ')') break;
		if (p == ')') break;
		if (pp == '>' && p == '>') break;
//		if (std::isdigit(p) && !std::isdigit(c) && c != '.') break;
	}
	token.endpos = pos;
	return true;
}

// Reads ahead in `line`, appending tokens to `pq`, to collect a construct that
// spans several tokens: a run of numeric tokens up to an 'R', a "( ... )"
// string, or a "< ... >" hex string.
//
// `tok` is the cursor into `line`; it is advanced past every token that was
// queued. Returns true when `pq` is non-empty on exit. `pq` is not cleared
// first, so entries left over from an earlier call also make this return true.
bool PDF_Prefetch(Token &tok, Data &line, PrefetchQueue &pq){
	enum FetchType{
		Reference, Parenthesis, AngleBracket, Other, Unknown
	}ft = Unknown;
	for(;;){
		// Remembered so that a token which turns out not to belong can be un-read.
		Token prev_tok = tok;
		if (!PDF_GetNextToken(line, tok)) break;
		// The first token decides what kind of construct is being collected.
		if (ft == Unknown){
			if (line[tok.startpos] == '(') ft = Parenthesis;
			else if (line[tok.startpos] == '<' && (tok.endpos - tok.startpos) == 1) ft = AngleBracket;
			else ft = Other;
		}
		if (ft == Other || ft == Reference){
			bool isdigit = true;
			// Anything other than a lone "R" must consist only of digits, '.', '-' or '+'.
			if (line[tok.startpos] != 'R' || (tok.endpos - tok.startpos) != 1){
				for (size_t i = tok.startpos; i < tok.endpos; ++i){
					if (!(isdigit =
						(std::isdigit(line[i]) || line[i] == '.' || line[i] == '-' || line[i] == '+')) ?
							true : false) break;
				}
				if (!isdigit){
					// Not numeric: put the token back and stop prefetching.
					tok = prev_tok;
					break;
				}else ft = Reference;
			}
		}
		pq.push_back(std::string(reinterpret_cast<const char *>(&line[tok.startpos]), tok.endpos - tok.startpos));
		// Stop once the closing token of the construct has been queued.
		if ((ft == Reference    && pq.back()[0] == 'R') ||
			(ft == Parenthesis  && pq.back()[0] == ')') ||
			(ft == AngleBracket && pq.back()[0] == '>' && pq.back().size() == 1)) break;
	}
	return pq.size() ? true :false;
}

// Pull-style tokenizer. It reads either from a file (`fp`, line by line) or
// from a memory buffer that is loaded into `line` up front (`fp` is then 0).
struct PDF_Tokenizer{
	PrefetchQueue pq;          // tokens read ahead by PDF_Prefetch, not yet returned
	Token tok;                 // cursor into `line`
	Data line;                 // line (or whole buffer) currently being tokenized
	FILE *fp;                  // source file, or 0 for the in-memory mode
	std::vector<size_t> xref;  // offsets read by ParseXRef(), in table order
	PDF_Tokenizer(FILE *fp_) : fp(fp_){
	}
	// Initialisers are listed in declaration order (line before fp).
	PDF_Tokenizer(uint8_t *ptr, size_t size) : line(ptr, ptr + size), fp(0){
	}

	// Locates the cross-reference table through the "startxref" keyword near
	// the end of the file and stores the byte offset of each of its entries in
	// `xref`. The file is always rewound and the tokenizer state (line, tok,
	// pq) reset before returning, whether or not a table was found.
	//
	// Only the first subsection following "xref" is read and its entries are
	// stored from index 0.
	// NOTE(review): the subsection's first object number is ignored, as before -
	// confirm that callers only deal with tables starting at object 0.
	void ParseXRef(){
		if (!fp) return;
		if (std::fseek(fp, 0, SEEK_END) != 0) return;
		const long fend = std::ftell(fp);
		if (fend <= 0){
			std::fseek(fp, 0, SEEK_SET);
			return;
		}
		const size_t fsize = static_cast<size_t>(fend);

		// Search growing blocks taken from the end of the file for "startxref".
		static const char keyword[] = "startxref";
		const char *const keyword_end = keyword + sizeof(keyword) - 1;
		bool positioned = false;
		for (size_t block_size = 30; ; block_size *= 2){
			const size_t want = std::min(block_size, fsize);
			std::vector<char> block(want + 1);   // +1: terminator for strtoul
			if (std::fseek(fp, -static_cast<long>(want), SEEK_END) != 0) break;
			const size_t got = std::fread(&block[0], 1, want, fp);
			block[got] = '\0';
			char *const block_end = &block[0] + got;
			// std::search rather than strstr: the block may contain '\0' bytes.
			char *p = std::search(&block[0], block_end, keyword, keyword_end);
			if (p != block_end){
				p += keyword_end - keyword;
				while (p != block_end && !std::isdigit(static_cast<unsigned char>(*p))) ++p;
				if (p != block_end){
					const unsigned long offset = std::strtoul(p, 0, 10);
					positioned = offset < fsize &&
						std::fseek(fp, static_cast<long>(offset), SEEK_SET) == 0;
				}
				break;
			}
			if (want == fsize) break;   // whole file searched, keyword absent
		}

		if (positioned){
			for (;;){
				std::string curtok;
				if (!PDF_GetNextLine(fp, line)) break;
				// A fresh line needs a fresh cursor and an empty prefetch queue.
				tok = Token();
				pq.clear();
				Do(curtok);
				if (curtok != "xref") continue;
				// Subsection header: "<first object number> <entry count>".
				if (!PDF_GetNextLine(fp, line)) break;
				tok = Token();
				if (!PDF_GetNextToken(line, tok) || !PDF_GetNextToken(line, tok)) break;
				// Copy the token: `line` carries no terminator for strtol to stop at.
				const std::string count_str(
					reinterpret_cast<const char *>(&line[tok.startpos]), tok.endpos - tok.startpos);
				const long xreflines = std::strtol(count_str.c_str(), 0, 10);
				// Every entry takes at least one byte, so the file size bounds the count.
				if (xreflines <= 0 || static_cast<unsigned long>(xreflines) > fsize) break;
				xref.resize(static_cast<size_t>(xreflines));
				for (long i = 0; i < xreflines; ++i){
					if (!PDF_GetNextLine(fp, line)) break;
					const std::string entry(line.begin(), line.end());
					xref[static_cast<size_t>(i)] = std::strtoul(entry.c_str(), 0, 10);
				}
				break;
			}
		}
		std::fseek(fp, 0, SEEK_SET);
		line.clear();
		tok = Token();
		pq.clear();
	}

	// Produces the next token in `curtok`. Returns false when the input is
	// exhausted (`curtok` is then empty).
	//
	// Lines that are empty or start with '%' are skipped. Multi-token
	// constructs collected by PDF_Prefetch are merged into one token:
	// "n g R" references (joined with single spaces) and complete "(...)" /
	// "<...>" strings (joined without separators).
	bool Do(std::string &curtok){
		bool rc = false;
		curtok = "";
		for (;;){
			if (line.size() == 0 || line[0] == '%'){
				if (!PDF_GetNextLine(fp, line)) break;
				tok = Token();
				continue;
			}
			if (!PDF_Prefetch(tok, line, pq)){
				// Nothing was prefetched: take a single plain token.
				if (!PDF_GetNextToken(line, tok)){
					if (!PDF_GetNextLine(fp, line)) break;
					tok = Token();
					continue;
				}
				curtok = std::string(reinterpret_cast<const char *>(&line[tok.startpos]), tok.endpos - tok.startpos);
			}else{
				if (pq.size() == 3 && pq.back()[0] == 'R'){
					// Indirect reference "n g R".
					curtok = pq.front() + " "; pq.pop_front();
					curtok += pq.front() + " "; pq.pop_front();
					curtok += pq.front(); pq.pop_front();
				}else if ((pq.front()[0] == '(' && pq.back()[0] != ')') || (pq.front()[0] == '<' && pq.back()[0] != '>')){
					// The string is not closed on this line: fetch the next line
					// and keep collecting into the same queue.
					if (!PDF_GetNextLine(fp, line)) break;
					tok = Token();
					continue;
				}else if ((pq.front()[0] == '(' && pq.back()[0] == ')') || (pq.front()[0] == '<' && pq.back()[0] == '>')){
					// Complete string: glue all pieces together.
					curtok = pq.front(); pq.pop_front();
					for (; pq.size(); pq.pop_front()) curtok += pq.front();
				}else{
					curtok = pq.front();
					pq.pop_front();
				}
			}
			rc = true;
			break;
		}
		return rc;
	}
};

// Resolves an indirect reference given as text (e.g. "12 0 R") to the entry of
// the `objects` array with that object number.
// Returns 0 when `refstr` is null or empty, does not end in 'R', or names an
// object number outside the array.
rapidjson::Value *PDF_GetObjRef(rapidjson::Value &objects, const char *refstr){
	if (refstr == 0) return 0;
	const size_t len = std::strlen(refstr);
	if (len == 0) return 0;
	if (refstr[len - 1] != 'R') return 0;
	// The leading number of the reference is the object number.
	const int id = std::strtol(refstr, 0, 0);
	const bool in_range = (id >= 0) && (id < static_cast<int>(objects.Size()));
	return in_range ? &objects[id] : 0;
}
// Overload taking the reference as a JSON value; anything that is not a
// string resolves to 0.
rapidjson::Value *PDF_GetObjRef(rapidjson::Value &objects, rapidjson::Value &refstr){
	return refstr.IsString() ? PDF_GetObjRef(objects, refstr.GetString()) : 0;
}
// Returns the "dict" member of a parsed object, or 0 when `obj` is null, is
// not a JSON object, or its "dict" member is not a JSON object.
rapidjson::Value *PDF_GetDict(rapidjson::Value *obj){
	if (obj == 0) return 0;
	if (!obj->IsObject()) return 0;
	rapidjson::Value &candidate = (*obj)["dict"];
	return candidate.IsObject() ? &candidate : 0;
}
// Returns the stream index stored in the "stream" member of a parsed object,
// or -1 when there is none: `obj` is null, is not a JSON object, or has no
// integer "stream" member.
//
// A null/non-object `obj` used to yield 0, which is a valid stream index, so
// callers testing `id < 0` went on to use stream 0 by mistake.
int PDF_GetStreamId(rapidjson::Value *obj){
	using namespace rapidjson;
	if (!obj || !obj->IsObject()) return -1;
	Value &stream = (*obj)["stream"];
	if (!stream.IsInt()) return -1;
	return stream.GetInt();
}

// Looks up `member` in a font dictionary and, when it is absent there, in the
// dictionaries reachable through "/DescendantFonts", searching breadth-first.
//
// Only dictionaries whose "/Type" is "/Font" are considered. Returns the first
// non-null member value found, or 0 when there is none or when `font` /
// `member` is unusable. Every dictionary is visited at most once, so a
// malformed file whose descendant fonts refer back to an ancestor cannot make
// the search loop forever.
rapidjson::Value *PDF_GetType0FontMemberWithTraverse(rapidjson::Value &objects, rapidjson::Value *font, const char *member){
	using namespace rapidjson;
	if (!font || !font->IsObject() || !member) return 0;
	std::deque<Value *> fontstack;
	std::set<Value *> visited;
	fontstack.push_back(font);
	visited.insert(font);
	while(!fontstack.empty()){
		// Breadth-first: take from the front, append descendants at the back.
		Value *curfont = fontstack.front();
		fontstack.pop_front();
		Value &type = (*curfont)["/Type"];
		if (!type.IsString() || std::strcmp(type.GetString(), "/Font") != 0) continue;
		Value &m = (*curfont)[member];
		if (!m.IsNull()) return &m;
		Value &dfs = (*curfont)["/DescendantFonts"];
		if (!dfs.IsArray()) continue;
		for (SizeType i = 0; i < dfs.Size(); ++i){
			Value *df = PDF_GetDict(PDF_GetObjRef(objects, dfs[i]));
			if (df && df->IsObject() && visited.insert(df).second) fontstack.push_back(df);
		}
	}
	return 0;

}

const uint16_t *PDF_GetNormalizedUnicodes(uint16_t uc);

int ON_EncodeUTF8( uint32_t u, uint8_t sUTF8[6] );
void ON_UNF_EncodeUTF8(uint32_t u, UNF::Normalizer &norm, Unicode_Normalize_Type &type, Data &utf8){
	const uint16_t *nus = 0;
	int sz = 0;
	if (u < 0x10000 && (nus = PDF_GetNormalizedUnicodes(static_cast<uint16_t>(u)))){
		utf8.resize(70);
		for (const uint16_t *p = nus; *p; ++p){
			if (utf8.size() < sz + 6) utf8.resize(sz+6);
			sz += ON_EncodeUTF8(*p, &utf8[sz]);
		}
	}else{
		utf8.resize(6);
		sz = ON_EncodeUTF8(u, &utf8[0]);
	}
	if (type == UNT_NONCONVERT){
		utf8.resize(sz);
		return;
	}
	utf8.resize(sz+1);
	utf8[sz] = '\0';
	const char *r;
	switch(type){
		case UNT_NFD:
			r = norm.normalize(reinterpret_cast<const char *>(&utf8[0]), UNF::Normalizer::FORM_NFD);
			break;
		case UNT_NFC:
			r = norm.normalize(reinterpret_cast<const char *>(&utf8[0]), UNF::Normalizer::FORM_NFC);
			break;
		case UNT_NFKD:
			r = norm.normalize(reinterpret_cast<const char *>(&utf8[0]), UNF::Normalizer::FORM_NFKD);
			break;
		case UNT_NFKC:
			r = norm.normalize(reinterpret_cast<const char *>(&utf8[0]), UNF::Normalizer::FORM_NFKC);
			break;
		default:
			return;
	}
	if (r == reinterpret_cast<const char *>(&utf8[0])){
		utf8.pop_back(); return;
	}
	utf8.clear();
	for (const char *p = r; *p; ++p) utf8.push_back(static_cast<uint8_t>(*p));
}

UnicodeTable* PDF_CreateUnicodeTable(UnicodeTablesContainer &utc, Data &stream, UNF::Normalizer &norm, Unicode_Normalize_Type ucnrmtype){
	PDF_Tokenizer tok(&stream[0], stream.size());
	enum State{
		Neutral, CodeSpace, BFRange, BFChar
	}state = Neutral;

//	size_t cs_max = 0;
	struct{
		int pos;
		uint32_t fromto[2];
		int pos_arraystart;
	}bfrange;

	struct{
		int32_t target;
	}bfchar;
	int num_digit = -1;
	std::string digit_buf;

	Data buf;
	utc.push_back(UnicodeTable());
	UnicodeTable &ut = utc.back();
	for (std::string curtok; tok.Do(curtok); ){
		if (curtok.size() == 0) continue;
		if (state == Neutral){
			if (curtok == "begincodespacerange"){
				state = CodeSpace; continue;
			}
			if (curtok == "beginbfrange"){
				num_digit = -1;
				bfrange.pos = 0;
				bfrange.pos_arraystart = -1;
				state = BFRange; continue;
			}
			if (curtok == "beginbfchar"){
				num_digit = -1;
				bfchar.target = -1;
				state = BFChar; continue;
			}
		}else if (state == CodeSpace){
			if (curtok == "endcodespacerange"){
//				ut.resize(cs_max+1);
				state = Neutral; continue;
			}else{
				if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
//				size_t sz = std::strtoul(curtok.substr(1, 4).c_str(), 0, 16);
//				if (cs_max < sz) cs_max = sz;
			}
		}else if (state == BFRange){
			if (curtok == "endbfrange"){
				state = Neutral; continue;
			}else if (bfrange.pos < 0) continue;
			else if (bfrange.pos < 2){
				if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
				if (bfrange.pos == 0) num_digit = curtok.size()-2;
				bfrange.fromto[bfrange.pos++] = std::strtoul(curtok.substr(1, num_digit).c_str(), 0, 16);
			}else if (bfrange.pos == 2){
				if (curtok[0] == '<'){
					if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
					digit_buf.resize(num_digit);
					unsigned long offset = std::strtoul(curtok.substr(1, num_digit).c_str(), 0, 16);
					for (uint32_t i = bfrange.fromto[0]; i <= bfrange.fromto[1]; ++i){
						uint32_t uc = i + offset - bfrange.fromto[0];
						UnicodeTableItem uti(i, num_digit);
						UnicodeTable::iterator iter = std::lower_bound(ut.begin(), ut.end(), uti);
						ON_UNF_EncodeUTF8(uc, norm, ucnrmtype, ut.insert(iter, uti)->second);
					}
					bfrange.pos = 0;
				}else if (curtok == "["){
					bfrange.pos++;
					bfrange.pos_arraystart = bfrange.pos;
				}
			}else if (bfrange.pos_arraystart){
				if (curtok == "]") continue;
				if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
				int index = bfrange.pos - bfrange.pos_arraystart;
				uint32_t i = bfrange.fromto[0] + index;
				if (i > bfrange.fromto[1]) break;
				UnicodeTableItem uti(i, num_digit);
				UnicodeTable::iterator iter = std::lower_bound(ut.begin(), ut.end(), uti);
				Data &l = ut.insert(iter, uti)->second;
				for (size_t j = 0; j < curtok.size() - 2; j += 4){
					uint32_t uc = std::strtoul(curtok.substr(j+1, 4).c_str(), 0, 16);
					ON_UNF_EncodeUTF8(uc, norm, ucnrmtype, buf);
					l.insert(l.end(), buf.begin(), buf.end());
				}
				bfrange.pos++;
			}
		}else if (state == BFChar){
			if (curtok == "endbfchar"){
				state = Neutral; continue;
			}else if (bfchar.target < -1) continue;
			else if (bfchar.target == -1){
				if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
				num_digit = curtok.size()-2;
				bfchar.target = std::strtoul(curtok.substr(1, num_digit).c_str(), 0, 16);
			}else{
				if (curtok[0] != '<' || curtok[curtok.size()-1] != '>') continue; 
				UnicodeTableItem uti(bfchar.target, num_digit);
				UnicodeTable::iterator iter = std::lower_bound(ut.begin(), ut.end(), uti);
				Data &l = ut.insert(iter, uti)->second;
				for (size_t j = 0; j < curtok.size() - 2; j += 4){
					uint32_t uc = std::strtoul(curtok.substr(j+1, 4).c_str(), 0, 16);
					ON_UNF_EncodeUTF8(uc, norm, ucnrmtype, buf);
					l.insert(l.end(), buf.begin(), buf.end());
				}
				bfchar.target = -1;
			}
		}
	}
	return &ut;
}

void get_cidmap_table(const char *ccs_name, uint32_t *&cid2unicode_table_ptr, int &table_size);

// Appends to `utc` a table holding one default-constructed item (empty key,
// empty bytes) and returns a pointer to that table.
UnicodeTable *PDF_CreateEmptyUnicodeTable(UnicodeTablesContainer &utc){
	utc.push_back(UnicodeTable(1));
	return &utc.back();
}

// Builds a table from the CID -> Unicode map that get_cidmap_table() provides
// for `ccs_name`. Table index i becomes the key, rendered with as many hex
// digits as the table size needs; entries whose target is 0 are left out.
//
// Appends the table to `utc` and returns it, or returns 0 (leaving `utc`
// untouched) when no usable map is available.
UnicodeTable *PDF_CreateUnicodeTableOfIdentityEncoding(UnicodeTablesContainer &utc, const char *ccs_name, UNF::Normalizer &norm, Unicode_Normalize_Type ucnrmtype){
	// Initialised so that a lookup which leaves its out-parameters unset reads
	// as "no table" rather than as indeterminate values.
	uint32_t *table_ptr = 0;
	int size = 0;
	get_cidmap_table(ccs_name, table_ptr, size);
	if (!table_ptr || size <= 0) return 0;

	// Number of hex digits needed to write `size`.
	int num_digit = 0;
	for (int sz = size; sz > 0; ++num_digit, sz /= 16);

	utc.push_back(UnicodeTable());
	UnicodeTable &ut = utc.back();
	ut.reserve(static_cast<size_t>(size));
	for (int i = 0; i < size; ++i){
		const uint32_t target = table_ptr[i];
		if (!target) continue;
		// Indices ascend, so appending keeps the keys in ascending order.
		ut.push_back(UnicodeTableItem(static_cast<uint32_t>(i), num_digit));
		ON_UNF_EncodeUTF8(target, norm, ucnrmtype, ut.back().second);
	}
	return &ut;
}

uint32_t GlyphToUnicode(const char *glyph);
int PDF_GetEncodingTable(const char *encname, const char **&table);

// Builds a table for a named encoding: PDF_GetEncodingTable() supplies one
// glyph name per code, and each glyph name is mapped through GlyphToUnicode().
// Keys are the codes as 2-digit hex strings.
//
// `enc_name` may be given with or without the leading '/' of a PDF name.
// Appends the table to `utc` and returns it, or returns 0 (leaving `utc`
// untouched) when `enc_name` is null/empty or no table is known for it.
UnicodeTable *PDF_CreateUnicodeTableFromEncodingName(UnicodeTablesContainer &utc, const char *enc_name, UNF::Normalizer &norm, Unicode_Normalize_Type ucnrmtype){
	if (!enc_name || !enc_name[0]) return 0;
	// Strip the '/' only when it is there; a bare name used to lose its first letter.
	const char *bare_name = (enc_name[0] == '/') ? enc_name + 1 : enc_name;
	const char **table = 0;
	const int sz = PDF_GetEncodingTable(bare_name, table);
	if (sz <= 0 || !table) return 0;

	utc.push_back(UnicodeTable());
	UnicodeTable &ut = utc.back();

	ut.resize(static_cast<size_t>(sz));
	for (int i = 0; i < sz; ++i){
		ut[i] = UnicodeTableItem(static_cast<uint32_t>(i), 2);
		ON_UNF_EncodeUTF8(GlyphToUnicode(table[i]), norm, ucnrmtype, ut[i].second);
	}

	return &ut;
}

// Builds a code -> UTF-8 table from an encoding dictionary.
//
// The table starts from the dictionary's "/BaseEncoding" (one of
// StandardEncoding, MacRomanEncoding, WinAnsiEncoding, MacExpertEncoding;
// "/StandardEncoding" when absent) and is then patched with the
// "/Differences" array: a number sets the current code, and every glyph name
// after it is assigned to the current code, which then advances by one.
// If the base encoding is unknown, codes 0x00-0xFF map to the code points of
// the same value.
//
// `encdict` may be null or a non-object; the base table is returned unpatched
// then. The result is appended to `utc` and is never null.
UnicodeTable *PDF_CreateUnicodeTable(UnicodeTablesContainer &utc, rapidjson::Value *encdict, UNF::Normalizer &norm, Unicode_Normalize_Type ucnrmtype){
	using namespace rapidjson;
	// Validate before the first dereference.
	const bool have_dict = encdict && encdict->IsObject();

	const char *base_enc_name_str = 0;
	if (have_dict){
		Value &base_enc_name = (*encdict)["/BaseEncoding"];
		if (base_enc_name.IsString()) base_enc_name_str = base_enc_name.GetString();
	}
	// The default carries the leading '/' just like names read from the file.
	UnicodeTable *utp = PDF_CreateUnicodeTableFromEncodingName(utc,
		base_enc_name_str ? base_enc_name_str : "/StandardEncoding", norm, ucnrmtype);
	if (!utp){
		utc.push_back(UnicodeTable());
		utp = &utc.back();
		utp->resize(0x100);
		for (uint32_t i = 0; i < 0x100; ++i){
			(*utp)[i] = UnicodeTableItem(i, 2);
			ON_UNF_EncodeUTF8(i, norm, ucnrmtype, (*utp)[i].second);
		}
	}
	UnicodeTable &ut = *utp;
	// Pad to 256 codes. The padding gets real keys ("80", "81", ...) so that the
	// table stays sorted for std::lower_bound; its mapped bytes stay empty.
	for (size_t i = ut.size(); i < 0x100; ++i){
		ut.push_back(UnicodeTableItem(static_cast<uint32_t>(i), 2));
	}

	if (!have_dict) return &ut;

	Value &differences = (*encdict)["/Differences"];
	if (!differences.IsArray() || differences.Size() < 2) return &ut;
	Value &first = differences[static_cast<SizeType>(0)];
	if (!first.IsString() || !std::isdigit(static_cast<unsigned char>(first.GetString()[0]))) return &ut;

	// Codes are decimal integers; base 10 keeps a leading zero from meaning octal.
	uint32_t cc = static_cast<uint32_t>(std::strtoul(first.GetString(), 0, 10));
	uint32_t cc_prev = 0;
	for (SizeType i = 1; i < differences.Size(); ++i){
		Value &v = differences[i];
		if (!v.IsString()) continue;
		const char *s = v.GetString();
		if (std::isdigit(static_cast<unsigned char>(s[0]))){
			cc = static_cast<uint32_t>(std::strtoul(s, 0, 10));
		}else{
			Data *l = 0;
			if (cc < 0x100){
				l = &ut[cc].second;
				l->clear();
			}else{ // not expected to be reached
				UnicodeTableItem uti(cc, 2); // 2 digits should be enough
				l = (cc_prev < cc) ?
					(ut.push_back(uti), &ut.back().second) :
					&ut.insert(std::lower_bound(ut.begin(), ut.end(), uti), uti)->second;
			}
			uint32_t uc = GlyphToUnicode(s);
			ON_UNF_EncodeUTF8(uc, norm, ucnrmtype, *l);
			cc_prev = cc;
			++cc;
		}
	}
	return &ut;
}

// Writes `v` as pretty-printed JSON, followed by a newline, to `filename`.
// Does nothing when `filename` is null; reports to stderr and returns when the
// file cannot be opened.
void DumpObject(rapidjson::Value &v, const char *filename){
	using namespace rapidjson;
	if (!filename) return;
	StringBuffer sb;
	// A named writer: a temporary cannot be bound to the non-const handler
	// reference taken by Accept() in standard C++.
	PrettyWriter<StringBuffer> writer(sb);
	v.Accept(writer);
	FILE *fp = std::fopen(filename, "w");
	if (!fp){
		std::fprintf(stderr, "cannot open '%s' for writing\n", filename);
		return;
	}
	std::fprintf(fp, "%s\n", sb.GetString());
	std::fclose(fp);
}

// Prepares `pdfdoc` for dumping: in every entry of "objects" whose "stream"
// member is a valid index into `streams`, that index is replaced by the stream
// contents when the stream is decoded, or by a placeholder text otherwise.
//
// The string values are set with pointers into `streams`, so `streams` has to
// stay alive and unmodified until the document has been written out.
void PDF_ModifyDocumentToDumpStyle(rapidjson::Value &pdfdoc, Streams &streams){
	using namespace rapidjson;
	Value &objects = pdfdoc["objects"];
	if (!objects.IsArray()) return;
	for (SizeType i = 0; i < objects.Size(); ++i){
		Value &o = objects[i];
		if (!o.IsObject()) continue;
		Value &stream = o["stream"];
		if (!stream.IsInt()) continue;
		const int str_no = stream.GetInt();
		if (str_no < 0 || static_cast<int>(streams.size()) <= str_no) continue;
		StreamData &sd = streams[str_no];
		if (!sd.decoded){
			stream.SetString("<<< Encoded Data >>>");
		}else if (sd.data.empty()){
			// An empty vector has no element 0 to take the address of.
			stream.SetString("", static_cast<SizeType>(0));
		}else{
			stream.SetString(reinterpret_cast<const char *>(&sd.data[0]),
				static_cast<SizeType>(sd.data.size()));
		}
	}
}

int main(int argc, char *argv[]){
	if (argc == 1){
		std::printf("[g]\n extract_pdftext PDFt@C [-T o̓eLXgt@C] [-J JSONt@C]\n");
		return 0;
	}
	const char *pdffile = argv[1];
	const char *textfile = 0;
	const char *jsonfile = 0;
	const char **dest = 0;
	for (int i = 2; i < argc; ++i){
		if (std::strcmp(argv[i], "-T") == 0) dest = &textfile;
		else if (std::strcmp(argv[i], "-J") == 0) dest = &jsonfile;
		else if (dest){
			*dest = argv[i];
			dest = 0;
		}
	}

	using namespace rapidjson;
	// pdfdoc Rei


	UNF::Normalizer norm;
	Unicode_Normalize_Type ucnrmtype = UNT_NONCONVERT;
//	Unicode_Normalize_Type ucnrmtype = UNT_NFD;

	struct OnEnd{
		const char *jsonfile;
		Document pdfdoc;
		Streams streams;
		OnEnd(const char *jsonfile_) : jsonfile(jsonfile_){}
		~OnEnd(){
			if (jsonfile){
				PDF_ModifyDocumentToDumpStyle(pdfdoc, streams);
				DumpObject(pdfdoc, jsonfile);
			}
		}
	}oe(jsonfile);
	Document &pdfdoc = oe.pdfdoc;
	Streams &streams = oe.streams;

	pdfdoc.SetObject();

	// pdfdoc̃XL[}
	// {
	//   "objects": [
	//    { ... },            // 0 0 obj {...} ̒g
	//    { ... },            // 1 0 obj {...} ̒g
	//    { ... },            // 2 0 obj {...} ̒g
	//    ...
	// }

	// e obj ̃XL[}
	// {
	//   "dict": { ... }      // ldict̒g
	//   "stream": ԍ       // Xg[ԍ -1̎͋
	// }

	Value NullVal;
	pdfdoc.AddMember("objects", NullVal, pdfdoc.GetAllocator());
	Value &objects = pdfdoc["objects"];
	objects.SetArray();

	pdfdoc.AddMember("trailers", NullVal, pdfdoc.GetAllocator());
	Value &trailers = pdfdoc["trailers"];
	trailers.SetArray();

	StringPool sp;

	PDF_Tokenizer tok(std::fopen(pdffile, "rb"));
	tok.ParseXRef();

	int obj_no, obj_gen;
	TokenStack tok_stack;
	std::vector<size_t> nest_pos_stack;

	Value dictstack; dictstack.SetArray();

	int stream_no;

	TokenState tok_state_expect = Neutral;

	for(;;){
		std::string curtok;
		if (!tok.Do(curtok)) break;
		tok_stack.push_back(std::make_pair( curtok, Neutral ));

		if (tok_stack.back().first == "stream"){
			Value &v = dictstack[dictstack.Size()-1];
			Value &sizestr = v["/Length"];
			if (sizestr.IsString()){
				const char *ss = sizestr.GetString();
				size_t sz;
				if (ss[sizestr.GetStringLength()-1] == 'R'){
					size_t cpos = std::ftell(tok.fp);
					// Ώۂ̃IuWFNg̓eɓǂ݂ɍs
					bool rc = false;
					size_t objno = std::strtoul(ss, 0, 0);
					for(;;){
						if (tok.xref.size() <= objno) break;
						std::fseek(tok.fp, tok.xref[objno], SEEK_SET);
						PDF_Tokenizer ltok(tok.fp);
						std::vector<std::string> toks(4);
						for (int i = 0; i < 4; ++i){
							ltok.Do(toks[i]);
						}
						if (toks[3][toks[3].size()-1] == 'R'){
							objno = std::strtoul(toks[3].c_str(), 0, 0);
							continue;
						}else{
							sz = std::strtoul(toks[3].c_str(), 0, 0);
							rc = true;
							break;
						}
					}
					if (!rc) break;
					std::fseek(tok.fp, cpos, SEEK_SET);
				}else sz = std::strtoul(ss, 0, 0);
				// zlibfR[ĥݎ
				stream_no = static_cast<int>(streams.size());
				streams.push_back(StreamData()); 
				StreamData &ds = streams.back();
				Value &f = v["/Filter"];
				const char *filter = 0;
				if (f.IsString()) filter = f.GetString();
				else if (f.IsArray() && f.Size() == 1 && f[static_cast<SizeType>(0)].IsString()){
					filter = f[static_cast<SizeType>(0)].GetString();
				}
				if (filter && std::strcmp(filter, "/FlateDecode") == 0){ // zlib̏ꍇ
					ds.filter = ds.FlateDecode;
					std::vector<Bytef> ibuf(sz);
					std::vector<Bytef> obuf(sz * 2);
					std::fread(&ibuf[0], sz, 1, tok.fp); 

					z_stream zstr = {0};
					::inflateInit(&zstr);
					zstr.next_in = &ibuf[0];
					zstr.avail_in = sz;
					int rc = Z_OK;
					for(;rc != Z_STREAM_END;) {
						zstr.next_out = &obuf[0];
						zstr.avail_out = obuf.size();
						rc = ::inflate(&zstr, Z_NO_FLUSH);
						if (rc < 0) break;
						ds.data.insert(ds.data.end(), &obuf[0], zstr.next_out);
					};
					::inflateEnd(&zstr);
					if (rc == Z_STREAM_END) ds.decoded = true;
				}else{
					ds.data.resize(sz);
					std::fread(&ds.data[0], sz, 1, tok.fp); 
					if (!filter) ds.decoded = true;
				}
			}
			continue;
		}
		if (tok_stack.back().first == "endstream"){
			continue;
		}
		if (tok_stack.back().first == "endobj"){
			// nest_pos_stack̃TCY͂̎_1ɂȂĂ͂
			while (objects.Size() <= obj_no) objects.PushBack(NullVal, pdfdoc.GetAllocator());
			Value &v = objects[obj_no];
			v.SetObject();
			v.AddMember("no", obj_no, pdfdoc.GetAllocator());
			v.AddMember("dict", dictstack[dictstack.Size()-1], pdfdoc.GetAllocator());
			if (stream_no >= 0) v.AddMember("stream", stream_no, pdfdoc.GetAllocator());
			dictstack.Empty();
			tok_stack.resize(nest_pos_stack.back());
			nest_pos_stack.pop_back();
			tok_state_expect = Neutral;
			continue;
		}
		if (tok_stack.back().first == "xref"){
			if (!PDF_GetNextLine(tok.fp, tok.line)) break;
			tok.tok = Token();
			PDF_GetNextToken(tok.line, tok.tok);
			PDF_GetNextToken(tok.line, tok.tok);
			int xreflines = std::strtol(reinterpret_cast<const char *>(&tok.line[tok.tok.startpos]), 0, 0);
			for (int i = 0; i < xreflines; ++i){
				if (!PDF_GetNextLine(tok.fp, tok.line)) break;
			}
			tok_stack.pop_back();
			if (!PDF_GetNextLine(tok.fp, tok.line)) break; tok.tok = Token(); // ̍s "trailer" ̂͂
			continue;
		}
		if (tok_stack.back().first == "trailer"){
			dictstack.SetArray();
			dictstack.PushBack(trailers, pdfdoc.GetAllocator());
			tok_state_expect = tok_stack.back().second = DictValue;
			if (!PDF_GetNextLine(tok.fp, tok.line)) break; tok.tok = Token();
			continue;
		}
		if (tok_stack.back().first == "startxref"){
			if (dictstack.Size() >= 1) trailers = dictstack[static_cast<SizeType>(0)];
			dictstack.SetArray();
			tok_stack.pop_back();
			tok_stack.pop_back();
			if (!PDF_GetNextLine(tok.fp, tok.line)) break;
			if (!PDF_GetNextLine(tok.fp, tok.line)) break; tok.tok = Token();
			tok_state_expect = Neutral;
			continue;
		}
		if (tok_state_expect == Neutral && tok_stack.size() >= 3 && tok_stack.back().first == "obj"){
			stream_no = -1;
			nest_pos_stack.push_back(tok_stack.size()-3);
			dictstack.SetArray();
			obj_no = std::strtol(tok_stack[tok_stack.size()-3].first.c_str(), 0, 0);
			obj_gen = std::strtol(tok_stack[tok_stack.size()-2].first.c_str(), 0, 0);
			tok_state_expect = tok_stack.back().second = SomeValue; continue;
		}
		if (tok_state_expect == ArrayItem){
			if (tok_stack.back().first == "]"){
				if (dictstack.Size() >= 2){
					Value &parent = dictstack[dictstack.Size()-2];
					if (parent.IsObject()){
						parent.AddMember(
							sp.Add(tok_stack[nest_pos_stack.back()-1].first),
							dictstack[dictstack.Size()-1],
							pdfdoc.GetAllocator()
						);
						tok_stack.resize(nest_pos_stack.back()-1);
					}else if (parent.IsArray()){
						parent.PushBack(
							dictstack[dictstack.Size()-1],
							pdfdoc.GetAllocator()
						);
						tok_stack.resize(nest_pos_stack.back());
					}
					dictstack.PopBack();
				}else{
					tok_stack.resize(nest_pos_stack.back());
				}
				nest_pos_stack.pop_back();
				tok_state_expect = tok_stack.back().second;
			}else if (tok_stack.back().first == "["){
				nest_pos_stack.push_back(tok_stack.size()-1);
				dictstack.PushBack(NullVal, pdfdoc.GetAllocator());
				dictstack[dictstack.Size()-1].SetArray();
				tok_state_expect = tok_stack.back().second = ArrayItem;
			}else{
				dictstack[dictstack.Size()-1].PushBack(
					sp.Add(tok_stack.back().first), pdfdoc.GetAllocator()
				);
				tok_stack.pop_back();
			}
			continue;
		}
		if (tok_state_expect == DictKey){
			if (tok_stack.back().first == ">>"){
				if (dictstack.Size() >= 2){
					Value &parent = dictstack[dictstack.Size()-2];
					if (parent.IsObject()){
						dictstack[dictstack.Size()-2].AddMember(
							sp.Add(tok_stack[nest_pos_stack.back()-1].first), 
							dictstack[dictstack.Size()-1],
							pdfdoc.GetAllocator()
						);
						tok_stack.resize(nest_pos_stack.back()-1);
					}else if (parent.IsArray()){
						parent.PushBack(
							dictstack[dictstack.Size()-1],
							pdfdoc.GetAllocator()
						);
						tok_stack.resize(nest_pos_stack.back());
					}
					dictstack.PopBack();
				}else{
					tok_stack.resize(nest_pos_stack.back());
				}
				nest_pos_stack.pop_back();
				tok_state_expect = tok_stack.back().second;
			}else{
				tok_state_expect = tok_stack.back().second = DictValue;
			}
			continue;
		}
		if (tok_state_expect == DictValue || tok_state_expect == SomeValue){
			if (tok_stack.back().first == "<<"){
				nest_pos_stack.push_back(tok_stack.size()-1);
				dictstack.PushBack(NullVal, pdfdoc.GetAllocator());
				dictstack[dictstack.Size()-1].SetObject();
				tok_state_expect = tok_stack.back().second = DictKey;
			}else if (tok_stack.back().first == "["){
				nest_pos_stack.push_back(tok_stack.size()-1);
				dictstack.PushBack(NullVal, pdfdoc.GetAllocator());
				dictstack[dictstack.Size()-1].SetArray();
				tok_state_expect = tok_stack.back().second = ArrayItem;
			}else if (tok_state_expect == DictValue && tok_stack.size() >= 2){
				dictstack[dictstack.Size()-1].AddMember(
					sp.Add(tok_stack[tok_stack.size()-2].first), 
					sp.Add(tok_stack.back().first),
					pdfdoc.GetAllocator()
				);
				tok_stack.resize(tok_stack.size()-2);
				tok_state_expect = tok_stack.back().second = DictKey;
			}else{
				dictstack.PushBack(sp.Add(tok_stack.back().first), pdfdoc.GetAllocator());
				tok_state_expect = tok_stack.back().second = SomeValue;
			}
			continue;
		}
	}

	std::fclose(tok.fp);

	/// y[Wɓǂݍ݁AeLXg𒊏oăt@Cɕۑ
	FILE *fp = textfile ? std::fopen(textfile, "w") : 0;
	if (fp){
		UnicodeTablesContainer utc;
		UnicodeTables unicode_tables;
		UnicodeTables::iterator std_ut_iter = unicode_tables.insert(
			std::make_pair("", PDF_CreateUnicodeTableFromEncodingName(utc, "/StandardEncoding", norm, ucnrmtype))).first;
		// trailers  Catalog 擾
		Value *catalog = 0;
		for (SizeType j = 0; j < trailers.Size(); ++j){
			Value &trailer = trailers[j];
			catalog = PDF_GetDict(PDF_GetObjRef(objects, trailer["/Root"]));
			if (catalog) break;
		}
		if (!catalog) return 1;

		// Catalog  Pages 擾
		Value *pages = PDF_GetDict(PDF_GetObjRef(objects, (*catalog)["/Pages"]));
		if (!pages) return 1;

		// Pages  Page [DTŎ擾
		std::vector<Value *> pages_stack;
		pages_stack.push_back(pages);
		int page = 0;
		while(pages_stack.size()){
			Value &curpages = *pages_stack.back();
			pages_stack.pop_back();
			Value &type = curpages["/Type"];
			if (!type.IsString()) continue;
			if (std::strcmp(type.GetString(), "/Page") == 0){
				Value &contentsval = curpages["/Contents"];
				std::vector<Value *> contentsrefs;
				if (contentsval.IsString()) contentsrefs.push_back(&contentsval);
				else if (contentsval.IsArray()){
					for (SizeType i = 0; i < contentsval.Size(); ++i){
						contentsrefs.push_back(&contentsval[i]);
					}
				}
				for (size_t j = 0; j < contentsrefs.size(); ++j){
					Value *contentsobj = PDF_GetObjRef(objects, *contentsrefs[j]);
					Value *contents = PDF_GetDict(contentsobj);
					int strid = PDF_GetStreamId(contentsobj);
					if (!contents) continue;

					// y[W
					// c[ɒHB
					std::vector<Value *> ancestors;
					Value *ancestor = &curpages;
					while(ancestor){
						ancestors.push_back(ancestor);
						ancestor = PDF_GetDict(PDF_GetObjRef(objects, (*ancestor)["/Parent"]));
					}

					// ancestorsォ珇ɒHAFonts̃RNV쐬B
					std::map<std::string, Value *> fontsmap;
					for (size_t i = 0; i < ancestors.size(); ++i){
						size_t ri = ancestors.size() - i - 1;
						Value *rec = &(*ancestors[ri])["/Resources"];
						if (!rec) continue;
						if (rec->IsString()){
							rec = PDF_GetDict(PDF_GetObjRef(objects, *rec));
						}else if (!rec->IsObject()) continue;
						Value &fonts = (*rec)["/Font"];
						Value *fontsdict = &fonts;
						if (!fonts.IsObject() && !(fontsdict = PDF_GetDict(PDF_GetObjRef(objects, fonts)))) continue;
						for (Value::Member *iter = fontsdict->MemberBegin(); iter != fontsdict->MemberEnd(); ++iter){
							if (!iter->name.IsString()) continue;
							fontsmap.insert(std::make_pair(
								std::string(reinterpret_cast<const char *>(iter->name.GetString())),
								PDF_GetDict(PDF_GetObjRef(objects, iter->value))));
						}
					}

					Data &str = streams[strid].data;
					PDF_Tokenizer tok(&str[0], str.size());
					std::string curtok;
					std::printf("**** Page:%d ****\n", page++);
					enum State{
						Unknown, In_BT, In_Bracket
					}state = Unknown;
					UnicodeTables::iterator iter = std_ut_iter;
					while(tok.Do(curtok)){
						Value *font = 0;
						if (curtok.size() && curtok[0] == '/' && (font = fontsmap[curtok])){
							// Font selection: look up (or create) the Unicode table for this font.
		//					std::printf("font: %s\n", curtok.c_str());
							iter = unicode_tables.find(curtok);
							do{
								if (iter != unicode_tables.end()) break;
								Value *tounicode = PDF_GetType0FontMemberWithTraverse(objects, font, "/ToUnicode");
								UnicodeTable *tr;
								if (tounicode){
									int strid = PDF_GetStreamId(PDF_GetObjRef(objects, *tounicode));
									if (strid < 0) break;
									tr = PDF_CreateUnicodeTable(utc, streams[strid].data, norm, ucnrmtype);
								}else{
									Value *encdict = 0;
									Value *encoding = PDF_GetType0FontMemberWithTraverse(objects, font, "/Encoding");
									if (!encoding) break;
									if (encoding->IsObject()) encdict = encoding;
									else if (!encoding->IsString()) break;

									std::string encoding_str = encoding->GetString();
			//						std::printf("encoding: %s\n", encoding_str.c_str());

									StringBuffer sb;
									if (encoding_str.size() && encoding_str[0] != '/' && encoding_str[encoding_str.size()-1] == 'R'){
										if (!(encdict = PDF_GetDict(PDF_GetObjRef(objects, encoding_str.c_str())))) break;
										tr = PDF_CreateUnicodeTable(utc, encdict, norm, ucnrmtype);
									}else if (encoding_str == "/Identity-H" || encoding_str == "/Identity-V"){
										Value *cidsi = PDF_GetType0FontMemberWithTraverse(objects, font, "/CIDSystemInfo");
										if (!cidsi || !cidsi->IsObject()) break;
										Value &reg = (*cidsi)["/Registry"];
										Value &ord = (*cidsi)["/Ordering"];
										if (!reg.IsString() || !ord.IsString()) break;
										std::string ccs = reg.GetString();
										ccs += "-", ccs += ord.GetString();
										for (size_t si = 0; si < ccs.size(); ++si){
											size_t rsi = ccs.size() - si - 1;
											if (ccs[rsi] =='(' || ccs[rsi] == ')'){
												ccs.erase(ccs.begin() + rsi);
											}
										}
										tr = PDF_CreateUnicodeTableOfIdentityEncoding(utc, ccs.c_str(), norm, ucnrmtype);
									}else if ((tr = PDF_CreateUnicodeTableFromEncodingName(utc, encoding_str.c_str(), norm, ucnrmtype)) == 0){
										break;
									}
								}
								iter = unicode_tables.insert(std::make_pair(curtok, tr)).first;
							}while(0);
							if (iter == unicode_tables.end()) iter = std_ut_iter;
						}else if (state == Unknown && curtok == "BT"){
							state = In_BT;
						}else if (state == In_BT){
							if (curtok == "ET") state = Unknown;
							else if (curtok == "[") state = In_Bracket;
							else if (curtok == "TD" || curtok == "Td" || curtok == "Tm") std::fprintf(fp, "\n");
						}else if (state == In_Bracket){
							if (curtok.size() == 0) continue;
							if (curtok[0] == '-' && std::strtol(curtok.c_str()+1, 0, 0) >= 100){
								// A negative adjustment of magnitude 100 or more is treated as a space.
								std::fprintf(fp, " ");
							}else if (curtok[0] == ']') state = In_BT;
		//					std::fwrite(curtok.data(), curtok.size(), 1, stdout);
						}
						if ((state == In_BT || state == In_Bracket)){
							if (curtok[0] == '<'){
								UnicodeTable *tr = iter->second;
								for (const char *p = curtok.c_str() + 1; p && *p != '>'; ){
									int v = PDF_FindUnicodeTableItemAndForwardPtr(*tr, p);
									if (v >= 0) std::fwrite(&(*tr)[v].second[0], (*tr)[v].second.size(), 1, fp);
									else std::fprintf(fp, ".");
								}
							}else if(curtok[0] == '('){
								UnicodeTable *tr = iter->second;
								int yen_escape = -1;
								char yen_char[4] = {'\0', '\0', '\0', '\0'};
								for (size_t i = 1; i < curtok.size() - 1; ++i){
									if (yen_escape < 0 && curtok[i] == '\\'){
										yen_escape = 0; continue;
									}else if (yen_escape >= 0){
										int d = curtok[i] - '0';
										bool in_octal_range = (d >= 0 && d < 8);
										if (in_octal_range) yen_char[yen_escape++] = curtok[i];
										if (yen_escape == 3 || !in_octal_range){
											yen_escape = -1;
											char buf[3] = {0};
											std::sprintf(buf, "%02X", std::strtoul(yen_char, 0, 8));
											const char *p = buf;
											int idx = PDF_FindUnicodeTableItemAndForwardPtr(*tr, p);
											if (idx >= 0){
												const uint8_t *s = &((*tr)[idx].second[0]);
												size_t slen = (*tr)[idx].second.size();
												std::fwrite(s, slen, 1, fp);
											}else std::fprintf(fp, ".");
										}
										continue;
									}
									char buf[3] = {0};
									std::sprintf(buf, "%02X", static_cast<int>(curtok[i]));
									const char *p = buf;
									int idx = PDF_FindUnicodeTableItemAndForwardPtr(*tr, p);
									if (idx >= 0)
										std::fwrite(&((*tr)[idx].second[0]), (*tr)[idx].second.size(), 1, fp);
									else std::fprintf(fp, ".");
								}
							}
						}
		//				std::printf("\n");
					}
				}
			}else if (std::strcmp(type.GetString(), "/Pages") != 0) continue;

			Value &kids = curpages["/Kids"];
			if (!kids.IsArray()) continue;
			SizeType ksz = kids.Size();
			for (SizeType j = 0; j < ksz; ++j){
				SizeType rj = ksz - j - 1;
				Value *kid = PDF_GetDict(PDF_GetObjRef(objects, kids[rj]));
				pages_stack.push_back(kid);
			}
		}

		std::fclose(fp);
	}
	
	return 0;
}
