package jp.ac.dendai.cdl.mori.wikie.mapred;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import jp.ac.dendai.cdl.mori.wikie.WikIE;
import jp.ac.dendai.cdl.mori.wikie.util.WLinkUtils;
import jp.ac.dendai.cdl.mori.wikie.util.WNormalizer;
import jp.ac.dendai.cdl.mori.wikie.util.WikipediaDump;

import org.apache.commons.digester.BeanPropertySetterRule;
import org.apache.commons.digester.Digester;
import org.apache.commons.digester.ObjectCreateRule;
import org.apache.commons.digester.SetPropertiesRule;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Base class for Hadoop mappers that process Wikipedia dump pages.
 *
 * <p>During {@link #setup(Context)} it prepares three things for subclasses:
 * a Digester that turns one page element into a {@link WPageElement}
 * (see {@link #parsePageElement(Text)}), a {@link WNormalizer} built from the
 * namespaces declared in the input file and from the interwiki data, and a
 * {@link WLinkUtils} bound to that normalizer.</p>
 *
 * <p>The location of the interwiki data is read from the job configuration
 * property {@code "interwiki"}.</p>
 */
public abstract class WikipediaMapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

    /** Name of the configuration property holding the path of the interwiki data. */
    private static final String INTERWIKI_CONF_KEY = "interwiki";

    /** Matches one namespace declaration line; group 1 is the key, group 2 the namespace name. */
    private static final Pattern NAMESPACE_LINE =
            Pattern.compile("^ *<namespace *key=\"(.+?)\".*?>(.+?)</namespace> *$");

    /** Matches the line that closes the namespace declarations. */
    private static final Pattern NAMESPACES_END = Pattern.compile("^ *</namespaces> *$");

    /** Parser for page elements; created in {@link #setup(Context)}. */
    private Digester digester = null;

    /** Title/namespace normalizer; created in {@link #setup(Context)}. */
    protected WNormalizer normalizer = null;

    /** Link helper bound to {@link #normalizer}; created in {@link #setup(Context)}. */
    protected WLinkUtils linkUtils = null;

    public WikipediaMapper() {
        super();
    }

    /**
     * Builds the page parser, the normalizer and the link utilities for this task.
     *
     * <p>The namespace table is read from the beginning of the file that backs the
     * current input split, and the language/project prefixes are loaded from the
     * file named by the {@code "interwiki"} configuration property.</p>
     *
     * @param context the task context; its input split must be a {@link FileSplit}
     * @throws IOException if the input file or the interwiki data cannot be read
     * @throws IllegalArgumentException if the {@code "interwiki"} property is not set
     */
    @Override
    protected void setup(Context context) throws IOException ,InterruptedException {
        System.out.println("start setup");
        digester = createPageDigester();

        // Resolve the filesystem from each path instead of assuming the default
        // filesystem, so inputs that live on another filesystem can be opened too.
        Path splitPath = ((FileSplit) context.getInputSplit()).getPath();
        FileSystem splitFs = splitPath.getFileSystem(context.getConfiguration());
        // createNsMap() closes the stream it is given.
        InputStream is = splitFs.open(splitPath);
        Map<String, Integer> originalNsMap = createNsMap(is);

        // The normalizer gets its own copy of the name -> key table.
        Map<String, Integer> nsMap = new HashMap<String, Integer>(originalNsMap);

        String interwikiLocation = context.getConfiguration().get(INTERWIKI_CONF_KEY);
        if (interwikiLocation == null) {
            throw new IllegalArgumentException(
                    "Configuration property \"" + INTERWIKI_CONF_KEY + "\" is not set");
        }
        Path interwiki = new Path(interwikiLocation);
        FileSystem interwikiFs = interwiki.getFileSystem(context.getConfiguration());

        System.out.println("start db");
        Set<String> projectSet = createProjectSet(interwikiFs, interwiki);
        Set<String> langSet = createLangSet(interwikiFs, interwiki);
        System.out.println("end db");

        // Inverse table: namespace key -> (lower-cased) namespace name.
        Map<Integer, String> normalizeMap = new HashMap<Integer, String>();
        for (Map.Entry<String, Integer> entry : originalNsMap.entrySet()) {
            normalizeMap.put(entry.getValue(), entry.getKey());
        }

        normalizer = new WNormalizer(nsMap, langSet, projectSet);
        normalizer.setOrignalNsMap(normalizeMap);
        linkUtils = new WLinkUtils(normalizer);
        System.out.println("end setup");
    }

    /**
     * Creates the Digester that maps a page element onto a {@link WPageElement}.
     */
    private static Digester createPageDigester() {
        String page = WikIE.PAGE_ELEMENT;
        String revision = page + "/" + WikIE.REVISION_ELEMENT;
        String contributor = revision + "/" + WikIE.CONTRIBUTOR_ELEMENT;

        Digester pageDigester = new Digester();
        pageDigester.addRule(page, new ObjectCreateRule(WPageElement.class));
        pageDigester.addRule(page, new SetPropertiesRule());
        pageDigester.addRule(page + "/" + WikIE.TITLE_ELEMENT, new BeanPropertySetterRule("title"));
        pageDigester.addRule(page + "/" + WikIE.ID_ELEMENT, new BeanPropertySetterRule("pageId"));
        pageDigester.addRule(revision + "/" + WikIE.ID_ELEMENT, new BeanPropertySetterRule("revisionId"));
        pageDigester.addRule(revision + "/" + WikIE.TIMESTAMP_ELEMENT, new BeanPropertySetterRule("timestamp"));
        // Both the user name element and the IP element feed the "username" property.
        pageDigester.addRule(contributor + "/" + WikIE.USERNAME_ELEMENT, new BeanPropertySetterRule("username"));
        pageDigester.addRule(contributor + "/" + WikIE.IP_ELEMENT, new BeanPropertySetterRule("username"));
        pageDigester.addRule(contributor + "/" + WikIE.ID_ELEMENT, new BeanPropertySetterRule("userId"));
        pageDigester.addRule(revision + "/" + WikIE.TEXT_ELEMENT, new BeanPropertySetterRule("text"));
        return pageDigester;
    }

    /**
     * Reads the namespace declarations from the head of a dump.
     *
     * <p>Lines are read as UTF-8 until a closing {@code </namespaces>} line or the end
     * of the stream. Every line of the form
     * {@code <namespace key="K" ...>Name</namespace>} contributes an entry mapping the
     * lower-cased name to the integer key; other lines (including namespace elements
     * without a body) are ignored.</p>
     *
     * @param is stream positioned at the start of the dump; it is closed before this
     *           method returns
     * @return a new mutable map from lower-cased namespace name to namespace key
     * @throws IOException if reading or closing the stream fails
     * @throws NumberFormatException if a matched key is not an integer
     */
    public static Map<String, Integer> createNsMap(InputStream is) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        try {
            Map<String, Integer> result = new HashMap<String, Integer>();
            String line;
            while ((line = reader.readLine()) != null) {
                if (NAMESPACES_END.matcher(line).matches()) {
                    break;
                }
                Matcher matcher = NAMESPACE_LINE.matcher(line);
                if (matcher.matches()) {
                    result.put(matcher.group(2).toLowerCase(), Integer.parseInt(matcher.group(1)));
                }
            }
            return result;
        } finally {
            // Closing the reader also closes the underlying stream.
            reader.close();
        }
    }

    /**
     * Loads the language prefixes from the interwiki data.
     *
     * @throws IOException if the file cannot be read or the dump cannot be loaded
     */
    private Set<String> createLangSet(FileSystem fs, Path interwiki) throws IOException {
        return loadPrefixSet(fs, interwiki, true);
    }

    /**
     * Loads the project prefixes from the interwiki data.
     *
     * @throws IOException if the file cannot be read or the dump cannot be loaded
     */
    private Set<String> createProjectSet(FileSystem fs, Path interwiki) throws IOException {
        return loadPrefixSet(fs, interwiki, false);
    }

    /**
     * Opens the interwiki file, loads it into a {@link WikipediaDump} and returns either
     * its language prefixes or its project prefixes.
     *
     * <p>A {@link SQLException} while loading is reported as an {@link IOException}
     * (with the original exception as its cause) rather than being swallowed: an empty
     * prefix set would otherwise be handed to the normalizer without any failure.</p>
     *
     * @param languagePrefixes {@code true} for language prefixes, {@code false} for
     *                         project prefixes
     */
    private Set<String> loadPrefixSet(FileSystem fs, Path interwiki, boolean languagePrefixes) throws IOException {
        Set<String> prefixes = new HashSet<String>();
        InputStream in = null;
        WikipediaDump dump = null;
        try {
            dump = new WikipediaDump();
            in = fs.open(interwiki);
            dump.setUp(in);
            if (languagePrefixes) {
                prefixes.addAll(dump.getLanguagePrefixSet());
            } else {
                prefixes.addAll(dump.getProjectPrefixSet());
            }
        } catch (SQLException e) {
            throw new IOException("Failed to load interwiki prefixes from " + interwiki, e);
        } finally {
            try {
                if (dump != null) {
                    // A failure while closing does not invalidate what was already read.
                    try {dump.close();} catch (SQLException e) {e.printStackTrace();}
                }
            } finally {
                if (in != null) {
                    // Close after the dump; a second close of an already closed stream is harmless.
                    try {in.close();} catch (IOException ignored) { /* nothing left to recover */ }
                }
            }
        }
        return prefixes;
    }

    /**
     * Loads project prefixes from the bundled classpath resource.
     * Not called from within this class.
     */
    private Set<String> createProjectSet() throws IOException {
        return createRecordSet(WikIE.class.getResourceAsStream("/" + WikIE.PROJECT_DAT));
    }

    /**
     * Loads language prefixes from the bundled classpath resource.
     * Not called from within this class.
     */
    private Set<String> createLangSet() throws IOException {
        return createRecordSet(WikIE.class.getResourceAsStream("/" + WikIE.LANG_DAT));
    }

    /**
     * Reads one record per line from the stream, trimming each line and skipping
     * lines that are empty after trimming.
     *
     * @param is the stream to read; closed before this method returns
     * @throws IOException if {@code is} is {@code null} (resource not found) or reading fails
     */
    private Set<String> createRecordSet(InputStream is) throws IOException {
        if (is == null) {
            throw new IOException("Record resource not found on the classpath");
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, WikIE.UTF8));
        try {
            Set<String> result = new HashSet<String>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Trim before testing so whitespace-only lines do not add an empty record.
                String record = line.trim();
                if (StringUtils.isNotEmpty(record)) {
                    result.add(record);
                }
            }
            return result;
        } finally {
            reader.close();
        }
    }

    /**
     * Parses the XML of a single page into a {@link WPageElement}.
     *
     * @param text the XML source of one page element
     * @return the object produced by the Digester rules installed in {@link #setup(Context)}
     * @throws IOException if reading the text fails
     * @throws SAXException if the text is not well-formed XML
     * @throws IllegalStateException if {@link #setup(Context)} has not been run
     */
    protected WPageElement parsePageElement(Text text) throws IOException, SAXException {
        if (digester == null) {
            throw new IllegalStateException("setup(Context) must run before parsePageElement(Text)");
        }
        return (WPageElement)digester.parse(new InputSource(new StringReader(text.toString())));
    }

}
