package jp.ac.dendai.cdl.mori.wikie.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import jp.ac.dendai.cdl.mori.wikie.WikIE;

/**
 * Loads the {@code interwiki} table of a Wikipedia SQL dump into an in-memory
 * H2 database and splits its prefixes into two groups: those whose URL points
 * at a {@code *.wikipedia.org} wiki and all the others.
 *
 * <p>Typical use: construct, call {@link #setUp(InputStream)} once, query with
 * {@link #getLanguagePrefixSet()} / {@link #getProjectPrefixSet()}, then
 * {@link #close()}.
 */
public class WikipediaDump {

    /** JDBC driver class loaded before connecting. */
    private static final String H2_DRIVER = "org.h2.Driver";

    /** URL of the anonymous in-memory database backing each instance. */
    private static final String JDBC_URL = "jdbc:h2:mem:";

    /** A dump line is executed only when the whole line matches this pattern. */
    private static final Pattern INSERT_LINE = Pattern.compile("INSERT INTO .*?;");

    /**
     * URL template of a Wikipedia language edition.
     * NOTE(review): only {@code http://} URLs match; rows stored with
     * {@code https://} would be reported as project prefixes - confirm against
     * the dumps this is run on.
     */
    private static final Pattern WIKIPEDIA_URL =
            Pattern.compile("http://.*?\\.wikipedia\\.org/wiki/\\$1");

    /** Query shared by both prefix getters. */
    private static final String SELECT_PREFIX_AND_URL =
            "SELECT `iw_prefix`, `iw_url` FROM `interwiki`";

    /** Dump file read by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_DUMP_PATH =
            "D:\\data\\wikipedia\\jawiki-latest-interwiki.sql";

    private final Connection conn;

    /**
     * Opens a connection to a fresh in-memory H2 database.
     *
     * @throws SQLException if the connection cannot be opened; when the H2
     *         driver class could not be loaded either, that failure is
     *         attached as the cause
     */
    public WikipediaDump() throws SQLException {
        this.conn = openConnection();
    }

    /**
     * Loads the H2 driver and opens the connection.
     *
     * <p>A failed driver load is not fatal on its own, because
     * {@code DriverManager} may still find a suitable driver. If the
     * connection then fails as well, the load failure is reported as the
     * cause instead of being lost.
     */
    private static Connection openConnection() throws SQLException {
        ClassNotFoundException driverLoadFailure = null;
        try {
            Class.forName(H2_DRIVER);
        } catch (ClassNotFoundException e) {
            driverLoadFailure = e;
        }
        try {
            return DriverManager.getConnection(JDBC_URL);
        } catch (SQLException e) {
            if (driverLoadFailure == null) {
                throw e;
            }
            SQLException wrapped = new SQLException(
                    "Could not load JDBC driver " + H2_DRIVER + ": " + e.getMessage(),
                    e.getSQLState(), e.getErrorCode());
            wrapped.initCause(driverLoadFailure);
            wrapped.setNextException(e);
            throw wrapped;
        }
    }

    /**
     * Creates the {@code interwiki} table and fills it from the given dump.
     *
     * <p>The stream is read to the end and closed before any SQL is executed,
     * so it is released even when table creation or the inserts fail.
     *
     * @param interwiki the interwiki SQL dump; always closed by this method
     * @throws SQLException if creating or filling the table fails
     * @throws IOException if the dump cannot be read
     */
    public void setUp(InputStream interwiki) throws SQLException, IOException {
        String insertSql = extractInsertSql(interwiki);
        executeUpdate(createSql_createTableInterWiki());
        executeUpdate(insertSql);
    }

    /** Runs one update statement and releases the JDBC statement afterwards. */
    private void executeUpdate(String sql) throws SQLException {
        Statement stmt = null;
        try {
            stmt = conn.createStatement();
            stmt.executeUpdate(sql);
        } finally {
            closeQuietly(stmt);
        }
    }

    /**
     * Concatenates every line of the dump that matches {@link #INSERT_LINE}.
     * The lines are joined without a separator. The stream is always closed.
     *
     * @param is the dump, decoded with the charset named by {@code WikIE.UTF8}
     * @return the concatenated INSERT statements, empty if no line matched
     * @throws IOException if reading fails
     */
    private String extractInsertSql(InputStream is) throws IOException {
        StringBuilder result = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(is, Charset.forName(WikIE.UTF8)));
            String line;
            while ((line = reader.readLine()) != null) {
                if (INSERT_LINE.matcher(line).matches()) {
                    result.append(line);
                }
            }
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Best effort: a failed close must not mask the real outcome.
                    e.printStackTrace();
                }
            }
        }
        return result.toString();
    }

    /**
     * Returns the DDL for the {@code interwiki} table.
     *
     * <p>The dump's own definition declares {@code iw_prefix} and
     * {@code iw_url} as {@code varchar(...) binary}; the {@code binary}
     * modifier is omitted here.
     */
    private String createSql_createTableInterWiki() {
        return "CREATE TABLE `interwiki` ("
                + "`iw_prefix` varchar(32) NOT NULL DEFAULT '', "
                + "`iw_url` varchar(127) NOT NULL DEFAULT '', "
                + "`iw_local` tinyint(1) NOT NULL DEFAULT '0', "
                + "`iw_trans` tinyint(1) NOT NULL DEFAULT '0', "
                + "UNIQUE KEY `iw_prefix` (`iw_prefix`)"
                + ") ";
    }

    /**
     * Returns the prefixes whose URL matches the Wikipedia URL template
     * ({@code http://<anything>.wikipedia.org/wiki/$1}).
     *
     * @return a new mutable set, empty if nothing matches
     * @throws SQLException if the query fails
     */
    public Set<String> getLanguagePrefixSet() throws SQLException {
        return selectPrefixes(true);
    }

    /**
     * Returns the prefixes whose URL does not match the Wikipedia URL
     * template, i.e. the complement of {@link #getLanguagePrefixSet()}.
     *
     * @return a new mutable set, empty if nothing matches
     * @throws SQLException if the query fails
     */
    public Set<String> getProjectPrefixSet() throws SQLException {
        return selectPrefixes(false);
    }

    /**
     * Collects the prefixes of all rows whose URL does or does not match
     * {@link #WIKIPEDIA_URL}.
     *
     * @param wikipediaUrl {@code true} to keep matching rows, {@code false}
     *        to keep non-matching rows
     */
    private Set<String> selectPrefixes(boolean wikipediaUrl) throws SQLException {
        Set<String> result = new HashSet<String>();
        Statement stmt = null;
        ResultSet rs = null;
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery(SELECT_PREFIX_AND_URL);
            while (rs.next()) {
                String prefix = rs.getString(1);
                String url = rs.getString(2);
                if (WIKIPEDIA_URL.matcher(url).matches() == wikipediaUrl) {
                    result.add(prefix);
                }
            }
        } finally {
            closeQuietly(rs);
            closeQuietly(stmt);
        }
        return result;
    }

    /** Closes a result set, printing (not propagating) any failure. */
    private static void closeQuietly(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes a statement, printing (not propagating) any failure. */
    private static void closeQuietly(Statement stmt) {
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Closes the database connection.
     *
     * @throws SQLException if closing the connection fails
     */
    public void close() throws SQLException {
        if (conn != null) {
            conn.close();
        }
    }

    /**
     * Prints the language prefixes and their count, then the project prefixes
     * and their count, for one interwiki dump.
     *
     * @param args optional; {@code args[0]} is the path of the dump file,
     *        defaulting to {@link #DEFAULT_DUMP_PATH}
     */
    public static void main(String[] args) {
        String dumpPath = (args != null && args.length > 0) ? args[0] : DEFAULT_DUMP_PATH;
        WikipediaDump db2 = null;
        try {
            db2 = new WikipediaDump();
            db2.setUp(new FileInputStream(new File(dumpPath)));

            Set<String> langs = db2.getLanguagePrefixSet();
            for (String prefix : langs) {
                System.out.println(prefix);
            }
            System.out.println(langs.size());

            Set<String> projs = db2.getProjectPrefixSet();
            for (String prefix : projs) {
                System.out.println(prefix);
            }
            System.out.println(projs.size());
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (db2 != null) {
                try {
                    db2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
