package jp.ac.dendai.cdl.mori.wikie.io;

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

/**
 * Record reader that groups the lines delivered by {@link LineRecordReader}
 * into one record per {@code <page>...</page>} element.
 *
 * <p>A record starts at a line consisting only of optional leading spaces
 * followed by {@code <page>} and ends at the next line consisting only of
 * optional leading spaces followed by {@code </page>}. Every line of the
 * record, including both tag lines, is kept and terminated with
 * {@code "\r\n"}. Lines outside such an element are skipped.
 *
 * <p>The current key is not overridden here, so it is whatever the parent
 * reader reports for the last line consumed.
 *
 * <p>NOTE(review): an element whose closing tag is not delivered by the
 * parent reader (for example because it lies beyond the end of this input
 * split) is discarded, not emitted as a partial record. Confirm that the
 * input format in use does not split files in the middle of a page.
 */
public class WikipediaXmlRecordReader extends LineRecordReader {

    /** Name of the XML element whose span forms one record. */
    private static final String TAG = "page";

    /** Matches the line that opens a record; compiled once instead of per line. */
    private static final Pattern OPEN_TAG = Pattern.compile(" *<" + TAG + ">");

    /** Matches the line that closes a record; compiled once instead of per line. */
    private static final Pattern CLOSE_TAG = Pattern.compile(" *</" + TAG + ">");

    /** Terminator appended to every line collected into a record. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Text of the record most recently assembled, or {@code null} if there is none. */
    private String currentValue = null;

    /**
     * Initializes the underlying line reader for the given split.
     *
     * @param genericSplit the split to read
     * @param context the task attempt context
     * @throws IOException if the parent reader fails to initialize
     */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        super.initialize(genericSplit, context);
    }

    /**
     * Advances to the next complete {@code <page>} element.
     *
     * @return {@code true} if a complete element was assembled and is
     *         available from {@link #getCurrentValue()}; {@code false} once
     *         the parent reader has no more lines
     * @throws IOException if reading a line from the parent reader fails
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        StringBuilder page = new StringBuilder();
        boolean withinTarget = false;
        while (super.nextKeyValue()) {
            String line = super.getCurrentValue().toString();
            if (withinTarget) {
                page.append(line).append(LINE_SEPARATOR);
                if (CLOSE_TAG.matcher(line).matches()) {
                    currentValue = page.toString();
                    return true;
                }
            } else if (OPEN_TAG.matcher(line).matches()) {
                page.append(line).append(LINE_SEPARATOR);
                withinTarget = true;
            }
        }
        // Input exhausted: drop the previous record so that a later call to
        // getCurrentValue() cannot hand out a stale page.
        currentValue = null;
        return false;
    }

    /**
     * Returns the text of the current record.
     *
     * @return a new {@link Text} holding the current record, or {@code null}
     *         when no record is available (before the first successful
     *         {@link #nextKeyValue()} or after the input is exhausted)
     */
    @Override
    public Text getCurrentValue() {
        // Guard against wrapping null, which would fail inside Text.
        if (currentValue == null) {
            return null;
        }
        return new Text(currentValue);
    }

}
