/*
 * Decompiled with CFR 0.152.
 */
package com.nexwave.nquindexer;

import com.nexwave.nquindexer.SaxDocFileParser;
import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Extracts the searchable words of a document and records them in a shared
 * dictionary that maps each token to a comma-separated list of document numbers.
 *
 * <p>Usage: construct the indexer (optionally with custom stop words and
 * punctuation), call {@link #init(Map)} once with the dictionary to fill, then
 * call {@link #runExtractData(File, String)} once per document. Documents are
 * numbered 0, 1, 2, ... in the order in which they are processed.
 *
 * <p>The entries of the custom stop-word and punctuation lists are inserted
 * verbatim into regular expressions, so they must be valid regex fragments.
 *
 * <p>Instances keep mutable state (the document counter and the inherited parse
 * buffer); nothing in this class synchronizes access to it.
 */
public class SaxHTMLIndex
extends SaxDocFileParser {
    /** Built-in stop-word regex, used when no custom stop-word list was supplied. */
    private static final String DEFAULT_STOP_WORDS =
            "(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b"
            + "|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b"
            + "|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b"
            + "|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b"
            + "|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"
            + "|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"
            + "|\\bI\\b|\\bme\\b|\\bmy\\b";

    /**
     * Matches the vendor copyright line in already lower-cased text. The symbol
     * between "copyright" and the years is matched as any run of non-whitespace
     * characters because the literal in the original source was mis-encoded.
     */
    private static final String COPYRIGHT_NOTICE =
            "copyright\\s+\\S+\\s+1998-2007\\s+nexwave\\s+solutions.";

    /** Token to "n1,n2,..." document-number list; supplied by the caller through init. */
    private Map<String, String> tempDico;
    /** Number assigned to the next document passed to runExtractData. */
    private int i = 0;
    /** Custom stop-word regex fragments, or null to use the built-in list. */
    private ArrayList<String> cleanUpList = null;
    /** Custom punctuation regex fragments, or null for none. */
    private ArrayList<String> cleanUpPunctuation = null;

    /** Creates an indexer that uses the built-in stop words and punctuation only. */
    public SaxHTMLIndex() {
    }

    /**
     * Creates an indexer with a custom stop-word list.
     *
     * @param cleanUpStrings regex fragments to remove as whole words; a null or
     *                       empty list selects the built-in stop words
     */
    public SaxHTMLIndex(ArrayList<String> cleanUpStrings) {
        this.cleanUpList = cleanUpStrings;
    }

    /**
     * Creates an indexer with custom stop words and additional punctuation.
     *
     * @param cleanUpStrings regex fragments to remove as whole words; a null or
     *                       empty list selects the built-in stop words
     * @param cleanUpChars   regex fragments to replace with a space, in addition
     *                       to the built-in punctuation; may be null or empty
     */
    public SaxHTMLIndex(ArrayList<String> cleanUpStrings, ArrayList<String> cleanUpChars) {
        this.cleanUpList = cleanUpStrings;
        this.cleanUpPunctuation = cleanUpChars;
    }

    /**
     * Sets the dictionary that subsequent calls to runExtractData will fill.
     *
     * @param tempMap map from token to comma-separated document numbers
     * @return always 0
     */
    public int init(Map<String, String> tempMap) {
        this.tempDico = tempMap;
        return 0;
    }

    /**
     * Parses one document, cleans and tokenizes its text, and adds the current
     * document number to the dictionary entry of every distinct token found.
     *
     * <p>For "ja", "zh" and "ko" the text is tokenized with Lucene's CJK analyzer;
     * for "en", "de" and "fr" the whitespace-separated words are stemmed; for any
     * other (or null) language the words are indexed unchanged.
     *
     * @param file            the document to index
     * @param indexerLanguage language code selecting the tokenizer or stemmer
     *                        (compared case-insensitively); may be null
     * @return the file description object created for this document
     * @throws IllegalStateException if init has not been called with a non-null map
     */
    public DocFileInfo runExtractData(File file, String indexerLanguage) {
        if (this.tempDico == null) {
            throw new IllegalStateException("init(Map) must be called before runExtractData");
        }
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        // Inherited parser; presumably appends the document text to strbf.
        this.parseDocument(file);

        String str = this.cleanBuffer(this.strbf);
        str = str.replaceAll("\\s+", " ");
        String[] items = str.split("\\s");

        String[] tokenizedItems;
        if (isCjkLanguage(indexerLanguage)) {
            tokenizedItems = this.tokenizeCjk(str, items);
        } else {
            SnowballStemmer stemmer = createStemmer(indexerLanguage);
            tokenizedItems = stemmer != null ? stemmer.doStem(items) : items;
        }

        String docNumber = Integer.toString(this.i);
        // A set, so that each token lists a given document at most once.
        HashSet<String> uniqueTokens = new HashSet<String>(Arrays.asList(tokenizedItems));
        for (String token : uniqueTokens) {
            // Cleaned text usually starts with a space, so split() yields a leading
            // empty string; it carries no meaning and must not become an index key.
            if (token == null || token.length() == 0) {
                continue;
            }
            String postings = this.tempDico.get(token);
            this.tempDico.put(token, postings == null ? docNumber : postings + "," + docNumber);
        }
        ++this.i;
        return this.fileDesc;
    }

    /** Returns true for the language codes that are tokenized with the CJK analyzer. */
    private static boolean isCjkLanguage(String language) {
        return "ja".equalsIgnoreCase(language)
                || "zh".equalsIgnoreCase(language)
                || "ko".equalsIgnoreCase(language);
    }

    /** Returns the stemmer for the given language code, or null if there is none. */
    private static SnowballStemmer createStemmer(String language) {
        if ("en".equalsIgnoreCase(language)) {
            return new EnglishStemmer();
        }
        if ("de".equalsIgnoreCase(language)) {
            return new GermanStemmer();
        }
        if ("fr".equalsIgnoreCase(language)) {
            return new FrenchStemmer();
        }
        return null;
    }

    /**
     * Tokenizes text with the CJK analyzer, always releasing the analyzer and its
     * token stream; returns the given fallback tokens if tokenizing fails.
     */
    private String[] tokenizeCjk(String text, String[] fallback) {
        ArrayList<String> tokens = new ArrayList<String>();
        CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream("", (Reader)new StringReader(text));
            TermAttribute termAtt = (TermAttribute)stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                tokens.add(termAtt.term());
            }
            return tokens.toArray(new String[tokens.size()]);
        }
        catch (IOException ex) {
            System.out.println("Error tokenizing content using CJK Analyzer. IOException");
            ex.printStackTrace();
            return fallback;
        }
        finally {
            if (stream != null) {
                try {
                    stream.close();
                }
                catch (IOException ignored) {
                    // Nothing useful can be done if closing an in-memory stream fails.
                }
            }
            analyzer.close();
        }
    }

    /**
     * Lower-cases the parsed text and removes stop words and punctuation.
     *
     * @param strbf the raw text collected from the document
     * @return the cleaned text; words are separated by one or more spaces
     */
    private String cleanBuffer(StringBuffer strbf) {
        String str = strbf.toString().toLowerCase();

        String stopWordRegex;
        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stopWordRegex = DEFAULT_STOP_WORDS;
            // The text is already lower-case here, so the notice has to be matched
            // in lower case; a capitalised pattern can never match.
            str = str.replaceFirst(COPYRIGHT_NOTICE, " ");
        } else {
            StringBuilder words = new StringBuilder("\\ba\\b");
            for (String word : this.cleanUpList) {
                words.append("|\\b").append(word).append("\\b");
            }
            stopWordRegex = words.toString();
        }

        // An empty string means that there is no extra punctuation to remove.
        String punctuationRegex = "";
        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            // The ideographic full stop (U+3002) always leads the custom alternation.
            StringBuilder chars = new StringBuilder("\\u3002");
            for (String punctuation : this.cleanUpPunctuation) {
                chars.append("|").append(punctuation);
            }
            punctuationRegex = chars.toString();
        }
        return this.minimalClean(str, stopWordRegex, punctuationRegex);
    }

    /**
     * Collapses whitespace, then strips punctuation, removes stop words, and
     * strips punctuation again on the result.
     */
    private String minimalClean(String str, String stopWordRegex, String punctuationRegex) {
        str = str.replaceAll("\\s+", " ");
        str = str.replaceAll("->", " ");
        str = stripPunctuation(str, punctuationRegex);
        str = str.replaceAll(stopWordRegex, " ");
        return stripPunctuation(str, punctuationRegex);
    }

    /**
     * Replaces the built-in ASCII and CJK punctuation, plus the caller-supplied
     * punctuation regex when it is non-empty, with single spaces.
     */
    private static String stripPunctuation(String text, String punctuationRegex) {
        text = text.replaceAll("[$|%,;.':()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+", " ");
        text = text.replaceAll("[$,;.':()\\/*\"{}=!&+<>\\\\]", " ");
        text = text.replaceAll("\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D", " ");
        text = text.replaceAll("\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F", " ");
        if (punctuationRegex.length() > 0) {
            text = text.replaceAll(punctuationRegex, " ");
        }
        return text;
    }
}

