/*
 * Decompiled with CFR 0.152.
 */
package com.aliasi.corpus.parsers;

import com.aliasi.classify.BinaryLMClassifier;
import com.aliasi.classify.Classification;
import com.aliasi.corpus.ClassificationHandler;
import com.aliasi.corpus.Corpus;
import com.aliasi.corpus.StringParser;
import com.aliasi.io.FileExtensionFilter;
import java.io.File;
import java.io.IOException;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * String parser for Reuters-21578 style SGML text that turns every record
 * into a binary classification event for a single topic.
 *
 * <p>A record is the run of lines from a line starting with
 * {@code <REUTERS} up to (but excluding) the next line starting with
 * {@code </REUTERS}. For each accepted record the handler receives the
 * title, dateline and body joined by newlines, together with
 * {@link #ON_TOPIC} or {@link #OFF_TOPIC} depending on whether the
 * record's {@code TOPICS} element mentions the configured topic.
 */
public class Reuters21578Parser
extends StringParser<ClassificationHandler<CharSequence, Classification>> {
    private final boolean mIncludeTestDocuments;
    private final boolean mIncludeTrainingDocuments;
    private final String mTopic;

    /** Classification handed to the handler when a record carries the topic. */
    static final Classification ON_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY);

    /** Classification handed to the handler when a record lacks the topic. */
    static final Classification OFF_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_REJECT_CATEGORY);

    // Trailing agency markers removed from the end of a body.
    // NOTE(review): neither marker contains a line break; if the corpus puts
    // the "&#3;" entity on its own line these will never match -- confirm
    // against the actual data files.
    static final String END_BOILERPLATE_1 = "Reuter&#3;";
    static final String END_BOILERPLATE_2 = "REUTER&#3;";

    /** Vocabulary of topic names accepted by the constructor. */
    static final String[] TOPICS = new String[]{"acq", "alum", "austdlr", "barley", "bean", "belly", "bfr", "bop", "cake", "can", "carcass", "castor", "castorseed", "cattle", "chem", "citruspulp", "cocoa", "coconut", "coffee", "copper", "copra", "corn", "cornglutenfeed", "cotton", "cottonseed", "cpi", "cpu", "crude", "cruzado", "debt", "dfl", "dkr", "dlr", "dmk", "earn", "f", "feed", "fishmeal", "fuel", "fx", "gas", "gnp", "gold", "grain", "groundnut", "heat", "hk", "hog", "housing", "income", "instal", "interest", "inventories", "ipi", "iron", "jet", "jobs", "l", "lead", "lei", "lin", "linseed", "lit", "livestock", "lumber", "meal", "metal", "money", "naphtha", "nat", "nickel", "nkr", "nzdlr", "oat", "oil", "oilseed", "orange", "palladium", "palm", "palmkernel", "peseta", "pet", "platinum", "plywood", "pork", "potato", "propane", "rand", "rape", "rapeseed", "red", "reserves", "retail", "rice", "ringgit", "rubber", "rupiah", "rye", "saudriyal", "sfr", "ship", "silver", "skr", "sorghum", "soy", "soybean", "steel", "stg", "strategic", "sugar", "sun", "sunseed", "supply", "tapioca", "tea", "tin", "trade", "veg", "wheat", "wool", "wpi", "yen", "zinc"};

    /**
     * Creates a parser that classifies records against the given topic.
     *
     * @param topic topic to classify against; must be one of
     *     {@link #availableTopics()}
     * @param includeTrainingDocuments whether records marked as training
     *     documents are passed to the handler
     * @param includeTestDocuments whether records marked as test documents
     *     are passed to the handler
     * @throws IllegalArgumentException if the topic is not a known topic
     */
    public Reuters21578Parser(String topic, boolean includeTrainingDocuments, boolean includeTestDocuments) {
        this.mIncludeTrainingDocuments = includeTrainingDocuments;
        this.mIncludeTestDocuments = includeTestDocuments;
        this.mTopic = topic;
        if (!Reuters21578Parser.isAvailableTopic(topic)) {
            String msg = "Require known topic. Found topic=" + topic;
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Splits the given character slice into lines, collects each
     * {@code <REUTERS ...> ... </REUTERS>} record and hands it to
     * {@link #handleDocument(String)}. Lines outside a record are ignored.
     *
     * @param cs underlying characters
     * @param start index of the first character to parse
     * @param end index one past the last character to parse
     * @throws IllegalArgumentException if a record is opened but the input
     *     ends before a line starting with {@code </REUTERS} is seen
     */
    @Override
    public void parseString(char[] cs, int start, int end) {
        String text = new String(cs, start, end - start);
        String[] lines = text.split("\n");
        int i = 0;
        while (i < lines.length) {
            if (!lines[i].startsWith("<REUTERS")) {
                ++i;
                continue;
            }
            StringBuilder sb = new StringBuilder();
            // Bounded scan: an unterminated record must not run past the array.
            while (i < lines.length && !lines[i].startsWith("</REUTERS")) {
                sb.append(lines[i++]);
                sb.append("\n");
            }
            if (i >= lines.length) {
                throw new IllegalArgumentException("Unterminated <REUTERS> record; no closing </REUTERS line found.");
            }
            this.handleDocument(sb.toString());
            // Step over the closing line.
            ++i;
        }
    }

    /**
     * Filters a single record and, if accepted, sends its text and
     * classification to the handler.
     *
     * <p>Records without the {@code TOPICS="Y} marker are dropped, as are
     * training or test records whose split was not requested. Records that
     * are marked as neither training nor test are always passed through.
     *
     * @param text the raw record text, from the opening {@code <REUTERS}
     *     line up to but excluding the closing line
     * @throws IllegalArgumentException if an extracted element is opened but
     *     never closed
     */
    void handleDocument(String text) {
        if (!Reuters21578Parser.hasTopics(text)) {
            return;
        }
        if (Reuters21578Parser.isTrainingDocument(text) && !this.mIncludeTrainingDocuments) {
            return;
        }
        if (Reuters21578Parser.isTestDocument(text) && !this.mIncludeTestDocuments) {
            return;
        }
        String topics = Reuters21578Parser.extract("TOPICS", text, true);
        String title = Reuters21578Parser.extract("TITLE", text, true);
        String dateline = Reuters21578Parser.extract("DATELINE", text, true);
        String body = Reuters21578Parser.extract("BODY", text, true);
        // Strip exactly the marker that matched rather than relying on both
        // markers happening to have the same length.
        if (body.endsWith(END_BOILERPLATE_1)) {
            body = body.substring(0, body.length() - END_BOILERPLATE_1.length());
        } else if (body.endsWith(END_BOILERPLATE_2)) {
            body = body.substring(0, body.length() - END_BOILERPLATE_2.length());
        }
        StringBuilder sb = new StringBuilder();
        sb.append(title).append("\n");
        sb.append(dateline).append("\n");
        sb.append(body);
        Classification classification = Reuters21578Parser.containsTopic(topics, this.mTopic) ? ON_TOPIC : OFF_TOPIC;
        ClassificationHandler<CharSequence, Classification> handler = this.getHandler();
        handler.handle(sb, classification);
    }

    /**
     * Tests whether the content of a {@code TOPICS} element names the given
     * topic as a whole token.
     *
     * <p>A plain substring test would report false positives whenever one
     * topic name is contained in another (for example {@code heat} inside
     * {@code wheat}, or {@code tin} inside {@code platinum}). The
     * {@code <D>}/{@code </D>} delimiters are removed and the remainder is
     * split on every non-alphanumeric character. The hyphen split is there
     * because {@link #TOPICS} lists short fragments such as {@code f},
     * {@code l} and {@code belly} -- presumably the parts of hyphenated
     * corpus topic names; verify against the data.
     *
     * @param topics raw content of the {@code TOPICS} element, possibly empty
     * @param topic topic name to look for
     * @return {@code true} if some token of {@code topics} equals
     *     {@code topic}
     */
    static boolean containsTopic(String topics, String topic) {
        String withoutTags = topics.replace("<D>", " ").replace("</D>", " ");
        for (String token : withoutTags.split("[^A-Za-z0-9]+")) {
            if (token.equals(topic)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the text between the first {@code <elt>} tag and the next
     * matching {@code </elt>} tag.
     *
     * @param elt element name, without angle brackets
     * @param text text to search
     * @param allowEmpty if {@code true}, a missing start tag yields the
     *     empty string instead of an exception
     * @return the element content, or {@code ""} if the start tag is absent
     *     and {@code allowEmpty} is set
     * @throws IllegalArgumentException if the start tag is missing and
     *     {@code allowEmpty} is {@code false}, or if the start tag is
     *     present but no end tag follows it
     */
    static String extract(String elt, String text, boolean allowEmpty) {
        String startElt = "<" + elt + ">";
        String endElt = "</" + elt + ">";
        int startEltIndex = text.indexOf(startElt);
        if (startEltIndex < 0) {
            if (allowEmpty) {
                return "";
            }
            throw new IllegalArgumentException("no start, elt=" + elt + " text=" + text);
        }
        int start = startEltIndex + startElt.length();
        int end = text.indexOf(endElt, start);
        if (end < 0) {
            throw new IllegalArgumentException("no end, elt=" + elt + " text=" + text);
        }
        return text.substring(start, end);
    }

    /**
     * Returns a fresh copy of the known topic names; modifying the returned
     * array does not affect the parser.
     *
     * @return a new array holding every known topic
     */
    public static String[] availableTopics() {
        return TOPICS.clone();
    }

    /**
     * Tests whether the given string is one of the known topic names.
     *
     * @param topic candidate topic; {@code null} is never a known topic
     * @return {@code true} if the topic is known
     */
    public static boolean isAvailableTopic(String topic) {
        for (String validTopic : TOPICS) {
            if (validTopic.equals(topic)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates a corpus over the {@code .sgm} files of a directory for the
     * given topic.
     *
     * @param topic topic to classify against
     * @param directory directory holding the {@code .sgm} files
     * @return a corpus whose visit methods parse those files
     * @throws IOException declared for callers; not thrown by the
     *     construction visible here
     */
    public static Corpus<ClassificationHandler<CharSequence, Classification>> corpus(String topic, File directory) throws IOException {
        return new ReutersCorpus(topic, directory);
    }

    /** Tests whether the record text contains the marker {@code TOPICS="Y}. */
    static boolean hasTopics(String document) {
        return Reuters21578Parser.containsText(document, "TOPICS=\"Y");
    }

    /** Tests whether the record text contains the marker {@code LEWISSPLIT="TR}. */
    static boolean isTrainingDocument(String document) {
        return Reuters21578Parser.containsText(document, "LEWISSPLIT=\"TR");
    }

    /** Tests whether the record text contains the marker {@code LEWISSPLIT="TE}. */
    static boolean isTestDocument(String document) {
        return Reuters21578Parser.containsText(document, "LEWISSPLIT=\"TE");
    }

    /** Tests whether {@code text} occurs anywhere in {@code doc}. */
    static boolean containsText(String doc, String text) {
        return doc.indexOf(text) >= 0;
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Corpus that runs a {@link Reuters21578Parser} over every
     * {@code .sgm} file in a directory.
     */
    private static class ReutersCorpus
    extends Corpus<ClassificationHandler<CharSequence, Classification>> {
        private final String mTopic;
        private final File mDirectory;

        ReutersCorpus(String topic, File directory) {
            this.mTopic = topic;
            this.mDirectory = directory;
        }

        /** Visits training and test documents. */
        @Override
        public void visitCorpus(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            this.visit(handler, true, true);
        }

        /** Visits with test documents included and training documents excluded. */
        @Override
        public void visitTest(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            this.visit(handler, false, true);
        }

        /** Visits with training documents included and test documents excluded. */
        @Override
        public void visitTrain(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            this.visit(handler, true, false);
        }

        /**
         * Parses every {@code .sgm} file of the directory with a parser
         * configured for this corpus's topic and the requested splits.
         *
         * @param handler receiver of the classified documents
         * @param includeTrain whether training documents are delivered
         * @param includeTest whether test documents are delivered
         * @throws IOException if the directory cannot be listed, or if
         *     parsing a file fails with an I/O error
         * @throws IllegalArgumentException if the corpus topic is not a
         *     known topic
         */
        void visit(ClassificationHandler<CharSequence, Classification> handler, boolean includeTrain, boolean includeTest) throws IOException {
            Reuters21578Parser parser = new Reuters21578Parser(this.mTopic, includeTrain, includeTest);
            parser.setHandler(handler);
            File[] files = this.mDirectory.listFiles(new FileExtensionFilter(".sgm"));
            // listFiles returns null when the path is not a directory or
            // cannot be read; report that instead of failing with an NPE.
            if (files == null) {
                throw new IOException("Could not list files in directory=" + this.mDirectory);
            }
            for (File file : files) {
                parser.parse(file);
            }
        }
    }
}

