/*
 * Decompiled with CFR 0.152.
 */
package edu.columbia.ob.gen.app;

import edu.columbia.ob.gen.env.PreGenEnv;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import ob.util.Utils;

/**
 * Command-line tool that extracts a domain-specific corpus from a cleaned
 * Wikipedia dump.
 *
 * <p>The set of wanted article names is read from the {@code domain_entities}
 * file in the domain's corpus directory (as resolved by
 * {@link PreGenEnv#getSubjectDomainCorpusDir}). The dump file
 * {@code wiki-clean.txt} is then scanned line by line; every line belonging to
 * a wanted article (its header line included) is copied to the {@code corpus}
 * file in that same directory.
 */
public class DomainCorpusCreator {
    /**
     * Runs the extraction.
     *
     * @param args exactly two values: the domain name and the directory that
     *             contains {@code wiki-clean.txt}
     * @throws Exception if reading the inputs fails or the corpus file cannot
     *                   be written completely
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("usage: DomainCorpusCreator <domain_name> <wikipedia_dir>");
            System.exit(0);
        }
        String domainName = args[0];
        String wikipediaDir = args[1];

        String corpusDir = PreGenEnv.getSubjectDomainCorpusDir(domainName);
        HashSet<String> selectedArticles = new HashSet<String>(Utils.readLines(new File(corpusDir, "domain_entities").getAbsolutePath()));
        File outfile = new File(corpusDir, "corpus");
        String wikipediaFile = new File(wikipediaDir, "wiki-clean.txt").getAbsolutePath();

        int selectedSeen = 0; // headers encountered whose article is in the selected set
        int headersSeen = 0;  // all article headers encountered so far
        // Whether the article currently being scanned is selected. Lines that
        // precede the first header belong to no article and are skipped.
        boolean copying = false;

        // try-with-resources guarantees the output file is closed even when
        // reading the dump throws part-way through.
        try (PrintWriter pw = new PrintWriter(new FileWriter(outfile))) {
            for (String line : Utils.readLinesDynamically(wikipediaFile)) {
                // An article header looks like "####Title####"; the length check
                // rejects short runs of '#' whose prefix and suffix overlap.
                if (line.startsWith("####") && line.endsWith("####") && line.length() >= 8) {
                    // Titles use spaces in the dump; the lookup key uses underscores.
                    String article = line.substring(4, line.length() - 4).replace(" ", "_");
                    ++headersSeen;
                    // Decide once per article instead of once per line.
                    copying = selectedArticles.contains(article);
                    if (copying) {
                        ++selectedSeen;
                    }
                    if (headersSeen % 10000 == 0) {
                        System.out.println("done with " + selectedSeen + " articles out of " + selectedArticles.size() + " (" + headersSeen + ")");
                    }
                }
                if (copying) {
                    pw.println(line);
                }
            }
            // PrintWriter never throws on write errors; surface them explicitly
            // so a truncated corpus is not mistaken for a complete one.
            if (pw.checkError()) {
                throw new IOException("failed writing corpus file: " + outfile.getAbsolutePath());
            }
        }
    }
}

