/*
 * Decompiled with CFR 0.152.
 */
package edu.columbia.ob.gen.gems.rdfApps;

import edu.columbia.ob.gen.env.PreGenEnv;
import java.io.File;
import java.util.HashSet;
import java.util.Locale;
import java.util.TreeSet;
import ob.util.Counts;
import ob.util.Utils;

/**
 * Command-line tool that expands a subject domain's article list using a
 * tab-separated links file.
 *
 * <p>The seed articles are read from {@code domain_entities} in the domain's
 * corpus directory. Every line of {@code wiki-clean.link} whose first field is
 * a seed article contributes its second field as a candidate, provided the
 * candidate is not itself a seed and starts with an alphabetic character.
 * Candidates whose count is greater than one are written, sorted, to
 * {@code expanded_domain_articles} in the corpus directory.
 *
 * <p>All title comparisons are done on lower-cased titles.
 */
public class CorpusExpander {
    /** Seed article list, read from the domain corpus directory. */
    private static final String DOMAIN_ENTITIES_FILE = "domain_entities";

    /** Output file, written into the domain corpus directory. */
    private static final String EXPANDED_ARTICLES_FILE = "expanded_domain_articles";

    /** Links file, read from the Wikipedia directory given on the command line. */
    private static final String WIKIPEDIA_LINKS_FILE = "wiki-clean.link";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the domain name, {@code args[1]} is the
     *             directory containing {@code wiki-clean.link}
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("usage: CorpusExpander <domain_name> <wikipedia_dir>");
            // NOTE(review): a usage error exits with status 0; kept as-is because
            // wrapper scripts may rely on it — consider a non-zero status.
            System.exit(0);
        }
        String domainName = args[0];
        String wikipediaDir = args[1];
        String corpusDir = PreGenEnv.getSubjectDomainCorpusDir(domainName);

        HashSet<String> selectedArticles = new HashSet<String>();
        String entitiesPath = new File(corpusDir, DOMAIN_ENTITIES_FILE).getAbsolutePath();
        for (String article : Utils.readLinesDynamically(entitiesPath)) {
            selectedArticles.add(normalize(article));
        }

        String wikipediaLinksFile = new File(wikipediaDir, WIKIPEDIA_LINKS_FILE).getAbsolutePath();
        Counts<String> expandedArticleCounts = new Counts<String>();
        for (String line : Utils.readLinesDynamically(wikipediaLinksFile)) {
            String candidate = expansionCandidate(line, selectedArticles);
            if (candidate != null) {
                expandedArticleCounts.add(candidate);
            }
        }

        // Keep only candidates counted more than once; TreeSet gives sorted output.
        TreeSet<String> expandedArticles = new TreeSet<String>();
        for (String article : expandedArticleCounts) {
            if (expandedArticleCounts.getCount(article) <= 1) {
                continue;
            }
            expandedArticles.add(article);
        }

        File outfile = new File(corpusDir, EXPANDED_ARTICLES_FILE);
        Utils.writeLinesToFile(outfile.getAbsolutePath(), expandedArticles);
    }

    /**
     * Lower-cases a title independently of the JVM's default locale, so that
     * matching does not change under locales with special casing rules.
     */
    private static String normalize(String title) {
        return title.toLowerCase(Locale.ROOT);
    }

    /**
     * Extracts the expansion candidate from one line of the links file.
     *
     * @param line             a tab-separated line; the first two fields are used
     * @param selectedArticles lower-cased seed article titles
     * @return the lower-cased second field when the first field is a seed
     *         article, the second field is not a seed article and begins with
     *         an alphabetic character; {@code null} otherwise, including for
     *         lines with fewer than two fields or an empty second field
     */
    private static String expansionCandidate(String line, HashSet<String> selectedArticles) {
        String[] tokens = line.split("\\t");
        // split() drops trailing empty fields, so "a\t" or a tab-less line yields
        // a single token; skip such lines instead of indexing past the array.
        if (tokens.length < 2 || tokens[1].isEmpty()) {
            return null;
        }
        if (!selectedArticles.contains(normalize(tokens[0]))) {
            return null;
        }
        String target = normalize(tokens[1]);
        if (selectedArticles.contains(target)) {
            return null;
        }
        // Test the first code point of the original (not lower-cased) field.
        if (!Character.isAlphabetic(tokens[1].codePointAt(0))) {
            return null;
        }
        return target;
    }
}

