/*
 * Decompiled with CFR 0.152.
 */
package edu.columbia.ob.gen.app;

import edu.columbia.ob.gen.env.PreGenEnv;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.TreeSet;
import ob.util.Utils;

/**
 * Command-line tool that selects the Wikipedia categories and articles belonging to a subject domain.
 *
 * <p>Inputs read from the domain's corpus directory (as returned by
 * {@code PreGenEnv.getSubjectDomainCorpusDir}):
 * <ul>
 *   <li>{@code domain_category} - the first line is the root category of the domain;</li>
 *   <li>{@code excluded_categories} - one category per line whose sub-tree is left out;</li>
 *   <li>{@code filters} - tab-separated {@code <filter> <type> <case_sensitive>} lines.</li>
 * </ul>
 *
 * <p>Inputs read from the Wikipedia directory:
 * <ul>
 *   <li>{@code categories-of-categories} - tab-separated lines: a category followed by its parent categories;</li>
 *   <li>{@code categories-of-articles} - tab-separated lines: an article followed by its categories.</li>
 * </ul>
 *
 * <p>Outputs written to the corpus directory: {@code selected_domain_categories} and
 * {@code selected_domain_articles}.
 */
public class SubjectDomainWikipediaSubsetCreator {
    /** Depth limit passed to {@link #addDescendants} when expanding each excluded category. */
    private static final int MAX_EXCLUDED_DEPTH = 8;
    /** Depth limit passed to {@link #addDescendants} when expanding the domain root category. */
    private static final int MAX_DOMAIN_DEPTH = 20;

    private static String _domainCategory;
    private static Collection<Filter> _filters;
    private static Collection<String> _excludedCategories;

    /**
     * Entry point.
     *
     * @param args {@code <domain_name> <wikipedia_dir>}; any other argument count prints the usage
     *             line to standard error and exits with status 1
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("usage: SubjectDomainWikipediaSubsetCreator <domain_name> <wikipedia_dir>");
            // A usage error is a failure; report it to the calling shell with a non-zero status.
            System.exit(1);
        }
        String domainName = args[0];
        String wikipediaDir = args[1];
        String corpusDir = PreGenEnv.getSubjectDomainCorpusDir(domainName);

        File domainCategoryFile = new File(corpusDir, "domain_category");
        Collection<String> domainCategoryLines = Utils.readLines(domainCategoryFile.getAbsolutePath());
        if (domainCategoryLines.isEmpty()) {
            throw new IllegalStateException("no domain category found in " + domainCategoryFile.getAbsolutePath());
        }
        _domainCategory = domainCategoryLines.iterator().next();
        _excludedCategories = Utils.readLines(new File(corpusDir, "excluded_categories").getAbsolutePath());
        _filters = SubjectDomainWikipediaSubsetCreator.readFilters(corpusDir);

        File categoriesOfCategoriesFile = new File(wikipediaDir, "categories-of-categories");
        Collection<String> domainCategories = SubjectDomainWikipediaSubsetCreator.getDomainCategories(categoriesOfCategoriesFile);
        SubjectDomainWikipediaSubsetCreator.writeSelection(new File(corpusDir, "selected_domain_categories"), domainCategories);
        System.out.println("selected " + domainCategories.size() + " domain categories");

        File articleCategoryFile = new File(wikipediaDir, "categories-of-articles");
        Collection<String> domainArticles = SubjectDomainWikipediaSubsetCreator.getDomainArticles(articleCategoryFile, domainCategories);
        SubjectDomainWikipediaSubsetCreator.writeSelection(new File(corpusDir, "selected_domain_articles"), domainArticles);
        System.out.println("selected " + domainArticles.size() + " domain articles");
    }

    /** Creates the parent directories of {@code outFile} if needed and writes {@code lines} to it. */
    private static void writeSelection(File outFile, Collection<String> lines) {
        outFile.getParentFile().mkdirs();
        Utils.writeLinesToFile(outFile.getAbsolutePath(), lines);
    }

    /**
     * Collects the articles that belong to at least one of the given categories.
     *
     * @param articleCategoryFile tab-separated file; the first field of each line is the article,
     *                            the remaining fields are its categories
     * @param domainCategories    the categories that make up the domain
     * @return the matching articles in file order; an article is added once per matching line
     */
    private static Collection<String> getDomainArticles(File articleCategoryFile, Collection<String> domainCategories) {
        ArrayList<String> domainArticles = new ArrayList<String>();
        for (String line : Utils.readLinesDynamically(articleCategoryFile.getAbsolutePath())) {
            String[] tokens = line.split("\\t");
            for (int i = 1; i < tokens.length; ++i) {
                if (domainCategories.contains(tokens[i])) {
                    domainArticles.add(tokens[0]);
                    // One matching category is enough; move on to the next article line.
                    break;
                }
            }
        }
        return domainArticles;
    }

    /**
     * Reads the {@code filters} file of the corpus directory.
     *
     * <p>Blank lines and lines starting with {@code #} are skipped. Every other line must have at
     * least three tab-separated fields: the filter text, the filter type and a boolean
     * (parsed with {@link Boolean#parseBoolean}) telling whether the filter is case sensitive.
     *
     * @param corpusDir directory containing the {@code filters} file
     * @return the parsed filters in file order
     * @throws IllegalArgumentException if a non-skipped line has fewer than three fields
     */
    private static Collection<Filter> readFilters(String corpusDir) {
        ArrayList<Filter> filters = new ArrayList<Filter>();
        for (String line : Utils.readLines(new File(corpusDir, "filters").getAbsolutePath())) {
            if (line.trim().isEmpty() || line.startsWith("#")) {
                continue;
            }
            String[] tokens = line.split("\\t");
            if (tokens.length < 3) {
                throw new IllegalArgumentException(
                        "malformed filters line, expected <filter>\\t<type>\\t<case_sensitive>: " + line);
            }
            filters.add(new Filter(tokens[0], tokens[1], Boolean.parseBoolean(tokens[2])));
        }
        return filters;
    }

    /**
     * Computes the categories of the domain: the descendants of the domain root category, minus the
     * sub-trees of the excluded categories, minus every category rejected by {@link #filter}.
     *
     * @param categoriesOfCategoriesFile tab-separated file; the first field of each line is a
     *                                   category, the remaining fields are its parent categories
     * @return the selected categories in sorted order
     */
    private static Collection<String> getDomainCategories(File categoriesOfCategoriesFile) {
        // Only the parent -> children direction is needed for the descent below.
        HashMap<String, Collection<String>> children = new HashMap<String, Collection<String>>();
        for (String line : Utils.readLinesDynamically(categoriesOfCategoriesFile.getAbsolutePath())) {
            String[] tokens = line.split("\\t");
            String child = tokens[0];
            for (int i = 1; i < tokens.length; ++i) {
                SubjectDomainWikipediaSubsetCreator.addToMap(children, tokens[i], child);
            }
        }
        System.out.println("done with map");

        // Expand every excluded category into its sub-tree, never walking into the domain root itself.
        TreeSet<String> excluded = new TreeSet<String>();
        for (String excludedCategory : _excludedCategories) {
            SubjectDomainWikipediaSubsetCreator.addDescendants(excluded, excludedCategory, Utils.list(_domainCategory), 0, MAX_EXCLUDED_DEPTH, children);
        }
        System.out.println("excluded size: " + excluded.size());

        TreeSet<String> descendants = new TreeSet<String>();
        SubjectDomainWikipediaSubsetCreator.addDescendants(descendants, _domainCategory, excluded, 0, MAX_DOMAIN_DEPTH, children);

        TreeSet<String> categories = new TreeSet<String>();
        for (String descendant : descendants) {
            if (!SubjectDomainWikipediaSubsetCreator.filter(descendant)) {
                categories.add(descendant);
            }
        }
        return categories;
    }

    /**
     * Tells whether a category is rejected by any of the configured filters.
     *
     * @param category the category name to test
     * @return {@code true} if at least one filter matches and the category must be dropped
     */
    private static boolean filter(String category) {
        // Lower-case once and share the result across all case-insensitive filters.
        String lcCategory = category.toLowerCase(Locale.ROOT);
        for (Filter candidate : _filters) {
            if (candidate.matches(category, lcCategory)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds {@code category} and, recursively, its children to {@code descendants}.
     *
     * <p>A category is skipped (and not descended into) when it is already in {@code descendants},
     * when it is in {@code excluded}, or when {@code d + 1 == max}; with {@code d} starting at 0
     * this means categories at depth {@code max - 1} are never added.
     *
     * <p>NOTE(review): because an already collected category is not revisited, a category first
     * reached close to the depth limit is not expanded further when it is later reached through a
     * shorter path - confirm this is intended.
     *
     * @param descendants collection receiving the visited categories
     * @param category    the category to visit
     * @param excluded    categories that must not be visited; may be {@code null}
     * @param d           depth of {@code category} relative to the starting category
     * @param max         depth limit
     * @param childrenMap maps a category to its direct child categories
     * @return {@code true} if {@code category} was added, has an entry in {@code childrenMap} and
     *         either equals the domain root category or has a child for which this method
     *         returned {@code true}; {@code false} otherwise. The callers in this class ignore
     *         the value.
     */
    private static boolean addDescendants(Collection<String> descendants, String category, Collection<String> excluded, int d, int max, Map<String, Collection<String>> childrenMap) {
        if (descendants.contains(category)) {
            return false;
        }
        if (excluded != null && excluded.contains(category)) {
            return false;
        }
        if (d + 1 == max) {
            return false;
        }
        descendants.add(category);
        Collection<String> children = childrenMap.get(category);
        if (children == null) {
            return false;
        }
        boolean res = false;
        for (String child : children) {
            // Always recurse (no short-circuit): the recursion is what fills descendants.
            if (SubjectDomainWikipediaSubsetCreator.addDescendants(descendants, child, excluded, d + 1, max, childrenMap)) {
                res = true;
            }
        }
        return category.equals(_domainCategory) || res;
    }

    /**
     * Adds {@code value} to the set stored under {@code key}, creating the set on first use.
     *
     * @param map   multi-valued map to update
     * @param key   key to add the value under
     * @param value value to add
     */
    private static void addToMap(Map<String, Collection<String>> map, String key, String value) {
        Collection<String> values = map.get(key);
        if (values == null) {
            values = new HashSet<String>();
            map.put(key, values);
        }
        values.add(value);
    }

    /** One rule of the {@code filters} file: a text pattern, a match type and a case-sensitivity flag. */
    private static class Filter {
        private final String _filter;
        private final String _type;
        private final boolean _caseSensitive;
        /** The pattern actually compared: {@code _filter} as is, or lower-cased for case-insensitive filters. */
        private final String _needle;

        public Filter(String filter, String type, boolean caseSensitive) {
            this._filter = filter;
            this._type = type;
            this._caseSensitive = caseSensitive;
            // A case-insensitive filter is compared against the lower-cased category, so the
            // pattern has to be lower-cased as well or patterns with capitals never match.
            this._needle = caseSensitive ? filter : filter.toLowerCase(Locale.ROOT);
        }

        public String getFilter() {
            return this._filter;
        }

        public String getType() {
            return this._type;
        }

        public boolean isCaseSensitive() {
            return this._caseSensitive;
        }

        /**
         * Tests this filter against a category.
         *
         * @param category          the category name as written
         * @param lowerCaseCategory the same name lower-cased with {@link Locale#ROOT}
         * @return {@code true} if the type is {@code "contains"} and the category contains the
         *         pattern, or the type is {@code "starts"} and the category starts with it;
         *         {@code false} for any other type
         */
        public boolean matches(String category, String lowerCaseCategory) {
            String haystack = this._caseSensitive ? category : lowerCaseCategory;
            if ("contains".equals(this._type)) {
                return haystack.contains(this._needle);
            }
            if ("starts".equals(this._type)) {
                return haystack.startsWith(this._needle);
            }
            return false;
        }
    }
}

