/*
 * Decompiled with CFR 0.152.
 */
package edu.columbia.ob.gen.app;

import edu.columbia.ob.gen.env.PreGenEnv;
import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import ob.core.CentralFactory;
import ob.core.Feature;
import ob.core.FeatureVector;
import ob.core.FeatureVectorImpl;
import ob.core.NumericFeature;
import ob.core.StringFeature;
import ob.ml.Learner;
import ob.ml.YNUtils;
import ob.ml.classifier.Classifier;
import ob.ml.classifier.DataPoint;
import ob.ml.classifier.DataPointImpl;
import ob.ml.classifier.weka.WekaClassifier;
import ob.ml.classifier.weka.WekaClassifierFactory;
import ob.operators.CosineSimilarity;
import ob.operators.VectorCombination;
import ob.util.CharacterUtils;
import ob.util.Pair;
import ob.util.Utils;

/**
 * Command-line tool that classifies every unordered pair of domain concepts
 * and writes the positive relations to a {@code taxonomy} file.
 *
 * <p>For each pair two feature lists are built, one per direction (1 -> 2 and
 * 2 -> 1). Each list is fed to a classifier loaded from {@link #MODEL_FILE}.
 * If both directions are predicted positive the pair is written as a
 * {@code synonym}; if only one is, it is written as a {@code hypernym} with the
 * concepts in the order of the positive direction.
 *
 * <p>The class keeps mutable static state ({@code _featuresSet}) and is not
 * written for concurrent use.
 */
public class TaxonomyCreator {
    private static final String MODEL_FILE = "models/ontology.smo.model";
    private static final Feature CLASS_FEATURE = new StringFeature("type", YNUtils.getNominals());
    private static final NumberFormat MEMORY_FORMAT = NumberFormat.getIntegerInstance();
    private static final VectorCombination PAIR_SIMILARITY = new CosineSimilarity();
    /** Matches a parenthesized suffix such as {@code "(disambiguation)"}; compiled once because fix() runs per input token. */
    private static final Pattern PARENTHESIZED = Pattern.compile("\\([^\\)]+\\)");
    /** True once the classifier has been given a data point via setFeatures. */
    private static boolean _featuresSet = false;

    /**
     * Entry point.
     *
     * <p>Reads {@code expanded_domain_articles} from the domain's corpus directory,
     * builds the features for every concept pair from the files under the
     * wikipedia directory, classifies both directions of each pair and writes
     * tab-separated {@code concept1, concept2, relation} lines to {@code taxonomy}
     * in the corpus directory.
     *
     * @param args {@code <domain_name> <wikipedia_dir>}
     * @throws Exception if reading the inputs, loading the model or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("usage: TaxonomyCreator <domain_name> <wikipedia_dir>");
            // A usage error is a failure; report it through a non-zero exit status.
            System.exit(1);
        }
        String domainName = args[0];
        String wikipediaDir = args[1];
        String corpusDir = PreGenEnv.getSubjectDomainCorpusDir(domainName);
        List<String> selectedArticles = Utils.readLines(new File(corpusDir, "expanded_domain_articles").getAbsolutePath());
        File outfile = new File(corpusDir, "taxonomy");
        List<String> features = new ArrayList<String>();

        logMemory("class map");
        Map<Pair<String>, Pair<List<String>>> pairsMap = createPairsMap(selectedArticles);
        Collection<String> conceptSet = makeConceptSet(pairsMap.keySet());
        logMemory("vector/redirect features");
        addVectorAndRedirectFeatures(pairsMap, features, conceptSet, wikipediaDir);
        logMemory("link features");
        addLinkFeatures(pairsMap, features, conceptSet, wikipediaDir);
        logMemory("simple features");
        addSimpleFeatures(pairsMap, features);

        System.out.println("classifying...");
        Classifier learner = WekaClassifierFactory.load(MODEL_FILE);
        Feature[] featureArray = makeFeatureArray(features);
        PrintWriter pw = new PrintWriter(new FileWriter(outfile));
        try {
            for (Map.Entry<Pair<String>, Pair<List<String>>> entry : pairsMap.entrySet()) {
                Pair<String> pair = entry.getKey();
                Pair<List<String>> values = entry.getValue();
                boolean hypernym12 = predict(learner, featureArray, values.getFirst());
                boolean hypernym21 = predict(learner, featureArray, values.getSecond());
                if (hypernym12 && hypernym21) {
                    pw.println(pair.getFirst() + "\t" + pair.getSecond() + "\t" + "synonym");
                } else if (hypernym12) {
                    pw.println(pair.getFirst() + "\t" + pair.getSecond() + "\t" + "hypernym");
                } else if (hypernym21) {
                    pw.println(pair.getSecond() + "\t" + pair.getFirst() + "\t" + "hypernym");
                }
            }
        } finally {
            // Close (and flush) the output even when classification throws.
            pw.close();
        }
    }

    /**
     * Classifies one direction of a concept pair.
     *
     * @param learner       classifier; must be a {@link WekaClassifier} for the first call
     * @param featureArray  feature definitions, parallel to {@code featureValues}
     * @param featureValues feature values rendered as decimal strings
     * @return whether {@link YNUtils#isPositive} accepts the prediction
     */
    private static boolean predict(Learner learner, Feature[] featureArray, List<String> featureValues) {
        DataPoint dataPoint = makeDataPoint(featureArray, featureValues);
        if (!_featuresSet) {
            ((WekaClassifier)learner).setFeatures(dataPoint);
            // The flag was never raised before, which made this guard dead code.
            // NOTE(review): assumes setFeatures only needs one representative data point - confirm.
            _featuresSet = true;
        }
        return YNUtils.isPositive(learner.predict(dataPoint));
    }

    /**
     * Builds a data point whose class value is the placeholder {@code "N"} and
     * whose i-th value is {@code featureValues.get(i)} parsed as a double.
     *
     * @throws NumberFormatException     if a value is not a parsable double
     * @throws IndexOutOfBoundsException if there are fewer values than features
     */
    private static DataPoint makeDataPoint(Feature[] features, List<String> featureValues) {
        DataPointImpl point = new DataPointImpl();
        point.setClassFeature(CLASS_FEATURE);
        point.setClassValue(CentralFactory.getStringValue("N"));
        for (int j = 0; j < features.length; ++j) {
            double value = Double.parseDouble(featureValues.get(j));
            point.addValue(features[j], CentralFactory.getNumericValue(value));
        }
        return point;
    }

    /** Wraps each feature name, in order, in a {@link NumericFeature}. */
    private static Feature[] makeFeatureArray(List<String> features) {
        Feature[] featureArray = new Feature[features.size()];
        for (int i = 0; i < featureArray.length; ++i) {
            featureArray[i] = new NumericFeature(features.get(i));
        }
        return featureArray;
    }

    /**
     * Creates one entry per unordered pair of articles (i &lt; j), keyed by the
     * normalized concept names and holding two empty feature-value lists
     * (first: direction 1 -> 2, second: direction 2 -> 1). Insertion order is kept.
     * Progress (the outer index) is printed to stdout.
     */
    private static Map<Pair<String>, Pair<List<String>>> createPairsMap(List<String> selectedArticles) {
        // Normalize each title once instead of once per pair.
        List<String> concepts = new ArrayList<String>(selectedArticles.size());
        for (String article : selectedArticles) {
            concepts.add(fix(article));
        }
        Map<Pair<String>, Pair<List<String>>> map = new LinkedHashMap<Pair<String>, Pair<List<String>>>();
        for (int i = 0; i < concepts.size() - 1; ++i) {
            System.out.print(i + " ");
            if (i % 100 == 0) {
                System.out.println();
            }
            String concept1 = concepts.get(i);
            for (int j = i + 1; j < concepts.size(); ++j) {
                Pair<List<String>> listPair = new Pair<List<String>>(new ArrayList<String>(), new ArrayList<String>());
                map.put(new Pair<String>(concept1, concepts.get(j)), listPair);
            }
        }
        System.out.println();
        return map;
    }

    /**
     * Collects every concept that occurs in any pair.
     *
     * @return a mutable set; {@link #readRedirects} later adds redirect names to it
     */
    private static Collection<String> makeConceptSet(Collection<Pair<String>> pairs) {
        Collection<String> set = new HashSet<String>();
        for (Pair<String> pair : pairs) {
            set.add(pair.getFirst());
            set.add(pair.getSecond());
        }
        return set;
    }

    /** Formats a number with the default integer format. */
    private static String nf(long l) {
        return MEMORY_FORMAT.format(l);
    }

    /** Prints {@code label} followed by the currently used heap (total minus free). */
    private static void logMemory(String label) {
        Runtime runtime = Runtime.getRuntime();
        System.out.println(label + "  --  " + nf(runtime.totalMemory() - runtime.freeMemory()));
    }

    /** Appends the same value to both direction lists of a pair. */
    private static void addSymmetric(Pair<List<String>> values, double value) {
        String text = Double.toString(value);
        values.getFirst().add(text);
        values.getSecond().add(text);
    }

    /**
     * Appends a direction-dependent feature pair: the first list receives
     * {@code (value12, value21)}, the second list the mirrored {@code (value21, value12)}.
     */
    private static void addDirected(Pair<List<String>> values, double value12, double value21) {
        String text12 = Double.toString(value12);
        String text21 = Double.toString(value21);
        values.getFirst().add(text12);
        values.getFirst().add(text21);
        values.getSecond().add(text21);
        values.getSecond().add(text12);
    }

    /**
     * Adds the cosine-similarity and redirect features to every pair and
     * registers their names in {@code features}.
     *
     * <p>Side effect: {@code conceptSet} is extended by {@link #readRedirects}.
     */
    private static void addVectorAndRedirectFeatures(Map<Pair<String>, Pair<List<String>>> map, Collection<String> features, Collection<String> conceptSet, String wikipediaDir) {
        features.add("cosine similarity");
        features.add("1 redirects to 2");
        features.add("2 redirects to 1");
        features.add("both redirect to same");
        Map<String, String> redirects = readRedirects(conceptSet, wikipediaDir);
        Map<String, FeatureVector> vectors = readVectors(conceptSet, wikipediaDir);
        for (Map.Entry<Pair<String>, Pair<List<String>>> entry : map.entrySet()) {
            Pair<String> pair = entry.getKey();
            Pair<List<String>> values = entry.getValue();
            String first = pair.getFirst();
            String second = pair.getSecond();
            String target1 = redirects.get(first);
            String target2 = redirects.get(second);

            addSymmetric(values, getSimilarity(pair, vectors, redirects));
            addDirected(values, booleanToDouble(second.equals(target1)), booleanToDouble(first.equals(target2)));
            addSymmetric(values, booleanToDouble(target2 != null && target2.equals(target1)));
        }
    }

    /**
     * Adds the link-based features (direct links, outgoing-link overlap and
     * incoming-link overlap) to every pair and registers their names in
     * {@code features}.
     */
    private static void addLinkFeatures(Map<Pair<String>, Pair<List<String>>> map, Collection<String> features, Collection<String> conceptSet, String wikipediaDir) {
        Pair<Map<String, Collection<String>>> links = readLinks(conceptSet, wikipediaDir);
        Map<String, Collection<String>> realLinks = links.getFirst();
        Map<String, Collection<String>> nameLinks = links.getSecond();
        features.add("1 links to 2");
        features.add("2 links to 1");
        features.add("outgoing link similarity");
        features.add("outgoing link ratio 1 to 2");
        features.add("outgoing link ratio 2 to 1");
        for (Map.Entry<Pair<String>, Pair<List<String>>> entry : map.entrySet()) {
            String first = entry.getKey().getFirst();
            String second = entry.getKey().getSecond();
            Pair<List<String>> values = entry.getValue();

            boolean link12 = linksTo(realLinks, first, second) || linksTo(nameLinks, first, second);
            boolean link21 = linksTo(realLinks, second, first) || linksTo(nameLinks, second, first);
            addDirected(values, booleanToDouble(link12), booleanToDouble(link21));
            addLinkOverlapFeatures(values, realLinks.get(first), realLinks.get(second));
        }
        // Drop the references that are no longer needed before building the
        // reverse map, so the garbage collector may reclaim them.
        nameLinks = null;
        links = null;
        logMemory("making reverse link map");
        Map<String, Collection<String>> reversedRealLinks = makeReversedLinks(realLinks);
        realLinks = null;
        logMemory("done with reverse link map");
        features.add("incoming link similarity");
        features.add("incoming link ratio 1 to 2");
        features.add("incoming link ratio 2 to 1");
        for (Map.Entry<Pair<String>, Pair<List<String>>> entry : map.entrySet()) {
            String first = entry.getKey().getFirst();
            String second = entry.getKey().getSecond();
            addLinkOverlapFeatures(entry.getValue(), reversedRealLinks.get(first), reversedRealLinks.get(second));
        }
    }

    /** Returns whether {@code links} records a link from {@code from} to {@code to}. */
    private static boolean linksTo(Map<String, Collection<String>> links, String from, String to) {
        Collection<String> targets = links.get(from);
        return targets != null && targets.contains(to);
    }

    /** Appends the similarity of two link sets (symmetric) followed by their two overlap ratios (directed). */
    private static void addLinkOverlapFeatures(Pair<List<String>> values, Collection<String> links1, Collection<String> links2) {
        addSymmetric(values, getLinkSimilarity(links1, links2));
        addDirected(values, getLinkRatio(links1, links2), getLinkRatio(links2, links1));
    }

    /** Adds the features derived from the concept names alone and registers their names. */
    private static void addSimpleFeatures(Map<Pair<String>, Pair<List<String>>> map, Collection<String> features) {
        features.add("word count difference");
        features.add("word overlap");
        for (Map.Entry<Pair<String>, Pair<List<String>>> entry : map.entrySet()) {
            Pair<String> pair = entry.getKey();
            addSymmetric(entry.getValue(), getWordNumDifference(pair));
            addSymmetric(entry.getValue(), getWordOverlap(pair));
        }
    }

    /**
     * Counts the (token of first, token of second) combinations that are equal.
     * Tokens are split on single whitespace characters; a repeated token is
     * counted once per matching combination.
     */
    private static double getWordOverlap(Pair<String> pair) {
        String[] tokens1 = pair.getFirst().split("\\s");
        String[] tokens2 = pair.getSecond().split("\\s");
        int overlap = 0;
        for (String token1 : tokens1) {
            for (String token2 : tokens2) {
                if (token1.equals(token2)) {
                    ++overlap;
                }
            }
        }
        return overlap;
    }

    /** Returns the absolute difference between the whitespace-split token counts of the two names. */
    private static double getWordNumDifference(Pair<String> pair) {
        int numWords1 = pair.getFirst().split("\\s").length;
        int numWords2 = pair.getSecond().split("\\s").length;
        return Math.abs(numWords1 - numWords2);
    }

    /**
     * Returns the fraction of {@code links1} that also occurs in {@code links2},
     * or 0 when either collection is {@code null} or {@code links1} is empty.
     */
    private static double getLinkRatio(Collection<String> links1, Collection<String> links2) {
        if (links1 == null || links2 == null || links1.isEmpty()) {
            return 0.0;
        }
        int shared = 0;
        for (String link : links1) {
            if (links2.contains(link)) {
                ++shared;
            }
        }
        return (double)shared / (double)links1.size();
    }

    /**
     * Returns the similarity of two link collections, each turned into a vector
     * with weight 1.0 per link, or 0 when either collection is {@code null}.
     */
    private static double getLinkSimilarity(Collection<String> links1, Collection<String> links2) {
        if (links1 == null || links2 == null) {
            return 0.0;
        }
        return PAIR_SIMILARITY.getValue(buildVector(links1), buildVector(links2));
    }

    /** Builds a vector that maps every link to the value 1.0. */
    private static FeatureVector buildVector(Collection<String> links) {
        FeatureVectorImpl vector = new FeatureVectorImpl();
        for (String link : links) {
            vector.set(new NumericFeature(link), 1.0);
        }
        return vector;
    }

    /** Inverts a concept -> links map into a link -> linking-concepts map. */
    private static Map<String, Collection<String>> makeReversedLinks(Map<String, Collection<String>> links) {
        Map<String, Collection<String>> reversed = new HashMap<String, Collection<String>>();
        for (Map.Entry<String, Collection<String>> entry : links.entrySet()) {
            for (String link : entry.getValue()) {
                linkSet(reversed, link).add(entry.getKey());
            }
        }
        return reversed;
    }

    /** Returns the set stored under {@code key}, creating and registering an empty one if absent. */
    private static Collection<String> linkSet(Map<String, Collection<String>> links, String key) {
        Collection<String> set = links.get(key);
        if (set == null) {
            set = new HashSet<String>();
            links.put(key, set);
        }
        return set;
    }

    /** Maps {@code true} to 1.0 and {@code false} to 0.0. */
    private static double booleanToDouble(boolean b) {
        return b ? 1.0 : 0.0;
    }

    /**
     * Returns the similarity of the two concepts' vectors, or 0 when a vector is
     * missing for either concept. A concept without its own vector falls back to
     * the vector of its redirect target.
     */
    private static double getSimilarity(Pair<String> pair, Map<String, FeatureVector> vectors, Map<String, String> redirects) {
        FeatureVector vector1 = resolveVector(pair.getFirst(), vectors, redirects);
        FeatureVector vector2 = resolveVector(pair.getSecond(), vectors, redirects);
        if (vector1 == null || vector2 == null) {
            return 0.0;
        }
        return PAIR_SIMILARITY.getValue(vector1, vector2);
    }

    /** Looks up a concept's vector, following one redirect if it has none of its own; may return {@code null}. */
    private static FeatureVector resolveVector(String concept, Map<String, FeatureVector> vectors, Map<String, String> redirects) {
        FeatureVector vector = vectors.get(concept);
        if (vector == null && redirects.containsKey(concept)) {
            vector = vectors.get(redirects.get(concept));
        }
        return vector;
    }

    /**
     * Reads {@code wiki.redirect} (two tab-separated names per line) and keeps
     * every redirect whose source or target is already in {@code conceptSet}.
     * Lines without exactly two fields are skipped.
     *
     * <p>Side effect: both names of every kept redirect are added to
     * {@code conceptSet}, so later lines can match names added by earlier ones.
     *
     * @return normalized source name -> normalized target name
     */
    private static Map<String, String> readRedirects(Collection<String> conceptSet, String wikipediaDir) {
        Map<String, String> map = new HashMap<String, String>();
        for (String line : Utils.readLinesDynamically(new File(wikipediaDir, "wiki.redirect").getAbsolutePath())) {
            String[] tokens = line.split("\\t");
            if (tokens.length != 2) {
                continue;
            }
            String word1 = fix(tokens[0]);
            String word2 = fix(tokens[1]);
            if (!conceptSet.contains(word1) && !conceptSet.contains(word2)) {
                continue;
            }
            conceptSet.add(word1);
            conceptSet.add(word2);
            map.put(word1, word2);
        }
        return map;
    }

    /**
     * Reads {@code wiki.vector} and returns the vectors of all concepts in
     * {@code conceptSet}. Each line is a concept name followed by tab-separated
     * {@code word:#:value} entries; lines without any entry are skipped.
     *
     * @throws NumberFormatException if an entry's value is not a parsable double
     */
    private static Map<String, FeatureVector> readVectors(Collection<String> conceptSet, String wikipediaDir) {
        Map<String, FeatureVector> vectors = new HashMap<String, FeatureVector>();
        for (String line : Utils.readLinesDynamically(new File(wikipediaDir, "wiki.vector").getAbsolutePath())) {
            String[] tokens = line.split("\\t");
            if (tokens.length < 2) {
                continue;
            }
            String concept = fix(tokens[0]);
            if (!conceptSet.contains(concept)) {
                continue;
            }
            FeatureVectorImpl vector = new FeatureVectorImpl();
            for (int i = 1; i < tokens.length; ++i) {
                String[] wordAndValue = tokens[i].split(":#:");
                if (wordAndValue.length < 2) {
                    // Entry without a value (e.g. an empty field between two tabs):
                    // skip it, as the other readers skip malformed input.
                    continue;
                }
                vector.set(new StringFeature(wordAndValue[0]), Double.parseDouble(wordAndValue[1]));
            }
            vectors.put(concept, vector);
        }
        return vectors;
    }

    /**
     * Reads {@code wiki.link} (tab-separated concept, real link, name link) and
     * collects the links of every concept in {@code conceptSet}. Lines with fewer
     * than three fields are skipped. Memory usage is logged every million
     * accepted lines.
     *
     * @return pair of (concept -> real links, concept -> name links); a name link
     *         is recorded only when it differs from the line's real link
     */
    private static Pair<Map<String, Collection<String>>> readLinks(Collection<String> conceptSet, String wikipediaDir) {
        Map<String, Collection<String>> realLinks = new HashMap<String, Collection<String>>();
        Map<String, Collection<String>> nameLinks = new HashMap<String, Collection<String>>();
        int c = 0;
        for (String line : Utils.readLinesDynamically(new File(wikipediaDir, "wiki.link").getAbsolutePath())) {
            String[] tokens = line.split("\\t");
            if (tokens.length < 3) {
                continue;
            }
            String concept = fix(tokens[0]);
            if (!conceptSet.contains(concept)) {
                continue;
            }
            String realLink = fix(tokens[1]);
            String nameLink = fix(tokens[2]);
            linkSet(realLinks, concept).add(realLink);
            if (!realLink.equals(nameLink)) {
                // Previously this looked the set up in realLinks, which merged the
                // name links into the real links and left nameLinks empty.
                linkSet(nameLinks, concept).add(nameLink);
            }
            if (c % 1000000 == 0) {
                logMemory("links - " + c);
            }
            ++c;
        }
        return new Pair<Map<String, Collection<String>>>(realLinks, nameLinks);
    }

    /** Normalizes a title and removes every non-empty parenthesized part, then trims. */
    private static String fix(String token) {
        return PARENTHESIZED.matcher(normalize(token)).replaceAll("").trim();
    }

    /**
     * Removes diacritics, lower-cases, replaces underscores with spaces and trims.
     * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the
     * machine's default locale.
     */
    private static String normalize(String word) {
        String stripped = CharacterUtils.removeDiacritics(word);
        return stripped.toLowerCase(Locale.ROOT).replace('_', ' ').trim();
    }
}

