/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import org.apache.commons.lang.StringUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
import org.languagetool.rules.ConfusionSet;
import org.languagetool.rules.ConfusionSetLoader;
import org.languagetool.rules.ConfusionString;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;

/**
 * Command line tool that prints XML pattern rules for confusion pairs
 * (homophones) to standard output, based on a tab-separated file of
 * 3-gram occurrence counts. Progress, warnings and statistics go to
 * standard error so that standard output only contains the XML.
 */
public class RuleCreator {
    private static final boolean XML_MODE = true;

    /** {@code printf} template for one generated rule; filled in by {@link #printRule}. */
    private static final String RULE_TEMPLATE =
            "  <rule case_sensitive='yes'>\n"
            + "    <!-- auto-generated, error probability: %.3f, correct phrase occurrences: %d -->\n"
            + "    <pattern>\n"
            + "      <token>%s</token>\n"
            + "      <marker><token>%s</token></marker>\n"
            + "      <token>%s</token>\n"
            + "    </pattern>\n"
            + "    <message>Did you mean <suggestion>%s</suggestion>?</message>\n"
            + "    <example type='incorrect'>%s</example>\n"
            + "    <example type='correct'>%s</example>\n"
            + "  </rule>\n\n";

    /** Maps the first column of the input file to all occurrence entries read for it. */
    private final Map<String, List<OccurrenceInfo>> occurrenceInfos = new HashMap<String, List<OccurrenceInfo>>();
    /** Maps an ngram (third column of the input file) to its occurrence count. */
    private final Map<String, Long> ngramToOccurrence = new HashMap<String, Long>();
    private final WordTokenizer wordTokenizer = new English().getWordTokenizer();
    private final float minErrorProb;
    private int ruleCount = 0;
    private int tokenFilteredRules = 0;
    private int probFilteredRules = 0;

    /**
     * @param minErrorProb rules whose computed error probability is below this value are not printed
     */
    public RuleCreator(float minErrorProb) {
        this.minErrorProb = minErrorProb;
    }

    /**
     * Loads the confusion sets and the occurrence file, then prints one rule group per
     * confusion set entry for which occurrence data exists.
     *
     * @param homophoneOccurrences tab-separated occurrence file, see {@link #initMaps}
     * @param homophonePath resource path of the confusion set file
     * @throws IOException if the confusion sets or the occurrence file cannot be read
     */
    private void run(File homophoneOccurrences, String homophonePath) throws IOException {
        ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
        Map<String, List<ConfusionSet>> confusionSetMap;
        // Close the resource stream once the sets are loaded instead of leaking it.
        try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath)) {
            confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
        }
        this.initMaps(homophoneOccurrences);
        int groupCount = 0;
        System.out.println("<rules lang='en'>\n");
        System.out.println("<category name='Auto-generated rules'>\n");
        for (Map.Entry<String, List<ConfusionSet>> entry : confusionSetMap.entrySet()) {
            String word = entry.getKey();
            List<ConfusionSet> confusionSets = entry.getValue();
            System.err.println(" === " + entry + " === ");
            if (confusionSets.size() > 1) {
                System.err.println("WARN: will use only first pair of " + confusionSets.size() + ": " + confusionSets.get(0));
            }
            List<OccurrenceInfo> infos = this.occurrenceInfos.get(word);
            if (infos == null) {
                System.err.println("Could not find occurrence infos for '" + word + "', skipping");
                continue;
            }
            // Only the first confusion set of an entry is used (see warning above).
            ConfusionSet confusionSet = confusionSets.get(0);
            // The set holds ConfusionString objects, so the word itself has to be removed by
            // comparing its string value; remove(word) with a String would never match.
            HashSet<ConfusionString> cleanSet = new HashSet<ConfusionString>(confusionSet.getSet());
            for (ConfusionString candidate : confusionSet.getSet()) {
                if (candidate.getString().equals(word)) {
                    cleanSet.remove(candidate);
                }
            }
            String name = StringUtils.join(cleanSet, "/") + " -> " + word;
            System.out.println("<rulegroup id='R" + groupCount + "' name=\"" + StringTools.escapeXML(name) + "\">\n");
            ++groupCount;
            for (OccurrenceInfo occurrenceInfo : infos) {
                String[] parts = occurrenceInfo.ngram.split(" ");
                for (ConfusionString variant : confusionSet.getSet()) {
                    if (variant.getString().equals(word)) {
                        continue;
                    }
                    this.printRule(occurrenceInfo, parts, variant.getString());
                }
            }
            System.out.println("</rulegroup>\n");
        }
        System.out.println("</category>");
        System.out.println("</rules>");
        System.err.println("Done. Wrote " + this.ruleCount + " rules.");
        System.err.println("Rules ignored because of different tokenization: " + this.tokenFilteredRules);
        System.err.println("Rules ignored because of error probability limit (" + this.minErrorProb + "): " + this.probFilteredRules);
    }

    /**
     * Prints a single rule that matches the ngram with its middle word replaced by
     * {@code variant} and suggests the original middle word instead. The rule is skipped
     * (and counted as filtered) if the variant phrase tokenizes unexpectedly or if its
     * error probability is below the configured minimum.
     *
     * @param occurrenceInfo the ngram containing the correct word and its occurrence count
     * @param parts the ngram split at spaces; indexes 0, 1 and 2 are read
     * @param variant the word that replaces {@code parts[1]} in the pattern
     */
    private void printRule(OccurrenceInfo occurrenceInfo, String[] parts, String variant) {
        String term = parts[1];
        String termPhrase = parts[0] + " " + parts[1] + " " + parts[2];
        String variantPhrase = parts[0] + " " + variant + " " + parts[2];
        List<String> tokens = this.wordTokenizer.tokenize(variantPhrase);
        // NOTE(review): 5 rather than 3 - presumably the tokenizer also returns the two
        // whitespace separators as tokens; verify against WordTokenizer.
        if (tokens.size() != 5) {
            System.err.println("Skipping '" + variantPhrase + "', does not tokenize to 3 tokens: " + tokens);
            ++this.tokenFilteredRules;
            return;
        }
        Long variantOccObj = this.ngramToOccurrence.get(variantPhrase);
        long variantOcc = variantOccObj != null ? variantOccObj : 0L;
        long totalOcc = occurrenceInfo.occurrence + variantOcc;
        // Share of the variant phrase among both phrases; the rarer the variant phrase,
        // the higher the error probability. If both counts are 0 this is NaN, which is
        // not smaller than any limit, so the rule is still printed.
        float variantProb = (float) variantOcc / (float) totalOcc;
        float variantErrorProb = 1.0f - variantProb;
        if (variantErrorProb < this.minErrorProb) {
            System.err.println("Skipping '" + variantPhrase + "', error probability too low: " + variantErrorProb + " < " + this.minErrorProb);
            ++this.probFilteredRules;
            return;
        }
        System.out.printf(Locale.ENGLISH, RULE_TEMPLATE,
                Float.valueOf(variantErrorProb),
                occurrenceInfo.occurrence,
                StringTools.escapeXML(parts[0]),
                StringTools.escapeXML(variant),
                StringTools.escapeXML(parts[2]),
                StringTools.escapeXML(term),
                StringTools.escapeXML(variantPhrase),
                StringTools.escapeXML(termPhrase));
        ++this.ruleCount;
    }

    /**
     * Reads the occurrence file into {@link #occurrenceInfos} and {@link #ngramToOccurrence}.
     * Every line must have exactly three tab-separated columns: key, occurrence count, ngram.
     *
     * @param homophoneOccurrenceFile the file to read
     * @throws FileNotFoundException if the file cannot be opened
     * @throws RuntimeException if a line does not have exactly three columns
     * @throws NumberFormatException if the count column is not a valid {@code long}
     */
    private void initMaps(File homophoneOccurrenceFile) throws FileNotFoundException {
        try (Scanner s = new Scanner(homophoneOccurrenceFile)) {
            while (s.hasNextLine()) {
                String line = s.nextLine();
                String[] parts = line.split("\t");
                if (parts.length != 3) {
                    throw new RuntimeException("Unexpected format: '" + line + "'");
                }
                // Counts are kept as long, so parse them as long: Integer.parseInt would
                // reject values above Integer.MAX_VALUE.
                long occurrenceCount = Long.parseLong(parts[1]);
                OccurrenceInfo occurrenceInfo = new OccurrenceInfo(parts[2], occurrenceCount);
                List<OccurrenceInfo> list = this.occurrenceInfos.get(parts[0]);
                if (list == null) {
                    list = new ArrayList<OccurrenceInfo>();
                    this.occurrenceInfos.put(parts[0], list);
                }
                list.add(occurrenceInfo);
                this.ngramToOccurrence.put(parts[2], occurrenceCount);
            }
        }
    }

    /**
     * Entry point. Expects the occurrence file as first argument and optionally the
     * minimal error probability as second argument (defaults to 0.0). Prints usage and
     * exits with status 1 for any other number of arguments.
     *
     * @param args command line arguments
     * @throws IOException if reading the input fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1 || args.length > 2) {
            System.out.println("Usage: " + RuleCreator.class.getSimpleName() + " <homophoneResultFile> [minErrorProbability]");
            System.out.println("    homophoneResultFile   the output of org.languagetool.dev.HomophoneOccurrenceDumper");
            System.out.println("    minErrorProbability   the minimal error probability (0.0-1.0), other rules will be ignored");
            System.exit(1);
        }
        float minErrorProb = args.length >= 2 ? Float.parseFloat(args[1]) : 0.0f;
        RuleCreator creator = new RuleCreator(minErrorProb);
        creator.run(new File(args[0]), "/en/confusion_sets_subset.txt");
    }

    /** An ngram together with its occurrence count, as read from the occurrence file. */
    static class OccurrenceInfo {
        private final String ngram;
        private final long occurrence;

        OccurrenceInfo(String ngram, long occurrence) {
            this.ngram = ngram;
            this.occurrence = occurrence;
        }

        @Override
        public String toString() {
            return this.ngram + "/" + this.occurrence;
        }
    }
}

