/*
 * Decompiled with CFR 0.152.
 */
package com.hughes.android.dictionary.engine;

import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import org.apache.xerces.jaxp.SAXParserFactoryImpl;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Splits Wiktionary XML dumps into per-selector section files.
 *
 * <p>For every wiki code listed in {@code WiktionaryLangs.wikiCodeToIsoCodeToWikiName} the input
 * {@code data/inputs/<code>wiktionary-pages-articles.xml} is parsed with SAX. For each page, every
 * heading line that matches a selector's pattern has its section (heading up to the next heading
 * of the same or a shallower depth) appended to that selector's output file
 * {@code data/inputs/wikiSplit/<code>/<key>.data}.
 *
 * <p>Each record in an output file is written as: {@code writeUTF(title)},
 * {@code writeUTF(heading)}, {@code writeInt(byteLength)}, followed by the UTF-8 bytes of the
 * section text.
 *
 * <p>The instance keeps per-parse state in mutable fields and is its own SAX handler.
 */
public class WiktionarySplitter
extends DefaultHandler {
    /** Matches one heading line: a run of {@code =} (group 1, the depth) followed by a non-{@code =}. */
    static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);

    /**
     * Pages whose title starts with one of these prefixes are skipped without any output.
     * NOTE(review): "utiliusateur:" looks like a typo of "Utilisateur:" -- kept as-is, confirm.
     */
    private static final String[] SKIPPED_TITLE_PREFIXES = new String[]{
        "Wiktionary:", "Appendix:", "Help:", "Index:", "MediaWiki:", "Citations:",
        "Concordance:", "Glossary:", "Rhymes:", "Category:", "Wikisaurus:",
        "Unsupported titles/", "Transwiki:", "File:", "Thread:", "Template:", "Summary:",
        "Datei:", "Verzeichnis:", "Vorlage:", "Thesaurus:", "Kategorie:", "Hilfe:",
        "Annexe:", "Cat\u00e9gori:", "Mod\u00e8le:", "Th\u00e9saurus:", "Projet:", "Aide:",
        "Fichier:", "Wiktionnaire:", "Cat\u00e9gorie:", "Portail:", "utiliusateur:",
        "Kategorio:", "Wikizionario:", "Appendice:", "Categoria:", "Aiuto:",
    };

    /** Input dump path mapped to the selectors applied to it, in insertion order. */
    final Map<String, List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
    /** Selectors of the dump currently being parsed; set by {@link #go()}. */
    List<Selector> currentSelectors = null;
    /** Accumulates the {@code <title>} text of the current page. */
    StringBuilder titleBuilder;
    /** Accumulates the {@code <text>} body of the current page; starts with a newline. */
    StringBuilder textBuilder;
    /** Builder receiving character data for the element being read, or null to discard it. */
    StringBuilder currentBuilder = null;
    /** Title of the most recently completed page; used only for diagnostics. */
    String lastPageTitle = null;
    /** Number of pages completed so far, across all dumps. */
    int pageCount = 0;

    /**
     * Command-line entry point; splits every configured dump.
     *
     * @param args ignored
     * @throws Exception if parsing or writing fails
     */
    public static void main(String[] args) throws Exception {
        WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
        wiktionarySplitter.go();
    }

    /**
     * Builds the selector table from {@code WiktionaryLangs.wikiCodeToIsoCodeToWikiName} and
     * creates the output directory for each wiki code.
     */
    private WiktionarySplitter() {
        for (String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
            List<Selector> selectors = new ArrayList<Selector>();
            this.pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
            // The output directory depends only on the wiki code, so create it once per code.
            String dir = String.format("data/inputs/wikiSplit/%s", code);
            new File(dir).mkdirs();
            for (Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
                selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
            }
        }
    }

    /**
     * Parses every configured dump in turn, writing matched sections to the selectors' files.
     * Output streams opened for a dump are always closed, whether or not parsing succeeds.
     *
     * @throws Exception the first failure from opening, parsing, or (on success) closing
     */
    private void go() throws Exception {
        SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
        for (Map.Entry<String, List<Selector>> pathToSelectorsEntry : this.pathToSelectors.entrySet()) {
            this.currentSelectors = pathToSelectorsEntry.getValue();
            boolean succeeded = false;
            try {
                for (Selector selector : this.currentSelectors) {
                    selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
                }
                try {
                    parser.parse(new File(pathToSelectorsEntry.getKey()), (DefaultHandler)this);
                }
                catch (Exception e) {
                    // titleBuilder is still null if the failure happens before the first <page>;
                    // plain concatenation prints "null" instead of throwing and hiding e.
                    System.err.println("Exception during parse, lastPageTitle=" + this.lastPageTitle + ", titleBuilder=" + this.titleBuilder);
                    throw e;
                }
                succeeded = true;
            }
            finally {
                // Only surface a close failure when nothing else went wrong, so that it can
                // never replace the exception that is already propagating.
                WiktionarySplitter.closeOutputs(this.currentSelectors, succeeded);
            }
        }
    }

    /**
     * Closes every output stream that was opened for the given selectors.
     *
     * @param selectors selectors whose {@code out} streams should be closed; unopened ones are skipped
     * @param propagateFailure whether the first close failure should be rethrown
     * @throws IOException the first close failure, if {@code propagateFailure} is true
     */
    private static void closeOutputs(List<Selector> selectors, boolean propagateFailure) throws IOException {
        IOException firstFailure = null;
        for (Selector selector : selectors) {
            if (selector.out == null) {
                continue;
            }
            try {
                selector.out.close();
            }
            catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (propagateFailure && firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Returns whether a page title starts with one of {@link #SKIPPED_TITLE_PREFIXES}.
     */
    private static boolean isSkippedTitle(String title) {
        for (String prefix : SKIPPED_TITLE_PREFIXES) {
            if (title.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the offset in {@code text} of the next heading whose depth is at most {@code depth},
     * or {@code text.length()} if there is none.
     */
    private static int findSectionEnd(String text, int depth) {
        Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
        Matcher endMatcher = endPattern.matcher(text);
        return endMatcher.find() ? endMatcher.start() : text.length();
    }

    /**
     * Appends one section record to the selector's output stream.
     *
     * @throws RuntimeException wrapping any {@link IOException} from the stream
     */
    private static void writeSection(Selector selector, Section section) {
        try {
            selector.out.writeUTF(section.title);
            selector.out.writeUTF(section.heading);
            byte[] bytes = section.text.getBytes("UTF8");
            selector.out.writeInt(bytes.length);
            selector.out.write(bytes);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Handles a completed page: records its title, logs progress every 1000 pages, skips pages
     * with an excluded title prefix, and writes each heading's section to every selector whose
     * pattern matches that heading.
     */
    private void endPage() {
        String title = this.titleBuilder.toString();
        this.lastPageTitle = title;
        if (++this.pageCount % 1000 == 0) {
            System.out.println("endPage: " + title + ", count=" + this.pageCount);
        }
        if (WiktionarySplitter.isSkippedTitle(title)) {
            return;
        }
        if (title.contains(":") && !title.startsWith("Sign gloss:")) {
            System.err.println("title with colon: " + title);
        }
        String text = this.textBuilder.toString();
        while (text.length() > 0) {
            Matcher startMatcher = headingStart.matcher(text);
            if (!startMatcher.find()) {
                return;
            }
            String heading = startMatcher.group();
            int depth = startMatcher.group(1).length();
            // Drop everything up to and including the heading line.
            text = text.substring(startMatcher.end());

            // The section is located once and shared by all matching selectors; the text is
            // advanced only after the selector loop so each of them sees the same section.
            int end = -1;
            String sectionText = null;
            for (Selector selector : this.currentSelectors) {
                if (!selector.pattern.matcher(heading).find()) {
                    continue;
                }
                if (sectionText == null) {
                    end = WiktionarySplitter.findSectionEnd(text, depth);
                    sectionText = text.substring(0, end);
                }
                WiktionarySplitter.writeSection(selector, new Section(title, heading, sectionText));
            }
            if (sectionText != null) {
                text = text.substring(end);
            }
        }
    }

    /**
     * Starts fresh title/text builders on {@code <page>} and routes character data to the title
     * or text builder on {@code <title>} / {@code <text>}; data of any other element is discarded.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        this.currentBuilder = null;
        if ("page".equals(qName)) {
            this.titleBuilder = new StringBuilder();
            // Leading newline lets the multiline heading pattern treat the first line like any other.
            this.textBuilder = new StringBuilder("\n");
        } else if ("title".equals(qName)) {
            this.currentBuilder = this.titleBuilder;
        } else if ("text".equals(qName)) {
            this.currentBuilder = this.textBuilder;
        }
    }

    /**
     * Appends character data to the active builder, if any. SAX may deliver one element's text in
     * several calls, so the data is accumulated rather than assigned.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (this.currentBuilder != null) {
            this.currentBuilder.append(ch, start, length);
        }
    }

    /**
     * Stops collecting character data and, on {@code </page>}, processes the finished page.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        this.currentBuilder = null;
        if ("page".equals(qName)) {
            this.endPage();
        }
    }

    /**
     * Parses a single file with a new parser, using this instance as the handler.
     *
     * <p>NOTE(review): unlike {@link #go()}, this does not set {@code currentSelectors} or open
     * any selector output, both of which page handling dereferences -- presumably the caller is
     * expected to have arranged that; confirm before relying on this method.
     *
     * @param file the XML file to parse
     * @throws ParserConfigurationException if a parser cannot be created
     * @throws SAXException if the XML cannot be parsed
     * @throws IOException if the file cannot be read
     */
    public void parse(File file) throws ParserConfigurationException, SAXException, IOException {
        SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
        parser.parse(file, (DefaultHandler)this);
    }

    /** Immutable holder for one extracted section: page title, heading line, and section body. */
    static class Section
    implements Serializable {
        private static final long serialVersionUID = -7676549898325856822L;
        final String title;
        final String heading;
        final String text;

        /**
         * @param title title of the page the section came from
         * @param heading the full heading line that opened the section
         * @param text the section body following the heading line
         */
        public Section(String title, String heading, String text) {
            this.title = title;
            this.heading = heading;
            this.text = text;
        }
    }

    /** Pairs an output file with the case-insensitive heading pattern that selects sections for it. */
    static class Selector {
        final String outFilename;
        final Pattern pattern;
        /** Output stream for {@link #outFilename}; null until opened by the splitter. */
        DataOutputStream out;

        /**
         * @param filename path of the output file
         * @param pattern regular expression searched for, ignoring case, within heading lines
         */
        public Selector(String filename, String pattern) {
            this.outFilename = filename;
            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
        }
    }
}

