package org.omegat.tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.omegat.core.Core;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.gui.comments.ICommentProvider;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;

/* loaded from: input_file:org/omegat/tokenizer/BaseTokenizer.class */
public abstract class BaseTokenizer implements ITokenizer {
    /** Initial capacity of the lists that collect tokens while tokenizing. */
    protected static final int DEFAULT_TOKENS_COUNT = 64;

    /** Cache of {@code tokenizeWords} results for {@code StemmingMode.NONE}, keyed by input text. */
    private final Map<String, Token[]> tokenCacheNone = new ConcurrentHashMap<>(2500);

    /** Cache of {@code tokenizeWords} results for {@code StemmingMode.MATCHING}, keyed by input text. */
    private final Map<String, Token[]> tokenCacheMatching = new ConcurrentHashMap<>(2500);

    /** Cache of {@code tokenizeWords} results for {@code StemmingMode.GLOSSARY}, keyed by input text. */
    private final Map<String, Token[]> tokenCacheGlossary = new ConcurrentHashMap<>(2500);

    /**
     * Selects the strategy of the verbatim tokenizers: when {@code true} (the default) they split
     * the text with a {@code WordIterator}; when {@code false} they go through the token stream
     * returned by {@code getTokenStream} with every filter switched off.
     */
    protected boolean shouldDelegateTokenizeExactly = true;

    /** Shared empty result for the {@code String[]}-returning tokenizers. */
    protected static final String[] EMPTY_STRING_LIST = new String[0];

    /** Shared empty result for the {@code Token[]}-returning tokenizers. */
    protected static final Token[] EMPTY_TOKENS_LIST = new Token[0];
    /**
     * Comment provider that renders the debug report produced by {@code test(String...)} of the
     * current project's source tokenizer for the source text of the given entry.
     * The source tokenizer is cast to {@code BaseTokenizer} without a type check.
     */
    public static final ICommentProvider TOKENIZER_DEBUG_PROVIDER = new ICommentProvider() {
        @Override // org.omegat.gui.comments.ICommentProvider
        public String getComment(SourceTextEntry sourceTextEntry) {
            // Resolve the tokenizer first, then the text, exactly as a single chained call would.
            final BaseTokenizer sourceTokenizer = (BaseTokenizer) Core.getProject().getSourceTokenizer();
            final String sourceText = sourceTextEntry.getSrcText();
            return sourceTokenizer.test(sourceText);
        }
    };

    /**
     * Tokenizes {@code str} into word tokens and caches the result per stemming mode.
     *
     * <p>The mode selects the flags handed to {@code tokenize}: stems are allowed for
     * {@code GLOSSARY} and {@code MATCHING}, stop words are allowed for {@code MATCHING} only,
     * tokens containing digits are filtered for every mode except {@code GLOSSARY}, and
     * whitespace-only tokens are always filtered.
     *
     * @param str text to tokenize; {@code null} yields an empty array
     * @param stemmingMode mode selecting both the cache and the tokenizing flags
     * @return the tokens found; a cached array is returned as-is, so callers must not modify it
     * @throws RuntimeException if there is no cache for {@code stemmingMode}
     */
    @Override // org.omegat.tokenizer.ITokenizer
    public Token[] tokenizeWords(String str, ITokenizer.StemmingMode stemmingMode) {
        final Map<String, Token[]> cache;
        switch (stemmingMode) {
            case NONE:
                cache = this.tokenCacheNone;
                break;
            case GLOSSARY:
                cache = this.tokenCacheGlossary;
                break;
            case MATCHING:
                cache = this.tokenCacheMatching;
                break;
            default:
                throw new RuntimeException("No cache for specified stemming mode");
        }
        // ConcurrentHashMap rejects null keys, so a null text must never reach the cache lookup.
        if (str == null) {
            return EMPTY_TOKENS_LIST;
        }
        final Token[] cached = cache.get(str);
        if (cached != null) {
            return cached;
        }
        final boolean glossary = stemmingMode == ITokenizer.StemmingMode.GLOSSARY;
        final boolean matching = stemmingMode == ITokenizer.StemmingMode.MATCHING;
        final Token[] computed = tokenize(str, glossary || matching, matching, !glossary, true);
        cache.put(str, computed);
        return computed;
    }

    /**
     * Tokenizes {@code str} into word strings using the flags implied by the stemming mode.
     * Results are not cached.
     *
     * @param str text to tokenize
     * @param stemmingMode {@code GLOSSARY} and {@code MATCHING} allow stems, only {@code MATCHING}
     *        allows stop words, and every mode except {@code GLOSSARY} filters tokens with digits
     * @return the token strings produced by {@code tokenizeToStrings}
     */
    @Override // org.omegat.tokenizer.ITokenizer
    public String[] tokenizeWordsToStrings(String str, ITokenizer.StemmingMode stemmingMode) {
        final boolean glossary = stemmingMode == ITokenizer.StemmingMode.GLOSSARY;
        final boolean matching = stemmingMode == ITokenizer.StemmingMode.MATCHING;
        final boolean allowStems = glossary || matching;
        final boolean filterDigits = !glossary;
        return tokenizeToStrings(str, allowStems, matching, filterDigits, true);
    }

    /**
     * Splits {@code str} into tokens without filtering anything out.
     *
     * <p>With {@code shouldDelegateTokenizeExactly} set, the text is cut at every boundary reported
     * by a {@code WordIterator}, and each token records the offset at which it starts. Otherwise
     * the work is handed to {@code tokenize} with all flags off.
     *
     * @param str text to split
     * @return the tokens in order of appearance, or {@code EMPTY_TOKENS_LIST} for empty input
     */
    @Override // org.omegat.tokenizer.ITokenizer
    public Token[] tokenizeVerbatim(String str) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_TOKENS_LIST;
        }
        if (!this.shouldDelegateTokenizeExactly) {
            return tokenize(str, false, false, false, false);
        }
        final ArrayList<Token> tokens = new ArrayList<Token>(DEFAULT_TOKENS_COUNT);
        final WordIterator boundaries = new WordIterator();
        boundaries.setText(str);
        int start = boundaries.first();
        // The iterator reports -1 once there are no more boundaries.
        for (int end = boundaries.next(); end != -1; end = boundaries.next()) {
            tokens.add(new Token(str.substring(start, end), start));
            start = end;
        }
        return tokens.toArray(new Token[tokens.size()]);
    }

    /**
     * Splits {@code str} into token strings without filtering anything out.
     *
     * <p>With {@code shouldDelegateTokenizeExactly} set, the text is cut at every boundary reported
     * by a {@code WordIterator}. Otherwise the work is handed to {@code tokenizeToStrings} with all
     * flags off.
     *
     * @param str text to split
     * @return the pieces in order of appearance, or {@code EMPTY_STRING_LIST} for empty input
     */
    @Override // org.omegat.tokenizer.ITokenizer
    public String[] tokenizeVerbatimToStrings(String str) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }
        if (!this.shouldDelegateTokenizeExactly) {
            return tokenizeToStrings(str, false, false, false, false);
        }
        final ArrayList<String> pieces = new ArrayList<String>(DEFAULT_TOKENS_COUNT);
        final WordIterator boundaries = new WordIterator();
        boundaries.setText(str);
        int start = boundaries.first();
        // The iterator reports -1 once there are no more boundaries.
        for (int end = boundaries.next(); end != -1; end = boundaries.next()) {
            pieces.add(str.substring(start, end));
            start = end;
        }
        return pieces.toArray(new String[pieces.size()]);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Produces one token per Unicode code point of {@code str}.
     *
     * @param str text to split; must not be {@code null}
     * @return one token per code point, each carrying the {@code char} index at which it starts
     */
    public Token[] tokenizeByCodePoint(String str) {
        final int length = str.length();
        final Token[] tokens = new Token[str.codePointCount(0, length)];
        int slot = 0;
        for (int offset = 0; offset < length;) {
            // A supplementary code point spans two chars, everything else one.
            final int width = Character.charCount(str.codePointAt(offset));
            tokens[slot++] = new Token(str.substring(offset, offset + width), offset);
            offset += width;
        }
        return tokens;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Produces one string per Unicode code point of {@code str}.
     *
     * @param str text to split; must not be {@code null}
     * @return the code points of {@code str}, in order, each as its own string
     */
    public String[] tokenizeByCodePointToStrings(String str) {
        final int length = str.length();
        final String[] pieces = new String[str.codePointCount(0, length)];
        int slot = 0;
        for (int offset = 0; offset < length;) {
            // A supplementary code point spans two chars, everything else one.
            final int width = Character.charCount(str.codePointAt(offset));
            pieces[slot++] = str.substring(offset, offset + width);
            offset += width;
        }
        return pieces;
    }

    /**
     * Runs {@code str} through the stream from {@code getTokenStream} and collects the accepted
     * terms as tokens with their offset and length in the original text.
     *
     * <p>The stream is always closed, including when tokenizing fails part-way. An
     * {@link IOException} is logged and the tokens gathered up to that point are returned.
     *
     * @param str text to tokenize
     * @param z forwarded to {@code getTokenStream}; the debug report of {@code test} labels this
     *        flag "stemsAllowed"
     * @param z2 forwarded to {@code getTokenStream}; the debug report of {@code test} labels this
     *        flag "stopWordsAllowed"
     * @param z3 when {@code true}, terms containing a digit are dropped
     * @param z4 when {@code true}, terms consisting only of whitespace are dropped
     * @return the accepted tokens, or {@code EMPTY_TOKENS_LIST} for empty input
     */
    protected Token[] tokenize(String str, boolean z, boolean z2, boolean z3, boolean z4) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_TOKENS_LIST;
        }
        final ArrayList<Token> tokens = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        try (TokenStream stream = getTokenStream(str, z, z2)) {
            // addAttribute registers the attribute if necessary and returns the shared instance.
            final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                final String text = term.toString();
                if (acceptToken(text, z3, z4)) {
                    final int start = offsets.startOffset();
                    tokens.add(new Token(text, start, offsets.endOffset() - start));
                }
            }
            stream.end();
        } catch (IOException e) {
            // Best effort: keep whatever was tokenized before the failure.
            Log.log(e);
        }
        return tokens.toArray(new Token[tokens.size()]);
    }

    /**
     * Runs {@code str} through the stream from {@code getTokenStream} and collects the accepted
     * terms as strings.
     *
     * <p>When {@code z} is set, the original text span of a term is added right after the term
     * whenever the two differ after lower-casing in the locale of {@code getEffectiveLanguage()}.
     *
     * <p>The stream is always closed, including when tokenizing fails part-way. An
     * {@link IOException} is logged and the strings gathered up to that point are returned.
     *
     * @param str text to tokenize
     * @param z forwarded to {@code getTokenStream}; the debug report of {@code test} labels this
     *        flag "stemsAllowed"
     * @param z2 forwarded to {@code getTokenStream}; the debug report of {@code test} labels this
     *        flag "stopWordsAllowed"
     * @param z3 when {@code true}, terms containing a digit are dropped
     * @param z4 when {@code true}, terms consisting only of whitespace are dropped
     * @return the accepted strings, or {@code EMPTY_STRING_LIST} for empty input
     */
    protected String[] tokenizeToStrings(String str, boolean z, boolean z2, boolean z3, boolean z4) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }
        final ArrayList<String> terms = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        try (TokenStream stream = getTokenStream(str, z, z2)) {
            // addAttribute registers the attribute if necessary and returns the shared instance.
            final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
            // The locale is only needed for the comparison made when z is set.
            final Locale locale = z ? getEffectiveLanguage().getLocale() : null;
            stream.reset();
            while (stream.incrementToken()) {
                final String text = term.toString();
                if (!acceptToken(text, z3, z4)) {
                    continue;
                }
                terms.add(text);
                if (z) {
                    final String original = str.substring(offsets.startOffset(), offsets.endOffset());
                    if (!original.toLowerCase(locale).equals(text.toLowerCase(locale))) {
                        terms.add(original);
                    }
                }
            }
            stream.end();
        } catch (IOException e) {
            // Best effort: keep whatever was tokenized before the failure.
            Log.log(e);
        }
        return terms.toArray(new String[terms.size()]);
    }

    /**
     * Decides whether a term produced by the token stream is kept.
     *
     * @param str the term to check
     * @param z when {@code true}, a term containing any digit is rejected
     * @param z2 when {@code true}, a term made up only of whitespace is rejected
     * @return {@code false} for a term that {@code StringUtil.isEmpty} reports as empty or that an
     *         active filter rejects; {@code true} otherwise
     */
    private boolean acceptToken(String str, boolean z, boolean z2) {
        if (StringUtil.isEmpty(str)) {
            return false;
        }
        if (!(z || z2)) {
            // No filter is active, so there is nothing to inspect.
            return true;
        }
        boolean whitespaceOnly = true;
        for (int offset = 0; offset < str.length();) {
            final int codePoint = str.codePointAt(offset);
            if (z && Character.isDigit(codePoint)) {
                return false;
            }
            if (z2 && !StringUtil.isWhiteSpace(codePoint)) {
                whitespaceOnly = false;
            }
            offset += Character.charCount(codePoint);
        }
        return !z2 || !whitespaceOnly;
    }

    /**
     * Supplies the token stream that {@code tokenize} and {@code tokenizeToStrings} consume.
     * Both callers reset the stream, iterate it to exhaustion, and call {@code end()} and
     * {@code close()} on it.
     *
     * @param str text to tokenize; the callers in this class never pass input that
     *        {@code StringUtil.isEmpty} reports as empty
     * @param z second argument of the calling tokenize method, passed through unchanged; the debug
     *        report of {@code test} labels it "stemsAllowed"
     * @param z2 third argument of the calling tokenize method, passed through unchanged; the debug
     *        report of {@code test} labels it "stopWordsAllowed"
     * @return the stream to read terms and offsets from
     * @throws IOException if the stream cannot be created; the callers in this class log it
     */
    protected abstract TokenStream getTokenStream(String str, boolean z, boolean z2) throws IOException;

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds a Lucene {@code StandardTokenizer} that reads from {@code str}.
     *
     * @param str text the tokenizer will read
     * @return the tokenizer with its reader already set
     * @throws IOException declared by the signature; see the Lucene API for when it occurs
     */
    public TokenStream getStandardTokenStream(String str) throws IOException {
        final StringReader input = new StringReader(str);
        final StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(input);
        return tokenizer;
    }

    /**
     * Returns the languages listed in the {@code Tokenizer} annotation of the concrete class.
     *
     * @return the non-empty array of annotated languages
     * @throws RuntimeException if the annotation is missing or lists no languages
     */
    @Override // org.omegat.tokenizer.ITokenizer
    public String[] getSupportedLanguages() {
        final String[] annotated = getAnnotationLanguages();
        return annotated;
    }

    /**
     * Reads the languages from the {@code Tokenizer} annotation on the runtime class.
     *
     * @return the annotation's {@code languages()} array, guaranteed non-empty
     * @throws RuntimeException if the class carries no {@code Tokenizer} annotation at runtime or
     *         the annotation lists no languages
     */
    private String[] getAnnotationLanguages() {
        final Tokenizer annotation = getClass().getAnnotation(Tokenizer.class);
        if (annotation == null) {
            throw new RuntimeException(getClass().getName() + " must have a " + Tokenizer.class.getName() + " annotation available at runtime.");
        }
        final String[] declared = annotation.languages();
        if (declared.length != 0) {
            return declared;
        }
        throw new RuntimeException(getClass().getName() + " must have a non-empty " + Tokenizer.class.getName() + " annotation available at runtime.");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Determines the language this tokenizer works in.
     *
     * <p>If the annotation names exactly one language and that entry is not
     * {@code Tokenizer.DISCOVER_AT_RUNTIME}, that language is used. In every other case the
     * language is taken from the current project via {@code getProjectLanguage()}.
     *
     * @return the annotated language or the project language
     * @throws RuntimeException if the annotation is unusable or the project language cannot be
     *         determined
     */
    public Language getEffectiveLanguage() {
        final String[] declared = getAnnotationLanguages();
        final boolean ambiguous = declared.length != 1 || declared[0].equals(Tokenizer.DISCOVER_AT_RUNTIME);
        if (ambiguous) {
            return getProjectLanguage();
        }
        return new Language(declared[0]);
    }

    /**
     * Looks up the language of this tokenizer in the current project: the source language if this
     * instance is the project's source tokenizer, otherwise the target language if it is the
     * target tokenizer. The source role is checked first.
     *
     * @return the project's source or target language
     * @throws RuntimeException if there is no current project or this instance is neither its
     *         source nor its target tokenizer
     */
    protected Language getProjectLanguage() {
        final IProject current = Core.getProject();
        if (current == null) {
            throw new RuntimeException("This tokenizer's language can only be determined in the context of a project, but project is null.");
        }
        final boolean actsAsSource = current.getSourceTokenizer() == this;
        if (actsAsSource) {
            return current.getProjectProperties().getSourceLanguage();
        }
        final boolean actsAsTarget = current.getTargetTokenizer() == this;
        if (!actsAsTarget) {
            throw new RuntimeException("This tokenizer's language can only be determined in the context of a project, but is not assigned to current project.");
        }
        return current.getProjectProperties().getTargetLanguage();
    }

    /**
     * Builds a human-readable debug report showing how each input is tokenized under the various
     * flag combinations of {@code tokenizeToStrings}, plus the verbatim tokenization.
     *
     * <p>The "(=tokenizeWords(...))" annotations mark the flag combinations that
     * {@code tokenizeWords} uses for the respective stemming mode. {@code GLOSSARY} does not filter
     * digits, so it corresponds to the plain "stemsAllowed" line.
     *
     * @param strArr the texts to report on
     * @return the report, starting with the name of the concrete tokenizer class
     */
    protected String test(String... strArr) {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass().getName()).append('\n');
        for (String str : strArr) {
            sb.append("Input:\n");
            sb.append(str).append("\n");
            sb.append("tokenizeVerbatim:\n");
            sb.append(printTest(tokenizeVerbatimToStrings(str), str));
            sb.append("tokenize:\n");
            sb.append(printTest(tokenizeToStrings(str, false, false, false, true), str));
            // tokenizeWords(GLOSSARY) passes filterDigits=false, i.e. exactly this combination.
            sb.append("tokenize (stemsAllowed) (=tokenizeWords(GLOSSARY)):\n");
            sb.append(printTest(tokenizeToStrings(str, true, false, false, true), str));
            sb.append("tokenize (stemsAllowed stopWordsAllowed):\n");
            sb.append(printTest(tokenizeToStrings(str, true, true, false, true), str));
            sb.append("tokenize (stemsAllowed stopWordsAllowed filterDigits) (=tokenizeWords(MATCHING)):\n");
            sb.append(printTest(tokenizeToStrings(str, true, true, true, true), str));
            sb.append("tokenize (stemsAllowed filterDigits):\n");
            sb.append(printTest(tokenizeToStrings(str, true, false, true, true), str));
            sb.append("tokenize (filterDigits) (=tokenizeWords(NONE)):\n");
            sb.append(printTest(tokenizeToStrings(str, false, false, true, true), str));
            sb.append("----------------------------------\n");
        }
        return sb.toString();
    }

    /**
     * Formats one section of the debug report: the tokens joined with ", " on the first line and,
     * on the second, whether concatenating the tokens reproduces the input exactly.
     *
     * @param strArr tokens to print
     * @param str the text the tokens were produced from
     * @return the two-line section, each line terminated by a newline
     */
    protected String printTest(String[] strArr, String str) {
        final String listing = StringUtils.join(strArr, ", ");
        final boolean verbatim = StringUtils.join(strArr, "").equals(str);
        return listing + '\n' + "Is verbatim: " + verbatim + '\n';
    }
}
