package com.sixfive.can.nl.lexical.ko_kr;

import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.sixfive.can.nl.Utterance;
import com.sixfive.can.nl.lexical.Token;
import com.sixfive.can.nl.lexical.Tokenizer;
import com.sixfive.can.nl.lexical.UnigramModel;
import com.sixfive.can.nl.lexical.ko_kr.dict.MorphemeDictionary;
import com.sixfive.can.nl.lexical.ko_kr.dict.WordDictionary;
import com.sixfive.util.Pair;
import com.sixfive.util.StandardLocale;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/* loaded from: classes2.dex */
/**
 * Tokenizer for the {@code StandardLocale.KOREA} locale, exposed as a lazily created singleton.
 *
 * <p>{@code tokenize} runs the input through, in order: whitespace/character-class chunking
 * ({@code space}), rule-based splitting ({@code tokenizeWithRuleModel}), unigram-model
 * segmentation ({@code tokenizeWithStatisticalModel}), dictionary-sequence splitting
 * ({@code extractNouns}) and finally {@code Chunk.textNormalize}.
 */
public class KoreaTokenizer extends Tokenizer {
    // Presumably the average word length (in chars) assumed by the segmentation heuristic — TODO confirm.
    private static final int AVG_WORD_SIZE = 3;
    // Same value that version() reports.
    private static final int CURRENT_VERSION = 8;
    private static final String DATA_FILE = "Tokenizer.data";
    private static final String TAG = "KoreaTokenizer";
    // Shared instance; created on first getInstance() call and replaceable through setInstance().
    private static KoreaTokenizer instance = null;
    private static final long serialVersionUID = -591782641415920645L;
    // Single non-recursive EndsWithMatcher over the AbbBePostPosition dictionary.
    private final List<DictionaryMatcher> abbBePostPositionMatcher;
    // The Set<List<DictionaryType>> fields below each hold dictionary-type sequences; the
    // constructor fills them and initSeqMatchers() combines base sets with auxiliary sets.
    Set<List<MorphemeDictionary.DictionaryType>> aux;
    Set<List<MorphemeDictionary.DictionaryType>> aux_ep;
    Set<List<MorphemeDictionary.DictionaryType>> bases;
    Set<List<MorphemeDictionary.DictionaryType>> bases_ep;
    // Filled by addToExactlySeqMatchers(); applied in extractNouns().
    private final List<DictionaryMatcher> exactlySeqMatchers;
    // Unigram model handed to tokenize() by doParse().
    private final UnigramModel model;
    // Source of every WordDictionary used by the matchers.
    private final MorphemeDictionary morphemeDictionary;
    // Filled by addToPartialSeqMatchers(); applied (with re-examination) in extractNouns().
    private final List<DictionaryMatcher> partialSeqMatchers;
    // Single recursive EndsWithMatcher over the Postposition dictionary.
    private final List<DictionaryMatcher> postPositionMatcher;
    // Regex matchers applied during tokenizeWithRuleModel().
    private final List<RegexMatcher> priorRegexMatchers;
    // Single StartsWithMatcher over the Noun dictionary.
    private final List<DictionaryMatcher> startsWithMatchers;
    // Single non-recursive EndsWithMatcher over the Suffix dictionary.
    private final List<DictionaryMatcher> suffixMatcher;
    // Supplies the index at which doParse() starts tokenizing.
    private final WakeupWordMatcher wakeupWordMatcher;
    Set<List<MorphemeDictionary.DictionaryType>> xvBases;
    Set<List<MorphemeDictionary.DictionaryType>> xvBases_ep;
    private static final CharMatcher WHITESPACE = CharMatcher.whitespace();
    // Matches a chunk that is exactly one of the listed punctuation characters; doParse() drops
    // chunks that become empty after removing the match.
    // NOTE(review): inside the character class, ".-/" is a RANGE from '.' to '/', so only '.' and
    // '/' are covered and a lone '-' is not matched — confirm whether a literal hyphen was intended.
    private static final Pattern SPECIAL_LETTERS = Pattern.compile("^[!?,.-/:&(){}\\[\\]]$");
    // Immutable copy of the phrase-to-number table produced by NumericPhraseBuilder.
    public static final Map<String, Integer> KR_NUMERIC_PHRASES = ImmutableMap.copyOf(NumericPhraseBuilder.a());

    /**
     * Matcher that cuts a chunk at suffixes found in a single {@link WordDictionary}.
     *
     * <p>In recursive mode the suffix search is repeated on the remaining head until nothing more
     * is found; otherwise at most one cut is made.
     */
    public class EndsWithMatcher extends DictionaryMatcher {
        private static final long serialVersionUID = 6121608111214594750L;
        private final boolean recursive;

        /**
         * @param wordDictionary the only dictionary consulted by this matcher
         * @param z11 {@code true} to keep stripping suffixes from the remaining head
         */
        public EndsWithMatcher(WordDictionary wordDictionary, boolean z11) {
            super(Collections.singletonList(wordDictionary), KoreaTokenizer.this.morphemeDictionary);
            this.recursive = z11;
        }

        /**
         * Splits {@code chunk} at every cut position reported by
         * {@code DictionaryMatcher.findEndsWith}.
         *
         * @param chunk the chunk to examine
         * @return the resulting pieces in text order, or an empty list when no suffix matched
         */
        @Override // com.sixfive.can.nl.lexical.ko_kr.DictionaryMatcher, com.sixfive.can.nl.lexical.ko_kr.Matcher
        public List<Chunk> match(Chunk chunk) {
            if (this.dictionaries.isEmpty()) {
                return Collections.emptyList();
            }
            WordDictionary dictionary = this.dictionaries.get(0);

            // Collect cut positions, right-most first; -1 signals "no suffix found".
            List<Integer> cutPoints = new ArrayList<>();
            String remaining = chunk.getText();
            int cut = DictionaryMatcher.findEndsWith(remaining, dictionary);
            while (cut != -1) {
                cutPoints.add(cut);
                if (!this.recursive) {
                    break;
                }
                remaining = remaining.substring(0, cut);
                cut = DictionaryMatcher.findEndsWith(remaining, dictionary);
            }
            if (cutPoints.isEmpty()) {
                return Collections.emptyList();
            }

            // Peel pieces off the tail; each one is inserted at the front so the result stays in
            // text order.
            List<Chunk> pieces = new ArrayList<>();
            Chunk head = chunk;
            for (int cutPoint : cutPoints) {
                if (head.immutable()) {
                    break;
                }
                List<Chunk> parts = head.split(cutPoint, false, true);
                if (parts.size() == 1) {
                    // Nothing was separated: keep the single part and stop.
                    pieces.add(0, parts.get(0));
                    head = null;
                    break;
                }
                pieces.add(0, parts.get(1));
                head = parts.get(0);
            }
            if (head != null) {
                pieces.add(0, head);
            }
            return pieces;
        }
    }

    /**
     * Matcher that succeeds only when the chunk text is covered, from its first character, by
     * one prefix match per dictionary in order, optionally followed by a post-position tail.
     */
    public class ExactlySeqMatcher extends DictionaryMatcher {
        private static final long serialVersionUID = 2008571333760588403L;

        /**
         * @param list the dictionaries to match, in sequence order
         */
        public ExactlySeqMatcher(List<WordDictionary> list) {
            super(list, KoreaTokenizer.this.morphemeDictionary);
        }

        /**
         * Attempts to consume the whole chunk text with the dictionary sequence.
         *
         * @param chunk the chunk to examine
         * @return the pieces produced by {@code DictionaryMatcher.splitChunk}, or an empty list
         *     when any dictionary fails to match or the leftover text has no post-position match
         */
        @Override // com.sixfive.can.nl.lexical.ko_kr.DictionaryMatcher, com.sixfive.can.nl.lexical.ko_kr.Matcher
        public List<Chunk> match(Chunk chunk) {
            List<Integer> offsets = new ArrayList<>();
            String rest = chunk.getText();
            for (WordDictionary dictionary : this.dictionaries) {
                int matched = DictionaryMatcher.findStartsWith(rest, dictionary);
                if (matched == -1) {
                    // Every dictionary in the sequence must match.
                    return Collections.emptyList();
                }
                offsets.add(matched);
                rest = rest.substring(matched);
            }
            if (offsets.isEmpty()) {
                return Collections.emptyList();
            }

            if (!rest.isEmpty()) {
                // Leftover text is only acceptable when findPostPosition accounts for it.
                List<Integer> tail = findPostPosition(rest);
                if (tail.isEmpty()) {
                    return Collections.emptyList();
                }
                offsets.addAll(tail);
            }

            ArrayList<Chunk> pieces = new ArrayList<>();
            Chunk current = chunk;
            for (int offset : offsets) {
                if (current == null) {
                    break;
                }
                current = DictionaryMatcher.splitChunk(current, offset, true, false, pieces);
            }
            return pieces;
        }
    }

    /* loaded from: classes2.dex */
    public static final class NumericPhraseBuilder {
        private NumericPhraseBuilder() {
        }

        public static /* bridge */ /* synthetic */ Map a() {
            return build();
        }

        /* JADX WARN: Multi-variable type inference failed */
        private static Map<String, Integer> build() {
            ImmutableSet<Pair> of2 = ImmutableSet.of(Pair.of("", 0), Pair.of("한", 1), Pair.of("두", 2), Pair.of("세", 3), Pair.of("네", 4), Pair.of("하나", 1), Pair.of("둘", 2), Pair.of("셋", 3), Pair.of("넷", 4), Pair.of("다섯", 5), Pair.of("여섯", 6), Pair.of("일곱", 7), Pair.of("여덟", 8), Pair.of("아홉", 9));
            ImmutableSet<Pair> of3 = ImmutableSet.of(Pair.of("", 0), Pair.of("열", 10), Pair.of("스물", 20), Pair.of("서른", 30), Pair.of("마흔", 40), Pair.of("쉰", 50), Pair.of("예순", 60), Pair.of("일흔", 70), Pair.of("여든", 80), Pair.of("아흔", 90));
            ImmutableSet<Pair> of4 = ImmutableSet.of(Pair.of("한두", 2), Pair.of("두세", 3), Pair.of("세네", 4), Pair.of("여러", 3), Pair.of("스무", 20));
            HashMap hashMap = new HashMap();
            for (Pair pair : of4) {
                hashMap.put((String) pair.first, (Integer) pair.second);
            }
            for (Pair pair2 : of3) {
                for (Pair pair3 : of2) {
                    String str = ((String) pair2.first) + ((String) pair3.first);
                    if (!str.isEmpty()) {
                        hashMap.put(str, Integer.valueOf(((Integer) pair3.second).intValue() + ((Integer) pair2.second).intValue()));
                    }
                }
            }
            return hashMap;
        }
    }

    /**
     * Matcher that looks for the dictionary sequence starting at any position inside the chunk
     * text, trying start positions from left to right and using the first one that matches.
     */
    public class PartialSeqMatcher extends DictionaryMatcher {
        private static final long serialVersionUID = 5447552977803567470L;

        /**
         * @param list the dictionaries to match, in sequence order
         */
        public PartialSeqMatcher(List<WordDictionary> list) {
            super(list, KoreaTokenizer.this.morphemeDictionary);
        }

        /**
         * Finds the left-most position where every dictionary matches in order and splits the
         * chunk there.
         *
         * @param chunk the chunk to examine
         * @return the pieces produced by {@code DictionaryMatcher.splitChunk}, or an empty list
         *     when the sequence matches nowhere
         */
        @Override // com.sixfive.can.nl.lexical.ko_kr.DictionaryMatcher, com.sixfive.can.nl.lexical.ko_kr.Matcher
        public List<Chunk> match(Chunk chunk) {
            String text = chunk.getText();
            for (int start = 0; start < text.length(); start++) {
                List<Integer> offsets = new ArrayList<>();
                String rest = text.substring(start);
                for (WordDictionary dictionary : this.dictionaries) {
                    int matched = DictionaryMatcher.findStartsWith(rest, dictionary);
                    if (matched == -1) {
                        break;
                    }
                    offsets.add(matched);
                    rest = rest.substring(matched);
                }
                // A hit requires one match per dictionary (and at least one dictionary).
                if (offsets.isEmpty() || offsets.size() != this.dictionaries.size()) {
                    continue;
                }

                if (!rest.isEmpty()) {
                    // Unlike the exact matcher, an empty post-position result is tolerated here.
                    List<Integer> tail = findPostPosition(rest);
                    if (!tail.isEmpty()) {
                        offsets.addAll(tail);
                    }
                }

                ArrayList<Chunk> pieces = new ArrayList<>();
                Chunk current = chunk;
                if (start > 0) {
                    // Separate the unmatched text in front of the sequence first.
                    current = DictionaryMatcher.splitChunk(current, start, false, false, pieces);
                }
                for (int offset : offsets) {
                    if (current == null) {
                        break;
                    }
                    current = DictionaryMatcher.splitChunk(current, offset, true, false, pieces);
                }
                return pieces;
            }
            return Collections.emptyList();
        }
    }

    /** Matcher that cuts a chunk after a prefix found in a single {@link WordDictionary}. */
    public class StartsWithMatcher extends DictionaryMatcher {
        private static final long serialVersionUID = 5142612297470859091L;

        /**
         * @param wordDictionary the only dictionary consulted by this matcher
         */
        public StartsWithMatcher(WordDictionary wordDictionary) {
            super(Collections.singletonList(wordDictionary), KoreaTokenizer.this.morphemeDictionary);
        }

        /**
         * Splits {@code chunk} at the position reported by {@code DictionaryMatcher.findStartsWith}.
         *
         * @param chunk the chunk to examine
         * @return the result of {@code chunk.split(position, true, false)}, or an empty list when
         *     there is no dictionary or no prefix match
         */
        @Override // com.sixfive.can.nl.lexical.ko_kr.DictionaryMatcher, com.sixfive.can.nl.lexical.ko_kr.Matcher
        public List<Chunk> match(Chunk chunk) {
            if (this.dictionaries.isEmpty()) {
                return Collections.emptyList();
            }
            int prefixEnd = DictionaryMatcher.findStartsWith(chunk.getText(), this.dictionaries.get(0));
            if (prefixEnd == -1) {
                return Collections.emptyList();
            }
            return chunk.split(prefixEnd, true, false);
        }
    }

    /**
     * Creates the tokenizer: loads the morpheme dictionary, builds the single-dictionary matchers,
     * defines the dictionary-type sequences, derives the sequence matchers from them and finally
     * loads the unigram model.
     *
     * @throws RuntimeException if creating the {@link UnigramModel} fails with an
     *     {@link IOException}
     */
    private KoreaTokenizer() {
        super(StandardLocale.KOREA);
        MorphemeDictionary lexicon = new MorphemeDictionary();
        this.morphemeDictionary = lexicon;
        this.wakeupWordMatcher = new WakeupWordMatcher();
        this.partialSeqMatchers = new ArrayList<>();
        this.exactlySeqMatchers = new ArrayList<>();
        this.startsWithMatchers = ImmutableList.of(new StartsWithMatcher(lexicon.getDict(MorphemeDictionary.DictionaryType.Noun)));
        this.postPositionMatcher = ImmutableList.of(new EndsWithMatcher(lexicon.getDict(MorphemeDictionary.DictionaryType.Postposition), true));
        this.abbBePostPositionMatcher = ImmutableList.of(new EndsWithMatcher(lexicon.getDict(MorphemeDictionary.DictionaryType.AbbBePostPosition), false));
        this.suffixMatcher = ImmutableList.of(new EndsWithMatcher(lexicon.getDict(MorphemeDictionary.DictionaryType.Suffix), false));

        // Dictionary types that appear in more than one sequence below.
        MorphemeDictionary.DictionaryType vavvStem = MorphemeDictionary.DictionaryType.VavvStem;
        MorphemeDictionary.DictionaryType preEnding = MorphemeDictionary.DictionaryType.PreEnding;
        MorphemeDictionary.DictionaryType ending = MorphemeDictionary.DictionaryType.Ending;
        MorphemeDictionary.DictionaryType preEndingEnding = MorphemeDictionary.DictionaryType.PreEndingEnding;
        MorphemeDictionary.DictionaryType conjEnding = MorphemeDictionary.DictionaryType.ConjEnding;
        MorphemeDictionary.DictionaryType xv = MorphemeDictionary.DictionaryType.Xv;
        MorphemeDictionary.DictionaryType auxVavv = MorphemeDictionary.DictionaryType.AuxVavv;

        // VavvStem-based sequences. Element order is kept because it determines matcher order.
        List<MorphemeDictionary.DictionaryType> stemPreEndingEnding = Arrays.asList(vavvStem, preEnding, ending);
        List<MorphemeDictionary.DictionaryType> stemCombinedEnding = Arrays.asList(vavvStem, preEndingEnding);
        this.bases_ep = ImmutableSet.of(stemPreEndingEnding, stemCombinedEnding, Arrays.asList(vavvStem, preEndingEnding, conjEnding));
        this.bases = ImmutableSet.of(Arrays.asList(MorphemeDictionary.DictionaryType.VavvStemConj, conjEnding), Arrays.asList(vavvStem, ending), Collections.singletonList(MorphemeDictionary.DictionaryType.VavvStemConjAbb));

        // Xv-based sequences.
        this.xvBases_ep = ImmutableSet.of(Arrays.asList(xv, preEnding, ending), Arrays.asList(xv, preEndingEnding), Arrays.asList(xv, preEndingEnding, conjEnding));
        this.xvBases = ImmutableSet.of(Arrays.asList(MorphemeDictionary.DictionaryType.XvConj, conjEnding), Arrays.asList(xv, ending), Collections.singletonList(MorphemeDictionary.DictionaryType.XvConjAbb));

        // AuxVavv-based sequences; the plain set also contains the empty sequence.
        this.aux_ep = ImmutableSet.of(Arrays.asList(auxVavv, preEnding, ending), Arrays.asList(auxVavv, preEndingEnding), Arrays.asList(auxVavv, preEndingEnding, conjEnding));
        this.aux = ImmutableSet.of(Arrays.asList(MorphemeDictionary.DictionaryType.AuxVavvConj, conjEnding), Arrays.asList(auxVavv, ending), Collections.singletonList(MorphemeDictionary.DictionaryType.AuxVavvConjAbb), Collections.emptyList());

        this.priorRegexMatchers = RegexMatcher.generateRegexMatchers();
        // Needs the sequence sets and the two matcher lists above to be in place.
        initSeqMatchers();
        try {
            this.model = new UnigramModel(KoreaTokenizer.class);
        } catch (IOException e11) {
            throw new RuntimeException("error initializing KoreaTokenizer", e11);
        }
    }

    /**
     * Appends one {@link ExactlySeqMatcher} to {@code exactlySeqMatchers} for every pairing of a
     * sequence from {@code set} with a sequence from {@code set2}; the second sequence is
     * concatenated after the first.
     *
     * @param set leading dictionary-type sequences
     * @param set2 trailing dictionary-type sequences
     */
    private void addToExactlySeqMatchers(Set<List<MorphemeDictionary.DictionaryType>> set, Set<List<MorphemeDictionary.DictionaryType>> set2) {
        for (List<MorphemeDictionary.DictionaryType> leading : set) {
            for (List<MorphemeDictionary.DictionaryType> trailing : set2) {
                MorphemeDictionary lookup = this.morphemeDictionary;
                Objects.requireNonNull(lookup);
                Stream typeSequence = Stream.concat(leading.stream(), trailing.stream());
                // NOTE(review): mapper class `a` is defined elsewhere; presumably it resolves each
                // dictionary type through the morpheme dictionary — confirm.
                List dictionaries = (List) typeSequence.map(new a(lookup, 0)).collect(Collectors.toList());
                this.exactlySeqMatchers.add(new ExactlySeqMatcher(dictionaries));
            }
        }
    }

    /**
     * Appends one {@link PartialSeqMatcher} to {@code partialSeqMatchers} for every pairing of a
     * sequence from {@code set} with a sequence from {@code set2}; the second sequence is
     * concatenated after the first.
     *
     * @param set leading dictionary-type sequences
     * @param set2 trailing dictionary-type sequences
     */
    private void addToPartialSeqMatchers(Set<List<MorphemeDictionary.DictionaryType>> set, Set<List<MorphemeDictionary.DictionaryType>> set2) {
        for (List<MorphemeDictionary.DictionaryType> leading : set) {
            for (List<MorphemeDictionary.DictionaryType> trailing : set2) {
                MorphemeDictionary lookup = this.morphemeDictionary;
                Objects.requireNonNull(lookup);
                Stream typeSequence = Stream.concat(leading.stream(), trailing.stream());
                // NOTE(review): mapper class `a` is defined elsewhere; presumably it resolves each
                // dictionary type through the morpheme dictionary — confirm.
                List dictionaries = (List) typeSequence.map(new a(lookup, 1)).collect(Collectors.toList());
                this.partialSeqMatchers.add(new PartialSeqMatcher(dictionaries));
            }
        }
    }

    /**
     * Runs the dictionary-based matcher groups over {@code list} in a fixed order, modifying the
     * list in place.
     *
     * @param list the chunks to refine
     * @return the same list instance that was passed in
     */
    private List<Chunk> extractNouns(List<Chunk> list) {
        // Only the partial-sequence pass re-examines the pieces it produces.
        split(list, this.partialSeqMatchers, true);
        split(list, this.exactlySeqMatchers, false);
        split(list, this.postPositionMatcher, false);
        split(list, this.abbBePostPositionMatcher, false);
        split(list, this.suffixMatcher, false);
        return list;
    }

    /**
     * Locks every still-mutable chunk whose text is contained in {@code wordDictionary}.
     *
     * @param list the chunks to inspect
     * @param wordDictionary the dictionary of texts that should be locked
     */
    private void filterAsOneToken(List<Chunk> list, WordDictionary wordDictionary) {
        for (Chunk candidate : list) {
            if (candidate.immutable()) {
                continue;
            }
            if (wordDictionary.contains(candidate.getText())) {
                candidate.lock();
            }
        }
    }

    /**
     * Returns the shared tokenizer, creating it on first use. No synchronization is performed.
     *
     * @return the shared {@code KoreaTokenizer}
     */
    public static KoreaTokenizer getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new KoreaTokenizer();
        return instance;
    }

    /**
     * Populates {@code exactlySeqMatchers} from the plain base sets and {@code partialSeqMatchers}
     * from the Xv base sets, each combined with the auxiliary sets in the same three pairings.
     */
    private void initSeqMatchers() {
        final Set<List<MorphemeDictionary.DictionaryType>> plainAux = this.aux;
        final Set<List<MorphemeDictionary.DictionaryType>> preEndingAux = this.aux_ep;

        // Exact matchers.
        addToExactlySeqMatchers(this.bases_ep, plainAux);
        addToExactlySeqMatchers(this.bases, preEndingAux);
        addToExactlySeqMatchers(this.bases, plainAux);

        // Partial matchers.
        addToPartialSeqMatchers(this.xvBases_ep, plainAux);
        addToPartialSeqMatchers(this.xvBases, preEndingAux);
        addToPartialSeqMatchers(this.xvBases, plainAux);
    }

    /**
     * Tells whether two characters belong to different Unicode general categories for chunking
     * purposes.
     *
     * <p>Upper- and lower-case letters count as the same category. The answer is always
     * {@code false} when either character is "other punctuation" or when {@code c12} is a space
     * separator.
     *
     * @param c11 the first character
     * @param c12 the second character
     * @return {@code true} if the categories differ and none of the exemptions applies
     */
    private static boolean isDifferent(char c11, char c12) {
        int firstCategory = Character.getType(c11);
        int secondCategory = Character.getType(c12);
        // Fold upper case into lower case so that case changes are not category changes.
        if (firstCategory == Character.UPPERCASE_LETTER) {
            firstCategory = Character.LOWERCASE_LETTER;
        }
        if (secondCategory == Character.UPPERCASE_LETTER) {
            secondCategory = Character.LOWERCASE_LETTER;
        }
        if (firstCategory == Character.OTHER_PUNCTUATION || secondCategory == Character.OTHER_PUNCTUATION) {
            return false;
        }
        if (secondCategory == Character.SPACE_SEPARATOR) {
            return false;
        }
        return firstCategory != secondCategory;
    }

    /**
     * Replaces the shared tokenizer returned by {@code getInstance()}.
     *
     * @param koreaTokenizer the new shared instance; passing {@code null} makes the next
     *     {@code getInstance()} call create a fresh tokenizer
     */
    public static void setInstance(KoreaTokenizer koreaTokenizer) {
        KoreaTokenizer.instance = koreaTokenizer;
    }

    /**
     * Cuts the raw input into chunks at whitespace, at "other punctuation" characters and at
     * character-category changes.
     *
     * <p>Chunk offsets are positions in {@code str}; the end offset is exclusive. Whitespace is
     * dropped, each punctuation character becomes a chunk of its own, and a {@code '.'} or
     * {@code ','} that sits directly between two digits is left inside the surrounding chunk.
     *
     * @param str the text to chunk
     * @return the chunks in text order; an immutable empty list for empty input
     */
    private List<Chunk> space(String str) {
        if (str.isEmpty()) {
            return Collections.emptyList();
        }
        List<Chunk> chunks = new ArrayList<>();
        int length = str.length();
        // Start offset of the chunk currently being accumulated.
        int chunkStart = 0;
        for (int i = 0; i < length; i++) {
            char current = str.charAt(i);
            if ((current == '.' || current == ',') && i > 0 && i < length - 1
                    && Character.isDigit(str.charAt(i - 1)) && Character.isDigit(str.charAt(i + 1))) {
                // Separator between two digits: keep it as part of the number.
                continue;
            }
            if (Character.getType(current) == Character.OTHER_PUNCTUATION) {
                // Flush the pending text, then emit the punctuation character on its own.
                if (chunkStart < i) {
                    chunks.add(new Chunk(str.substring(chunkStart, i), chunkStart, i));
                }
                int next = i + 1;
                chunks.add(new Chunk(str.substring(i, next), i, next));
                chunkStart = next;
            } else if (WHITESPACE.matches(current)) {
                // Flush the pending text and skip the whitespace character itself.
                if (chunkStart < i) {
                    chunks.add(new Chunk(str.substring(chunkStart, i), chunkStart, i));
                }
                chunkStart = i + 1;
            } else if (i > 0 && isDifferent(current, str.charAt(i - 1))) {
                // Category change: the current character starts a new chunk.
                if (chunkStart < i) {
                    chunks.add(new Chunk(str.substring(chunkStart, i), chunkStart, i));
                }
                chunkStart = i;
            }
        }
        if (chunkStart != length) {
            chunks.add(new Chunk(str.substring(chunkStart), chunkStart, length));
        }
        return chunks;
    }

    /**
     * Re-segments long, still-mutable chunks with the unigram model.
     *
     * <p>Immutable chunks and chunks of at most {@code AVG_WORD_SIZE} characters are copied
     * unchanged. Every other chunk is replaced by the model's segments, with offsets assigned
     * consecutively from the chunk's start — unless the keep-whole guard applies, in which case
     * the chunk is kept as-is and its segments are discarded.
     *
     * @param list the chunks to normalize
     * @param unigramModel the model used to segment chunk text
     * @param str the complete input text the chunks were cut from
     * @return a new mutable list with the normalized chunks
     */
    private List<Chunk> spaceNormalize(List<Chunk> list, UnigramModel unigramModel, String str) {
        // NOTE(review): this guard is only true when the whole input is whitespace — confirm that
        // matchesAllOf (rather than matchesNoneOf) is what was intended.
        boolean allWhitespace = WHITESPACE.matchesAllOf(str);
        List<Chunk> normalized = new ArrayList<>();
        for (Chunk chunk : list) {
            String text = chunk.getText();
            int length = text.length();
            if (chunk.immutable() || length <= AVG_WORD_SIZE) {
                normalized.add(chunk);
                continue;
            }
            List<String> segments = unigramModel.segment(text);
            // ceil(length / AVG_WORD_SIZE): the most segments expected for text of this length.
            int expectedMax = (length + AVG_WORD_SIZE - 1) / AVG_WORD_SIZE;
            if (allWhitespace && segments.size() > expectedMax) {
                // Over-segmented: keep the original chunk INSTEAD of its segments, so the output
                // never contains both the chunk and overlapping pieces of it.
                normalized.add(chunk);
                continue;
            }
            int start = chunk.getStart();
            for (String segment : segments) {
                int end = start + segment.length();
                normalized.add(new Chunk(segment, start, end));
                start = end;
            }
        }
        return normalized;
    }

    /**
     * Applies the matchers to {@code list} without re-examining the pieces a match produces.
     *
     * @param list the chunks to split in place
     * @param collection the matchers to try on each mutable chunk
     */
    private void split(List<Chunk> list, Collection<? extends Matcher> collection) {
        final boolean reexamine = false;
        split(list, collection, reexamine);
    }

    /**
     * Replaces, in place, each mutable chunk of {@code list} with the pieces returned by the
     * first matcher that produces a non-empty result for it.
     *
     * <p>Matchers are tried in iteration order and the search for a chunk stops at the first
     * match; a chunk that no matcher splits is left untouched. Immutable chunks are skipped.
     *
     * @param list the chunks to split; must support {@code remove} and {@code addAll}
     * @param collection the matchers to try on each mutable chunk
     * @param z11 when {@code true}, the first piece of a replacement is examined again by all
     *     matchers; when {@code false}, processing continues with the piece after it
     */
    private void split(List<Chunk> list, Collection<? extends Matcher> collection, boolean z11) {
        int index = 0;
        while (index < list.size()) {
            Chunk chunk = list.get(index);
            if (!chunk.immutable()) {
                for (Matcher matcher : collection) {
                    List<Chunk> pieces = matcher.match(chunk);
                    if (pieces.isEmpty()) {
                        continue;
                    }
                    list.remove(index);
                    list.addAll(index, pieces);
                    if (z11) {
                        // Cancels the increment below so the same position is visited again.
                        index--;
                    }
                    // The chunk no longer exists in the list; trying further matchers on it
                    // would replace the wrong element.
                    break;
                }
            }
            index++;
        }
    }

    /**
     * Runs the full chunking pipeline on {@code str}.
     *
     * @param str the text to tokenize
     * @param unigramModel the model used by the statistical stage
     * @return the chunks returned by {@code Chunk.textNormalize}
     */
    private List<Chunk> tokenize(String str, UnigramModel unigramModel) {
        List<Chunk> spaced = space(str);
        List<Chunk> ruleSplit = tokenizeWithRuleModel(spaced);
        List<Chunk> segmented = tokenizeWithStatisticalModel(ruleSplit, unigramModel, str);
        List<Chunk> refined = extractNouns(segmented);
        return Chunk.textNormalize(refined);
    }

    /**
     * Rule-based stage: locks chunks found in the Modifier dictionary, then applies the
     * starts-with matchers and the regex matchers to {@code list} in place.
     *
     * @param list the chunks to process
     * @return the same list instance that was passed in
     */
    private List<Chunk> tokenizeWithRuleModel(List<Chunk> list) {
        WordDictionary modifiers = this.morphemeDictionary.getDict(MorphemeDictionary.DictionaryType.Modifier);
        filterAsOneToken(list, modifiers);
        split(list, this.startsWithMatchers, false);
        // Pieces produced by a regex match are examined again.
        split(list, this.priorRegexMatchers, true);
        return list;
    }

    /**
     * Statistical stage; currently a direct delegation to {@code spaceNormalize}.
     *
     * @param list the chunks to process
     * @param unigramModel the model used to segment chunk text
     * @param str the complete input text
     * @return the list produced by {@code spaceNormalize}
     */
    private List<Chunk> tokenizeWithStatisticalModel(List<Chunk> list, UnigramModel unigramModel, String str) {
        List<Chunk> normalized = spaceNormalize(list, unigramModel, str);
        return normalized;
    }

    /**
     * Tokenizes {@code str} starting at the index reported by the wake-up word matcher and
     * packages the result as an {@link Utterance}.
     *
     * <p>Chunks that are empty after removing a {@code SPECIAL_LETTERS} match are dropped. For
     * every kept chunk two offsets into {@code str} are recorded: its start and its last
     * character (inclusive).
     *
     * @param str the text to parse
     * @return the utterance built from the original text, the locale, the offsets and the tokens
     */
    @Override // com.sixfive.can.nl.lexical.Tokenizer
    public Utterance doParse(String str) {
        final int offset = this.wakeupWordMatcher.detectWakeupWordIndex(str);
        final List<Chunk> chunks = tokenize(str.substring(offset), this.model);
        ArrayList<Token> tokens = new ArrayList<>();
        ArrayList<Integer> spans = new ArrayList<>();
        for (Chunk chunk : chunks) {
            String cleaned = SPECIAL_LETTERS.matcher(chunk.getText()).replaceAll("");
            if (cleaned.isEmpty()) {
                continue;
            }
            tokens.add(new Token(cleaned));
            // Chunk offsets are relative to the substring; shift them back onto str.
            spans.add(chunk.getStart() + offset);
            spans.add((chunk.getEnd() + offset) - 1);
        }
        return new Utterance(str, this.locale, spans, tokens);
    }

    /**
     * Reports the tokenizer version.
     *
     * @return {@code CURRENT_VERSION}
     */
    @Override // com.sixfive.can.nl.lexical.Tokenizer
    public int version() {
        return CURRENT_VERSION;
    }
}
