package org.apache.ctakes.core.nlp.tokenizer;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.batik.dom.svg.SVGPathSegConstants;

/* loaded from: input_file:WEB-INF/lib/ctakes-core-3.2.2.jar:org/apache/ctakes/core/nlp/tokenizer/Tokenizer.class */
/**
 * Rule-based tokenizer that splits text on whitespace and then refines the raw
 * tokens: leading/trailing/embedded punctuation and symbols become their own
 * tokens, a small set of English contractions is split off, numbers are
 * recognised, and word tokens are annotated with capitalization and
 * digit-position information. End-of-line characters are reported as separate
 * tokens.
 *
 * <p>An optional hyphenation frequency map keeps frequent hyphenated words
 * together instead of splitting them at the hyphen.
 */
public class Tokenizer {
    // Token type codes handed to Token.setType(byte). The names describe how this
    // class assigns each code.
    // NOTE(review): 0 is presumably the type a freshly constructed Token reports - confirm in Token.
    private static final byte TYPE_UNKNOWN = 0;
    private static final byte TYPE_WORD = 1;
    private static final byte TYPE_NUMBER = 2;
    private static final byte TYPE_PUNCT = 3;
    private static final byte TYPE_EOL = 4;
    private static final byte TYPE_CONTRACTION = 5;
    private static final byte TYPE_SYMBOL = 6;

    // Capitalization codes handed to Token.setCaps(byte).
    private static final byte CAPS_NONE = 1;
    private static final byte CAPS_MIXED = 2;
    private static final byte CAPS_FIRST_ONLY = 3;
    private static final byte CAPS_ALL = 4;

    // Digit-position codes handed to Token.setNumPosition(byte).
    private static final byte NUM_NONE = 0;
    private static final byte NUM_FIRST = 1;
    private static final byte NUM_MIDDLE = 2;
    private static final byte NUM_LAST = 3;

    private final OffsetComparator iv_offsetComp = new OffsetComparator();
    private final Map<String, Integer> iv_hyphMap;
    private final int iv_freqCutoff;

    /**
     * Creates a tokenizer without a hyphenation map; every embedded hyphen
     * splits its token.
     */
    public Tokenizer() {
        this(null, 0);
    }

    /**
     * Creates a tokenizer that keeps frequent hyphenated words intact.
     *
     * @param map lower-cased hyphenated text mapped to its frequency; may be {@code null}
     * @param i   frequency cutoff; a hyphenated text is kept whole only when its
     *            frequency is strictly greater than this value
     */
    public Tokenizer(Map<String, Integer> map, int i) {
        this.iv_hyphMap = map;
        this.iv_freqCutoff = i;
    }

    /**
     * Verifies that every key of the hyphen map has a non-null
     * {@link Integer} frequency.
     *
     * @param map the hyphen map to check
     * @throws Exception if a value is {@code null} or is not an {@code Integer}
     */
    public static void validateHyphenMap(Map<String, Integer> map) throws Exception {
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            // Read the value as Object: assigning it to an Integer first would apply
            // an implicit cast and make the type check below unreachable.
            Object frequency = entry.getValue();
            if (frequency == null) {
                throw new Exception("Hyphen map is missing frequency data for key=" + entry.getKey());
            }
            if (!(frequency instanceof Integer)) {
                throw new Exception("Hyphen map has non java.lang.Integer frequency data for key=" + entry.getKey());
            }
        }
    }

    /**
     * Tokenizes the text and sorts the resulting tokens with the offset comparator.
     *
     * @param str the text to tokenize
     * @return the sorted tokens
     * @throws Exception if tokenization fails
     */
    public List<Token> tokenizeAndSort(String str) throws Exception {
        List<Token> tokens = tokenize(str);
        Collections.sort(tokens, this.iv_offsetComp);
        return tokens;
    }

    /**
     * Tokenizes the text. The returned list is not sorted: end-of-line tokens
     * are appended after all other tokens.
     *
     * @param str the text to tokenize
     * @return the tokens, each with its type, attributes and text set
     * @throws Exception wrapping any failure raised during tokenization; the
     *                   original failure is available as the cause
     */
    public List<Token> tokenize(String str) throws Exception {
        try {
            List<Token> eolTokens = getEndOfLineTokens(str);
            List<Token> tokens = getRawTokens(str);
            applyPunctSymbolRules(tokens, str);
            for (Token token : tokens) {
                String text = str.substring(token.getStartOffset(), token.getEndOffset());
                if (token.getType() == TYPE_PUNCT) {
                    continue;
                }
                if (isNumber(text)) {
                    token.setType(TYPE_NUMBER);
                    token.setIsInteger(isInteger(text));
                }
                // Anything that has not been classified by now is treated as a word.
                if (token.getType() == TYPE_UNKNOWN) {
                    token.setType(TYPE_WORD);
                }
                if (token.getType() == TYPE_WORD) {
                    applyCapitalizationRules(token, text);
                    applyWordNumRules(token, text);
                }
            }
            tokens.addAll(eolTokens);
            for (Token token : tokens) {
                token.setText(str.substring(token.getStartOffset(), token.getEndOffset()));
            }
            return tokens;
        } catch (Exception e) {
            // Keep the historical message but chain the cause so the real failure is not lost.
            throw new Exception("Internal Error with Tokenizer.", e);
        }
    }

    /**
     * Refines the whitespace-delimited tokens in place: classifies
     * single-character tokens, strips punctuation/symbols from both ends of
     * longer tokens, splits contractions and "cannot", and splits tokens at
     * embedded punctuation/symbols.
     *
     * @param list the token list to refine; modified in place
     * @param str  the full text the token offsets refer to
     */
    private void applyPunctSymbolRules(List<Token> list, String str) {
        List<Token> newTokens = new ArrayList<Token>();
        List<Token> obsoleteTokens = new ArrayList<Token>();
        // Index-based on purpose: findPunctSymbolInsideToken appends to 'list' while
        // this loop runs, and the appended tokens are visited as well.
        for (int i = 0; i < list.size(); i++) {
            Token token = list.get(i);
            String text = str.substring(token.getStartOffset(), token.getEndOffset());
            if (text.length() == 1) {
                char only = text.charAt(0);
                if (!isAlphabetLetterOrDigit(only)) {
                    token.setType(isPunctuation(only) ? TYPE_PUNCT : TYPE_SYMBOL);
                }
                continue;
            }

            // Peel punctuation/symbols off the front, then off the back.
            token.setStartOffset(token.getStartOffset() + processStartPunctSymbol(newTokens, token, text));
            String frontTrimmed = str.substring(token.getStartOffset(), token.getEndOffset());
            token.setEndOffset(token.getEndOffset() - processEndPunctSymbol(newTokens, token, frontTrimmed));
            if (token.getStartOffset() == token.getEndOffset()) {
                obsoleteTokens.add(token);
            }

            String core = str.substring(token.getStartOffset(), token.getEndOffset());
            int aposIndex = core.indexOf('\'');
            if (aposIndex != -1) {
                Token contraction = null;
                String suffix = core.substring(aposIndex + 1);
                if (suffix.length() == 1) {
                    if (suffix.equalsIgnoreCase("d") || suffix.equalsIgnoreCase("m") || suffix.equalsIgnoreCase("s")) {
                        contraction = new Token(token.getStartOffset() + aposIndex, token.getEndOffset());
                    } else if (aposIndex > 0
                            // Plain "t" literal: the previous reference to an unrelated Batik SVG
                            // path constant carried the same value and only obscured the intent.
                            && suffix.equalsIgnoreCase("t")
                            && core.substring(aposIndex - 1, aposIndex).equalsIgnoreCase("n")) {
                        // "n't": the contraction token also takes the 'n' before the apostrophe.
                        contraction = new Token((token.getStartOffset() + aposIndex) - 1, token.getEndOffset());
                    }
                } else if (suffix.length() == 2
                        && (suffix.equalsIgnoreCase("re") || suffix.equalsIgnoreCase("ve") || suffix.equalsIgnoreCase("ll"))) {
                    contraction = new Token(token.getStartOffset() + aposIndex, token.getEndOffset());
                }
                if (contraction != null) {
                    contraction.setType(TYPE_CONTRACTION);
                    newTokens.add(contraction);
                    token.setEndOffset(contraction.getStartOffset());
                }
            } else if (core.equalsIgnoreCase("cannot")) {
                // "cannot" becomes "can" + "not".
                Token notToken = new Token(token.getStartOffset() + 3, token.getEndOffset());
                notToken.setType(TYPE_WORD);
                newTokens.add(notToken);
                token.setEndOffset(token.getStartOffset() + 3);
            }

            boolean replaced = findPunctSymbolInsideToken(
                    list, token, str.substring(token.getStartOffset(), token.getEndOffset()));
            if (replaced || token.getEndOffset() == token.getStartOffset()) {
                obsoleteTokens.add(token);
            }
        }
        list.addAll(newTokens);
        // One remove call per recorded entry, in recording order.
        for (Token obsolete : obsoleteTokens) {
            list.remove(obsolete);
        }
    }

    /**
     * Builds a one-character token at the given offset, typed as punctuation
     * or symbol depending on the character.
     */
    private Token newPunctOrSymbolToken(int offset, char c) {
        Token token = new Token(offset, offset + 1);
        token.setType(isPunctuation(c) ? TYPE_PUNCT : TYPE_SYMBOL);
        return token;
    }

    /**
     * Emits a token for every non-alphanumeric character at the start of the text.
     *
     * @param list  receives the created tokens
     * @param token the token the text belongs to; only its start offset is read
     * @param str   the token's text
     * @return the number of leading characters that were turned into tokens
     */
    private int processStartPunctSymbol(List<Token> list, Token token, String str) {
        int base = token.getStartOffset();
        int stripped = 0;
        while (stripped < str.length() && !isAlphabetLetterOrDigit(str.charAt(stripped))) {
            list.add(newPunctOrSymbolToken(base + stripped, str.charAt(stripped)));
            stripped++;
        }
        return stripped;
    }

    /**
     * Emits a token for every non-alphanumeric character at the end of the text,
     * walking backwards from the last character.
     *
     * @param list  receives the created tokens
     * @param token the token the text belongs to; only its start offset is read
     * @param str   the token's text
     * @return the number of trailing characters that were turned into tokens
     */
    private int processEndPunctSymbol(List<Token> list, Token token, String str) {
        int base = token.getStartOffset();
        int stripped = 0;
        for (int pos = str.length() - 1; pos >= 0 && !isAlphabetLetterOrDigit(str.charAt(pos)); pos--) {
            list.add(newPunctOrSymbolToken(base + pos, str.charAt(pos)));
            stripped++;
        }
        return stripped;
    }

    /**
     * Finds the first character at which the text should be split.
     * A comma or period splits only when the whole text is not a number;
     * colons and semicolons never split; any other non-alphanumeric
     * character always splits.
     *
     * @return the index of the split character, or -1 if there is none
     */
    private int getFirstInsidePunctSymbol(String str) {
        boolean numeric = isNumber(str);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == ',' || c == '.') {
                if (!numeric) {
                    return i;
                }
            } else if (c != ':' && c != ';' && !isAlphabetLetterOrDigit(c)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Recursively splits the token at embedded punctuation/symbols, appending
     * the pieces to the list.
     *
     * @param list  the token list that receives the pieces
     * @param token the token being examined
     * @param str   the token's text
     * @return {@code true} if the final remaining piece had to be added to the
     *         list, {@code false} if it was already contained in it
     */
    private boolean findPunctSymbolInsideToken(List<Token> list, Token token, String str) {
        int tokenStart = token.getStartOffset();
        int splitIndex = getFirstInsidePunctSymbol(str);
        if (splitIndex == -1) {
            return addIfAbsent(list, token);
        }
        char splitChar = str.charAt(splitIndex);
        if (splitChar == '-' && isFrequentHyphenatedText(str)) {
            return addIfAbsent(list, token);
        }
        Token symbolToken = newPunctOrSymbolToken(tokenStart + splitIndex, splitChar);
        list.add(symbolToken);
        if (tokenStart != symbolToken.getStartOffset()) {
            list.add(new Token(tokenStart, symbolToken.getStartOffset()));
        }
        Token remainder = new Token(symbolToken.getEndOffset(), token.getEndOffset());
        return findPunctSymbolInsideToken(list, remainder, str.substring(splitIndex + 1));
    }

    /**
     * Adds the token unless the list already contains it.
     *
     * @return {@code true} if the token was added
     */
    private boolean addIfAbsent(List<Token> list, Token token) {
        if (list.contains(token)) {
            return false;
        }
        list.add(token);
        return true;
    }

    /**
     * Tells whether the lower-cased text has a frequency above the cutoff in
     * the hyphen map. A missing map, a missing key and a {@code null}
     * frequency all count as "not frequent".
     */
    private boolean isFrequentHyphenatedText(String text) {
        if (this.iv_hyphMap == null) {
            return false;
        }
        Integer frequency = this.iv_hyphMap.get(text.toLowerCase());
        return frequency != null && frequency.intValue() > this.iv_freqCutoff;
    }

    /** Tells whether the character belongs to the fixed punctuation set of this tokenizer. */
    private boolean isPunctuation(char c) {
        switch (c) {
            case ';':
            case ':':
            case ',':
            case '.':
            case '(':
            case ')':
            case '[':
            case ']':
            case '{':
            case '}':
            case '<':
            case '>':
            case '\'':
            case '"':
            case '/':
            case '\\':
            case '-':
                return true;
            default:
                return false;
        }
    }

    private boolean isAlphabetLetterOrDigit(char c) {
        return isAlphabetLetter(c) || isDigit(c);
    }

    /**
     * Tells whether {@link Character#getNumericValue(char)} maps the character
     * into the range 10..35.
     */
    public boolean isAlphabetLetter(char c) {
        int value = Character.getNumericValue(c);
        return value >= 10 && value <= 35;
    }

    /**
     * Tells whether {@link Character#getNumericValue(char)} maps the character
     * into the range 0..9. Note that {@link #isNumber(String)} uses
     * {@link Character#isDigit(char)} instead.
     */
    private boolean isDigit(char c) {
        int value = Character.getNumericValue(c);
        return value >= 0 && value <= 9;
    }

    /**
     * Tells whether the text looks like a number: digits, at most one period,
     * and commas that are each preceded (reading right to left) by a multiple
     * of three digits since the period or the end of the text.
     *
     * <p>The check does not require any digit: the empty string, and strings
     * made only of commas and at most one period, also return {@code true}.
     *
     * @param str the text to examine
     * @return {@code true} if the text matches the rules above
     */
    public static boolean isNumber(String str) {
        boolean seenDecimalPoint = false;
        int digitsInGroup = 0;
        for (int pos = str.length() - 1; pos >= 0; pos--) {
            char c = str.charAt(pos);
            if (Character.isDigit(c)) {
                digitsInGroup++;
            } else if (c == '.' && !seenDecimalPoint) {
                seenDecimalPoint = true;
                digitsInGroup = 0;
            } else if (c != ',' || digitsInGroup % 3 != 0) {
                return false;
            }
        }
        return true;
    }

    /** A number text is an integer when it contains no period. */
    private boolean isInteger(String str) {
        return str.indexOf('.') == -1;
    }

    /**
     * Sets the capitalization code of the token: none, all, first-only, or
     * mixed. An empty text is reported as "none".
     */
    private void applyCapitalizationRules(Token token, String str) {
        int length = str.length();
        int upperCount = 0;
        for (int i = 0; i < length; i++) {
            if (Character.isUpperCase(str.charAt(i))) {
                upperCount++;
            }
        }
        if (upperCount == 0) {
            token.setCaps(CAPS_NONE);
        } else if (upperCount == length) {
            token.setCaps(CAPS_ALL);
        } else if (upperCount == 1 && Character.isUpperCase(str.charAt(0))) {
            token.setCaps(CAPS_FIRST_ONLY);
        } else {
            token.setCaps(CAPS_MIXED);
        }
    }

    /**
     * Sets the digit-position code of the token: no digit at all, digit in
     * first position, digit in last position, or digits only in the middle.
     * The first-position check takes precedence over the last-position check.
     */
    private void applyWordNumRules(Token token, String str) {
        int length = str.length();
        boolean hasDigit = false;
        for (int i = 0; i < length && !hasDigit; i++) {
            hasDigit = Character.isDigit(str.charAt(i));
        }
        if (!hasDigit) {
            token.setNumPosition(NUM_NONE);
        } else if (Character.isDigit(str.charAt(0))) {
            token.setNumPosition(NUM_FIRST);
        } else if (Character.isDigit(str.charAt(length - 1))) {
            token.setNumPosition(NUM_LAST);
        } else {
            token.setNumPosition(NUM_MIDDLE);
        }
    }

    /**
     * Creates one end-of-line token per line break. "\r\n" yields a single
     * two-character token; a lone "\r" or "\n" yields a one-character token.
     */
    private List<Token> getEndOfLineTokens(String str) {
        List<Token> eolTokens = new ArrayList<Token>();
        int length = str.length();
        int pos = 0;
        while (pos < length) {
            char c = str.charAt(pos);
            int width = 0;
            if (c == '\n') {
                width = 1;
            } else if (c == '\r') {
                width = (pos + 1 < length && str.charAt(pos + 1) == '\n') ? 2 : 1;
            }
            if (width == 0) {
                pos++;
                continue;
            }
            Token eol = new Token(pos, pos + width);
            eol.setType(TYPE_EOL);
            eolTokens.add(eol);
            pos += width;
        }
        return eolTokens;
    }

    /**
     * Splits the text on spaces, tabs, line feeds and carriage returns and
     * returns one untyped token per run of other characters.
     */
    private List<Token> getRawTokens(String str) {
        List<Token> tokens = new ArrayList<Token>();
        int tokenStart = -1; // -1 while between tokens
        for (int pos = 0; pos < str.length(); pos++) {
            char c = str.charAt(pos);
            boolean whitespace = c == ' ' || c == '\t' || c == '\n' || c == '\r';
            if (whitespace) {
                if (tokenStart >= 0) {
                    tokens.add(new Token(tokenStart, pos));
                    tokenStart = -1;
                }
            } else if (tokenStart < 0) {
                tokenStart = pos;
            }
        }
        if (tokenStart >= 0) {
            tokens.add(new Token(tokenStart, str.length()));
        }
        return tokens;
    }
}
