package org.apache.ctakes.core.ae;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/* loaded from: input_file:WEB-INF/lib/ctakes-core-3.2.2.jar:org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.class */
/**
 * UIMA annotator that creates token annotations for the text of each
 * {@link Segment} in the CAS.
 *
 * <p>For every segment whose id is not listed in the optional
 * {@value #PARAM_SEGMENTS_TO_SKIP} configuration parameter, the annotator:
 * <ol>
 *   <li>adds a {@link NewlineToken} for each line break (CR, LF or CR+LF) in the segment,</li>
 *   <li>runs {@link TokenizerPTB} over every {@link Sentence} that lies entirely inside the
 *       segment and adds the returned tokens to the CAS indexes,</li>
 *   <li>assigns a running token number to every {@link BaseToken} that begins inside the
 *       segment.</li>
 * </ol>
 *
 * <p>The running token number is kept in an instance field and is reset at the start of each
 * {@link #process(JCas)} call.
 */
public class TokenizerAnnotatorPTB extends JCasAnnotator_ImplBase {
    /** Name of the optional configuration parameter listing the segment ids to skip. */
    public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";

    @ConfigurationParameter(name = PARAM_SEGMENTS_TO_SKIP, mandatory = false, description = "Set of segments that can be skipped")
    private String[] skipSegmentsArray;

    /** Set view of {@link #skipSegmentsArray}, built in {@link #initialize(UimaContext)}. */
    private Set<String> skipSegmentsSet;

    private TokenizerPTB tokenizer;

    static final char CR = '\r';
    static final char LF = '\n';

    // Instance-level (not static) so the logger is named after the runtime class, including subclasses.
    private final Logger logger = Logger.getLogger(getClass().getName());

    /** Next token number to assign; reset to 0 at the start of every process() call. */
    private int tokenCount = 0;

    /**
     * Creates the tokenizer and converts the configured array of segment ids to skip into a set.
     *
     * @param uimaContext the UIMA context passed on to the superclass
     * @throws ResourceInitializationException if the superclass initialization fails
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        this.logger.info("Initializing " + getClass().getName());
        this.tokenizer = new TokenizerPTB();
        this.skipSegmentsSet = new HashSet<String>();
        if (this.skipSegmentsArray != null) {
            Collections.addAll(this.skipSegmentsSet, this.skipSegmentsArray);
        }
    }

    /**
     * Tokenizes every segment of the CAS whose id is not in the skip set.
     *
     * @param jCas the CAS to annotate
     * @throws AnalysisEngineProcessException if the tokenizer returns an object that is not a
     *         {@link BaseToken}
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        this.logger.info("process(JCas) in " + getClass().getName());
        this.tokenCount = 0;
        for (Segment segment : JCasUtil.select(jCas, Segment.class)) {
            if (!this.skipSegmentsSet.contains(segment.getId())) {
                annotateRange(jCas, segment.getBegin(), segment.getEnd());
            }
        }
    }

    /**
     * Annotates the document text in {@code [rangeBegin, rangeEnd)}: newline tokens first, then
     * the tokens of each fully contained sentence, and finally the token numbers.
     *
     * @param jCas the CAS to annotate
     * @param rangeBegin offset of the first character of the range (inclusive)
     * @param rangeEnd offset just past the last character of the range (exclusive)
     * @throws AnalysisEngineProcessException if the tokenizer returns an object that is not a
     *         {@link BaseToken}
     */
    protected void annotateRange(JCas jCas, int rangeBegin, int rangeEnd) throws AnalysisEngineProcessException {
        addNewlineTokens(jCas, rangeBegin, rangeEnd);
        tokenizeSentences(jCas, rangeBegin, rangeEnd);
        numberTokens(jCas, rangeBegin, rangeEnd);
    }

    /** Adds one NewlineToken per line break in the range; a CR immediately followed by LF yields a single token. */
    private static void addNewlineTokens(JCas jCas, int rangeBegin, int rangeEnd) {
        String documentText = jCas.getDocumentText();
        int pos = rangeBegin;
        while (pos < rangeEnd) {
            char current = documentText.charAt(pos);
            int breakLength = 0;
            if (current == CR) {
                // The LF only joins the CR when it is still inside the range.
                boolean followedByLf = pos + 1 < rangeEnd && documentText.charAt(pos + 1) == LF;
                breakLength = followedByLf ? 2 : 1;
            } else if (current == LF) {
                breakLength = 1;
            }
            if (breakLength > 0) {
                new NewlineToken(jCas, pos, pos + breakLength).addToIndexes();
                pos += breakLength;
            } else {
                pos++;
            }
        }
    }

    /** Tokenizes each sentence lying entirely inside the range and adds the resulting tokens to the indexes. */
    private void tokenizeSentences(JCas jCas, int rangeBegin, int rangeEnd) throws AnalysisEngineProcessException {
        for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
            if (sentence.getBegin() < rangeBegin || sentence.getEnd() > rangeEnd) {
                continue;
            }
            // NOTE(review): the meaning of the trailing 'true' flag is defined by TokenizerPTB - confirm there.
            List<?> tokens = this.tokenizer.tokenizeTextSegment(jCas, sentence.getCoveredText(), sentence.getBegin(), true);
            for (Object token : tokens) {
                if (token == null) {
                    // Best effort: report the unexpected null through the logger and keep going.
                    this.logger.error("bta==null tokenCount=" + this.tokenCount + " tokens.size()==" + tokens.size());
                } else if (token instanceof BaseToken) {
                    ((BaseToken) token).addToIndexes();
                } else {
                    throw new AnalysisEngineProcessException("Token returned cannot be cast as BaseToken", new Object[]{token});
                }
            }
        }
    }

    /** Assigns consecutive token numbers to every BaseToken that begins inside the range. */
    private void numberTokens(JCas jCas, int rangeBegin, int rangeEnd) {
        for (BaseToken baseToken : JCasUtil.select(jCas, BaseToken.class)) {
            if (baseToken.getBegin() >= rangeBegin && baseToken.getBegin() < rangeEnd) {
                baseToken.setTokenNumber(this.tokenCount);
                this.tokenCount++;
            }
        }
    }

    /**
     * Creates an analysis engine description for this annotator with no configuration parameters set.
     *
     * @return the engine description
     * @throws ResourceInitializationException if the description cannot be created
     */
    public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
        return AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class);
    }
}
