package org.apache.solr.handler.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/apache/solr/handler/extraction/ExtractingDocumentLoader.class */
public class ExtractingDocumentLoader extends ContentStreamLoader {
    public static final String TEXT_FORMAT = "text";
    public static final String XML_FORMAT = "xml";
    final IndexSchema schema;
    final SolrParams params;
    final UpdateRequestProcessor processor;
    final boolean ignoreTikaException;
    protected AutoDetectParser autoDetectParser;
    private final AddUpdateCommand templateAdd = new AddUpdateCommand();
    protected TikaConfig config;
    protected SolrContentHandlerFactory factory;
    private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
    private static final XPathParser PARSER = new XPathParser("xhtml", "http://www.w3.org/1999/xhtml");

    public ExtractingDocumentLoader(SolrQueryRequest solrQueryRequest, UpdateRequestProcessor updateRequestProcessor, TikaConfig tikaConfig, SolrContentHandlerFactory solrContentHandlerFactory) {
        this.params = solrQueryRequest.getParams();
        this.schema = solrQueryRequest.getSchema();
        this.config = tikaConfig;
        this.processor = updateRequestProcessor;
        this.templateAdd.allowDups = false;
        this.templateAdd.overwriteCommitted = true;
        this.templateAdd.overwritePending = true;
        if (this.params.getBool("overwrite", true)) {
            this.templateAdd.allowDups = false;
            this.templateAdd.overwriteCommitted = true;
            this.templateAdd.overwritePending = true;
        } else {
            this.templateAdd.allowDups = true;
            this.templateAdd.overwriteCommitted = false;
            this.templateAdd.overwritePending = false;
        }
        this.templateAdd.commitWithin = this.params.getInt("commitWithin", -1);
        this.autoDetectParser = new AutoDetectParser(tikaConfig);
        this.factory = solrContentHandlerFactory;
        this.ignoreTikaException = this.params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
    }

    void doAdd(SolrContentHandler solrContentHandler, AddUpdateCommand addUpdateCommand) throws IOException {
        addUpdateCommand.solrDoc = solrContentHandler.newDocument();
        this.processor.processAdd(addUpdateCommand);
    }

    void addDoc(SolrContentHandler solrContentHandler) throws IOException {
        this.templateAdd.indexedId = null;
        doAdd(solrContentHandler, this.templateAdd);
    }

    public void load(SolrQueryRequest solrQueryRequest, SolrQueryResponse solrQueryResponse, ContentStream contentStream) throws IOException {
        this.errHeader = "ExtractingDocumentLoader: " + contentStream.getSourceInfo();
        String str = solrQueryRequest.getParams().get(ExtractingParams.STREAM_TYPE, (String) null);
        Parser parser = str != null ? (Parser) new DefaultParser(this.config.getMediaTypeRegistry()).getParsers().get(MediaType.parse(str.trim().toLowerCase())) : this.autoDetectParser;
        if (parser == null) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + str + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
        }
        Metadata metadata = new Metadata();
        String str2 = solrQueryRequest.getParams().get(ExtractingParams.RESOURCE_NAME, (String) null);
        if (str2 != null) {
            metadata.add("resourceName", str2);
        }
        if (contentStream.getContentType() != null) {
            metadata.add("Content-Type", contentStream.getContentType());
        }
        try {
            try {
                InputStream stream = contentStream.getStream();
                metadata.add(ExtractingMetadataConstants.STREAM_NAME, contentStream.getName());
                metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, contentStream.getSourceInfo());
                metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(contentStream.getSize()));
                metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, contentStream.getContentType());
                String charsetFromContentType = ContentStreamBase.getCharsetFromContentType(contentStream.getContentType());
                if (charsetFromContentType != null) {
                    metadata.add("Content-Encoding", charsetFromContentType);
                }
                String str3 = this.params.get(ExtractingParams.XPATH_EXPRESSION);
                boolean bool = this.params.getBool(ExtractingParams.EXTRACT_ONLY, false);
                BaseMarkupSerializer createSolrContentHandler = this.factory.createSolrContentHandler(metadata, this.params, this.schema);
                BaseMarkupSerializer baseMarkupSerializer = createSolrContentHandler;
                StringWriter stringWriter = null;
                BaseMarkupSerializer baseMarkupSerializer2 = null;
                if (bool) {
                    String str4 = this.params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT);
                    stringWriter = new StringWriter();
                    if (str4.equals(TEXT_FORMAT)) {
                        baseMarkupSerializer2 = new TextSerializer();
                        baseMarkupSerializer2.setOutputCharStream(stringWriter);
                        baseMarkupSerializer2.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                    } else {
                        baseMarkupSerializer2 = new XMLSerializer(stringWriter, new OutputFormat("XML", "UTF-8", true));
                    }
                    if (str3 != null) {
                        Matcher parse = PARSER.parse(str3);
                        baseMarkupSerializer2.startDocument();
                        baseMarkupSerializer = new MatchingContentHandler(baseMarkupSerializer2, parse);
                    } else {
                        baseMarkupSerializer = baseMarkupSerializer2;
                    }
                } else if (str3 != null) {
                    baseMarkupSerializer = new MatchingContentHandler(createSolrContentHandler, PARSER.parse(str3));
                }
                try {
                    parser.parse(stream, baseMarkupSerializer, metadata, new ParseContext());
                } catch (TikaException e) {
                    if (!this.ignoreTikaException) {
                        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                    }
                    log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata.toString());
                }
                if (bool) {
                    if (str3 != null) {
                        baseMarkupSerializer2.endDocument();
                    }
                    solrQueryResponse.add(contentStream.getName(), stringWriter.toString());
                    stringWriter.close();
                    String[] names = metadata.names();
                    NamedList namedList = new NamedList();
                    for (int i = 0; i < names.length; i++) {
                        namedList.add(names[i], metadata.getValues(names[i]));
                    }
                    solrQueryResponse.add(contentStream.getName() + "_metadata", namedList);
                } else {
                    addDoc(createSolrContentHandler);
                }
                IOUtils.closeQuietly(stream);
            } catch (SAXException e2) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e2);
            }
        } catch (Throwable th) {
            IOUtils.closeQuietly((InputStream) null);
            throw th;
        }
    }
}
