package org.codelibs.fess.crawler.transformer;

import java.io.BufferedInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.PostConstruct;
import javax.xml.transform.TransformerException;
import org.apache.xpath.objects.XObject;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.UnsafeStringBuilder;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/* loaded from: input_file:org/codelibs/fess/crawler/transformer/FessXpathTransformer.class */
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
    /** Logger shared by all instances of this transformer. */
    private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);

    /** Length in bytes of a UTF-8 byte order mark (EF BB BF). */
    private static final int UTF8_BOM_SIZE = 3;

    /** When true, the content nodes are pruned (see pruneNode) before their text is extracted. */
    public boolean prunedContent = true;

    /** Rewrite rules applied to child URLs: each key is a regular expression, each value its replacement. */
    public Map<String, String> convertUrlMap = new HashMap<>();

    /** Fess configuration; assigned by {@link #init()}. */
    protected FessConfig fessConfig;

    /**
     * Looks up the Fess configuration through {@link ComponentUtil} and keeps it in
     * {@code fessConfig} for later use by this transformer.
     */
    @PostConstruct
    public void init() {
        final FessConfig config = ComponentUtil.getFessConfig();
        this.fessConfig = config;
    }

    /**
     * Returns the Fess configuration held by this transformer.
     *
     * @return the value of the {@code fessConfig} field (null until {@code init()} has assigned it)
     */
    @Override
    public FessConfig getFessConfig() {
        return fessConfig;
    }

    /**
     * Returns the class-level logger of this transformer.
     *
     * @return the static logger created for {@code FessXpathTransformer}
     */
    @Override
    public Logger getLogger() {
        return FessXpathTransformer.logger;
    }

    /**
     * Parses the response body into a DOM, evaluates every rule in {@code fieldRuleMap} against
     * it, adds the Fess-specific fields and stores the serialized field map in the result.
     *
     * <p>Only failures of the parsing step are reported as "Could not parse"; exceptions raised
     * afterwards (for example a {@link ChildUrlsException} thrown by
     * {@code putAdditionalData}) propagate unchanged.</p>
     *
     * @param responseData response whose body, charset and URL are read
     * @param resultData   receives the serialized field map and the encoding name
     * @throws CrawlingAccessException if the body cannot be parsed or the field map cannot be serialized
     */
    protected void storeData(ResponseData responseData, ResultData resultData) {
        final DOMParser parser = getDomParser();
        // try-with-resources closes the body stream even when parsing fails.
        try (BufferedInputStream bodyStream = new BufferedInputStream(responseData.getResponseBody())) {
            final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
            bodyStream.mark(UTF8_BOM_SIZE);
            // Consume a leading UTF-8 BOM; anything else is pushed back for the parser.
            if (bodyStream.read(bomBytes) < UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
                bodyStream.reset();
            }
            final InputSource source = new InputSource(bodyStream);
            if (responseData.getCharSet() != null) {
                source.setEncoding(responseData.getCharSet());
            }
            parser.parse(source);
        } catch (Exception e) {
            throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
        }

        final Document document = parser.getDocument();
        final Map<String, Object> dataMap = new LinkedHashMap<>();
        for (final Map.Entry<String, String> rule : this.fieldRuleMap.entrySet()) {
            final String fieldName = rule.getKey();
            final String xpath = rule.getValue();
            try {
                final XObject value = getXPathAPI().eval(document, xpath);
                switch (value.getType()) {
                    case XObject.CLASS_BOOLEAN:
                        putResultDataBody(dataMap, fieldName, Boolean.toString(value.bool()));
                        break;
                    case XObject.CLASS_NUMBER:
                        putResultDataBody(dataMap, fieldName, Double.toString(value.num()));
                        break;
                    case XObject.CLASS_STRING:
                        putResultDataBody(dataMap, fieldName, value.str());
                        break;
                    default: {
                        // Node sets and every other result type: use the text of the first matching node.
                        final Node node = getXPathAPI().selectSingleNode(document, xpath);
                        putResultDataBody(dataMap, fieldName, node != null ? node.getTextContent() : null);
                        break;
                    }
                }
            } catch (TransformerException e) {
                // A broken rule must not abort the document; the remaining rules are still evaluated.
                logger.warn("Could not parse a value of " + fieldName + ":" + xpath, e);
            }
        }

        putAdditionalData(dataMap, responseData, document);

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
            resultData.setEncoding(this.charsetName);
        } catch (Exception e) {
            throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
        }
    }

    /**
     * Adds the Fess index fields derived from the response, the crawling configuration and the
     * parsed document to the given map.
     *
     * <p>Statement order matters: the document id is generated from the map contents at that
     * point, and the parent id is generated while the URL field temporarily holds the parent
     * URL.</p>
     *
     * @param map          field map to populate
     * @param responseData crawled response
     * @param document     DOM parsed from the response body
     * @throws ChildUrlsException if a canonical XPath is configured and the document's canonical
     *                            URL differs from the response URL
     */
    protected void putAdditionalData(Map<String, Object> map, ResponseData responseData, Document document) {
        // A differing canonical URL stops processing here; it is passed on through the exception.
        if (StringUtil.isNotBlank(this.fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) {
            final String canonicalUrl = getCanonicalUrl(responseData, document);
            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
                final HashSet<RequestData> childUrlSet = new HashSet<>();
                childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                throw new ChildUrlsException(childUrlSet, getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
            }
        }

        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
        final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
        final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().get(responseData.getSessionId());
        final Object documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
        final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
        final String responseUrl = responseData.getUrl();
        final Object indexingTarget = crawlingConfig.getIndexingTarget(responseUrl);
        final String indexedUrl = pathMappingHelper.replaceUrl(sessionId, responseUrl);
        final String mimeType = responseData.getMimeType();
        final Map<String, String> fieldConfigs = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.FIELD);
        final Map<String, String> xpathConfigs = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.XPATH);
        final UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue();
        // The encoding recorded on the URL queue entry wins over the response charset.
        final String urlEncoding =
                (urlQueue == null || urlQueue.getEncoding() == null) ? responseData.getCharSet() : urlQueue.getEncoding();

        // config id
        final Object configId = crawlingConfig.getConfigId();
        if (configId != null) {
            putResultDataBody(map, fessConfig.getIndexFieldConfigId(), configId);
        }
        // expires
        if (documentExpires != null) {
            putResultDataBody(map, fessConfig.getIndexFieldExpires(), documentExpires);
        }
        // lang
        final Object lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigs), true));
        if (lang != null) {
            putResultDataBody(map, fessConfig.getIndexFieldLang(), lang);
        }
        // content
        final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigs), this.prunedContent);
        putResultDataBody(map, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, map));

        // cache
        final boolean cacheRequested = Constants.TRUE.equalsIgnoreCase(fieldConfigs.get(fessConfig.getIndexFieldCache()))
                || fessConfig.isCrawlerDocumentCacheEnabled();
        if (cacheRequested && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
            if (responseData.getContentLength() > 0
                    && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
                String cacheCharset = responseData.getCharSet();
                if (cacheCharset == null) {
                    cacheCharset = "UTF-8";
                }
                // try-with-resources closes the body stream even when reading or decoding fails.
                try (BufferedInputStream cacheStream = new BufferedInputStream(responseData.getResponseBody())) {
                    putResultDataBody(map, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(cacheStream), cacheCharset));
                    putResultDataBody(map, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
                } catch (Exception e) {
                    // The cache is optional: log and continue without it.
                    logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
                }
            } else {
                logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(),
                        fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
            }
        }

        // digest: an explicit digest node wins, otherwise it is derived from the content
        final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigs), false);
        if (StringUtil.isNotBlank(digest)) {
            putResultDataBody(map, fessConfig.getIndexFieldDigest(), digest);
        } else {
            putResultDataBody(map, fessConfig.getIndexFieldDigest(),
                    documentHelper.getDigest(responseData, body, map, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger().intValue()));
        }
        // segment
        putResultDataBody(map, fessConfig.getIndexFieldSegment(), sessionId);
        // host
        putResultDataBody(map, fessConfig.getIndexFieldHost(), getHost(indexedUrl));
        // site
        putResultDataBody(map, fessConfig.getIndexFieldSite(), getSite(indexedUrl, urlEncoding));
        // filename
        final String fileName = getFileName(indexedUrl, urlEncoding);
        if (StringUtil.isNotBlank(fileName)) {
            putResultDataBody(map, fessConfig.getIndexFieldFilename(), fileName);
        }
        // url
        putResultDataBody(map, fessConfig.getIndexFieldUrl(), indexedUrl);
        // created
        final Object now = systemHelper.getCurrentTime();
        putResultDataBody(map, fessConfig.getIndexFieldCreated(), now);
        // anchor
        putResultDataBody(map, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
        // mimetype
        putResultDataBody(map, fessConfig.getIndexFieldMimetype(), mimeType);
        if (fileTypeHelper != null) {
            // filetype
            putResultDataBody(map, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
        }
        // content length
        putResultDataBody(map, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
        // last modified / timestamp: fall back to the current time when the response has none
        final Object lastModified = responseData.getLastModified();
        if (lastModified != null) {
            putResultDataBody(map, fessConfig.getIndexFieldLastModified(), lastModified);
            putResultDataBody(map, fessConfig.getIndexFieldTimestamp(), lastModified);
        } else {
            putResultDataBody(map, fessConfig.getIndexFieldTimestamp(), now);
        }
        // indexing target
        putResultDataBody(map, Constants.INDEXING_TARGET, indexingTarget);
        // boost
        putResultDataBody(map, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
        // labels: configured values plus the ones matched by URL
        final HashSet<String> labels = new HashSet<>();
        for (final String label : crawlingConfig.getLabelTypeValues()) {
            labels.add(label);
        }
        labels.addAll(ComponentUtil.getLabelTypeHelper().getMatchedLabelValueSet(indexedUrl));
        putResultDataBody(map, fessConfig.getIndexFieldLabel(), labels);
        // roles
        final List<String> roles = new ArrayList<>();
        StreamUtil.stream(crawlingConfig.getPermissions()).of(stream -> {
            stream.forEach(roles::add);
        });
        putResultDataBody(map, fessConfig.getIndexFieldRole(), roles);
        // id
        putResultDataBody(map, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(map));
        // parent id: computed with the URL field temporarily set to the parent URL, then restored
        final String parentUrl = responseData.getParentUrl();
        if (StringUtil.isNotBlank(parentUrl)) {
            putResultDataBody(map, fessConfig.getIndexFieldUrl(), pathMappingHelper.replaceUrl(sessionId, parentUrl));
            putResultDataBody(map, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(map));
            putResultDataBody(map, fessConfig.getIndexFieldUrl(), indexedUrl);
        }

        // per-config XPath fields ("default.*" keys are handled above) and fixed values
        final Map<String, String> scriptConfigs = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.SCRIPT);
        xpathConfigs.entrySet().stream().filter(xpathEntry -> !xpathEntry.getKey().startsWith("default.")).forEach(xpathEntry -> {
            final String key = xpathEntry.getKey();
            putResultDataWithTemplate(map, key, getSingleNodeValue(document, xpathEntry.getValue(), true), scriptConfigs.get(key));
        });
        crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.VALUE).entrySet().stream().forEach(valueEntry -> {
            final String key = valueEntry.getKey();
            putResultDataWithTemplate(map, key, valueEntry.getValue(), scriptConfigs.get(key));
        });
    }

    /**
     * Chooses the XPath used to read the document language.
     *
     * @param fessConfig configuration supplying the fallback XPath
     * @param map        per-config XPath parameters; the {@code default.lang} entry takes precedence
     * @return the {@code default.lang} entry when it is not blank, otherwise the configured XPath
     */
    protected String getLangXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get("default.lang");
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlLangXpath();
    }

    /**
     * Chooses the XPath used to read the document content.
     *
     * @param fessConfig configuration supplying the fallback XPath
     * @param map        per-config XPath parameters; the {@code default.content} entry takes precedence
     * @return the {@code default.content} entry when it is not blank, otherwise the configured XPath
     */
    protected String getContentXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get("default.content");
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlContentXpath();
    }

    /**
     * Chooses the XPath used to read the document digest.
     *
     * @param fessConfig configuration supplying the fallback XPath
     * @param map        per-config XPath parameters; the {@code default.digest} entry takes precedence
     * @return the {@code default.digest} entry when it is not blank, otherwise the configured XPath
     */
    protected String getDigestXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get("default.digest");
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlDigestXpath();
    }

    /**
     * Reads the canonical URL declared by the document.
     *
     * @param responseData response whose URL is the base for a canonical value starting with "/"
     * @param document     parsed document
     * @return null when the canonical XPath yields no text; a value starting with "/" resolved
     *         against the response URL; otherwise the value as found
     */
    protected String getCanonicalUrl(ResponseData responseData, Document document) {
        final String canonicalXpath = this.fessConfig.getCrawlerDocumentHtmlCannonicalXpath();
        final String canonical = getSingleNodeValue(document, canonicalXpath, false);
        if (StringUtil.isBlank(canonical)) {
            return null;
        }
        if (canonical.startsWith("/")) {
            return normalizeCanonicalUrl(responseData.getUrl(), canonical);
        }
        return canonical;
    }

    /**
     * Resolves a canonical reference against a base URL.
     *
     * @param str  base URL
     * @param str2 canonical value to resolve relative to the base
     * @return the resolved URL as a string, or null (after logging a warning) when either value
     *         is rejected as malformed
     */
    protected String normalizeCanonicalUrl(String str, String str2) {
        try {
            final URL base = new URL(str);
            final URL resolved = new URL(base, str2);
            return resolved.toString();
        } catch (MalformedURLException e) {
            logger.warn("Invalid canonical url: " + str + " : " + str2, e);
            return null;
        }
    }

    /**
     * Replaces every complete {@code <!-- ... -->} comment in the text with a single space.
     *
     * <p>Comments are handled left to right in one pass. An opening marker without a later
     * closing marker ends the scan, and the rest of the text is kept unchanged.</p>
     *
     * @param str text to clean, may be null
     * @return the text without complete comments; the input itself when nothing was removed;
     *         {@code Constants.DEFAULT_IGNORE_FAILURE_TYPE} when the input is null
     */
    protected String removeCommentTag(String str) {
        if (str == null) {
            // NOTE(review): the constant name looks like a decompiler substitution; presumably
            // an empty string is intended here - confirm against Constants.
            return Constants.DEFAULT_IGNORE_FAILURE_TYPE;
        }
        final int length = str.length();
        StringBuilder cleaned = null;
        int copiedUpTo = 0;
        int open = str.indexOf("<!--");
        while (open >= 0) {
            // The closing marker is searched from the opening marker itself, so "<!-->" counts
            // as a complete comment.
            final int close = str.indexOf("-->", open);
            if (close < 0) {
                break;
            }
            if (cleaned == null) {
                cleaned = new StringBuilder(length);
            }
            cleaned.append(str, copiedUpTo, open).append(' ');
            copiedUpTo = close + 3;
            open = str.indexOf("<!--", copiedUpTo);
        }
        if (cleaned == null) {
            return str;
        }
        return cleaned.append(str, copiedUpTo, length).toString();
    }

    /**
     * Collects the text of every node selected by the XPath, separated by single spaces.
     *
     * <p>If evaluation fails, a warning including the exception is logged and whatever text was
     * collected up to that point is returned.</p>
     *
     * @param document document to query
     * @param str      XPath expression
     * @param z        when true, each node is deep-cloned and pruned (see pruneNode) before its
     *                 text is read, so the document itself is not modified
     * @return the trimmed text, or null when no node was read
     */
    protected String getSingleNodeValue(Document document, String str, boolean z) {
        UnsafeStringBuilder buf = null;
        try {
            final NodeList nodes = getXPathAPI().selectNodeList(document, str);
            for (int i = 0; i < nodes.getLength(); i++) {
                if (buf == null) {
                    // NOTE(review): the constant is used as the initial capacity; its name looks
                    // like a decompiler substitution - confirm.
                    buf = new UnsafeStringBuilder(Constants.DEFAULT_INTERVAL_TIME_FOR_FS);
                } else {
                    buf.append(' ');
                }
                final Node node = nodes.item(i);
                if (z) {
                    buf.append(pruneNode(node.cloneNode(true)).getTextContent());
                } else {
                    buf.append(node.getTextContent());
                }
            }
        } catch (Exception e) {
            logger.warn("Could not parse a value of " + str, e);
        }
        if (buf == null) {
            return null;
        }
        return buf.toUnsafeString().trim();
    }

    /**
     * Removes, recursively, every descendant whose node name is a pruned tag.
     *
     * <p>The children are first sorted into "remove" and "keep" lists, because the child list
     * should not be modified while it is being walked; removal and recursion happen afterwards.</p>
     *
     * @param node node to prune in place
     * @return the same node instance
     */
    protected Node pruneNode(Node node) {
        final NodeList children = node.getChildNodes();
        final List<Node> keep = new ArrayList<>();
        final List<Node> remove = new ArrayList<>();
        for (int index = 0; index < children.getLength(); index++) {
            final Node child = children.item(index);
            final List<Node> target = isPrunedTag(child.getNodeName()) ? remove : keep;
            target.add(child);
        }
        for (final Node child : remove) {
            node.removeChild(child);
        }
        for (final Node child : keep) {
            pruneNode(child);
        }
        return node;
    }

    /**
     * Tells whether a node name is one of the configured pruned tags, ignoring case.
     *
     * @param str node name to test
     * @return true when the name matches an entry of {@code getCrawlerDocumentHtmlPrunedTags()}
     */
    protected boolean isPrunedTag(String str) {
        final String[] prunedTags = getCrawlerDocumentHtmlPrunedTags();
        for (int index = 0; index < prunedTags.length; index++) {
            if (prunedTags[index].equalsIgnoreCase(str)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the text of every node selected by the XPath, one node per line.
     *
     * <p>If evaluation fails, a warning including the exception is logged and whatever text was
     * collected up to that point is returned.</p>
     *
     * @param document document to query
     * @param str      XPath expression
     * @return the trimmed, newline-separated text; an empty string when nothing was selected
     */
    protected String getMultipleNodeValue(Document document, String str) {
        final UnsafeStringBuilder buf = new UnsafeStringBuilder(100);
        try {
            final NodeList nodes = getXPathAPI().selectNodeList(document, str);
            for (int i = 0; i < nodes.getLength(); i++) {
                buf.append(nodes.item(i).getTextContent());
                buf.append("\n");
            }
        } catch (Exception e) {
            logger.warn("Could not parse a value of " + str, e);
        }
        return buf.toUnsafeString().trim();
    }

    /**
     * Passes the URL through the duplicate-host helper obtained from {@link ComponentUtil}.
     *
     * <p>This is best effort: if the helper cannot be obtained or the conversion fails, the
     * URL is returned unchanged and the failure is only reported at debug level.</p>
     *
     * @param str URL to convert
     * @return the converted URL, or the given URL when conversion fails
     */
    protected String replaceDuplicateHost(String str) {
        try {
            return ComponentUtil.getDuplicateHostHelper().convert(str);
        } catch (Exception e) {
            // Deliberately not rethrown: an unconverted URL is still usable.
            if (logger.isDebugEnabled()) {
                logger.debug("Could not convert duplicate host: " + str, e);
            }
            return str;
        }
    }

    /**
     * Builds the list of URLs referenced by the document according to {@code childUrlRuleMap}.
     *
     * <p>Relative references are resolved against the document's base href when present,
     * otherwise against the response URL. If anything fails, a warning is logged and the URLs
     * gathered so far are returned without the child-URL conversion.</p>
     *
     * @param document     parsed document
     * @param responseData response supplying the fallback base URL and the charset
     * @return the URLs found, never null
     */
    protected List<String> getAnchorList(Document document, ResponseData responseData) {
        List<RequestData> requests = new ArrayList<>();
        final String baseHref = getBaseHref(document);
        try {
            final String base = baseHref != null ? baseHref : responseData.getUrl();
            final URL baseUrl = new URL(base);
            for (final Map.Entry<String, String> rule : this.childUrlRuleMap.entrySet()) {
                for (final String childUrl : getUrlFromTagAttribute(baseUrl, document, rule.getKey(), rule.getValue(),
                        responseData.getCharSet())) {
                    requests.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
                }
            }
            requests = convertChildUrlList(requests);
        } catch (Exception e) {
            logger.warn("Could not parse anchor tags.", e);
        }

        final List<String> anchors = new ArrayList<>(requests.size());
        for (final RequestData request : requests) {
            anchors.add(request.getUrl());
        }
        return anchors;
    }

    /**
     * Rewrites the URL of every request in place: each {@code convertUrlMap} rule is applied
     * with {@link String#replaceAll(String, String)} and the result is then passed through
     * {@code replaceDuplicateHost}.
     *
     * @param list requests to rewrite, may be null
     * @return the same list instance (null when null was given)
     */
    protected List<RequestData> convertChildUrlList(List<RequestData> list) {
        if (list == null) {
            return list;
        }
        for (final RequestData request : list) {
            String converted = request.getUrl();
            for (final Map.Entry<String, String> rule : this.convertUrlMap.entrySet()) {
                converted = converted.replaceAll(rule.getKey(), rule.getValue());
            }
            request.setUrl(replaceDuplicateHost(converted));
        }
        return list;
    }

    /**
     * Restores the object stored in an access result.
     *
     * @param accessResultData holder of the serialized bytes
     * @return the deserialized object, or a new empty {@link HashMap} when no bytes are stored
     * @throws CrawlerSystemException if the bytes cannot be deserialized
     */
    public Object getData(AccessResultData<?> accessResultData) {
        final byte[] bytes = accessResultData.getData();
        if (bytes != null) {
            try {
                return SerializeUtil.fromBinaryToObject(bytes);
            } catch (Exception e) {
                throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
            }
        }
        return new HashMap<>();
    }

    /**
     * Delegates the path check to the superclass implementation without altering the result.
     *
     * @param str path to check
     * @return whatever {@code super.isValidPath(str)} returns
     */
    protected boolean isValidPath(String str) {
        final boolean valid = super.isValidPath(str);
        return valid;
    }

    /**
     * Resolves an attribute value against the base URL, normalizes and encodes it, and appends
     * the result to the list when it is not blank.
     *
     * <p>If the value cannot be resolved as a URL, it is still accepted as-is when it contains
     * a ':' at index 1 to 9; otherwise a warning is logged and nothing is added.</p>
     *
     * @param list receives the child URL
     * @param url  base URL for resolving relative values
     * @param str  raw attribute value
     * @param str2 encoding passed to {@code encodeUrl}
     */
    protected void addChildUrlFromTagAttribute(List<String> list, URL url, String str, String str2) {
        final String candidate = str.trim();
        String childUrl;
        try {
            final URL resolved = new URL(url, candidate);
            childUrl = encodeUrl(normalizeUrl(resolved.toExternalForm()), str2);
        } catch (MalformedURLException e) {
            final int colon = candidate.indexOf(':');
            childUrl = (colon > 0 && colon < 10) ? encodeUrl(normalizeUrl(candidate), str2) : null;
        }

        if (childUrl == null) {
            logger.warn("Ignored child URL: " + str + " in " + url);
            return;
        }

        if (logger.isDebugEnabled()) {
            logger.debug(str + " -> " + childUrl);
        }
        if (StringUtil.isBlank(childUrl)) {
            if (logger.isDebugEnabled()) {
                logger.debug("Skip Child: " + childUrl);
            }
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Add Child: " + childUrl);
        }
        list.add(childUrl);
    }

    /**
     * Tells whether the first three bytes are the UTF-8 byte order mark EF BB BF.
     *
     * @param bArr bytes to inspect; the first three elements are read
     * @return true when the array starts with the UTF-8 BOM
     */
    private boolean isUtf8BomBytes(byte[] bArr) {
        // Mask to compare as unsigned values; Java bytes are signed.
        return (bArr[0] & 0xFF) == 0xEF && (bArr[1] & 0xFF) == 0xBB && (bArr[2] & 0xFF) == 0xBF;
    }

    /**
     * Returns the tag names to prune, as provided by the Fess configuration.
     *
     * @return the array returned by {@code fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()}
     */
    protected String[] getCrawlerDocumentHtmlPrunedTags() {
        final String[] prunedTags = this.fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
        return prunedTags;
    }
}
