package org.codelibs.fess.crawler.transformer;

import java.io.BufferedInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
import javax.annotation.PostConstruct;
import javax.xml.xpath.XPathEvaluationResult;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathNodes;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.ValueHolder;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.codelibs.nekohtml.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/* loaded from: input_file:org/codelibs/fess/crawler/transformer/FessXpathTransformer.class */
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
    /** Logger shared by all instances of this transformer. */
    private static final Logger logger = LogManager.getLogger(FessXpathTransformer.class);
    /** Name of the response metadata entry inspected by processXRobotsTag (compared case-insensitively). */
    private static final String X_ROBOTS_TAG = "X-Robots-Tag";
    /** XPath selecting the content attribute of a "thumbnail" META tag. NOTE(review): no use is visible in this part of the file. */
    private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
    /** XPath selecting the content attribute of an "og:image" META tag. NOTE(review): no use is visible in this part of the file. */
    private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
    /** XPath selecting the content attribute of the "robots" META tag; evaluated by processMetaRobots. */
    private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";
    /** Robots directive treated as both noindex and nofollow. */
    private static final String ROBOTS_TAG_NONE = "none";
    /** Robots directive that prevents the page itself from being indexed. */
    private static final String ROBOTS_TAG_NOINDEX = "noindex";
    /** Robots directive that marks the response as no-follow. */
    private static final String ROBOTS_TAG_NOFOLLOW = "nofollow";
    /** Number of bytes in a UTF-8 byte order mark. */
    private static final int UTF8_BOM_SIZE = 3;
    /** Fess configuration; assigned in init(). */
    protected FessConfig fessConfig;
    /** When true, the main content node is pruned (see pruneNode) before its text is extracted. */
    public boolean prunedContent = true;
    /** Regex pattern to replacement pairs applied, in insertion order, to every child URL in convertChildUrlList. */
    protected Map<String, String> convertUrlMap = new LinkedHashMap();
    /** When true, text following a "googleoff:" comment is dropped until a "googleon:" comment (see processGoogleOffOn). */
    protected boolean useGoogleOffOn = true;
    /** Field name to flag; a true flag makes storeData prune the selected node before reading its text. */
    protected Map<String, Boolean> fieldPrunedRuleMap = new HashMap();
    /** Pruned-tag definitions cached per crawling config id by pruneNode. */
    protected Map<String, PrunedTag[]> prunedTagsCache = new HashMap();

    /*
     * Compiler-generated lookup table recovered by the decompiler: it maps the ordinal of an
     * XPathEvaluationResult.XPathResultType constant to the case label used by the switch in storeData
     * (BOOLEAN -> 1, NUMBER -> 2, STRING -> 3). Constants that are not listed keep the array default of 0
     * and therefore fall through to the switch's default branch.
     */
    /* renamed from: org.codelibs.fess.crawler.transformer.FessXpathTransformer$1, reason: invalid class name */
    /* loaded from: input_file:org/codelibs/fess/crawler/transformer/FessXpathTransformer$1.class */
    static /* synthetic */ class AnonymousClass1 {
        static final /* synthetic */ int[] $SwitchMap$javax$xml$xpath$XPathEvaluationResult$XPathResultType = new int[XPathEvaluationResult.XPathResultType.values().length];

        static {
            // Each assignment is guarded separately so that a constant missing at runtime
            // (NoSuchFieldError) only leaves its own slot unmapped.
            try {
                $SwitchMap$javax$xml$xpath$XPathEvaluationResult$XPathResultType[XPathEvaluationResult.XPathResultType.BOOLEAN.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
            }
            try {
                $SwitchMap$javax$xml$xpath$XPathEvaluationResult$XPathResultType[XPathEvaluationResult.XPathResultType.NUMBER.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
            }
            try {
                $SwitchMap$javax$xml$xpath$XPathEvaluationResult$XPathResultType[XPathEvaluationResult.XPathResultType.STRING.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
            }
        }
    }

    /**
     * Post-construction hook: logs the initialization at debug level and caches the
     * Fess configuration obtained from {@link ComponentUtil}.
     */
    @PostConstruct
    public void init() {
        final boolean debugEnabled = logger.isDebugEnabled();
        if (debugEnabled) {
            final String componentName = getClass().getSimpleName();
            logger.debug("Initialize {}", componentName);
        }
        fessConfig = ComponentUtil.getFessConfig();
    }

    /**
     * Returns the Fess configuration cached by {@link #init()}.
     *
     * @return the cached configuration, or {@code null} if {@code init()} has not run yet
     */
    @Override // org.codelibs.fess.crawler.transformer.FessTransformer
    public FessConfig getFessConfig() {
        final FessConfig cachedConfig = fessConfig;
        return cachedConfig;
    }

    /**
     * Returns the class-level logger of this transformer.
     *
     * @return the shared static logger
     */
    @Override // org.codelibs.fess.crawler.transformer.FessTransformer
    public Logger getLogger() {
        return FessXpathTransformer.logger;
    }

    /**
     * Parses the response body as HTML, evaluates the configured field rules and stores the
     * serialized field map in the result.
     * <p>
     * The first {@code UTF8_BOM_SIZE} bytes are consumed only when {@code isUtf8BomBytes} accepts
     * them; otherwise the stream is rewound so the parser sees the complete body.
     * <p>
     * Only stream handling and parsing are wrapped as "Could not parse". Exceptions raised by the
     * later steps (for example the {@code ChildUrlsException} thrown by the robots and canonical
     * handling) propagate unchanged so that callers can still recognise them.
     *
     * @param responseData crawled response providing the body, charset and URL
     * @param resultData receives the serialized field map and its encoding
     * @throws CrawlingAccessException if the body cannot be parsed or the field map cannot be serialized
     */
    protected void storeData(ResponseData responseData, ResultData resultData) {
        final DOMParser parser = getDomParser();
        // try-with-resources guarantees that the body stream is closed even when parsing fails.
        try (BufferedInputStream body = new BufferedInputStream(responseData.getResponseBody())) {
            final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
            body.mark(UTF8_BOM_SIZE);
            if (body.read(bomBytes) < UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
                // No BOM: rewind so the bytes just read are handed to the parser.
                body.reset();
            }
            final InputSource source = new InputSource(body);
            if (responseData.getCharSet() != null) {
                source.setEncoding(responseData.getCharSet());
            }
            parser.parse(source);
        } catch (final Exception e) {
            throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
        }

        final Document document = parser.getDocument();
        processMetaRobots(responseData, resultData, document);
        processXRobotsTag(responseData, resultData);

        final Map<String, Object> dataMap = new LinkedHashMap<>();
        for (final Map.Entry<String, String> rule : this.fieldRuleMap.entrySet()) {
            final String fieldName = rule.getKey();
            final String path = rule.getValue();
            try {
                final XPathEvaluationResult<?> result = getXPathAPI().eval(document, path);
                switch (result.type()) {
                    case BOOLEAN:
                        putResultDataBody(dataMap, fieldName, ((Boolean) result.value()).toString());
                        break;
                    case NUMBER:
                        putResultDataBody(dataMap, fieldName, ((Number) result.value()).toString());
                        break;
                    case STRING:
                        putResultDataBody(dataMap, fieldName, (String) result.value());
                        break;
                    default:
                        // Node-like results: take the first matching node, optionally pruned.
                        final Boolean pruned = this.fieldPrunedRuleMap.get(fieldName);
                        Node valueNode = getXPathAPI().selectSingleNode(document, path);
                        if (valueNode != null && pruned != null && pruned.booleanValue()) {
                            valueNode = pruneNode(valueNode, getCrawlingConfig(responseData));
                        }
                        putResultDataBody(dataMap, fieldName, valueNode != null ? valueNode.getTextContent() : null);
                        break;
                }
            } catch (final XPathExpressionException e) {
                // A broken rule must not abort the whole document; skip that field only.
                logger.warn("Could not parse a value of {}:{}", fieldName, path, e);
            }
        }

        putAdditionalData(dataMap, responseData, document);
        normalizeData(responseData, dataMap);

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
            resultData.setEncoding(this.charsetName);
        } catch (final Exception e) {
            throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
        }
    }

    /**
     * Replaces the title entry of the field map, when present, with the value produced by
     * {@code DocumentHelper.getTitle}.
     *
     * @param responseData response the field map was built from
     * @param map field map that is updated in place
     */
    protected void normalizeData(ResponseData responseData, Map<String, Object> map) {
        final Object rawTitle = map.get(this.fessConfig.getIndexFieldTitle());
        if (rawTitle == null) {
            return;
        }
        final String normalizedTitle = ComponentUtil.getDocumentHelper().getTitle(responseData, rawTitle.toString(), map);
        map.put(this.fessConfig.getIndexFieldTitle(), normalizedTitle);
    }

    /**
     * Applies the directives of the robots META tag unless robots tags are ignored, either by
     * the crawling config parameter or, when that is absent, by the global setting.
     * <ul>
     * <li>noindex and nofollow (or "none"): throws a {@code ChildUrlsException} with no child URLs</li>
     * <li>noindex only: stores the child URLs and throws a {@code ChildUrlsException} carrying them</li>
     * <li>nofollow only: marks the response as no-follow</li>
     * </ul>
     *
     * @param responseData response being processed
     * @param resultData result that receives the child URLs in the noindex case
     * @param document parsed HTML document
     */
    protected void processMetaRobots(ResponseData responseData, ResultData resultData, Document document) {
        final String ignoreParam =
                getConfigPrameterMap(responseData, CrawlingConfig.ConfigName.CONFIG).get(CrawlingConfig.Param.Config.IGNORE_ROBOTS_TAGS);
        // The per-config parameter wins; the global flag is consulted only when it is absent.
        final boolean ignoreRobotsTags = ignoreParam != null ? Boolean.parseBoolean(ignoreParam) : this.fessConfig.isCrawlerIgnoreRobotsTags();
        if (ignoreRobotsTags) {
            return;
        }
        try {
            final Node robotsNode = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
            if (robotsNode == null) {
                return;
            }
            final String directives = robotsNode.getTextContent().toLowerCase(Locale.ROOT);
            final boolean none = directives.contains(ROBOTS_TAG_NONE);
            final boolean noIndex = none || directives.contains(ROBOTS_TAG_NOINDEX);
            final boolean noFollow = none || directives.contains(ROBOTS_TAG_NOFOLLOW);
            if (noIndex) {
                if (noFollow) {
                    logger.info("META(robots=noindex,nofollow): {}", responseData.getUrl());
                    throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
                }
                logger.info("META(robots=noindex): {}", responseData.getUrl());
                storeChildUrls(responseData, resultData);
                throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
            }
            if (noFollow) {
                logger.info("META(robots=nofollow): {}", responseData.getUrl());
                responseData.setNoFollow(true);
            }
        } catch (XPathExpressionException xpathFailure) {
            logger.warn("Could not parse a value of {}", META_NAME_ROBOTS_CONTENT, xpathFailure);
        }
    }

    /**
     * Applies the directives of every non-null {@code X-Robots-Tag} metadata entry of the
     * response unless robots tags are ignored, either by the crawling config parameter or,
     * when that is absent, by the global setting. The directives are handled like those of
     * the robots META tag: noindex combined with nofollow (or "none") throws a
     * {@code ChildUrlsException} without child URLs, noindex alone throws one carrying the
     * stored child URLs, and nofollow alone marks the response as no-follow.
     *
     * @param responseData response whose metadata is inspected
     * @param resultData result that receives the child URLs in the noindex case
     */
    protected void processXRobotsTag(ResponseData responseData, ResultData resultData) {
        final String ignoreParam =
                getConfigPrameterMap(responseData, CrawlingConfig.ConfigName.CONFIG).get(CrawlingConfig.Param.Config.IGNORE_ROBOTS_TAGS);
        final boolean ignoreRobotsTags = ignoreParam != null ? Boolean.parseBoolean(ignoreParam) : this.fessConfig.isCrawlerIgnoreRobotsTags();
        if (ignoreRobotsTags) {
            return;
        }
        for (final Map.Entry<String, ?> header : responseData.getMetaDataMap().entrySet()) {
            if (!X_ROBOTS_TAG.equalsIgnoreCase(header.getKey()) || header.getValue() == null) {
                continue;
            }
            final String directives = header.getValue().toString().toLowerCase(Locale.ROOT);
            final boolean none = directives.contains(ROBOTS_TAG_NONE);
            final boolean noIndex = none || directives.contains(ROBOTS_TAG_NOINDEX);
            final boolean noFollow = none || directives.contains(ROBOTS_TAG_NOFOLLOW);
            if (noIndex) {
                if (noFollow) {
                    logger.info("HEADER(robots=noindex,nofollow): {}", responseData.getUrl());
                    throw new ChildUrlsException(Collections.emptySet(), "#processXRobotsTag");
                }
                logger.info("HEADER(robots=noindex): {}", responseData.getUrl());
                storeChildUrls(responseData, resultData);
                throw new ChildUrlsException(resultData.getChildUrlSet(), "#processXRobotsTag");
            }
            if (noFollow) {
                logger.info("HEADER(robots=nofollow): {}", responseData.getUrl());
                responseData.setNoFollow(true);
            }
        }
    }

    /**
     * Looks up the crawling config of the response's session and returns its parameters for
     * the given config name.
     *
     * @param responseData response whose session id selects the crawling config
     * @param configName parameter group to return
     * @return the parameter map of that group
     */
    protected Map<String, String> getConfigPrameterMap(ResponseData responseData, CrawlingConfig.ConfigName configName) {
        final String sessionId = responseData.getSessionId();
        final CrawlingConfig sessionConfig = ComponentUtil.getCrawlingConfigHelper().get(sessionId);
        return sessionConfig.getConfigParameterMap(configName);
    }

    /**
     * Checks that a URL string can be parsed and names a real host.
     * <p>
     * Values starting with "://" or "//" are completed with the "http" scheme before parsing.
     * A blank value, a malformed URL, a blank host, or a host that is literally "http" or
     * "https" is rejected.
     *
     * @param str URL candidate, may be {@code null}
     * @return {@code true} if the candidate is usable as a URL
     */
    protected boolean isValidUrl(String str) {
        if (StringUtil.isBlank(str)) {
            return false;
        }
        final String candidate;
        if (str.startsWith("://")) {
            candidate = "http" + str;
        } else if (str.startsWith("//")) {
            candidate = "http:" + str;
        } else {
            candidate = str;
        }
        try {
            final String hostName = new URL(candidate).getHost();
            return !StringUtil.isBlank(hostName) && !"http".equalsIgnoreCase(hostName) && !"https".equalsIgnoreCase(hostName);
        } catch (MalformedURLException malformed) {
            return false;
        }
    }

    /**
     * Rejects a canonical URL that would downgrade an https page to http.
     *
     * @param str URL of the crawled page
     * @param str2 canonical URL declared by the page
     * @return {@code false} only when the page URL starts with "https:" and the canonical URL
     *         starts with "http:"; {@code true} otherwise
     */
    protected boolean isValidCanonicalUrl(String str, String str2) {
        final boolean downgrade = str.startsWith("https:") && str2.startsWith("http:");
        if (downgrade && logger.isDebugEnabled()) {
            logger.debug("Invalid Canonical Url(https->http): {} -> {}", str, str2);
        }
        return !downgrade;
    }

    /**
     * Adds the standard index fields (config id, expiry, language, content, cache, digest,
     * host/site/file name, URL, timestamps, anchors, MIME and file type, boost, labels, roles,
     * virtual hosts, ids, thumbnail) plus the XPath- and value-configured fields to the field map.
     * <p>
     * When the page declares a different, valid canonical URL, nothing is added: the canonical
     * URL is handed back to the crawler through a {@code ChildUrlsException}.
     *
     * @param map field map that is filled in place
     * @param responseData response being indexed
     * @param document parsed HTML document
     * @throws ChildUrlsException if the page should be replaced by its canonical URL
     */
    protected void putAdditionalData(Map<String, Object> map, ResponseData responseData, Document document) {
        // Canonical handling comes first so that no field is produced for a page that is replaced.
        final String canonicalUrl = getCanonicalUrl(responseData, document);
        if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
                && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
            final HashSet<RequestData> childUrlSet = new HashSet<>();
            childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
            logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
            throw new ChildUrlsException(childUrlSet, getClass().getName() + "#putAdditionalData");
        }

        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
        final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
        final CrawlingConfig crawlingConfig = getCrawlingConfig(responseData);
        final Object documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
        final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
        final String url = responseData.getUrl();
        final String indexingTarget = crawlingConfig.getIndexingTarget(url);
        final String mappedUrl = pathMappingHelper.replaceUrl(sessionId, url);
        final String mimeType = responseData.getMimeType();
        final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.FIELD);
        final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.XPATH);
        final UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue();
        // The encoding recorded on the URL queue entry takes precedence over the response charset.
        final String urlEncoding =
                (urlQueue == null || urlQueue.getEncoding() == null) ? responseData.getCharSet() : urlQueue.getEncoding();

        // config id
        final String configId = crawlingConfig.getConfigId();
        if (configId != null) {
            putResultDataBody(map, fessConfig.getIndexFieldConfigId(), configId);
        }
        // expires
        if (documentExpires != null) {
            putResultDataBody(map, fessConfig.getIndexFieldExpires(), documentExpires);
        }
        // lang
        final Object lang = systemHelper.normalizeHtmlLang(
                getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), langNode -> pruneNode(langNode, crawlingConfig)));
        if (lang != null) {
            putResultDataBody(map, fessConfig.getIndexFieldLang(), lang);
        }
        // content (pruned only when prunedContent is enabled)
        final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap),
                this.prunedContent ? contentNode -> pruneNode(contentNode, crawlingConfig) : contentNode -> contentNode);
        putResultDataBody(map, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, map));

        // cache
        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled())
                && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
            if (responseData.getContentLength() > 0
                    && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
                String cacheCharSet = responseData.getCharSet();
                if (cacheCharSet == null) {
                    cacheCharSet = "UTF-8";
                }
                // try-with-resources closes the stream even when reading or decoding fails.
                try (BufferedInputStream cacheStream = new BufferedInputStream(responseData.getResponseBody())) {
                    putResultDataBody(map, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(cacheStream), cacheCharSet));
                    putResultDataBody(map, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
                } catch (final Exception e) {
                    // Caching is best effort; the document is still indexed without a cache.
                    logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
                }
            } else {
                logger.debug("Content size is too large({} > {}): {}", Long.valueOf(responseData.getContentLength()),
                        fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
            }
        }

        // digest: an explicit XPath value wins, otherwise it is derived from the content
        final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), digestNode -> digestNode);
        if (StringUtil.isNotBlank(digest)) {
            putResultDataBody(map, fessConfig.getIndexFieldDigest(), digest);
        } else {
            putResultDataBody(map, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, map,
                    fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger().intValue()));
        }
        // segment
        putResultDataBody(map, fessConfig.getIndexFieldSegment(), sessionId);
        // host
        putResultDataBody(map, fessConfig.getIndexFieldHost(), getHost(mappedUrl));
        // site
        putResultDataBody(map, fessConfig.getIndexFieldSite(), getSite(mappedUrl, urlEncoding));
        // filename
        final String fileName = getFileName(mappedUrl, urlEncoding);
        if (StringUtil.isNotBlank(fileName)) {
            putResultDataBody(map, fessConfig.getIndexFieldFilename(), fileName);
        }
        // url
        putResultDataBody(map, fessConfig.getIndexFieldUrl(), mappedUrl);
        // created
        final Object now = systemHelper.getCurrentTime();
        putResultDataBody(map, fessConfig.getIndexFieldCreated(), now);
        // anchor
        putResultDataBody(map, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
        // mimetype
        putResultDataBody(map, fessConfig.getIndexFieldMimetype(), mimeType);
        if (fileTypeHelper != null) {
            // filetype
            putResultDataBody(map, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
        }
        // content_length
        putResultDataBody(map, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
        // last_modified / timestamp: fall back to the crawl time when the response has no date
        final Object lastModified = responseData.getLastModified();
        if (lastModified != null) {
            putResultDataBody(map, fessConfig.getIndexFieldLastModified(), lastModified);
            putResultDataBody(map, fessConfig.getIndexFieldTimestamp(), lastModified);
        } else {
            putResultDataBody(map, fessConfig.getIndexFieldTimestamp(), now);
        }
        // indexing target
        putResultDataBody(map, Constants.INDEXING_TARGET, indexingTarget);
        // boost
        putResultDataBody(map, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
        // label
        putResultDataBody(map, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(mappedUrl));
        // role
        final List<String> roleTypeList = new ArrayList<>();
        StreamUtil.stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(roleTypeList::add));
        putResultDataBody(map, fessConfig.getIndexFieldRole(), roleTypeList);
        // virtual host
        putResultDataBody(map, fessConfig.getIndexFieldVirtualHost(), StreamUtil.stream(crawlingConfig.getVirtualHosts())
                .get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
        // id
        putResultDataBody(map, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(map));
        // parent id: generateId reads the map, so the URL field is temporarily switched to the
        // parent URL and restored afterwards
        final String parentUrl = responseData.getParentUrl();
        if (StringUtil.isNotBlank(parentUrl)) {
            putResultDataBody(map, fessConfig.getIndexFieldUrl(), pathMappingHelper.replaceUrl(sessionId, parentUrl));
            putResultDataBody(map, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(map));
            putResultDataBody(map, fessConfig.getIndexFieldUrl(), mappedUrl);
        }
        // thumbnail
        final String thumbnailUrl = getThumbnailUrl(responseData, document);
        if (StringUtil.isNotBlank(thumbnailUrl)) {
            putResultDataBody(map, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
        }

        // fields from config: XPath entries (except the "default." overrides) and fixed values,
        // both passed through the optional script template registered under the same key
        final String scriptType = crawlingConfig.getScriptType();
        final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.SCRIPT);
        xpathConfigMap.entrySet().stream().filter(entry -> !entry.getKey().startsWith("default.")).forEach(entry -> {
            final String key = entry.getKey();
            final String value = getSingleNodeValue(document, entry.getValue(), fieldNode -> pruneNode(fieldNode, crawlingConfig));
            putResultDataWithTemplate(map, key, value, scriptConfigMap.get(key), scriptType);
        });
        crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.VALUE).entrySet().stream().forEach(entry -> {
            final String key = entry.getKey();
            putResultDataWithTemplate(map, key, entry.getValue(), scriptConfigMap.get(key), scriptType);
        });
    }

    /**
     * Resolves the crawling config registered for the session of the given response.
     *
     * @param responseData response whose session id is used for the lookup
     * @return the value returned by the crawling config helper for that session id
     */
    protected CrawlingConfig getCrawlingConfig(ResponseData responseData) {
        final String sessionId = responseData.getSessionId();
        return ComponentUtil.getCrawlingConfigHelper().get(sessionId);
    }

    /**
     * Returns the XPath used to read the document language.
     *
     * @param fessConfig configuration supplying the default XPath
     * @param map XPath parameters of the crawling config
     * @return the configured override when it is not blank, otherwise the default
     */
    protected String getLangXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get(CrawlingConfig.Param.XPath.DEFAULT_LANG);
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlLangXpath();
    }

    /**
     * Returns the XPath used to read the document content.
     *
     * @param fessConfig configuration supplying the default XPath
     * @param map XPath parameters of the crawling config
     * @return the configured override when it is not blank, otherwise the default
     */
    protected String getContentXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get(CrawlingConfig.Param.XPath.DEFAULT_CONTENT);
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlContentXpath();
    }

    /**
     * Returns the XPath used to read the document digest.
     *
     * @param fessConfig configuration supplying the default XPath
     * @param map XPath parameters of the crawling config
     * @return the configured override when it is not blank, otherwise the default
     */
    protected String getDigestXpath(FessConfig fessConfig, Map<String, String> map) {
        final String override = map.get(CrawlingConfig.Param.XPath.DEFAULT_DIGEST);
        if (StringUtil.isNotBlank(override)) {
            return override;
        }
        return fessConfig.getCrawlerDocumentHtmlDigestXpath();
    }

    /**
     * Extracts the canonical URL declared by the document and resolves it against the page URL.
     * <p>
     * The XPath comes from the crawling config parameter, or from the global configuration
     * when that parameter is absent.
     *
     * @param responseData response providing the page URL and session
     * @param document parsed HTML document
     * @return the resolved canonical URL, or {@code null} when the XPath is blank, nothing
     *         is selected, or the value cannot be resolved
     */
    protected String getCanonicalUrl(ResponseData responseData, Document document) {
        final Map<String, String> configParams = getConfigPrameterMap(responseData, CrawlingConfig.ConfigName.CONFIG);
        final String configuredXpath = configParams.get(CrawlingConfig.Param.Config.HTML_CANONICAL_XPATH);
        final String canonicalXpath = configuredXpath != null ? configuredXpath : this.fessConfig.getCrawlerDocumentHtmlCanonicalXpath();
        if (StringUtil.isBlank(canonicalXpath)) {
            return null;
        }
        final String declaredUrl = getSingleNodeValue(document, canonicalXpath, UnaryOperator.identity());
        return StringUtil.isBlank(declaredUrl) ? null : normalizeCanonicalUrl(responseData.getUrl(), declaredUrl);
    }

    /**
     * Resolves a canonical URL against the URL of the page that declared it.
     * <p>
     * A canonical value starting with ":" is first prefixed with the protocol of the page URL.
     *
     * @param str URL of the crawled page, used as the resolution base
     * @param str2 canonical URL as written in the document
     * @return the absolute canonical URL, or {@code null} if either URL is malformed
     */
    protected String normalizeCanonicalUrl(String str, String str2) {
        try {
            final URL base = new URL(str);
            final String spec;
            if (str2.startsWith(":")) {
                spec = base.getProtocol() + str2;
            } else {
                spec = str2;
            }
            final URL resolved = new URL(base, spec);
            return resolved.toString();
        } catch (MalformedURLException malformed) {
            logger.warn("Invalid canonical url: {} : {}", str, str2, malformed);
            return null;
        }
    }

    /**
     * Replaces every complete {@code <!-- ... -->} comment in the text with a single space.
     * <p>
     * An opening marker without a closing marker stops the processing and leaves the
     * remaining text untouched.
     *
     * @param str text to clean, may be {@code null}
     * @return the cleaned text, or {@code Constants.DEFAULT_IGNORE_FAILURE_TYPE} for {@code null} input
     */
    protected String removeCommentTag(String str) {
        if (str == null) {
            return Constants.DEFAULT_IGNORE_FAILURE_TYPE;
        }
        String text = str;
        for (int open = text.indexOf("<!--"); open >= 0; open = text.indexOf("<!--")) {
            final int close = text.indexOf("-->", open);
            if (close < 0) {
                break;
            }
            final String tail = text.substring(close + 3);
            if (open == 0) {
                text = " " + tail;
            } else {
                text = text.substring(0, open) + " " + tail;
            }
        }
        return text;
    }

    /**
     * Concatenates the text of all nodes selected by an XPath expression.
     * <p>
     * Each selected node is deep-cloned before processing, so the document itself is not
     * modified. When {@code useGoogleOffOn} is enabled the clone first goes through
     * {@link #processGoogleOffOn(Node, ValueHolder)}; it is then passed to the given operator
     * and its text is collected with {@link #parseTextContent(Node, StringBuilder)}.
     * <p>
     * Evaluation failures are logged, including the cause, and whatever text was collected
     * up to that point is returned.
     *
     * @param document parsed HTML document
     * @param str XPath expression selecting the nodes
     * @param unaryOperator transformation applied to each cloned node before its text is read
     * @return the trimmed text, or {@code null} if no node was processed
     */
    protected String getSingleNodeValue(Document document, String str, UnaryOperator<Node> unaryOperator) {
        StringBuilder sb = null;
        try {
            final XPathNodes selectNodeList = getXPathAPI().selectNodeList(document, str);
            for (int i = 0; i < selectNodeList.size(); i++) {
                if (sb == null) {
                    sb = new StringBuilder(1000);
                }
                Node cloneNode = selectNodeList.get(i).cloneNode(true);
                if (this.useGoogleOffOn) {
                    cloneNode = processGoogleOffOn(cloneNode, new ValueHolder<>(true));
                }
                parseTextContent(unaryOperator.apply(cloneNode), sb);
            }
        } catch (final Exception e) {
            // Keep the cause in the log; without it a failing XPath cannot be diagnosed.
            logger.warn("Could not parse a value of {}", str, e);
        }
        if (sb == null) {
            return null;
        }
        return sb.toString().trim();
    }

    /**
     * Recursively appends the trimmed text of all text nodes below the given node, each
     * preceded by a single space. Text that is empty after trimming and childless nodes
     * that are not text nodes contribute nothing.
     *
     * @param node node whose text is collected
     * @param sb buffer receiving the text
     */
    protected void parseTextContent(Node node, StringBuilder sb) {
        if (!node.hasChildNodes()) {
            if (node.getNodeType() != Node.TEXT_NODE) {
                return;
            }
            final String rawText = node.getTextContent();
            if (rawText == null) {
                return;
            }
            final String text = rawText.trim();
            if (text.length() > 0) {
                sb.append(' ').append(text);
            }
            return;
        }
        final NodeList children = node.getChildNodes();
        for (int idx = 0; idx < children.getLength(); idx++) {
            parseTextContent(children.item(idx), sb);
        }
    }

    /**
     * Removes text nodes that lie between a {@code googleoff:} comment and the next
     * {@code googleon:} comment.
     * <p>
     * The on/off state lives in the shared holder, so it carries over between siblings and
     * into nested elements. Text nodes seen while the state is off are removed from their
     * parent after the children have been scanned; every other child is processed recursively.
     *
     * @param node node whose subtree is filtered in place
     * @param valueHolder current state, {@code true} meaning text is kept
     * @return the same node instance
     */
    protected Node processGoogleOffOn(Node node, ValueHolder<Boolean> valueHolder) {
        final NodeList children = node.getChildNodes();
        List<Node> hiddenTextNodes = null;
        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);
            final short childType = child.getNodeType();
            if (childType == Node.COMMENT_NODE) {
                final String comment = child.getNodeValue().trim();
                if (comment.startsWith("googleoff:")) {
                    valueHolder.setValue(false);
                } else if (comment.startsWith("googleon:")) {
                    valueHolder.setValue(true);
                }
            }
            if (childType == Node.TEXT_NODE && !valueHolder.getValue().booleanValue()) {
                // Removal is deferred so the live child list is not modified while it is scanned.
                if (hiddenTextNodes == null) {
                    hiddenTextNodes = new ArrayList<>();
                }
                hiddenTextNodes.add(child);
            } else {
                processGoogleOffOn(child, valueHolder);
            }
        }
        if (hiddenTextNodes != null) {
            for (final Node hidden : hiddenTextNodes) {
                node.removeChild(hidden);
            }
        }
        return node;
    }

    /**
     * Removes the pruned tags configured for the given crawling config from a node.
     * <p>
     * The tag definitions are taken from the config's pruned-tags parameter when it is not
     * blank and parses to a value, otherwise from the global configuration, and are cached
     * per config id. Without a crawling config the global definitions are used directly.
     *
     * @param node node to prune in place
     * @param crawlingConfig crawling config supplying the definitions, may be {@code null}
     * @return the same node instance
     */
    protected Node pruneNode(Node node, CrawlingConfig crawlingConfig) {
        if (crawlingConfig == null) {
            return pruneNodeByTags(node, this.fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray());
        }
        final String configId = crawlingConfig.getConfigId();
        PrunedTag[] tags = this.prunedTagsCache.get(configId);
        if (tags == null) {
            final String configured =
                    crawlingConfig.getConfigParameterMap(CrawlingConfig.ConfigName.CONFIG).get(CrawlingConfig.Param.Config.HTML_PRUNED_TAGS);
            if (StringUtil.isNotBlank(configured)) {
                tags = PrunedTag.parse(configured);
            }
            if (tags == null) {
                tags = this.fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
            }
            this.prunedTagsCache.put(configId, tags);
        }
        if (tags == null) {
            // Final fallback for the case where the defaults were unavailable when the entry was cached.
            tags = this.fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
        }
        return pruneNodeByTags(node, tags);
    }

    /**
     * Recursively removes every descendant that matches one of the pruned-tag definitions.
     * <p>
     * The direct children are first split into matching and non-matching nodes; the matching
     * ones are removed and the remaining ones are then pruned in turn.
     *
     * @param node node to prune in place
     * @param prunedTagArr definitions of the tags to remove
     * @return the same node instance
     */
    protected Node pruneNodeByTags(Node node, PrunedTag[] prunedTagArr) {
        final NodeList children = node.getChildNodes();
        final List<Node> keptChildren = new ArrayList<>();
        final List<Node> prunedChildren = new ArrayList<>();
        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);
            final List<Node> target = isPrunedTag(child, prunedTagArr) ? prunedChildren : keptChildren;
            target.add(child);
        }
        for (final Node pruned : prunedChildren) {
            node.removeChild(pruned);
        }
        for (final Node kept : keptChildren) {
            pruneNodeByTags(kept, prunedTagArr);
        }
        return node;
    }

    /**
     * Tells whether a node matches at least one of the pruned-tag definitions.
     *
     * @param node node to test
     * @param prunedTagArr definitions to test against
     * @return {@code true} as soon as one definition matches, {@code false} if none does
     */
    protected boolean isPrunedTag(Node node, PrunedTag[] prunedTagArr) {
        boolean matched = false;
        for (int idx = 0; idx < prunedTagArr.length && !matched; idx++) {
            matched = prunedTagArr[idx].matches(node);
        }
        return matched;
    }

    /**
     * Joins the text content of all nodes selected by an XPath expression, one per line.
     * <p>
     * Evaluation failures are logged and the text collected so far is returned.
     *
     * @param document parsed HTML document
     * @param str XPath expression selecting the nodes
     * @return the trimmed, newline-separated text; empty when nothing was selected
     */
    protected String getMultipleNodeValue(Document document, String str) {
        final StringBuilder joined = new StringBuilder(100);
        try {
            final XPathNodes matches = getXPathAPI().selectNodeList(document, str);
            int idx = 0;
            while (idx < matches.size()) {
                final String text = matches.get(idx).getTextContent();
                joined.append(text).append("\n");
                idx++;
            }
        } catch (Exception failure) {
            logger.warn("Could not parse a value of {}", str, failure);
        }
        return joined.toString().trim();
    }

    /**
     * Passes a URL through the duplicate host helper.
     * <p>
     * The conversion is best effort: if the helper is unavailable or fails, the URL is
     * returned unchanged and the failure is recorded at debug level only.
     *
     * @param str URL to convert
     * @return the converted URL, or the input when the conversion fails
     */
    protected String replaceDuplicateHost(String str) {
        try {
            return ComponentUtil.getDuplicateHostHelper().convert(str);
        } catch (final Exception e) {
            // Deliberately non-fatal; log the failure instead of swallowing it silently.
            if (logger.isDebugEnabled()) {
                logger.debug("Failed to replace a duplicate host: {}", str, e);
            }
            return str;
        }
    }

    /**
     * Collects the URLs referenced by the document according to the child URL rules.
     * <p>
     * The URLs are resolved against the document's base URL and then converted by
     * {@link #convertChildUrlList(List)}. If any step fails, the failure is logged and the
     * URLs gathered up to that point are returned as they are.
     *
     * @param document parsed HTML document
     * @param responseData response providing the page URL and charset
     * @return the collected URLs, possibly empty
     */
    protected List<String> getAnchorList(Document document, ResponseData responseData) {
        List<RequestData> anchors = new ArrayList<>();
        try {
            final URL base = getBaseUrl(responseData.getUrl(), getBaseHref(document));
            for (final Map.Entry<String, String> rule : this.childUrlRuleMap.entrySet()) {
                final List<String> found = getUrlFromTagAttribute(base, document, rule.getKey(), rule.getValue(), responseData.getCharSet());
                for (final String childUrl : found) {
                    anchors.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
                }
            }
            anchors = convertChildUrlList(anchors);
        } catch (Exception failure) {
            logger.warn("Could not parse anchor tags.", failure);
        }
        final List<String> urls = new ArrayList<>(anchors.size());
        for (final RequestData anchor : anchors) {
            urls.add(anchor.getUrl());
        }
        return urls;
    }

    /**
     * Determines the base URL used to resolve links.
     *
     * @param str the URL of the current page
     * @param str2 the base href of the document, or {@code null} if there is none
     * @return {@code str2} resolved via {@link #getURL(String, String)} when present,
     *         otherwise {@code str} parsed as a URL
     * @throws MalformedURLException if the relevant value cannot be parsed as a URL
     */
    protected URL getBaseUrl(String str, String str2) throws MalformedURLException {
        if (str2 == null) {
            return new URL(str);
        }
        return getURL(str, str2);
    }

    /**
     * Rewrites the URL of every request in the list, in place.
     * Each URL is first run through all {@code convertUrlMap} entries with
     * {@link String#replaceAll(String, String)} (so keys are regular expressions),
     * then through the path-mapping helper, then through
     * {@link #replaceDuplicateHost(String)}.
     *
     * @param list the requests to rewrite; may be {@code null}
     * @return the same list instance that was passed in ({@code null} for {@code null})
     */
    protected List<RequestData> convertChildUrlList(List<RequestData> list) {
        if (list == null) {
            return list;
        }
        final PathMappingHelper mapper = getPathMappingHelper();
        for (final RequestData request : list) {
            String rewritten = request.getUrl();
            for (final Map.Entry<String, String> rule : this.convertUrlMap.entrySet()) {
                rewritten = rewritten.replaceAll(rule.getKey(), rule.getValue());
            }
            final String mapped = mapper.replaceUrl(rewritten);
            request.setUrl(replaceDuplicateHost(mapped));
        }
        return list;
    }

    /**
     * Looks up the path-mapping helper through {@code ComponentUtil}.
     * Kept as a separate protected method so the lookup happens in one place.
     *
     * @return the helper returned by {@code ComponentUtil.getPathMappingHelper()}
     */
    protected PathMappingHelper getPathMappingHelper() {
        final PathMappingHelper helper = ComponentUtil.getPathMappingHelper();
        return helper;
    }

    /**
     * Restores the object stored in an access result.
     *
     * @param accessResultData the stored result whose bytes are read
     * @return the object produced by {@code SerializeUtil.fromBinaryToObject}, or a
     *         new empty {@link HashMap} when the result carries no bytes
     * @throws CrawlerSystemException if the bytes cannot be turned back into an object
     */
    public Object getData(AccessResultData<?> accessResultData) {
        final byte[] serialized = accessResultData.getData();
        if (serialized != null) {
            try {
                return SerializeUtil.fromBinaryToObject(serialized);
            } catch (Exception e) {
                throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
            }
        }
        return new HashMap<String, Object>();
    }

    /**
     * Resolves one attribute value against a base URL and, if a usable URL results,
     * appends it to the given list.
     *
     * @param list the list that receives the resolved child URL
     * @param url the base URL the attribute value is resolved against
     * @param str the raw attribute value
     * @param str2 the value passed to {@code encodeUrl} as its second argument
     *        (the caller in this class supplies the response character set)
     */
    protected void addChildUrlFromTagAttribute(List<String> list, URL url, String str, String str2) {
        final String candidate = str.trim();
        String childUrl = null;
        try {
            // A value starting with ':' gets the base URL's protocol prepended.
            final String spec = candidate.startsWith(":") ? url.getProtocol() + candidate : candidate;
            final URL resolved = new URL(url, spec);
            childUrl = encodeUrl(normalizeUrl(resolved.toExternalForm()), str2);
        } catch (MalformedURLException e) {
            // Keep the value as-is only when a colon appears within its first few
            // characters (presumably a scheme java.net.URL has no handler for).
            final int colonPos = candidate.indexOf(':');
            if (colonPos > 0 && colonPos < 10) {
                childUrl = encodeUrl(normalizeUrl(candidate), str2);
            }
        }

        if (childUrl == null) {
            logger.warn("Ignored child URL: {} in {}", str, url);
            return;
        }

        if (logger.isDebugEnabled()) {
            logger.debug("{} -> {}", str, childUrl);
        }
        if (!StringUtil.isNotBlank(childUrl)) {
            if (logger.isDebugEnabled()) {
                logger.debug("Skip Child: {}", childUrl);
            }
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Add Child: {}", childUrl);
        }
        list.add(childUrl);
    }

    /**
     * Checks whether a byte array starts with the UTF-8 byte order mark
     * ({@code EF BB BF}).
     *
     * @param bArr the bytes to inspect; may be {@code null} or shorter than three bytes
     * @return {@code true} only if the first three bytes are the UTF-8 BOM;
     *         {@code false} for {@code null} or for arrays with fewer than three bytes
     */
    private boolean isUtf8BomBytes(byte[] bArr) {
        // Guard the indexed reads so a short read buffer yields false instead of
        // an ArrayIndexOutOfBoundsException.
        if (bArr == null || bArr.length < 3) {
            return false;
        }
        return bArr[0] == (byte) 0xEF && bArr[1] == (byte) 0xBB && bArr[2] == (byte) 0xBF;
    }

    /**
     * Sets the {@code useGoogleOffOn} flag of this transformer.
     * NOTE(review): presumably controls whether googleoff/googleon markers in the
     * page are honored; confirm where the flag is read.
     *
     * @param z the new value of the flag
     */
    public void setUseGoogleOffOn(boolean z) {
        this.useGoogleOffOn = z;
    }

    /**
     * Picks a thumbnail URL for a document. Candidates are tried in this order:
     * <ol>
     * <li>the node selected by {@code META_NAME_THUMBNAIL_CONTENT};</li>
     * <li>the node selected by {@code META_PROPERTY_OGIMAGE_CONTENT};</li>
     * <li>the nodes selected by the configured thumbnail image XPath: the first
     * accepted image whose width and height pass {@code validateThumbnailSize},
     * otherwise the first accepted image that lacks a usable width or height.</li>
     * </ol>
     * Any exception is logged as a warning and results in {@code null}.
     *
     * @param responseData supplies the page URL used to resolve relative references
     * @param document the parsed document
     * @return the thumbnail URL, or {@code null} when none was found
     */
    protected String getThumbnailUrl(ResponseData responseData, Document document) {
        try {
            final Node metaThumbnail = getXPathAPI().selectSingleNode(document, META_NAME_THUMBNAIL_CONTENT);
            if (metaThumbnail != null) {
                final String content = metaThumbnail.getTextContent();
                if (StringUtil.isNotBlank(content)) {
                    final URL metaUrl = getURL(responseData.getUrl(), content);
                    if (metaUrl != null) {
                        return metaUrl.toExternalForm();
                    }
                }
            }

            final Node ogImage = getXPathAPI().selectSingleNode(document, META_PROPERTY_OGIMAGE_CONTENT);
            if (ogImage != null) {
                final String content = ogImage.getTextContent();
                if (StringUtil.isNotBlank(content)) {
                    final URL ogUrl = getURL(responseData.getUrl(), content);
                    if (ogUrl != null) {
                        return ogUrl.toExternalForm();
                    }
                }
            }

            final XPathNodes images = getXPathAPI().selectNodeList(document, this.fessConfig.getThumbnailHtmlImageXpath());
            String fallback = null;
            for (int idx = 0; idx < images.size(); idx++) {
                final Node image = images.get(idx);
                if (logger.isDebugEnabled()) {
                    logger.debug("img tag: {}", image);
                }
                final NamedNodeMap attributes = image.getAttributes();
                final String src = getThumbnailSrc(responseData.getUrl(), attributes);
                final Integer height = getAttributeAsInteger(attributes, "height");
                final Integer width = getAttributeAsInteger(attributes, "width");
                if (!this.fessConfig.isThumbnailHtmlImageUrl(src)) {
                    continue;
                }
                if (height == null || width == null) {
                    // Remember only the first image without a usable size.
                    if (fallback == null) {
                        fallback = src;
                    }
                    continue;
                }
                try {
                    if (this.fessConfig.validateThumbnailSize(width.intValue(), height.intValue())) {
                        return src;
                    }
                } catch (Exception sizeFailure) {
                    logger.debug("Failed to parse {} at {}", image, responseData.getUrl(), sizeFailure);
                }
            }
            return fallback;
        } catch (Exception e) {
            logger.warn("Failed to retrieve thumbnail url from {}", responseData.getUrl(), e);
            return null;
        }
    }

    /**
     * Resolves the {@code src} attribute of an element against the page URL.
     *
     * @param str the URL of the current page
     * @param namedNodeMap the attributes of the element
     * @return the resolved URL in external form, or {@code null} when there is no
     *         {@code src} attribute or it cannot be resolved (failures are logged
     *         at debug level only)
     */
    protected String getThumbnailSrc(String str, NamedNodeMap namedNodeMap) {
        final Node srcAttr = namedNodeMap.getNamedItem("src");
        if (srcAttr != null) {
            try {
                final URL resolved = getURL(str, srcAttr.getTextContent());
                return resolved == null ? null : resolved.toExternalForm();
            } catch (Exception e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Failed to parse thumbnail url for {} : {}", str, namedNodeMap, e);
                }
            }
        }
        return null;
    }

    /**
     * Reads an attribute and interprets it as a decimal integer.
     *
     * @param namedNodeMap the attributes to look in
     * @param str the attribute name
     * @return the parsed value; {@code null} when the attribute is missing, has no
     *         text, or is a non-numeric value ending in {@code %} or {@code px};
     *         {@code 0} for any other non-numeric value
     */
    protected Integer getAttributeAsInteger(NamedNodeMap namedNodeMap, String str) {
        final Node attribute = namedNodeMap.getNamedItem(str);
        if (attribute == null) {
            return null;
        }
        final String raw = attribute.getTextContent();
        if (raw == null) {
            return null;
        }
        try {
            return Integer.valueOf(raw);
        } catch (NumberFormatException e) {
            // Values with a unit suffix count as "no size"; anything else is zero.
            if (raw.endsWith("%") || raw.endsWith("px")) {
                return null;
            }
            return Integer.valueOf(0);
        }
    }

    /**
     * Builds a URL from a reference found in a page.
     * <ul>
     * <li>{@code "://host/..."}: the text of {@code str} before its first colon is prepended;</li>
     * <li>{@code "//host/..."}: the same prefix plus {@code ":"} is prepended;</li>
     * <li>a reference starting with {@code "/"} or containing no colon is resolved
     * relative to {@code str};</li>
     * <li>anything else is parsed on its own as an absolute URL.</li>
     * </ul>
     *
     * @param str the URL of the current page
     * @param str2 the reference to resolve; may be {@code null}
     * @return the resulting URL, or {@code null} when {@code str2} is {@code null}
     * @throws MalformedURLException if the resulting string is not a valid URL
     */
    protected URL getURL(String str, String str2) throws MalformedURLException {
        if (str2 == null) {
            return null;
        }
        if (str2.startsWith("://")) {
            return new URL(str.split(":")[0] + str2);
        }
        if (str2.startsWith("//")) {
            return new URL(str.split(":")[0] + ":" + str2);
        }
        if (str2.startsWith("/") || str2.indexOf(':') == -1) {
            return new URL(new URL(str), str2);
        }
        return new URL(str2);
    }

    /**
     * Registers a field rule through the two-argument {@code addFieldRule} and then
     * records a flag for the same field name in {@code fieldPrunedRuleMap}.
     *
     * @param str the field name, used as the key in {@code fieldPrunedRuleMap}
     * @param str2 the second argument forwarded to the two-argument {@code addFieldRule}
     * @param z the flag stored for the field (presumably whether pruning applies to
     *        it; confirm where the map is read)
     */
    public void addFieldRule(String str, String str2, boolean z) {
        addFieldRule(str, str2);
        final Boolean flag = z ? Boolean.TRUE : Boolean.FALSE;
        this.fieldPrunedRuleMap.put(str, flag);
    }

    /**
     * Adds all entries of the given map to {@code convertUrlMap}.
     * Despite the setter name, existing entries are not cleared: the new entries
     * are merged in via {@code putAll}.
     *
     * @param map the entries to add; must not be {@code null}
     */
    public void setConvertUrlMap(Map<String, String> map) {
        this.convertUrlMap.putAll(map);
    }

    /**
     * Adds a single entry to {@code convertUrlMap}.
     * {@code convertChildUrlList} passes each entry's key and value to
     * {@code String.replaceAll}, so the key is treated as a regular expression and
     * the value as its replacement.
     *
     * @param str the key (regular expression) to register
     * @param str2 the value (replacement) to register
     */
    public void addConvertUrl(String str, String str2) {
        this.convertUrlMap.put(str, str2);
    }
}
