package com.github.vector4wang.thread;

import com.github.vector4wang.VWCrawler;
import com.github.vector4wang.annotation.CssSelector;
import com.github.vector4wang.model.PageRequest;
import com.github.vector4wang.proxy.Proxy2;
import com.github.vector4wang.util.CrawlerUtil;
import com.github.vector4wang.util.GenericsUtils;
import com.github.vector4wang.util.ReflectUtils;
import com.github.vector4wang.util.SelectType;
import java.io.IOException;
import java.lang.reflect.Field;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/github/vector4wang/thread/CrawlerThread.class */
/**
 * Crawler worker: repeatedly asks the owning {@link VWCrawler} for the next URL, downloads
 * the page, queues every link that matches the crawler's seed/target patterns and, when the
 * URL itself is a target, fills a result object from its {@link CssSelector}-annotated fields
 * and hands it to the crawler service.
 *
 * <p>The worker leaves {@link #run()} when the crawler hands out an empty URL or when any
 * exception reaches the loop.
 */
public class CrawlerThread implements Runnable {
    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerThread.class);

    private final VWCrawler vwCrawler;

    // Written by the worker in run() and exposed through a public getter/setter.
    // NOTE(review): volatile on the assumption that it is polled from other threads — confirm.
    private volatile boolean isRunning;

    /**
     * @param vWCrawler crawler that supplies URLs, configuration and the service callbacks
     */
    public CrawlerThread(VWCrawler vWCrawler) {
        this.vwCrawler = vWCrawler;
    }

    /**
     * @return {@code true} while a URL is being processed; {@code false} while the worker is
     *         waiting for its next URL or after {@link #run()} has returned
     */
    public boolean isRunning() {
        return this.isRunning;
    }

    /**
     * Overrides the running flag.
     *
     * @param z new value of the flag
     */
    public void setRunning(boolean z) {
        this.isRunning = z;
    }

    /**
     * Main loop. Each round clears the running flag, calls {@code tryStop()}, fetches the next
     * URL, raises the flag again and processes the URL. The loop ends on an empty URL or on
     * any exception; the flag is always cleared on the way out so that a finished worker is
     * never reported as running.
     */
    @Override
    public void run() {
        try {
            while (true) {
                this.isRunning = false;
                this.vwCrawler.tryStop();
                String nextUrl = this.vwCrawler.generateUrl();
                this.isRunning = true;
                if (StringUtils.isEmpty(nextUrl)) {
                    LOGGER.info("no url");
                    return;
                }
                process(nextUrl);
            }
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                LOGGER.info("vw-crawler[{}] stopped!", Thread.currentThread().getName());
                // Restore the interrupt status for whoever owns this thread.
                Thread.currentThread().interrupt();
            } else {
                LOGGER.error(e.getMessage(), e);
            }
        } finally {
            this.isRunning = false;
        }
    }

    /**
     * Crawls a single URL: skips it when the service reports it as existing, downloads it,
     * queues matching links and, for target URLs, extracts and saves the result object.
     *
     * @param url address to crawl
     */
    private void process(String url) {
        if (this.vwCrawler.getCrawlerService().isExist(url)) {
            return;
        }
        LOGGER.info("{}开始抓取[{}]当前待抓取数为{},已抓取数为{}",
                Thread.currentThread().getName(),
                url,
                this.vwCrawler.getWaitCrawlerUrls().size(),
                this.vwCrawler.getCrawledUrls().size());

        Document document = fetch(url);
        if (document == null) {
            // Nothing was downloaded; do not hand a null page to the service callbacks.
            return;
        }
        if (!this.vwCrawler.getCrawlerService().isContinue(document)) {
            return;
        }

        // Seed-page matches are queued before target matches, in document order.
        Elements links = document.select("a[href]");
        enqueueMatching(links, this.vwCrawler.getSeedsPageUrlRex());
        enqueueMatching(links, this.vwCrawler.getTargetUrlRex());

        if (!this.vwCrawler.isTargetUrl(url)) {
            return;
        }
        Object model = populateModel(document);
        this.vwCrawler.getCrawlerService().parsePage(document, model);
        this.vwCrawler.getCrawlerService().save(model);
    }

    /**
     * Downloads {@code url}, trying again after a failed download until a document is obtained
     * or {@code getRetryCount()} attempts have been made (at least one attempt is always made).
     * A failure while preparing the request is logged and not retried.
     *
     * @param url address to download
     * @return the downloaded document, or {@code null} when no attempt succeeded
     */
    private Document fetch(String url) {
        Document document = null;
        int attempts = 0;
        do {
            PageRequest request;
            try {
                request = new PageRequest();
                request.setUrl(url);
                request.setTimeout(this.vwCrawler.getTimeout());
                if (this.vwCrawler.getHeaderMap() != null && !this.vwCrawler.getHeaderMap().isEmpty()) {
                    request.setHeader(this.vwCrawler.getHeaderMap());
                }
                List<Proxy2> proxies = this.vwCrawler.getProxyExtractor().getProxy2s();
                if (proxies != null && !proxies.isEmpty()) {
                    request.setProxy(this.vwCrawler.getProxyExtractor().extractProxyIp());
                }
            } catch (Exception e) {
                if (e instanceof IOException) {
                    LOGGER.warn("请求地址发生错误", e);
                } else {
                    LOGGER.error(e.getMessage(), e);
                }
                return null;
            }

            try {
                document = this.vwCrawler.getDownloader().downloadPage(request);
            } catch (ConnectException e) {
                LOGGER.warn("链接超时");
            } catch (SocketTimeoutException e) {
                LOGGER.warn("read timed out: [{}]", url);
            } catch (Exception e) {
                LOGGER.warn("download failed: [{}]", url, e);
            }
            attempts++;
        } while (document == null && attempts < this.vwCrawler.getRetryCount());

        if (document == null) {
            LOGGER.warn("giving up on [{}] after {} attempt(s)", url, attempts);
        }
        return document;
    }

    /**
     * Queues the absolute {@code href} of every link that matches one of {@code patterns}.
     * A link is queued once per matching pattern. Links whose absolute URL is empty are
     * skipped, because {@link #run()} treats an empty URL as the signal to exit.
     *
     * @param links    anchor elements of the current page
     * @param patterns patterns checked with {@code CrawlerUtil.isMatch}
     */
    private void enqueueMatching(Elements links, Iterable<String> patterns) {
        for (Element link : links) {
            String absoluteUrl = link.absUrl("href");
            if (StringUtils.isEmpty(absoluteUrl)) {
                continue;
            }
            for (String pattern : patterns) {
                if (CrawlerUtil.isMatch(pattern, absoluteUrl)) {
                    this.vwCrawler.addWaitCrawlerUrl(absoluteUrl);
                }
            }
        }
    }

    /**
     * Instantiates the type reported by {@code GenericsUtils.getSuperClassGenericType} for the
     * crawler service class and sets every declared field that carries a non-empty
     * {@link CssSelector}: the selected elements are taken as HTML for
     * {@link SelectType#HTML} and as text otherwise, then converted with
     * {@code ReflectUtils.parseValueWithType}.
     *
     * @param document page to extract values from
     * @return the populated result object
     * @throws IllegalStateException if the object cannot be instantiated or a field cannot be set
     */
    private Object populateModel(Document document) {
        try {
            Object model = GenericsUtils.getSuperClassGenericType(this.vwCrawler.getCrawlerService().getClass()).newInstance();
            for (Field field : model.getClass().getDeclaredFields()) {
                CssSelector cssSelector = field.getAnnotation(CssSelector.class);
                if (cssSelector == null || StringUtils.isEmpty(cssSelector.selector())) {
                    continue;
                }
                Elements matched = document.select(cssSelector.selector());
                String rawValue = cssSelector.resultType() == SelectType.HTML ? matched.toString() : matched.text();
                field.setAccessible(true);
                field.set(model, ReflectUtils.parseValueWithType(rawValue, field));
            }
            return model;
        } catch (ReflectiveOperationException e) {
            // Unchecked so it still reaches the handler in run(), with the cause preserved.
            throw new IllegalStateException("cannot populate result object for " + document.location(), e);
        }
    }
}
