package com.rw.crawler;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
import com.rw.crawler.EntryObject;
import com.wb.util.DbUtil;
import com.wb.util.SysUtil;
import com.xiaoleilu.hutool.util.StrUtil;
import com.xiaoleilu.hutool.util.URLUtil;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import jdk.nashorn.api.scripting.ScriptObjectMirror;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * In-memory crawler that starts from the {@code menu.htm} page under
 * {@code http://bbs.gtja.net/hgwx/}, evaluates the JavaScript {@code menu} variable embedded in
 * that page to discover the linked {@code .htm} pages, and stores the cleaned body of each linked
 * page in the {@code WB_CRAWLER_GT} table.
 *
 * <p>Pages are distinguished by the {@code pageType} meta value attached to each
 * {@link CrawlDatum}: {@code "menu"} for the seed page and {@code "book"} for content pages.
 */
public class GTCrawler2 extends RamCrawler {

   private static final Logger logger = LoggerFactory.getLogger(GTCrawler2.class);

   /** Base address that every relative menu URL is resolved against. */
   private static final String BASE_URL = "http://bbs.gtja.net/hgwx/";

   /** Seed page containing the JavaScript menu definition. */
   private static final String MENU_URL = BASE_URL + "menu.htm";

   private static final String META_PAGE_TYPE = "pageType";
   private static final String PAGE_TYPE_MENU = "menu";
   private static final String PAGE_TYPE_BOOK = "book";

   /** Zero-based position of the {@code <script>} element that holds the menu definition. */
   private static final int MENU_SCRIPT_INDEX = 2;

   private static final String SCRIPT_TAG_OPEN = "<script";


   /** Creates the crawler and registers the menu page as its only seed. */
   public GTCrawler2() {
      CrawlDatum datum = new CrawlDatum(MENU_URL).meta(META_PAGE_TYPE, PAGE_TYPE_MENU);
      this.addSeed(datum);
   }

   /**
    * Handles one fetched page according to its {@code pageType} meta value. Pages with any other
    * (or no) page type are ignored.
    *
    * @param page the fetched page
    * @param next collector for follow-up crawl tasks; only filled for the menu page
    */
   public void visit(Page page, CrawlDatums next) {
      String pageType = page.meta(META_PAGE_TYPE);
      if (PAGE_TYPE_MENU.equals(pageType)) {
         this.visitMenu(page, next);
      } else if (PAGE_TYPE_BOOK.equals(pageType)) {
         this.visitBook(page);
      }
   }

   /** Evaluates the menu script of the seed page and schedules every linked {@code .htm} page. */
   private void visitMenu(Page page, CrawlDatums next) {
      List<Element> scripts = page.select("script");
      if (scripts.size() <= MENU_SCRIPT_INDEX) {
         logger.error("Error crawler: expected more than {} <script> elements on {} but found {}",
               MENU_SCRIPT_INDEX, page.url(), scripts.size());
         return;
      }

      // The script body is wrapped in an HTML comment; strip the markers before evaluating it.
      String data = scripts.get(MENU_SCRIPT_INDEX).data();
      String jsVar = StrUtil.removePrefix(StrUtil.removeSuffix(StrUtil.trim(data), "-->"), "<!--");

      // getEngineByName returns null when no engine with that name is installed.
      ScriptEngine nashorn = new ScriptEngineManager().getEngineByName("nashorn");
      if (nashorn == null) {
         logger.error("Error crawler: script engine 'nashorn' is not available");
         return;
      }

      List<EntryObject> entries = new ArrayList<>();
      try {
         nashorn.eval(jsVar);
         Object menu = nashorn.get("menu");
         if (!(menu instanceof ScriptObjectMirror)) {
            logger.error("Error crawler: script on {} did not define an object named 'menu'", page.url());
            return;
         }
         this.eachObject("-1", (ScriptObjectMirror) menu, entries);
      } catch (ScriptException e) {
         logger.error("Error crawler: " + e.getMessage(), e);
         return;
      }

      int index = 1;
      for (EntryObject eo : entries) {
         String relativeUrl = eo.getUrl();
         if (StrUtil.isNotBlank(relativeUrl) && StrUtil.endWithIgnoreCase(relativeUrl, "htm")) {
            String seed = BASE_URL + URLUtil.encode(relativeUrl, "utf-8");
            logger.debug("{}:{},id={}, title={}", index++, seed, eo.getId(), eo.getTitle());
            CrawlDatum cd = new CrawlDatum(seed)
                  .meta("id", eo.getId())
                  .meta("pid", eo.getPid())
                  .meta("title", eo.getTitle())
                  .meta(META_PAGE_TYPE, PAGE_TYPE_BOOK)
                  .key(eo.getId());
            next.add(cd);
         }
      }
   }

   /** Extracts the body of a content page, cleans it and stores it; blank bodies are stored as "". */
   private void visitBook(Page page) {
      String id = page.meta("id");
      String pid = page.meta("pid");
      String title = page.meta("title");

      // first() yields null when the document has no <body>; treat that like an empty page.
      Element body = page.select("body").first();
      String content = body == null
            ? ""
            : StrUtil.trimStart(StrUtil.removePrefix(body.html().trim(), "”"));

      boolean hasContent = StrUtil.isNotBlank(content);
      if (hasContent) {
         content = cleanContent(content);
      }

      logger.debug("URL={},ID={}, PID={}, TITLE={}, BODY={}", page.url(), id, pid, title, content);
      this.record(id, pid, title, hasContent ? content : "");
   }

   /**
    * Cuts the content at the first {@code <script} tag and, unless the remainder contains a
    * table, reduces the markup to plain text with carriage returns as paragraph breaks.
    */
   private static String cleanContent(String content) {
      // Only truncate when an actual tag is found. The previous code truncated whenever the word
      // "script" occurred anywhere and passed a possibly negative index on to the substring call.
      int scriptStart = indexOfIgnoreCase(content, SCRIPT_TAG_OPEN);
      if (scriptStart >= 0) {
         content = content.substring(0, scriptStart);
      }

      if (!StrUtil.containsIgnoreCase(content, "<table")) {
         content = content
               .replaceAll("<p[^>]*>", "")
               .replaceAll("</p>", "\r")
               .replaceAll("<br>", "\r")
               .replaceAll("&nbsp;", "")
               // Removes whatever tags are left after the replacements above.
               .replaceAll("<(S*?)[^>]*>.*?|<.*? />", "");
      }
      return content;
   }

   /** Returns the index of the first case-insensitive occurrence of {@code needle}, or -1. */
   private static int indexOfIgnoreCase(String text, String needle) {
      int last = text.length() - needle.length();
      for (int i = 0; i <= last; i++) {
         if (text.regionMatches(true, i, needle, 0, needle.length())) {
            return i;
         }
      }
      return -1;
   }

   /**
    * Recursively converts a script object into {@link EntryObject}s.
    *
    * <p>One entry with a fresh id from {@link SysUtil#getId()} is created for {@code o} itself.
    * Property {@code "1"} becomes its title and property {@code "2"} its URL; every property whose
    * value is itself a script object is processed recursively with this entry's id as parent id.
    * Because the entry is appended after its properties were walked, nested entries precede their
    * parent in {@code entryList}.
    *
    * @param pid id of the parent entry
    * @param o script object to convert
    * @param entryList list that receives the created {@link EntryObject}s
    */
   public void eachObject(String pid, ScriptObjectMirror o, List entryList) {
      // The parameter stays a raw List to keep the public signature unchanged.
      @SuppressWarnings("unchecked")
      List<EntryObject> entries = entryList;

      EntryObject eo = new EntryObject();
      String id = SysUtil.getId();
      eo.setId(id);
      eo.setPid(pid);

      for (Entry<String, Object> entry : o.entrySet()) {
         String key = entry.getKey();
         Object value = entry.getValue();
         if (value instanceof ScriptObjectMirror) {
            this.eachObject(id, (ScriptObjectMirror) value, entries);
         } else if ("1".equals(key)) {
            eo.setTitle(value == null ? null : value.toString());
         } else if ("2".equals(key)) {
            eo.setUrl(value == null ? null : value.toString());
         }
      }

      entries.add(eo);
   }

   /**
    * Inserts one row into {@code WB_CRAWLER_GT}, binding id, pid, title and content in that order.
    * Failures are logged and not propagated.
    */
   private void record(String id, String pid, String title, String content) {
      Connection conn = null;
      PreparedStatement st = null;

      try {
         conn = DbUtil.getConnection();
         st = conn.prepareStatement("insert into WB_CRAWLER_GT values(?,?,?,?)");
         st.setString(1, id);
         st.setString(2, pid);
         st.setString(3, title);
         st.setString(4, content);
         st.executeUpdate();
      } catch (Throwable t) {
         // NOTE(review): the exceptions declared by DbUtil are not visible here, so the broad
         // catch is kept; the throwable is now passed to the logger to preserve the stack trace.
         logger.error("网页抓取内容入库错误:" + t.getMessage(), t);
      } finally {
         DbUtil.close(st);
         DbUtil.close(conn);
      }
   }

   /** Starts a crawl with the default configuration. */
   public static void main(String[] args) throws Exception {
      GTCrawler2 crawler = new GTCrawler2();
      crawler.start();
   }
}
