jsoup爬取某网站安全数据

Posted by 西北逍遥

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了jsoup爬取某网站安全数据相关的知识,希望对你有一定的参考价值。

jsoup爬取某网站安全数据

package com.vfsd.net;

import java.io.IOException;
import java.sql.SQLException;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.vfsd.dao.Managemysql;

/**
 * Servlet implementation class GetURL13
 */
@WebServlet("/GetURL13")
public class GetURL13 extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public GetURL13() {
        super();
        // TODO Auto-generated constructor stub
    }
    private String message;
    
    @Override
    public void init() throws ServletException {
        message = "Hello world, this message is from servlet!";
        System.out.println("------"+message);
        try {
            ManageMySQL.getConnection();
            
        } catch (SQLException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        //response.getWriter().append("Served at: ").append(request.getContextPath());
        String agent1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/74.0.3729.169 Safari/537.36";
        
        int pageNum=1;
        int pageSize=10;
        //for(pageNum=1;pageNum<101;pageNum++)
        for(pageNum=1;pageNum<924;pageNum++)
        {
            try {
                int page1= (pageNum-1)*pageSize;
                Map<Integer,String> map1 = ManageMySQL.getNewsLinkInTable(page1,pageSize,"data_bjszfhcxjswyh");
                for(Integer key : map1.keySet())
                {
                    System.out.println(key+"  "+map1.get(key));
                    String news_link = map1.get(key);
                    String context1="";
                    String source1="";
                    String publishDate = "";
                    //String context1 = getContentByURL(news_link).replace(" ", "");
                    
                    if(!news_link.contains("void"))
                    {
                        if(news_link.endsWith("html"))
                        {
                            Document documentRoot = Jsoup.connect(news_link).userAgent(agent1).get();
                            Elements elements2 = documentRoot.select("#content_list");
                            //Elements elements2_1 = documentRoot.select("div.div_right");
                            if(elements2.size()==1)
                            {
                                Element div_ele = elements2.get(0);
                                context1 = div_ele.text();
                                ManageMySQL.updateContextAndPublishDate2(key, context1.replace("‘", "").replace(""", ""),source1,publishDate,"data_bjszfhcxjswyh");
                            }
                            
                            
                        }
                        
                    }
                    
                }
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            
        }
    }



    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        doGet(request, response);
    }

}

 

以上是关于jsoup爬取某网站安全数据的主要内容,如果未能解决你的问题,请参考以下文章:

Scala实现爬取某网站数据

2020学习 04 python 爬取某政府网站信件

用python爬取某宝热卖网站商品信息(爬虫之路,永无止境!)

正则爬取某段子网站前20页段子(request库)

python爬虫--爬取某网站电影下载地址

Python爬虫:爬取某网站关键词对应商品ID,且存入DB2数据库