CVPR论文爬取并进行词云展示

Posted dwx8845

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CVPR论文爬取并进行词云展示相关的知识,希望对你有一定的参考价值。

效果图:

 技术图片技术图片

 

 

 

 

 

 源码如下:

首先是Python对cvpr论文的爬取部分:爬取的网址为 http://openaccess.thecvf.com

技术图片
import  pymysql
import re
import requests

# 连接数据库函数
def insertCvpr(value):

    try:
        db = pymysql.connect("localhost", "root", "root", "jiaoli")
        print("数据库连接成功!")
        cur = db.cursor()
        sql = INSERT INTO cvpr(title,ab,hotword,pdf) VALUE (%s,%s,%s,%s)
        cur.execute(sql, value)
        db.commit()
        print("增加数据成功!")
    except pymysql.Error as e:
        print("增加数据失败:  " + str(e))
        db.rollback()

    db.close()


    #开头
    url = "http://openaccess.thecvf.com/ICCV2019.py"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
    res = requests.get(url,headers=headers)
    res.encoding = "utf-8"
    # 先爬取每个论文的网址
    web = re.findall("""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", res.text, re.S)
    for each in web:
        try:
            each = "http://openaccess.thecvf.com/" + each
            print(each)
            res = requests.get(each, headers=headers, timeout=(3, 7))
            res.encoding = "utf-8"
            # 在各各论文网站中爬取详细信息
            title = re.findall("""<div id="papertitle">(.*?)</div>""", res.text, re.S)
            ab = re.findall("""<div id="abstract" >(.*?)</div>""", res.text, re.S)
            pdf = re.findall("""[<a href="../../(.*?)">pdf</a>]""", res.text, re.S)
            if (len(title) > 0):
                title = title[0].replace("
", "")
                ab = ab[0].replace("
", "")
                pdf = "http://openaccess.thecvf.com/" + pdf[0]
                print(title)
                value = (title, ab, "", pdf)
                insertCvpr(value)
        except:
            print("闪过")
爬虫

然后对爬取的文章进行题目和摘要的关键词分析分析出出现词频最高的有效词汇: 

技术图片
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;

    //2.将删除改成类名
    /**
     * Servlet implementation class index
     */
    @WebServlet("/input")
    public class input extends HttpServlet{
        private static final long serialVersionUID = 1L;
        
        /**
         * @see HttpServlet#HttpServlet()
         */
        public input() {
            super();
            // TODO Auto-generated constructor stub
        }

        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
         */
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    
    request.setCharacterEncoding("UTF-8");
    response.setContentType("text/html;charset=UTF-8");
    //声明缓冲区
    HttpSession session = request.getSession();
    String url = "jdbc:mysql://localhost:3306/cvpr?&useSSL=false&serverTimezone=UTC&useUnicode=yes&characterEncoding=utf8";        
    
    Connection conn = null;
    PreparedStatement ps = null;

    try {
        Class.forName("com.mysql.cj.jdbc.Driver");
        conn = DriverManager.getConnection(url, "root", "root");
        
    } catch (ClassNotFoundException e) {
        response.getWriter().print("加载驱动失败");
    } catch (SQLException e) {
        response.getWriter().print("连接数据库失败");
    }
    
    
    StringBuffer buffer2 = new StringBuffer();
    try {
        Statement stmt = conn.createStatement();
        //1.改sql语句        
            ResultSet rs = stmt.executeQuery("select * from cvpr");
        while (rs.next()) {            
            String title=new String(rs.getString("title"));        
            String ab=new String(rs.getString("ab"));        
            buffer2.append(title);
            buffer2.append(ab);        
        }
        
}catch (SQLException e) {
        response.getWriter().print("查找失败");
    }    
    String file = buffer2.toString();
    String[] a=file.split("[^a-zA-Z]+");   
    int n=a.length;
    int kind=0,zs=0;  
    Object[][] b=new Object[n][2];
    for(;zs<n;zs++){
        int k=0;
        for(int i=0;i<kind;i++){
            if(((String) b[i][0]).equalsIgnoreCase(a[zs])){
                b[i][1]=(int)b[i][1]+1;
                k=1;
                break;
            }
        }
        if(k==0){
            b[kind][0]=a[zs];
            b[kind][1]=1;
            kind++;
        } 
    }
    int max=0;
    int p=0,q=0;
    String m;
    String[] c=new String[1000];
    int[] d=new int[1000];
    for(int i=0;i<1000;i++)
    {
        c[i]="";
        d[i]=0;
    }
    for(int i=0;i<kind;i++){
        for(int j=0;j<kind;j++){
            if((int)b[j][1]>(int)b[max][1])
            {if(((String) b[j][0]).length()<4)
                {j=j++;
                continue;}
            if(((String) b[j][0]).equals("With"))
                {j++;
                continue;}
            if(((String) b[j][0]).equals("with"))
                {j++;
                continue;}
            if(((String) b[j][0]).equals("that"))
                {j++;
                continue;}
            if(((String) b[j][0]).equals("this"))
                {j++;
                continue;}
            if(((String) b[j][0]).equals("from"))
                {j++;
                continue;}
            if(((String) b[j][0]).equals("which"))
            {j++;
            continue;}
             else max=j;
            }
            }
        System.out.println(b[max][0]+"出现次数为:"+b[max][1]);
        if(i<30)
        {c[i]=(String) b[max][0];
         d[i]= (int) b[max][1];
         b[max][1]=0;    
        }
        else
        b[max][1]=0;    
}

    session.setAttribute("c",c); 
    session.setAttribute("d",d); 
    request.getRequestDispatcher( "reciyun.jsp").forward(request,response);
}
}
View Code

然后是展示界面:

展示界面需要导入echarts.min.js    echarts-wordcloud.min.js两个包

技术图片
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Document</title>
</head>
<body>
<%String num[] = (String[])session.getAttribute("c");%>
<%int num2[] = (int[])session.getAttribute("d");%>

<form action="tiaozhuan" method="post">
<div id = "main" style="width: 1200px;height: 800px;"></div>
</form>

<script type="text/javascript" src = "js/echarts.min.js"></script>
<script type="text/javascript" src = "js/echarts-wordcloud.min.js"></script>
<script type="text/javascript">
         
         var worldCloudcharts=echarts.init(document.getElementById(‘main‘));
         var worldCloudoption = {
                 title: {
                     text: ‘CVPR热词‘,
                     x: ‘center‘,
                     textStyle: {
                         fontSize: 23,
                         color:‘#FFFFFF‘
                     }
 
                 },
                 tooltip: {
                     show: true
                 },
                 series: [{
                     name: ‘CVPR热词‘,
                     type: ‘wordCloud‘,
                     sizeRange: [20, 130],
                     rotationRange: [-45, 90],
                     textPadding: 0,
                     autoSize: {
                         enable: true,
                         minSize: 10
                     },
                     textStyle: {
                         normal: {
                             color: function() {
                                 return ‘rgb(‘ + [
                                     Math.round(Math.random() * 160),
                                     Math.round(Math.random() * 160),
                                     Math.round(Math.random() * 160)
                                 ].join(‘,‘) + ‘)‘;
                             }
                         },
                         emphasis: {
                             shadowBlur: 10,
                             shadowColor: ‘#333‘
                         }
                     },
                     data: [{
                         name: "Jayfee",
                         value: 666
                     }, {
                         name: "Nancy",
                         value: 520
                     }]
                 }]
             };
 
             var JosnList = [];
 
             JosnList.push(
                     <%for(int i=0;i<29;i++)
                     {
                     %>
                     {name: "<%=num[i]%>",value: <%=num2[i]%>,url:‘tiaozhuan?title=<%=num[i]%>‘},
                     <%
                     }
                     %>
                     {name: "<%=num[99]%>",value: <%=num2[29]%>,url:‘tiaozhuan?title=<%=num[29]%>‘}
                     );

         worldCloudoption.series[0].data = JosnList;
         worldCloudcharts.setOption(worldCloudoption);
         worldCloudcharts.on("click",function(e){
             console.log(e);
             window.open(e.data.url);
         });
     </script>
</body>
</html>
jsp

最后对关键词在数据库进行模糊查询把相关文章信息返回jsp中展示:

技术图片
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;

import com.cvpr.lun.user;

//2.将删除改成类名
/**
 * Servlet implementation class index
 */
@WebServlet("/tiaozhuan")
public class tiaozhuan extends HttpServlet{
    private static final long serialVersionUID = 1L;
    
    /**
     * @see HttpServlet#HttpServlet()
     */
    public tiaozhuan() {
        super();
        // TODO Auto-generated constructor stub
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=UTF-8");
        //声明list
        ArrayList<user> list = new ArrayList();
        //声明缓冲区
        HttpSession session = request.getSession();
        
        String url = "jdbc:mysql://localhost:3306/cvpr?&useSSL=false&serverTimezone=UTC&useUnicode=yes&characterEncoding=utf8";        
        
        Connection conn = null;
        PreparedStatement ps = null;

        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
            conn = DriverManager.getConnection(url, "root", "root");
            
        } catch (ClassNotFoundException e) {
            response.getWriter().print("加载驱动失败");
        } catch (SQLException e) {
            response.getWriter().print("连接数据库失败");
        }
        
        String name=request.getParameter("title"); 
        System.out.printf(name);
        
        try {
            Statement stmt = conn.createStatement();
            //1.改sql语句        
             ResultSet rs = stmt.executeQuery("select * from cvpr where title like ‘%"+name+"%‘ ");
            while (rs.next()) {            
                String title=new String(rs.getString("title"));    
                String ab=new String(rs.getString("ab"));    
                String pdf=new String(rs.getString("pdf"));    
                user use2=new user(title,ab,pdf);
                list.add(use2);
                System.out.printf(title);
            }

            
    }catch (SQLException e) {
            response.getWriter().print("查找失败");
        }    
        request.setAttribute("list",list);       
        request.getRequestDispatcher( "tiaozhuan.jsp").forward(request,response);
        //***************************************
    }
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        doGet(request, response);
    }
    
    
}
servlet
技术图片
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Document</title>
</head>
<body>
    <c:forEach items="${list}" var="use2" >    
        书名:${use2.title}<br/>
        摘要:${use2.ab}<br/>
        连接:<a href =${use2.pdf} >查看原文</a><br/>
    </c:forEach>
</body>
</html>
jsp

 

以上是关于CVPR论文爬取并进行词云展示的主要内容,如果未能解决你的问题,请参考以下文章

爬取CVPR2019年的论文数据并实现可视化热词云

双人合作---爬取CVPR论文

python 爬取豆瓣电影评论,并进行词云展示及出现的问题解决办法

python 爬取腾讯微博并生成词云

Python爬虫实战,爬取A股公司数据,简单分析A股公司并生成词云

爬取网易云音乐评论并使用词云展示