Elasticsearch之新闻案例实战

Posted 爱上口袋的天空

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Elasticsearch之新闻案例实战相关的知识,希望对你有一定的参考价值。

1、创建数据库表且预置数据

1.1、表结构如下

-- Recreates the `news` table used by this walkthrough.
-- WARNING: an existing `news` table (and all of its rows) is dropped first.
DROP TABLE IF EXISTS `news`;
CREATE TABLE `news` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `title` varchar(255) NOT NULL comment '主题',
    `url` varchar(255) DEFAULT NULL comment '连接',
    `content` text comment '内容',
    -- Comma-separated keywords; the Logstash pipeline splits this column on ','.
    `tags` varchar(1000) DEFAULT NULL comment '搜索的关键字',
    PRIMARY KEY (`id`) USING BTREE
-- utf8mb4 instead of utf8: MySQL's `utf8` is the 3-byte utf8mb3 subset and rejects
-- 4-byte characters (emoji, rare CJK ideographs) that can appear in news text.
) ENGINE=InnoDB AUTO_INCREMENT=92 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;

1.2、数据如下

2、定义分词器以及属性类型

PUT news
{
  "settings": {
    "analysis": {
      "filter": {
        "news_tags_filter": {
          "type": "pinyin",
          "keep_full_pinyin": true,
          "keep_joined_full_pinyin": true,
          "keep_original": true
        }
      },
      "analyzer": {
        "news_tags_analyzer": {
          "char_filter": ["html_strip"],
          "tokenizer": "keyword",
          "filter": ["news_tags_filter"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "long" },
      "url": { "type": "keyword" },
      "title": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "content": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "tags": {
        "type": "completion",
        "analyzer": "news_tags_analyzer",
        "search_analyzer": "keyword"
      }
    }
  }
}

 

3、将mysql数据导入es

3.1、创建logstash-mysql-news.conf文件

# input: read rows from MySQL into Logstash through the jdbc input plugin.
input {
  jdbc {
    jdbc_driver_library => "/opt/es781/mysql/mysql-connector-java-5.1.49.jar"
    jdbc_driver_class => "com.mysql.jdbc.Driver"
    # Connector/J property names are `useUnicode` and `serverTimezone`;
    # the misspelled variants are not recognised by the driver.
    jdbc_connection_string => "jdbc:mysql://192.168.1.13:3306/oss?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC"
    jdbc_user => "root"
    # NOTE(review): credentials are hard-coded here; consider the Logstash keystore
    # or an environment variable outside of a demo setup.
    jdbc_password => "897570"
    # Fetch the result set page by page.
    jdbc_paging_enabled => "true"
    # Number of rows per page.
    jdbc_page_size => "20"
    # ORDER BY keeps the LIMIT/OFFSET pages stable; without a deterministic order
    # rows may be duplicated or skipped between pages.
    statement => "SELECT * FROM news WHERE tags IS NOT NULL ORDER BY id"
  }
}
# filter: transform each event before it is written out.
filter {
  mutate {
    # Split the comma-separated keyword string into an array.
    split => { "tags" => ","}
  }
  # Drop the two bookkeeping fields Logstash adds automatically; they are not needed in the index.
  mutate {
    remove_field => ["@timestamp","@version"]
  }
}
# output: write the events to Elasticsearch and echo them to the console.
output {
  elasticsearch {
    # Reuse the MySQL primary key as the document id so re-runs overwrite instead of duplicating.
    document_id => "%{id}"
    # `document_type` is intentionally omitted: it is deprecated, and for
    # Elasticsearch 7.x the plugin already uses `_doc`.
    index => "news"
    hosts => ["http://192.168.56.20:9200"]
  }
  stdout{
    codec => rubydebug
  }
}

3.2、将上面需要的mysql jar包上传到Linux服务器上

3.3、将logstash-mysql-news.conf文件上传到/opt/es781/logstash-7.8.1/目录下

3.4、在/opt/es781/logstash-7.8.1/目录下执行下面的命令,将数据从mysql导入es:

     命令:bin/logstash -f /opt/es781/logstash-7.8.1/logstash-mysql-news.conf
     

3.5、在kibana上查询news索引是否成功导入数据

4、根据需求编写kibana脚本

4.1、自动补全语句

GET news/_search
{
  "_source": false,
  "suggest": {
    "news_tags_suggest": {
      "prefix": "zh",
      "completion": { "field": "tags", "size": 10, "skip_duplicates": true }
    }
  }
}

4.2、内容搜索

GET news/_search
{
  "_source": false,
  "query": {
    "multi_match": { "query": "中国", "fields": ["title", "content"] }
  },
  "highlight": {
    "pre_tags": "<span class='highLight'>",
    "post_tags": "</span>",
    "fields": { "title": {}, "content": {} }
  }
}

5、在java中使用代码实现上面两个搜索

5.1、创建news实体类

package com.kgf.es.model;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * One news item as returned by the search endpoint.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}) and both the all-args and no-args constructors. The all-args
 * constructor takes its parameters in field declaration order, so reordering the
 * fields changes that constructor's signature.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class News {
    /** Identifier of the news item. */
    private Integer id;
    /** Title of the news item. */
    private String title;
    /** Body text of the news item. */
    private String content;
    /** Link to the news item. */
    private String url;
}

5.2、创建NewsController

package com.kgf.es.controller;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.kgf.es.model.News;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

@RestController
@RequestMapping("/news")
public class NewsController {

    @Resource
    private RestHighLevelClient restHighLevelClient;


    @GetMapping("/tips")
    public Object autoComplete(String term) throws IOException {
        Request request = new Request("GET", "news/_search");

        request.setJsonEntity(String.format("{" +
                "  \\"_source\\": false, " +
                "  \\"suggest\\": {" +
                "    \\"news_tags_suggest\\": {" +
                "      \\"prefix\\": \\"%s\\"," +
                "      \\"completion\\": {" +
                "        \\"field\\": \\"tags\\"," +
                "        \\"size\\": 10," +
                "        \\"skip_duplicates\\": true" +
                "      }" +
                "    }" +
                "  }" +
                "}", term));

        Response response = restHighLevelClient.getLowLevelClient().performRequest(request);

        String jsonString = EntityUtils.toString(response.getEntity()); // "{\\"age\\": 10}"  {"age": 10}

        JSONObject jsonObject = JSONObject.parseObject(jsonString);

        JSONArray suggests = jsonObject.getJSONObject("suggest").getJSONArray("news_tags_suggest");

        JSONArray options = suggests.getJSONObject(0).getJSONArray("options");

        List<String> results = new ArrayList<>();
        for(int i = 0; i < options.size(); i++) {
            JSONObject opt = options.getJSONObject(i);
            results.add(opt.getString("text"));
        }

        return results;
    }

    @GetMapping("/search")
    public List<News> query(String text) throws Exception{
        /**
         * 1.对于高亮的数据,ES是抽取的一个个片段,然后将这些片段设置到一个数组中。
         * 2.对于有些数据,可能title或者content中没有高亮的字眼,那么我们就需要取原始数据的 title 和 content.
         */
        Request request = new Request("GET", "news/_search");
        request.setJsonEntity(String.format("{" +
                "  \\"_source\\": [\\"url\\", \\"title\\", \\"content\\"], " +
                "  \\"query\\": {" +
                "    \\"multi_match\\": {" +
                "      \\"query\\": \\"%s\\"," +
                "      \\"fields\\": [\\"title\\", \\"content\\"]" +
                "    }" +
                "  }," +
                "  \\"highlight\\": {" +
                "    \\"pre_tags\\": \\"<span class='highLight'>\\", " +
                "    \\"post_tags\\": \\"</span>\\", " +
                "    \\"fields\\": {" +
                "      \\"title\\": {}," +
                "      \\"content\\": {}" +
                "    }" +
                "  }" +
                "}", text));

        Response response = restHighLevelClient.getLowLevelClient().performRequest(request);

        JSONObject jsonObject = JSONObject.parseObject(EntityUtils.toString(response.getEntity()));

        JSONArray hits = jsonObject.getJSONObject("hits").getJSONArray("hits");

        List<News> results = new ArrayList<>();

        for (int i = 0; i < hits.size(); i++) {
            News news = new News();
            JSONObject hit = hits.getJSONObject(i);
            JSONObject highLight = hit.getJSONObject("highlight");  //获取高亮的数据结果

            JSONObject _source = hit.getJSONObject("_source"); //这个是原始的数据
            news.setUrl(_source.getString("url"));  //设置url

            JSONArray highLightTitle = highLight.getJSONArray("title");  //获取高亮的 title 数组
            JSONArray highLightContent = highLight.getJSONArray("content");

            if(null != highLightTitle) {
                StringBuffer highLightTitleStringBuffer = new StringBuffer();
                for (int j = 0; j < highLightTitle.size(); j++) {
                    String titleSegment = highLightTitle.getString(j);
                    highLightTitleStringBuffer.append(titleSegment);
                }
                news.setTitle(highLightTitleStringBuffer.toString());
            }else {  // 如果不存在高亮的数据,那么就取原始数据
                news.setTitle(_source.getString("title"));
            }

            if(null != highLightContent) {
                StringBuffer highLightContentStringBuffer = new StringBuffer();
                for (int j = 0; j < highLightContent.size(); j++) {
                    String contentSegment = highLightContent.getString(j);
                    highLightContentStringBuffer.append(contentSegment);
                }
                news.setContent(highLightContentStringBuffer.toString());
            }else {  // 如果不存在高亮的数据,那么就取原始数据
                news.setContent(_source.getString("content"));
            }

            results.add(news);
        }
        return results;
    }
}

6、测试

6.1、测试tips,完成对tags的关键字检索

6.2、测试内容检索

以上是关于Elasticsearch之新闻案例实战的主要内容,如果未能解决你的问题,请参考以下文章

Elasticsearch 5.4新闻搜索项目实战

Elasticsearch 5.4新闻搜索项目实战

Elasticsearch语法知多少之Match query

Elasticsearch顶尖高手系列-快速入门篇

深度学习实战案例:新闻文本分类

深度学习实战案例:新闻文本分类