数据挖掘:提取百度知道QA中的影视信息

Posted mengrennwpu

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了数据挖掘:提取百度知道QA中的影视信息相关的知识,希望对你有一定的参考价值。

1. 背景

网站上爬取了部分关于影视的百度知道QA,为了后续提高影视的搜索效果,需要基于百度知道QA的答案抽取相关的影视信息。

2. 流程

目前已有基础的媒资视频库信息。基于媒资视频库中的视频名称构建分词词典,并结合AC双数组Trie树(Aho-Corasick Double Array Trie)对百度知道QA进行分词;对于分词后的结果,可以再结合视频热度与评分进行筛选。

3. 代码实现

(1) 基于文本文件(格式为每行一条视频名称),结合AC双数组Trie树,构建分词器

package com.test.model.act;

import com.google.common.collect.Lists;
import com.test.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.test.model.act.AhoCorasickDoubleArrayTrie.*;
import java.io.*;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

/**
 * Dictionary-based matcher that finds known video names inside free text.
 *
 * <p>The names returned by {@code IOUtil.getPreprocessedData("videoNames.txt")} are
 * loaded into an {@link AhoCorasickDoubleArrayTrie}. The built automaton is cached in
 * the file {@code act} (relative to the working directory) and reloaded from there on
 * later start-ups.
 *
 * <p>{@link #getInstance()} lazily creates the shared instance without any
 * synchronization.
 *
 * @author test
 * @date 2018/11/1
 */
public class Act {

    private static final Logger logger = LoggerFactory.getLogger(Act.class);

    private static Act instance = null;
    /** File the serialized automaton is cached in. */
    private static final String path = "act";
    private final AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>();

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared {@code Act}
     * @throws IOException            if the cache file or the name list cannot be read/written
     * @throws ClassNotFoundException if the cached automaton cannot be deserialized
     */
    public static Act getInstance() throws IOException, ClassNotFoundException {
        if (null == instance) {
            instance = new Act();
        }
        return instance;
    }

    /**
     * Creates a matcher and initializes its automaton (from cache or from the name list).
     *
     * @throws IOException            if the cache file or the name list cannot be read/written
     * @throws ClassNotFoundException if the cached automaton cannot be deserialized
     */
    public Act() throws IOException, ClassNotFoundException {
        this.initTrie();
    }

    /**
     * Initializes the Aho-Corasick automaton.
     *
     * <p>If the cache file exists it is loaded; otherwise the automaton is built from the
     * name list and then written to the cache file. Elapsed times are logged in whole seconds.
     *
     * @throws IOException            if reading or writing fails
     * @throws ClassNotFoundException if the cached automaton cannot be deserialized
     */
    private void initTrie() throws IOException, ClassNotFoundException {
        if (new File(path).exists()) {
            long curTime = System.currentTimeMillis() / 1000;
            // try-with-resources: the streams are closed even when load() throws
            try (FileInputStream fis = new FileInputStream(path);
                 ObjectInputStream ois = new ObjectInputStream(fis)) {
                act.load(ois);
            }
            logger.info("load act cost: " + (System.currentTimeMillis() / 1000 - curTime));
        } else {
            TreeMap<String, Resource> treeMap = new TreeMap<>();
            List<String> datas = IOUtil.getPreprocessedData("videoNames.txt");
            for (String data : datas) {
                data = data.trim();
                if (!treeMap.containsKey(data)) {
                    Resource resource = new Resource(data);
                    treeMap.put(data, resource);
                }
            }
            long curTime = System.currentTimeMillis() / 1000;
            act.build(treeMap);
            logger.info("build act cost: " + (System.currentTimeMillis() / 1000 - curTime));

            curTime = System.currentTimeMillis() / 1000;
            // Closing the ObjectOutputStream flushes its internal buffer; without it the
            // tail of the serialized data may never reach the file.
            try (FileOutputStream fos = new FileOutputStream(path);
                 ObjectOutputStream oos = new ObjectOutputStream(fos)) {
                act.save(oos);
            }
            logger.info("save act cost: " + (System.currentTimeMillis() / 1000 - curTime));
        }
    }

    /**
     * Segments the text by longest match against the dictionary.
     *
     * @param queryText the text to scan
     * @return the retained terms; empty when nothing matches
     */
    public List<Term<Resource>> parse(String queryText) {
        final List<Term<Resource>> terms = Lists.newArrayList();
        act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>() {
            @Override
            public void hit(int begin, int end, Resource value) {
                Iterator<Term<Resource>> iterator = terms.iterator();
                int length = end - begin;
                boolean isSubStr = false;
                while (iterator.hasNext()) {
                    Term<Resource> current = iterator.next();
                    // Overlaps the new hit and is shorter than it: drop the earlier term.
                    // NOTE(review): hit ends are exclusive, so ">=" also drops a term that
                    // merely touches the new hit - confirm this is intended.
                    if (current.end >= begin && length > current.getLength()) {
                        iterator.remove();
                    }
                    // The new hit's text is contained in an already seen term: do not add it.
                    if (current.getValue().getValue().contains(value.getValue())) {
                        isSubStr = true;
                    }
                }
                if (!isSubStr) {
                    terms.add(new Term<Resource>(begin, end, value));
                }
            }
        });
        return terms;
    }

    /**
     * Converts terms to their text, skipping every term whose text is a suffix of the text
     * of the term directly before it. The first term is always kept.
     *
     * @param terms the terms, e.g. as returned by {@link #parse(String)}
     * @return the remaining term texts in their original order; empty when {@code terms}
     *         is {@code null} or empty
     */
    public List<String> neatSplitResult(List<Term<Resource>> terms) {
        List<String> results = Lists.newArrayList();
        // parse() returns an empty list when nothing matched; there is no first term then.
        if (terms == null || terms.isEmpty()) {
            return results;
        }
        results.add(terms.get(0).getValue().getValue());
        for (int j = 1; j < terms.size(); j++) {
            String termJ = terms.get(j).getValue().getValue();
            if (!terms.get(j - 1).getValue().getValue().endsWith(termJ)) {
                results.add(termJ);
            }
        }
        return results;
    }
}
View Code

(2) 引用的AhoCorasickDoubleArrayTrie

package com.test.model.act;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.*;
import java.util.concurrent.LinkedBlockingDeque;

/**
 * An implementation of Aho Corasick algorithm based on Double Array Trie
 *
 * @author hankcs
 */
public class AhoCorasickDoubleArrayTrie<V> implements Serializable {
    /**
     * check array of the Double Array Trie structure
     */
    protected int check[];
    /**
     * base array of the Double Array Trie structure
     */
    protected int base[];
    /**
     * fail table of the Aho Corasick automata
     */
    protected int fail[];
    /**
     * output table of the Aho Corasick automata
     */
    protected int[][] output;
    /**
     * outer value array
     */
    protected V[] v;

    /**
     * the length of every key
     */
    protected int[] l;

    /**
     * the size of base and check array
     */
    protected int size;

    /**
     * Scans the text and collects every keyword occurrence.
     *
     * @param text The text
     * @return the hits, in the order they are produced while scanning
     */
    public List<Hit<V>> parseText(String text) {
        List<Hit<V>> hits = new LinkedList<Hit<V>>();
        int state = 0;
        for (int offset = 0; offset < text.length(); offset++) {
            state = getState(state, text.charAt(offset));
            // a hit ends right after the character just consumed, hence offset + 1
            storeEmits(offset + 1, state, hits);
        }
        return hits;
    }

    /**
     * Scans the text and reports every keyword occurrence to a callback.
     *
     * @param text      The text
     * @param processor receives each hit as (begin inclusive, end exclusive, value)
     */
    public void parseText(String text, IHit<V> processor) {
        int state = 0;
        for (int offset = 0; offset < text.length(); offset++) {
            state = getState(state, text.charAt(offset));
            int[] matched = output[state];
            if (matched == null) {
                continue;
            }
            // exclusive end of every keyword that finishes on this character
            int end = offset + 1;
            for (int keyIndex : matched) {
                processor.hit(end - l[keyIndex], end, v[keyIndex]);
            }
        }
    }

    /**
     * Scans the text and reports keyword occurrences to a callback until the callback
     * asks to stop or the text is exhausted.
     *
     * @param text      The text
     * @param processor receives each hit; returning {@code false} ends the scan immediately
     */
    public void parseText(String text, IHitCancellable<V> processor) {
        int state = 0;
        int end = 0;
        while (end < text.length()) {
            state = getState(state, text.charAt(end));
            // from here on "end" is the exclusive end of keywords finishing on that character
            ++end;
            int[] matched = output[state];
            if (matched == null) {
                continue;
            }
            for (int keyIndex : matched) {
                if (!processor.hit(end - l[keyIndex], end, v[keyIndex])) {
                    return;
                }
            }
        }
    }

    /**
     * Scans a character array and reports every keyword occurrence to a callback.
     *
     * @param text      The text
     * @param processor receives each hit as (begin inclusive, end exclusive, value)
     */
    public void parseText(char[] text, IHit<V> processor) {
        int state = 0;
        for (int offset = 0; offset < text.length; offset++) {
            state = getState(state, text[offset]);
            int[] matched = output[state];
            if (matched == null) {
                continue;
            }
            int end = offset + 1;
            for (int keyIndex : matched) {
                processor.hit(end - l[keyIndex], end, v[keyIndex]);
            }
        }
    }

    /**
     * Scans a character array and reports every keyword occurrence, together with the
     * index of the matched keyword, to a callback.
     *
     * @param text      The text
     * @param processor receives each hit as (begin inclusive, end exclusive, value, value index)
     */
    public void parseText(char[] text, IHitFull<V> processor) {
        int state = 0;
        int end = 0;
        while (end < text.length) {
            state = getState(state, text[end]);
            end++;
            int[] matched = output[state];
            if (matched != null) {
                for (int k = 0; k < matched.length; k++) {
                    int keyIndex = matched[k];
                    processor.hit(end - l[keyIndex], end, v[keyIndex], keyIndex);
                }
            }
        }
    }


    /**
     * Writes the automaton to a stream as six consecutive objects: base, check, fail,
     * output, key lengths and values. The stream is neither flushed nor closed here.
     *
     * @param out An ObjectOutputStream object
     * @throws IOException Some IOException
     */
    public void save(ObjectOutputStream out) throws IOException {
        // the order must match the order in which load() reads the objects back
        Object[] parts = {base, check, fail, output, l, v};
        for (Object part : parts) {
            out.writeObject(part);
        }
    }

    /**
     * Reads an automaton previously written by {@code save}: six consecutive objects in
     * the order base, check, fail, output, key lengths, values.
     *
     * <p>All six objects are read before any field is replaced, so a truncated or
     * corrupt stream leaves the current automaton untouched instead of half-overwritten.
     *
     * <p>This uses Java native deserialization; only pass streams from a trusted source.
     *
     * @param in An ObjectInputStream object
     * @throws IOException            if reading from the stream fails
     * @throws ClassNotFoundException if a class of a serialized object cannot be found
     */
    @SuppressWarnings("unchecked") // the value array is written by save() from this same field
    public void load(ObjectInputStream in) throws IOException, ClassNotFoundException {
        int[] loadedBase = (int[]) in.readObject();
        int[] loadedCheck = (int[]) in.readObject();
        int[] loadedFail = (int[]) in.readObject();
        int[][] loadedOutput = (int[][]) in.readObject();
        int[] loadedLengths = (int[]) in.readObject();
        V[] loadedValues = (V[]) in.readObject();

        // everything was read successfully; switch over in one go
        base = loadedBase;
        check = loadedCheck;
        fail = loadedFail;
        output = loadedOutput;
        l = loadedLengths;
        v = loadedValues;
    }

    /**
     * Looks up the value of a keyword, like {@code Map.get}.
     *
     * @param key The key
     * @return the value stored for the key, or {@code null} when the key is not found
     */
    public V get(String key) {
        final int valueIndex = exactMatchSearch(key);
        return valueIndex < 0 ? null : v[valueIndex];
    }

    /**
     * Returns the value stored at the given position of the value array.<br>
     * For efficiency this method does NOT validate the parameter; an out-of-range index
     * fails with the array's own {@code ArrayIndexOutOfBoundsException}.
     *
     * @param index The index
     * @return The value
     */
    public V get(int index) {
        return v[index];
    }

    /**
     * Callback that handles a keyword occurrence found while parsing a text.
     *
     * @param <V> the value type
     */
    public interface IHit<V> {
        /**
         * Called for a keyword occurrence; {@code text.substring(begin, end)} yields the
         * matched keyword.
         *
         * @param begin the beginning index, inclusive.
         * @param end   the ending index, exclusive.
         * @param value the value assigned to the keyword
         */
        void hit(int begin, int end, V value);
    }

    /**
     * Callback that handles a keyword occurrence found while parsing a text, with the
     * index of the matched keyword as additional detail.
     *
     * @param <V> the value type
     */
    public interface IHitFull<V> {
        /**
         * Called for a keyword occurrence; {@code text.substring(begin, end)} yields the
         * matched keyword.
         *
         * @param begin the beginning index, inclusive.
         * @param end   the ending index, exclusive.
         * @param value the value assigned to the keyword
         * @param index the index of the value assigned to the keyword; it can be used as a perfect hash value
         */
        void hit(int begin, int end, V value, int index);
    }

    /**
     * Callback that handles a keyword occurrence and can cancel the search process.
     *
     * @param <V> the value type
     */
    public interface IHitCancellable<V> {
        /**
         * Called for a keyword occurrence; {@code text.substring(begin, end)} yields the
         * matched keyword.
         *
         * @param begin the beginning index, inclusive.
         * @param end   the ending index, exclusive.
         * @param value the value assigned to the keyword
         * @return {@code true} to continue the search, {@code false} to stop it.
         */
        boolean hit(int begin, int end, V value);
    }

    /**
     * One keyword occurrence: an immutable (begin, end, value) triple.
     *
     * <p>Note that this is a non-static inner class and that it declares its own type
     * parameter {@code V}, which shadows the type parameter of the enclosing class.
     *
     * @param <V> the value type
     */
    public class Hit<V> {
        /**
         * the beginning index, inclusive.
         */
        public final int begin;
        /**
         * the ending index, exclusive.
         */
        public final int end;
        /**
         * the value assigned to the keyword
         */
        public final V value;

        /**
         * Creates a hit.
         *
         * @param begin the beginning index, inclusive.
         * @param end   the ending index, exclusive.
         * @param value the value assigned to the keyword
         */
        public Hit(int begin, int end, V value) {
            this.begin = begin;
            this.end = end;
            this.value = value;
        }

        /**
         * Renders the hit as {@code [begin:end]=value}.
         */
        @Override
        public String toString() {
            return String.format("[%d:%d]=%s", begin, end, value);
        }
    }

    /**
     * Computes the state reached from {@code currentState} on {@code character},
     * following failure links for as long as the direct transition does not exist.
     *
     * @param currentState the state to start from
     * @param character    the input character
     * @return the next state
     */
    private int getState(int currentState, char character) {
        int state = currentState;
        for (;;) {
            // try the success transition first
            int next = transitionWithRoot(state, character);
            if (next != -1) {
                return next;
            }
            // no such transition: fall back along the failure link and retry
            state = fail[state];
        }
    }

    /**
     * Appends one {@code Hit} per keyword that ends in the given state.
     *
     * @param position       exclusive end index of the hits in the text
     * @param currentState   the state whose output is collected
     * @param collectedEmits the list the hits are appended to
     */
    private void storeEmits(int position, int currentState, List<Hit<V>> collectedEmits) {
        final int[] matched = output[currentState];
        if (matched == null) {
            return;
        }
        for (int k = 0; k < matched.length; k++) {
            int keyIndex = matched[k];
            collectedEmits.add(new Hit<V>(position - l[keyIndex], position, v[keyIndex]));
        }
    }

    /**
     * Performs one transition in the double array. The slot {@code current + c + 1} is
     * taken when its check entry equals {@code current}.
     *
     * @param current the value used as offset into the arrays (used as is, not via base[])
     * @param c       the input character
     * @return the base value of the target slot, or -1 when the transition does not exist
     */
    protected int transition(int current, char c) {
        final int slot = current + c + 1;
        return current == check[slot] ? base[slot] : -1;
    }

    /**
     * Performs one transition from a node; a failed transition from the root (node 0)
     * stays at the root.
     *
     * @param nodePos the node to start from
     * @param c       the input character
     * @return the target node, 0 when the transition fails at the root, -1 when it fails elsewhere
     */
    protected int transitionWithRoot(int nodePos, char c) {
        final int offset = base[nodePos];
        final int target = offset + c + 1;
        if (offset == check[target]) {
            return target;
        }
        return nodePos == 0 ? 0 : -1;
    }


    /**
     * Build an AhoCorasickDoubleArrayTrie from a map. The work is delegated to a fresh
     * {@code Builder} instance.
     *
     * @param map a map containing key-value pairs
     */
    public void build(Map<String, V> map) {
        new Builder().build(map);
    }


    /**
     * Match exactly by a key. Delegates to the four-argument overload with all-zero
     * arguments, which that overload treats as "whole key, starting at node 0".
     *
     * @param key the key
     * @return the index of the key (usable as a perfect hash function), or -1 when the key is not found
     */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Match exactly by a key, after normalizing the arguments.
     *
     * @param key     the key
     * @param pos     index of the first character of the key to consume
     * @param len     index at which consuming stops (exclusive); values &lt;= 0 mean {@code key.length()}
     * @param nodePos the node to start searching from; values &lt;= 0 mean node 0
     * @return the value index of the key, or -1 when the key is not found
     */
    private int exactMatchSearch(String key, int pos, int len, int nodePos) {
        final int stop = len <= 0 ? key.length() : len;
        final int startNode = nodePos <= 0 ? 0 : nodePos;
        // the char[] overload performs exactly the same walk over the double array
        return exactMatchSearch(key.toCharArray(), pos, stop, startNode);
    }

    /**
     * Match exactly by a key given as a char array.
     *
     * @param keyChars the char array of the key
     * @param pos      the begin index in the char array
     * @param len      the index at which consuming stops (exclusive)
     * @param nodePos  the starting position of the node for searching
     * @return the value index of the key, or -1 when the key is not found
     */
    private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos) {
        int offset = base[nodePos];

        for (int i = pos; i < len; i++) {
            final int slot = offset + (int) keyChars[i] + 1;
            if (offset != check[slot]) {
                // no transition for this character: the key is not in the trie
                return -1;
            }
            offset = base[slot];
        }

        // a negative base at the final slot encodes the value index as -(index + 1)
        final int encoded = base[offset];
        if (offset == check[offset] && encoded < 0) {
            return -encoded - 1;
        }
        return -1;
    }

    /**
     * Get the number of keywords, i.e. the length of the value array.
     * Fails with a {@code NullPointerException} while the value array is still unset.
     *
     * @return the number of stored values
     */
    public int size() {
        return v.length;
    }

    /**
     * A builder to build the AhoCorasickDoubleArrayTrie
     */
    private class Builder {
        /**
         * the root state of trie
         */
        private State rootState = new State();
        /**
         * whether the position has been used
         */
        private boolean used[];
        /**
         * the allocSize of the dynamic array
         */
        private int allocSize;
        /**
         * a parameter controls the memory growth speed of the dynamic array
         */
        private int progress;
        /**
         * the next position to check unused memory
         */
        private int nextCheckPos;
        /**
         * the size of the key-pair sets
         */
        private int keySize;

        /**
         * Build from a map. Fills the enclosing trie's value and key-length arrays,
         * then runs the construction steps in a fixed order; {@code used} and
         * {@code rootState} are released as soon as they are no longer needed.
         *
         * @param map a map containing key-value pairs
         */
        @SuppressWarnings("unchecked")
        public void build(Map<String, V> map) {
            // keep the values
            v = (V[]) map.values().toArray();
            l = new int[v.length];
            Set<String> keySet = map.keySet();
            // build the plain trie from the keywords
            addAllKeyword(keySet);
            // build the double array trie on top of the plain trie
            buildDoubleArrayTrie(keySet.size());
            used = null;
            // construct the failure table and merge the output table
            constructFailureStates();
            rootState = null;
            // NOTE(review): presumably shrinks the arrays to the used size - defined outside this view
            loseWeight();
        }

        /**
         * Fetch the siblings of a parent node, i.e. collect its child nodes.
         *
         * @param parent   parent node
         * @param siblings receives the parent node's child nodes, i.e. the siblings
         * @return the amount of the siblings
         */
        private int fetch(State parent, List<Map.Entry<Integer, State>> siblings) {
            if (parent.isAcceptable()) {
                // this node is a child of parent that also carries parent's output
                State terminal = new State(-(parent.getDepth() + 1));
                terminal.addEmit(parent.getLargestValueId());
                Map.Entry<Integer, State> terminalEntry =
                        new AbstractMap.SimpleEntry<Integer, State>(0, terminal);
                siblings.add(terminalEntry);
            }
            for (Map.Entry<Character, State> child : parent.getSuccess().entrySet()) {
                // codes are shifted by one; code 0 is used by the terminal entry above
                int code = child.getKey() + 1;
                siblings.add(new AbstractMap.SimpleEntry<Integer, State>(code, child.getValue()));
            }
            return siblings.size();
        }

        /**
         * Add a keyword to the trie, mark its final state with the keyword's index and
         * record the keyword's length.
         *
         * @param keyword a keyword
         * @param index   the index of the keyword
         */
        private void addKeyword(String keyword, int index) {
            State cursor = this.rootState;
            char[] chars = keyword.toCharArray();
            for (int i = 0; i < chars.length; i++) {
                Character boxed = chars[i];
                cursor = cursor.addState(boxed);
            }
            cursor.addEmit(index);
            l[index] = keyword.length();
        }

        /**
         * Add a collection of keywords; each keyword gets its position in the
         * collection's iteration order as index.
         *
         * @param keywordSet the collection holding keywords
         */
        private void addAllKeyword(Collection<String> keywordSet) {
            int index = 0;
            Iterator<String> keywords = keywordSet.iterator();
            while (keywords.hasNext()) {
                addKeyword(keywords.next(), index);
                index++;
            }
        }

        /**
         * construct failure table
         */
        private void constructFailureStates() {
            fail = new int[size + 1];
            fail[1] = base[0];
            output = new int[size + 1][];
            Queue<State> queue = new LinkedBlockingDeque<State>();

            // 第一步,将深度为1的节点的failure设为根节点
            for (State depthOneState : this.rootState.getStates()) {
                depthOneState.setFailure(this.rootState, fail);
                queue.add(depthOneState);
                constructOutput(depthOneState);
            }

            // 第二步,为深度 > 1 的节点建立failure表,这是一个bfs
            while (!queue.isEmpty()) {
                State currentState = queue.remove();

                for (Character transition : currentState.getTransitions()) {
                    State targetState = currentState.nextState(transition);
                    queue.add(targetState);

                    State traceFailureState = currentState.failure();
         

以上是关于数据挖掘:提取百度知道QA中的影视信息的主要内容,如果未能解决你的问题,请参考以下文章

数据挖掘:基于Spark+HanLP实现影视评论关键词抽取

java通过百度AI开发平台提取身份证图片中的文字信息

10 个你可能还不知道 VS Code 使用技巧

基于百度OCR提取图像中的文本

正则表达式提取器使用

文本分类