数据挖掘:提取百度知道QA中的影视信息
Posted mengrennwpu
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了数据挖掘:提取百度知道QA中的影视信息相关的知识,希望对你有一定的参考价值。
1. 背景
网站上爬取了部分关于影视的百度知道QA,为了后续提高影视的搜索效果,需要基于百度知道QA的答案抽取相关的影视信息。
2. 流程
目前已有基础的媒资视频库信息,基于媒资视频库中的视频名称,构建分词字典,结合使用AC双数组,然后针对百度的QA进行分词。针对分词后的结果,可以结合视频热度与评分进行筛选。
3. 代码实现
(1) 基于文本(格式为每行一条视频名称),结合AC双数组,构建分词
package com.test.model.act; import com.google.common.collect.Lists; import com.test.util.IOUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.test.model.act.AhoCorasickDoubleArrayTrie.*; import java.io.*; import java.util.Iterator; import java.util.List; import java.util.TreeMap; /** * @author test * @date 2018/11/1 */ public class Act { private static Logger logger = LoggerFactory.getLogger(Act.class); private static Act instance = null; private static String path = "act"; private AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>(); public static Act getInstance() throws IOException, ClassNotFoundException { if(null == instance){ instance = new Act(); } return instance; } public Act() throws IOException, ClassNotFoundException { this.initTrie(); } /** * AC自动机初始化 * @throws IOException * @throws ClassNotFoundException */ private void initTrie() throws IOException, ClassNotFoundException { if(new File(path).exists()){ FileInputStream fis = new FileInputStream(path); ObjectInputStream ois = new ObjectInputStream(fis); long curTime = System.currentTimeMillis()/1000; act.load(ois); logger.info("load act cost: " + (System.currentTimeMillis()/1000 - curTime)); }else{ TreeMap<String, Resource> treeMap = new TreeMap<>(); List<String> datas = IOUtil.getPreprocessedData("videoNames.txt"); for(String data : datas){ data = data.trim(); if(!treeMap.containsKey(data)){ Resource resource = new Resource(data); treeMap.put(data, resource); } } long curTime = System.currentTimeMillis()/1000; act.build(treeMap); logger.info("build act cost: " + (System.currentTimeMillis()/1000 - curTime)); curTime = System.currentTimeMillis()/1000; act.save(new ObjectOutputStream(new FileOutputStream(path))); logger.info("save act cost: " + (System.currentTimeMillis()/1000 - curTime)); } } /** * AC字段树最长匹配分词 * @param queryText * @return */ public List<Term<Resource>> parse(String queryText){ final List<Term<Resource>> terms = Lists.newArrayList(); act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>(){ @Override public void hit(int begin, int end, Resource value) { Iterator<Term<Resource>> iterator = terms.iterator(); int length = end - begin; boolean isSubStr = false; while (iterator.hasNext()) { Term<Resource> current = iterator.next(); // 相交且小于当前,移除 if (current.end >= begin && length > current.getLength()) { iterator.remove(); } if(current.getValue().getValue().contains(value.getValue())){ isSubStr = true; } } if(!isSubStr){ terms.add(new Term<Resource>(begin, end, value)); } } }); return terms; } public List<String> neatSplitResult(List<Term<Resource>> terms){ List<String> dupResults = Lists.newArrayList(); for(int j = terms.size() - 1; j > 0; j --){ String termJ = terms.get(j).getValue().getValue(); if(!terms.get(j-1).getValue().getValue().endsWith(termJ)){ dupResults.add(termJ); } } dupResults.add(terms.get(0).getValue().getValue()); List<String> results = Lists.newArrayList(); for(int j = dupResults.size() - 1; j >= 0; j--){ results.add(dupResults.get(j)); } return results; } }
(2) 引用的AhoCorasickDoubleArrayTrie
package com.test.model.act; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.*; import java.util.concurrent.LinkedBlockingDeque; /** * An implementation of Aho Corasick algorithm based on Double Array Trie * * @author hankcs */ public class AhoCorasickDoubleArrayTrie<V> implements Serializable { /** * check array of the Double Array Trie structure */ protected int check[]; /** * base array of the Double Array Trie structure */ protected int base[]; /** * fail table of the Aho Corasick automata */ protected int fail[]; /** * output table of the Aho Corasick automata */ protected int[][] output; /** * outer value array */ protected V[] v; /** * the length of every key */ protected int[] l; /** * the size of base and check array */ protected int size; /** * Parse text * * @param text The text * @return a list of outputs */ public List<Hit<V>> parseText(String text) { int position = 1; int currentState = 0; List<Hit<V>> collectedEmits = new LinkedList<Hit<V>>(); for (int i = 0; i < text.length(); ++i) { currentState = getState(currentState, text.charAt(i)); storeEmits(position, currentState, collectedEmits); ++position; } return collectedEmits; } /** * Parse text * * @param text The text * @param processor A processor which handles the output */ public void parseText(String text, IHit<V> processor) { int position = 1; int currentState = 0; for (int i = 0; i < text.length(); ++i) { currentState = getState(currentState, text.charAt(i)); int[] hitArray = output[currentState]; if (hitArray != null) { for (int hit : hitArray) { processor.hit(position - l[hit], position, v[hit]); } } ++position; } } /** * Parse text * * @param text The text * @param processor A processor which handles the output */ public void parseText(String text, IHitCancellable<V> processor) { int currentState = 0; for (int i = 0; i < text.length(); i++) { final int position = i + 1; currentState = getState(currentState, text.charAt(i)); int[] hitArray = output[currentState]; if (hitArray != null) { for (int hit : hitArray) { boolean proceed = processor.hit(position - l[hit], position, v[hit]); if (!proceed) { return; } } } } } /** * Parse text * * @param text The text * @param processor A processor which handles the output */ public void parseText(char[] text, IHit<V> processor) { int position = 1; int currentState = 0; for (char c : text) { currentState = getState(currentState, c); int[] hitArray = output[currentState]; if (hitArray != null) { for (int hit : hitArray) { processor.hit(position - l[hit], position, v[hit]); } } ++position; } } /** * Parse text * * @param text The text * @param processor A processor which handles the output */ public void parseText(char[] text, IHitFull<V> processor) { int position = 1; int currentState = 0; for (char c : text) { currentState = getState(currentState, c); int[] hitArray = output[currentState]; if (hitArray != null) { for (int hit : hitArray) { processor.hit(position - l[hit], position, v[hit], hit); } } ++position; } } /** * Save * * @param out An ObjectOutputStream object * @throws IOException Some IOException */ public void save(ObjectOutputStream out) throws IOException { out.writeObject(base); out.writeObject(check); out.writeObject(fail); out.writeObject(output); out.writeObject(l); out.writeObject(v); } /** * Load * * @param in An ObjectInputStream object * @throws IOException * @throws ClassNotFoundException */ public void load(ObjectInputStream in) throws IOException, ClassNotFoundException { base = (int[]) in.readObject(); check = (int[]) in.readObject(); fail = (int[]) in.readObject(); output = (int[][]) in.readObject(); l = (int[]) in.readObject(); v = (V[]) in.readObject(); } /** * Get value by a String key, just like a map.get() method * * @param key The key * @return */ public V get(String key) { int index = exactMatchSearch(key); if (index >= 0) { return v[index]; } return null; } /** * Pick the value by index in value array <br> * Notice that to be more efficiently, this method DONOT check the parameter * * @param index The index * @return The value */ public V get(int index) { return v[index]; } /** * Processor handles the output when hit a keyword */ public interface IHit<V> { /** * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword * * @param begin the beginning index, inclusive. * @param end the ending index, exclusive. * @param value the value assigned to the keyword */ void hit(int begin, int end, V value); } /** * Processor handles the output when hit a keyword, with more detail */ public interface IHitFull<V> { /** * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword * * @param begin the beginning index, inclusive. * @param end the ending index, exclusive. * @param value the value assigned to the keyword * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value */ void hit(int begin, int end, V value, int index); } /** * Callback that allows to cancel the search process. */ public interface IHitCancellable<V> { /** * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword * * @param begin the beginning index, inclusive. * @param end the ending index, exclusive. * @param value the value assigned to the keyword * @return Return true for continuing the search and false for stopping it. */ boolean hit(int begin, int end, V value); } /** * A result output * * @param <V> the value type */ public class Hit<V> { /** * the beginning index, inclusive. */ public final int begin; /** * the ending index, exclusive. */ public final int end; /** * the value assigned to the keyword */ public final V value; public Hit(int begin, int end, V value) { this.begin = begin; this.end = end; this.value = value; } @Override public String toString() { return String.format("[%d:%d]=%s", begin, end, value); } } /** * transmit state, supports failure function * * @param currentState * @param character * @return */ private int getState(int currentState, char character) { // 先按success跳转 int newCurrentState = transitionWithRoot(currentState, character); // 跳转失败的话,按failure跳转 while (newCurrentState == -1) { currentState = fail[currentState]; newCurrentState = transitionWithRoot(currentState, character); } return newCurrentState; } /** * store output * * @param position * @param currentState * @param collectedEmits */ private void storeEmits(int position, int currentState, List<Hit<V>> collectedEmits) { int[] hitArray = output[currentState]; if (hitArray != null) { for (int hit : hitArray) { collectedEmits.add(new Hit<V>(position - l[hit], position, v[hit])); } } } /** * transition of a state * * @param current * @param c * @return */ protected int transition(int current, char c) { int b = current; int p; p = b + c + 1; if (b == check[p]) { b = base[p]; } else { return -1; } p = b; return p; } /** * transition of a state, if the state is root and it failed, then returns the root * * @param nodePos * @param c * @return */ protected int transitionWithRoot(int nodePos, char c) { int b = base[nodePos]; int p; p = b + c + 1; if (b != check[p]) { if (nodePos == 0) { return 0; } return -1; } return p; } /** * Build a AhoCorasickDoubleArrayTrie from a map * * @param map a map containing key-value pairs */ public void build(Map<String, V> map) { new Builder().build(map); } /** * match exactly by a key * * @param key the key * @return the index of the key, you can use it as a perfect hash function */ public int exactMatchSearch(String key) { return exactMatchSearch(key, 0, 0, 0); } /** * match exactly by a key * * @param key * @param pos * @param len * @param nodePos * @return */ private int exactMatchSearch(String key, int pos, int len, int nodePos) { if (len <= 0){ len = key.length(); } if (nodePos <= 0){ nodePos = 0; } int result = -1; char[] keyChars = key.toCharArray(); int b = base[nodePos]; int p; for (int i = pos; i < len; i++) { p = b + (int) (keyChars[i]) + 1; if (b == check[p]) { b = base[p]; } else { return result; } } p = b; int n = base[p]; if (b == check[p] && n < 0) { result = -n - 1; } return result; } /** * match exactly by a key * * @param keyChars the char array of the key * @param pos the begin index of char array * @param len the length of the key * @param nodePos the starting position of the node for searching * @return the value index of the key, minus indicates null */ private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos) { int result = -1; int b = base[nodePos]; int p; for (int i = pos; i < len; i++) { p = b + (int) (keyChars[i]) + 1; if (b == check[p]){ b = base[p]; } else { return result; } } p = b; int n = base[p]; if (b == check[p] && n < 0) { result = -n - 1; } return result; } /** * Get the size of the keywords * * @return */ public int size() { return v.length; } /** * A builder to build the AhoCorasickDoubleArrayTrie */ private class Builder { /** * the root state of trie */ private State rootState = new State(); /** * whether the position has been used */ private boolean used[]; /** * the allocSize of the dynamic array */ private int allocSize; /** * a parameter controls the memory growth speed of the dynamic array */ private int progress; /** * the next position to check unused memory */ private int nextCheckPos; /** * the size of the key-pair sets */ private int keySize; /** * Build from a map * * @param map a map containing key-value pairs */ @SuppressWarnings("unchecked") public void build(Map<String, V> map) { // 把值保存下来 v = (V[]) map.values().toArray(); l = new int[v.length]; Set<String> keySet = map.keySet(); // 构建二分trie树 addAllKeyword(keySet); // 在二分trie树的基础上构建双数组trie树 buildDoubleArrayTrie(keySet.size()); used = null; // 构建failure表并且合并output表 constructFailureStates(); rootState = null; loseWeight(); } /** * fetch siblings of a parent node * * @param parent parent node * @param siblings parent node\'s child nodes, i . e . the siblings * @return the amount of the siblings */ private int fetch(State parent, List<Map.Entry<Integer, State>> siblings) { if (parent.isAcceptable()) { State fakeNode = new State(-(parent.getDepth() + 1)); // 此节点是parent的子节点,同时具备parent的输出 fakeNode.addEmit(parent.getLargestValueId()); siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode)); } for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet()) { siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue())); } return siblings.size(); } /** * add a keyword * * @param keyword a keyword * @param index the index of the keyword */ private void addKeyword(String keyword, int index) { State currentState = this.rootState; for (Character character : keyword.toCharArray()) { currentState = currentState.addState(character); } currentState.addEmit(index); l[index] = keyword.length(); } /** * add a collection of keywords * * @param keywordSet the collection holding keywords */ private void addAllKeyword(Collection<String> keywordSet) { int i = 0; for (String keyword : keywordSet) { addKeyword(keyword, i++); } } /** * construct failure table */ private void constructFailureStates() { fail = new int[size + 1]; fail[1] = base[0]; output = new int[size + 1][]; Queue<State> queue = new LinkedBlockingDeque<State>(); // 第一步,将深度为1的节点的failure设为根节点 for (State depthOneState : this.rootState.getStates()) { depthOneState.setFailure(this.rootState, fail); queue.add(depthOneState); constructOutput(depthOneState); } // 第二步,为深度 > 1 的节点建立failure表,这是一个bfs while (!queue.isEmpty()) { State currentState = queue.remove(); for (Character transition : currentState.getTransitions()) { State targetState = currentState.nextState(transition); queue.add(targetState); State traceFailureState = currentState.failure();以上是关于数据挖掘:提取百度知道QA中的影视信息的主要内容,如果未能解决你的问题,请参考以下文章