3.01-敏感词过滤
Posted lpzh
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了3.01-敏感词过滤相关的知识,希望对你有一定的参考价值。
编写敏感词过滤器
敏感词过滤器用来过滤掉字符串或文本中的敏感词汇!
package com.nowcoder.community.util;

import org.apache.commons.lang3.CharUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Replaces sensitive words in a text with a fixed mask.
 *
 * <p>The word list is read once from the classpath resource
 * {@code sensitive-words.txt} (one word per line) and stored in a prefix tree
 * (trie). {@link #filter(String)} then scans the text against that tree.
 */
@Component
public class SensitiveFilter {

    private static final Logger logger = LoggerFactory.getLogger(SensitiveFilter.class);

    // Mask written in place of every sensitive word that is found.
    private static final String REPLACEMENT = "***";

    // Classpath resource holding the sensitive words, one per line.
    private static final String WORDS_FILE = "sensitive-words.txt";

    // Root of the prefix tree; it carries no character itself.
    private final TrieNode rootNode = new TrieNode();

    /**
     * Loads the word list into the prefix tree.
     *
     * <p>Annotated with {@code @PostConstruct}, so the container invokes it
     * after the bean has been constructed. Failures are logged rather than
     * thrown, which leaves the filter with whatever words were loaded so far.
     */
    @PostConstruct
    public void init() {
        InputStream is = this.getClass().getClassLoader().getResourceAsStream(WORDS_FILE);
        if (is == null) {
            // getResourceAsStream returns null for a missing resource; wrapping
            // null in a reader would throw a NullPointerException instead of
            // the IOException handled below.
            logger.error("加载敏感词文件失败: " + WORDS_FILE + " not found on classpath");
            return;
        }
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            String keyword;
            while ((keyword = reader.readLine()) != null) {
                // Add the word to the prefix tree.
                this.addKeyword(keyword);
            }
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.error("加载敏感词文件失败: " + e.getMessage(), e);
        }
    }

    // Adds one sensitive word to the prefix tree, creating nodes as needed.
    private void addKeyword(String keyword) {
        TrieNode node = rootNode;
        for (int i = 0; i < keyword.length(); i++) {
            char c = keyword.charAt(i);
            TrieNode child = node.getSubNode(c);
            if (child == null) {
                child = new TrieNode();
                node.addSubNode(c, child);
            }
            // Descend and continue with the next character.
            node = child;
        }
        // Mark the node of the last character; an empty line never leaves the
        // root and must not turn the root into a word end.
        if (node != rootNode) {
            node.setKeywordEnd(true);
        }
    }

    /**
     * Filters sensitive words out of a text.
     *
     * <p>Symbols (see {@code isSymbol}) that appear inside a word are skipped
     * while matching, so a word interleaved with symbols is still replaced.
     *
     * @param text the text to filter
     * @return the filtered text, or {@code null} when {@code text} is null,
     *         empty or whitespace only
     */
    public String filter(String text) {
        if (StringUtils.isBlank(text)) {
            return null;
        }

        // Pointer 1: current node in the prefix tree.
        TrieNode tempNode = rootNode;
        // Pointer 2: start of the candidate word in the text.
        int begin = 0;
        // Pointer 3: character currently being examined.
        int position = 0;
        // Result buffer.
        StringBuilder sb = new StringBuilder(text.length());

        // Loop on begin rather than position: every start index must be tried,
        // including those passed over by a partial match that reaches the end
        // of the text.
        while (begin < text.length()) {
            if (position >= text.length()) {
                // The text ended in the middle of a candidate, so the word
                // starting at begin is not sensitive. Emit that character and
                // rescan from the next one; a shorter word may start inside
                // the tail that was just examined.
                sb.append(text.charAt(begin));
                position = ++begin;
                tempNode = rootNode;
                continue;
            }

            char c = text.charAt(position);

            if (isSymbol(c)) {
                // At the root the symbol is ordinary text: keep it and move
                // the start pointer past it.
                if (tempNode == rootNode) {
                    sb.append(c);
                    begin++;
                }
                // Leading or embedded, the scan pointer always advances.
                position++;
                continue;
            }

            // Look up the next node for this character.
            tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
                // No sensitive word starts at begin.
                sb.append(text.charAt(begin));
                position = ++begin;
                tempNode = rootNode;
            } else if (tempNode.isKeywordEnd()) {
                // Sensitive word found in begin..position: mask it.
                sb.append(REPLACEMENT);
                begin = ++position;
                tempNode = rootNode;
            } else {
                // Partial match so far; examine the next character.
                position++;
            }
        }

        return sb.toString();
    }

    // Returns true for characters treated as symbols: anything that is neither
    // an ASCII letter/digit nor inside 0x2E80-0x9FFF (the East Asian script
    // range).
    private boolean isSymbol(char c) {
        return !CharUtils.isAsciiAlphanumeric(c) && (c < 0x2E80 || c > 0x9FFF);
    }

    // Prefix tree node. Static because it needs no reference to the filter.
    private static class TrieNode {

        // True when the path from the root to this node spells a full word.
        private boolean isKeywordEnd = false;

        // Children, keyed by the next character.
        private final Map<Character, TrieNode> subNodes = new HashMap<>();

        public boolean isKeywordEnd() {
            return isKeywordEnd;
        }

        public void setKeywordEnd(boolean keywordEnd) {
            isKeywordEnd = keywordEnd;
        }

        // Registers a child node for the given character.
        public void addSubNode(char c, TrieNode node) {
            subNodes.put(c, node);
        }

        // Returns the child for the given character, or null if there is none.
        public TrieNode getSubNode(char c) {
            return subNodes.get(c);
        }
    }
}
测试敏感词过滤器
单元测试类编写
package com.nowcoder.community;

import com.nowcoder.community.util.SensitiveFilter;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringRunner;

/**
 * Integration test for {@link SensitiveFilter}, run inside the Spring context
 * so the filter bean is created and initialised by the container.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
@ContextConfiguration(classes = CommunityApplication.class)
public class SensitiveTests {

    @Autowired
    private SensitiveFilter sensitiveFilter;

    /**
     * Checks that plain sensitive words are masked, and that words interleaved
     * with symbols are masked as well.
     *
     * <p>NOTE(review): the expected strings assume sensitive-words.txt lists
     * the four words used in the inputs below - confirm against the resource.
     */
    @Test
    public void testSensitiveFilter() {
        // Plain words.
        String text = "这里可以赌博,可以嫖娼,可以吸毒,可以开票,哈哈哈!";
        text = sensitiveFilter.filter(text);
        System.out.println(text);
        Assert.assertEquals("这里可以***,可以***,可以***,可以***,哈哈哈!", text);

        // Words broken up by symbols: symbols before and after a word are
        // kept, the word and the symbols inside it are replaced by the mask.
        text = "这里可以☆赌☆博☆,可以☆嫖☆娼☆,可以☆吸☆毒☆,可以☆开☆票☆,哈哈哈!";
        text = sensitiveFilter.filter(text);
        System.out.println(text);
        Assert.assertEquals("这里可以☆***☆,可以☆***☆,可以☆***☆,可以☆***☆,哈哈哈!", text);
    }
}
测试结果
. ____ _ __ _ _ /\\ / ___'_ __ _ _(_)_ __ __ _ ( ( )\\___ | '_ | '_| | '_ / _` | \\/ ___)| |_)| | | | | || (_| | ) ) ) ) ' |____| .__|_| |_|_| |_\\__, | / / / / =========|_|==============|___/=/_/_/_/ :: Spring Boot :: (v2.1.5.RELEASE) 2020-03-22 09:54:32,698 INFO [main] c.n.c.SensitiveTests [StartupInfoLogger.java:50] Starting SensitiveTests on lzph-pc with PID 8344 (started by admin in E:\nowcoder\nowcoder-workspace\community-3.1) 2020-03-22 09:54:32,724 DEBUG [main] c.n.c.SensitiveTests [StartupInfoLogger.java:53] Running with Spring Boot v2.1.5.RELEASE, Spring v5.1.7.RELEASE 2020-03-22 09:54:32,727 INFO [main] c.n.c.SensitiveTests [SpringApplication.java:675] No active profile set, falling back to default profiles: default 2020-03-22 09:54:43,864 INFO [main] o.s.s.c.ThreadPoolTaskExecutor [ExecutorConfigurationSupport.java:171] Initializing ExecutorService 'applicationTaskExecutor' 2020-03-22 09:54:44,702 INFO [main] o.s.b.a.w.s.WelcomePageHandlerMapping [WelcomePageHandlerMapping.java:61] Adding welcome page template: index 2020-03-22 09:54:45,895 INFO [main] c.n.c.SensitiveTests [StartupInfoLogger.java:59] Started SensitiveTests in 15.275 seconds (JVM running for 18.752) 这里可以***,可以***,可以***,可以***,哈哈哈! 这里可以☆***☆,可以☆***☆,可以☆***☆,可以☆***☆,哈哈哈! 2020-03-22 09:54:47,377 INFO [Thread-3] o.s.s.c.ThreadPoolTaskExecutor [ExecutorConfigurationSupport.java:208] Shutting down ExecutorService 'applicationTaskExecutor'
参考资料
示例中用到的相关数据结构的知识点:前缀树(字典树),参考如下文章:
https://www.cnblogs.com/luosongchao/p/3239521.html
以上是关于3.01-敏感词过滤的主要内容,如果未能解决你的问题,请参考以下文章