敏感词屏蔽工具类


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Sensitive-word filter backed by a character trie (the "DFA" model of the original code).
 *
 * <p>Trie layout: every node is a {@code Map}. A {@code Character} key maps to the child node
 * for that character; the {@code String} key {@code "isEnd"} maps to {@code "1"} when a word
 * ends at that node and to {@code "0"} otherwise.
 *
 * <p>Call {@link #init(Set)} before any matching method. Matching methods throw
 * {@link IllegalStateException} when the trie has not been built and the text is non-empty.
 *
 * <p>NOTE(review): only words of at least two characters are ever reported, mirroring the
 * original {@code matchFlag < 2} check. The original comments hint that single-character
 * words may have been intended to match - confirm before relaxing {@code MIN_WORD_LENGTH}.
 */
public class SensitiveWordUtils {

    /**
     * Minimum match rule: stop at the shortest word. With words ["中国", "中国人"] and the text
     * "我是中国人" the match is "中国". (The identifier's spelling is kept for compatibility.)
     */
    public static final int MinMatchTYpe = 1;

    /**
     * Maximum match rule: prefer the longest word. With words ["中国", "中国人"] and the text
     * "我是中国人" the match is "中国人".
     */
    public static final int MaxMatchType = 2;

    /**
     * Root node of the trie; {@code null} until {@link #init(Set)} has run. It is volatile and
     * only ever assigned a fully built map, so a reader sees either the old or the new trie.
     */
    @SuppressWarnings("rawtypes")
    public static volatile HashMap sensitiveWordMap;

    /** Key under which every trie node stores its end-of-word marker. */
    private static final String END_KEY = "isEnd";

    /** Marker value: a word ends at this node. */
    private static final String END_YES = "1";

    /** Marker value: no word ends at this node. */
    private static final String END_NO = "0";

    /** Matches shorter than this are discarded (preserves the original behaviour). */
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Builds the trie from the given word set, replacing any previously built trie.
     *
     * @param sensitiveWordSet the sensitive words; {@code null} or empty elements are skipped
     * @throws NullPointerException if {@code sensitiveWordSet} is {@code null}
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        initSensitiveWordMap(sensitiveWordSet);
    }

    /**
     * Builds the trie in a local map and publishes it to {@link #sensitiveWordMap} only once it
     * is complete.
     *
     * @param sensitiveWordSet the sensitive words
     */
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        Objects.requireNonNull(sensitiveWordSet, "sensitiveWordSet");

        // Pre-size the root with the word count, as the original did, to limit rehashing.
        HashMap<Object, Object> root = new HashMap<>(sensitiveWordSet.size());

        for (String word : sensitiveWordSet) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                Character key = word.charAt(i);
                @SuppressWarnings("unchecked") // child nodes are only created below, as Map<Object, Object>
                Map<Object, Object> child = (Map<Object, Object>) node.get(key);
                if (child == null) {
                    child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(key, child);
                }
                node = child;
            }
            // The node reached by the last character terminates this word.
            node.put(END_KEY, END_YES);
        }

        sensitiveWordMap = root;
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt       the text to scan; {@code null} or empty yields {@code false}
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}; any other value behaves
     *                  like {@link #MaxMatchType}
     * @return {@code true} if a sensitive word occurs in {@code txt}
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static boolean contains(String txt, int matchType) {
        if (txt == null || txt.isEmpty()) {
            return false;
        }
        Map<?, ?> trie = requireTrie();
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(trie, txt, i, matchType) > 0) {
                return true; // the first hit decides; no need to scan the rest
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains at least one sensitive word, using the maximum match rule.
     *
     * @param txt the text to scan
     * @return {@code true} if a sensitive word occurs in {@code txt}
     */
    public static boolean contains(String txt) {
        return contains(txt, MaxMatchType);
    }

    /**
     * Collects the distinct sensitive words found by a left-to-right scan. After a match the
     * scan resumes behind the matched word, so reported occurrences never overlap.
     *
     * @param txt       the text to scan; {@code null} or empty yields an empty set
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return a new mutable set of the matched words, never {@code null}
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<>();
        if (txt == null || txt.isEmpty()) {
            return found;
        }
        Map<?, ?> trie = requireTrie();
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(trie, txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Collects the distinct sensitive words in the text, using the maximum match rule.
     *
     * @param txt the text to scan
     * @return a new mutable set of the matched words, never {@code null}
     */
    public static Set<String> getSensitiveWord(String txt) {
        return getSensitiveWord(txt, MaxMatchType);
    }

    /**
     * Masks every matched word character by character. Example: text "我爱中国人", word "中国人",
     * replacement character '*' gives "我爱***".
     *
     * @param txt         the text to filter; {@code null} is returned unchanged
     * @param replaceChar the character written once per masked character
     * @param matchType   {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the filtered text
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static String replaceSensitiveWord(String txt, char replaceChar, int matchType) {
        return replaceMatches(txt, matchType, null, replaceChar);
    }

    /**
     * Masks every matched word character by character, using the maximum match rule.
     *
     * @param txt         the text to filter
     * @param replaceChar the character written once per masked character
     * @return the filtered text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) {
        return replaceSensitiveWord(txt, replaceChar, MaxMatchType);
    }

    /**
     * Replaces every matched word with a fixed string. Example: text "我爱中国人", word "中国人",
     * replacement "[屏蔽]" gives "我爱[屏蔽]". The replacement is inserted literally; characters
     * such as {@code $} and {@code \} have no special meaning.
     *
     * @param txt        the text to filter; {@code null} is returned unchanged
     * @param replaceStr the string substituted for each matched word
     * @param matchType  {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the filtered text
     * @throws NullPointerException  if {@code replaceStr} is {@code null}
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static String replaceSensitiveWord(String txt, String replaceStr, int matchType) {
        Objects.requireNonNull(replaceStr, "replaceStr");
        return replaceMatches(txt, matchType, replaceStr, '\0');
    }

    /**
     * Replaces every matched word with a fixed string, using the maximum match rule.
     *
     * @param txt        the text to filter
     * @param replaceStr the string substituted for each matched word
     * @return the filtered text
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) {
        return replaceSensitiveWord(txt, replaceStr, MaxMatchType);
    }

    /**
     * Rewrites the text in one left-to-right pass, substituting each match in place. Working on
     * positions instead of {@code String.replaceAll} keeps words and replacements literal and
     * makes the result independent of set iteration order.
     *
     * @param txt              the text to filter; {@code null} or empty is returned unchanged
     * @param matchType        the match rule
     * @param fixedReplacement the string to insert per match, or {@code null} to mask with
     *                         {@code maskChar} instead
     * @param maskChar         the masking character, used only when {@code fixedReplacement}
     *                         is {@code null}
     * @return the filtered text
     */
    private static String replaceMatches(String txt, int matchType, String fixedReplacement, char maskChar) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        Map<?, ?> trie = requireTrie();
        StringBuilder out = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(trie, txt, i, matchType);
            if (length > 0) {
                if (fixedReplacement != null) {
                    out.append(fixedReplacement);
                } else {
                    appendReplaceChars(out, maskChar, length);
                }
                i += length;
            } else {
                out.append(txt.charAt(i));
                i++;
            }
        }
        return out.toString();
    }

    /**
     * Appends {@code length} copies of {@code replaceChar} to {@code out}.
     *
     * @param out         the builder to append to
     * @param replaceChar the masking character
     * @param length      the number of copies
     */
    private static void appendReplaceChars(StringBuilder out, char replaceChar, int length) {
        for (int i = 0; i < length; i++) {
            out.append(replaceChar);
        }
    }

    /**
     * Returns the current trie root or fails fast when {@link #init(Set)} was never called.
     *
     * @return the trie root, never {@code null}
     */
    private static Map<?, ?> requireTrie() {
        Map<?, ?> trie = sensitiveWordMap;
        if (trie == null) {
            throw new IllegalStateException(
                    "SensitiveWordUtils.init(Set) must be called before matching");
        }
        return trie;
    }

    /**
     * Finds the sensitive word that starts exactly at {@code beginIndex}.
     *
     * <p>The returned length is that of the last complete word seen while walking the trie,
     * not the number of characters walked: a partially matched longer word must not extend
     * the result.
     *
     * @param trie       the trie root
     * @param txt        the text being scanned
     * @param beginIndex the index at which the word must start
     * @param matchType  {@link #MinMatchTYpe} stops at the first qualifying word; any other
     *                   value keeps walking to find the longest one
     * @return the length of the matched word, or 0 when no word of at least
     *         {@code MIN_WORD_LENGTH} characters starts at {@code beginIndex}
     */
    private static int checkSensitiveWord(Map<?, ?> trie, String txt, int beginIndex, int matchType) {
        int matchedLength = 0;
        Map<?, ?> node = trie;
        for (int i = beginIndex; i < txt.length(); i++) {
            Object next = node.get(Character.valueOf(txt.charAt(i)));
            if (!(next instanceof Map)) {
                break; // no word continues with this character
            }
            node = (Map<?, ?>) next;
            if (END_YES.equals(node.get(END_KEY))) {
                int length = i - beginIndex + 1;
                // Words below the minimum length are ignored, so they cannot end a
                // minimum-match scan before a longer word is reached.
                if (length >= MIN_WORD_LENGTH) {
                    matchedLength = length;
                    if (MinMatchTYpe == matchType) {
                        break;
                    }
                }
            }
        }
        return matchedLength;
    }

    /**
     * Reads a UTF-8 word list from the classpath, one entry per line. Lines are added verbatim
     * (no trimming), and duplicates collapse because the result is a set.
     *
     * @param name the resource name, resolved through the context class loader (falling back
     *             to this class's loader, then the system class loader)
     * @return the lines of the resource
     * @throws FileNotFoundException if the resource cannot be found
     * @throws Exception             if reading fails
     */
    public static Set<String> readResource(String name) throws Exception {
        Objects.requireNonNull(name, "name");

        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = SensitiveWordUtils.class.getClassLoader();
        }
        // Reading through a stream (instead of URL.getPath + File) also works for resources
        // inside jars and for paths that contain percent-encoded characters.
        InputStream in = (loader != null)
                ? loader.getResourceAsStream(name)
                : ClassLoader.getSystemResourceAsStream(name);
        if (in == null) {
            throw new FileNotFoundException("Sensitive-word resource not found on classpath: " + name);
        }

        Set<String> words = new HashSet<>();
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        }
        return words;
    }

    /**
     * Demo: loads {@code keywords.txt} from the classpath, builds the trie and prints the
     * results of the contains / get / replace operations for a sample sentence.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read
     */
    public static void main(String[] args) throws Exception {
        // A word set can also be assembled in code and passed straight to init(Set).
        Set<String> sensitiveWordSet = readResource("keywords.txt");
        System.err.println(sensitiveWordSet.size());

        // Build the trie.
        SensitiveWordUtils.init(sensitiveWordSet);
        System.out.println("敏感词的数量:" + SensitiveWordUtils.sensitiveWordMap.size());

        String string = "太多的伤感情怀也许只局限于饲养基地 荧幕中的情节。"
                + "然后我们的扮演的角色就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把自己的情感也附加于银幕情节中,然后感动就流泪,"
                + "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个贱人一杯红酒一部电影在夜 深人静的晚上,关上电话静静的发呆着。";
        System.out.println("待检测语句字数:" + string.length());

        // Does the sentence contain any sensitive word?
        boolean result = SensitiveWordUtils.contains(string);
        System.out.println(result);
        result = SensitiveWordUtils.contains(string, SensitiveWordUtils.MinMatchTYpe);
        System.out.println(result);

        // Which sensitive words does it contain?
        Set<String> set = SensitiveWordUtils.getSensitiveWord(string);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
        set = SensitiveWordUtils.getSensitiveWord(string, SensitiveWordUtils.MinMatchTYpe);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);

        // Mask or replace the sensitive words.
        String filterStr = SensitiveWordUtils.replaceSensitiveWord(string, '*');
        System.out.println(filterStr);
        filterStr = SensitiveWordUtils.replaceSensitiveWord(string, '*', SensitiveWordUtils.MinMatchTYpe);
        System.out.println(filterStr);

        String filterStr2 = SensitiveWordUtils.replaceSensitiveWord(string, "[*敏感词*]");
        System.out.println(filterStr2);
        filterStr2 = SensitiveWordUtils.replaceSensitiveWord(string, "[*敏感词*]", SensitiveWordUtils.MinMatchTYpe);
        System.out.println(filterStr2);
    }
}
屏蔽词资源包下载

Published by

风君子

独自遨游何稽首 揭天掀地慰生平