emoji处理方法
Posted 安
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了emoji处理方法相关的知识,希望对你有一定的参考价值。
在做微信公众号开发时碰到了获取微信用户基本信息的需求,但是在向数据库保存用户昵称的时候出错了,
出错原因是微信用户的昵称中包含emoji等特殊符号,表情图片,
mysql数据库使用的是utf8,最大存储3个字节,而emoji等以4个字节进行的保存,所以保存不了
处理方法:
1:修改数据库编码,由utf8升级为utf8mb4。utf8mb4是utf8的超集,包含全部unicode编码;该方法本文没有给出具体操作;
2:进行过滤,对获取到的用户昵称进行编码过滤,将emoji等替换为空字符串"";但是该方法在碰到iOS上的一些emoji时就失败了,因此在下方增加了一些过滤处理方法;
该过滤方法找自于网上 /** * 检测是否有emoji字符 * @param source * @return 一旦含有就抛出 */ public static boolean containsEmoji(String source) { if (StringUtils.isBlank(source)) { return false; } int len = source.length(); for (int i = 0; i < len; i++) { char codePoint = source.charAt(i); if (isEmojiCharacter(codePoint)) { //do nothing,判断到了这里表明,确认有表情字符 return true; } } return false; } private static boolean isEmojiCharacter(char codePoint) { return (codePoint == 0x0) || (codePoint == 0x9) || (codePoint == 0xA) || (codePoint == 0xD) || ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF)); } /** * 过滤emoji 或者 其他非文字类型的字符 * @param source * @return */ public static String filterEmoji(String source) { if (!containsEmoji(source)) {
//特殊处理
source = filterSpecialCharacter(source);
return source;//如果不包含,直接返回 } //到这里铁定包含 StringBuilder buf = null; int len = source.length(); for (int i = 0; i < len; i++) { char codePoint = source.charAt(i); if (isEmojiCharacter(codePoint)) { if (buf == null) { buf = new StringBuilder(source.length()); } buf.append(codePoint); } else { } } if (buf == null) { return source;//如果没有找到 emoji表情,则返回源字符串 } else { if (buf.length() == len) {//这里的意义在于尽可能少的toString,因为会重新生成字符串 buf = null; return source; } else { return buf.toString(); } } }
/**
 * Matches every supplementary code point (U+10000 to U+10FFFF, which includes all emoji that
 * need four UTF-8 bytes) and every unpaired surrogate code unit.
 */
private static final Pattern NON_BMP_OR_SURROGATE =
        Pattern.compile("[\\x{10000}-\\x{10FFFF}\\ud800-\\udfff]");

/**
 * Removes characters that cannot be stored in a three-byte UTF-8 column.
 *
 * <p>Matching characters are deleted (replaced with the empty string), not replaced with a
 * space. Symbols inside the Basic Multilingual Plane, such as U+2600 to U+27FF, are kept.
 *
 * <p>The previous version only stripped anything when a pattern covering a few emoji planes
 * matched first, so emoji outside those planes were never removed. Its replacement character
 * class was also over-escaped: it deleted digits and uppercase letters instead of surrogates.
 * This version strips unconditionally.
 *
 * @param source the text to clean; may be {@code null}
 * @return the text without supplementary characters and lone surrogates; {@code null} or an
 *     empty string is returned unchanged
 */
public static String filterSpecialCharacter(String source) {
    if (source == null || source.isEmpty()) {
        return source;
    }
    return NON_BMP_OR_SURROGATE.matcher(source).replaceAll("");
}
方法3:
/** * 过滤掉超过3个字节的UTF8字符 * @param text * @return * @throws UnsupportedEncodingException */ public static String filterOffUtf8Mb4(String text) throws UnsupportedEncodingException { byte[] bytes = text.getBytes("utf-8"); ByteBuffer buffer = ByteBuffer.allocate(bytes.length); int i = 0; while (i < bytes.length) { short b = bytes[i]; if (b > 0) { buffer.put(bytes[i++]); continue; } b += 256; // 去掉符号位 if (((b >> 5) ^ 0x6) == 0) { buffer.put(bytes, i, 2); i += 2; } else if (((b >> 4) ^ 0xE) == 0) { buffer.put(bytes, i, 3); i += 3; } else if (((b >> 3) ^ 0x1E) == 0) { i += 4; } else if (((b >> 2) ^ 0x3E) == 0) { i += 5; } else if (((b >> 1) ^ 0x7E) == 0) { i += 6; } else { buffer.put(bytes[i++]); } } buffer.flip(); return new String(buffer.array(), "utf-8"); }
方法4:进行编码转换保存
将需要处理的字符串进行编码转换,存储到数据库
/** * 字符串转换ascii */ public static String string2Unicode(String string) { StringBuffer unicode = new StringBuffer(); for (int i = 0; i < string.length(); i++) { // 取出每一个字符 char c = string.charAt(i); // 转换为unicode unicode.append("\\\\u" + Integer.toHexString(c)); } return unicode.toString(); } /** * ascii 转字符串 */ public static String unicode2String(String unicode) { StringBuffer string = new StringBuffer(); String[] hex = unicode.split("\\\\\\\\u"); for (int i = 1; i < hex.length; i++) { // 转换出每一个代码点 int data = Integer.parseInt(hex[i], 16); // 追加成string string.append((char) data); } return string.toString(); }
在页面获取的时候进行处理
// Decodes the escaped nickname text back to native characters, in place, for every element
// that carries the class "nikeunicode".
// The text is split on the escape marker (two backslashes followed by "u"). In each piece after
// the first, at most four leading characters are read as a hexadecimal character code; anything
// after them is appended as literal text.
function ascii2native() {
    var elements = document.getElementsByClassName("nikeunicode");
    for (var k = 0; k < elements.length; k++) {
        // Was "innerhtml" (wrong case), which always logged undefined.
        console.log(elements[k].innerHTML);
        var pieces = elements[k].innerHTML.split("\\\\u");
        var decoded = pieces[0]; // text before the first marker is kept unchanged
        console.log(decoded);
        for (var i = 1; i < pieces.length; i++) {
            var code = pieces[i];
            decoded += String.fromCharCode(parseInt(code.substring(0, 4), 16));
            if (code.length > 4) {
                decoded += code.substring(4, code.length);
            }
        }
        elements[k].innerHTML = decoded;
    }
}
页面处理过的效果
上文中的方法在android输入法自带的emoji下,没有起到效果,在上文方法2的if (isEmojiCharacter(codePoint)) 处加入下列判断
private static boolean isChinese(char c) { Character.UnicodeScript sc = Character.UnicodeScript.of(c); if (sc == Character.UnicodeScript.HAN) { return true; } return false; } public static boolean isPunctuation(char c) { Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); if ( // punctuation, spacing, and formatting characters ub == Character.UnicodeBlock.GENERAL_PUNCTUATION // symbols and punctuation in the unified Chinese, Japanese and Korean script || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION // fullwidth character or a halfwidth character || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // vertical glyph variants for east Asian compatibility || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS // vertical punctuation for compatibility characters with the Chinese Standard GB 18030 || ub == Character.UnicodeBlock.VERTICAL_FORMS // ascii || ub == Character.UnicodeBlock.BASIC_LATIN ) { return true; } else { return false; } } private static Boolean isUserDefined(char c) { Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); if (ub == Character.UnicodeBlock.NUMBER_FORMS || ub == Character.UnicodeBlock.ENCLOSED_ALPHANUMERICS || ub == Character.UnicodeBlock.LETTERLIKE_SYMBOLS || c == \'\\ufeff\' || c == \'\\u00a0\' ) return true; return false; } public static boolean isMessy(String str) { float chlength = 0; float count = 0; for(int i = 0; i < str.length(); i++) { char c = str.charAt(i); if(isPunctuation(c) || isUserDefined(c)) continue; else { if(!isChinese(c)) { count = count + 1; } chlength ++; } } float result = count / chlength; if(result > 0.3){ return true; }else{ return false; } }
以上是关于emoji处理方法的主要内容,如果未能解决你的问题,请参考以下文章