对比两个字符串的相似度
Posted 我是司
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了对比两个字符串的相似度相关的知识,希望对你有一定的参考价值。
package com.opslab.util.algorithmImpl;
import com.opslab.util.CharsetUtil;
import com.opslab.util.SysUtil;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
/**
* 对比俩个字符串的相似度
*/
public class StringImpl {
//第一种实现方式
private static String longestCommonSubstring(String strA, String strB) {
char[] chars_strA = strA.toCharArray();
char[] chars_strB = strB.toCharArray();
int m = chars_strA.length;
int n = chars_strB.length;
int[][] matrix = new int[m + 1][n + 1];
for (int i = 1; i <= m; i++) {
for (int j = 1; j <= n; j++) {
if (chars_strA[i - 1] == chars_strB[j - 1])
matrix[i][j] = matrix[i - 1][j - 1] + 1;
else
matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);
}
}
char[] result = new char[matrix[m][n]];
int currentIndex = result.length - 1;
while (matrix[m][n] != 0) {
if (matrix[n] == matrix[n - 1])
n--;
else if (matrix[m][n] == matrix[m - 1][n])
m--;
else {
result[currentIndex] = chars_strA[m - 1];
currentIndex--;
n--;
m--;
}
}
return new String(result);
}
private static boolean charReg(char charValue) {
return (charValue >= 0x4E00 && charValue <= 0X9FA5) || (charValue >= ‘a‘ && charValue <= ‘z‘) || (charValue >= ‘A‘ && charValue <= ‘Z‘) || (charValue >= ‘0‘ && charValue <= ‘9‘);
}
private static String removeSign(String str) {
StringBuffer sb = new StringBuffer();
for (char item : str.toCharArray()){
if (charReg(item)) {
sb.append(item);
}
}
return sb.toString();
}
/**
* 快速比较俩个字符串的相似度
*
* @param strA 较长的字符串
* @param strB 较短的字符串
* @return 俩个字符串的相似度
* <p>summary</p>:较长的字符串放到前面有助于提交效率
*/
public static double SimilarDegree(String strA, String strB) {
String newStrA = removeSign(strA);
String newStrB = removeSign(strB);
int temp = Math.max(newStrA.length(), newStrB.length());
int temp2 = longestCommonSubstring(newStrA, newStrB).length();
return temp2 * 1.0 / temp;
}
//第二种实现方式
private static int compare(String str, String target) {
int d[][]; // 矩阵
int n = str.length();
int m = target.length();
int i; // 遍历str的
int j; // 遍历target的
char ch1; // str的
char ch2; // target的
int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
d = new int[n + 1][m + 1];
for (i = 0; i <= n; i++) { // 初始化第一列
d[i][0] = i;
}
for (j = 0; j <= m; j++) { // 初始化第一行
d[0][j] = j;
}
for (i = 1; i <= n; i++) { // 遍历str
ch1 = str.charAt(i - 1);
// 去匹配target
for (j = 1; j <= m; j++) {
ch2 = target.charAt(j - 1);
if (ch1 == ch2) {
temp = 0;
} else {
temp = 1;
}
// 左边+1,上边+1, 左上角+temp取最小
d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + temp);
}
}
return d[n][m];
}
private static int min(int one, int two, int three) {
return (one = one < two ? one : two) < three ? one : three;
}
/**
* 获取字符串的相似度
*
* @param str
* @param target
* @return
*/
public static double SimilarityRatio(String str, String target) {
return 1 - (double) compare(str, target) / Math.max(str.length(), target.length());
}
/**
* 获取字符串编码
*
* @param str 需要处理的字符串
*/
public static String simpleEncoding(String str) {
try{
byte[] bs = str.getBytes(SysUtil.JVM_ENCODING);
if(str.equals(new String(bs,CharsetUtil.UTF_8))){
return CharsetUtil.UTF_8;
}
if(str.equals(new String(bs,CharsetUtil.GBK))){
return CharsetUtil.GBK;
}
if(str.equals(new String(bs,"ISO-8859-1"))){
return "ISO-8859-1";
}
}catch(UnsupportedEncodingException e) {
System.out.println("111111111");
e.printStackTrace();
}
String encode = "GB2312";
try {
if (str.equals(new String(str.getBytes(encode), encode))) {
return encode;
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
encode = "ISO-8859-1";
try {
if (str.equals(new String(str.getBytes(encode), encode))) {
return encode;
}
} catch (UnsupportedEncodingException exception1) {
exception1.printStackTrace();
}
encode = "UTF-8";
try {
if (str.equals(new String(str.getBytes(encode), encode))) {
return encode;
}
} catch (UnsupportedEncodingException exception1) {
exception1.printStackTrace();
}
encode = "GBK";
try {
if (str.equals(new String(str.getBytes(encode), encode))) {
return encode;
}
} catch (UnsupportedEncodingException exception1) {
exception1.printStackTrace();
}
return "";
}
}
以上是关于对比两个字符串的相似度的主要内容,如果未能解决你的问题,请参考以下文章: