SQL(BigQuery)文本相似度
Posted
技术标签:
【中文标题】SQL(BigQuery)文本相似度【英文标题】:SQL (Big Query) Text Similarity 【发布时间】:2018-04-19 14:43:41 【问题描述】:我想做的事情如下:我拿到的数据存放在某一列中,每个值都是一个格式相近的字符串,例如:
> 420-xyz-somefancytext-12.3.2018-etc...
> 4-20-xyz-somefancytext-12.3.2018-etc...
> 4-250-xyz-somefancyothertext-13.3.2018-etc...
> 4-230-xyz-somefancyothertext-14.3.2018-etc...
我的用例是要检测出前两行,因为它们开头的数字、中间的文本以及日期都非常相似。我想到可以用编辑距离或余弦距离之类的方法来衡量这种相似度。
我还在 BigQuery 中实现了一个非常简单的 UDF:
-- Similarity score between two strings, based on Levenshtein edit distance.
--
-- Both inputs are URI-decoded (when possible) and lower-cased, then the score
-- is computed as 1 - distance / LENGTH(normalised Name1).
--   * 1.0 means the normalised strings are identical.
--   * The score is normalised by Name1 only, so it is NOT symmetric and it
--     can be negative when Name2 is much longer than Name1
--     (e.g. 'a' vs 'aaa' gives -1).
--   * When the normalised Name1 is empty the score is 1 if Name2 is empty
--     too, otherwise 0 (avoids a division by zero).
CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // Only one row of the distance matrix is kept in memory.
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);

          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }
  };

  // decodeURI throws on malformed percent-escapes (for example a bare '%');
  // in that case the text is compared as given.
  // NOTE(review): a SQL NULL appears to reach JavaScript as null, which
  // decodeURI turns into the text 'null' - confirm whether NULL inputs
  // should yield NULL instead.
  function normalise(value) {
    try {
      return decodeURI(value).toLowerCase();
    } catch (ex) {
      return String(value).toLowerCase();
    }
  }

  var the_Name1 = normalise(Name1);
  var the_Name2 = normalise(Name2);
  var distance = Levenshtein.get(the_Name1, the_Name2);

  // Guard the division below: an empty Name1 would give NaN or -Infinity.
  if (the_Name1.length === 0) {
    return distance === 0 ? 1 : 0;
  }

  return 1 - distance / the_Name1.length;
""";
-- Demo: score each sample pair (string1, string2) with the UDF and list the
-- most similar pairs first. The sample set includes NULLs, transpositions,
-- differing lengths and accented characters.
WITH strings AS (
  SELECT NULL AS string1, NULL AS string2
  UNION ALL SELECT 'test', NULL
  UNION ALL SELECT NULL, 'test'
  UNION ALL SELECT 'CRATE', 'TRACE'
  UNION ALL SELECT 'MARTHA', 'MARHTA'
  UNION ALL SELECT 'DWAYNE', 'DUANE'
  UNION ALL SELECT 'DIXON', 'DICKSONX'
  UNION ALL SELECT 'Dunningham', 'Cunningham'
  UNION ALL SELECT 'Abroms', 'Abrams'
  UNION ALL SELECT 'Lampley', 'Campley'
  UNION ALL SELECT 'Jonathon', 'Jonathan'
  UNION ALL SELECT 'Jeraldine', 'Gerladine'
  UNION ALL SELECT 'test', 'blank'
  UNION ALL SELECT 'everybody', 'every'
  UNION ALL SELECT 'a', 'aaa'
  UNION ALL SELECT 'Géraldine', 'Gerladine'
  UNION ALL SELECT 'Jérôme', 'Jerome'
  UNION ALL SELECT 'ça', 'ca'
  UNION ALL SELECT 'Üwe', 'Uwe'
)
SELECT
  string1,
  string2,
  similariry(string1, string2) AS my_sim
FROM strings
ORDER BY my_sim DESC
它衡量的是同一行中两列之间的相似度。但我需要的是一种衡量“行与行”之间相似度的算法,这意味着我必须把每一行与其他所有行逐一比较。我不知道该如何实现,也不知道怎样做才最高效。最终应该生成一张表,列出相似度高的那些行。
【问题讨论】:
你能提供一个更具体的“衡量行与行之间相似度”的例子吗?请提供一些示例数据和期望的结果。 【参考方案1】:简要思路:第 1 步 - 将表中每一行的所有列值拼接成一列
例如,下面的行
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2
应该变成:
SELECT 'AbromsAbrams' cols UNION ALL
SELECT 'LampleyCampley'
值连接的逻辑可能与上面有所不同——但这只是为了演示方法
第 2 步 - 将该表与自身做交叉连接(CROSS JOIN),并应用你想用的任何相似度函数。此时整行已被视为一列,因此可以直接把它与其余各行逐一比较
详情:为简单起见做如下假设:表中没有重复字段(REPEATED)和结构体(STRUCT),只有基本数据类型
下面我将使用以 CTE 形式定义的 strings 表,
-- Sample input used by every later snippet: a two-column table of name pairs
-- that includes NULLs, transposed letters, differing lengths and accented
-- characters. This is only the CTE definition; the later snippets continue
-- the WITH clause or select from it.
WITH strings AS (
SELECT NULL string1, NULL string2 UNION ALL
SELECT 'test' string1, NULL string2 UNION ALL
SELECT NULL string1, 'test' string2 UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'test' string1, 'blank' string2 UNION ALL
SELECT 'everybody' string1, 'every' string2 UNION ALL
SELECT 'a' string1, 'aaa' string2 UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 UNION ALL
SELECT 'ça' string1, 'ca' string2 UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2
)
所以将在其余代码中省略它
步骤 1A - 构建一个 CTE,提取所有列名并把它们拼接成一个正则表达式,稍后用它从每行的 JSON 字符串中清除列名
#standardSQL
-- Builds one regex alternation that matches every JSON key prefix of the
-- `strings` table, e.g. "string1":|"string2":
-- The column names are read from the JSON form of a single row.
-- NOTE(review): the split on ',"' assumes that no value in the sampled row
-- contains that sequence, and that column names have no regex
-- metacharacters - confirm for real data.
WITH columns AS (
  SELECT
    STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
  FROM (
    SELECT
      -- keep the text before '":' and drop the remaining quote
      REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
    FROM (
      SELECT
        -- strip the enclosing JSON braces, then cut into "key":value pairs
        SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
      FROM strings AS t
      LIMIT 1
    )
    CROSS JOIN UNNEST(pairs) AS pair
  )
)
SELECT cols
FROM columns
结果是
Row cols
1 "string1":|"string2":
我们很快就会需要这个
步骤 1B - 让我们将原始表格转换为只有一列包含行中所有值的表格
#standardSQL
-- Collapses the JSON text of one row into the concatenation of its values.
--   row     : output of TO_JSON_STRING for the row
--   columns : regex alternation of the key prefixes to delete,
--             e.g. "string1":|"string2":
-- After the keys are removed, the opening {" , the closing "} and the ","
-- separators between string values are removed as well.
-- NOTE(review): unquoted values such as null keep their surrounding brace
-- and comma (e.g. {null,null}) - refine if NULLs must be handled.
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS ((
  REGEXP_REPLACE(REGEXP_REPLACE(row, columns, ''), r'\{"|"\}|","', '')
));
-- columns: regex alternation of every JSON key prefix of `strings`.
-- lines  : one record per input row, holding the row's JSON text and the
--          concatenation of its values.
WITH columns AS (
  SELECT
    STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
  FROM (
    SELECT
      REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
    FROM (
      SELECT
        -- strip the enclosing JSON braces, then cut into "key":value pairs
        SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
      FROM strings AS t
      LIMIT 1
    )
    CROSS JOIN UNNEST(pairs) AS pair
  )
), lines AS (
  SELECT
    TO_JSON_STRING(t) AS original_row,
    concatenate_row(TO_JSON_STRING(t), cols) AS pure_values
  FROM strings AS t
  CROSS JOIN columns  -- single-row CTE, so every input row gets `cols`
)
SELECT
  original_row,
  pure_values
FROM lines
有结果(只显示几行...)
Row original_row pure_values
1 {"string1":"Dunningham","string2":"Cunningham"} DunninghamCunningham
2 {"string1":"Jeraldine","string2":"Gerladine"} JeraldineGerladine
3 {"string1":"Géraldine","string2":"Gerladine"} GéraldineGerladine
4 {"string1":"Jonathon","string2":"Jonathan"} JonathonJonathan
5 {"string1":"everybody","string2":"every"} everybodyevery
最后,第二步 - CROSS JOIN 和计算相似度
#standardSQL
-- Score every unordered pair of distinct rows. The strict "<" comparison
-- keeps one of (a, b) / (b, a) and drops pairs whose concatenated values
-- are equal.
SELECT
  similarity(left_line.pure_values, right_line.pure_values) AS my_sim,
  left_line.pure_values AS s1,
  right_line.pure_values AS s2
FROM lines AS left_line
INNER JOIN lines AS right_line
  ON left_line.pure_values < right_line.pure_values
ORDER BY my_sim DESC
结果(只显示几行...)
Row my_sim s1 s2
1 0.8888888888888888 GéraldineGerladine JeraldineGerladine
2 0.5454545454545454 test",null} {null,null}
3 0.5454545454545454 {null,"test {null,null}
4 0.5 aaaa çaca
5 0.36363636363636365 test",null} testblank
6 0.36363636363636365 DWAYNEDUANE ÜweUwe
7 0.33333333333333337 JeraldineGerladine JérômeJerome
. . .
注意:这只是一个可供参考的方向,如果你选择采用,其中还有很大的改进和完善空间。
因此,如果将所有内容放在一起 - 您会得到以下结果:
#standardSQL
-- Collapses the JSON text of one row into the concatenation of its values.
--   row     : output of TO_JSON_STRING for the row
--   columns : regex alternation of the key prefixes to delete,
--             e.g. "string1":|"string2":
-- After the keys are removed, the opening {" , the closing "} and the ","
-- separators between string values are removed as well.
-- NOTE(review): unquoted values such as null keep their surrounding brace
-- and comma (e.g. {null,null}) - refine if NULLs must be handled.
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS (
  (
    REGEXP_REPLACE(REGEXP_REPLACE(row, columns, ''), r'\{"|"\}|","', '')
  )
);
-- Similarity score between two strings, based on Levenshtein edit distance.
--
-- Both inputs are URI-decoded (when possible) and lower-cased, then the score
-- is computed as 1 - distance / LENGTH(normalised Name1).
--   * 1.0 means the normalised strings are identical.
--   * The score is normalised by Name1 only, so it is NOT symmetric and it
--     can be negative when Name2 is much longer than Name1
--     (e.g. 'a' vs 'aaa' gives -1).
--   * When the normalised Name1 is empty the score is 1 if Name2 is empty
--     too, otherwise 0 (avoids a division by zero).
CREATE TEMPORARY FUNCTION similarity(Name1 STRING, Name2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // Only one row of the distance matrix is kept in memory.
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);

          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }
  };

  // decodeURI throws on malformed percent-escapes (for example a bare '%');
  // in that case the text is compared as given.
  // NOTE(review): a SQL NULL appears to reach JavaScript as null, which
  // decodeURI turns into the text 'null' - confirm whether NULL inputs
  // should yield NULL instead.
  function normalise(value) {
    try {
      return decodeURI(value).toLowerCase();
    } catch (ex) {
      return String(value).toLowerCase();
    }
  }

  var the_Name1 = normalise(Name1);
  var the_Name2 = normalise(Name2);
  var distance = Levenshtein.get(the_Name1, the_Name2);

  // Guard the division below: an empty Name1 would give NaN or -Infinity.
  if (the_Name1.length === 0) {
    return distance === 0 ? 1 : 0;
  }

  return 1 - distance / the_Name1.length;
""";
-- End-to-end row similarity:
--   strings : sample input rows
--   columns : regex alternation of every JSON key prefix of `strings`
--   lines   : each row's JSON text plus the concatenation of its values
-- The final SELECT scores every unordered pair of rows whose concatenated
-- values differ, most similar first.
WITH strings AS (
  SELECT NULL AS string1, NULL AS string2
  UNION ALL SELECT 'test', NULL
  UNION ALL SELECT NULL, 'test'
  UNION ALL SELECT 'CRATE', 'TRACE'
  UNION ALL SELECT 'MARTHA', 'MARHTA'
  UNION ALL SELECT 'DWAYNE', 'DUANE'
  UNION ALL SELECT 'DIXON', 'DICKSONX'
  UNION ALL SELECT 'Dunningham', 'Cunningham'
  UNION ALL SELECT 'Abroms', 'Abrams'
  UNION ALL SELECT 'Lampley', 'Campley'
  UNION ALL SELECT 'Jonathon', 'Jonathan'
  UNION ALL SELECT 'Jeraldine', 'Gerladine'
  UNION ALL SELECT 'test', 'blank'
  UNION ALL SELECT 'everybody', 'every'
  UNION ALL SELECT 'a', 'aaa'
  UNION ALL SELECT 'Géraldine', 'Gerladine'
  UNION ALL SELECT 'Jérôme', 'Jerome'
  UNION ALL SELECT 'ça', 'ca'
  UNION ALL SELECT 'Üwe', 'Uwe'
), columns AS (
  SELECT
    STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
  FROM (
    SELECT
      REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
    FROM (
      SELECT
        -- strip the enclosing JSON braces, then cut into "key":value pairs
        SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
      FROM strings AS t
      LIMIT 1
    )
    CROSS JOIN UNNEST(pairs) AS pair
  )
), lines AS (
  SELECT
    TO_JSON_STRING(t) AS original_row,
    concatenate_row(TO_JSON_STRING(t), cols) AS pure_values
  FROM strings AS t
  CROSS JOIN columns  -- single-row CTE, so every input row gets `cols`
)
SELECT
  similarity(s1.pure_values, s2.pure_values) AS my_sim,
  s1.pure_values AS s1,
  s2.pure_values AS s2
FROM lines AS s1
CROSS JOIN lines AS s2
-- strict "<" keeps one of (a, b) / (b, a) and skips equal values
WHERE s1.pure_values < s2.pure_values
ORDER BY my_sim DESC
【讨论】:
以上是关于SQL(BigQuery)文本相似度的主要内容,如果未能解决你的问题,请参考以下文章