如何使用 SQL (BigQuery) 计算 TF/IDF
Posted
技术标签:
【中文标题】如何使用 SQL (BigQuery) 计算 TF/IDF【英文标题】:How can I compute TF/IDF with SQL (BigQuery) 【发布时间】:2017-10-31 05:42:11 【问题描述】:我正在对 reddit cmets 进行文本分析,我想在 BigQuery 中计算 TF-IDF。
【问题讨论】:
【参考方案1】:此查询分为 5 个阶段:
-
获取我感兴趣的所有 reddit 帖子。规范化单词(LOWER,只保留字母和撇号 `'`,并取消转义一些 HTML 实体)。将这些单词拆分成一个数组。
计算每个文档中每个单词的 tf(词频) - 计算它在每个文档中出现的次数,相对于所述文档中的单词数。
对于每个单词,计算包含它的文档数。
从(3.),得到idf(inverse document frequency):“包含词的文档的反分数,通过将文档总数除以包含该词的文档数,然后取对数得到商数”
tf*idf 相乘得到 tf-idf。
此查询通过将获得的值向上传递链来设法一次性完成此操作。
#standardSQL
-- TF-IDF over reddit comments: one "document" = one comment (link_id/id).
WITH words_by_post AS (
  -- Normalize each comment body (lowercase, un-escape '&', drop other HTML
  -- entities), then split it into an array of words.
  SELECT CONCAT(link_id, '/', id) id, REGEXP_EXTRACT_ALL(
    REGEXP_REPLACE(REGEXP_REPLACE(LOWER(body), '&amp;', '&'), r'&[a-z]{2,4};', '*')
    , r'[a-z]{2,20}\'?[a-z]+') words  -- a word: 3+ letters, optional inner apostrophe
  , COUNT(*) OVER() docs_n            -- total number of documents
  FROM `fh-bigquery.reddit_comments.2017_07`
  WHERE body NOT IN ('[deleted]', '[removed]')
  AND subreddit = 'movies'
  AND score > 100
), words_tf AS (
  -- Term frequency: how often each word appears in a document, relative to
  -- that document's total word count.
  SELECT id, word, COUNT(*) / ARRAY_LENGTH(ANY_VALUE(words)) tf
  , ARRAY_LENGTH(ANY_VALUE(words)) words_in_doc
  , ANY_VALUE(docs_n) docs_n
  FROM words_by_post, UNNEST(words) word
  GROUP BY id, word
  HAVING words_in_doc > 30  -- ignore very short comments
), docs_idf AS (
  -- Inverse document frequency: log(total docs / docs containing the word).
  SELECT tf.id, word, tf.tf, ARRAY_LENGTH(tfs) docs_with_word
  , LOG(docs_n / ARRAY_LENGTH(tfs)) idf
  FROM (
    SELECT word, ARRAY_AGG(STRUCT(tf, id, words_in_doc)) tfs, ANY_VALUE(docs_n) docs_n
    FROM words_tf
    GROUP BY word
  ), UNNEST(tfs) tf
)
SELECT *, tf * idf tfidf
FROM docs_idf
WHERE docs_with_word > 1  -- drop words that appear in only one document
ORDER BY tfidf DESC
LIMIT 1000
【讨论】:
我可能是错的,但不知何故我觉得在 REGEXP_EXTRACT_ALL 中,你应该使用 r'[a-z]{2,20}\'?[a-z]+' 而不是 r'[a-z]{2,20}\'?[a-z]*'
感谢您查看@MikhailBerlyant!不同之处在于单词是否可以以'
结尾?
这正是我的想法 - 所以检查一下评论是否有单词环绕撇号 - 例如'abc' :o)
我成功地让它在一个完全不同的数据集上工作。谢谢!现在我正在尝试将词干添加到组合中。如果你有一个词干版本,我很乐意看到它。我的计划是使用一个简单的字典词干分析器并进行连接以将“单词”替换为“词干”。对更好的方法有什么建议吗?【参考方案2】:
堆栈溢出数据集版本:
#standardSQL
-- TF-IDF over Stack Overflow questions: one "document" = title + body.
WITH words_by_post AS (
  -- Normalize title+body (lowercase, un-escape '&', strip remaining HTML
  -- entities and simple tags), then split into an array of words.
  SELECT id, REGEXP_EXTRACT_ALL(
    REGEXP_REPLACE(
      REGEXP_REPLACE(
        REGEXP_REPLACE(
          LOWER(CONCAT(title, ' ', body))
          , r'&amp;', '&')                -- un-escape '&'
        , r'&[a-z]*;', '')                -- strip other HTML entities
      , r'<[= \-:a-z0-9/\."]*>', '')      -- strip simple HTML tags
    , r'[a-z]{2,20}\'?[a-z]+') words      -- a word: 3+ letters, optional inner apostrophe
  , title, body
  , COUNT(*) OVER() docs_n                -- total number of documents
  FROM `bigquery-public-data.stackoverflow.posts_questions`
  WHERE score >= 150
), words_tf AS (
  -- Term frequency computed inside an ARRAY subquery, so no extra
  -- whole-dataset GROUP BY is needed at this stage.
  SELECT id, words
  , ARRAY(
      SELECT AS STRUCT w word, COUNT(*) / ARRAY_LENGTH(words) tf
      FROM UNNEST(words) a
      JOIN (SELECT DISTINCT w FROM UNNEST(words) w) b
      ON a = b.w
      WHERE w NOT IN ('the', 'and', 'for', 'this', 'that', 'can', 'but')  -- stop words
      GROUP BY word ORDER BY word
    ) tfs
  , ARRAY_LENGTH(words) words_in_doc
  , docs_n
  , title, body
  FROM words_by_post
  WHERE ARRAY_LENGTH(words) > 20  -- ignore very short posts
), docs_idf AS (
  -- Inverse document frequency: log(total docs / docs containing the word).
  SELECT *, LOG(docs_n / docs_with_word) idf
  FROM (
    SELECT id, word, tf.tf
    , COUNTIF(word IN UNNEST(words)) OVER(PARTITION BY word) docs_with_word
    , docs_n
    , title, body
    FROM words_tf, UNNEST(tfs) tf
  )
)
SELECT id, ARRAY_AGG(STRUCT(word, tf * idf AS tf_idf, docs_with_word) ORDER BY tf * idf DESC) tfidfs
-- , ANY_VALUE(title) title, ANY_VALUE(body) body -- makes query slower
FROM docs_idf
WHERE docs_with_word > 1  -- drop words that appear in only one document
GROUP BY id
与上一个答案相比的改进:需要在整个数据集中减少一个 GROUP BY,从而帮助查询运行得更快。
【讨论】:
【参考方案3】:这个可能更容易理解 - 采用一个已经包含每个电视台和每天的单词数的数据集:
-- In this query the combination of date+station represents a "document".
-- Input is pre-aggregated: one row per (word, date, station) with a count.
WITH data AS (
  SELECT *
  FROM `gdelt-bq.gdeltv2.iatv_1grams`
  WHERE DATE BETWEEN 20190601 AND 20190629  -- NOTE(review): dates appear to be INT64 yyyymmdd — confirm
  AND station NOT IN ('KSTS', 'KDTV')       -- excluded stations
)
, word_day_station AS (
  -- How many times each word is mentioned in each "document".
  SELECT word, SUM(count) counts, date, station
  FROM data
  GROUP BY word, date, station
)
, day_station AS (
  -- Total number of words in each "document".
  SELECT SUM(count) counts, date, station
  FROM data
  GROUP BY date, station
)
, tf AS (
  -- TF for a word in a "document".
  SELECT word, date, station, a.counts / b.counts tf
  FROM word_day_station a
  JOIN day_station b
  USING(date, station)
)
, word_in_docs AS (
  -- How many "documents" contain each word.
  SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
  FROM word_day_station
  GROUP BY word
)
, total_docs AS (
  -- Total number of "documents".
  SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
  FROM data
)
, idf AS (
  -- IDF for a word: log(total docs / docs containing the word).
  SELECT word, LOG(total_docs.total_docs / indocs) idf
  FROM word_in_docs
  CROSS JOIN total_docs
)
-- Final result: per day, the top-5 TF-IDF words of each station.
SELECT date,
  ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
FROM (
  SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
  FROM (
    SELECT word, date, station, tf.tf * idf.idf tfidf
    FROM tf
    JOIN idf
    USING(word)
  )
  GROUP BY date, station
)
GROUP BY date
ORDER BY date DESC
【讨论】:
以上是关于如何使用 SQL (BigQuery) 计算 TF/IDF的主要内容,如果未能解决你的问题,请参考以下文章
如何将 terraform 文件(main.tf)拆分为多个文件(无模块)?
Bigquery:在 Bigquery 中计算余额或重写 SQL 脚本