Unpivot table in Databricks/Spark SQL
Posted: 2021-08-17 20:46:04

[Question] I am having trouble migrating a script from Oracle SQL to Spark SQL (or SQL Server). I need to run it on a Databricks instance, but I could not find an unpivot function in SQL Server/Spark SQL. I tried rewriting the query another way, but the resulting data differs. Can you help me see what I am doing wrong?
SQL with UNPIVOT:
select count(*) from
(WITH
pq AS (
SELECT
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO,0,3) || substr(TX_TRECHO,5,3) ,substr(TX_TRECHO,5,3) || substr(TX_TRECHO,0,3))) as TX_MERCADO,
SUM(NB_QUANTIDADE_PESQUISA) AS NB_QUANTIDADE_PESQUISA
FROM
bigq.bigq_data_pesquisas
WHERE
(date_format(DT_CAPTURE_DATE, 'yMMdd') >= '20210816'
AND date_format(DT_CAPTURE_DATE, 'yMMdd') <= '20210816')
AND DT_OUTBOUND_DATE >= DT_CAPTURE_DATE
AND (DT_INBOUND_DATE >= DT_OUTBOUND_DATE OR DT_INBOUND_DATE IS NULL)
GROUP BY
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO,0,3) || substr(TX_TRECHO,5,3) ,substr(TX_TRECHO,5,3) || substr(TX_TRECHO,0,3)))
),
pn AS (
SELECT
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO,0,3) || substr(TX_TRECHO,5,3) ,substr(TX_TRECHO,5,3) || substr(TX_TRECHO,0,3))) as TX_MERCADO,
SUM(NB_QUANTIDADE_PNRS) AS NB_QUANTIDADE_PNRS
FROM
bigq.bigq_data_pnrs
WHERE
(date_format(DT_CAPTURE_DATE, 'yMMdd') >= '20210816'
AND date_format(DT_CAPTURE_DATE, 'yMMdd') <= '20210816')
AND DT_OUTBOUND_DATE >= DT_CAPTURE_DATE
AND (DT_INBOUND_DATE >= DT_OUTBOUND_DATE OR DT_INBOUND_DATE IS NULL)
GROUP BY
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO,0,3) || substr(TX_TRECHO,5,3) ,substr(TX_TRECHO,5,3) || substr(TX_TRECHO,0,3)))
)
SELECT
DT_CAPTURE_DATE,
DT_FLIGHT_DATE,
TX_FLIGHT_TYPE,
TX_MERCADO,
SUM(NVL(NB_PESQUISA, 0)),
SUM(NVL(NB_PNRS, 0))
FROM (
SELECT
pq.DT_CAPTURE_DATE DT_CAPTURE_DATE,
pq.DT_OUTBOUND_DATE DT_OUTBOUND_DATE,
pq.DT_INBOUND_DATE DT_INBOUND_DATE,
pq.TX_MERCADO TX_MERCADO,
SUM(pq.NB_QUANTIDADE_PESQUISA) NB_PESQUISA,
SUM(pn.NB_QUANTIDADE_PNRS) NB_PNRS
FROM pq
LEFT JOIN pn
ON
pq.DT_CAPTURE_DATE = pn.DT_CAPTURE_DATE
AND pq.TX_MERCADO = pn.TX_MERCADO
AND pq.DT_OUTBOUND_DATE = pn.DT_OUTBOUND_DATE
AND pq.DT_INBOUND_DATE = pn.DT_INBOUND_DATE
GROUP BY
pq.DT_CAPTURE_DATE,
pq.DT_OUTBOUND_DATE,
pq.DT_INBOUND_DATE,
pq.TX_MERCADO
UNION
SELECT
pn.DT_CAPTURE_DATE DT_CAPTURE_DATE,
pn.DT_OUTBOUND_DATE DT_OUTBOUND_DATE,
pn.DT_INBOUND_DATE DT_INBOUND_DATE,
pn.TX_MERCADO TX_MERCADO,
SUM(pq.NB_QUANTIDADE_PESQUISA) NB_PESQUISA,
SUM(pn.NB_QUANTIDADE_PNRS) NB_PNRS
FROM pq
RIGHT JOIN pn
ON
pq.DT_CAPTURE_DATE = pn.DT_CAPTURE_DATE
AND pq.TX_MERCADO = pn.TX_MERCADO
AND pq.DT_OUTBOUND_DATE = pn.DT_OUTBOUND_DATE
AND pq.DT_INBOUND_DATE = pn.DT_INBOUND_DATE
GROUP BY
pn.DT_CAPTURE_DATE,
pn.DT_OUTBOUND_DATE,
pn.DT_INBOUND_DATE,
pn.TX_MERCADO
)
UNPIVOT(
(DT_FLIGHT_DATE)
FOR TX_FLIGHT_TYPE
IN (
(DT_OUTBOUND_DATE) AS 'Outbound',
(DT_INBOUND_DATE) AS 'Inbound'
)
)
GROUP BY
DT_CAPTURE_DATE,
DT_FLIGHT_DATE,
TX_FLIGHT_TYPE,
TX_MERCADO
)
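For reference, the UNPIVOT clause above turns the two date columns of each row into up to two output rows, one per non-NULL date: Oracle's UNPIVOT defaults to EXCLUDE NULLS. A minimal, self-contained illustration with hypothetical values (runnable in Oracle):

SELECT TX_FLIGHT_TYPE, DT_FLIGHT_DATE
FROM (
  -- one hypothetical aggregated row: outbound date set, inbound date NULL
  SELECT DATE '2021-08-20' AS DT_OUTBOUND_DATE,
         CAST(NULL AS DATE) AS DT_INBOUND_DATE
  FROM dual
)
UNPIVOT (
  DT_FLIGHT_DATE
  FOR TX_FLIGHT_TYPE IN (DT_OUTBOUND_DATE AS 'Outbound',
                         DT_INBOUND_DATE AS 'Inbound')
);
-- returns only ('Outbound', 2021-08-20); the NULL inbound value yields no row,
-- because UNPIVOT excludes NULLs unless INCLUDE NULLS is specified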
Here is my script:
select count(*) from
(WITH
pq AS (
SELECT
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO, 0, 3) || substr(TX_TRECHO, 5, 3), substr(TX_TRECHO, 5, 3) || substr(TX_TRECHO, 0, 3))) AS TX_MERCADO,
SUM(NB_QUANTIDADE_PESQUISA) AS NB_QUANTIDADE_PESQUISA
FROM
bigq.bigq_data_pesquisas
WHERE
(date_format(DT_CAPTURE_DATE, 'yMMdd') >= '20210816'
AND date_format(DT_CAPTURE_DATE, 'yMMdd') <= '20210816')
AND DT_OUTBOUND_DATE >= DT_CAPTURE_DATE
AND (DT_INBOUND_DATE >= DT_OUTBOUND_DATE OR DT_INBOUND_DATE IS NULL)
GROUP BY
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO, 0, 3) || substr(TX_TRECHO, 5, 3), substr(TX_TRECHO, 5, 3) || substr(TX_TRECHO, 0, 3)))
),
pn AS (
SELECT
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO, 0, 3) || substr(TX_TRECHO, 5, 3), substr(TX_TRECHO, 5, 3) || substr(TX_TRECHO, 0, 3))) AS TX_MERCADO,
SUM(NB_QUANTIDADE_PNRS) AS NB_QUANTIDADE_PNRS
FROM
bigq.bigq_data_pnrs
WHERE
(date_format(DT_CAPTURE_DATE, 'yMMdd') >= '20210816'
AND date_format(DT_CAPTURE_DATE, 'yMMdd') <= '20210816')
AND DT_OUTBOUND_DATE >= DT_CAPTURE_DATE
AND (DT_INBOUND_DATE >= DT_OUTBOUND_DATE OR DT_INBOUND_DATE IS NULL)
GROUP BY
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
DT_INBOUND_DATE,
upper(least(substr(TX_TRECHO, 0, 3) || substr(TX_TRECHO, 5, 3), substr(TX_TRECHO, 5, 3) || substr(TX_TRECHO, 0, 3)))
),
seu_bloco AS (
SELECT
pq.DT_CAPTURE_DATE DT_CAPTURE_DATE,
pq.DT_OUTBOUND_DATE DT_OUTBOUND_DATE,
pq.DT_INBOUND_DATE DT_INBOUND_DATE,
pq.TX_MERCADO TX_MERCADO,
SUM(pq.NB_QUANTIDADE_PESQUISA) NB_PESQUISA,
SUM(pn.NB_QUANTIDADE_PNRS) NB_PNRS
FROM pq
LEFT JOIN pn
ON
pq.DT_CAPTURE_DATE = pn.DT_CAPTURE_DATE
AND pq.TX_MERCADO = pn.TX_MERCADO
AND pq.DT_OUTBOUND_DATE = pn.DT_OUTBOUND_DATE
AND pq.DT_INBOUND_DATE = pn.DT_INBOUND_DATE
GROUP BY
pq.DT_CAPTURE_DATE,
pq.DT_OUTBOUND_DATE,
pq.DT_INBOUND_DATE,
pq.TX_MERCADO
UNION
SELECT
pn.DT_CAPTURE_DATE DT_CAPTURE_DATE,
pn.DT_OUTBOUND_DATE DT_OUTBOUND_DATE,
pn.DT_INBOUND_DATE DT_INBOUND_DATE,
pn.TX_MERCADO TX_MERCADO,
SUM(pq.NB_QUANTIDADE_PESQUISA) NB_PESQUISA,
SUM(pn.NB_QUANTIDADE_PNRS) NB_PNRS
FROM pq
RIGHT JOIN pn
ON
pq.DT_CAPTURE_DATE = pn.DT_CAPTURE_DATE
AND pq.TX_MERCADO = pn.TX_MERCADO
AND pq.DT_OUTBOUND_DATE = pn.DT_OUTBOUND_DATE
AND pq.DT_INBOUND_DATE = pn.DT_INBOUND_DATE
GROUP BY
pn.DT_CAPTURE_DATE,
pn.DT_OUTBOUND_DATE,
pn.DT_INBOUND_DATE,
pn.TX_MERCADO
)
SELECT
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE DT_FLIGHT_DATE,
'Outbound' TX_FLIGHT_TYPE,
TX_MERCADO,
SUM(NB_PESQUISA) NB_PESQUISA,
SUM(NB_PNRS) NB_PNRS
FROM seu_bloco
GROUP BY
DT_CAPTURE_DATE,
DT_OUTBOUND_DATE,
'Outbound',
TX_MERCADO
UNION ALL
SELECT
DT_CAPTURE_DATE,
DT_INBOUND_DATE,
'Inbound',
TX_MERCADO,
SUM(NB_PESQUISA),
SUM(NB_PNRS)
FROM seu_bloco
GROUP BY
DT_CAPTURE_DATE,
DT_INBOUND_DATE,
'Inbound',
TX_MERCADO)
[Comments]
Apparently the Spark SQL equivalent of UNPIVOT is STACK: sparkbyexamples.com/spark/…
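For example, the final unpivot step could be expressed with stack() roughly as follows (a sketch only, reusing the seu_bloco CTE from the rewritten script above; untested):

SELECT
  DT_CAPTURE_DATE,
  DT_FLIGHT_DATE,
  TX_FLIGHT_TYPE,
  TX_MERCADO,
  SUM(NVL(NB_PESQUISA, 0)) AS NB_PESQUISA,
  SUM(NVL(NB_PNRS, 0)) AS NB_PNRS
FROM (
  SELECT
    DT_CAPTURE_DATE,
    TX_MERCADO,
    NB_PESQUISA,
    NB_PNRS,
    -- stack(n, label1, value1, label2, value2, ...) emits n rows per input row
    stack(2,
          'Outbound', DT_OUTBOUND_DATE,
          'Inbound', DT_INBOUND_DATE) AS (TX_FLIGHT_TYPE, DT_FLIGHT_DATE)
  FROM seu_bloco
) s
-- unlike Oracle's UNPIVOT (EXCLUDE NULLS by default), stack() keeps NULL
-- values, so filter them out to match the original query
WHERE DT_FLIGHT_DATE IS NOT NULL
GROUP BY DT_CAPTURE_DATE, DT_FLIGHT_DATE, TX_FLIGHT_TYPE, TX_MERCADO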
Hi Nick, thanks for the help. As far as I can tell, I figured out the Unpivot function: it ignores null rows, which is why the values differ. My script actually had no problem, but it was reading the null rows... so I need to add a NOT NULL clause to make the two scripts match :D
[Answer 1]
As far as I can tell, I figured out the Unpivot function: it ignores rows where the unpivoted value is null, which is why the values differed. My script actually had no problem, but it was reading the null rows... so I need to add a NOT NULL clause for the two scripts to match.
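Concretely, Oracle's UNPIVOT defaults to EXCLUDE NULLS, while the UNION ALL rewrite keeps rows whose DT_INBOUND_DATE is NULL. A hedged sketch of the corrected 'Inbound' branch of the script above (the 'Outbound' branch needs no filter, since the CTEs already require DT_OUTBOUND_DATE >= DT_CAPTURE_DATE, which excludes NULL outbound dates):

SELECT
  DT_CAPTURE_DATE,
  DT_INBOUND_DATE AS DT_FLIGHT_DATE,
  'Inbound' AS TX_FLIGHT_TYPE,
  TX_MERCADO,
  SUM(NB_PESQUISA) AS NB_PESQUISA,
  SUM(NB_PNRS) AS NB_PNRS
FROM seu_bloco
-- the missing filter: drop the rows that UNPIVOT would exclude
WHERE DT_INBOUND_DATE IS NOT NULL
GROUP BY DT_CAPTURE_DATE, DT_INBOUND_DATE, TX_MERCADO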