岛屿和间隙的 SQL:岛屿可以重叠
Posted
技术标签:
【中文标题】岛屿和间隙的 SQL:岛屿可以重叠【英文标题】:SQL for islands-and-gaps: islands can overlap 【发布时间】:2016-10-20 22:42:01 【问题描述】:我有带证书的机器人。有两种证书。对于每种证书(由Certif_ID
标识),对于每个机器人,我都需要最新的认证日期跨度。
为了清楚起见更新:不重叠但连续的日期跨度被视为单个跨度。查看代码顶部示例表中的前两条记录。
日期跨度可能重叠! 这些必须被视为一个单一的跨度。这是我遇到问题的地方。
在 SQL Server 2012 中,按原样运行此代码以查看发生了什么。
BEGIN -- #certif_span
    -- Rebuild the sample data: one row per certification span held by a robot.
    IF OBJECT_ID('TEMPDB..#certif_span') IS NOT NULL
        DROP TABLE #certif_span;

    CREATE TABLE #certif_span
    ( Robot_ID  CHAR(3)
    , Certif_ID SMALLINT
    , d_Start   SMALLDATETIME
    , d_End     SMALLDATETIME );

    -- Dates are written in the unseparated 'YYYYMMDD' form: for DATETIME /
    -- SMALLDATETIME columns a separated 'YYYY-MM-DD' literal is re-interpreted
    -- (or rejected) when the session uses SET DATEFORMAT dmy or a language that
    -- implies it. Certif_ID values are numeric literals so no implicit
    -- varchar -> smallint conversion is needed, and the explicit column list
    -- keeps the INSERT independent of column order.
    INSERT INTO #certif_span (Robot_ID, Certif_ID, d_Start, d_End)
    VALUES
      ('210', 1, '20000101', '20010202')
    , ('210', 1, '20010203', '20011231')  -- starts the day after the row above ends
    , ('210', 1, '20000101', '20001231')
    , ('880', 1, '20010101', '20011231')
    , ('880', 1, '20020202', '20030201')
    , ('880', 1, '20030101', '20041231')  -- *
    , ('880', 7, '20100505', '20110504')
    , ('880', 7, '20110505', '20120210')
    , ('880', 7, '20130303', '20130404')
    , ('880', 7, '20130401', '20130505'); -- *
    -- * This line has dates that overlap with the line above
END
-- Display the seed rows as-is, with both dates rendered as yyyy-MM-dd text.
SELECT
    Robot_ID,
    Certif_ID,
    FORMAT(d_Start, 'yyyy-MM-dd') AS d_Start,
    FORMAT(d_End, 'yyyy-MM-dd')   AS d_End,
    'Here is the raw data'        AS commentary
FROM #certif_span AS src
ORDER BY
    Robot_ID,
    Certif_ID,
    d_End;
-- Stage the distinct certification spans in #prac_date_span.
-- DISTINCT alone removes exact duplicate rows; the former GROUP BY on the same
-- four columns was redundant. No ORDER BY is used: the row order of a table
-- created by SELECT ... INTO is not guaranteed, so readers must sort when they
-- query the table.
IF OBJECT_ID('TEMPDB..#prac_date_span') IS NOT NULL
    DROP TABLE #prac_date_span;

SELECT DISTINCT
    cs.Robot_ID
  , cs.Certif_ID
  , cs.d_Start
  , cs.d_End
INTO #prac_date_span
FROM #certif_span AS cs;
BEGIN
    -- Rebuild #prac_date_span_grp: chains of spans for the same robot and
    -- certificate in which each span starts exactly one day after the previous
    -- one ends. Spans that overlap (rather than abut) never satisfy the
    -- "d_Start - 1 = d_End" test, so they are NOT merged by this query.
    IF OBJECT_ID('TEMPDB..#prac_date_span_grp') IS NOT NULL
        DROP TABLE #prac_date_span_grp;

    WITH cte AS (
        -- Anchor: spans with no same-robot, same-certificate span ending on
        -- the day before they start, i.e. the first span of each chain.
        SELECT
            head.Robot_ID
          , head.Certif_ID
          , head.d_Start
          , head.d_End
        FROM #prac_date_span AS head
        WHERE NOT EXISTS (
            SELECT 1
            FROM #prac_date_span AS prev
            WHERE prev.Robot_ID  = head.Robot_ID
              AND prev.Certif_ID = head.Certif_ID
              AND prev.d_End     = head.d_Start - 1
        )

        UNION ALL

        -- Recursive step: extend a chain with the span that starts the day
        -- after the chain's current end, keeping the chain's original start.
        SELECT
            chain.Robot_ID
          , chain.Certif_ID
          , chain.d_Start
          , nxt.d_End
        FROM cte AS chain
        INNER JOIN #prac_date_span AS nxt
            ON  nxt.Robot_ID    = chain.Robot_ID
            AND nxt.Certif_ID   = chain.Certif_ID
            AND nxt.d_Start - 1 = chain.d_End
    )
    -- Each chain start yields one row per extension; keep the furthest end.
    SELECT
        Robot_ID
      , Certif_ID
      , d_Start
      , MAX(d_End) AS d_End
    INTO #prac_date_span_grp
    FROM cte
    GROUP BY
        Robot_ID
      , Certif_ID
      , d_Start
    ORDER BY
        Robot_ID
      , Certif_ID;
END
-- Display the chained spans produced by the recursive query, dates as text.
SELECT
    Robot_ID,
    Certif_ID,
    FORMAT(d_Start, 'yyyy-MM-dd')       AS d_Start,
    FORMAT(d_End, 'yyyy-MM-dd')         AS d_End,
    'Here is the grouped data (flawed)' AS commentary
FROM #prac_date_span_grp;
-- Latest span per robot and certificate: the greatest start and greatest end
-- among that pair's chained spans. The commentary column flags the two start
-- dates the author knows to be wrong for the sample data.
SELECT
    Robot_ID,
    Certif_ID,
    FORMAT(MAX(d_Start), 'yyyy-MM-dd') AS d_Start,
    FORMAT(MAX(d_End), 'yyyy-MM-dd')   AS d_End,
    'Final result: Start date ' +
        CASE
            WHEN FORMAT(MAX(d_Start), 'yyyy-MM-dd') = '2003-01-01' THEN 'should be 2002-02-02'
            WHEN FORMAT(MAX(d_Start), 'yyyy-MM-dd') = '2013-04-01' THEN 'should be 2013-03-03'
            ELSE 'good'
        END                            AS commentary
FROM #prac_date_span_grp
GROUP BY
    Robot_ID,
    Certif_ID;
最终结果应该是:
Robot_ID Certif_ID d_Start d_End
210 1 2000-01-01 2001-12-31
880 1 2002-02-02 2004-12-31
880 7 2013-03-03 2013-05-05
我一直在调整日期比较条件。在 cte 的下面这两处,-1 的作用似乎是允许相邻的日期跨度之间相差一天:
AND b.Certif_ID = a.Certif_ID
AND a.d_Start - 1 = b.d_End
...
AND b.Certif_ID = a.Certif_ID
AND b.d_Start - 1 = a.d_End
我确信问题就出在这里。我尝试把日期比较改为 >=(这样就需要处理最大递归限制),分组结果确实变了,但仍然不正确。
【问题讨论】:
请您显示预期的结果。 运行代码将显示这一点。不过,我会在早上发帖。 【参考方案1】:这不是一项简单的任务。我希望这能解决问题。
-- Sample data held in a table variable: one row per certification span.
DECLARE @certif_span TABLE
( Robot_ID  CHAR(3)
, Certif_ID SMALLINT
, StartDate DATE
, EndDate   DATE );

INSERT INTO @certif_span (Robot_ID, Certif_ID, StartDate, EndDate)
VALUES
  ('210', '1', '2000-01-01', '2001-02-02')
, ('210', '1', '2001-02-03', '2001-12-31')
, ('210', '1', '2000-01-01', '2000-12-31')
, ('880', '1', '2001-01-01', '2001-12-31')
, ('880', '1', '2002-02-02', '2003-02-01')
, ('880', '1', '2003-01-01', '2004-12-31')  -- * overlaps the row above
, ('880', '7', '2010-05-05', '2011-05-04')
, ('880', '7', '2011-05-05', '2012-02-10')
, ('880', '7', '2013-03-03', '2013-04-04')
, ('880', '7', '2013-04-01', '2013-05-05'); -- * overlaps the row above
-- Latest certification island per (Robot_ID, Certif_ID).
-- An island is a run of spans that overlap or are contiguous (the next span
-- starts no later than one day after the running end of the island).
WITH Src AS (
    -- Number each pair's spans in start-date order so they can be walked one
    -- at a time. Columns are listed explicitly instead of using a.*.
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY cs.Robot_ID, cs.Certif_ID
            ORDER BY cs.StartDate, cs.EndDate
        ) AS RN
      , cs.Robot_ID
      , cs.Certif_ID
      , cs.StartDate
      , cs.EndDate
    FROM @certif_span AS cs
)
, Islands AS (
    -- Anchor: the earliest span of each pair opens island 0.
    SELECT
        s.RN
      , s.Robot_ID
      , s.Certif_ID
      , s.StartDate
      , s.EndDate
      , 0         AS IslandNo
      , s.EndDate AS MovingEnd
    FROM Src AS s
    WHERE s.RN = 1

    UNION ALL

    -- Step to the next span (RN + 1) of the same pair.
    SELECT
        s.RN
      , s.Robot_ID
      , s.Certif_ID
      , s.StartDate
      , s.EndDate
        -- Same island when the span starts at most one day after the running
        -- end (overlapping or contiguous); otherwise a new island begins.
      , prev.IslandNo
          + CASE WHEN DATEDIFF(DAY, s.StartDate, prev.MovingEnd) >= -1 THEN 0 ELSE 1 END
        -- Running maximum end date, so a long early span keeps covering later
        -- short spans nested inside it.
      , CASE WHEN s.EndDate > prev.MovingEnd THEN s.EndDate ELSE prev.MovingEnd END
    FROM Src AS s
    INNER JOIN Islands AS prev
        ON  s.Robot_ID  = prev.Robot_ID
        AND s.Certif_ID = prev.Certif_ID
        AND s.RN        = prev.RN + 1
)
, LastIsland AS (
    -- Collapse each island to its overall start/end and rank islands so the
    -- most recent one (highest IslandNo) gets RN = 1.
    SELECT
        Robot_ID
      , Certif_ID
      , IslandNo
      , MIN(StartDate) AS startDate
      , MAX(EndDate)   AS EndDate
      , ROW_NUMBER() OVER (
            PARTITION BY Robot_ID, Certif_ID
            ORDER BY IslandNo DESC
        ) AS RN
    FROM Islands
    GROUP BY Robot_ID, Certif_ID, IslandNo
)
SELECT
    Robot_ID
  , Certif_ID
  , startDate
  , EndDate
FROM LastIsland
WHERE RN = 1
ORDER BY Robot_ID, Certif_ID
-- The recursion advances one span per level, so its depth equals the number of
-- spans in the largest (Robot_ID, Certif_ID) group. The default limit of 100
-- levels would abort the query for bigger groups; 0 lifts the limit, which is
-- safe here because RN strictly increases and is bounded by the row count.
OPTION (MAXRECURSION 0);
【讨论】:
@Smandoli,我更新了代码以适应重叠和连续的日期跨度。我还测试了一些日期范围。我希望,它会为你工作..【参考方案2】:这是一个令人头疼的问题,因为它不是典型的间隙和岛屿,所以我突然想到要先在日期维度之外创建间隙和岛屿。
现在,我确实比您预期的多了一个岛屿。但是,无论我怎么看,它似乎都是正确的。
我还应该注意,我使用 TVF(表值用户定义函数)来创建动态日期范围。这个逻辑可以很容易地移植到初步的 cte 中。一个计数/日历表也可以解决问题。
SQL
-- Gaps-and-islands over a generated calendar: expand the spans into the set of
-- covered days, group consecutive days into islands, then label each island.
-- NOTE(review): the covered-day set and the islands are computed over all rows
-- of #certif_span together (no Robot_ID / Certif_ID in the joins or GROUP BYs),
-- and each island is labelled with MIN(Robot_ID) / MIN(Certif_ID) of the spans
-- that start inside it. Spans of different robots or certificates that overlap
-- in time therefore merge into one island -- confirm this is intended.
;WITH covered_days AS (
    -- One row per calendar day covered by at least one span. Consecutive days
    -- share the same (RetSeq - row number) difference, which becomes GrpSeq.
    SELECT
        d.RetSeq
      , d.RetVal
      , d.RetSeq - ROW_NUMBER() OVER (ORDER BY d.RetSeq) AS GrpSeq
    FROM (
        SELECT DISTINCT
            cal.RetSeq
          , cal.RetVal
        FROM [dbo].[udf-Range-Date](
                 (SELECT MIN(d_Start) FROM #certif_span)
               , (SELECT MAX(d_End) FROM #certif_span)
               , 'DD'
               , 1
             ) AS cal
        INNER JOIN #certif_span AS span
            ON cal.RetVal BETWEEN span.d_Start AND span.d_End
    ) AS d
)
, day_islands AS (
    -- First and last covered day of each run of consecutive days.
    SELECT
        MIN(cd.RetVal) AS d_Start
      , MAX(cd.RetVal) AS d_End
    FROM covered_days AS cd
    GROUP BY cd.GrpSeq
)
SELECT
    MIN(span.Robot_ID)  AS Robot_ID
  , MIN(span.Certif_ID) AS Certif_ID
  , isl.d_Start
  , isl.d_End
FROM day_islands AS isl
INNER JOIN #certif_span AS span
    ON span.d_Start BETWEEN isl.d_Start AND isl.d_End
GROUP BY
    isl.d_Start
  , isl.d_End;
返回结果:
Robot_ID Certif_ID d_Start d_End
210 1 2000-01-01 2001-12-31
880 1 2002-02-02 2004-12-31
880 7 2010-05-05 2012-02-10 << Extra Mentioned
880 7 2013-03-03 2013-05-05
UDF(如果需要)
/*
[dbo].[udf-Range-Date]: inline table-valued function that generates a sequence
of datetime values starting at @R1 and stepping by @Incr units of @Part,
keeping only values that are <= @R2.

Parameters:
  @R1    first value of the sequence
  @R2    upper bound (inclusive) for the generated values
  @Part  step unit: 'YY', 'QQ', 'MM', 'WK', 'DD', 'HH', 'MI' or 'SS'
  @Incr  number of @Part units between consecutive values

Returns:
  RetSeq  1-based position of the value in the sequence
  RetVal  the generated datetime

Capacity: the number generator cross-joins a 10-row set 8 times, so at most
100 million steps can be produced.

Syntax:
  Select * from [dbo].[udf-Range-Date]('2016-10-01','2020-10-01','YY',1)
  Select * from [dbo].[udf-Range-Date]('2016-01-01','2017-01-01','MM',1)
*/
CREATE FUNCTION [dbo].[udf-Range-Date] (@R1 datetime, @R2 datetime, @Part varchar(10), @Incr int)
RETURNS TABLE
RETURN (
    WITH step_count (M) AS (
        -- Upper bound on the number of steps needed to get from @R1 to @R2.
        SELECT 1 + CASE @Part
                       WHEN 'YY' THEN DATEDIFF(YY, @R1, @R2) / @Incr
                       WHEN 'QQ' THEN DATEDIFF(QQ, @R1, @R2) / @Incr
                       WHEN 'MM' THEN DATEDIFF(MM, @R1, @R2) / @Incr
                       WHEN 'WK' THEN DATEDIFF(WK, @R1, @R2) / @Incr
                       WHEN 'DD' THEN DATEDIFF(DD, @R1, @R2) / @Incr
                       WHEN 'HH' THEN DATEDIFF(HH, @R1, @R2) / @Incr
                       WHEN 'MI' THEN DATEDIFF(MI, @R1, @R2) / @Incr
                       WHEN 'SS' THEN DATEDIFF(SS, @R1, @R2) / @Incr
                   END
    ),
    ten_rows (N) AS (
        -- Ten constant rows used as the base of the number generator.
        SELECT 1
        FROM (VALUES (1), (1), (1), (1), (1), (1), (1), (1), (1), (1)) AS v (n)
    ),
    tally (N) AS (
        -- 1 .. M, taken from the 10^8-row cross product.
        SELECT TOP (SELECT M FROM step_count)
               ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
        FROM ten_rows AS a
        CROSS JOIN ten_rows AS b
        CROSS JOIN ten_rows AS c
        CROSS JOIN ten_rows AS d
        CROSS JOIN ten_rows AS e
        CROSS JOIN ten_rows AS f
        CROSS JOIN ten_rows AS g
        CROSS JOIN ten_rows AS h
    ),
    steps (N, D) AS (
        -- Step 0 is @R1 itself; step N is @R1 advanced by N * @Incr units.
        SELECT 0, @R1
        UNION ALL
        SELECT N
             , CASE @Part
                   WHEN 'YY' THEN DATEADD(YY, N * @Incr, @R1)
                   WHEN 'QQ' THEN DATEADD(QQ, N * @Incr, @R1)
                   WHEN 'MM' THEN DATEADD(MM, N * @Incr, @R1)
                   WHEN 'WK' THEN DATEADD(WK, N * @Incr, @R1)
                   WHEN 'DD' THEN DATEADD(DD, N * @Incr, @R1)
                   WHEN 'HH' THEN DATEADD(HH, N * @Incr, @R1)
                   WHEN 'MI' THEN DATEADD(MI, N * @Incr, @R1)
                   WHEN 'SS' THEN DATEADD(SS, N * @Incr, @R1)
               END
        FROM tally
    )
    SELECT
        N + 1 AS RetSeq
      , D     AS RetVal
    FROM steps
    CROSS JOIN step_count
    WHERE D <= @R2  -- drop any step that overshoots the upper bound
)
【讨论】:
此解决方案解决了所提出的问题,并且具有教育意义 (+1)。 @Ahmed Saeed 得到了公认的答案,因为代码更便携(没有 UDF),更重要的是因为它成功地在我的大量生产数据上运行。 @Smandoli 我有一个非常简单的晴雨表......每次都更好地获胜。不过,我确实很感谢您的反馈。以上是关于岛屿和间隙的 SQL:岛屿可以重叠的主要内容,如果未能解决你的问题,请参考以下文章