Netezza 分区排除特定记录
Netezza: PARTITION BY while excluding specific records
我在 Netezza 的 web_event table 中有一些数据,格式如下。
vstr_id | sessn_id | sessn_ts | wbpg_nm
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login
V1 | V1S1 | 02-02-2015 09:22:00 | -1
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts
V1 | V1S1 | 02-02-2015 09:32:00 | -1
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search
V1 | V1S1 | 02-02-2015 09:55:00 | -1
V2 | V2S1 | 02-02-2015 09:10:00 | /home
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal
这是我的来源table。
我正在尝试使用 web_event table 并创建另一个 table,如下所示。
我希望 sessn_durtn table 和 time_on_pg table 像下面这样加载。
1) time_on_page 列:这是当前页面和下一页加载之间的时间差,如果没有其他事件或页面加载,会话的最后一页可以有 0 秒。它可以用分钟或秒来表示。
-- Desired load into time_on_pg: the four web_event columns plus a computed time_on_page column.
Insert into time_on_pg (select VSTR_ID,
SESSN_ID,
sessn_ts,
WBPG_NM,
????? as time_on_page -- placeholder for the expression being asked for; not valid SQL as written
from web_event)
vstr_id | sessn_id | sessn_ts | wbpg_nm | wanted_time_on_page | currently_known_time_on_page
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login | 10mins | 2mins
V1 | V1S1 | 02-02-2015 09:22:00 | -1 | | 8mins
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts | 20mins | 2mins
V1 | V1S1 | 02-02-2015 09:32:00 | -1 | | 18mins
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search | 5mins | 5mins
V1 | V1S1 | 02-02-2015 09:55:00 | -1 | |
V2 | V2S1 | 02-02-2015 09:10:00 | /home | 5mins | 5mins
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps | |
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news | 3mins | 3mins
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal | |
我们如何在 Netezza 或任何 SQL 查询中执行此操作?
我有计算 currently_known_time_on_page 的逻辑
-- Partial query: currently_known_time_on_page is implemented, wanted_time_on_page is still a placeholder.
SELECT vstr_id,
sessn_id,
sessn_ts,
wbpg_nm,
???????? AS wanted_time_on_page, -- placeholder for the expression being asked for; not valid SQL as written
-- LAG over a descending sort reads the chronologically next event of the same visitor/session;
-- the gap to that event is converted to seconds via extract(epoch ...).
-- NOTE(review): event_ts is not among the web_event columns shown above; presumably sessn_ts is meant -- confirm.
extract(epoch from (lag(event_ts) over (partition by vstr_id, sessn_id order by event_ts DESC) - event_ts)) AS currently_known_time_on_page
from web_event;
wanted_time_on_page 和 currently_known_time_on_page 的主要区别是在计算除最后一页之外的时间差时消除了“-1”页。
我认为 event_ts 和 sessn_ts 是同一列?无论如何,下面这个查询应该适合你:它使用 OUTER APPLY 在表中查找时间晚于当前行(> sessn_ts)且网页不是 '-1' 的记录,然后按时间升序取第一条(TOP 1)。
只需将 table 名称更改为您的 table。
这是一个主要使用 OUTER APPLY、同时借助公用表表达式(CTE)来获取最后一个 '-1' 事件时间的解决方案。
-- Time on page per web event (T-SQL), skipping intermediate '-1' events.
--   * A real page is timed against the next real page of the same session.
--   * The last real page of a session is timed against the session's last
--     '-1' event, but only when that event comes AFTER the page.
--   * '-1' rows, and a last page with no later event, get NULL.
;WITH cteMaxNeg1 AS (
    -- Per session: timestamp of the latest '-1' event and of the latest real page.
    SELECT
        we.sessn_id
        ,MaxNeg1SessnTs = MAX(CASE WHEN we.webpg_nm = '-1' THEN we.sessn_ts ELSE NULL END)
        ,MaxPageSessnTs = MAX(CASE WHEN we.webpg_nm <> '-1' THEN we.sessn_ts ELSE NULL END)
    FROM
        @WebEvents we
    GROUP BY
        we.sessn_id
)
SELECT
    we.*
    -- LAG over a descending sort reads the chronologically next event; the gap is
    -- left as a raw difference, with 0 (cast to DATETIME) for the session's last event.
    ,currently_known_time_on_page = ISNULL(LAG(we.sessn_ts) OVER (PARTITION BY we.vstr_id, we.sessn_id ORDER BY we.sessn_ts DESC) - we.sessn_ts, CAST(0 AS DATETIME))
    ,WantedTimeOnPage = CASE
        -- Last real page: measure up to the closing '-1' event. The "> we.sessn_ts"
        -- guard keeps an earlier '-1' from producing a negative duration; in that
        -- case the row falls through to the next branch, where o.sessn_ts is NULL.
        WHEN we.sessn_ts = m.MaxPageSessnTs
             AND we.webpg_nm <> '-1'
             AND m.MaxNeg1SessnTs > we.sessn_ts
            THEN DATEDIFF(MINUTE, we.sessn_ts, m.MaxNeg1SessnTs)
        -- Any other real page: measure up to the next real page (NULL when there is none).
        WHEN we.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, we.sessn_ts, o.sessn_ts)
        ELSE NULL
    END
FROM
    @WebEvents we
    LEFT JOIN cteMaxNeg1 m
        ON we.sessn_id = m.sessn_id
    -- Next real page (earliest later non-'-1' event) in the same session.
    OUTER APPLY (
        SELECT TOP (1)
            i.sessn_ts
        FROM
            @WebEvents i
        WHERE
            i.webpg_nm <> '-1'
            AND i.sessn_id = we.sessn_id
            AND i.sessn_ts > we.sessn_ts
        ORDER BY
            i.sessn_ts ASC
    ) o
ORDER BY
    we.sessn_id
    ,we.sessn_ts
这是一个仅使用 CTE 和 window 函数的解决方案
-- Time on page per web event using only a CTE and window functions (T-SQL).
-- Real pages and '-1' events are numbered separately within each session, so
-- "next real page" is simply RowNum + 1 inside the real-page group.
;WITH cte AS (
    SELECT
        *
        -- Ascending position within (session, '-1' group vs. real-page group).
        ,RowNum = ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts
        )
        -- Descending position within the same group: 1 marks the last '-1' event
        -- of a session AND (in the other group) the last real page of a session.
        ,LastNeg1RowNum = ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts DESC
        )
    FROM
        @WebEvents
)
SELECT
    c1.*
    ,WantedTimeOnPage = CASE
        -- Last real page: time until the closing '-1' event (NULL when c3 did not match).
        WHEN c1.LastNeg1RowNum = 1 AND c1.webpg_nm <> '-1' THEN DATEDIFF(MINUTE, c1.sessn_ts, c3.sessn_ts)
        -- Other real pages: time until the next real page.
        WHEN c1.webpg_nm <> '-1' THEN DATEDIFF(MINUTE, c1.sessn_ts, c2.sessn_ts)
        ELSE NULL
    END
FROM
    cte c1
    -- c2: the next real page of the same session.
    LEFT JOIN cte c2
        ON c1.sessn_id = c2.sessn_id
        AND (c1.RowNum + 1) = c2.RowNum
        AND c2.webpg_nm <> '-1'
    -- c3: the session's last '-1' event, matched only when it is later than the
    -- current row, so a '-1' that precedes the last page cannot yield a negative duration.
    LEFT JOIN cte c3
        ON c1.sessn_id = c3.sessn_id
        AND c3.LastNeg1RowNum = 1
        AND c3.webpg_nm = '-1'
        AND c3.sessn_ts > c1.sessn_ts
ORDER BY
    c1.sessn_id
    ,c1.sessn_ts
我用你的测试数据:
-- Test fixture: the sample web events from the question, held in a table variable.
DECLARE @WebEvents AS TABLE (
    vstr_id  CHAR(2),
    sessn_id CHAR(5),
    sessn_ts DATETIME,
    webpg_nm VARCHAR(100)
);

INSERT INTO @WebEvents (vstr_id, sessn_id, sessn_ts, webpg_nm)
VALUES
    -- Visitor V1, session V1S1: three pages, each followed by a '-1' event.
    ('V1', 'V1S1', '02-02-2015 09:20:00', '/home/login'),
    ('V1', 'V1S1', '02-02-2015 09:22:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:30:00', '/home/contacts'),
    ('V1', 'V1S1', '02-02-2015 09:32:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:50:00', '/home/search'),
    ('V1', 'V1S1', '02-02-2015 09:55:00', '-1'),
    -- Visitor V2, sessions V2S1 and V2S2: pages only, no '-1' events.
    ('V2', 'V2S1', '02-02-2015 09:10:00', '/home'),
    ('V2', 'V2S1', '02-02-2015 09:15:00', '/home/apps'),
    ('V2', 'V2S2', '02-02-2015 09:20:00', '/home/news'),
    ('V2', 'V2S2', '02-02-2015 09:23:00', '/home/news/internal');
我不知道你的数据集有多大,有多少内存可用。这个查询是在内存中完成的。您可以将每个单独的 CTE 变成临时表以提高速度。
-- Time on page per web event, built as a pipeline of CTEs:
--   1. rank events per session from the end,
--   2. keep real pages plus a trailing '-1' event (if the session ends with one),
--   3. time each kept row against the next kept row,
--   4. add back the intermediate '-1' rows with a NULL time_on_page.
-- Every UNION ALL below combines branches that are disjoint by construction
-- (wbpg_nm = '-1' vs. wbpg_nm <> '-1', or RowNum = 1 vs. RowNum <> 1), so no
-- de-duplication is needed and each web_event row yields exactly one output row.
WITH CTE_SessionOrder AS (
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts DESC) AS RowNum -- sorted descending so RowNum = 1 is the session's last event
    FROM
        web_event
)
,CTE_KeepLastRow AS (
    -- The session's final event, kept only when it is a '-1' event.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
    FROM
        CTE_SessionOrder
    WHERE
        RowNum = 1
        AND wbpg_nm = '-1'
)
,CTE_OtherRows AS (
    -- All real pages.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
    FROM
        CTE_SessionOrder
    WHERE
        wbpg_nm <> '-1'
)
,CTE_FilteredData AS (
    SELECT sessn_id, sessn_ts, wbpg_nm FROM CTE_KeepLastRow
    UNION ALL
    SELECT sessn_id, sessn_ts, wbpg_nm FROM CTE_OtherRows
)
,CTE_FilterOrderedData AS (
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts) AS RowNum -- now ascending: RowNum + 1 is the next kept row
    FROM
        CTE_FilteredData
)
,CTE_FinalData AS (
    -- Kept rows, each timed against the next kept row of the same session
    -- (NULL for the last kept row, which has no successor).
    SELECT
        D1.sessn_id
        ,D1.sessn_ts
        ,D1.wbpg_nm
        ,DATEDIFF(MINUTE, D1.sessn_ts, D2.sessn_ts) AS time_on_page
    FROM
        CTE_FilterOrderedData D1
        LEFT JOIN CTE_FilterOrderedData D2
            ON D1.sessn_id = D2.sessn_id
            AND D1.RowNum + 1 = D2.RowNum
    UNION ALL
    -- Intermediate '-1' events: reported, but never timed.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,CAST(NULL AS INT) AS time_on_page
    FROM
        CTE_SessionOrder
    WHERE
        RowNum <> 1
        AND wbpg_nm = '-1'
)
SELECT
    sessn_id
    ,sessn_ts
    ,wbpg_nm
    ,time_on_page
FROM
    CTE_FinalData
ORDER BY
    -- Explicit ordering: without it the row order of the result is not guaranteed.
    sessn_id
    ,sessn_ts
我在 Netezza 的 web_event table 中有一些数据,格式如下。
vstr_id | sessn_id | sessn_ts | wbpg_nm
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login
V1 | V1S1 | 02-02-2015 09:22:00 | -1
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts
V1 | V1S1 | 02-02-2015 09:32:00 | -1
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search
V1 | V1S1 | 02-02-2015 09:55:00 | -1
V2 | V2S1 | 02-02-2015 09:10:00 | /home
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal
这是我的来源table。
我正在尝试使用 web_event table 并创建另一个 table,如下所示。
我希望 sessn_durtn table 和 time_on_pg table 像下面这样加载。
1) time_on_page 列:这是当前页面和下一页加载之间的时间差,如果没有其他事件或页面加载,会话的最后一页可以有 0 秒。它可以用分钟或秒来表示。
-- Desired load into time_on_pg: the four web_event columns plus a computed time_on_page column.
Insert into time_on_pg (select VSTR_ID,
SESSN_ID,
sessn_ts,
WBPG_NM,
????? as time_on_page -- placeholder for the expression being asked for; not valid SQL as written
from web_event)
vstr_id | sessn_id | sessn_ts | wbpg_nm | wanted_time_on_page | currently_known_time_on_page
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login | 10mins | 2mins
V1 | V1S1 | 02-02-2015 09:22:00 | -1 | | 8mins
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts | 20mins | 2mins
V1 | V1S1 | 02-02-2015 09:32:00 | -1 | | 18mins
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search | 5mins | 5mins
V1 | V1S1 | 02-02-2015 09:55:00 | -1 | |
V2 | V2S1 | 02-02-2015 09:10:00 | /home | 5mins | 5mins
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps | |
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news | 3mins | 3mins
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal | |
我们如何在 Netezza 或任何 SQL 查询中执行此操作?
我有计算 currently_known_time_on_page 的逻辑
-- Partial query: currently_known_time_on_page is implemented, wanted_time_on_page is still a placeholder.
SELECT vstr_id,
sessn_id,
sessn_ts,
wbpg_nm,
???????? AS wanted_time_on_page, -- placeholder for the expression being asked for; not valid SQL as written
-- LAG over a descending sort reads the chronologically next event of the same visitor/session;
-- the gap to that event is converted to seconds via extract(epoch ...).
-- NOTE(review): event_ts is not among the web_event columns shown above; presumably sessn_ts is meant -- confirm.
extract(epoch from (lag(event_ts) over (partition by vstr_id, sessn_id order by event_ts DESC) - event_ts)) AS currently_known_time_on_page
from web_event;
wanted_time_on_page 和 currently_known_time_on_page 的主要区别是在计算除最后一页之外的时间差时消除了“-1”页。
我认为 event_ts 和 sessn_ts 是同一列?无论如何,下面这个查询应该适合你:它使用 OUTER APPLY 在表中查找时间晚于当前行(> sessn_ts)且网页不是 '-1' 的记录,然后按时间升序取第一条(TOP 1)。
只需将 table 名称更改为您的 table。
这是一个主要使用 OUTER APPLY、同时借助公用表表达式(CTE)来获取最后一个 '-1' 事件时间的解决方案。
-- Time on page per web event (T-SQL), skipping intermediate '-1' events.
--   * A real page is timed against the next real page of the same session.
--   * The last real page of a session is timed against the session's last
--     '-1' event, but only when that event comes AFTER the page.
--   * '-1' rows, and a last page with no later event, get NULL.
;WITH cteMaxNeg1 AS (
    -- Per session: timestamp of the latest '-1' event and of the latest real page.
    SELECT
        we.sessn_id
        ,MaxNeg1SessnTs = MAX(CASE WHEN we.webpg_nm = '-1' THEN we.sessn_ts ELSE NULL END)
        ,MaxPageSessnTs = MAX(CASE WHEN we.webpg_nm <> '-1' THEN we.sessn_ts ELSE NULL END)
    FROM
        @WebEvents we
    GROUP BY
        we.sessn_id
)
SELECT
    we.*
    -- LAG over a descending sort reads the chronologically next event; the gap is
    -- left as a raw difference, with 0 (cast to DATETIME) for the session's last event.
    ,currently_known_time_on_page = ISNULL(LAG(we.sessn_ts) OVER (PARTITION BY we.vstr_id, we.sessn_id ORDER BY we.sessn_ts DESC) - we.sessn_ts, CAST(0 AS DATETIME))
    ,WantedTimeOnPage = CASE
        -- Last real page: measure up to the closing '-1' event. The "> we.sessn_ts"
        -- guard keeps an earlier '-1' from producing a negative duration; in that
        -- case the row falls through to the next branch, where o.sessn_ts is NULL.
        WHEN we.sessn_ts = m.MaxPageSessnTs
             AND we.webpg_nm <> '-1'
             AND m.MaxNeg1SessnTs > we.sessn_ts
            THEN DATEDIFF(MINUTE, we.sessn_ts, m.MaxNeg1SessnTs)
        -- Any other real page: measure up to the next real page (NULL when there is none).
        WHEN we.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, we.sessn_ts, o.sessn_ts)
        ELSE NULL
    END
FROM
    @WebEvents we
    LEFT JOIN cteMaxNeg1 m
        ON we.sessn_id = m.sessn_id
    -- Next real page (earliest later non-'-1' event) in the same session.
    OUTER APPLY (
        SELECT TOP (1)
            i.sessn_ts
        FROM
            @WebEvents i
        WHERE
            i.webpg_nm <> '-1'
            AND i.sessn_id = we.sessn_id
            AND i.sessn_ts > we.sessn_ts
        ORDER BY
            i.sessn_ts ASC
    ) o
ORDER BY
    we.sessn_id
    ,we.sessn_ts
这是一个仅使用 CTE 和 window 函数的解决方案
-- Time on page per web event using only a CTE and window functions (T-SQL).
-- Real pages and '-1' events are numbered separately within each session, so
-- "next real page" is simply RowNum + 1 inside the real-page group.
;WITH cte AS (
    SELECT
        *
        -- Ascending position within (session, '-1' group vs. real-page group).
        ,RowNum = ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts
        )
        -- Descending position within the same group: 1 marks the last '-1' event
        -- of a session AND (in the other group) the last real page of a session.
        ,LastNeg1RowNum = ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts DESC
        )
    FROM
        @WebEvents
)
SELECT
    c1.*
    ,WantedTimeOnPage = CASE
        -- Last real page: time until the closing '-1' event (NULL when c3 did not match).
        WHEN c1.LastNeg1RowNum = 1 AND c1.webpg_nm <> '-1' THEN DATEDIFF(MINUTE, c1.sessn_ts, c3.sessn_ts)
        -- Other real pages: time until the next real page.
        WHEN c1.webpg_nm <> '-1' THEN DATEDIFF(MINUTE, c1.sessn_ts, c2.sessn_ts)
        ELSE NULL
    END
FROM
    cte c1
    -- c2: the next real page of the same session.
    LEFT JOIN cte c2
        ON c1.sessn_id = c2.sessn_id
        AND (c1.RowNum + 1) = c2.RowNum
        AND c2.webpg_nm <> '-1'
    -- c3: the session's last '-1' event, matched only when it is later than the
    -- current row, so a '-1' that precedes the last page cannot yield a negative duration.
    LEFT JOIN cte c3
        ON c1.sessn_id = c3.sessn_id
        AND c3.LastNeg1RowNum = 1
        AND c3.webpg_nm = '-1'
        AND c3.sessn_ts > c1.sessn_ts
ORDER BY
    c1.sessn_id
    ,c1.sessn_ts
我用你的测试数据:
-- Test fixture: the sample web events from the question, held in a table variable.
DECLARE @WebEvents AS TABLE (
    vstr_id  CHAR(2),
    sessn_id CHAR(5),
    sessn_ts DATETIME,
    webpg_nm VARCHAR(100)
);

INSERT INTO @WebEvents (vstr_id, sessn_id, sessn_ts, webpg_nm)
VALUES
    -- Visitor V1, session V1S1: three pages, each followed by a '-1' event.
    ('V1', 'V1S1', '02-02-2015 09:20:00', '/home/login'),
    ('V1', 'V1S1', '02-02-2015 09:22:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:30:00', '/home/contacts'),
    ('V1', 'V1S1', '02-02-2015 09:32:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:50:00', '/home/search'),
    ('V1', 'V1S1', '02-02-2015 09:55:00', '-1'),
    -- Visitor V2, sessions V2S1 and V2S2: pages only, no '-1' events.
    ('V2', 'V2S1', '02-02-2015 09:10:00', '/home'),
    ('V2', 'V2S1', '02-02-2015 09:15:00', '/home/apps'),
    ('V2', 'V2S2', '02-02-2015 09:20:00', '/home/news'),
    ('V2', 'V2S2', '02-02-2015 09:23:00', '/home/news/internal');
我不知道你的数据集有多大,有多少内存可用。这个查询是在内存中完成的。您可以将每个单独的 CTE 变成临时表以提高速度。
-- Time on page per web event, built as a pipeline of CTEs:
--   1. rank events per session from the end,
--   2. keep real pages plus a trailing '-1' event (if the session ends with one),
--   3. time each kept row against the next kept row,
--   4. add back the intermediate '-1' rows with a NULL time_on_page.
-- Every UNION ALL below combines branches that are disjoint by construction
-- (wbpg_nm = '-1' vs. wbpg_nm <> '-1', or RowNum = 1 vs. RowNum <> 1), so no
-- de-duplication is needed and each web_event row yields exactly one output row.
WITH CTE_SessionOrder AS (
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts DESC) AS RowNum -- sorted descending so RowNum = 1 is the session's last event
    FROM
        web_event
)
,CTE_KeepLastRow AS (
    -- The session's final event, kept only when it is a '-1' event.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
    FROM
        CTE_SessionOrder
    WHERE
        RowNum = 1
        AND wbpg_nm = '-1'
)
,CTE_OtherRows AS (
    -- All real pages.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
    FROM
        CTE_SessionOrder
    WHERE
        wbpg_nm <> '-1'
)
,CTE_FilteredData AS (
    SELECT sessn_id, sessn_ts, wbpg_nm FROM CTE_KeepLastRow
    UNION ALL
    SELECT sessn_id, sessn_ts, wbpg_nm FROM CTE_OtherRows
)
,CTE_FilterOrderedData AS (
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts) AS RowNum -- now ascending: RowNum + 1 is the next kept row
    FROM
        CTE_FilteredData
)
,CTE_FinalData AS (
    -- Kept rows, each timed against the next kept row of the same session
    -- (NULL for the last kept row, which has no successor).
    SELECT
        D1.sessn_id
        ,D1.sessn_ts
        ,D1.wbpg_nm
        ,DATEDIFF(MINUTE, D1.sessn_ts, D2.sessn_ts) AS time_on_page
    FROM
        CTE_FilterOrderedData D1
        LEFT JOIN CTE_FilterOrderedData D2
            ON D1.sessn_id = D2.sessn_id
            AND D1.RowNum + 1 = D2.RowNum
    UNION ALL
    -- Intermediate '-1' events: reported, but never timed.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,CAST(NULL AS INT) AS time_on_page
    FROM
        CTE_SessionOrder
    WHERE
        RowNum <> 1
        AND wbpg_nm = '-1'
)
SELECT
    sessn_id
    ,sessn_ts
    ,wbpg_nm
    ,time_on_page
FROM
    CTE_FinalData
ORDER BY
    -- Explicit ordering: without it the row order of the result is not guaranteed.
    sessn_id
    ,sessn_ts