Snowflake - 根据满足的条件删除重复行
Snowflake - Remove duplicate rows based on conditions met
我有下面的 table 对于相同的 ID 和相同的文件有重复的开始和结束(因为我不小心从 s3 加载了文件两次)。所以每个 id / start / end 应该只有 1 个文件名
当我在我的 table 上运行这个查询时:
SELECT filename, id, start, end from table where id = '262627';
我看到以下内容:
| filename | id | start | end | click_total |
|---|---|---|---|---|
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
但它应该是这样的:
| filename | id | start | end | click_total |
|---|---|---|---|---|
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
是否有一个查询可以运行删除重复的行,而无需在语句中对每个 id 进行硬编码?
您可以使用 row_number,只保留 row_number = 1 的行,并删除其余的行。
-- De-duplicate by keeping only the first row (rnum = 1) of each group.
-- The partition must include EVERY column that defines a duplicate
-- (filename, id, start1, end1); partitioning by filename/id alone would
-- also collapse rows whose start/end ranges legitimately differ.
select * from (
    select
        column1 as filename,
        column2 as id,
        column3 as start1,
        column4 as end1,
        row_number() over (
            partition by filename, id, start1, end1
            -- rows within a partition are identical, so any
            -- deterministic ordering key works here
            order by start1
        ) as rnum
    from values
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03')
) where rnum = 1
;
下面是一个创建中间 table 来存储去重后的不同行的示例。
-- Demo table containing the duplicated rows (each logical row loaded twice).
create or replace table tab1 as (
    select *
    from values
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03')
);
-- create intermediate table using analytical functions to remove duplicate rows.
-- Intermediate table holding the de-duplicated rows of tab1.
-- QUALIFY filters directly on the window result — no subquery needed.
-- The partition must include every column that defines a duplicate
-- (filename, id, start1, end1); partitioning by filename/id alone would
-- also collapse legitimately distinct start/end ranges.
create table uniq_tab1 as (
    select
        column1 as filename,
        column2 as id,
        column3 as start1,
        column4 as end1
    from tab1
    qualify row_number() over (
        partition by filename, id, start1, end1
        order by start1
    ) = 1
);
select * from uniq_tab1;
如果所有字段都是重复的,最简单的方法是将您的 table 替换为已删除重复数据的自身
-- If ALL columns are duplicated, the simplest fix is to rebuild the table
-- from a de-duplicated copy of itself.
-- CAUTION: CREATE OR REPLACE recreates the table (grants/metadata may be
-- affected) and is unsafe with concurrent writers — run during quiet time.
create or replace table your_table as
select distinct *
from your_table;
好吧,鉴于您在评论中提到实际上还有"另一列行号"(file_row_number),现在可以利用它来删除行号较大的重复行。
首先创建一个带有假数据的 table 用于演示:
-- Demo table simulating the double-loaded S3 data; file_row_number gives
-- every physical row a distinct value, so exact duplicates can be told apart.
CREATE OR REPLACE TABLE too_much_data AS
select *
from values
('name_2022/01/01.csv' ,262627, '2022-01-01','2022-01-02', 1),
('name_2022/01/01.csv' ,262627, '2022-01-01','2022-01-02', 2),
('name_2022/01/02.csv' ,262627, '2022-01-02','2022-01-03', 3),
('name_2022/01/02.csv' ,262627, '2022-01-02','2022-01-03', 4)
t(filename, id, start1, end1, file_row_number);
现在让我们看看 table:
-- Inspect the table before de-duplication.
SELECT * FROM too_much_data;
| FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
|---|---|---|---|---|
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 1 |
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 2 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 3 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 4 |
所以这些是我们要删除的行:
-- Preview the rows to be deleted: within each duplicate group
-- (filename, id, start1, end1), every row EXCEPT the one with the
-- smallest file_row_number is surplus.
SELECT filename, id, start1, end1, file_row_number
FROM too_much_data
QUALIFY file_row_number <> min(file_row_number) over(partition by filename, id, start1, end1);
| FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
|---|---|---|---|---|
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 2 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 4 |
因此 DELETE 可以是:
-- Delete the surplus rows: join against the subquery of rows whose
-- file_row_number is NOT the minimum within its duplicate group, matching
-- on every column (including file_row_number) so only those exact
-- physical rows are removed and one copy of each group survives.
DELETE FROM too_much_data as d
USING (
SELECT filename, id, start1, end1, file_row_number
FROM too_much_data
QUALIFY file_row_number <> min(file_row_number) over(partition by filename, id, start1, end1)
) as td
WHERE d.filename = td.filename and td.id = d.id and td.start1 = d.start1 and td.end1 = d.end1 and td.file_row_number = d.file_row_number;
| number of rows deleted |
|---|
| 2 |
-- Verify: exactly one row per (filename, id, start1, end1) remains.
SELECT * FROM too_much_data;
| FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
|---|---|---|---|---|
| name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 1 |
| name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 3 |
我有下面的 table 对于相同的 ID 和相同的文件有重复的开始和结束(因为我不小心从 s3 加载了文件两次)。所以每个 id / start / end 应该只有 1 个文件名
当我在我的 table 上运行这个查询时:
SELECT filename, id, start, end from table where id = '262627';
我看到以下内容:
filename | id | start | end | click_total |
---|---|---|---|---|
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
但它应该是这样的:
filename | id | start | end | click_total |
---|---|---|---|---|
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 142 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 753 |
是否有一个查询可以运行删除重复的行,而无需在语句中对每个 id 进行硬编码?
您可以使用 row_number,只保留 row_number = 1 的行,并删除其余的行。
-- De-duplicate by keeping only the first row (rnum = 1) of each group.
-- The partition must include EVERY column that defines a duplicate
-- (filename, id, start1, end1); partitioning by filename/id alone would
-- also collapse rows whose start/end ranges legitimately differ.
select * from (
    select
        column1 as filename,
        column2 as id,
        column3 as start1,
        column4 as end1,
        row_number() over (
            partition by filename, id, start1, end1
            -- rows within a partition are identical, so any
            -- deterministic ordering key works here
            order by start1
        ) as rnum
    from values
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03')
) where rnum = 1
;
下面是一个创建中间 table 来存储去重后的不同行的示例。
-- Demo table containing the duplicated rows (each logical row loaded twice).
create or replace table tab1 as (
    select *
    from values
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/01.csv', '262627', '2022-01-01', '2022-01-02'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03'),
        ('name_2022/01/02.csv', '262627', '2022-01-02', '2022-01-03')
);
-- create intermediate table using analytical functions to remove duplicate rows.
-- Intermediate table holding the de-duplicated rows of tab1.
-- QUALIFY filters directly on the window result — no subquery needed.
-- The partition must include every column that defines a duplicate
-- (filename, id, start1, end1); partitioning by filename/id alone would
-- also collapse legitimately distinct start/end ranges.
create table uniq_tab1 as (
    select
        column1 as filename,
        column2 as id,
        column3 as start1,
        column4 as end1
    from tab1
    qualify row_number() over (
        partition by filename, id, start1, end1
        order by start1
    ) = 1
);
select * from uniq_tab1;
如果所有字段都是重复的,最简单的方法是将您的 table 替换为已删除重复数据的自身
-- If ALL columns are duplicated, the simplest fix is to rebuild the table
-- from a de-duplicated copy of itself.
-- CAUTION: CREATE OR REPLACE recreates the table (grants/metadata may be
-- affected) and is unsafe with concurrent writers — run during quiet time.
create or replace table your_table as
select distinct *
from your_table;
好吧,鉴于您在评论中提到实际上还有"另一列行号"(file_row_number),现在可以利用它来删除行号较大的重复行。
首先创建一个带有假数据的 table 用于演示:
-- Demo table simulating the double-loaded S3 data; file_row_number gives
-- every physical row a distinct value, so exact duplicates can be told apart.
CREATE OR REPLACE TABLE too_much_data AS
select *
from values
('name_2022/01/01.csv' ,262627, '2022-01-01','2022-01-02', 1),
('name_2022/01/01.csv' ,262627, '2022-01-01','2022-01-02', 2),
('name_2022/01/02.csv' ,262627, '2022-01-02','2022-01-03', 3),
('name_2022/01/02.csv' ,262627, '2022-01-02','2022-01-03', 4)
t(filename, id, start1, end1, file_row_number);
现在让我们看看 table:
-- Inspect the table before de-duplication.
SELECT * FROM too_much_data;
FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
---|---|---|---|---|
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 1 |
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 2 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 3 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 4 |
所以这些是我们要删除的行:
-- Preview the rows to be deleted: within each duplicate group
-- (filename, id, start1, end1), every row EXCEPT the one with the
-- smallest file_row_number is surplus.
SELECT filename, id, start1, end1, file_row_number
FROM too_much_data
QUALIFY file_row_number <> min(file_row_number) over(partition by filename, id, start1, end1);
FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
---|---|---|---|---|
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 2 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 4 |
因此 DELETE 可以是:
-- Delete the surplus rows: join against the subquery of rows whose
-- file_row_number is NOT the minimum within its duplicate group, matching
-- on every column (including file_row_number) so only those exact
-- physical rows are removed and one copy of each group survives.
DELETE FROM too_much_data as d
USING (
SELECT filename, id, start1, end1, file_row_number
FROM too_much_data
QUALIFY file_row_number <> min(file_row_number) over(partition by filename, id, start1, end1)
) as td
WHERE d.filename = td.filename and td.id = d.id and td.start1 = d.start1 and td.end1 = d.end1 and td.file_row_number = d.file_row_number;
number of rows deleted |
---|
2 |
-- Verify: exactly one row per (filename, id, start1, end1) remains.
SELECT * FROM too_much_data;
FILENAME | ID | START1 | END1 | FILE_ROW_NUMBER |
---|---|---|---|---|
name_2022/01/01.csv | 262627 | 2022-01-01 | 2022-01-02 | 1 |
name_2022/01/02.csv | 262627 | 2022-01-02 | 2022-01-03 | 3 |