如何删除MYSQL中千万级数据的重复数据?
How can I delete the repeated data in MYSQL with over ten million data in it?
我抓取了很多数据,保存到mysqltable,但是有一些重复的数据,我想通过有效的方式删除它们。
table (ads_info)
+------------------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+------------------+--------------+------+-----+---------+----------------+
| id | int(11) | NO | PRI | NULL | auto_increment |
| ad_id | varchar(64) | YES | MUL | NULL | |
| adset_id | varchar(64) | YES | MUL | NULL | |
| campaign_id | varchar(64) | YES | | NULL | |
| account_id | varchar(64) | YES | MUL | NULL | |
| conversion_specs | text | YES | | NULL | |
| creative | text | YES | | NULL | |
| effective_status | varchar(32) | YES | | NULL | |
| status | varchar(32) | YES | | NULL | |
| name | varchar(255) | YES | | NULL | |
| tracking_specs | text | YES | | NULL | |
| object_store_url | varchar(255) | YES | | NULL | |
| link | varchar(255) | YES | | NULL | |
| object_type | varchar(32) | YES | | NULL | |
| updated_time | timestamp | YES | | NULL | |
| created_time | timestamp | YES | | NULL | |
+------------------+--------------+------+-----+---------+----------------+
show create table ads_info
CREATE TABLE `ads_info` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`ad_id` varchar(64) DEFAULT NULL,
`adset_id` varchar(64) DEFAULT NULL,
`campaign_id` varchar(64) DEFAULT NULL,
`account_id` varchar(64) DEFAULT NULL,
`conversion_specs` text,
`creative` text,
`effective_status` varchar(32) DEFAULT NULL,
`status` varchar(32) DEFAULT NULL,
`name` varchar(255) DEFAULT NULL,
`tracking_specs` text,
`object_store_url` varchar(255) DEFAULT NULL,
`link` varchar(255) DEFAULT NULL,
`object_type` varchar(32) DEFAULT NULL,
`updated_time` timestamp NULL DEFAULT NULL,
`created_time` timestamp NULL DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `ad_id` (`ad_id`),
KEY `adset_id` (`adset_id`),
KEY `account_id` (`account_id`)
) ENGINE=InnoDB AUTO_INCREMENT=18827534 DEFAULT CHARSET=utf8mb4
table中有超过一千万条广告信息,大约有 40 条这样重复。我想删除所有那些重复的数据。
这是我糟糕的尝试
1)select全部重复ad_id
select ad_id from ads_info group by ad_id having count(id) > 1;
#42387 rows in set (12.42 sec)
查询耗时12s,不知如何优化
2) 使用子查询删除所有这些重复的数据。
delete from ads_info where ad_id in ( select ad_id from (select ad_id from ads_info group by ad_id having count(id) > 1) t);
但是我试了一下没有得到mysql的回复,好像挂了。
如何删除这些重复的数据?
如果你想删除所有出现的事件,那么
您可以尝试使用连接
而不是 IN 子句
delete ads_info
from ads_info
INNER JOIN (
select ad_id
from ads_info
group by ad_id
having count(*) > 1
) T ON T.ad_id = ads_info.ad_id
请确保您在 ads_info.ad_id
上有索引
如果您有索引 .. 但查询优化器不使用并且您确定这是一个有效的索引,您可以尝试使用 USE 或 FORCE
delete ads_info
from ads_info
INNER JOIN (
select ad_id
from ads_info
group by ad_id
having count(*) > 1
) T FORCE INDEX FOR JOIN (`ad_id`) ON T.ad_id = ads_info.ad_id
您首先需要一个 UNIQUE
密钥。这将添加 and dedup:
ALTER IGNORE TABLE ads_info
ADD UNIQUE KEY(ad_id);
我抓取了很多数据,保存到mysqltable,但是有一些重复的数据,我想通过有效的方式删除它们。
table (ads_info)
+------------------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+------------------+--------------+------+-----+---------+----------------+
| id | int(11) | NO | PRI | NULL | auto_increment |
| ad_id | varchar(64) | YES | MUL | NULL | |
| adset_id | varchar(64) | YES | MUL | NULL | |
| campaign_id | varchar(64) | YES | | NULL | |
| account_id | varchar(64) | YES | MUL | NULL | |
| conversion_specs | text | YES | | NULL | |
| creative | text | YES | | NULL | |
| effective_status | varchar(32) | YES | | NULL | |
| status | varchar(32) | YES | | NULL | |
| name | varchar(255) | YES | | NULL | |
| tracking_specs | text | YES | | NULL | |
| object_store_url | varchar(255) | YES | | NULL | |
| link | varchar(255) | YES | | NULL | |
| object_type | varchar(32) | YES | | NULL | |
| updated_time | timestamp | YES | | NULL | |
| created_time | timestamp | YES | | NULL | |
+------------------+--------------+------+-----+---------+----------------+
show create table ads_info
CREATE TABLE `ads_info` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`ad_id` varchar(64) DEFAULT NULL,
`adset_id` varchar(64) DEFAULT NULL,
`campaign_id` varchar(64) DEFAULT NULL,
`account_id` varchar(64) DEFAULT NULL,
`conversion_specs` text,
`creative` text,
`effective_status` varchar(32) DEFAULT NULL,
`status` varchar(32) DEFAULT NULL,
`name` varchar(255) DEFAULT NULL,
`tracking_specs` text,
`object_store_url` varchar(255) DEFAULT NULL,
`link` varchar(255) DEFAULT NULL,
`object_type` varchar(32) DEFAULT NULL,
`updated_time` timestamp NULL DEFAULT NULL,
`created_time` timestamp NULL DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `ad_id` (`ad_id`),
KEY `adset_id` (`adset_id`),
KEY `account_id` (`account_id`)
) ENGINE=InnoDB AUTO_INCREMENT=18827534 DEFAULT CHARSET=utf8mb4
table中有超过一千万条广告信息,大约有 40 条这样重复。我想删除所有那些重复的数据。
这是我糟糕的尝试
1)select全部重复ad_id
select ad_id from ads_info group by ad_id having count(id) > 1;
#42387 rows in set (12.42 sec)
查询耗时12s,不知如何优化
2) 使用子查询删除所有这些重复的数据。
delete from ads_info where ad_id in ( select ad_id from (select ad_id from ads_info group by ad_id having count(id) > 1) t);
但是我试了一下没有得到mysql的回复,好像挂了。
如何删除这些重复的数据?
如果你想删除所有出现的事件,那么
您可以尝试使用连接
而不是 IN 子句 delete ads_info
from ads_info
INNER JOIN (
select ad_id
from ads_info
group by ad_id
having count(*) > 1
) T ON T.ad_id = ads_info.ad_id
请确保您在 ads_info.ad_id
上有索引如果您有索引 .. 但查询优化器不使用并且您确定这是一个有效的索引,您可以尝试使用 USE 或 FORCE
delete ads_info
from ads_info
INNER JOIN (
select ad_id
from ads_info
group by ad_id
having count(*) > 1
) T FORCE INDEX FOR JOIN (`ad_id`) ON T.ad_id = ads_info.ad_id
您首先需要一个 UNIQUE
密钥。这将添加 and dedup:
ALTER IGNORE TABLE ads_info
ADD UNIQUE KEY(ad_id);