计算 SQL 中文本列中一组单词的出现次数
Count the occurrences of a group of words in a text column in SQL
我有两个表如下:
-- Lookup table: each keyword (WORDS) and the topic (TOPIC) it belongs to.
CREATE TABLE keyword_tbl
(
    WORDS VARCHAR(100),
    TOPIC VARCHAR(100)
);

-- Explicit column list so the insert does not depend on column order.
-- The terminating semicolon is required so the next CREATE TABLE parses
-- as a separate statement when the script is run as a whole.
INSERT INTO keyword_tbl (WORDS, TOPIC)
VALUES ('leaf', 'nature'), ('leaves', 'nature'),
       ('wind', 'nature'), ('knife', 'utensils'),
       ('knives', 'utensils'), ('calf', 'animal'),
       ('calves', 'animal');

-- Free-text descriptions that are scanned for the keywords above.
CREATE TABLE content
(
    CONTENT_ID VARCHAR(100),
    DESCRIPTION VARCHAR(100)
);

INSERT INTO content (CONTENT_ID, DESCRIPTION)
VALUES ('uuid1', 'leaves fall in autumn like leafs'),
       ('uuid2', 'the calf is playing in the leaf, the knife'),
       ('uuid3', 'knives cutting the wind'),
       ('uuid4', 'he says hi'),
       ('uuid5', 'the calves running through the wind');
我希望能够计算每个主题中每个单词的出现次数。我的理想输出如下所示。
content_id | description | nature | utensils | animal |
---|---|---|---|---|
uuid1 | leaves fall in autumn like leafs | 2 | 0 | 0 |
uuid2 | the calf is playing in the leaf, the knife | 1 | 1 | 1 |
uuid3 | knives cutting the wind | 1 | 1 | 0 |
uuid4 | he says hi | 0 | 0 | 0 |
uuid5 | the calves running through the wind | 1 | 0 | 1 |
解释:
- 对于 uuid1,我们统计到 `leaves` 和 `leaf`(文本中写作 `leafs`),因此 nature 的计数为 2;
- 对于 uuid2,我们统计到 `calf`、`leaf`、`knife`,因此 nature、utensils 和 animal 的计数均为 1,依此类推……
有没有办法自动完成此操作?
像这样创建Split
函数
-- Splits @String on a single-character @Delimiter and returns one row per
-- non-empty token.
--
-- Parameters:
--   @String    - text to split; NULL or empty input returns no rows.
--   @Delimiter - the single character that separates tokens.
-- Returns:
--   A table with one column, items, holding each non-empty token.
CREATE FUNCTION [dbo].[Split]
(
    @String varchar(8000), @Delimiter char(1)
)
returns @temptable TABLE (items varchar(8000))
as
begin
    declare @idx int
    declare @slice varchar(8000)

    select @idx = 1
    if len(@String)<1 or @String is null return

    while @idx != 0
    begin
        -- Position of the next delimiter; 0 means the rest is the last token.
        set @idx = charindex(@Delimiter,@String)
        if @idx != 0
            set @slice = left(@String,@idx - 1)
        else
            set @slice = @String

        -- Consecutive delimiters produce empty slices, which are skipped.
        if(len(@slice)>0)
            insert into @temptable(items) values(@slice)

        -- Drop the consumed token and its delimiter. SUBSTRING is used instead
        -- of RIGHT(@String, LEN(@String) - @idx) because LEN ignores trailing
        -- spaces: with a space delimiter, 'a b ' lost its last token and 'a '
        -- passed a negative length to RIGHT.
        set @String = substring(@String, @idx + 1, 8000)
        if len(@String) = 0 break
    end
    return
end
然后使用该函数从您的 table 中获取字符串计数:
-- For each content row, count the keywords of each topic that appear as a
-- space-separated token in DESCRIPTION. Each alias now matches the TOPIC it
-- filters on (they were previously rotated: nature/utensils/animal were
-- labelled animal/nature/utensils).
-- Notes:
--   * Each keyword is counted at most once per row (IN is a membership test).
--   * Tokens are split on ' ' only, so 'leaf,' or 'leafs' do not match 'leaf'.
select CONTENT_ID, DESCRIPTION,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'nature') as nature,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'utensils') as utensils,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'animal') as animal
from content
这里的字符串是按 ' '(空格)拆分的;您数据中的 leafs 与 leaf 是不同的字符串,因此不会被计入。
可以使用 Snowflake 的 STRTOK_SPLIT_TO_TABLE 函数:Tokenizes a string with the given set of delimiters and flattens the results into rows.
-- Tokenize each description on the delimiter characters '(),. ' and count,
-- per topic, how many tokens equal a keyword.
-- LEFT JOIN keeps descriptions with no matching keyword (e.g. 'he says hi'),
-- so they appear with zero counts instead of being dropped by an inner join.
-- COUNT_IF treats the resulting NULL comparison as not counted.
SELECT c.CONTENT_ID, c.DESCRIPTION
      ,COUNT_IF(k.TOPIC = 'nature') AS nature
      ,COUNT_IF(k.TOPIC = 'utensils') AS utensils
      ,COUNT_IF(k.TOPIC = 'animal') AS animal
FROM content c
,LATERAL STRTOK_SPLIT_TO_TABLE(c.description, '(),. ') s
LEFT JOIN keyword_tbl k
  ON TRIM(s.value) = k.words
GROUP BY c.CONTENT_ID, c.DESCRIPTION
ORDER BY c.CONTENT_ID;
输出:
要处理“leaf”、“leafs”,需要更改连接条件:
-- Variant 1 - prefix match: the token starts with the keyword
-- (case-insensitive), so 'leafs' matches 'leaf'.
ON TRIM(s.value) ILIKE k.words|| '%'
-- Variant 2 - the token is the keyword itself or the keyword followed by a
-- single 's' (case-insensitive).
ON TRIM(s.value) ILIKE ANY (k.words, k.words|| 's')
输出:
再补充一个基于 regex 的解决方案。
-- Build one alternation pattern per topic, e.g. \b(leaf|leaves|wind)\b, and
-- count its matches in every description.
with cte (nature_pattern, animal_pattern, utensils_pattern) as
(
    -- Backslashes are doubled: inside a single-quoted Snowflake string
    -- constant '\b' is the backspace escape, not the regex word boundary.
    -- The boundaries wrap the whole group so the first and last keywords are
    -- anchored too (using '\b|\b' as the LISTAGG separator left them open).
    -- Keywords are inserted as-is; regex metacharacters in WORDS would need
    -- escaping. With strict boundaries 'leafs' does not count as 'leaf'.
    select '\\b(' || listagg(iff(topic = 'nature', words, null), '|') || ')\\b',
           '\\b(' || listagg(iff(topic = 'animal', words, null), '|') || ')\\b',
           '\\b(' || listagg(iff(topic = 'utensils', words, null), '|') || ')\\b'
    from keyword_tbl
)
select a.content_id,
       a.description,
       regexp_count(a.description, cte.nature_pattern) as nature,
       regexp_count(a.description, cte.utensils_pattern) as utensils,
       regexp_count(a.description, cte.animal_pattern) as animal
from content a
cross join cte;
备注:
- `|` 相当于 OR 条件(匹配任意一个关键词);
- `\b` 用于添加 word boundaries(单词边界),可以根据需要随意修改。
我有两个表如下:
-- Lookup table: each keyword (WORDS) and the topic (TOPIC) it belongs to.
CREATE TABLE keyword_tbl
(
    WORDS VARCHAR(100),
    TOPIC VARCHAR(100)
);

-- Explicit column list so the insert does not depend on column order.
-- The terminating semicolon is required so the next CREATE TABLE parses
-- as a separate statement when the script is run as a whole.
INSERT INTO keyword_tbl (WORDS, TOPIC)
VALUES ('leaf', 'nature'), ('leaves', 'nature'),
       ('wind', 'nature'), ('knife', 'utensils'),
       ('knives', 'utensils'), ('calf', 'animal'),
       ('calves', 'animal');

-- Free-text descriptions that are scanned for the keywords above.
CREATE TABLE content
(
    CONTENT_ID VARCHAR(100),
    DESCRIPTION VARCHAR(100)
);

INSERT INTO content (CONTENT_ID, DESCRIPTION)
VALUES ('uuid1', 'leaves fall in autumn like leafs'),
       ('uuid2', 'the calf is playing in the leaf, the knife'),
       ('uuid3', 'knives cutting the wind'),
       ('uuid4', 'he says hi'),
       ('uuid5', 'the calves running through the wind');
我希望能够计算每个主题中每个单词的出现次数。我的理想输出如下所示。
content_id | description | nature | utensils | animal |
---|---|---|---|---|
uuid1 | leaves fall in autumn like leafs | 2 | 0 | 0 |
uuid2 | the calf is playing in the leaf, the knife | 1 | 1 | 1 |
uuid3 | knives cutting the wind | 1 | 1 | 0 |
uuid4 | he says hi | 0 | 0 | 0 |
uuid5 | the calves running through the wind | 1 | 0 | 1 |
解释:
- 对于 uuid1,我们统计到 `leaves` 和 `leaf`(文本中写作 `leafs`),因此 nature 的计数为 2;
- 对于 uuid2,我们统计到 `calf`、`leaf`、`knife`,因此 nature、utensils 和 animal 的计数均为 1,依此类推……
有没有办法自动完成此操作?
像这样创建Split
函数
-- Splits @String on a single-character @Delimiter and returns one row per
-- non-empty token.
--
-- Parameters:
--   @String    - text to split; NULL or empty input returns no rows.
--   @Delimiter - the single character that separates tokens.
-- Returns:
--   A table with one column, items, holding each non-empty token.
CREATE FUNCTION [dbo].[Split]
(
    @String varchar(8000), @Delimiter char(1)
)
returns @temptable TABLE (items varchar(8000))
as
begin
    declare @idx int
    declare @slice varchar(8000)

    select @idx = 1
    if len(@String)<1 or @String is null return

    while @idx != 0
    begin
        -- Position of the next delimiter; 0 means the rest is the last token.
        set @idx = charindex(@Delimiter,@String)
        if @idx != 0
            set @slice = left(@String,@idx - 1)
        else
            set @slice = @String

        -- Consecutive delimiters produce empty slices, which are skipped.
        if(len(@slice)>0)
            insert into @temptable(items) values(@slice)

        -- Drop the consumed token and its delimiter. SUBSTRING is used instead
        -- of RIGHT(@String, LEN(@String) - @idx) because LEN ignores trailing
        -- spaces: with a space delimiter, 'a b ' lost its last token and 'a '
        -- passed a negative length to RIGHT.
        set @String = substring(@String, @idx + 1, 8000)
        if len(@String) = 0 break
    end
    return
end
使用从您的 table
获取字符串计数
-- For each content row, count the keywords of each topic that appear as a
-- space-separated token in DESCRIPTION. Each alias now matches the TOPIC it
-- filters on (they were previously rotated: nature/utensils/animal were
-- labelled animal/nature/utensils).
-- Notes:
--   * Each keyword is counted at most once per row (IN is a membership test).
--   * Tokens are split on ' ' only, so 'leaf,' or 'leafs' do not match 'leaf'.
select CONTENT_ID, DESCRIPTION,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'nature') as nature,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'utensils') as utensils,
    (select COUNT(1) from keyword_tbl where WORDS in (select items from dbo.Split(DESCRIPTION,' ')) AND TOPIC = 'animal') as animal
from content
这里的字符串是按 ' '(空格)拆分的;您数据中的 leafs 与 leaf 是不同的字符串,因此不会被计入。
可以使用 Snowflake 的 STRTOK_SPLIT_TO_TABLE 函数:Tokenizes a string with the given set of delimiters and flattens the results into rows.
-- Tokenize each description on the delimiter characters '(),. ' and count,
-- per topic, how many tokens equal a keyword.
-- LEFT JOIN keeps descriptions with no matching keyword (e.g. 'he says hi'),
-- so they appear with zero counts instead of being dropped by an inner join.
-- COUNT_IF treats the resulting NULL comparison as not counted.
SELECT c.CONTENT_ID, c.DESCRIPTION
      ,COUNT_IF(k.TOPIC = 'nature') AS nature
      ,COUNT_IF(k.TOPIC = 'utensils') AS utensils
      ,COUNT_IF(k.TOPIC = 'animal') AS animal
FROM content c
,LATERAL STRTOK_SPLIT_TO_TABLE(c.description, '(),. ') s
LEFT JOIN keyword_tbl k
  ON TRIM(s.value) = k.words
GROUP BY c.CONTENT_ID, c.DESCRIPTION
ORDER BY c.CONTENT_ID;
输出:
要处理“leaf”、“leafs”,需要更改连接条件:
-- Variant 1 - prefix match: the token starts with the keyword
-- (case-insensitive), so 'leafs' matches 'leaf'.
ON TRIM(s.value) ILIKE k.words|| '%'
-- Variant 2 - the token is the keyword itself or the keyword followed by a
-- single 's' (case-insensitive).
ON TRIM(s.value) ILIKE ANY (k.words, k.words|| 's')
输出:
再补充一个基于 regex 的解决方案。
-- Build one alternation pattern per topic, e.g. \b(leaf|leaves|wind)\b, and
-- count its matches in every description.
with cte (nature_pattern, animal_pattern, utensils_pattern) as
(
    -- Backslashes are doubled: inside a single-quoted Snowflake string
    -- constant '\b' is the backspace escape, not the regex word boundary.
    -- The boundaries wrap the whole group so the first and last keywords are
    -- anchored too (using '\b|\b' as the LISTAGG separator left them open).
    -- Keywords are inserted as-is; regex metacharacters in WORDS would need
    -- escaping. With strict boundaries 'leafs' does not count as 'leaf'.
    select '\\b(' || listagg(iff(topic = 'nature', words, null), '|') || ')\\b',
           '\\b(' || listagg(iff(topic = 'animal', words, null), '|') || ')\\b',
           '\\b(' || listagg(iff(topic = 'utensils', words, null), '|') || ')\\b'
    from keyword_tbl
)
select a.content_id,
       a.description,
       regexp_count(a.description, cte.nature_pattern) as nature,
       regexp_count(a.description, cte.utensils_pattern) as utensils,
       regexp_count(a.description, cte.animal_pattern) as animal
from content a
cross join cte;
备注:
- `|` 相当于 OR 条件(匹配任意一个关键词);
- `\b` 用于添加 word boundaries(单词边界),可以根据需要随意修改。