使用 Postgres 函数进行句子相似度检查
similarity check with postgres function
我有一张表,包含句子 ID(sID)和词袋(tokenizedsentence::varchar[])两列:
sID | TokenizedSentence
1 | {0, 0, 0, 0, 1, 1, 0, 0, 1, 0}
2 | {1, 1, 0, 0, 1, 1, 1, 1, 1, 1}
3 | {0, 1, 1, 0, 1, 0, 0, 0, 1, 1}
4 | {1, 1, 0, 1, 1, 0, 1, 0, 1, 1}
5 | {1, 0, 0, 0, 1, 1, 0, 0, 1, 0}
我想使用词袋表示来比较句子的相似性。我写了一个函数,但其中遗漏了一些东西。思路是:把一个数组中的每个值与另一个数组中对应位置的值进行比较,只有当两个值都为 1(即该单词出现在句子中)时才把计数器加一。遍历完所有值之后,用计数器除以数组的长度。我写的函数如下:
-- Bag-of-words similarity of two tokenized sentences.
--
-- Counts the positions at which BOTH arrays hold '1' (the word occurs in both
-- sentences) and divides that count by the length of the first array.
--
-- Parameters:
--   a, b  bag-of-words vectors; declared varchar[] because the column is
--         described as tokenizedsentence::varchar[] (adjust the type if the
--         real column differs). Both are assumed to use the default 1-based
--         array bounds.
-- Returns:
--   share of positions set to '1' in both arrays, in the range 0..1;
--   NULL when either argument is NULL or a is empty.
--
-- The parameters default to NULL so the original zero-argument call form
-- bow() stays valid (it simply yields NULL). The rows to compare are supplied
-- by the calling query instead of a FROM clause inside the function, e.g.:
--   SELECT s1.sid, s2.sid, bow(s1.tokenizedsentence, s2.tokenizedsentence)
--   FROM nlpdata AS s1
--   CROSS JOIN nlpdata AS s2
--   WHERE s1.sid <> s2.sid;
CREATE OR REPLACE FUNCTION bow(
    a varchar[] DEFAULT NULL,
    b varchar[] DEFAULT NULL
) RETURNS float AS $$
DECLARE
    -- array_length() returns NULL for a NULL or empty array.
    len int := array_length(a, 1);
    counter int := 0;  -- must start at 0; an uninitialised int is NULL
BEGIN
    IF len IS NULL OR b IS NULL THEN
        RETURN NULL;
    END IF;

    -- PostgreSQL arrays are 1-based by default, so iterate 1..len.
    FOR i IN 1..len LOOP
        -- A position past the end of b reads as NULL and is not counted.
        IF a[i] = '1' AND b[i] = '1' THEN
            counter := counter + 1;
        END IF;
    END LOOP;

    -- Cast before dividing: int / int would truncate to 0.
    RETURN counter::float / len;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
另外,我不知道怎样才能去掉 "FROM nlpdata a, nlpdata b" 这一行。有什么想法吗?
给你(如果我对任务的理解正确的话):
-- Pairwise bag-of-words similarity between every ordered pair of distinct rows.
-- Every output column is named explicitly: "select *" over the self-join
-- produced duplicate column names (t_id, t_serie twice) and unnamed computed
-- columns, which makes the result unusable as a subquery or CTE.
with t(t_id, t_serie) as ( -- Test data
    values
        (1, array[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]),
        (2, array[1, 1, 0, 0, 1, 1, 1, 1, 1, 1]),
        (3, array[0, 1, 1, 0, 1, 0, 0, 0, 1, 1]),
        (4, array[1, 1, 0, 1, 1, 0, 1, 0, 1, 1]),
        (5, array[1, 0, 0, 0, 1, 1, 0, 0, 1, 0])
)
select
    -- Data columns
    t1.t_id as left_id,
    t1.t_serie as left_serie,
    t2.t_id as right_id,
    t2.t_serie as right_serie,
    -- Positions of 1s in the arrays
    array_positions(t1.t_serie, 1) as left_positions,
    array_positions(t2.t_serie, 1) as right_positions,
    -- Intersection of the positions
    shared.positions as shared_positions,
    -- Count of intersections / length of the left array;
    -- nullif() yields NULL instead of a division-by-zero error for an empty array
    cardinality(shared.positions)::float
        / nullif(cardinality(t1.t_serie), 0) as similarity
from t as t1
cross join t as t2
-- Compute the intersection once per pair and reuse it above.
cross join lateral (
    select array(
        select unnest(array_positions(t1.t_serie, 1)) as p
        intersect
        select unnest(array_positions(t2.t_serie, 1))
        order by p -- intersect gives no ordering guarantee
    ) as positions
) as shared
where
    t1.t_id <> t2.t_id
order by
    t1.t_id,
    t2.t_id;
我有一张表,包含句子 ID(sID)和词袋(tokenizedsentence::varchar[])两列:
sID | TokenizedSentence
1 | {0, 0, 0, 0, 1, 1, 0, 0, 1, 0}
2 | {1, 1, 0, 0, 1, 1, 1, 1, 1, 1}
3 | {0, 1, 1, 0, 1, 0, 0, 0, 1, 1}
4 | {1, 1, 0, 1, 1, 0, 1, 0, 1, 1}
5 | {1, 0, 0, 0, 1, 1, 0, 0, 1, 0}
我想使用词袋表示来比较句子的相似性。我写了一个函数,但其中遗漏了一些东西。思路是:把一个数组中的每个值与另一个数组中对应位置的值进行比较,只有当两个值都为 1(即该单词出现在句子中)时才把计数器加一。遍历完所有值之后,用计数器除以数组的长度。我写的函数如下:
-- Bag-of-words similarity of two tokenized sentences.
--
-- Counts the positions at which BOTH arrays hold '1' (the word occurs in both
-- sentences) and divides that count by the length of the first array.
--
-- Parameters:
--   a, b  bag-of-words vectors; declared varchar[] because the column is
--         described as tokenizedsentence::varchar[] (adjust the type if the
--         real column differs). Both are assumed to use the default 1-based
--         array bounds.
-- Returns:
--   share of positions set to '1' in both arrays, in the range 0..1;
--   NULL when either argument is NULL or a is empty.
--
-- The parameters default to NULL so the original zero-argument call form
-- bow() stays valid (it simply yields NULL). The rows to compare are supplied
-- by the calling query instead of a FROM clause inside the function, e.g.:
--   SELECT s1.sid, s2.sid, bow(s1.tokenizedsentence, s2.tokenizedsentence)
--   FROM nlpdata AS s1
--   CROSS JOIN nlpdata AS s2
--   WHERE s1.sid <> s2.sid;
CREATE OR REPLACE FUNCTION bow(
    a varchar[] DEFAULT NULL,
    b varchar[] DEFAULT NULL
) RETURNS float AS $$
DECLARE
    -- array_length() returns NULL for a NULL or empty array.
    len int := array_length(a, 1);
    counter int := 0;  -- must start at 0; an uninitialised int is NULL
BEGIN
    IF len IS NULL OR b IS NULL THEN
        RETURN NULL;
    END IF;

    -- PostgreSQL arrays are 1-based by default, so iterate 1..len.
    FOR i IN 1..len LOOP
        -- A position past the end of b reads as NULL and is not counted.
        IF a[i] = '1' AND b[i] = '1' THEN
            counter := counter + 1;
        END IF;
    END LOOP;

    -- Cast before dividing: int / int would truncate to 0.
    RETURN counter::float / len;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
另外,我不知道怎样才能去掉 "FROM nlpdata a, nlpdata b" 这一行。有什么想法吗?
给你(如果我对任务的理解正确的话):
-- Pairwise bag-of-words similarity between every ordered pair of distinct rows.
-- Every output column is named explicitly: "select *" over the self-join
-- produced duplicate column names (t_id, t_serie twice) and unnamed computed
-- columns, which makes the result unusable as a subquery or CTE.
with t(t_id, t_serie) as ( -- Test data
    values
        (1, array[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]),
        (2, array[1, 1, 0, 0, 1, 1, 1, 1, 1, 1]),
        (3, array[0, 1, 1, 0, 1, 0, 0, 0, 1, 1]),
        (4, array[1, 1, 0, 1, 1, 0, 1, 0, 1, 1]),
        (5, array[1, 0, 0, 0, 1, 1, 0, 0, 1, 0])
)
select
    -- Data columns
    t1.t_id as left_id,
    t1.t_serie as left_serie,
    t2.t_id as right_id,
    t2.t_serie as right_serie,
    -- Positions of 1s in the arrays
    array_positions(t1.t_serie, 1) as left_positions,
    array_positions(t2.t_serie, 1) as right_positions,
    -- Intersection of the positions
    shared.positions as shared_positions,
    -- Count of intersections / length of the left array;
    -- nullif() yields NULL instead of a division-by-zero error for an empty array
    cardinality(shared.positions)::float
        / nullif(cardinality(t1.t_serie), 0) as similarity
from t as t1
cross join t as t2
-- Compute the intersection once per pair and reuse it above.
cross join lateral (
    select array(
        select unnest(array_positions(t1.t_serie, 1)) as p
        intersect
        select unnest(array_positions(t2.t_serie, 1))
        order by p -- intersect gives no ordering guarantee
    ) as positions
) as shared
where
    t1.t_id <> t2.t_id
order by
    t1.t_id,
    t2.t_id;