寻找最短路径最多分离十度

finding shortest path up to ten degrees of separation

我在SQL中有以下三个表:

select * from movie limit 2;

  id  |           title            | year | content_rating | duration |    lang    |       country        |  gross   |  budget  | director_id 
------+----------------------------+------+----------------+----------+------------+----------------------+----------+----------+-------------
  407 | 102 Dalmatians             | 2000 | G              |      100 | English    | USA                  | 66941559 | 85000000 |        2174
 3699 | 10 Cloverfield Lane        | 2016 | PG-13          |      104 | English    | USA                  | 71897215 | 15000000 |        1327
(2 rows)

select * from actor limit 3;

  id  |         name         | facebook_likes 
------+----------------------+----------------
  408 | Christian Bale       |          23000
 1430 | Donna Murphy         |            553
   66 | Robert Downey Jr.    |          21000
(3 rows)

select * from acting limit 3;

 movie_id | actor_id 
----------+----------
      407 |     2024
     3699 |     1841
     3016 |       11
(3 rows)

给定两个演员 a1a2,我想找到 最短 路径 a1a2

例如,假设 a1 = 'Tom Cruise'a2 = 'Robert Downey Jr'

输出应该是

Tom Cruise was in Days of Thunder with Robert Duvall -> Robert Duvall was in Lucky You with Robert Downey Jr.

在这种情况下,Tom CruiseRobert Downey Jr 相距 2 度,Robert Durvall 连接它们。我最多想输出 10 度 ,然后忽略任何连接。

我尝试使用递归 CTE 实施解决方案 ,但我认为我没有正确应用它。感谢帮助,提前致谢:)

尝试查询:

with recursive cte as (
select actor.name, movie.title, 1 as level from movie
left join acting on acting.movie_id = movie.id 
left join actor on actor.id = acting.actor_id
where actor.name = 'Tom Cruise'
union  
select actor.name, movie.title, level+1 from movie
left join acting on acting.movie_id = movie.id 
left join actor on actor.id = acting.actor_id
inner join cte on cte.name = actor.name
where cte.name = actor.name and cte.level < 10
)
select * from cte

我不确定查询中的第二个 select 会是什么 return 但这里有一种获取参与者之间分离度的方法:

假设我们有 table 个演员 ID,Origin。为了获得与我们 table 中的其中一位演员在同一部电影中扮演过的所有演员,我们需要从 Origin 开始,加入 Acting 然后加入 Movie 以获得我们起源的所有电影演员已经上场,然后再次加入 Acting 和 Actor table 以获得我们想要的。注意 Acting table 出现了两次。如果我们将其应用于递归 CTE 和您的问题,并注意到 Origin table 在您的示例中将是 Cte,我们将得到以下结果:

WITH RECURSIVE cte(id, distance) AS (
    SELECT actor.id, 0 
    FROM actor
    WHERE actor.name = 'Tom Cruise'

    UNION

    SELECT DISTINCT actor.id, cte.distance + 1
    FROM cte
    JOIN acting AS acting1 ON (cte.id = acting1.actor_id)
    JOIN movie ON (acting1.movie_id = movie.id)
    JOIN acting AS acting2 ON (movie.id = acting2.movie_id)
    JOIN actor ON (acting2.actor_id = actor.id)
    WHERE cte.id <> actor.id AND cte.distance + 1 <= 10
)

在此之后,cte table 将包含类型为 (id, dist) 的元组,这意味着存在从汤姆克鲁斯到具有此 id 且距离为 dist 的演员的路径。

DISTINCT 是为了提高效率。我们的 Cte table 中会有很多错误对(第二个值大于真实距离),尤其是在 actor 图密集的情况下,但正确的元组 在 Cte table。正确的元组是指一个元组(演员,距离),这样的距离是起始演员(例如汤姆克鲁斯)和这个演员之间的最短路径

编辑: 不好意思,UNION 已经这样做了,所以重复不需要 DISTINCT。

为了得到那个距离,我们添加一个 select 和一个 group by 子句:

WITH RECURSIVE cte(id, distance) AS (
    SELECT actor.id, 0 
    FROM actor
    WHERE actor.name = 'Tom Cruise'

    UNION

    SELECT actor.id, cte.distance + 1
    FROM cte
    JOIN acting AS acting1 ON (cte.id = acting1.actor_id)
    JOIN movie ON (acting1.movie_id = movie.id)
    JOIN acting AS acting2 ON (movie.id = acting2.movie_id)
    JOIN actor ON (acting2.actor_id = actor.id)
    WHERE cte.id <> actor.id AND cte.distance + 1 <= 10
)
SELECT id, MIN(distance) AS distance
FROM cte
GROUP BY id
ORDER BY 2 ASC;

如果您想查看给定的第二个演员的结果,比如小罗伯特唐尼,那么这将为您提供有关分离度的答案:

WITH RECURSIVE cte(id, distance) AS (
    SELECT actor.id, 0 
    FROM actor
    WHERE actor.name = 'Tom Cruise'

    UNION

    SELECT actor.id, cte.distance + 1
    FROM cte
    JOIN acting AS acting1 ON (cte.id = acting1.actor_id)
    JOIN movie ON (acting1.movie_id = movie.id)
    JOIN acting AS acting2 ON (movie.id = acting2.movie_id)
    JOIN actor ON (acting2.actor_id = actor.id)
    WHERE cte.id <> actor.id AND cte.distance + 1 <= 10
), distance_table (id, distance) AS (
    SELECT id, MIN(distance) AS distance
    FROM cte
    GROUP BY id
)
SELECT 'Tom Cruise and ' || actor.name || ' are separated by ' ||
       COALESCE(TO_CHAR(distance_table.distance, '999999'), 'more than 10') || ' degrees of separation'
FROM actor
LEFT JOIN distance_table ON (actor.id = distance_table.id)
WHERE actor.name = 'Robert Downey Jr';

尽管我认为您一般不会想直接从数据库中计算此类信息,但如果您想要一条消息告诉演员之间的路径,就像您提供的那样(汤姆克鲁斯是在 Days of Thunder with Robert Duvall -> Robert Duvall 在 Lucky You 和 Robert Downey Jr.) 中,那么这样的事情可以 return 那:

WITH RECURSIVE cte(id, name, distance, message) AS (
    SELECT actor.id, actor.name, 0, ''
    FROM actor
    WHERE actor.name = 'Tom Cruise'

    UNION

    SELECT actor.id, actor.name, cte.distance + 1, 
           cte.message || '> ' || cte.name || ' was in ' ||
           movie.title || ' with ' || actor.name || ' '
    FROM cte
    JOIN acting AS acting1 ON (cte.id = acting1.actor_id)
    JOIN movie ON (acting1.movie_id = movie.id)
    JOIN acting AS acting2 ON (movie.id = acting2.movie_id)
    JOIN actor ON (acting2.actor_id = actor.id)
    WHERE cte.id <> actor.id AND cte.distance + 1 <= 10
), distance_table (id, distance) AS (
    SELECT id, MIN(distance) AS distance
    FROM cte
    GROUP BY id
)
SELECT id, name, message, distance
FROM cte
WHERE (id, distance) IN (SELECT * FROM distance_table)
ORDER BY distance;

这是一个尝试(没有 CTE)。我碰巧有一份包含 4175 个美国城邦对的列表。 (想想 state==movie 和 city==actor。)

这是来自 table us 的设置:

SET NAMES utf8 COLLATE utf8_unicode_ci;

DROP TABLE IF EXISTS p_mapping;  -- state-city pairs (movie-actor)
CREATE TABLE p_mapping (
    state char(2) CHARACTER SET ascii NOT NULL,
    city varchar(255)  CHARACTER SET utf8 COLLATE utf8_unicode_ci  NOT NULL,
    PRIMARY KEY(state, city),
    INDEX(city, state)
) ENGINE=InnoDB;
INSERT INTO p_mapping (state, city)
    SELECT state, city  FROM us;

DROP TABLE IF EXISTS p_cities;  -- city ~= actor
CREATE TABLE p_cities (
    depth TINYINT UNSIGNED NOT NULL DEFAULT 0,
    from_state  char(2)  CHARACTER SET ascii  NOT NULL DEFAULT '',
    city  VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
    PRIMARY KEY(city)
) ENGINE=InnoDB COMMENT 'SO 55717636';
INSERT INTO p_cities (city)
    SELECT DISTINCT city  FROM p_mapping;

DROP TABLE IF EXISTS p_states;  -- state ~= movie
CREATE TABLE p_states (
    depth TINYINT UNSIGNED NOT NULL DEFAULT 0,
    from_city  VARCHAR(255)  CHARACTER SET utf8 COLLATE utf8_unicode_ci  NOT NULL  DEFAULT '',
    state char(2)  CHARACTER SET ascii  NOT NULL,
    PRIMARY KEY(state)
) ENGINE=InnoDB COMMENT 'SO 55717636';
INSERT INTO p_states (state)
    SELECT DISTINCT state  FROM p_mapping;

-- 我选择了连接奥马哈(仅内布拉斯加州)和伯明翰(多个州)的目标。首先进行一些初始化:

SET @city := 'Omaha';   -- starting here

UPDATE p_cities
    SET depth = 1
    WHERE city = @city;
UPDATE p_states  AS s
  JOIN p_mapping AS m  USING(state)
    SET s.from_city = @city,
        s.depth = 1
    WHERE m.city = @city;

SET @depth := 1;

-- 然后重复以下10次或者直到rows_affected降为0。迭代3次后停止。

UPDATE  p_cities AS c
   JOIN p_mapping AS m  USING(city)
   JOIN p_states  AS s  USING(state)
    SET c.from_state = m.state,
        c.depth = s.depth + 1
    WHERE s.depth = @depth
      AND c.depth = 0;

SET @depth := @depth + 1;

UPDATE  p_states AS s
   JOIN p_mapping AS m  USING(state)
   JOIN p_cities AS c   USING(city)
    SET s.from_city = m.city,
        s.depth = c.depth
    WHERE c.depth = @depth
      AND s.depth = 0;

-- 结束循环(算法结束)

-- 正确路径:奥马哈 -> 东北 -> 哥伦布 -> 俄亥俄州 -> 雅典 -> 阿拉巴马州 -> 伯明翰 -- 注意这是如何列出答案的(但是垂直的):

SELECT * FROM p_cities
    WHERE city in ('Omaha', 'Columbus', 'Athens', 'Birmingham')
    ORDER BY depth;        

    +-------+------------+------------+
    | depth | from_state | city       |
    +-------+------------+------------+
    |     1 |            | Omaha      |
    |     2 | NE         | Columbus   |
    |     3 | OH         | Athens     |
    |     4 | AL         | Birmingham |
    +-------+------------+------------+
    4 rows in set (0.00 sec)

-- 'Proof' 链接适用于以下答案:

SELECT * FROM p_mapping
    WHERE city IN ('Omaha', 'Columbus', 'Athens', 'Birmingham')
      AND state IN ('NE', 'OH', 'TN', 'AL');

    +-------+------------+
    | state | city       |
    +-------+------------+
    | AL    | Athens     |
    | OH    | Athens     |
    | TN    | Athens     |
    | AL    | Birmingham |
    | NE    | Columbus   |
    | OH    | Columbus   |
    | NE    | Omaha      |
    +-------+------------+
    7 rows in set (0.00 sec)

--(其他table)

SELECT * FROM p_states
    WHERE from_city IN ('Omaha', 'Columbus', 'Athens', 'Birmingham')
       OR state IN ('NE', 'OH', 'TN', 'AL')
    ORDER BY depth;

    +-------+-----------+-------+
    | depth | from_city | state |
    +-------+-----------+-------+
    |     1 | Omaha     | NE    |
    |     2 | Columbus  | GA    |
    |     2 | Columbus  | IN    |
    |     2 | Columbus  | MS    |
    |     2 | Columbus  | OH    |
    |     3 | Athens    | AL    |
    |     3 | Athens    | TN    |
    +-------+-----------+-------+
    7 rows in set (0.00 sec)