用 R 和 ggplot2 简洁地再现下图

Succinctly Reproducing the following graph with R and ggplot2

我借用了 link 的 R 代码并生成了下图:

使用相同的想法,我尝试使用我的数据如下:

library(tidyverse)
library(tidytable)
library(ggforce)
library(ggtext)
library(camcorder)
library(bibliometrix)
library(bibliometrixData)

data(management)

M <- metaTagExtraction(management, "AU_CO")

CO <- 
  tidytable(
      Country   = unlist(strsplit(M$AU_CO,";"))
    , year      = rep(M$PY, lengths(strsplit(M$AU_CO,";")))
    , nAuPerArt = rep(lengths(strsplit(M$AU_CO,";")),lengths(strsplit(M$AU_CO,";")))
  )

df0 <- 
  CO %>% 
  summarise.(
      frequency = length(Country)
    , frequencyFractionalized = sum(1/nAuPerArt)
    , .by = c(Country, year)
  ) %>% 
  arrange.(Country, year)

df1 <- 
  df0 %>% 
  mutate.(
      min_year  = min(year)
    , n_total   = sum(frequency)
    , .by       = Country
  ) %>% 
  mutate.(Country = fct_reorder(Country, min_year)) %>% 
  count(Country, n_total, min_year, year) %>% 
  mutate.(
      a_deg   = as.numeric(Country) * 2.7 + 8.5
    , a       = a_deg * pi/180
    , x       = -(year - min(year) + 10) * cos(a + pi/2.07)
    , y       = (year - min(year) + 10) * sin(a + pi/2.07)
    , label_a = ifelse(a_deg > 180, 270 - a_deg, 90 - a_deg)
    , h       = ifelse(a_deg > 180, 1, 0)
    , label   = ifelse(h == 0,
                       paste0(Country, " <span style = 'color:darkorange;'>(", n_total, ")</span>"),
                       paste0(" <span style = 'color:darkorange;'>(", n_total, ")</span>", Country))
  ) %>% 
  arrange.(as.character(Country), year)

df1
# df1 %>% view()


Years <- 
  tidytable(
    r = seq(
        from         = 10
      , to           = 280
      , length.out   = 12
    )
    , l = seq(from = min(df0$year), to = max(df0$year), by = 3)
  ) %>% 
  mutate.(
    lt = ifelse(row_number.() %% 2 == 0, "dotted", "solid")
  )

Years

f1 = "Porpora"

gg_record(dir = "temp", device = "png", width = 10, height = 11, units = "in", dpi = 320)

ggplot(data = df1) +
  # Purple points
  geom_point(data = df1, aes(x = x, y = y, size = n * 10), shape = 21, stroke = 0.15, fill = "purple") +
  # Year circles
  geom_circle(
    data = Years
    , aes(x0 = 0, y0 = 0, r = r, linetype = lt), size = 0.08, color = "grey50"
  ) +
  # Year labels
  geom_label(
    data = Years
    , aes(x = 0, y = r, label = l), size = 3, family = f1, label.padding = unit(0.25, "lines"), label.size = NA, fill = "grey95", color = "grey70") +
  # Orange points (totals)
  geom_point(aes(x = -290 * cos(a + pi/2.07), y = 290 * sin(a + pi/2.07), size = n_total), stat = "unique", shape = 21, stroke = 0.5, fill = "orange") +
  # Family names and totals
  geom_richtext(aes(x = -305 * cos(a + pi/2.07),
                    y = 305 * sin(a + pi/2.07),
                    label = label,
                    angle = label_a,
                    hjust = h), stat = "unique", Country = f1, size = 3.5,
                fill = NA, label.color = NA, color = "#0b5029") +
  # Annotations
  annotate("text", 0, 293, label = "Total", Country = f1, color = "orange") +
  scale_size_continuous(range = c(0, 8)) +
  scale_color_viridis_c(option = "turbo") +
  coord_fixed(clip = "off", xlim = c(-400, 400)) +
  # labs(
  #   caption = "<span style = 'font-size:30px;'>Taxonomic publications, 1757-2021</span><br>
  #   Publications by family and year <span style = 'color:purple;'>(purple circles)</span>and total publications by family <span style = 'color:darkorange;'>(orange circles and text)</span><br>
  #    <span style = 'color:black;'>Source: World Spider Catalog - Graphic: Georgios Karamanis</span>"
  # ) +
  theme_void() +
  theme(
    legend.position = "none",
    plot.background = element_rect(fill = "grey95", color = NA),
    plot.margin = margin(0, 20, 20, 20),
    plot.caption = element_markdown(family = f1, hjust = 0.5, margin = margin(100, 0, -100, 0), size = 14, lineheight = 1.4, color = "#0b5029")
  )

生成以下图表

但是,生成的图表存在两个问题。

  1. 如何正确对齐粉色点?
  2. 国名如何使用整圈?

已编辑

正在寻找可用于不同数据集的更简洁和可重现的代码。最好寻找可以处理不同数据集的通用函数。

您可以在函数内计算 x 和 y 值以构建 ggplot,它一直延伸圆圈并为标签提供正确的高度。

我已经调整了一个函数来处理其他数据集。这采用整洁格式的数据集,其中:

  • 一个'year'列
  • 每行一行 'event'
  • 分组变量(如国家)

我使用来自 here 的诺贝尔奖获得者数据作为示例数据集来展示该功能在实践中的应用。数据设置:

library(tidyverse)
library(ggforce)
library(ggtext)

nobel <- read_csv("archive.csv")

# Filtering in this example to create a plottable dataset
nobel_filt <- nobel %>%
  mutate(country = fct_lump_n(factor(`Birth Country`), n = 50)) %>% 
  filter(country != "Other")

nobel_filt
#> # A tibble: 883 x 19
#>     Year Category   Prize Motivation `Prize Share` `Laureate ID` `Laureate Type`
#>    <dbl> <chr>      <chr> <chr>      <chr>                 <dbl> <chr>          
#>  1  1901 Chemistry  The ~ "\"in rec~ 1/1                     160 Individual     
#>  2  1901 Literature The ~ "\"in spe~ 1/1                     569 Individual     
#>  3  1901 Medicine   The ~ "\"for hi~ 1/1                     293 Individual     
#>  4  1901 Peace      The ~  <NA>      1/2                     462 Individual     
#>  5  1901 Peace      The ~  <NA>      1/2                     463 Individual     
#>  6  1901 Physics    The ~ "\"in rec~ 1/1                       1 Individual     
#>  7  1902 Chemistry  The ~ "\"in rec~ 1/1                     161 Individual     
#>  8  1902 Literature The ~ "\"the gr~ 1/1                     571 Individual     
#>  9  1902 Medicine   The ~ "\"for hi~ 1/1                     294 Individual     
#> 10  1902 Peace      The ~  <NA>      1/2                     464 Individual     
#> # ... with 873 more rows, and 12 more variables: Full Name <chr>,
#> #   Birth Date <date>, Birth City <chr>, Birth Country <chr>, Sex <chr>,
#> #   Organization Name <chr>, Organization City <chr>,
#> #   Organization Country <chr>, Death Date <date>, Death City <chr>,
#> #   Death Country <chr>, country <fct>

此函数然后将数据帧作为参数,以及作为分组依据的列的名称和作为标记时间段依据的列的名称。它不是超级简洁,因为正在进行大量数据处理。但希望在一个函数中它更整洁。


circle_plot <- function(data, group_var, time_var) {

  df_full <-
    data %>%
    select(group = {{group_var}}, year = {{time_var}}) %>% 
    mutate(group = factor(group),
           group = fct_reorder(group, year, .fun = min),
           order = as.numeric(group))
  
    
  
  year_vals <-
    tibble(year = as.character(seq(min(df_full$year), max(df_full$year), 1)),
           level = 1 + 1:length(year))
  
  y_vals <- year_vals %>% 
    bind_rows(tribble(~ year, ~ level,
                      "total", max(year_vals$level) + 5,
                      "title", max(year_vals$level) + 10
    ))
  
  year_labs <-
    tibble(year = as.character(floor(seq(
      min(df_full$year), max(df_full$year), length.out = 10
    )))) %>%
    left_join(y_vals, by = "year")
  
  x_len <- max(df_full$order)
  
  df_ang <- df_full %>% 
    mutate(year = as.character(year)) %>% 
    count(group, order, year) %>% 
    left_join(y_vals, by = "year") %>% 
    mutate(a_deg = order * 350/x_len + 5,
           x = - level * cos(a_deg * pi/180 + pi/2.07),
           y = level * sin(a_deg * pi/180 + pi/2.07))
  
  df_lab <- df_ang %>%
    group_by(group, a_deg) %>%
    summarise(n_total = n()) %>%
    mutate(
      group_name = str_trunc(as.character(group), 30),
      label_a = ifelse(a_deg > 180, 270 - a_deg, 90 - a_deg),
      h = ifelse(a_deg > 180, 1, 0),
      label = ifelse(
        h == 0,
        paste0(
          group_name,
          " <span style = 'color:darkorange;'>(",
          n_total,
          ")</span>"
        ),
        paste0(
          "<span style = 'color:darkorange;'>(",
          n_total,
          ")</span> ",
          group_name
        )
      ),
      year = "title"
    ) %>%
    left_join(y_vals, by = "year") %>%
    mutate(
      x = -level * cos(a_deg * pi / 180 + pi / 2.07),
      y = level * sin(a_deg * pi / 180 + pi / 2.07),
      total_x = -(level - 5) * cos(a_deg * pi / 180 + pi / 2.07),
      total_y = (level - 5) * sin(a_deg * pi / 180 + pi / 2.07)
    )
  
  
  ggplot() +
    geom_circle(
      data = year_labs,
      aes(
        x0 = 0,
        y0 = 0,
        r = level
      ),
      size = 0.08,
      color = "grey50"
    ) +
    geom_label(
      data = year_labs,
      aes(x = 0, y = level, label = year),
      size = 3,
      label.padding = unit(0.25, "lines"),
      label.size = NA,
      fill = "grey95",
      color = "grey70"
    ) +
    geom_point(
      data = df_ang,
      aes(x = x, y = y, size = n),
      shape = 21,
      stroke = 0.15,
      fill = "purple"
    ) +
    geom_point(
      data = df_lab,
      aes(total_x, total_y,
        size = n_total
      ),
      stat = "unique",
      shape = 21,
      stroke = 0.5,
      fill = "orange"
    ) +
    geom_richtext(
      data = df_lab,
      aes(x, y,
        label = label,
        angle = label_a,
        hjust = h
      ),
      stat = "unique",
      size = 4,
      fill = NA,
      label.color = NA,
      color = "#0b5029"
    ) +
    annotate(
      "text",
      0,
      y = y_vals[y_vals$year=="total",]$level,
      label = "Total",
      color = "orange",
      size = 4,
      vjust = 0
    ) +
    scale_size_continuous(range = c(1, 9)) +
    scale_color_viridis_c(option = "turbo") +
    coord_fixed(clip = "off", xlim = c(-120, 120)) +
    theme_void() +
    theme(
      legend.position = "none",
      plot.background = element_rect(fill = "grey95", color = NA),
      plot.margin = margin(100, 180, 150, 180),
    )
  
  }
  

circle_plot(nobel_filt, `Birth Country`, Year)

# ggsave("test.png", height = 10, width = 10)

这将创建以下图表:

最令人头疼的问题(正如您在此处看到的那样)将是更改边距以适应长标签和导出图尺寸以整齐地适合 text/numbers 年圆的尺寸。这可能必须在每个地块上进行试验。您可以将函数内的 margin 调用调整为合理的默认值,或者向函数调用添加进一步的 theme 元素,如下所示:

circle_plot(nobel_filt, `Birth Country`, Year) +
  theme(plot.margin = margin(80, 150, 120, 150))

希望对您有所帮助!

reprex package (v2.0.1)

于 2021-12-27 创建