如何将 "k" 和 "m" 替换为对应的千和百万数值?
How do I replace "k" and "m" with thousands and millions?
我有一个从 Coursera 解析的数据框。其中一列是注册该课程的学生人数。看起来像这样:
# Sample data: one row per university. `students` stores the enrolment count
# as text with a magnitude suffix ("k" = thousand, "m" = million).
df <- data.frame(
  uni = c("Yale", "Toronto", "NYU"),
  students = c("16m", "240k", "7.5k"),
  # Keep the text columns as character on R < 4.0, where strings would
  # otherwise be converted to factors.
  stringsAsFactors = FALSE
)
uni students
1 Yale "16m"
2 Toronto "240k"
3 NYU "7.5k"
我需要得到的是
uni students
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
所以,对我来说主要的困难在于这些值的类型是字符型,我不知道该用什么函数来替换 k 和 m,并把这一列的类型转换为数值型。
请帮帮我!
w=12=WILL.y.w=11=w
w=10=sh
使用来自 tidyverse 的 stringr 和 dplyr:
library(tidyverse)

# Convert the suffixed text in `students` to a plain number: pull out the
# numeric part once, then scale it by the multiplier implied by the suffix.
df %>%
  mutate(
    # Backslashes must be doubled inside an R string literal ("\\d").
    students = as.numeric(str_extract(students, "[\\d.]+")) *
      case_when(
        str_detect(students, "m") ~ 1e6,
        str_detect(students, "k") ~ 1e3,
        # Values without a suffix are kept as they are instead of becoming NA.
        TRUE ~ 1
      )
  )
# A tibble: 3 x 2
uni students
<chr> <dbl>
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
这是一种使用 separate 的方法,适用于任意数量的修饰符,只需在 case_when 语句中继续定义它们即可。
library(dplyr)
library(tidyr)

# Split `students` into its numeric part and its suffix, translate the suffix
# into a multiplier, and compute the final number in `result`.
df %>%
  # Split at the position between a digit and the first character that is
  # neither a digit nor a decimal point, e.g. "7.5k" -> "7.5" and "k".
  # Backslashes must be doubled inside an R string literal ("\\d").
  separate(
    students,
    into = c("value", "modifier"),
    sep = "(?<=[\\d])(?=[^\\d.])"
  ) %>%
  mutate(
    modifier = case_when(
      modifier == "b" ~ 1000000000,
      modifier == "m" ~ 1000000,
      modifier == "k" ~ 1000,
      # Missing or unknown suffix: leave the value unscaled.
      TRUE ~ 1
    ),
    result = as.numeric(value) * modifier
  )
uni value modifier result
1 Yale 16 1e+06 1.6e+07
2 Toronto 240 1e+03 2.4e+05
3 NYU 7.5 1e+03 7.5e+03
可以编写一个函数来进行转换,例如:
# Convert strings made of a number followed by a one-letter magnitude suffix
# into numeric values.
#
# s: character vector such as "2M" or "200k". The last character of each
#    element must be the suffix: k (thousand), M (million) or G (billion),
#    in either case.
# Returns a numeric vector of the same length as `s`; elements whose last
# character is not a known suffix yield NA.
f <- function(s) {
  s <- as.character(s)
  n_chars <- nchar(s)
  x <- as.numeric(substr(s, 1, n_chars - 1))
  u <- substr(s, n_chars, n_chars)
  # Compare in upper case so the lowercase "m" used in the data ("16m") is
  # recognised; each position in the table is another factor of 1000.
  x * 10^(3 * match(toupper(u), c("K", "M", "G")))
}
f("2M")
f("200k")
编辑: 或更通用一点:
# Convert strings holding a number with an optional one-letter magnitude
# suffix into numeric values.
#
# s: character vector such as "20", "2M" or "200k". Recognised suffixes are
#    k (thousand), M (million) and G (billion), in either case.
# Returns a numeric vector of the same length as `s`. Elements without a
# suffix are returned unscaled; unparseable elements yield NA.
f <- function(s) {
  s <- as.character(s)
  # Numeric part: drop the suffix letters regardless of case.
  x <- as.numeric(gsub("[kmg]", "", s, ignore.case = TRUE))
  # Suffix part: whatever remains once digits and the decimal point are gone.
  u <- toupper(gsub("[0-9.]", "", s))
  # Scale only the elements that carry a suffix. Logical indexing replaces
  # the scalar `if`, so vectors longer than one element work too.
  has_unit <- !is.na(u) & nzchar(u)
  x[has_unit] <- x[has_unit] * 10^(3 * match(u[has_unit], c("K", "M", "G")))
  x
}
f("20")
f("2M")
f("200k")
使用 gsub 和 dplyr:
# Split `students` into a unit suffix and a numeric value, scale the value by
# the unit, then drop the two helper columns.
df %>%
  mutate(
    # Unit: what is left after removing the number. Backslashes must be
    # doubled inside an R string literal ("\\.").
    unit = gsub("[0-9]+\\.*[0-9]*", "", students),
    # Value: strip everything that is not a digit or a decimal point, so
    # single-digit inputs ("5k") and suffix-less inputs ("200") parse too.
    value = as.numeric(gsub("[^0-9.]", "", students)),
    students = ifelse(
      unit == "k", 1e3 * value,
      ifelse(
        unit == "m", 1e6 * value,
        ifelse(unit == "b", 1e9 * value, value)
      )
    )
  ) %>%
  select(-c(unit, value))
使用 base R 的方案:
# Replace the suffixed text in df$students with plain numbers using base R.
# The numeric part is parsed once ("$" and the k/m suffix are stripped in
# either case), then scaled by the multiplier implied by the suffix. Values
# without a suffix are left unscaled.
df$students <- as.numeric(
  gsub("[$km]", "", df$students, ignore.case = TRUE)
) * ifelse(
  grepl("m", df$students, ignore.case = TRUE), 10^6,
  ifelse(grepl("k", df$students, ignore.case = TRUE), 10^3, 1)
)
# uni students
# 1 Yale 16000000
# 2 Toronto 240000
# 3 NYU 7500
我有一个从 Coursera 解析的数据框。其中一列是注册该课程的学生人数。看起来像这样:
# Sample data: one row per university. `students` stores the enrolment count
# as text with a magnitude suffix ("k" = thousand, "m" = million).
df <- data.frame(
  uni = c("Yale", "Toronto", "NYU"),
  students = c("16m", "240k", "7.5k"),
  # Keep the text columns as character on R < 4.0, where strings would
  # otherwise be converted to factors.
  stringsAsFactors = FALSE
)
uni students
1 Yale "16m"
2 Toronto "240k"
3 NYU "7.5k"
我需要得到的是
uni students
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
所以,对我来说主要的困难在于这些值的类型是字符型,我不知道该用什么函数来替换 k 和 m,并把这一列的类型转换为数值型。
请帮帮我!
使用来自 tidyverse 的 stringr 和 dplyr:
library(tidyverse)

# Convert the suffixed text in `students` to a plain number: pull out the
# numeric part once, then scale it by the multiplier implied by the suffix.
df %>%
  mutate(
    # Backslashes must be doubled inside an R string literal ("\\d").
    students = as.numeric(str_extract(students, "[\\d.]+")) *
      case_when(
        str_detect(students, "m") ~ 1e6,
        str_detect(students, "k") ~ 1e3,
        # Values without a suffix are kept as they are instead of becoming NA.
        TRUE ~ 1
      )
  )
# A tibble: 3 x 2
uni students
<chr> <dbl>
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
这是一种使用 separate 的方法,适用于任意数量的修饰符,只需在 case_when 语句中继续定义它们即可。
library(dplyr)
library(tidyr)

# Split `students` into its numeric part and its suffix, translate the suffix
# into a multiplier, and compute the final number in `result`.
df %>%
  # Split at the position between a digit and the first character that is
  # neither a digit nor a decimal point, e.g. "7.5k" -> "7.5" and "k".
  # Backslashes must be doubled inside an R string literal ("\\d").
  separate(
    students,
    into = c("value", "modifier"),
    sep = "(?<=[\\d])(?=[^\\d.])"
  ) %>%
  mutate(
    modifier = case_when(
      modifier == "b" ~ 1000000000,
      modifier == "m" ~ 1000000,
      modifier == "k" ~ 1000,
      # Missing or unknown suffix: leave the value unscaled.
      TRUE ~ 1
    ),
    result = as.numeric(value) * modifier
  )
uni value modifier result
1 Yale 16 1e+06 1.6e+07
2 Toronto 240 1e+03 2.4e+05
3 NYU 7.5 1e+03 7.5e+03
可以编写一个函数来进行转换,例如:
# Convert strings made of a number followed by a one-letter magnitude suffix
# into numeric values.
#
# s: character vector such as "2M" or "200k". The last character of each
#    element must be the suffix: k (thousand), M (million) or G (billion),
#    in either case.
# Returns a numeric vector of the same length as `s`; elements whose last
# character is not a known suffix yield NA.
f <- function(s) {
  s <- as.character(s)
  n_chars <- nchar(s)
  x <- as.numeric(substr(s, 1, n_chars - 1))
  u <- substr(s, n_chars, n_chars)
  # Compare in upper case so the lowercase "m" used in the data ("16m") is
  # recognised; each position in the table is another factor of 1000.
  x * 10^(3 * match(toupper(u), c("K", "M", "G")))
}
f("2M")
f("200k")
编辑: 或更通用一点:
# Convert strings holding a number with an optional one-letter magnitude
# suffix into numeric values.
#
# s: character vector such as "20", "2M" or "200k". Recognised suffixes are
#    k (thousand), M (million) and G (billion), in either case.
# Returns a numeric vector of the same length as `s`. Elements without a
# suffix are returned unscaled; unparseable elements yield NA.
f <- function(s) {
  s <- as.character(s)
  # Numeric part: drop the suffix letters regardless of case.
  x <- as.numeric(gsub("[kmg]", "", s, ignore.case = TRUE))
  # Suffix part: whatever remains once digits and the decimal point are gone.
  u <- toupper(gsub("[0-9.]", "", s))
  # Scale only the elements that carry a suffix. Logical indexing replaces
  # the scalar `if`, so vectors longer than one element work too.
  has_unit <- !is.na(u) & nzchar(u)
  x[has_unit] <- x[has_unit] * 10^(3 * match(u[has_unit], c("K", "M", "G")))
  x
}
f("20")
f("2M")
f("200k")
使用 gsub 和 dplyr:
# Split `students` into a unit suffix and a numeric value, scale the value by
# the unit, then drop the two helper columns.
df %>%
  mutate(
    # Unit: what is left after removing the number. Backslashes must be
    # doubled inside an R string literal ("\\.").
    unit = gsub("[0-9]+\\.*[0-9]*", "", students),
    # Value: strip everything that is not a digit or a decimal point, so
    # single-digit inputs ("5k") and suffix-less inputs ("200") parse too.
    value = as.numeric(gsub("[^0-9.]", "", students)),
    students = ifelse(
      unit == "k", 1e3 * value,
      ifelse(
        unit == "m", 1e6 * value,
        ifelse(unit == "b", 1e9 * value, value)
      )
    )
  ) %>%
  select(-c(unit, value))
使用 base R 的方案:
# Replace the suffixed text in df$students with plain numbers using base R.
# The numeric part is parsed once ("$" and the k/m suffix are stripped in
# either case), then scaled by the multiplier implied by the suffix. Values
# without a suffix are left unscaled.
df$students <- as.numeric(
  gsub("[$km]", "", df$students, ignore.case = TRUE)
) * ifelse(
  grepl("m", df$students, ignore.case = TRUE), 10^6,
  ifelse(grepl("k", df$students, ignore.case = TRUE), 10^3, 1)
)
# uni students
# 1 Yale 16000000
# 2 Toronto 240000
# 3 NYU 7500