通过忽略 r 中的特定字符对数据框的列执行操作
Perform the operation on the columns of dataframe by ignoring specific characters in r
当 DataFrame 的那些列中有一些缺失值时,我在 DataFrame 中执行操作时遇到问题。我的目标只是忽略它们(不是删除具有缺失值的行或列)并正常执行操作。
这是我的数据框:
# Example data: one row per bill, labelled with the meal it belongs to, plus
# two bill columns in which "*" stands for a missing amount.
# Because "*" shares a vector with the amounts, both bill columns hold text
# (character, or factor when stringsAsFactors is TRUE), not numbers.
dat <- data.frame(
  time = factor(
    c(
      "Breakfast", "Breakfast", "Lunch", "Lunch", "Breakfast",
      "Dinner", "Dinner", "Dinner", "Snack", "Snack",
      "Lunch", "Snack", "Snack"
    ),
    levels = c("Breakfast", "Lunch", "Dinner", "Snack")
  ),
  total_bill_x = c(
    "*", "14.89", "20.5", "17.23", "30.3", "*", "20.7",
    "32.3", "25.4", "14.5", "13.7", "14.2", "15.7"
  ),
  total_bill_y = c(
    "20.75", "*", "18.52", "*", "27.3", "23.6", "19.75",
    "27.3", "21.48", "13.66", "15.59", "17.3", "14.78"
  )
)
我想执行 `sum(dat$total_bill_x)` 和 `sum(dat$total_bill_y)` 这样的操作。
这些列是 `factor` 类型,需要先转换为 `numeric`。像 `*` 这样的元素在转换过程中会变成 `NA`,并给出一条提示性警告。
library(dplyr)
# Sum every column whose name starts with "total".
# Each column is converted to text and then to numbers; "*" cannot be parsed,
# so it becomes NA (with a coercion warning) and na.rm = TRUE skips it.
dat %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) sum(as.numeric(as.character(col)), na.rm = TRUE)
  )
如果我们不想要警告,一种选择是在执行 `sum` 之前,先有选择地删除那些 `*` 元素:
# Warning-free variant: remove the "*" entries first, so everything handed to
# as.numeric() parses cleanly and no na.rm is needed.
dat %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) {
      kept <- col[col != "*"]
      sum(as.numeric(as.character(kept)))
    }
  )
# total_bill_x total_bill_y
#1 219.42 220.03
或 base R
# Base R equivalent: for every column except the first one (time), drop the
# "*" entries and sum what is left.
# vapply() with FUN.VALUE = numeric(1) always yields a named numeric vector,
# whereas the return type of sapply() depends on its input.
vapply(
  dat[-1],
  function(x) sum(as.numeric(as.character(x[x != "*"]))),
  FUN.VALUE = numeric(1)
)
# total_bill_x total_bill_y
# 219.42 220.03
或者将 `*` 更改为 `NA`,转换为 `numeric`,然后提取列并求 `sum`:
# Overwrite every column except the first with a numeric version:
# take the text form, blank out the "*" positions with NA, then convert.
dat[-1] <- lapply(dat[-1], function(col) {
  as_text <- as.character(col)
  as_text[col == "*"] <- NA
  as.numeric(as_text)
})
# The columns are numeric now, so a plain sum that skips NA is enough.
sum(dat$total_bill_x, na.rm = TRUE)
如果我们想进行分组操作,一种选择是使用 `na_if`(来自 `dplyr`)将 `*` 转换为 `NA`,再用 `as.numeric` 转换为 `numeric`,按 'time' 分组,在 `summarise` 中得到该列的 `sum`,然后用 'total' 除以 'total' 的 `sum` 来创建 'pourcentage' 列:
# Total of total_bill_x per meal, plus each meal's share of the overall total.
# The columns are converted to character before na_if()/as.numeric(): calling
# as.numeric() directly on a factor returns its internal level codes
# (1, 2, ...) rather than the amounts, which silently yields wrong totals.
dat %>%
  mutate_at(
    vars(starts_with('total')),
    ~ as.numeric(na_if(as.character(.), "*"))
  ) %>%
  group_by(time) %>%
  summarise(total = sum(total_bill_x, na.rm = TRUE)) %>%
  mutate(pourcentage = total / sum(total) * 100)
# A tibble: 4 x 3
# time total pourcentage
# <fct> <dbl> <dbl>
#1 Breakfast 45.2 20.6
#2 Lunch 51.4 23.4
#3 Dinner 53 24.2
#4 Snack 69.8 31.8
我们也可以
# Parse the "total" columns as numbers ("*" becomes NA), then sum each one.
# The naming argument of across() is `.names` and its glue placeholder is
# `{.col}`; `names = "total_{col}"` is not an across() argument, so it would
# not rename anything.
# Resulting columns: total_total_bill_x and total_total_bill_y.
dat %>%
  mutate(across(starts_with('total'), readr::parse_number)) %>%
  summarise(across(
    starts_with('total'),
    ~ sum(., na.rm = TRUE),
    .names = "total_{.col}"
  ))
我们可以使用readr::parse_number
library(dplyr)
# Same totals via readr: parse_number() converts the text columns to numbers
# (an entry such as "*" that holds no number becomes NA), and the sums skip NA.
dat %>%
  mutate_at(vars(starts_with("total")), ~ readr::parse_number(.)) %>%
  summarise_at(vars(starts_with("total")), ~ sum(., na.rm = TRUE))
# total_bill_x total_bill_y
#1 219.42 220.03
按组(即 `time`)求和:
# Per-meal totals of both bill columns: parse the text columns to numbers,
# group the rows by time, then sum each "total" column while skipping NA.
dat %>%
  mutate_at(vars(starts_with("total")), ~ readr::parse_number(.)) %>%
  group_by(time) %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) sum(col, na.rm = TRUE)
  )
# time total_bill_x total_bill_y
# <fct> <dbl> <dbl>
#1 Breakfast 45.2 48.0
#2 Lunch 51.4 34.1
#3 Dinner 53 70.6
#4 Snack 69.8 67.2
当 DataFrame 的那些列中有一些缺失值时,我在 DataFrame 中执行操作时遇到问题。我的目标只是忽略它们(不是删除具有缺失值的行或列)并正常执行操作。
这是我的数据框:
# Example data: one row per bill, labelled with the meal it belongs to, plus
# two bill columns in which "*" stands for a missing amount.
# Because "*" shares a vector with the amounts, both bill columns hold text
# (character, or factor when stringsAsFactors is TRUE), not numbers.
dat <- data.frame(
  time = factor(
    c(
      "Breakfast", "Breakfast", "Lunch", "Lunch", "Breakfast",
      "Dinner", "Dinner", "Dinner", "Snack", "Snack",
      "Lunch", "Snack", "Snack"
    ),
    levels = c("Breakfast", "Lunch", "Dinner", "Snack")
  ),
  total_bill_x = c(
    "*", "14.89", "20.5", "17.23", "30.3", "*", "20.7",
    "32.3", "25.4", "14.5", "13.7", "14.2", "15.7"
  ),
  total_bill_y = c(
    "20.75", "*", "18.52", "*", "27.3", "23.6", "19.75",
    "27.3", "21.48", "13.66", "15.59", "17.3", "14.78"
  )
)
我想执行 `sum(dat$total_bill_x)` 和 `sum(dat$total_bill_y)` 这样的操作。
这些列是 `factor` 类型,需要先转换为 `numeric`。像 `*` 这样的元素在转换过程中会变成 `NA`,并给出一条提示性警告。
library(dplyr)
# Sum every column whose name starts with "total".
# Each column is converted to text and then to numbers; "*" cannot be parsed,
# so it becomes NA (with a coercion warning) and na.rm = TRUE skips it.
dat %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) sum(as.numeric(as.character(col)), na.rm = TRUE)
  )
如果我们不想要警告,一种选择是在执行 `sum` 之前,先有选择地删除那些 `*` 元素:
# Warning-free variant: remove the "*" entries first, so everything handed to
# as.numeric() parses cleanly and no na.rm is needed.
dat %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) {
      kept <- col[col != "*"]
      sum(as.numeric(as.character(kept)))
    }
  )
# total_bill_x total_bill_y
#1 219.42 220.03
或 base R
# Base R equivalent: for every column except the first one (time), drop the
# "*" entries and sum what is left.
# vapply() with FUN.VALUE = numeric(1) always yields a named numeric vector,
# whereas the return type of sapply() depends on its input.
vapply(
  dat[-1],
  function(x) sum(as.numeric(as.character(x[x != "*"]))),
  FUN.VALUE = numeric(1)
)
# total_bill_x total_bill_y
# 219.42 220.03
或者将 `*` 更改为 `NA`,转换为 `numeric`,然后提取列并求 `sum`:
# Overwrite every column except the first with a numeric version:
# take the text form, blank out the "*" positions with NA, then convert.
dat[-1] <- lapply(dat[-1], function(col) {
  as_text <- as.character(col)
  as_text[col == "*"] <- NA
  as.numeric(as_text)
})
# The columns are numeric now, so a plain sum that skips NA is enough.
sum(dat$total_bill_x, na.rm = TRUE)
如果我们想进行分组操作,一种选择是使用 `na_if`(来自 `dplyr`)将 `*` 转换为 `NA`,再用 `as.numeric` 转换为 `numeric`,按 'time' 分组,在 `summarise` 中得到该列的 `sum`,然后用 'total' 除以 'total' 的 `sum` 来创建 'pourcentage' 列:
# Total of total_bill_x per meal, plus each meal's share of the overall total.
# The columns are converted to character before na_if()/as.numeric(): calling
# as.numeric() directly on a factor returns its internal level codes
# (1, 2, ...) rather than the amounts, which silently yields wrong totals.
dat %>%
  mutate_at(
    vars(starts_with('total')),
    ~ as.numeric(na_if(as.character(.), "*"))
  ) %>%
  group_by(time) %>%
  summarise(total = sum(total_bill_x, na.rm = TRUE)) %>%
  mutate(pourcentage = total / sum(total) * 100)
# A tibble: 4 x 3
# time total pourcentage
# <fct> <dbl> <dbl>
#1 Breakfast 45.2 20.6
#2 Lunch 51.4 23.4
#3 Dinner 53 24.2
#4 Snack 69.8 31.8
我们也可以
# Parse the "total" columns as numbers ("*" becomes NA), then sum each one.
# The naming argument of across() is `.names` and its glue placeholder is
# `{.col}`; `names = "total_{col}"` is not an across() argument, so it would
# not rename anything.
# Resulting columns: total_total_bill_x and total_total_bill_y.
dat %>%
  mutate(across(starts_with('total'), readr::parse_number)) %>%
  summarise(across(
    starts_with('total'),
    ~ sum(., na.rm = TRUE),
    .names = "total_{.col}"
  ))
我们可以使用readr::parse_number
library(dplyr)
# Same totals via readr: parse_number() converts the text columns to numbers
# (an entry such as "*" that holds no number becomes NA), and the sums skip NA.
dat %>%
  mutate_at(vars(starts_with("total")), ~ readr::parse_number(.)) %>%
  summarise_at(vars(starts_with("total")), ~ sum(., na.rm = TRUE))
# total_bill_x total_bill_y
#1 219.42 220.03
按组(即 `time`)求和:
# Per-meal totals of both bill columns: parse the text columns to numbers,
# group the rows by time, then sum each "total" column while skipping NA.
dat %>%
  mutate_at(vars(starts_with("total")), ~ readr::parse_number(.)) %>%
  group_by(time) %>%
  summarise_at(
    vars(starts_with("total")),
    function(col) sum(col, na.rm = TRUE)
  )
# time total_bill_x total_bill_y
# <fct> <dbl> <dbl>
#1 Breakfast 45.2 48.0
#2 Lunch 51.4 34.1
#3 Dinner 53 70.6
#4 Snack 69.8 67.2