数据框的元素作为 R 中新数据框的列名
Elements of a dataframe as the column names of a new dataframe in R
我在 r 中有以下名为 DF 的数据框:
1 2 3
1 VW Mercedes Audi
2 Porsche BMW VW
3 Audi Honda Toyota
4 Dodge Opel VW
5 Lexus Volvo BMW
6 Dodge VW Porsche
我想创建一个新数据框 (DF2),其中 DF 的每个元素都是新数据框的列名,而 DF 的列名是 DF2 的元素:
Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1 3 0 0 0 0 2 0 0 0 0 1
2 0 2 0 0 0 0 0 1 0 0 3
3 1 0 0 2 0 0 0 0 3 0 0
4 0 0 1 0 0 0 2 0 0 0 3
5 0 3 0 0 1 0 0 0 0 2 0
6 0 0 1 0 0 0 0 3 0 0 2
试试这个:
# Build a matrix with one row per row of DF and one column per distinct car.
# Each cell holds the column position (1, 2, 3, ...) at which that car occurs
# in the row, or 0 when the car does not occur in the row.
# NOTE: the data frame from the question is called DF (R is case-sensitive;
# a bare `df` would refer to the F-distribution density function from stats).
cells <- as.matrix(DF)
names <- unique(as.vector(cells))
x <- vapply(
  names,
  function(car) {
    # match() returns the column position directly, so the result does not
    # depend on the column names of DF being numeric strings ("1", "2", "3").
    as.numeric(apply(cells, 1, function(row) match(car, row, nomatch = 0L)))
  },
  numeric(nrow(cells))
)
x
VW Porsche Audi Dodge Lexus Mercedes BMW Honda Opel Volvo Toyota
[1,] 1 0 3 0 0 2 0 0 0 0 0
[2,] 3 1 0 0 0 0 2 0 0 0 0
[3,] 0 0 1 0 0 0 0 2 0 0 3
[4,] 3 0 0 1 0 0 0 0 2 0 0
[5,] 0 0 0 0 1 0 3 0 0 2 0
[6,] 2 3 0 1 0 0 0 0 0 0 0
这也有效:
# Example input: six rows with three car makes each. read.table() names the
# columns V1, V2, V3 because no header is supplied.
DF <- read.table( text =
" VW Mercedes Audi
Porsche BMW VW
Audi Honda Toyota
Dodge Opel VW
Lexus Volvo BMW
Dodge VW Porsche " )
# Character matrix view of DF (factor columns, if any, become strings).
DF1 <- as.matrix(DF)
storage.mode(DF1) <- "character"
# Sorted distinct car names become the columns of the result.
cars <- sort(unique(as.vector(DF1)))
# Preallocate the full result filled with 0 so nothing grows inside the loop.
DF2 <- data.frame(matrix(0, nrow(DF), length(cars)))
colnames(DF2) <- cars
col_positions <- seq_len(ncol(DF))
# seq_len() (rather than 1:nrow(DF)) keeps the loop from running on an empty DF.
for (i in seq_len(nrow(DF))) {
  # For row i, write each car's column position into the column named after it.
  DF2[i, DF1[i, ]] <- col_positions
}
这里的 for 循环是无害的,因为循环体内没有任何对象在不断增长(结果已预先分配)。
Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1 3 0 0 0 0 2 0 0 0 0 1
2 0 2 0 0 0 0 0 1 0 0 3
3 1 0 0 2 0 0 0 0 3 0 0
4 0 0 1 0 0 0 2 0 0 0 3
5 0 3 0 0 1 0 0 0 0 2 0
6 0 0 1 0 0 0 0 3 0 0 2
>
for 循环更快。很奇怪,不是吗?
library(microbenchmark)
#' Build the position table with a preallocated data frame and a for-loop.
#'
#' Reads the data frame `DF` from the enclosing environment (it is not passed
#' as an argument, so the benchmark can call `mra68()` without arguments).
#'
#' @return A data frame with one row per row of `DF` and one column per
#'   distinct value found in `DF` (column names sorted). Each cell holds the
#'   column position at which that value occurs in the row, or 0 if absent.
mra68 <- function() {
  # Character matrix view of DF (factor columns, if any, become strings).
  DF1 <- as.matrix(DF)
  storage.mode(DF1) <- "character"
  cars <- sort(unique(as.vector(DF1)))
  # Preallocate the whole result so nothing grows inside the loop.
  DF2 <- data.frame(matrix(0, nrow(DF), length(cars)))
  colnames(DF2) <- cars
  col_positions <- seq_len(ncol(DF))
  # seq_len() (rather than 1:nrow(DF)) skips the loop when DF has no rows.
  for (i in seq_len(nrow(DF))) {
    DF2[i, DF1[i, ]] <- col_positions
  }
  DF2
}
#' Build the position table by scanning every row once per distinct car.
#'
#' Reads the data frame `DF` from the enclosing environment (it is not passed
#' as an argument, so the benchmark can call `DatamineR()` without arguments).
#'
#' The column position is computed with `match()`, so the result no longer
#' depends on the column names of `DF` being numeric strings; with names such
#' as V1, V2, V3 the previous `as.numeric(names(DF)[...])` approach produced
#' only zeros.
#'
#' @return For a `DF` with more than one row, a numeric matrix with one row
#'   per row of `DF` and one column per distinct value in `DF` (in order of
#'   first appearance, column by column). Each cell holds the column position
#'   at which that value first occurs in the row, or 0 if it does not occur.
DatamineR <- function() {
  cells <- as.matrix(DF)
  cars <- unique(as.vector(cells))
  vapply(
    cars,
    function(car) {
      as.numeric(apply(cells, 1, function(row) match(car, row, nomatch = 0L)))
    },
    numeric(nrow(cells))
  )
}
# Compare the run times of the two implementations defined above.
microbenchmark(mra68(), DatamineR())
.
> microbenchmark( mra68(), DatamineR() )
Unit: milliseconds
expr min lq mean median uq max neval
mra68() 2.360912 4.618337 4.74136 4.738126 4.931509 8.496653 100
DatamineR() 8.151552 16.083225 16.42256 16.284309 16.480636 20.860074 100
另一种选择:
library(tidyr)
library(dplyr)
# Reshape DF to long form (row id, column position, car) and then spread the
# cars out as columns, filling absent cars with 0.
# Uses pivot_longer()/pivot_wider() in place of the superseded gather()/spread()
# and avoids the deprecated add_rownames().
DF %>%
  # Rename columns to their positions so the result does not depend on the
  # original column names being numeric.
  setNames(seq_along(.)) %>%
  mutate(rowname = row_number()) %>%
  pivot_longer(
    -rowname,
    names_to = "key",
    values_to = "value",
    names_transform = list(key = as.numeric)
  ) %>%
  # names_sort = TRUE orders the car columns alphabetically, as spread() did.
  pivot_wider(
    names_from = value,
    values_from = key,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  select(-rowname)
给出:
#Source: local data frame [6 x 11]
#
# Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
# (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
#1 3 0 0 0 0 2 0 0 0 0 1
#2 0 2 0 0 0 0 0 1 0 0 3
#3 1 0 0 2 0 0 0 0 3 0 0
#4 0 0 1 0 0 0 2 0 0 0 3
#5 0 3 0 0 1 0 0 0 0 2 0
#6 0 0 1 0 0 0 0 3 0 0 2
这是使用 reshape2 中 acast 的另一个选项:
library(reshape2)
# Melt the character matrix to long form (Var1 = row index, Var2 = column
# index, value = car), then cast to one column per car holding the column index.
# unname() drops the column names so that Var2 is the integer column position
# regardless of what the columns of DF are called. The data frame from the
# question is named DF (R is case-sensitive).
acast(melt(unname(as.matrix(DF))), Var1 ~ value, value.var = "Var2", fill = 0)
# Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
#1 3 0 0 0 0 2 0 0 0 0 1
#2 0 2 0 0 0 0 0 1 0 0 3
#3 1 0 0 2 0 0 0 0 3 0 0
#4 0 0 1 0 0 0 2 0 0 0 3
#5 0 3 0 0 1 0 0 0 0 2 0
#6 0 0 1 0 0 0 0 3 0 0 2
我在 r 中有以下名为 DF 的数据框:
1 2 3
1 VW Mercedes Audi
2 Porsche BMW VW
3 Audi Honda Toyota
4 Dodge Opel VW
5 Lexus Volvo BMW
6 Dodge VW Porsche
我想创建一个新数据框 (DF2),其中 DF 的每个元素都是新数据框的列名,而 DF 的列名是 DF2 的元素:
Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1 3 0 0 0 0 2 0 0 0 0 1
2 0 2 0 0 0 0 0 1 0 0 3
3 1 0 0 2 0 0 0 0 3 0 0
4 0 0 1 0 0 0 2 0 0 0 3
5 0 3 0 0 1 0 0 0 0 2 0
6 0 0 1 0 0 0 0 3 0 0 2
试试这个:
# Build a matrix with one row per row of DF and one column per distinct car.
# Each cell holds the column position (1, 2, 3, ...) at which that car occurs
# in the row, or 0 when the car does not occur in the row.
# NOTE: the data frame from the question is called DF (R is case-sensitive;
# a bare `df` would refer to the F-distribution density function from stats).
cells <- as.matrix(DF)
names <- unique(as.vector(cells))
x <- vapply(
  names,
  function(car) {
    # match() returns the column position directly, so the result does not
    # depend on the column names of DF being numeric strings ("1", "2", "3").
    as.numeric(apply(cells, 1, function(row) match(car, row, nomatch = 0L)))
  },
  numeric(nrow(cells))
)
x
VW Porsche Audi Dodge Lexus Mercedes BMW Honda Opel Volvo Toyota
[1,] 1 0 3 0 0 2 0 0 0 0 0
[2,] 3 1 0 0 0 0 2 0 0 0 0
[3,] 0 0 1 0 0 0 0 2 0 0 3
[4,] 3 0 0 1 0 0 0 0 2 0 0
[5,] 0 0 0 0 1 0 3 0 0 2 0
[6,] 2 3 0 1 0 0 0 0 0 0 0
这也有效:
# Example input: six rows with three car makes each. read.table() names the
# columns V1, V2, V3 because no header is supplied.
DF <- read.table( text =
" VW Mercedes Audi
Porsche BMW VW
Audi Honda Toyota
Dodge Opel VW
Lexus Volvo BMW
Dodge VW Porsche " )
# Character matrix view of DF (factor columns, if any, become strings).
DF1 <- as.matrix(DF)
storage.mode(DF1) <- "character"
# Sorted distinct car names become the columns of the result.
cars <- sort(unique(as.vector(DF1)))
# Preallocate the full result filled with 0 so nothing grows inside the loop.
DF2 <- data.frame(matrix(0, nrow(DF), length(cars)))
colnames(DF2) <- cars
col_positions <- seq_len(ncol(DF))
# seq_len() (rather than 1:nrow(DF)) keeps the loop from running on an empty DF.
for (i in seq_len(nrow(DF))) {
  # For row i, write each car's column position into the column named after it.
  DF2[i, DF1[i, ]] <- col_positions
}
这里的 for 循环是无害的,因为循环体内没有任何对象在不断增长(结果已预先分配)。
Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1 3 0 0 0 0 2 0 0 0 0 1
2 0 2 0 0 0 0 0 1 0 0 3
3 1 0 0 2 0 0 0 0 3 0 0
4 0 0 1 0 0 0 2 0 0 0 3
5 0 3 0 0 1 0 0 0 0 2 0
6 0 0 1 0 0 0 0 3 0 0 2
>
for 循环更快。很奇怪,不是吗?
library(microbenchmark)
#' Build the position table with a preallocated data frame and a for-loop.
#'
#' Reads the data frame `DF` from the enclosing environment (it is not passed
#' as an argument, so the benchmark can call `mra68()` without arguments).
#'
#' @return A data frame with one row per row of `DF` and one column per
#'   distinct value found in `DF` (column names sorted). Each cell holds the
#'   column position at which that value occurs in the row, or 0 if absent.
mra68 <- function() {
  # Character matrix view of DF (factor columns, if any, become strings).
  DF1 <- as.matrix(DF)
  storage.mode(DF1) <- "character"
  cars <- sort(unique(as.vector(DF1)))
  # Preallocate the whole result so nothing grows inside the loop.
  DF2 <- data.frame(matrix(0, nrow(DF), length(cars)))
  colnames(DF2) <- cars
  col_positions <- seq_len(ncol(DF))
  # seq_len() (rather than 1:nrow(DF)) skips the loop when DF has no rows.
  for (i in seq_len(nrow(DF))) {
    DF2[i, DF1[i, ]] <- col_positions
  }
  DF2
}
#' Build the position table by scanning every row once per distinct car.
#'
#' Reads the data frame `DF` from the enclosing environment (it is not passed
#' as an argument, so the benchmark can call `DatamineR()` without arguments).
#'
#' The column position is computed with `match()`, so the result no longer
#' depends on the column names of `DF` being numeric strings; with names such
#' as V1, V2, V3 the previous `as.numeric(names(DF)[...])` approach produced
#' only zeros.
#'
#' @return For a `DF` with more than one row, a numeric matrix with one row
#'   per row of `DF` and one column per distinct value in `DF` (in order of
#'   first appearance, column by column). Each cell holds the column position
#'   at which that value first occurs in the row, or 0 if it does not occur.
DatamineR <- function() {
  cells <- as.matrix(DF)
  cars <- unique(as.vector(cells))
  vapply(
    cars,
    function(car) {
      as.numeric(apply(cells, 1, function(row) match(car, row, nomatch = 0L)))
    },
    numeric(nrow(cells))
  )
}
# Compare the run times of the two implementations defined above.
microbenchmark(mra68(), DatamineR())
.
> microbenchmark( mra68(), DatamineR() )
Unit: milliseconds
expr min lq mean median uq max neval
mra68() 2.360912 4.618337 4.74136 4.738126 4.931509 8.496653 100
DatamineR() 8.151552 16.083225 16.42256 16.284309 16.480636 20.860074 100
另一种选择:
library(tidyr)
library(dplyr)
# Reshape DF to long form (row id, column position, car) and then spread the
# cars out as columns, filling absent cars with 0.
# Uses pivot_longer()/pivot_wider() in place of the superseded gather()/spread()
# and avoids the deprecated add_rownames().
DF %>%
  # Rename columns to their positions so the result does not depend on the
  # original column names being numeric.
  setNames(seq_along(.)) %>%
  mutate(rowname = row_number()) %>%
  pivot_longer(
    -rowname,
    names_to = "key",
    values_to = "value",
    names_transform = list(key = as.numeric)
  ) %>%
  # names_sort = TRUE orders the car columns alphabetically, as spread() did.
  pivot_wider(
    names_from = value,
    values_from = key,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  select(-rowname)
给出:
#Source: local data frame [6 x 11]
#
# Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
# (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
#1 3 0 0 0 0 2 0 0 0 0 1
#2 0 2 0 0 0 0 0 1 0 0 3
#3 1 0 0 2 0 0 0 0 3 0 0
#4 0 0 1 0 0 0 2 0 0 0 3
#5 0 3 0 0 1 0 0 0 0 2 0
#6 0 0 1 0 0 0 0 3 0 0 2
这是使用 reshape2 中 acast 的另一个选项:
library(reshape2)
# Melt the character matrix to long form (Var1 = row index, Var2 = column
# index, value = car), then cast to one column per car holding the column index.
# unname() drops the column names so that Var2 is the integer column position
# regardless of what the columns of DF are called. The data frame from the
# question is named DF (R is case-sensitive).
acast(melt(unname(as.matrix(DF))), Var1 ~ value, value.var = "Var2", fill = 0)
# Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
#1 3 0 0 0 0 2 0 0 0 0 1
#2 0 2 0 0 0 0 0 1 0 0 3
#3 1 0 0 2 0 0 0 0 3 0 0
#4 0 0 1 0 0 0 2 0 0 0 3
#5 0 3 0 0 1 0 0 0 0 2 0
#6 0 0 1 0 0 0 0 3 0 0 2