R 中 for 循环返回了重复的数据框(data frame)行
Duplicated dataframe rows returned by for loop in R programming
我目前正在根据计算添加新列。这是示例数据
# Example data: two numeric features, each with a missing value in row 3,
# plus the industry every row belongs to.
REC <- c(237, 1781, NA, 3710, 2099)
S <- c(2509, 25616, NaN, 19224, 6569)
Industry <- rep(c("ABC", "CDE"), times = c(3L, 2L))
data <- data.frame(REC = REC, S = S, Industry = Industry)
我想对新添加的列应用单位长度缩放。为此,我写了这段代码
# Accumulator for the scaled rows; every (industry, feature) iteration below
# appends to it.
data2 = data.frame()
# Outer loop: one pass per distinct Industry value.
foreach(i = unique(data$Industry)) %do% {
# Inner loop: one pass per feature name in `fnames`.
# NOTE(review): `fnames` is not defined in this snippet - presumably
# c("REC", "S"); confirm.
foreach(j = fnames) %do% {
dataOrg = data
# Calculate unit length per feature
# Rows of industry `i` whose column `j` holds a finite value.
dataFin = dataOrg[dataOrg[,"Industry"] == i & is.finite(dataOrg[,j]), ] # finite rows of industry i only
# Rows whose column `j` is not finite. There is no Industry condition here,
# so these rows are selected on every iteration, whatever `i` is.
data1 = dplyr::filter(dataOrg[!is.finite(dataOrg[,j]), ]) # non-finite rows, all industries
# Divide column `j` by the square root of its sum of squares and store the
# result in a new "<j>_uLen" column of dataFin.
dataFin[ , sprintf("%s_uLen", j)] = dataFin[, j] / sqrt(sum(dataFin[, j]^2)) # Calculation
# Both pieces are appended as NEW rows to everything collected so far. The
# "<j>_uLen" column exists only in dataFin, so each pass adds rows instead of
# adding a column to the rows already present in data2.
data2 = data2 %>%
dplyr::bind_rows(data1, dataFin)
}
}
这是每次迭代后的输出
[[1]]
[[1]][[1]]
REC S Industry REC_uLen
1 NA NaN ABC NA
2 237 2509 ABC 0.1319085
3 1781 25616 ABC 0.9912619
[[1]][[2]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
[[2]]
[[2]][[1]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
7 NA NaN ABC NA NA
8 3710 19224 CDE 0.8703574 NA
9 2099 6569 CDE 0.4924205 NA
[[2]][[2]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
7 NA NaN ABC NA NA
8 3710 19224 CDE 0.8703574 NA
9 2099 6569 CDE 0.4924205 NA
10 NA NaN ABC NA NA
11 3710 19224 CDE NA 0.94627897
12 2099 6569 CDE NA 0.32335136
每一步都会新增 3 行数据。我希望我的输出仍然只包含原来的 5 行数据,并带上新添加的列。
这是预期的输出
REC S Industry REC_uLen S_uLen
1 237 2509 ABC 0.1319085 0.09748012
2 1781 25616 ABC 0.9912619 0.99523747
3 NA NaN ABC NA NA
4 3710 19224 CDE 0.8703574 0.94627897
5 2099 6569 CDE 0.4924205 0.32335136
这是我用按行合并(bind_rows)等方式的解决思路:
library(foreach)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Example data: two numeric features, each with a missing value in row 3,
# plus the industry every row belongs to.
REC <- c(237, 1781, NA, 3710, 2099)
S <- c(2509, 25616, NaN, 19224, 6569)
Industry <- rep(c("ABC", "CDE"), times = c(3L, 2L))
data <- data.frame(REC = REC, S = S, Industry = Industry)
# Names of the feature columns that get a "<name>_uLen" companion column.
fnames <- c("REC", "S")
# Result accumulator: the scaled rows of each industry are appended in turn.
out <- NULL
# The outer foreach() collects the value of its last expression, which is
# why the running `out` is echoed once per industry in the output below.
foreach(i = unique(data$Industry)) %do% {
  # All rows of the current industry, non-finite values included.
  industry_rows <- subset(data, Industry == i)
  for (j in fnames) {
    # Unit-length scaling: divide the column by the square root of its sum
    # of squares, ignoring missing values when summing.
    col_norm <- sqrt(sum(industry_rows[[j]]^2, na.rm = TRUE))
    industry_rows[[sprintf("%s_uLen", j)]] <- industry_rows[[j]] / col_norm
  }
  out <- bind_rows(out, industry_rows)
}
#> [[1]]
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#>
#> [[2]]
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#> 4 3710 19224 CDE 0.8703574 0.94627897
#> 5 2099 6569 CDE 0.4924205 0.32335136
out
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#> 4 3710 19224 CDE 0.8703574 0.94627897
#> 5 2099 6569 CDE 0.4924205 0.32335136
由 reprex package (v2.0.1)
创建于 2022-02-16
我目前正在根据计算添加新列。这是示例数据
# Example data: two numeric features, each with a missing value in row 3,
# plus the industry every row belongs to.
REC <- c(237, 1781, NA, 3710, 2099)
S <- c(2509, 25616, NaN, 19224, 6569)
Industry <- rep(c("ABC", "CDE"), times = c(3L, 2L))
data <- data.frame(REC = REC, S = S, Industry = Industry)
我想对新添加的列应用单位长度缩放。为此,我写了这段代码
# Accumulator for the scaled rows; every (industry, feature) iteration below
# appends to it.
data2 = data.frame()
# Outer loop: one pass per distinct Industry value.
foreach(i = unique(data$Industry)) %do% {
# Inner loop: one pass per feature name in `fnames`.
# NOTE(review): `fnames` is not defined in this snippet - presumably
# c("REC", "S"); confirm.
foreach(j = fnames) %do% {
dataOrg = data
# Calculate unit length per feature
# Rows of industry `i` whose column `j` holds a finite value.
dataFin = dataOrg[dataOrg[,"Industry"] == i & is.finite(dataOrg[,j]), ] # finite rows of industry i only
# Rows whose column `j` is not finite. There is no Industry condition here,
# so these rows are selected on every iteration, whatever `i` is.
data1 = dplyr::filter(dataOrg[!is.finite(dataOrg[,j]), ]) # non-finite rows, all industries
# Divide column `j` by the square root of its sum of squares and store the
# result in a new "<j>_uLen" column of dataFin.
dataFin[ , sprintf("%s_uLen", j)] = dataFin[, j] / sqrt(sum(dataFin[, j]^2)) # Calculation
# Both pieces are appended as NEW rows to everything collected so far. The
# "<j>_uLen" column exists only in dataFin, so each pass adds rows instead of
# adding a column to the rows already present in data2.
data2 = data2 %>%
dplyr::bind_rows(data1, dataFin)
}
}
这是每次迭代后的输出
[[1]]
[[1]][[1]]
REC S Industry REC_uLen
1 NA NaN ABC NA
2 237 2509 ABC 0.1319085
3 1781 25616 ABC 0.9912619
[[1]][[2]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
[[2]]
[[2]][[1]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
7 NA NaN ABC NA NA
8 3710 19224 CDE 0.8703574 NA
9 2099 6569 CDE 0.4924205 NA
[[2]][[2]]
REC S Industry REC_uLen S_uLen
1 NA NaN ABC NA NA
2 237 2509 ABC 0.1319085 NA
3 1781 25616 ABC 0.9912619 NA
4 NA NaN ABC NA NA
5 237 2509 ABC NA 0.09748012
6 1781 25616 ABC NA 0.99523747
7 NA NaN ABC NA NA
8 3710 19224 CDE 0.8703574 NA
9 2099 6569 CDE 0.4924205 NA
10 NA NaN ABC NA NA
11 3710 19224 CDE NA 0.94627897
12 2099 6569 CDE NA 0.32335136
每一步都会新增 3 行数据。我希望我的输出仍然只包含原来的 5 行数据,并带上新添加的列。
这是预期的输出
REC S Industry REC_uLen S_uLen
1 237 2509 ABC 0.1319085 0.09748012
2 1781 25616 ABC 0.9912619 0.99523747
3 NA NaN ABC NA NA
4 3710 19224 CDE 0.8703574 0.94627897
5 2099 6569 CDE 0.4924205 0.32335136
这是我用按行合并(bind_rows)等方式的解决思路:
library(foreach)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Example data: two numeric features, each with a missing value in row 3,
# plus the industry every row belongs to.
REC <- c(237, 1781, NA, 3710, 2099)
S <- c(2509, 25616, NaN, 19224, 6569)
Industry <- rep(c("ABC", "CDE"), times = c(3L, 2L))
data <- data.frame(REC = REC, S = S, Industry = Industry)
# Names of the feature columns that get a "<name>_uLen" companion column.
fnames <- c("REC", "S")
# Result accumulator: the scaled rows of each industry are appended in turn.
out <- NULL
# The outer foreach() collects the value of its last expression, which is
# why the running `out` is echoed once per industry in the output below.
foreach(i = unique(data$Industry)) %do% {
  # All rows of the current industry, non-finite values included.
  industry_rows <- subset(data, Industry == i)
  for (j in fnames) {
    # Unit-length scaling: divide the column by the square root of its sum
    # of squares, ignoring missing values when summing.
    col_norm <- sqrt(sum(industry_rows[[j]]^2, na.rm = TRUE))
    industry_rows[[sprintf("%s_uLen", j)]] <- industry_rows[[j]] / col_norm
  }
  out <- bind_rows(out, industry_rows)
}
#> [[1]]
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#>
#> [[2]]
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#> 4 3710 19224 CDE 0.8703574 0.94627897
#> 5 2099 6569 CDE 0.4924205 0.32335136
out
#> REC S Industry REC_uLen S_uLen
#> 1 237 2509 ABC 0.1319085 0.09748012
#> 2 1781 25616 ABC 0.9912619 0.99523747
#> 3 NA NaN ABC NA NaN
#> 4 3710 19224 CDE 0.8703574 0.94627897
#> 5 2099 6569 CDE 0.4924205 0.32335136
由 reprex package (v2.0.1)
创建于 2022-02-16