删除R中的连续空行
Delete consecutive empty rows in R
df
显示可能的名称匹配。每对匹配项之间应由一个空行分隔。但是,在某些情况下,我的输出在匹配对之间包含多个空行:
> df <- data.frame(id = c(1,2,NA,3,4,NA,NA,NA,5,6,NA), name = c("john jones", "john joners",
NA, "clara prat", "klara prat", NA, NA, NA, "alan turing", "allan turing",
NA), stringsAsFactors = F)
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 NA <NA>
8 NA <NA>
9 5 alan turing
10 6 allan turing
11 NA <NA>
期望的输出是:
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 5 alan turing
8 6 allan turing
9 NA <NA>
我可以用 for 循环来做到这一点,我知道这不是最佳选择。
也许这有帮助
# Flag rows in which every column is NA (the blank separator rows).
blank <- rowSums(!is.na(df)) == 0
# A blank row is redundant when the row directly above it is blank as well.
# Comparing each row with its predecessor (rather than splitting the rows into
# groups and calling which.max() on each) keeps every row of a group that has
# no blank row after it, e.g. when the data does not end with a separator.
follows_blank <- c(FALSE, head(blank, -1))
df[!(blank & follows_blank), ]
# id name
#1 1 john jones
#2 2 john joners
#3 NA <NA>
#4 3 clara prat
#5 4 klara prat
#6 NA <NA>
#9 5 alan turing
#10 6 allan turing
#11 NA <NA>
使用 IRanges
包。
df <- data.frame(
  id = c(1, 2, NA, 3, 4, NA, NA, NA, 5, 6, NA),
  name = c(
    "john jones", "john joners", NA, "clara prat", "klara prat",
    NA, NA, NA, "alan turing", "allan turing", NA
  ),
  stringsAsFactors = FALSE
)
library(IRanges)
# Positions of the rows where both columns are NA.
na.rs <- which(is.na(df$id) & is.na(df$name))
# Merge adjacent positions into ranges; each range is one run of blank rows.
na.rs.re <- reduce(IRanges(na.rs, na.rs))
# Only runs longer than one row contain surplus blank rows.
na.rs.rm <- na.rs.re[width(na.rs.re) > 1]
# Move each start forward by one so the first blank row of a run is kept.
start(na.rs.rm) <- start(na.rs.rm) + 1
# Negative indexing with an empty vector selects zero rows, so only subset
# when there is actually something to drop.
drop.rs <- as.integer(na.rs.rm)
if (length(drop.rs) > 0) df[-drop.rs, ] else df
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
这是另一种方法,使用 rle
来查找连续缺失值的游程(runs)
# Number of NA cells in each row.
na_per_row <- rowSums(is.na(df))
# Give every run of consecutive equal counts its own id, one id per row.
runs <- rle(na_per_row)
run_id <- rep(seq_along(runs$lengths), times = runs$lengths)
# A row is dropped when all of its columns are NA and it is not the first
# row of its run.
is_blank <- na_per_row == ncol(df)
repeats_run <- duplicated(run_id)
df[!(is_blank & repeats_run), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
正如 Akrun 所提到的,您可以使用 data.table::rleid
来避免一些显式的 rle
计算
# Flag the all-NA rows, then number the runs of that flag (not of the first
# column alone), so a blank row that directly follows a partially filled row
# starts a new run and is kept.
blank <- rowSums(is.na(df)) == ncol(df)
df[!(blank & duplicated(data.table::rleid(blank))), ]
当然不是最好的解决方案,但很容易理解。
# Loop version: walk the rows once and drop every all-NA row that directly
# follows another all-NA row. df is modified in place.
blank <- rowSums(is.na(df)) == ncol(df)
i <- 2L
# Bounding i by the current number of rows avoids reading past the end of the
# shrinking data frame, where df$id[i] is NA and no row can be removed.
while (i <= nrow(df)) {
  if (blank[i - 1L] && blank[i]) {
    df <- df[-i, , drop = FALSE]
    blank <- blank[-i]
    # Stay on i: the next row has moved into this position.
  } else {
    i <- i + 1L
  }
}
df
显示可能的名称匹配。每对匹配项之间应由一个空行分隔。但是,在某些情况下,我的输出在匹配对之间包含多个空行:
> df <- data.frame(id = c(1,2,NA,3,4,NA,NA,NA,5,6,NA), name = c("john jones", "john joners",
NA, "clara prat", "klara prat", NA, NA, NA, "alan turing", "allan turing",
NA), stringsAsFactors = F)
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 NA <NA>
8 NA <NA>
9 5 alan turing
10 6 allan turing
11 NA <NA>
期望的输出是:
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 5 alan turing
8 6 allan turing
9 NA <NA>
我可以用 for 循环来做到这一点,我知道这不是最佳选择。
也许这有帮助
# Flag rows in which every column is NA (the blank separator rows).
blank <- rowSums(!is.na(df)) == 0
# A blank row is redundant when the row directly above it is blank as well.
# Comparing each row with its predecessor (rather than splitting the rows into
# groups and calling which.max() on each) keeps every row of a group that has
# no blank row after it, e.g. when the data does not end with a separator.
follows_blank <- c(FALSE, head(blank, -1))
df[!(blank & follows_blank), ]
# id name
#1 1 john jones
#2 2 john joners
#3 NA <NA>
#4 3 clara prat
#5 4 klara prat
#6 NA <NA>
#9 5 alan turing
#10 6 allan turing
#11 NA <NA>
使用 IRanges
包。
df <- data.frame(
  id = c(1, 2, NA, 3, 4, NA, NA, NA, 5, 6, NA),
  name = c(
    "john jones", "john joners", NA, "clara prat", "klara prat",
    NA, NA, NA, "alan turing", "allan turing", NA
  ),
  stringsAsFactors = FALSE
)
library(IRanges)
# Positions of the rows where both columns are NA.
na.rs <- which(is.na(df$id) & is.na(df$name))
# Merge adjacent positions into ranges; each range is one run of blank rows.
na.rs.re <- reduce(IRanges(na.rs, na.rs))
# Only runs longer than one row contain surplus blank rows.
na.rs.rm <- na.rs.re[width(na.rs.re) > 1]
# Move each start forward by one so the first blank row of a run is kept.
start(na.rs.rm) <- start(na.rs.rm) + 1
# Negative indexing with an empty vector selects zero rows, so only subset
# when there is actually something to drop.
drop.rs <- as.integer(na.rs.rm)
if (length(drop.rs) > 0) df[-drop.rs, ] else df
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
这是另一种方法,使用 rle
来查找连续缺失值的游程(runs)
# Number of NA cells in each row.
na_per_row <- rowSums(is.na(df))
# Give every run of consecutive equal counts its own id, one id per row.
runs <- rle(na_per_row)
run_id <- rep(seq_along(runs$lengths), times = runs$lengths)
# A row is dropped when all of its columns are NA and it is not the first
# row of its run.
is_blank <- na_per_row == ncol(df)
repeats_run <- duplicated(run_id)
df[!(is_blank & repeats_run), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
正如 Akrun 所提到的,您可以使用 data.table::rleid
来避免一些显式的 rle
计算
# Flag the all-NA rows, then number the runs of that flag (not of the first
# column alone), so a blank row that directly follows a partially filled row
# starts a new run and is kept.
blank <- rowSums(is.na(df)) == ncol(df)
df[!(blank & duplicated(data.table::rleid(blank))), ]
当然不是最好的解决方案,但很容易理解。
# Loop version: walk the rows once and drop every all-NA row that directly
# follows another all-NA row. df is modified in place.
blank <- rowSums(is.na(df)) == ncol(df)
i <- 2L
# Bounding i by the current number of rows avoids reading past the end of the
# shrinking data frame, where df$id[i] is NA and no row can be removed.
while (i <= nrow(df)) {
  if (blank[i - 1L] && blank[i]) {
    df <- df[-i, , drop = FALSE]
    blank <- blank[-i]
    # Stay on i: the next row has moved into this position.
  } else {
    i <- i + 1L
  }
}