如果匹配条件,则删除字符串中的最后两个字符
Delete last two characters in string if they match criteria
我的数据库中有 200 万个名字。例如:
# Example data: a single `names` column holding four sample names.
df <- data.frame(
  names = c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER")
)
> df
names
1 A ADAM
2 S BEAN
3 A APPLE A
4 A SCHWARZENEGGER
如果字符串的最后两个字符是 ' A'(一个空格后跟 A),我想把它们删除。
我知道正则表达式是我们的朋友。如何有效地将正则表达式函数应用于字符串的最后两个字符?
期望的输出:
> output
names
1 A ADAM
2 S BEAN
3 A APPLE
4 A SCHWARZENEGGER
我们可以用 sub 匹配一个空白字符(正则 \s,在 R 字符串中需写作 "\\s")后跟位于字符串末尾($)的 'A',并将其替换为空字符串("")。
# Remove a trailing whitespace character followed by "A" from each name.
# The backslash is doubled because "\s" is not a valid escape sequence in an
# R string literal (it is a parse error); the regex engine receives \sA$.
df$names <- sub("\\sA$", "", df$names)
df$names
#[1] "A ADAM" "S BEAN" "A APPLE" "A SCHWARZENEGGER"
@akrun 的回答当然是正确的,但根据评论,当列为 factor 时,我想再补充一点。
在评论中使用@vincentmajor 的例子:
# Build two identical 2,000,000-row data frames for the timing comparison.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 data.frame() keeps
# strings as character by default, and the levels()-based variant below only
# makes sense when the column is a factor.
df <- df2 <- data.frame(
  names = rep(
    c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER"),
    length.out = 2000000
  ),
  stringsAsFactors = TRUE
)
# Probably we want the column to remain factor after substitution
# ("\\s" is the R string-literal spelling of the regex token \s).
system.time(
  df$names <- factor(sub("\\sA$", "", df$names))
)
# user system elapsed
# 0.892 0.000 0.893
# Also if there are a lot of duplicates, like in this example,
# substituting the levels is way quicker: the regex runs once per distinct
# level instead of once per row.
system.time(
  levels(df2$names) <- sub("\\sA$", "", levels(df2$names))
)
# user system elapsed
# 0.052 0.000 0.053
如果您希望对数百万条记录有良好的性能,stringi
包就是您所需要的。它甚至优于基本的 R 函数:
# Benchmark setup. library() is used instead of require() so that a missing
# stringi installation fails here rather than at the first stri_* call.
library(stringi)
n <- 10000
# n random strings; the requested lengths 1:100 are recycled across them.
x <- stri_rand_strings(n, 1:100)
# Append the " A" suffix to a random 1% of the strings.
ind <- sample(n, n / 100)
x[ind] <- stri_paste(x[ind], " A")
# Base R implementation: strip a trailing whitespace character followed by
# "A" from every element of the character vector `x`.
# Returns a character vector of the same length as `x`.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
baseR <- function(x) {
  sub("\\sA$", "", x)
}
# stringi implementation using a regex replacement: removes a trailing
# whitespace character followed by "A" from every element of `x`.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
stri1 <- function(x) {
  stri_replace_last_regex(x, "\\sA$", "")
}
# stringi implementation that uses the regex only for detection: elements of
# `x` ending in a whitespace character followed by "A" have their last two
# characters dropped via substring extraction; other elements are untouched.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
stri2 <- function(x) {
  ind <- stri_detect_regex(x, "\\sA$")
  # Keep characters from the first up to the third-from-last.
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}
# If the separator can only be a literal space (not any whitespace
# character), a fixed-string suffix test is enough and no regex is needed.
# In the benchmark results this is the fastest variant (ca 200x faster than
# the base R version).
stri3 <- function(x) {
  has_suffix <- stri_endswith_fixed(x, " A")
  # Drop the last two characters of the matching elements only.
  x[has_suffix] <- stri_sub(x[has_suffix], 1, -3)
  x
}
# Preview the first 44 cleaned strings, then time the four implementations.
head(stri2(x), 44)
# library() instead of require(): a missing microbenchmark package fails here
# instead of surfacing as "could not find function" on the next line.
library(microbenchmark)
microbenchmark(baseR(x), stri1(x), stri2(x), stri3(x))
Unit: microseconds
expr min lq mean median uq max neval
baseR(x) 166044.032 172054.30 183919.6684 183112.1765 194586.231 219207.905 100
stri1(x) 36704.180 39015.59 41836.8612 40164.9365 43773.034 60373.866 100
stri2(x) 17736.535 18884.56 20575.3306 19818.2895 21759.489 31846.582 100
stri3(x) 491.963 802.27 918.1626 868.9935 1008.776 2489.923 100
也许不是最快的解决方案,但这也行得通:
# library() instead of require() so a missing stringi installation fails
# at this line rather than at the first stri_* call.
library(stringi)
# Ten random strings of lengths 1 to 10; five randomly chosen ones get the
# " A" suffix appended.
x <- stri_rand_strings(10, 1:10)
ind <- sample(10, 5)
x[ind] <- stri_paste(x[ind], " A")
x
# [1] "z A" "hX" "uv0 A" "HQtD A" "kTNZh" "4SIVBh" "v28UrqS A" "uskxxNkl A"
# [9] "dKxloBsA6" "sRkCQp7sn4"
# Flag the strings whose last two characters are exactly " A", then drop
# those two characters from the flagged strings only.
y <- stri_sub(x, -2, -1) == " A"
x[y] <- stri_sub(x[y], 1, -3)
x
# [1] "z" "hX" "uv0" "HQtD" "kTNZh" "4SIVBh" "v28UrqS" "uskxxNkl"
# [9] "dKxloBsA6" "sRkCQp7sn4"
我的数据库中有 200 万个名字。例如:
# Example data: a single `names` column holding four sample names.
df <- data.frame(
  names = c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER")
)
> df
names
1 A ADAM
2 S BEAN
3 A APPLE A
4 A SCHWARZENEGGER
如果字符串的最后两个字符是 ' A'(一个空格后跟 A),我想把它们删除。
我知道正则表达式是我们的朋友。如何有效地将正则表达式函数应用于字符串的最后两个字符?
期望的输出:
> output
names
1 A ADAM
2 S BEAN
3 A APPLE
4 A SCHWARZENEGGER
我们可以用 sub 匹配一个空白字符(正则 \s,在 R 字符串中需写作 "\\s")后跟位于字符串末尾($)的 'A',并将其替换为空字符串("")。
# Remove a trailing whitespace character followed by "A" from each name.
# The backslash is doubled because "\s" is not a valid escape sequence in an
# R string literal (it is a parse error); the regex engine receives \sA$.
df$names <- sub("\\sA$", "", df$names)
df$names
#[1] "A ADAM" "S BEAN" "A APPLE" "A SCHWARZENEGGER"
@akrun 的回答当然是正确的,但根据评论,当列为 factor 时,我想再补充一点。
在评论中使用@vincentmajor 的例子:
# Build two identical 2,000,000-row data frames for the timing comparison.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 data.frame() keeps
# strings as character by default, and the levels()-based variant below only
# makes sense when the column is a factor.
df <- df2 <- data.frame(
  names = rep(
    c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER"),
    length.out = 2000000
  ),
  stringsAsFactors = TRUE
)
# Probably we want the column to remain factor after substitution
# ("\\s" is the R string-literal spelling of the regex token \s).
system.time(
  df$names <- factor(sub("\\sA$", "", df$names))
)
# user system elapsed
# 0.892 0.000 0.893
# Also if there are a lot of duplicates, like in this example,
# substituting the levels is way quicker: the regex runs once per distinct
# level instead of once per row.
system.time(
  levels(df2$names) <- sub("\\sA$", "", levels(df2$names))
)
# user system elapsed
# 0.052 0.000 0.053
如果您希望对数百万条记录有良好的性能,stringi
包就是您所需要的。它甚至优于基本的 R 函数:
# Benchmark setup. library() is used instead of require() so that a missing
# stringi installation fails here rather than at the first stri_* call.
library(stringi)
n <- 10000
# n random strings; the requested lengths 1:100 are recycled across them.
x <- stri_rand_strings(n, 1:100)
# Append the " A" suffix to a random 1% of the strings.
ind <- sample(n, n / 100)
x[ind] <- stri_paste(x[ind], " A")
# Base R implementation: strip a trailing whitespace character followed by
# "A" from every element of the character vector `x`.
# Returns a character vector of the same length as `x`.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
baseR <- function(x) {
  sub("\\sA$", "", x)
}
# stringi implementation using a regex replacement: removes a trailing
# whitespace character followed by "A" from every element of `x`.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
stri1 <- function(x) {
  stri_replace_last_regex(x, "\\sA$", "")
}
# stringi implementation that uses the regex only for detection: elements of
# `x` ending in a whitespace character followed by "A" have their last two
# characters dropped via substring extraction; other elements are untouched.
# The pattern is written "\\sA$" because "\s" is not a valid escape in an R
# string literal; the regex engine receives \sA$.
stri2 <- function(x) {
  ind <- stri_detect_regex(x, "\\sA$")
  # Keep characters from the first up to the third-from-last.
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}
# If the separator can only be a literal space (not any whitespace
# character), a fixed-string suffix test is enough and no regex is needed.
# In the benchmark results this is the fastest variant (ca 200x faster than
# the base R version).
stri3 <- function(x) {
  has_suffix <- stri_endswith_fixed(x, " A")
  # Drop the last two characters of the matching elements only.
  x[has_suffix] <- stri_sub(x[has_suffix], 1, -3)
  x
}
# Preview the first 44 cleaned strings, then time the four implementations.
head(stri2(x), 44)
# library() instead of require(): a missing microbenchmark package fails here
# instead of surfacing as "could not find function" on the next line.
library(microbenchmark)
microbenchmark(baseR(x), stri1(x), stri2(x), stri3(x))
Unit: microseconds
expr min lq mean median uq max neval
baseR(x) 166044.032 172054.30 183919.6684 183112.1765 194586.231 219207.905 100
stri1(x) 36704.180 39015.59 41836.8612 40164.9365 43773.034 60373.866 100
stri2(x) 17736.535 18884.56 20575.3306 19818.2895 21759.489 31846.582 100
stri3(x) 491.963 802.27 918.1626 868.9935 1008.776 2489.923 100
也许不是最快的解决方案,但这也行得通:
# library() instead of require() so a missing stringi installation fails
# at this line rather than at the first stri_* call.
library(stringi)
# Ten random strings of lengths 1 to 10; five randomly chosen ones get the
# " A" suffix appended.
x <- stri_rand_strings(10, 1:10)
ind <- sample(10, 5)
x[ind] <- stri_paste(x[ind], " A")
x
# [1] "z A" "hX" "uv0 A" "HQtD A" "kTNZh" "4SIVBh" "v28UrqS A" "uskxxNkl A"
# [9] "dKxloBsA6" "sRkCQp7sn4"
# Flag the strings whose last two characters are exactly " A", then drop
# those two characters from the flagged strings only.
y <- stri_sub(x, -2, -1) == " A"
x[y] <- stri_sub(x[y], 1, -3)
x
# [1] "z" "hX" "uv0" "HQtD" "kTNZh" "4SIVBh" "v28UrqS" "uskxxNkl"
# [9] "dKxloBsA6" "sRkCQp7sn4"