根据字母分隔列内的字母编号值
Separation of a letter-number value inside a column based on a letter
我在列内分隔时遇到问题
该列中的数据是设备位置的代码,类似于 SE005 或 H0002 或 MANA。 S 是一种移动设备,后面的字母表示它的使用位置。
SE005 是位置 E 的第 5 号移动设备。
H0002 是位置 H 的第 2 号固定设备。
MANA 是一个地方的设备
对于我在 Power BI 中的分析,我需要知道每个位置扫描了多少件商品,并不关心具体是哪一台设备。由于设备类型、位置和编号合并在同一列的值里,Power BI 无法按位置汇总,所以我想把这一列拆开。
我希望它看起来像这样。
v1 v2 v3
SE005 becomes S E 005 # 2 separations
H0002 becomes H 002 # 1 separation and one deleted number
MANA MANA # R should not change this, but it should be inside the same column as E and H
我必须将其应用于 800 万行。
而且我认为必须分两步或三步完成,首先将字母与数字分开。
请注意,字母比预览中的字母多。但是安排是一样的。感谢任何帮助。
编辑
只想拆分设备列,以便 power bi 可以使用它。
# Sample data: 100 article ids, each paired with a device code.
art <- seq_len(100L)

# The codes come in ten rows of ten. Seven positions are the same in every
# row; only the 2nd (H...), 3rd (E...) and 6th (X...) position vary.
device <- local({
  h_codes <- c("H005", "H005", "H007", "H008", "H005",
               "H0010", "H005", "H0010", "H005", "H009")
  e_codes <- c("E003", "E003", "E003", "E004", "E003",
               "E008", "E003", "E0010", "E003", "E003")
  x_codes <- c("X021", "X01", "X02", "X021", "X017",
               "X021", "X009", "X021", "X021", "X021")
  # rbind() recycles the constant positions across the ten rows; reading the
  # resulting matrix column by column yields the codes row after row.
  as.vector(rbind("SE05", h_codes, e_codes, "MANA", "J012",
                  x_codes, "W007", "MANA", "SE02", "H005"))
})

# One row per article; `device` stays a plain character column.
ACCEPT <- data.frame(art = art, device = device, stringsAsFactors = FALSE)
head(BLABLA)
Article device V3 V4
1 52290 SE05 20170223 162756
2 52300 SE05 20170223 162758
3 10090 SE05 20170223 162831
4 10060 SE08 20170223 162834
5 10070 SE08 20170223 162839
6 10070 SE08 20170223 162859
尝试 this site 以更好地理解正则表达式以及如何在您的案例中应用它。如果没有可重现的例子,就很难理解你的具体情况和你可能遇到的边缘情况。希望我下面的示例能帮助您入门:
编辑:更改我的答案以使用您的示例数据集
# Reproducible example data: article ids 1..100 and one device code each.
art <- seq_len(100L)

# Each block of ten codes follows the same template; only the second, third
# and sixth entry change from block to block.
device <- local({
  second <- c("H005", "H005", "H007", "H008", "H005",
              "H0010", "H005", "H0010", "H005", "H009")
  third <- c("E003", "E003", "E003", "E004", "E003",
             "E008", "E003", "E0010", "E003", "E003")
  sixth <- c("X021", "X01", "X02", "X021", "X017",
             "X021", "X009", "X021", "X021", "X021")
  # One matrix column per block of ten; as.vector() flattens column-wise,
  # so the blocks come out in their original order.
  blocks <- rbind("SE05", second, third, "MANA", "J012",
                  sixth, "W007", "MANA", "SE02", "H005")
  as.vector(blocks)
})

# Assemble the two-column data frame used by the code further down.
ACCEPT <- data.frame(art = art, stringsAsFactors = FALSE)
ACCEPT[["device"]] <- device
library(tidyverse)
library(magrittr)
library(stringr)
# Classify every device code by the shape of its prefix, then split it.
# Regex backslashes are doubled: inside an R string literal "\\" is what
# passes a single "\" on to the regex engine, whereas a bare "\D" or "\d"
# is rejected by the parser as an unrecognized escape.

# Find mobile devices
#   '^'      for start of string
#   '\\D{2}' for exactly two non-numeric characters
#   '\\d'    for a digit right after them
ACCEPT %<>% mutate(mobile = str_detect(device, pattern = "^\\D{2}\\d"))
# Now looking for exactly one letter at the start,
# followed by a number
ACCEPT %<>% mutate(immobile = str_detect(device, pattern = "^\\D\\d"))
# Finally, look for "no numbers"
# (alternatively, if all places have the same value, '== "MANA"' would do)
ACCEPT %<>% mutate(place = !str_detect(device, pattern = "\\d"))

# Split and process device types individually, then stack the pieces and
# restore the article order. Codes matching none of the three flags are
# removed by the filters.
#   v1 = first character (mobile codes only)
#   v2 = location character, or the whole code for places
#   v3 = first run of digits
bind_rows(
  ACCEPT %>%
    filter(mobile) %>%
    mutate(
      v1 = str_extract(device, pattern = "\\D"),
      v2 = str_sub(device, start = 2, end = 2),
      v3 = str_extract(device, pattern = "\\d{1,9}")
    ),
  ACCEPT %>%
    filter(immobile) %>%
    mutate(
      v1 = "",
      v2 = str_sub(device, start = 1, end = 1),
      v3 = str_extract(device, pattern = "\\d{1,9}")
    ),
  ACCEPT %>%
    filter(place) %>%
    mutate(
      v1 = "",
      v2 = device,
      v3 = ""
    )
) %>%
  arrange(art) %>%
  select(art, v1, v2, v3)
这是一个略短的版本,它不使用 dplyr。
# Base-R alternative: derive v2 (location), v3 (number) and v1 (device-type
# letter) from `device`. Fields that do not apply to a code stay NA.
# Regex backslashes are doubled: in an R string literal "\\" is what passes a
# single "\" on to the regex engine ("\D" does not parse, and "\1" is an
# octal character escape rather than a back-reference).
codes <- ACCEPT$device

# v2 gets all of 'device' so long as this is entirely alphabetical:
letters_only <- grepl("^[A-Z]+$", codes)
ACCEPT$v2 <- NA_character_
ACCEPT$v2[letters_only] <- codes[letters_only]

# v3 gets the number, if there is one: drop the leading non-digits.
ACCEPT$v3 <- NA_character_
ACCEPT$v3[!letters_only] <- sub("\\D+(\\d+)", "\\1", codes[!letters_only])

# A single letter before the number (e.g. "H005") is the location itself ...
one_letter <- !letters_only & grepl("^\\D\\d", codes)
# ... otherwise the first character is the device type and the second one
# is the location (e.g. "SE05").
two_letters <- !letters_only & !one_letter

# Create v1 as a full-length column first, so the masked assignment below
# cannot produce a vector shorter than the data frame.
ACCEPT$v1 <- NA_character_
ACCEPT$v1[two_letters] <- substr(codes[two_letters], 1, 1)
ACCEPT$v2[two_letters] <- substr(codes[two_letters], 2, 2)
ACCEPT$v2[one_letter] <- substr(codes[one_letter], 1, 1)
我在列内分隔时遇到问题
该列中的数据是设备位置的代码,类似于 SE005 或 H0002 或 MANA。 S 是一种移动设备,后面的字母表示它的使用位置。
SE005 是位置 E 的第 5 号移动设备。
H0002 是位置 H 的第 2 号固定设备。
MANA 是一个地方的设备
对于我在 Power BI 中的分析,我需要知道每个位置扫描了多少件商品,并不关心具体是哪一台设备。由于设备类型、位置和编号合并在同一列的值里,Power BI 无法按位置汇总,所以我想把这一列拆开。
我希望它看起来像这样。
v1 v2 v3
SE005 becomes S E 005 # 2 separations
H0002 becomes H 002 # 1 separation and one deleted number
MANA MANA # R should not change this, but it should be inside the same column as E and H
我必须将其应用于 800 万行。 而且我认为必须分两步或三步完成,首先将字母与数字分开。 请注意,字母比预览中的字母多。但是安排是一样的。感谢任何帮助。
编辑
只想拆分设备列,以便 power bi 可以使用它。
# Sample data: 100 article ids, each paired with a device code.
art <- seq_len(100L)

# The codes come in ten rows of ten. Seven positions are the same in every
# row; only the 2nd (H...), 3rd (E...) and 6th (X...) position vary.
device <- local({
  h_codes <- c("H005", "H005", "H007", "H008", "H005",
               "H0010", "H005", "H0010", "H005", "H009")
  e_codes <- c("E003", "E003", "E003", "E004", "E003",
               "E008", "E003", "E0010", "E003", "E003")
  x_codes <- c("X021", "X01", "X02", "X021", "X017",
               "X021", "X009", "X021", "X021", "X021")
  # rbind() recycles the constant positions across the ten rows; reading the
  # resulting matrix column by column yields the codes row after row.
  as.vector(rbind("SE05", h_codes, e_codes, "MANA", "J012",
                  x_codes, "W007", "MANA", "SE02", "H005"))
})

# One row per article; `device` stays a plain character column.
ACCEPT <- data.frame(art = art, device = device, stringsAsFactors = FALSE)
head(BLABLA)
Article device V3 V4
1 52290 SE05 20170223 162756
2 52300 SE05 20170223 162758
3 10090 SE05 20170223 162831
4 10060 SE08 20170223 162834
5 10070 SE08 20170223 162839
6 10070 SE08 20170223 162859
尝试 this site 以更好地理解正则表达式以及如何在您的案例中应用它。如果没有可重现的例子,就很难理解你的具体情况和你可能遇到的边缘情况。希望我下面的示例能帮助您入门:
编辑:更改我的答案以使用您的示例数据集
# Reproducible example data: article ids 1..100 and one device code each.
art <- seq_len(100L)

# Each block of ten codes follows the same template; only the second, third
# and sixth entry change from block to block.
device <- local({
  second <- c("H005", "H005", "H007", "H008", "H005",
              "H0010", "H005", "H0010", "H005", "H009")
  third <- c("E003", "E003", "E003", "E004", "E003",
             "E008", "E003", "E0010", "E003", "E003")
  sixth <- c("X021", "X01", "X02", "X021", "X017",
             "X021", "X009", "X021", "X021", "X021")
  # One matrix column per block of ten; as.vector() flattens column-wise,
  # so the blocks come out in their original order.
  blocks <- rbind("SE05", second, third, "MANA", "J012",
                  sixth, "W007", "MANA", "SE02", "H005")
  as.vector(blocks)
})

# Assemble the two-column data frame used by the code further down.
ACCEPT <- data.frame(art = art, stringsAsFactors = FALSE)
ACCEPT[["device"]] <- device
library(tidyverse)
library(magrittr)
library(stringr)
# Classify every device code by the shape of its prefix, then split it.
# Regex backslashes are doubled: inside an R string literal "\\" is what
# passes a single "\" on to the regex engine, whereas a bare "\D" or "\d"
# is rejected by the parser as an unrecognized escape.

# Find mobile devices
#   '^'      for start of string
#   '\\D{2}' for exactly two non-numeric characters
#   '\\d'    for a digit right after them
ACCEPT %<>% mutate(mobile = str_detect(device, pattern = "^\\D{2}\\d"))
# Now looking for exactly one letter at the start,
# followed by a number
ACCEPT %<>% mutate(immobile = str_detect(device, pattern = "^\\D\\d"))
# Finally, look for "no numbers"
# (alternatively, if all places have the same value, '== "MANA"' would do)
ACCEPT %<>% mutate(place = !str_detect(device, pattern = "\\d"))

# Split and process device types individually, then stack the pieces and
# restore the article order. Codes matching none of the three flags are
# removed by the filters.
#   v1 = first character (mobile codes only)
#   v2 = location character, or the whole code for places
#   v3 = first run of digits
bind_rows(
  ACCEPT %>%
    filter(mobile) %>%
    mutate(
      v1 = str_extract(device, pattern = "\\D"),
      v2 = str_sub(device, start = 2, end = 2),
      v3 = str_extract(device, pattern = "\\d{1,9}")
    ),
  ACCEPT %>%
    filter(immobile) %>%
    mutate(
      v1 = "",
      v2 = str_sub(device, start = 1, end = 1),
      v3 = str_extract(device, pattern = "\\d{1,9}")
    ),
  ACCEPT %>%
    filter(place) %>%
    mutate(
      v1 = "",
      v2 = device,
      v3 = ""
    )
) %>%
  arrange(art) %>%
  select(art, v1, v2, v3)
这是一个略短的版本,它不使用 dplyr。
# Base-R alternative: derive v2 (location), v3 (number) and v1 (device-type
# letter) from `device`. Fields that do not apply to a code stay NA.
# Regex backslashes are doubled: in an R string literal "\\" is what passes a
# single "\" on to the regex engine ("\D" does not parse, and "\1" is an
# octal character escape rather than a back-reference).
codes <- ACCEPT$device

# v2 gets all of 'device' so long as this is entirely alphabetical:
letters_only <- grepl("^[A-Z]+$", codes)
ACCEPT$v2 <- NA_character_
ACCEPT$v2[letters_only] <- codes[letters_only]

# v3 gets the number, if there is one: drop the leading non-digits.
ACCEPT$v3 <- NA_character_
ACCEPT$v3[!letters_only] <- sub("\\D+(\\d+)", "\\1", codes[!letters_only])

# A single letter before the number (e.g. "H005") is the location itself ...
one_letter <- !letters_only & grepl("^\\D\\d", codes)
# ... otherwise the first character is the device type and the second one
# is the location (e.g. "SE05").
two_letters <- !letters_only & !one_letter

# Create v1 as a full-length column first, so the masked assignment below
# cannot produce a vector shorter than the data frame.
ACCEPT$v1 <- NA_character_
ACCEPT$v1[two_letters] <- substr(codes[two_letters], 1, 1)
ACCEPT$v2[two_letters] <- substr(codes[two_letters], 2, 2)
ACCEPT$v2[one_letter] <- substr(codes[one_letter], 1, 1)