使用条件语句根据来自其他行的数据创建新的因子级别(新行)
Create a new factor level (new row) based on data from other rows with conditional statements
df <- data.frame(PatientID = c("0002" ,"0002", "0005", "0005" ,"0009" ,"0009" ,"0018", "0018" ,"0039" ,"0039" , "0043" ,"0043", "0046", "0046" ,"0048" ,"0048"),
Timepoint= c("A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B"),
A = c(NA , 977.146 , NA , 964.315 ,NA , 952.311 , NA , 950.797 , 947.465 , 902.852 , 985.124 ,NA , 930.141 ,1007.790 , 1027.110 , 999.414),
B = c(998.988 , NA , 998.680 , NA , 1020.560 , 955.540 , 911.606 , 964.039 , 988.087 , 902.367 , 959.338 ,1029.050 , 987.374 ,1066.400 ,957.512 , 917.597),
C = c( 987.140 , 961.810 , 929.466 , 978.166, 969.469 , 943.398 ,936.034, 965.292 , 996.404 , 920.610 , 967.047, 913.517 , 893.428 , 921.606 , 929.590 ,950.493),
D = c( 961.810 , 929.466 , 978.166, 1005.820 , 925.752 , 969.469 ,943.398 , 965.292 , 996.404 , 967.047 , NA , 893.428 , 921.606 , 976.192 , 929.590 , 950.493),
E = c(1006.330, 1028.070 , 954.274 ,1005.910 ,949.969 , 992.820 ,934.407 , 948.913 , 961.375 ,955.296 , 961.128 ,998.119 ,1009.110 , 994.891 ,1000.170 ,982.763),
G= c(NA , 958.990 , 924.680 , 955.927 , NA , 949.384 ,973.348 , 984.392 , 943.894 , 961.468 , 995.368 , 994.997 , 979.454 , 952.605 ,NA , 956.507), stringsAsFactors = F)
基于此数据框,我需要为变量 (df$TimePoint
) 创建一个额外的因子级别,该级别将根据以下条件进行填充 - 我们已经有因子 A
和 B
在那个变量中,所以假设我们要创建因子级别 X
:
对于df$A
。如果 df$Timepoint
B >999 则因子 X 将填充为 df$Timepoint level B 值相同的值,否则(如果≤999)则将填充为 df$timepoint A 处的值.
对于df$B
。如果df$Timepoint
B >986,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.
对于df$C
。如果 df$Timepoint
B >1000,因子 X 将 == 作为 df$Timepoint level B 值,否则 X == df$timepoint A.
对于df$D
。如果df$Timepoint
B >1030,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.
对于df$E
。如果 df$Timepoint
B >800,因子 X 将 == 作为 df$Timepoint level B 值,否则 X == df$timepoint A.
为df$G
。如果df$Timepoint
B >950,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.
新数据框如下所示:
提前致谢!
最佳
这是一种 tidyverse 方法。
library(tidyverse)
df %>%
pivot_wider(names_from = Timepoint,
values_from = A:G
) %>%
mutate(A_X = ifelse(A_B > 999, A_B, A_A),
B_X = ifelse(B_B > 986, B_B, B_A),
C_X = ifelse(C_B > 1000, C_B, C_A),
D_X = ifelse(D_B > 1030, D_B, D_A),
E_X = ifelse(E_B > 800, E_B, E_A),
G_X = ifelse(G_B > 950, G_B, G_A)) %>%
pivot_longer(cols = A_A:G_X,
names_sep = "_",
names_to = c("cat","Timepoint")) %>%
pivot_wider(names_from = cat,
values_from = value)
#> # A tibble: 24 x 8
#> PatientID Timepoint A B C D E G
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0002 A NA 999. 987. 962. 1006. NA
#> 2 0002 B 977. NA 962. 929. 1028. 959.
#> 3 0002 X NA NA 987. 962. 1028. 959.
#> 4 0005 A NA 999. 929. 978. 954. 925.
#> 5 0005 B 964. NA 978. 1006. 1006. 956.
#> 6 0005 X NA NA 929. 978. 1006. 956.
#> 7 0009 A NA 1021. 969. 926. 950. NA
#> 8 0009 B 952. 956. 943. 969. 993. 949.
#> 9 0009 X NA 1021. 969. 926. 993. NA
#> 10 0018 A NA 912. 936. 943. 934. 973.
#> 11 0018 B 951. 964. 965. 965. 949. 984.
#> 12 0018 X NA 912. 936. 943. 949. 984.
#> 13 0039 A 947. 988. 996. 996. 961. 944.
#> 14 0039 B 903. 902. 921. 967. 955. 961.
#> 15 0039 X 947. 988. 996. 996. 955. 961.
#> 16 0043 A 985. 959. 967. NA 961. 995.
#> 17 0043 B NA 1029. 914. 893. 998. 995.
#> 18 0043 X NA 1029. 967. NA 998. 995.
#> 19 0046 A 930. 987. 893. 922. 1009. 979.
#> 20 0046 B 1008. 1066. 922. 976. 995. 953.
#> 21 0046 X 1008. 1066. 893. 922. 995. 953.
#> 22 0048 A 1027. 958. 930. 930. 1000. NA
#> 23 0048 B 999. 918. 950. 950. 983. 957.
#> 24 0048 X 999. 958. 930. 930. 983. 957.
由 reprex package (v0.3.0)
于 2021-07-29 创建
这是使用 group_modify
和 add_row
的替代方法:
library(tidyverse)
df %>%
group_by(PatientID) %>%
group_modify(.f =
~ {
df_b <- .x[.x$Timepoint == "B",]
df_a <- .x[.x$Timepoint == "A",]
.x %>% add_row(Timepoint = "X",
A = ifelse(df_b$A > 999, df_b$A, df_a$A),
B = ifelse(df_b$B > 986, df_b$B, df_a$B),
C = ifelse(df_b$C > 1000, df_b$C, df_a$C),
D = ifelse(df_b$D > 1030, df_b$D, df_a$D),
E = ifelse(df_b$E > 800, df_b$E, df_a$E),
G = ifelse(df_b$G > 950, df_b$G, df_a$G)
)
})
#> # A tibble: 24 x 8
#> # Groups: PatientID [8]
#> PatientID Timepoint A B C D E G
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0002 A NA 999. 987. 962. 1006. NA
#> 2 0002 B 977. NA 962. 929. 1028. 959.
#> 3 0002 X NA NA 987. 962. 1028. 959.
#> 4 0005 A NA 999. 929. 978. 954. 925.
#> 5 0005 B 964. NA 978. 1006. 1006. 956.
#> 6 0005 X NA NA 929. 978. 1006. 956.
#> 7 0009 A NA 1021. 969. 926. 950. NA
#> 8 0009 B 952. 956. 943. 969. 993. 949.
#> 9 0009 X NA 1021. 969. 926. 993. NA
#> 10 0018 A NA 912. 936. 943. 934. 973.
#> # ... with 14 more rows
由 reprex package (v0.3.0)
于 2021-07-29 创建
使用dplyr
:
rbind(df, df %>%
mutate(X1 = ifelse(A > 999, A, lag(A))) %>%
mutate(X2 = ifelse(B > 986, B, lag(B))) %>%
mutate(X3 = ifelse(C > 1000, C, lag(C))) %>%
mutate(X4 = ifelse(D > 1030, D, lag(D))) %>%
mutate(X5 = ifelse(E > 800, E, lag(E))) %>%
mutate(X6 = ifelse(G > 950, G, lag(G))) %>%
filter(Timepoint == 'B') %>%
select(PatientID, Timepoint, X1:X6) %>%
rename(A = X1,
B = X2,
C = X3,
D = X4,
E = X5,
G = X6) %>%
mutate(Timepoint = 'X')) %>%
arrange(PatientID, Timepoint)
输出:
PatientID Timepoint A B C D E G
1 0002 A NA 998.988 987.140 961.810 1006.330 NA
2 0002 B 977.146 NA 961.810 929.466 1028.070 958.990
3 0002 X NA NA 987.140 961.810 1028.070 958.990
4 0005 A NA 998.680 929.466 978.166 954.274 924.680
5 0005 B 964.315 NA 978.166 1005.820 1005.910 955.927
6 0005 X NA NA 929.466 978.166 1005.910 955.927
7 0009 A NA 1020.560 969.469 925.752 949.969 NA
8 0009 B 952.311 955.540 943.398 969.469 992.820 949.384
9 0009 X NA 1020.560 969.469 925.752 992.820 NA
10 0018 A NA 911.606 936.034 943.398 934.407 973.348
11 0018 B 950.797 964.039 965.292 965.292 948.913 984.392
12 0018 X NA 911.606 936.034 943.398 948.913 984.392
13 0039 A 947.465 988.087 996.404 996.404 961.375 943.894
14 0039 B 902.852 902.367 920.610 967.047 955.296 961.468
15 0039 X 947.465 988.087 996.404 996.404 955.296 961.468
16 0043 A 985.124 959.338 967.047 NA 961.128 995.368
17 0043 B NA 1029.050 913.517 893.428 998.119 994.997
18 0043 X NA 1029.050 967.047 NA 998.119 994.997
19 0046 A 930.141 987.374 893.428 921.606 1009.110 979.454
20 0046 B 1007.790 1066.400 921.606 976.192 994.891 952.605
21 0046 X 1007.790 1066.400 893.428 921.606 994.891 952.605
22 0048 A 1027.110 957.512 929.590 929.590 1000.170 NA
23 0048 B 999.414 917.597 950.493 950.493 982.763 956.507
24 0048 X 999.414 957.512 929.590 929.590 982.763 956.507
df <- data.frame(PatientID = c("0002" ,"0002", "0005", "0005" ,"0009" ,"0009" ,"0018", "0018" ,"0039" ,"0039" , "0043" ,"0043", "0046", "0046" ,"0048" ,"0048"),
Timepoint= c("A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B"),
A = c(NA , 977.146 , NA , 964.315 ,NA , 952.311 , NA , 950.797 , 947.465 , 902.852 , 985.124 ,NA , 930.141 ,1007.790 , 1027.110 , 999.414),
B = c(998.988 , NA , 998.680 , NA , 1020.560 , 955.540 , 911.606 , 964.039 , 988.087 , 902.367 , 959.338 ,1029.050 , 987.374 ,1066.400 ,957.512 , 917.597),
C = c( 987.140 , 961.810 , 929.466 , 978.166, 969.469 , 943.398 ,936.034, 965.292 , 996.404 , 920.610 , 967.047, 913.517 , 893.428 , 921.606 , 929.590 ,950.493),
D = c( 961.810 , 929.466 , 978.166, 1005.820 , 925.752 , 969.469 ,943.398 , 965.292 , 996.404 , 967.047 , NA , 893.428 , 921.606 , 976.192 , 929.590 , 950.493),
E = c(1006.330, 1028.070 , 954.274 ,1005.910 ,949.969 , 992.820 ,934.407 , 948.913 , 961.375 ,955.296 , 961.128 ,998.119 ,1009.110 , 994.891 ,1000.170 ,982.763),
G= c(NA , 958.990 , 924.680 , 955.927 , NA , 949.384 ,973.348 , 984.392 , 943.894 , 961.468 , 995.368 , 994.997 , 979.454 , 952.605 ,NA , 956.507), stringsAsFactors = F)
基于此数据框,我需要为变量 (df$TimePoint
) 创建一个额外的因子级别,该级别将根据以下条件进行填充 - 我们已经有因子 A
和 B
在那个变量中,所以假设我们要创建因子级别 X
:
对于
df$A
。如果df$Timepoint
B >999 则因子 X 将填充为 df$Timepoint level B 值相同的值,否则(如果≤999)则将填充为 df$timepoint A 处的值.对于
df$B
。如果df$Timepoint
B >986,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.对于
df$C
。如果df$Timepoint
B >1000,因子 X 将 == 作为 df$Timepoint level B 值,否则 X == df$timepoint A.对于
df$D
。如果df$Timepoint
B >1030,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.对于
df$E
。如果df$Timepoint
B >800,因子 X 将 == 作为 df$Timepoint level B 值,否则 X == df$timepoint A.为
df$G
。如果df$Timepoint
B >950,因子X将==作为df$Timepoint水平B值,否则X == df$timepoint A.
新数据框如下所示:
提前致谢! 最佳
这是一种 tidyverse 方法。
library(tidyverse)
df %>%
pivot_wider(names_from = Timepoint,
values_from = A:G
) %>%
mutate(A_X = ifelse(A_B > 999, A_B, A_A),
B_X = ifelse(B_B > 986, B_B, B_A),
C_X = ifelse(C_B > 1000, C_B, C_A),
D_X = ifelse(D_B > 1030, D_B, D_A),
E_X = ifelse(E_B > 800, E_B, E_A),
G_X = ifelse(G_B > 950, G_B, G_A)) %>%
pivot_longer(cols = A_A:G_X,
names_sep = "_",
names_to = c("cat","Timepoint")) %>%
pivot_wider(names_from = cat,
values_from = value)
#> # A tibble: 24 x 8
#> PatientID Timepoint A B C D E G
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0002 A NA 999. 987. 962. 1006. NA
#> 2 0002 B 977. NA 962. 929. 1028. 959.
#> 3 0002 X NA NA 987. 962. 1028. 959.
#> 4 0005 A NA 999. 929. 978. 954. 925.
#> 5 0005 B 964. NA 978. 1006. 1006. 956.
#> 6 0005 X NA NA 929. 978. 1006. 956.
#> 7 0009 A NA 1021. 969. 926. 950. NA
#> 8 0009 B 952. 956. 943. 969. 993. 949.
#> 9 0009 X NA 1021. 969. 926. 993. NA
#> 10 0018 A NA 912. 936. 943. 934. 973.
#> 11 0018 B 951. 964. 965. 965. 949. 984.
#> 12 0018 X NA 912. 936. 943. 949. 984.
#> 13 0039 A 947. 988. 996. 996. 961. 944.
#> 14 0039 B 903. 902. 921. 967. 955. 961.
#> 15 0039 X 947. 988. 996. 996. 955. 961.
#> 16 0043 A 985. 959. 967. NA 961. 995.
#> 17 0043 B NA 1029. 914. 893. 998. 995.
#> 18 0043 X NA 1029. 967. NA 998. 995.
#> 19 0046 A 930. 987. 893. 922. 1009. 979.
#> 20 0046 B 1008. 1066. 922. 976. 995. 953.
#> 21 0046 X 1008. 1066. 893. 922. 995. 953.
#> 22 0048 A 1027. 958. 930. 930. 1000. NA
#> 23 0048 B 999. 918. 950. 950. 983. 957.
#> 24 0048 X 999. 958. 930. 930. 983. 957.
由 reprex package (v0.3.0)
于 2021-07-29 创建这是使用 group_modify
和 add_row
的替代方法:
library(tidyverse)
df %>%
group_by(PatientID) %>%
group_modify(.f =
~ {
df_b <- .x[.x$Timepoint == "B",]
df_a <- .x[.x$Timepoint == "A",]
.x %>% add_row(Timepoint = "X",
A = ifelse(df_b$A > 999, df_b$A, df_a$A),
B = ifelse(df_b$B > 986, df_b$B, df_a$B),
C = ifelse(df_b$C > 1000, df_b$C, df_a$C),
D = ifelse(df_b$D > 1030, df_b$D, df_a$D),
E = ifelse(df_b$E > 800, df_b$E, df_a$E),
G = ifelse(df_b$G > 950, df_b$G, df_a$G)
)
})
#> # A tibble: 24 x 8
#> # Groups: PatientID [8]
#> PatientID Timepoint A B C D E G
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0002 A NA 999. 987. 962. 1006. NA
#> 2 0002 B 977. NA 962. 929. 1028. 959.
#> 3 0002 X NA NA 987. 962. 1028. 959.
#> 4 0005 A NA 999. 929. 978. 954. 925.
#> 5 0005 B 964. NA 978. 1006. 1006. 956.
#> 6 0005 X NA NA 929. 978. 1006. 956.
#> 7 0009 A NA 1021. 969. 926. 950. NA
#> 8 0009 B 952. 956. 943. 969. 993. 949.
#> 9 0009 X NA 1021. 969. 926. 993. NA
#> 10 0018 A NA 912. 936. 943. 934. 973.
#> # ... with 14 more rows
由 reprex package (v0.3.0)
于 2021-07-29 创建使用dplyr
:
rbind(df, df %>%
mutate(X1 = ifelse(A > 999, A, lag(A))) %>%
mutate(X2 = ifelse(B > 986, B, lag(B))) %>%
mutate(X3 = ifelse(C > 1000, C, lag(C))) %>%
mutate(X4 = ifelse(D > 1030, D, lag(D))) %>%
mutate(X5 = ifelse(E > 800, E, lag(E))) %>%
mutate(X6 = ifelse(G > 950, G, lag(G))) %>%
filter(Timepoint == 'B') %>%
select(PatientID, Timepoint, X1:X6) %>%
rename(A = X1,
B = X2,
C = X3,
D = X4,
E = X5,
G = X6) %>%
mutate(Timepoint = 'X')) %>%
arrange(PatientID, Timepoint)
输出:
PatientID Timepoint A B C D E G
1 0002 A NA 998.988 987.140 961.810 1006.330 NA
2 0002 B 977.146 NA 961.810 929.466 1028.070 958.990
3 0002 X NA NA 987.140 961.810 1028.070 958.990
4 0005 A NA 998.680 929.466 978.166 954.274 924.680
5 0005 B 964.315 NA 978.166 1005.820 1005.910 955.927
6 0005 X NA NA 929.466 978.166 1005.910 955.927
7 0009 A NA 1020.560 969.469 925.752 949.969 NA
8 0009 B 952.311 955.540 943.398 969.469 992.820 949.384
9 0009 X NA 1020.560 969.469 925.752 992.820 NA
10 0018 A NA 911.606 936.034 943.398 934.407 973.348
11 0018 B 950.797 964.039 965.292 965.292 948.913 984.392
12 0018 X NA 911.606 936.034 943.398 948.913 984.392
13 0039 A 947.465 988.087 996.404 996.404 961.375 943.894
14 0039 B 902.852 902.367 920.610 967.047 955.296 961.468
15 0039 X 947.465 988.087 996.404 996.404 955.296 961.468
16 0043 A 985.124 959.338 967.047 NA 961.128 995.368
17 0043 B NA 1029.050 913.517 893.428 998.119 994.997
18 0043 X NA 1029.050 967.047 NA 998.119 994.997
19 0046 A 930.141 987.374 893.428 921.606 1009.110 979.454
20 0046 B 1007.790 1066.400 921.606 976.192 994.891 952.605
21 0046 X 1007.790 1066.400 893.428 921.606 994.891 952.605
22 0048 A 1027.110 957.512 929.590 929.590 1000.170 NA
23 0048 B 999.414 917.597 950.493 950.493 982.763 956.507
24 0048 X 999.414 957.512 929.590 929.590 982.763 956.507