在 R 中使用 grep (?) 删除文本文件中的行
Delete rows in text file with grep (?) in R
我有一个非常重复的文本文件,前几行看起来像这样:
Filename:
ROI: red_1 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.013282 0.133982 0.061581 0.034069
Band 2 0.009866 0.112935 0.042688 0.026618
Band 3 0.008304 0.037059 0.018434 0.007515
Band 4 0.004726 0.040089 0.018490 0.009605
Histogram DN Npts Total Percent Acc Pct
Band 1 0.013282 1 1 5.0000 5.0000
Bin=0.00047 0.013755 0 1 0.0000 5.0000
0.014228 0 1 0.0000 5.0000
.. 并持续一段时间,直到达到另一个 ROI 值,如下所示:
Stats for ROI: red_5 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.030513 0.180980 0.090056 0.044456
Band 2 0.022289 0.157861 0.046419 0.030555
Band 3 0.012533 0.046693 0.027343 0.008947
Band 4 0.003332 0.041555 0.016888 0.007770
Histogram DN Npts Total Percent Acc Pct
Band 1 0.030513 1 1 5.0000 5.0000
Bin=0.00059 0.031103 0 1 0.0000 5.0000
0.031693 0 1 0.0000 5.0000
0.032283 0 1 0.0000 5.0000
我只想要一个数据框,其中每个 ROI 只保留 `ROI: ...`、`Basic Stats`、`Band 1`、`Band 2` …… 直到 `Band 4` 这几行。所有 100 个 ROI 的最终输出看起来像这样:
ROI: red_1 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.013282 0.133982 0.061581 0.034069
Band 2 0.009866 0.112935 0.042688 0.026618
Band 3 0.008304 0.037059 0.018434 0.007515
Band 4 0.004726 0.040089 0.018490 0.009605
Stats for ROI: red_5 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.030513 0.180980 0.090056 0.044456
Band 2 0.022289 0.157861 0.046419 0.030555
Band 3 0.012533 0.046693 0.027343 0.008947
Band 4 0.003332 0.041555 0.016888 0.007770
有人可以给我推荐一些关于在 R 中清理数据的教程吗?使用 `rep(grepl())` 是正确的思路吗?数据的完整文本文件可在此处获得:LINK.
如评论中所述,为每个 ROI 值设置换行符不符合 R 数据帧的形式。我认为我们可以通过添加一个列来跟踪 ROI 来实现您正在寻找的东西。最终产品如下所示:
ROI Band min max mean stdev
red_1 [Red] 20 points Band 1 0.013282 0.133982 0.061581 0.034069
red_1 [Red] 20 points Band 2 0.009866 0.112935 0.042688 0.026618
red_1 [Red] 20 points Band 3 0.008304 0.037059 0.018434 0.007515
red_1 [Red] 20 points Band 4 0.004726 0.040089 0.018490 0.009605
red_2 [Red] 12 points Band 1 0.032262 0.124425 0.078073 0.028031
red_2 [Red] 12 points Band 2 0.021072 0.064156 0.037923 0.012178
red_2 [Red] 12 points Band 3 0.013404 0.066043 0.036316 0.014787
red_2 [Red] 12 points Band 4 0.005162 0.055781 0.015526 0.013255
red_3 [Red] 12 points Band 1 0.037488 0.107830 0.057892 0.018964
red_3 [Red] 12 points Band 2 0.028140 0.072370 0.045340 0.014507
由于您的数据不大,我们可以通过将数据作为字符向量读取并遍历各行来实现:
# Build a long data frame with one row per band of every ROI's "Basic Stats"
# table, tagging each row with the ROI description it belongs to.
#
# readLines() on a path opens and closes its own connection, so no connection
# object is left dangling afterwards.
dat <- readLines("test2.txt")

roi <- NA_character_          # ROI description of the section being read
in_stats <- FALSE             # TRUE while inside a "Basic Stats" table
rows <- vector("list", length(dat))

for (i in seq_along(dat)) {
  line <- dat[i]

  if (grepl("ROI: ", line, fixed = TRUE)) {
    # Keep everything after the 5-character marker "ROI: ".
    roi <- substr(line, regexpr("ROI: ", line, fixed = TRUE)[1] + 5, nchar(line))
    in_stats <- FALSE
  } else if (grepl("Basic Stats", line, fixed = TRUE)) {
    in_stats <- TRUE
  } else if (grepl("Histogram", line, fixed = TRUE)) {
    # Histogram tables also contain "Band n" rows; those must be skipped.
    in_stats <- FALSE
  } else if (in_stats && grepl("^\\s*Band", line)) {
    splitLine <- strsplit(trimws(line), "\t", fixed = TRUE)[[1]]
    rows[[i]] <- data.frame(
      "ROI" = roi,
      "Band" = splitLine[1],
      "min" = splitLine[2],
      "max" = splitLine[3],
      "mean" = splitLine[4],
      "stdev" = splitLine[5],
      stringsAsFactors = FALSE
    )
  }
}

# Bind once at the end instead of growing the data frame inside the loop.
# `out` is NULL when no band row was found.
rows <- rows[!vapply(rows, is.null, logical(1))]
out <- do.call(rbind, rows)
这样得到的数据框 `out` 应该非常接近您想要的结果。
下面是另一种方法:先将文件作为文本读入,找出 `ROI` 行和 `Basic Stats` 行的位置,然后生成一个包含所有数据的长格式数据框。
# Read the export as text, locate each ROI header and each "Basic Stats"
# header, and parse the five lines of every table (header + 4 bands) with
# read.delim(). The result is one long data frame with an `roi` column.
input <- readLines("https://dl.dropboxusercontent.com/u/45095175/test2.txt")

# Match the same "ROI: " marker that sub() strips below, so a line that merely
# mentions "ROI" cannot shift the names out of step with the tables.
roi_lines <- grep("ROI: ", input, fixed = TRUE)
basic_stat_lines <- grep("Basic Stats", input, fixed = TRUE)

# Names and tables are paired by position; refuse to pair them if the counts
# differ instead of silently mislabelling rows.
stopifnot(length(roi_lines) == length(basic_stat_lines))

roi_names <- sub("^.*ROI: ", "", input[roi_lines])

roi_data <- lapply(seq_along(basic_stat_lines), function(i) {
  stats <- read.delim(
    text = input[basic_stat_lines[i] + 0:4],
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
  data.frame(roi = roi_names[i], stats, stringsAsFactors = FALSE)
})

roi_data_all <- do.call("rbind", roi_data)
`myfun` 以文本文件作为输入,返回一个数据框列表。`what` 参数用于指定要从文本文件中提取直方图数据(`Histogram`)还是基本统计信息(`Basic Stats`)。
#' Extract per-ROI tables from an ROI statistics text export
#'
#' Reads `file` line by line, finds every ROI header (a line containing
#' "ROI:") and, for each ROI, parses the first tab-separated table of the
#' requested kind that follows the header.
#'
#' @param file Path or connection handed to `readLines()`.
#' @param what Either "Basic Stats" or "Histogram"; any other value raises
#'   the error "what value is not supported".
#' @param n_bands Number of band rows below each "Basic Stats" header
#'   (default 4). Ignored for "Histogram".
#' @return A list of data frames named "1", "2", ... in ROI order. Each data
#'   frame carries the parsed table plus the columns `ROI`, `color` and
#'   `points` taken from the ROI header; histogram tables additionally get
#'   `Band` and `Bin` columns.
myfun <- function(file, what, n_bands = 4L) {
  x <- readLines(file)
  g1 <- grep("ROI:", x, fixed = TRUE)

  if (length(what) != 1L || !what %in% c("Basic Stats", "Histogram")) {
    stop('what value is not supported')
  }
  g2 <- grep(what, x, fixed = TRUE)

  # Split tab-separated lines into a character data frame whose column names
  # come from the first line. Blank lines contribute no row.
  parse_rows <- function(lines) {
    fields <- strsplit(lines, "\t", fixed = TRUE)
    fields <- fields[lengths(fields) > 0L]
    tab <- data.frame(do.call(rbind, fields), stringsAsFactors = FALSE)
    colnames(tab) <- unlist(tab[1L, ], use.names = FALSE)
    tab
  }

  to_number <- function(v) as.numeric(as.character(v))

  df_list <- list()
  for (counter in seq_along(g1)) {
    low <- g1[counter]
    # A section ends just before the next ROI header. The last one runs to the
    # end of the file, so its exclusive upper bound is length(x) + 1.
    high <- if (counter < length(g1)) g1[counter + 1L] else length(x) + 1L

    section <- g2[g2 > low & g2 < high]
    if (length(section) == 0L) {
      stop("no '", what, "' table found after the ROI header on line ", low,
           call. = FALSE)
    }
    min_ind <- section[1L]

    # Header looks like "[Stats for ]ROI: <name> [<color>] <n> points".
    # Dropping everything up to "ROI:" puts the three fields in the same
    # position whatever precedes the marker.
    header <- sub("^.*ROI:\\s*", "", x[low])
    title <- gsub("\\[|\\]", "", strsplit(header, "\\s+")[[1L]][1:3])

    if (what == "Basic Stats") {
      last <- min(min_ind + n_bands, high - 1L)
      x1 <- parse_rows(x[min_ind:last])
      x1 <- x1[-1L, , drop = FALSE]            # drop the header row
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      colnames(x1) <- gsub(" ", "", colnames(x1), fixed = TRUE)
      # Min, Max, Mean, Stdev
      x1[, 2:5] <- lapply(x1[, 2:5], to_number)
    } else {
      x1 <- parse_rows(x[min_ind:(high - 1L)])
      colnames(x1)[1L] <- "Histogram"

      # Every band starts with its own "Histogram" header row; the distance
      # between consecutive headers is the number of rows of that band.
      hist_rows <- grep("Histogram", x1$Histogram)
      run_len <- diff(c(hist_rows, nrow(x1) + 1L))
      x1$Band <- rep(
        gsub("[Band ]", "", grep("Band", x1$Histogram, value = TRUE)),
        run_len
      )
      x1$Bin <- rep(
        gsub("[Bin= ]", "", grep("Bin", x1$Histogram, value = TRUE)),
        run_len
      )

      x1 <- x1[!grepl("Histogram", x1$Histogram), ]
      x1$Histogram <- NULL
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      colnames(x1) <- gsub(" ", "", colnames(x1), fixed = TRUE)
      # DN, Npts, Total, Percent, AccPct, Band, Bin and points
      x1[, c(1:7, 10)] <- lapply(x1[, c(1:7, 10)], to_number)
    }

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}
1.提取基本统计数据:
# Parse the "Basic Stats" table of every ROI; myfun() returns a list with one
# data frame per ROI.
df_list <- myfun(file = "test2.txt", what = 'Basic Stats')
# Print the first three ROIs; the console output is reproduced as comments.
df_list[[1]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.013282 0.133982 0.061581 0.034069 red_1 Red 20
# 3 Band 2 0.009866 0.112935 0.042688 0.026618 red_1 Red 20
# 4 Band 3 0.008304 0.037059 0.018434 0.007515 red_1 Red 20
# 5 Band 4 0.004726 0.040089 0.018490 0.009605 red_1 Red 20
df_list[[2]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.032262 0.124425 0.078073 0.028031 red_2 Red 12
# 3 Band 2 0.021072 0.064156 0.037923 0.012178 red_2 Red 12
# 4 Band 3 0.013404 0.066043 0.036316 0.014787 red_2 Red 12
# 5 Band 4 0.005162 0.055781 0.015526 0.013255 red_2 Red 12
df_list[[3]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.037488 0.107830 0.057892 0.018964 red_3 Red 12
# 3 Band 2 0.028140 0.072370 0.045340 0.014507 red_3 Red 12
# 4 Band 3 0.014960 0.112973 0.032751 0.026575 red_3 Red 12
# 5 Band 4 0.006566 0.029133 0.018201 0.006897 red_3 Red 12
2. 提取直方图数据:
# Parse the "Histogram" table of every ROI; this overwrites the previous
# contents of df_list.
df_list <- myfun(file = "test2.txt", what = 'Histogram')
使用 `head` 查看第一个 ROI 的前几行:
# Show the first rows of the first ROI's table; the console output is
# reproduced as comments.
head(df_list[[1]])
# DN Npts Total Percent AccPct Band Bin ROI color points
# 2 0.013282 1 1 5 5 1 0.00047 red_1 Red 20
# 3 0.013755 0 1 0 5 1 0.00047 red_1 Red 20
# 4 0.014228 0 1 0 5 1 0.00047 red_1 Red 20
# 5 0.014702 0 1 0 5 1 0.00047 red_1 Red 20
# 6 0.015175 0 1 0 5 1 0.00047 red_1 Red 20
# 7 0.015648 0 1 0 5 1 0.00047 red_1 Red 20
汇总统计:
library('data.table')

# Per-band summary of the DN values of the first ROI.
df1 <- df_list[[1]]
setDT(df1)  # convert to a data.table by reference
df1[
  ,
  list(
    Min = min(DN),
    Max = max(DN),
    Mean = mean(DN),
    Stdev = sd(DN)
  ),
  by = "Band"
]
# Band Min Max Mean Stdev
# 1: 1 0.013282 0.133982 0.07363182 0.035048124
# 2: 2 0.009866 0.112935 0.06140034 0.029928470
# 3: 3 0.008304 0.037059 0.02268180 0.008349628
# 4: 4 0.004726 0.040089 0.02240761 0.010268456

# Same summary for the second ROI.
df2 <- df_list[[2]]
setDT(df2)
df2[
  ,
  list(
    Min = min(DN),
    Max = max(DN),
    Mean = mean(DN),
    Stdev = sd(DN)
  ),
  by = "Band"
]
# Band Min Max Mean Stdev
# 1: 1 0.032262 0.124425 0.07834352 0.02676153
# 2: 2 0.021072 0.064156 0.04261389 0.01251049
# 3: 3 0.013404 0.066043 0.03972310 0.01528497
# 4: 4 0.005162 0.055781 0.03047151 0.01469855
我有一个非常重复的文本文件,前几行看起来像这样:
Filename:
ROI: red_1 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.013282 0.133982 0.061581 0.034069
Band 2 0.009866 0.112935 0.042688 0.026618
Band 3 0.008304 0.037059 0.018434 0.007515
Band 4 0.004726 0.040089 0.018490 0.009605
Histogram DN Npts Total Percent Acc Pct
Band 1 0.013282 1 1 5.0000 5.0000
Bin=0.00047 0.013755 0 1 0.0000 5.0000
0.014228 0 1 0.0000 5.0000
.. 并持续一段时间,直到达到另一个 ROI 值,如下所示:
Stats for ROI: red_5 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.030513 0.180980 0.090056 0.044456
Band 2 0.022289 0.157861 0.046419 0.030555
Band 3 0.012533 0.046693 0.027343 0.008947
Band 4 0.003332 0.041555 0.016888 0.007770
Histogram DN Npts Total Percent Acc Pct
Band 1 0.030513 1 1 5.0000 5.0000
Bin=0.00059 0.031103 0 1 0.0000 5.0000
0.031693 0 1 0.0000 5.0000
0.032283 0 1 0.0000 5.0000
我只想要一个数据框,其中每个 ROI 只保留 `ROI: ...`、`Basic Stats`、`Band 1`、`Band 2` …… 直到 `Band 4` 这几行。所有 100 个 ROI 的最终输出看起来像这样:
ROI: red_1 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.013282 0.133982 0.061581 0.034069
Band 2 0.009866 0.112935 0.042688 0.026618
Band 3 0.008304 0.037059 0.018434 0.007515
Band 4 0.004726 0.040089 0.018490 0.009605
Stats for ROI: red_5 [Red] 20 points
Basic Stats Min Max Mean Stdev
Band 1 0.030513 0.180980 0.090056 0.044456
Band 2 0.022289 0.157861 0.046419 0.030555
Band 3 0.012533 0.046693 0.027343 0.008947
Band 4 0.003332 0.041555 0.016888 0.007770
有人可以给我推荐一些关于在 R 中清理数据的教程吗?使用 `rep(grepl())` 是正确的思路吗?数据的完整文本文件可在此处获得:LINK.
如评论中所述,为每个 ROI 值设置换行符不符合 R 数据帧的形式。我认为我们可以通过添加一个列来跟踪 ROI 来实现您正在寻找的东西。最终产品如下所示:
ROI Band min max mean stdev
red_1 [Red] 20 points Band 1 0.013282 0.133982 0.061581 0.034069
red_1 [Red] 20 points Band 2 0.009866 0.112935 0.042688 0.026618
red_1 [Red] 20 points Band 3 0.008304 0.037059 0.018434 0.007515
red_1 [Red] 20 points Band 4 0.004726 0.040089 0.018490 0.009605
red_2 [Red] 12 points Band 1 0.032262 0.124425 0.078073 0.028031
red_2 [Red] 12 points Band 2 0.021072 0.064156 0.037923 0.012178
red_2 [Red] 12 points Band 3 0.013404 0.066043 0.036316 0.014787
red_2 [Red] 12 points Band 4 0.005162 0.055781 0.015526 0.013255
red_3 [Red] 12 points Band 1 0.037488 0.107830 0.057892 0.018964
red_3 [Red] 12 points Band 2 0.028140 0.072370 0.045340 0.014507
由于您的数据不大,我们可以通过将数据作为字符向量读取并遍历各行来实现:
# Build a long data frame with one row per band of every ROI's "Basic Stats"
# table, tagging each row with the ROI description it belongs to.
#
# readLines() on a path opens and closes its own connection, so no connection
# object is left dangling afterwards.
dat <- readLines("test2.txt")

roi <- NA_character_          # ROI description of the section being read
in_stats <- FALSE             # TRUE while inside a "Basic Stats" table
rows <- vector("list", length(dat))

for (i in seq_along(dat)) {
  line <- dat[i]

  if (grepl("ROI: ", line, fixed = TRUE)) {
    # Keep everything after the 5-character marker "ROI: ".
    roi <- substr(line, regexpr("ROI: ", line, fixed = TRUE)[1] + 5, nchar(line))
    in_stats <- FALSE
  } else if (grepl("Basic Stats", line, fixed = TRUE)) {
    in_stats <- TRUE
  } else if (grepl("Histogram", line, fixed = TRUE)) {
    # Histogram tables also contain "Band n" rows; those must be skipped.
    in_stats <- FALSE
  } else if (in_stats && grepl("^\\s*Band", line)) {
    splitLine <- strsplit(trimws(line), "\t", fixed = TRUE)[[1]]
    rows[[i]] <- data.frame(
      "ROI" = roi,
      "Band" = splitLine[1],
      "min" = splitLine[2],
      "max" = splitLine[3],
      "mean" = splitLine[4],
      "stdev" = splitLine[5],
      stringsAsFactors = FALSE
    )
  }
}

# Bind once at the end instead of growing the data frame inside the loop.
# `out` is NULL when no band row was found.
rows <- rows[!vapply(rows, is.null, logical(1))]
out <- do.call(rbind, rows)
这样得到的数据框 `out` 应该非常接近您想要的结果。
下面是另一种方法:先将文件作为文本读入,找出 `ROI` 行和 `Basic Stats` 行的位置,然后生成一个包含所有数据的长格式数据框。
# Read the export as text, locate each ROI header and each "Basic Stats"
# header, and parse the five lines of every table (header + 4 bands) with
# read.delim(). The result is one long data frame with an `roi` column.
input <- readLines("https://dl.dropboxusercontent.com/u/45095175/test2.txt")

# Match the same "ROI: " marker that sub() strips below, so a line that merely
# mentions "ROI" cannot shift the names out of step with the tables.
roi_lines <- grep("ROI: ", input, fixed = TRUE)
basic_stat_lines <- grep("Basic Stats", input, fixed = TRUE)

# Names and tables are paired by position; refuse to pair them if the counts
# differ instead of silently mislabelling rows.
stopifnot(length(roi_lines) == length(basic_stat_lines))

roi_names <- sub("^.*ROI: ", "", input[roi_lines])

roi_data <- lapply(seq_along(basic_stat_lines), function(i) {
  stats <- read.delim(
    text = input[basic_stat_lines[i] + 0:4],
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
  data.frame(roi = roi_names[i], stats, stringsAsFactors = FALSE)
})

roi_data_all <- do.call("rbind", roi_data)
`myfun` 以文本文件作为输入,返回一个数据框列表。`what` 参数用于指定要从文本文件中提取直方图数据(`Histogram`)还是基本统计信息(`Basic Stats`)。
#' Extract per-ROI tables from an ROI statistics text export
#'
#' Reads `file` line by line, finds every ROI header (a line containing
#' "ROI:") and, for each ROI, parses the first tab-separated table of the
#' requested kind that follows the header.
#'
#' @param file Path or connection handed to `readLines()`.
#' @param what Either "Basic Stats" or "Histogram"; any other value raises
#'   the error "what value is not supported".
#' @param n_bands Number of band rows below each "Basic Stats" header
#'   (default 4). Ignored for "Histogram".
#' @return A list of data frames named "1", "2", ... in ROI order. Each data
#'   frame carries the parsed table plus the columns `ROI`, `color` and
#'   `points` taken from the ROI header; histogram tables additionally get
#'   `Band` and `Bin` columns.
myfun <- function(file, what, n_bands = 4L) {
  x <- readLines(file)
  g1 <- grep("ROI:", x, fixed = TRUE)

  if (length(what) != 1L || !what %in% c("Basic Stats", "Histogram")) {
    stop('what value is not supported')
  }
  g2 <- grep(what, x, fixed = TRUE)

  # Split tab-separated lines into a character data frame whose column names
  # come from the first line. Blank lines contribute no row.
  parse_rows <- function(lines) {
    fields <- strsplit(lines, "\t", fixed = TRUE)
    fields <- fields[lengths(fields) > 0L]
    tab <- data.frame(do.call(rbind, fields), stringsAsFactors = FALSE)
    colnames(tab) <- unlist(tab[1L, ], use.names = FALSE)
    tab
  }

  to_number <- function(v) as.numeric(as.character(v))

  df_list <- list()
  for (counter in seq_along(g1)) {
    low <- g1[counter]
    # A section ends just before the next ROI header. The last one runs to the
    # end of the file, so its exclusive upper bound is length(x) + 1.
    high <- if (counter < length(g1)) g1[counter + 1L] else length(x) + 1L

    section <- g2[g2 > low & g2 < high]
    if (length(section) == 0L) {
      stop("no '", what, "' table found after the ROI header on line ", low,
           call. = FALSE)
    }
    min_ind <- section[1L]

    # Header looks like "[Stats for ]ROI: <name> [<color>] <n> points".
    # Dropping everything up to "ROI:" puts the three fields in the same
    # position whatever precedes the marker.
    header <- sub("^.*ROI:\\s*", "", x[low])
    title <- gsub("\\[|\\]", "", strsplit(header, "\\s+")[[1L]][1:3])

    if (what == "Basic Stats") {
      last <- min(min_ind + n_bands, high - 1L)
      x1 <- parse_rows(x[min_ind:last])
      x1 <- x1[-1L, , drop = FALSE]            # drop the header row
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      colnames(x1) <- gsub(" ", "", colnames(x1), fixed = TRUE)
      # Min, Max, Mean, Stdev
      x1[, 2:5] <- lapply(x1[, 2:5], to_number)
    } else {
      x1 <- parse_rows(x[min_ind:(high - 1L)])
      colnames(x1)[1L] <- "Histogram"

      # Every band starts with its own "Histogram" header row; the distance
      # between consecutive headers is the number of rows of that band.
      hist_rows <- grep("Histogram", x1$Histogram)
      run_len <- diff(c(hist_rows, nrow(x1) + 1L))
      x1$Band <- rep(
        gsub("[Band ]", "", grep("Band", x1$Histogram, value = TRUE)),
        run_len
      )
      x1$Bin <- rep(
        gsub("[Bin= ]", "", grep("Bin", x1$Histogram, value = TRUE)),
        run_len
      )

      x1 <- x1[!grepl("Histogram", x1$Histogram), ]
      x1$Histogram <- NULL
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      colnames(x1) <- gsub(" ", "", colnames(x1), fixed = TRUE)
      # DN, Npts, Total, Percent, AccPct, Band, Bin and points
      x1[, c(1:7, 10)] <- lapply(x1[, c(1:7, 10)], to_number)
    }

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}
1.提取基本统计数据:
# Parse the "Basic Stats" table of every ROI; myfun() returns a list with one
# data frame per ROI.
df_list <- myfun(file = "test2.txt", what = 'Basic Stats')
# Print the first three ROIs; the console output is reproduced as comments.
df_list[[1]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.013282 0.133982 0.061581 0.034069 red_1 Red 20
# 3 Band 2 0.009866 0.112935 0.042688 0.026618 red_1 Red 20
# 4 Band 3 0.008304 0.037059 0.018434 0.007515 red_1 Red 20
# 5 Band 4 0.004726 0.040089 0.018490 0.009605 red_1 Red 20
df_list[[2]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.032262 0.124425 0.078073 0.028031 red_2 Red 12
# 3 Band 2 0.021072 0.064156 0.037923 0.012178 red_2 Red 12
# 4 Band 3 0.013404 0.066043 0.036316 0.014787 red_2 Red 12
# 5 Band 4 0.005162 0.055781 0.015526 0.013255 red_2 Red 12
df_list[[3]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.037488 0.107830 0.057892 0.018964 red_3 Red 12
# 3 Band 2 0.028140 0.072370 0.045340 0.014507 red_3 Red 12
# 4 Band 3 0.014960 0.112973 0.032751 0.026575 red_3 Red 12
# 5 Band 4 0.006566 0.029133 0.018201 0.006897 red_3 Red 12
2. 提取直方图数据:
# Parse the "Histogram" table of every ROI; this overwrites the previous
# contents of df_list.
df_list <- myfun(file = "test2.txt", what = 'Histogram')
使用 `head` 查看第一个 ROI 的前几行:
# Show the first rows of the first ROI's table; the console output is
# reproduced as comments.
head(df_list[[1]])
# DN Npts Total Percent AccPct Band Bin ROI color points
# 2 0.013282 1 1 5 5 1 0.00047 red_1 Red 20
# 3 0.013755 0 1 0 5 1 0.00047 red_1 Red 20
# 4 0.014228 0 1 0 5 1 0.00047 red_1 Red 20
# 5 0.014702 0 1 0 5 1 0.00047 red_1 Red 20
# 6 0.015175 0 1 0 5 1 0.00047 red_1 Red 20
# 7 0.015648 0 1 0 5 1 0.00047 red_1 Red 20
汇总统计:
library('data.table')

# Per-band summary of the DN values of the first ROI.
df1 <- df_list[[1]]
setDT(df1)  # convert to a data.table by reference
df1[
  ,
  list(
    Min = min(DN),
    Max = max(DN),
    Mean = mean(DN),
    Stdev = sd(DN)
  ),
  by = "Band"
]
# Band Min Max Mean Stdev
# 1: 1 0.013282 0.133982 0.07363182 0.035048124
# 2: 2 0.009866 0.112935 0.06140034 0.029928470
# 3: 3 0.008304 0.037059 0.02268180 0.008349628
# 4: 4 0.004726 0.040089 0.02240761 0.010268456

# Same summary for the second ROI.
df2 <- df_list[[2]]
setDT(df2)
df2[
  ,
  list(
    Min = min(DN),
    Max = max(DN),
    Mean = mean(DN),
    Stdev = sd(DN)
  ),
  by = "Band"
]
# Band Min Max Mean Stdev
# 1: 1 0.032262 0.124425 0.07834352 0.02676153
# 2: 2 0.021072 0.064156 0.04261389 0.01251049
# 3: 3 0.013404 0.066043 0.03972310 0.01528497
# 4: 4 0.005162 0.055781 0.03047151 0.01469855