在 R 中使用 grep (?) 删除文本文件中的行

Question

我有一个非常重复的文本文件，前几行看起来像这样：

Filename: 
ROI: red_1 [Red] 20 points

Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.013282    0.133982    0.061581    0.034069
     Band 2 0.009866    0.112935    0.042688    0.026618
     Band 3 0.008304    0.037059    0.018434    0.007515
     Band 4 0.004726    0.040089    0.018490    0.009605

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.013282          1       1   5.0000      5.0000
Bin=0.00047 0.013755          0       1   0.0000      5.0000
            0.014228          0       1   0.0000      5.0000

……如此持续若干行，直到出现下一个 ROI，如下所示：

Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.030513          1       1   5.0000      5.0000
Bin=0.00059 0.031103          0       1   0.0000      5.0000
            0.031693          0       1   0.0000      5.0000
            0.032283          0       1   0.0000      5.0000

我只想要一个数据框，每个 ROI 仅包含 ROI: ...、Basic Stats、Band 1、Band 2.. 到 Band 4 行。所有 100 个 ROI 的最终输出看起来像这样。

ROI: red_1 [Red] 20 points

Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.013282    0.133982    0.061581    0.034069
     Band 2 0.009866    0.112935    0.042688    0.026618
     Band 3 0.008304    0.037059    0.018434    0.007515
     Band 4 0.004726    0.040089    0.018490    0.009605

Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770

有人可以给我推荐一些关于在 R 中清理数据的教程吗？使用 rep(grepl()) 是正确的思路吗？完整的数据文件可在此处获得：LINK。

Answer 1

如评论中所述，在各个 ROI 之间插入空行不符合 R 数据框的结构。我认为可以通过添加一列来记录 ROI，从而得到您想要的结果。最终结果如下所示：

ROI                     Band    min         max         mean        stdev
red_1 [Red] 20 points   Band 1  0.013282    0.133982    0.061581    0.034069
red_1 [Red] 20 points   Band 2  0.009866    0.112935    0.042688    0.026618
red_1 [Red] 20 points   Band 3  0.008304    0.037059    0.018434    0.007515
red_1 [Red] 20 points   Band 4  0.004726    0.040089    0.018490    0.009605
red_2 [Red] 12 points   Band 1  0.032262    0.124425    0.078073    0.028031
red_2 [Red] 12 points   Band 2  0.021072    0.064156    0.037923    0.012178
red_2 [Red] 12 points   Band 3  0.013404    0.066043    0.036316    0.014787
red_2 [Red] 12 points   Band 4  0.005162    0.055781    0.015526    0.013255
red_3 [Red] 12 points   Band 1  0.037488    0.107830    0.057892    0.018964
red_3 [Red] 12 points   Band 2  0.028140    0.072370    0.045340    0.014507

由于您的数据不大，我们可以通过将数据作为字符向量读取并遍历各行来实现：

# Read the report as a character vector, one element per line. Passing the
# path straight to readLines() lets it open and close the file itself, so no
# connection object is left open afterwards.
dat <- readLines("test2.txt")

rows <- list()        # one single-row data frame per "Band" line
roi <- NA_character_  # label of the ROI section currently being scanned

for (line in dat) {
  # An ROI header changes the label applied to the Band rows that follow it.
  # The label is everything after the first "ROI: " on the line.
  if (grepl("ROI: ", line, fixed = TRUE)) {
    roi <- sub("^.*?ROI: ", "", line, perl = TRUE)
  }

  # Basic-stats rows are indented by five spaces and start with "Band";
  # histogram rows start at column one, so they are not picked up here.
  if (startsWith(line, "     Band")) {
    fields <- strsplit(trimws(line), "\t", fixed = TRUE)[[1]]
    rows[[length(rows) + 1L]] <- data.frame(
      ROI = roi,
      Band = fields[1],
      min = fields[2],
      max = fields[3],
      mean = fields[4],
      stdev = fields[5],
      stringsAsFactors = FALSE
    )
  }
}

# Bind once at the end instead of growing the data frame inside the loop.
# `out` stays NULL when the file contains no Band rows.
out <- do.call(rbind, rows)

数据框 out 应该非常接近您要查找的内容。

Answer 2

这种方法先将文件作为文本读入，找出 ROI 行和 Basic Stats 行所在的位置，然后生成一个包含所有数据的长格式数据框。

# Pull the report in as plain text, one element per line.
input <- readLines("https://dl.dropboxusercontent.com/u/45095175/test2.txt")

# Locate the ROI header lines and the header row of every "Basic Stats" table.
# The same "ROI: " marker is used both to find the headers and to strip the
# prefix, so the two steps cannot disagree about what counts as a header.
roi_lines <- grep("ROI: ", input, fixed = TRUE)
basic_stat_lines <- grep("Basic Stats", input, fixed = TRUE)
roi_names <- sub("^.*ROI: ", "", input[roi_lines])

# Attach each table to the closest ROI header above it rather than pairing by
# position, so an ROI without a "Basic Stats" table cannot shift the labels.
# A table that appears before any ROI header gets an NA label.
owner <- findInterval(basic_stat_lines, roi_lines)
owner[owner == 0] <- NA
table_roi <- roi_names[owner]

# Each table is its header row plus the four Band rows that follow it.
# seq_along() keeps the loop empty when no table was found.
roi_data <- lapply(seq_along(basic_stat_lines), function(i) {
  block <- input[basic_stat_lines[i] + 0:4]
  stats <- read.delim(text = block, stringsAsFactors = FALSE,
                      check.names = FALSE)
  data.frame(roi = table_roi[i], stats, stringsAsFactors = FALSE)
})

# Stack the per-ROI tables into one long data frame.
roi_data_all <- do.call("rbind", roi_data)

Answer 3

myfun 以文本文件作为输入，返回一个数据框列表。what 参数指定要从文本文件中提取直方图数据（'Histogram'）还是基本统计信息（'Basic Stats'）。

#' Extract per-ROI tables from a tab-delimited statistics report.
#'
#' The report is scanned for lines containing "ROI:". Each such line opens a
#' section that runs up to the next "ROI:" line, or to the end of the file for
#' the last one. From every section either the "Basic Stats" table or the
#' "Histogram" tables are parsed into a data frame.
#'
#' @param file Path of the report, or anything else `readLines()` accepts.
#' @param what Either "Basic Stats" or "Histogram".
#' @return A list of data frames named "1", "2", ... in order of ROI
#'   appearance. Every data frame ends with the columns `ROI`, `color` and
#'   `points`, taken from the first three tokens after "ROI:" on the section
#'   header (square brackets removed). For "Basic Stats" the statistic
#'   columns 2 to 5 are numeric; for "Histogram" the columns `Band` and `Bin`
#'   are added and columns 1 to 7 and 10 are numeric.
myfun <- function(file, what) {
  x <- readLines(file)

  if (!(identical(what, "Basic Stats") || identical(what, "Histogram"))) {
    stop("what value is not supported")
  }

  roi_idx <- grep("ROI:", x, fixed = TRUE)
  section_idx <- grep(what, x, fixed = TRUE)

  df_list <- list()
  for (counter in seq_along(roi_idx)) {
    low <- roi_idx[counter]
    # `high` is the first line that no longer belongs to this ROI. For the
    # last ROI that is one past the end, so the file's final line is kept.
    high <- if (counter < length(roi_idx)) {
      roi_idx[counter + 1L]
    } else {
      length(x) + 1L
    }

    in_roi <- section_idx[section_idx > low & section_idx < high]
    if (length(in_roi) == 0L) {
      stop("no '", what, "' section found for the ROI starting at line ", low)
    }
    min_ind <- min(in_roi)

    # Header looks like "ROI: red_1 [Red] 20 points", optionally preceded by
    # other words such as "Stats for". Dropping everything up to "ROI:" makes
    # the three tokens of interest sit at the same position in both forms.
    header <- sub("^.*ROI:\\s*", "", x[low])
    title <- gsub("\\[|\\]", "", strsplit(header, " ", fixed = TRUE)[[1]][1:3])

    if (what == "Basic Stats") {
      # Header row followed by the four Band rows.
      cells <- strsplit(x[min_ind + 0:4], "\t", fixed = TRUE)
      x1 <- data.frame(do.call("rbind", cells), stringsAsFactors = FALSE)
      colnames(x1) <- unlist(x1[1, ], use.names = FALSE)
      x1 <- x1[2:5, ]
    } else {
      # All histogram lines of this ROI. Blank separator lines carry no data.
      lines <- x[min_ind:(high - 1L)]
      lines <- lines[grepl("\\S", lines)]
      cells <- strsplit(lines, "\t", fixed = TRUE)
      x1 <- data.frame(do.call("rbind", cells), stringsAsFactors = FALSE)
      colnames(x1) <- unlist(x1[1, ], use.names = FALSE)
      colnames(x1)[1] <- "Histogram"

      # Every band has its own "Histogram" header row. The distance between
      # consecutive header rows is the number of rows belonging to that band.
      is_header <- grepl("Histogram", x1$Histogram)
      rows_per_band <- diff(c(which(is_header), nrow(x1) + 1L))
      band <- gsub("[Band ]", "", grep("Band", x1$Histogram, value = TRUE))
      bin <- gsub("[Bin= ]", "", grep("Bin", x1$Histogram, value = TRUE))
      x1$Band <- rep(band, rows_per_band)
      x1$Bin <- rep(bin, rows_per_band)

      x1 <- x1[!is_header, ]
      x1$Histogram <- NULL
    }

    # Append the ROI description as the last three columns.
    x1$ROI <- rep(title[1], nrow(x1))
    x1$color <- rep(title[2], nrow(x1))
    x1$points <- rep(title[3], nrow(x1))
    colnames(x1) <- gsub(" ", "", colnames(x1), fixed = TRUE)

    # Convert the parsed text to numbers.
    num_cols <- if (what == "Basic Stats") 2:5 else c(1:7, 10)
    x1[, num_cols] <- lapply(x1[, num_cols], as.numeric)

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}

1.提取基本统计数据：

# Parse the "Basic Stats" table of every ROI in test2.txt into a list of data
# frames. The commented lines under each expression show its printed output.
df_list <- myfun(file = "test2.txt", what = 'Basic Stats')
df_list[[1]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.013282 0.133982 0.061581 0.034069 red_1   Red     20
# 3      Band 2 0.009866 0.112935 0.042688 0.026618 red_1   Red     20
# 4      Band 3 0.008304 0.037059 0.018434 0.007515 red_1   Red     20
# 5      Band 4 0.004726 0.040089 0.018490 0.009605 red_1   Red     20
df_list[[2]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.032262 0.124425 0.078073 0.028031 red_2   Red     12
# 3      Band 2 0.021072 0.064156 0.037923 0.012178 red_2   Red     12
# 4      Band 3 0.013404 0.066043 0.036316 0.014787 red_2   Red     12
# 5      Band 4 0.005162 0.055781 0.015526 0.013255 red_2   Red     12
df_list[[3]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.037488 0.107830 0.057892 0.018964 red_3   Red     12
# 3      Band 2 0.028140 0.072370 0.045340 0.014507 red_3   Red     12
# 4      Band 3 0.014960 0.112973 0.032751 0.026575 red_3   Red     12
# 5      Band 4 0.006566 0.029133 0.018201 0.006897 red_3   Red     12

2. 提取直方图数据：

# Same call as before, but asking for the "Histogram" rows of every ROI.
df_list <- myfun(file = "test2.txt", what = 'Histogram')

查看前几行：

# First rows of the histogram data frame for the first ROI; the commented
# lines are the printed output.
head(df_list[[1]])
#         DN Npts Total Percent AccPct Band     Bin   ROI color points
# 2 0.013282    1     1       5      5    1 0.00047 red_1   Red     20
# 3 0.013755    0     1       0      5    1 0.00047 red_1   Red     20
# 4 0.014228    0     1       0      5    1 0.00047 red_1   Red     20
# 5 0.014702    0     1       0      5    1 0.00047 red_1   Red     20
# 6 0.015175    0     1       0      5    1 0.00047 red_1   Red     20
# 7 0.015648    0     1       0      5    1 0.00047 red_1   Red     20

汇总统计：

# Summarise the DN column of a histogram data frame per band with data.table.
# The commented lines under each expression show its printed output.
library('data.table')
df1 <- df_list[[1]]
# setDT() turns df1 into a data.table so the [i, j, by] syntax can be used;
# .( ... ) builds the summary columns and by = 'Band' groups the rows by band.
setDT(df1)[, .( Min = min( DN ),
                Max = max( DN ),
                Mean = mean( DN ),
                Stdev = sd( DN ) ), by = 'Band']
#    Band      Min      Max       Mean       Stdev
# 1:    1 0.013282 0.133982 0.07363182 0.035048124
# 2:    2 0.009866 0.112935 0.06140034 0.029928470
# 3:    3 0.008304 0.037059 0.02268180 0.008349628
# 4:    4 0.004726 0.040089 0.02240761 0.010268456

# Same per-band summary for the second ROI.
df2 <- df_list[[2]]
setDT(df2)[, .( Min = min( DN ),
                Max = max( DN ),
                Mean = mean( DN ),
                Stdev = sd( DN ) ), by = 'Band']
#    Band      Min      Max       Mean      Stdev
# 1:    1 0.032262 0.124425 0.07834352 0.02676153
# 2:    2 0.021072 0.064156 0.04261389 0.01251049
# 3:    3 0.013404 0.066043 0.03972310 0.01528497
# 4:    4 0.005162 0.055781 0.03047151 0.01469855

在 R 中使用 grep (?) 删除文本文件中的行

Delete rows in text file with grep (?) in R

r

dataframe

data-cleaning