在 R 中写入数据帧时出错

Error in writing data frame in R

我正在尝试从我从 OCR 格式的 pdf 文件中提取的文本中搜索单词。这个 pdf 文件有多个页面,所以对于每一页,我都在搜索那个词,如果找到那个词然后写 filename, status (存在或不存在),Page 在其上找到它以及它在数据帧中找到的 words 。但是数据框给出了所有文件的状态"Present",我只想这样

file_name       Status        Page              words
test1.pdf    "Present"       test1_2,test1_4    gym,school
test2.pdf    "Not Present"     -                 -
test3.pdf    "Present"       test3_1            gym

我在这段代码中遗漏了什么。

这是代码

    All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
    else "Present" 
    print(br)       
    if(br=="Present") {
      v1[i] <- j
      break}

    for(k in chk_words){ 
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }




  }

  print(tc)
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else 
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))


}

任何建议都是可取的。

谢谢

这里是一个单词的选项

v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

    cnt <- pdf_info(All_files[i])$pages
    print(cnt)

    for(j in seq_len(cnt)){
      img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
      text <- ocr(img_file)
      ocr_text <- capture.output(cat(text))
      check <- sapply(ocr_text, paste, collapse="")
      junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
      file.remove(junk)
      br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
              else "Present" 
      print(br)       
      if(br=="Present") {
         v1[i] <- j
         break}

    }

    Status <- if(v1[i] == 0) "Not Present" else "Present"
    pages <- if(v1[i] == 0) "-" else 
     paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
    words <- if(v1[i] == 0) "-" else word
    df <- rbind(df, cbind(file_name = basename(file_name),
              Status, pages = pages, words = words))


}

-输出

df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school