运行导入 glob 包以扫描 PDF 目录的 Python 脚本时出现问题

Question

我正在尝试运行一个程序，该程序应扫描“New folder”目录中的所有 PDF 文件，提取相关的字符串值，并在一个新的 CSV 文件中生成表格。

代码如下

    def check_rate(rating):
        """Map a raw rating string from an audit report to a canonical label.

        The comparison is case-insensitive. The first two characters are
        checked against the known rating codes first; if none matches, the
        whole text is searched for one of the label phrases.

        Args:
            rating: Raw rating text (a ``str``).

        Returns:
            One of ``"Unsafe"``, ``"Needs Rectification"``,
            ``"Improvements identified"`` or ``"Adequate"``, or ``None`` when
            the text matches neither a known code nor a known phrase.
        """
        unsafe = "Unsafe"
        needs_rectification = "Needs Rectification"
        improvements = "Improvements identified"
        adequate = "Adequate"

        text = rating.upper()

        # Two-character rating codes take precedence over phrase matching.
        labels_by_code = {
            "1H": unsafe,
            "2H": needs_rectification,
            "2M": improvements,
            "2L": adequate,
            "3L": adequate,
        }
        label = labels_by_code.get(text[:2])
        if label is not None:
            return label

        # Phrases are tried in this fixed order; the first one found wins.
        labels_by_phrase = (
            ("UNSAFE", unsafe),
            ("NEEDS RECTIFICATION", needs_rectification),
            ("IMPROVEMENTS IDENTIFIED", improvements),
            ("ADEQUATE", adequate),
        )
        for phrase, label in labels_by_phrase:
            if phrase in text:
                return label

        return None


import glob
import pandas as pd

# Every path returned by the glob below starts with "./New folder/", which is
# 13 characters long, so the ID embedded in the file name starts at index 13.
ID_START = 13
# The ID is 9, 10 or 11 characters long; it ends at the first of these
# positions that holds a separator character.
ID_END_CANDIDATES = (22, 23, 24)
ID_SEPARATORS = " -."


def extract_sv_id(path):
    """Return the SV_ID embedded in a PDF path, or "N/A" if none is found.

    The ID is the text between index 13 and the first of positions 22, 23
    and 24 that contains a space, a hyphen or a dot. Paths too short to
    reach a candidate position are treated as having no separator there
    instead of raising IndexError.
    """
    for end in ID_END_CANDIDATES:
        if len(path) > end and path[end] in ID_SEPARATORS:
            return path[ID_START:end]
    return "N/A"


files=glob.glob("./New folder/*.pdf")
df_name=pd.DataFrame(files,columns=['FileName'])
# y (the number of PDF files) is reused by the parsing loop further down.
y=len(df_name)
df_name["SV_ID"]=[extract_sv_id(name) for name in df_name["FileName"]]
df_name.to_csv('name2.csv')
df_ref=pd.read_csv('CheckListItems.csv')
df_Rate=pd.read_csv('TechSafe.csv')

from tika import parser

# Fail before any PDF is parsed if the checklist has no usable column,
# instead of surfacing a bare KeyError from deep inside the loop.
if "Reference" not in df_ref.columns:
    raise KeyError("CheckListItems.csv must contain a 'Reference' column")

# Build the lookup once; the references do not change while parsing.
references = set(df_ref["Reference"])

# A rating is searched for on the lines following a reference; the scan
# covers offsets 1..44 from the reference line.
MAX_LOOKAHEAD = 45
REPORT_TITLE = 'SOLAR VICTORIA AUDIT REPORT'

line=0  # row label in df_Rate that the next finding is written to
for n1 in range(y):
    rawText = parser.from_file(df_name['FileName'][n1])

    # NOTE(review): tika can report no content (e.g. a PDF without a text
    # layer); treat that as an empty document rather than crashing.
    rawList = (rawText.get('content') or '').splitlines()
    long=len(rawList)
    version=''

    for n, raw in enumerate(rawList):
        word=raw.strip().upper()
        if word.startswith(REPORT_TITLE):
            # Everything from index 43 of the title line is kept as the
            # report version (presumably a fixed-width title prefix).
            version=word[43:]
        if word.endswith("."):
            word=word[:-1]
        if word not in references:
            continue

        # Scan forward from the reference line for its rating.
        for add in range(1, min(MAX_LOOKAHEAD, long-n)):
            wordcheck=rawList[n+add].strip()
            if wordcheck.endswith("."):
                wordcheck=wordcheck[:-1]

            if wordcheck.startswith("Rating -"):
                # The rating text is on the following line; use an empty
                # string if the document ends right after this line.
                following=n+add+1
                rating_text=rawList[following].strip() if following<long else ""
                step=1
            elif wordcheck.startswith("Rating "):
                # The rating text follows the label on the same line.
                rating_text=wordcheck[7:]
                # NOTE(review): the original script advanced `line` twice in
                # this branch (possibly a leftover of the removed "Old
                # process" code). Kept as-is; confirm whether the skipped
                # row label is intended.
                step=2
            else:
                rating_text=None

            if rating_text is not None:
                df_Rate.at[line,'SV_ID']=df_name['SV_ID'][n1]
                df_Rate.at[line,'Ref']=word
                df_Rate.at[line,'Rate']=check_rate(rating_text)
                df_Rate.at[line,'Version']=version
                line+=step
                break

            # Reaching the next checklist reference ends the search for
            # the current one.
            if wordcheck in references:
                break

    print((n1+1)*100//y,"%")
df_Rate.to_csv('1.1 Finding Draft.csv')
df_Rate.tail()

这应该会产生如下 table

SV_ID	Ref	Rate	Version
INS00102811	PCE 23	Improvements identified	V2.4C

我认为问题不在于 PDF 文件或脚本主体，而主要出在软件包上。这段代码在我朋友的电脑上运行良好（他当着我的面实时运行过），但在我的电脑上却无法运行。我们的 Python 版本可能不同，并且我们都是在 Jupyter Notebook 上运行的。

我收到以下错误：

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
     46     x+=1
     47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
     49 df_Rate=pd.read_csv('TechSafe.csv')
     50 

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    684     )
    685 
--> 686     return _read(filepath_or_buffer, kwds)
    687 
    688 

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    450 
    451     # Create the parser.
--> 452     parser = TextFileReader(fp_or_buf, **kwds)
    453 
    454     if chunksize or iterator:

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    944             self.options["has_index_names"] = kwds["has_index_names"]
    945 
--> 946         self._make_engine(self.engine)
    947 
    948     def close(self):

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
   1176     def _make_engine(self, engine="c"):
   1177         if engine == "c":
-> 1178             self._engine = CParserWrapper(self.f, **self.options)
   1179         else:
   1180             if engine == "python":

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
   2006         kwds["usecols"] = self.usecols
   2007 
-> 2008         self._reader = parsers.TextReader(src, **kwds)
   2009         self.unnamed_cols = self._reader.unnamed_cols
   2010 

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] No such file or directory: 'CheckListItems.csv'

我创建了一个名为“CheckListItems.csv”的空文件。

现在我显示以下错误：

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
     49 df_Rate=pd.read_csv('TechSafe.csv')
     50 
---> 51 from tika import parser
     52 
     53 line=0

ModuleNotFoundError: No module named 'tika'

假设是软件包安装问题，我尝试安装 glob

(base) C:\>pip install glob2
Requirement already satisfied: glob2 in c:\programdata\anaconda3\lib\site-packages (0.7)

我正在使用 Python 版本 3.8.5。

我不确定如何才能让这段代码运行起来。请帮忙。谢谢。

感谢您的宝贵建议。根据建议，我通过 Anaconda 的图形界面安装了 tika。现在遇到的错误如下：

KeyError                                  Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2894             try:
-> 2895                 return self._engine.get_loc(casted_key)
   2896             except KeyError as err:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Reference'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-1-8f6d401ba180> in <module>
     67         if word[-1:]==".":
     68             word=word[:-1]
---> 69         if word in list(df_ref["Reference"]):
     70             add=1
     71             while add<45 and n+add<long:

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2895                 return self._engine.get_loc(casted_key)
   2896             except KeyError as err:
-> 2897                 raise KeyError(key) from err
   2898 
   2899         if tolerance is not None:

KeyError: 'Reference'

Answer 1

错误信息表明找不到 CheckListItems.csv：

FileNotFoundError                   Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
     46     x+=1
     47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')

文件CheckListItems.csv是否存在？尝试使用该名称创建一个空文本文件，然后再次运行。

Answer 2

您收到此错误，99% 的可能是因为该文件不在您的代码所指定的位置 - 抱歉！

行：

df_ref=pd.read_csv('CheckListItems.csv')

是 CheckListItems.csv 文件的相对路径。所以这是我解决问题的方法：

确保 CheckListItems.csv 与您的 jupyter notebook 文件位于同一目录中。如果不是，请将其移至那里。
万一无法解决问题，请提供 CheckListItems.csv 的绝对路径，即

df_ref=pd.read_csv('/home/drislam/documents/python/CheckListItems.csv')

问题运行导入 glob 包以扫描 PDF 目录的 python 脚本

Problem running a python script that imports glob package to scan through a directory of PDFs

python

parsing

glob

问题 运行 导入 glob 包以扫描 PDF 目录的 python 脚本

Problem running a python script that imports glob package to scan through a directory of PDFs

python

parsing

glob

问题运行导入 glob 包以扫描 PDF 目录的 python 脚本