运行一个导入 glob 包以扫描 PDF 目录的 Python 脚本时出现问题
Problem running a python script that imports glob package to scan through a directory of PDFs
我正在尝试运行一个程序,该程序应扫描“New folder”目录中的所有 PDF 文件,提取相关的字符串值,并在一个新的 CSV 文件中生成表格。
代码如下
def check_rate(rating):
    """Translate a raw rating string into one of four rating labels.

    The text is upper-cased first, so matching is case-insensitive.  The
    first two characters are compared with the short codes ("1H", "2H",
    "2M", "2L", "3L"); if none of those match, the whole text is searched
    for the label names themselves, in a fixed priority order.

    Args:
        rating: Raw rating text (a ``str``).

    Returns:
        "Unsafe", "Needs Rectification", "Improvements identified" or
        "Adequate"; ``None`` when the text matches neither a code nor a
        label name.
    """
    U = "Unsafe"
    NR = "Needs Rectification"
    II = "Improvements identified"
    A = "Adequate"

    Rating = rating.upper()

    # Short codes take precedence over label names found in the text.
    by_code = {"1H": U, "2H": NR, "2M": II, "2L": A, "3L": A}
    label = by_code.get(Rating[:2])
    if label is not None:
        return label

    # Fall back to a substring search; the order below is the priority
    # used when more than one label name appears in the text.
    for keyword, label in (
        ("UNSAFE", U),
        ("NEEDS RECTIFICATION", NR),
        ("IMPROVEMENTS IDENTIFIED", II),
        ("ADEQUATE", A),
    ):
        if keyword in Rating:
            return label

    # Nothing recognised: callers receive None.
    return None
import glob
import pandas as pd
# NOTE(review): the indentation of this script was lost in this copy; the
# nesting of the while/if bodies below has to be restored before it can run.
#
# Step 1: collect every PDF path under "./New folder" into a one-column frame.
files=glob.glob("./New folder/*.pdf")
df_name=pd.DataFrame(files,columns=['FileName'])
x=0
y=len(df_name)
df_name["SV_ID"]=""
# Step 2: derive SV_ID from fixed character positions of each path.  Index 13
# is the first character after the 13-character "./New folder/" prefix; the ID
# ends at whichever of indices 22, 23, 24 first holds " ", "-" or ".".  If none
# of them does, "N/A" is stored.
# NOTE(review): a path shorter than 25 characters would raise IndexError on
# the [22]/[23]/[24] lookups.
while x < y:
i1=df_name["FileName"][x][22]
i2=df_name["FileName"][x][23]
i3=df_name["FileName"][x][24]
if i1==" " or i1=="-" or i1==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:22]
elif i2==" " or i2=="-" or i2==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:23]
elif i3==" " or i3=="-" or i3==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:24]
else:
df_name.at[x,"SV_ID"]="N/A"
x+=1
# Save the file-name / SV_ID mapping.
df_name.to_csv('name2.csv')
# Both CSV paths are relative, so they are resolved against the current
# working directory.  CheckListItems.csv must provide a "Reference" column
# (it is read below); TechSafe.csv is the frame the result rows are written to.
df_ref=pd.read_csv('CheckListItems.csv')
df_Rate=pd.read_csv('TechSafe.csv')
# Third-party dependency (the "tika" package) used for PDF text extraction.
from tika import parser
# `line` is the df_Rate row label the next finding is written to;
# `n1` indexes the PDF currently being processed.
line=0
n1=0
while n1<y:
# Extract the PDF text and split it into lines.
# NOTE(review): presumably 'content' can be None when no text is extracted,
# which would make .splitlines() fail - confirm against the tika docs.
rawText = parser.from_file(df_name['FileName'][n1])
rawList = rawText['content'].splitlines()
n=0
long=len(rawList)
version=''
while n<long:
# Each line is compared in upper case with surrounding whitespace removed.
word=rawList[n].strip().upper()
# On the report header line, everything from index 43 onward is kept as the
# version string.
if word[:27]=='SOLAR VICTORIA AUDIT REPORT':
version=word[43:]
# Drop a single trailing "." before matching against the reference list.
if word[-1:]==".":
word=word[:-1]
# NOTE(review): list(df_ref["Reference"]) is rebuilt for every line scanned.
if word in list(df_ref["Reference"]):
# A reference was found: look ahead at most 44 lines for its rating.
# Setting add=45 is how the look-ahead loop is terminated early.
add=1
while add<45 and n+add<long:
# Unlike `word`, `wordcheck` is NOT upper-cased, so the reference test below
# is case-sensitive.
wordcheck=rawList[n+add].strip()
if wordcheck[-1:]==".":
wordcheck=wordcheck[:-1]
# Stop looking ahead once the next reference is reached.
if wordcheck in list(df_ref["Reference"]):
add=45
# Form 1: "Rating -" on its own line; the rating text is on the NEXT line.
# NOTE(review): rawList[n+add+1] is not bounds-checked against `long`.
if wordcheck[:8] == "Rating -":
df_Rate.at[line,'SV_ID']=df_name['SV_ID'][n1]
df_Rate.at[line,'Ref']=word
df_Rate.at[line,'Rate']=check_rate(rawList[n+add+1].strip())
df_Rate.at[line,'Version']=version
add=45
line+=1
# print(df_name['SV_ID'][n1],wordcheck)
# Form 2: "Rating <text>" on one line; the rating follows the 7-char prefix.
elif wordcheck[:7] == "Rating ":
df_Rate.at[line,'SV_ID']=df_name['SV_ID'][n1]
df_Rate.at[line,'Ref']=word
wordcheck=wordcheck[7:]
df_Rate.at[line,'Rate']=check_rate(wordcheck)
df_Rate.at[line,'Version']=version
add=45
line+=1
# print(df_name['SV_ID'][n1],wordcheck)
# Old process
# NOTE(review): with indentation lost it is unclear which block the following
# extra line+=1 belonged to - confirm before restoring the nesting.
line+=1
add+=1
n+=1
n1+=1
# Progress report: integer percentage of PDFs processed so far.
print(n1*100//y,"%")
# Write the collected findings, then show the last rows (notebook display).
df_Rate.to_csv('1.1 Finding Draft.csv')
df_Rate.tail()
这应该会生成如下表格:
SV_ID
参考
评分
版本
INS00102811
个人消费支出 23
Improvements identified
V2.4C
我认为问题不在于 PDF 或脚本主体,而主要出在软件包上。这段代码在我朋友的电脑上运行良好(他当着我的面实时运行过),但在我的电脑上却不起作用。我们可能使用了不同版本的 Python,并且我们都是在 Jupyter Notebook 上运行的。
我收到以下错误:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
46 x+=1
47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
49 df_Rate=pd.read_csv('TechSafe.csv')
50
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer, kwds)
687
688
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
450
451 # Create the parser.
--> 452 parser = TextFileReader(fp_or_buf, **kwds)
453
454 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
944 self.options["has_index_names"] = kwds["has_index_names"]
945
--> 946 self._make_engine(self.engine)
947
948 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1176 def _make_engine(self, engine="c"):
1177 if engine == "c":
-> 1178 self._engine = CParserWrapper(self.f, **self.options)
1179 else:
1180 if engine == "python":
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
2006 kwds["usecols"] = self.usecols
2007
-> 2008 self._reader = parsers.TextReader(src, **kwds)
2009 self.unnamed_cols = self._reader.unnamed_cols
2010
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] No such file or directory: 'CheckListItems.csv'
我创建了一个名为“CheckListItems.csv”的空文件。
现在我显示以下错误:
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
49 df_Rate=pd.read_csv('TechSafe.csv')
50
---> 51 from tika import parser
52
53 line=0
ModuleNotFoundError: No module named 'tika'
假设是软件包安装问题,我尝试安装 glob
(base) C:\>pip install glob2
Requirement already satisfied: glob2 in c:\programdata\anaconda3\lib\site-packages (0.7)
我正在使用 Python 版本 3.8.5。
我不确定如何才能让这段代码运行起来。请帮忙。谢谢。
感谢您的宝贵建议。
根据建议,我通过 Anaconda 的图形界面安装了 tika。
我发现的错误如下:
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Reference'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-1-8f6d401ba180> in <module>
67 if word[-1:]==".":
68 word=word[:-1]
---> 69 if word in list(df_ref["Reference"]):
70 add=1
71 while add<45 and n+add<long:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'Reference'
错误信息表明找不到 CheckListItems.csv:
FileNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
46 x+=1
47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
文件 CheckListItems.csv 是否存在?尝试使用该名称创建一个空文本文件,然后再次运行。
您收到此错误,99% 的可能是因为该文件不在您的代码所指定的位置——抱歉!
下面这一行:
df_ref=pd.read_csv('CheckListItems.csv')
使用的是 CheckListItems.csv 文件的相对路径。因此,我建议按以下方法解决问题:
- 确保 CheckListItems.csv 与您的 jupyter notebook 文件位于同一目录中。如果不是,请将其移至那里。
- 万一无法解决问题,请提供 CheckListItems.csv 的绝对路径,即
df_ref=pd.read_csv('/home/drislam/documents/python/CheckListItems.csv')
我正在尝试运行一个程序,该程序应扫描“New folder”目录中的所有 PDF 文件,提取相关的字符串值,并在一个新的 CSV 文件中生成表格。
代码如下
def check_rate(rating):
    """Translate a raw rating string into one of four rating labels.

    The text is upper-cased first, so matching is case-insensitive.  The
    first two characters are compared with the short codes ("1H", "2H",
    "2M", "2L", "3L"); if none of those match, the whole text is searched
    for the label names themselves, in a fixed priority order.

    Args:
        rating: Raw rating text (a ``str``).

    Returns:
        "Unsafe", "Needs Rectification", "Improvements identified" or
        "Adequate"; ``None`` when the text matches neither a code nor a
        label name.
    """
    U = "Unsafe"
    NR = "Needs Rectification"
    II = "Improvements identified"
    A = "Adequate"

    Rating = rating.upper()

    # Short codes take precedence over label names found in the text.
    by_code = {"1H": U, "2H": NR, "2M": II, "2L": A, "3L": A}
    label = by_code.get(Rating[:2])
    if label is not None:
        return label

    # Fall back to a substring search; the order below is the priority
    # used when more than one label name appears in the text.
    for keyword, label in (
        ("UNSAFE", U),
        ("NEEDS RECTIFICATION", NR),
        ("IMPROVEMENTS IDENTIFIED", II),
        ("ADEQUATE", A),
    ):
        if keyword in Rating:
            return label

    # Nothing recognised: callers receive None.
    return None
import glob
import pandas as pd
# NOTE(review): the indentation of this script was lost in this copy; the
# nesting of the while/if bodies below has to be restored before it can run.
#
# Step 1: collect every PDF path under "./New folder" into a one-column frame.
files=glob.glob("./New folder/*.pdf")
df_name=pd.DataFrame(files,columns=['FileName'])
x=0
y=len(df_name)
df_name["SV_ID"]=""
# Step 2: derive SV_ID from fixed character positions of each path.  Index 13
# is the first character after the 13-character "./New folder/" prefix; the ID
# ends at whichever of indices 22, 23, 24 first holds " ", "-" or ".".  If none
# of them does, "N/A" is stored.
# NOTE(review): a path shorter than 25 characters would raise IndexError on
# the [22]/[23]/[24] lookups.
while x < y:
i1=df_name["FileName"][x][22]
i2=df_name["FileName"][x][23]
i3=df_name["FileName"][x][24]
if i1==" " or i1=="-" or i1==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:22]
elif i2==" " or i2=="-" or i2==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:23]
elif i3==" " or i3=="-" or i3==".":
df_name.at[x,"SV_ID"]=df_name["FileName"][x][13:24]
else:
df_name.at[x,"SV_ID"]="N/A"
x+=1
# Save the file-name / SV_ID mapping.
df_name.to_csv('name2.csv')
# Both CSV paths are relative, so they are resolved against the current
# working directory.  CheckListItems.csv must provide a "Reference" column
# (it is read below); TechSafe.csv is the frame the result rows are written to.
df_ref=pd.read_csv('CheckListItems.csv')
df_Rate=pd.read_csv('TechSafe.csv')
# Third-party dependency (the "tika" package) used for PDF text extraction.
from tika import parser
# `line` is the df_Rate row label the next finding is written to;
# `n1` indexes the PDF currently being processed.
line=0
n1=0
while n1<y:
# Extract the PDF text and split it into lines.
# NOTE(review): presumably 'content' can be None when no text is extracted,
# which would make .splitlines() fail - confirm against the tika docs.
rawText = parser.from_file(df_name['FileName'][n1])
rawList = rawText['content'].splitlines()
n=0
long=len(rawList)
version=''
while n<long:
# Each line is compared in upper case with surrounding whitespace removed.
word=rawList[n].strip().upper()
# On the report header line, everything from index 43 onward is kept as the
# version string.
if word[:27]=='SOLAR VICTORIA AUDIT REPORT':
version=word[43:]
# Drop a single trailing "." before matching against the reference list.
if word[-1:]==".":
word=word[:-1]
# NOTE(review): list(df_ref["Reference"]) is rebuilt for every line scanned.
if word in list(df_ref["Reference"]):
# A reference was found: look ahead at most 44 lines for its rating.
# Setting add=45 is how the look-ahead loop is terminated early.
add=1
while add<45 and n+add<long:
# Unlike `word`, `wordcheck` is NOT upper-cased, so the reference test below
# is case-sensitive.
wordcheck=rawList[n+add].strip()
if wordcheck[-1:]==".":
wordcheck=wordcheck[:-1]
# Stop looking ahead once the next reference is reached.
if wordcheck in list(df_ref["Reference"]):
add=45
# Form 1: "Rating -" on its own line; the rating text is on the NEXT line.
# NOTE(review): rawList[n+add+1] is not bounds-checked against `long`.
if wordcheck[:8] == "Rating -":
df_Rate.at[line,'SV_ID']=df_name['SV_ID'][n1]
df_Rate.at[line,'Ref']=word
df_Rate.at[line,'Rate']=check_rate(rawList[n+add+1].strip())
df_Rate.at[line,'Version']=version
add=45
line+=1
# print(df_name['SV_ID'][n1],wordcheck)
# Form 2: "Rating <text>" on one line; the rating follows the 7-char prefix.
elif wordcheck[:7] == "Rating ":
df_Rate.at[line,'SV_ID']=df_name['SV_ID'][n1]
df_Rate.at[line,'Ref']=word
wordcheck=wordcheck[7:]
df_Rate.at[line,'Rate']=check_rate(wordcheck)
df_Rate.at[line,'Version']=version
add=45
line+=1
# print(df_name['SV_ID'][n1],wordcheck)
# Old process
# NOTE(review): with indentation lost it is unclear which block the following
# extra line+=1 belonged to - confirm before restoring the nesting.
line+=1
add+=1
n+=1
n1+=1
# Progress report: integer percentage of PDFs processed so far.
print(n1*100//y,"%")
# Write the collected findings, then show the last rows (notebook display).
df_Rate.to_csv('1.1 Finding Draft.csv')
df_Rate.tail()
这应该会生成如下表格:
SV_ID | 参考 | 评分 | 版本 |
---|---|---|---|
INS00102811 | 个人消费支出 23 | Improvements identified | V2.4C |
我认为问题不在于 PDF 或脚本主体,而主要出在软件包上。这段代码在我朋友的电脑上运行良好(他当着我的面实时运行过),但在我的电脑上却不起作用。我们可能使用了不同版本的 Python,并且我们都是在 Jupyter Notebook 上运行的。
我收到以下错误:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
46 x+=1
47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
49 df_Rate=pd.read_csv('TechSafe.csv')
50
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer, kwds)
687
688
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
450
451 # Create the parser.
--> 452 parser = TextFileReader(fp_or_buf, **kwds)
453
454 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
944 self.options["has_index_names"] = kwds["has_index_names"]
945
--> 946 self._make_engine(self.engine)
947
948 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1176 def _make_engine(self, engine="c"):
1177 if engine == "c":
-> 1178 self._engine = CParserWrapper(self.f, **self.options)
1179 else:
1180 if engine == "python":
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
2006 kwds["usecols"] = self.usecols
2007
-> 2008 self._reader = parsers.TextReader(src, **kwds)
2009 self.unnamed_cols = self._reader.unnamed_cols
2010
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] No such file or directory: 'CheckListItems.csv'
我创建了一个名为“CheckListItems.csv”的空文件。
现在我显示以下错误:
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
49 df_Rate=pd.read_csv('TechSafe.csv')
50
---> 51 from tika import parser
52
53 line=0
ModuleNotFoundError: No module named 'tika'
假设是软件包安装问题,我尝试安装 glob
(base) C:\>pip install glob2
Requirement already satisfied: glob2 in c:\programdata\anaconda3\lib\site-packages (0.7)
我正在使用 Python 版本 3.8.5。
我不确定如何才能让这段代码运行起来。请帮忙。谢谢。
感谢您的宝贵建议。根据建议,我通过 Anaconda 的图形界面安装了 tika。我遇到的错误如下:
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Reference'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-1-8f6d401ba180> in <module>
67 if word[-1:]==".":
68 word=word[:-1]
---> 69 if word in list(df_ref["Reference"]):
70 add=1
71 while add<45 and n+add<long:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'Reference'
错误信息表明找不到 CheckListItems.csv:
FileNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
46 x+=1
47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
文件 CheckListItems.csv 是否存在?尝试使用该名称创建一个空文本文件,然后再次运行。
您收到此错误,99% 的可能是因为该文件不在您的代码所指定的位置——抱歉!
下面这一行:
df_ref=pd.read_csv('CheckListItems.csv')
使用的是 CheckListItems.csv 文件的相对路径。因此,我建议按以下方法解决问题:
- 确保 CheckListItems.csv 与您的 jupyter notebook 文件位于同一目录中。如果不是,请将其移至那里。
- 万一无法解决问题,请提供 CheckListItems.csv 的绝对路径,即
df_ref=pd.read_csv('/home/drislam/documents/python/CheckListItems.csv')