pandas "pandas.errors.ParserError: Error tokenizing data. C error: Unknown error in IO callback"
pandas "pandas.errors.ParserError: Error tokenizing data. C error: Unknown error in IO callback"
I'm using pandas read_csv to read in a 3.8 GB pipe-delimited text file, but it errors out while reading the file into memory. Here is the full error thrown by my read_in_files() function:
Reading in file C:\Users\cdabel\Desktop\_Temp\Master_Extract_Data_Mart_201909240935.txt
Traceback (most recent call last):
File "<stdin>", line 10, in <module>
File "<stdin>", line 7, in read_in_files
File "c:\python36\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "c:\python36\lib\site-packages\pandas\io\parsers.py", line 463, in _read
data = parser.read(nrows)
File "c:\python36\lib\site-packages\pandas\io\parsers.py", line 1154, in read
ret = self._engine.read(nrows)
File "c:\python36\lib\site-packages\pandas\io\parsers.py", line 2048, in read
data = self._reader.read(nrows)
File "pandas\_libs\parsers.pyx", line 879, in pandas._libs.parsers.TextReader.read
File "pandas\_libs\parsers.pyx", line 894, in pandas._libs.parsers.TextReader._read_low_memory
File "pandas\_libs\parsers.pyx", line 948, in pandas._libs.parsers.TextReader._read_rows
File "pandas\_libs\parsers.pyx", line 935, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas\_libs\parsers.pyx", line 2130, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Unknown error in IO callback
What could be causing this error? Could it be memory-related? How can I troubleshoot this? Should I read the data in chunks?

I don't suspect a RAM issue, because I had more than 7 GB of unused RAM when the function was called, as shown by the Windows 10 Task Manager performance monitor. Also, I can't provide any sample of the underlying data, since it is health and PII data.
Here's an excerpt of my code:
import os
import pandas as pd

# File (raw string avoids backslash escape issues in Windows paths)
filepath = r"C:\Temp\datafile.txt"
filename_w_ext = "datafile.txt"

# Read in TXT file
def read_in_files(filepath, filename_w_ext):
    filename, file_ext = os.path.splitext(filename_w_ext)
    print('Reading in file {}'.format(filepath))
    with open(filepath, "r", newline='') as file:
        global df_data
        # Here's where it errors:
        df_data = pd.read_csv(file, dtype=str, sep='|')
    return df_data.columns.values.tolist(), df_data.values.tolist()
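One way to sanity-check the RAM assumption above is to parse only a sample and extrapolate. This is a minimal sketch, not part of my original code; the 100,000-row sample size is arbitrary, and the row-count estimate assumes roughly uniform line lengths:

import os
import pandas as pd

filepath = r"C:\Temp\datafile.txt"   # same file as above
SAMPLE_ROWS = 100_000                # arbitrary sample size

# Parse only the first SAMPLE_ROWS rows; this should succeed even if the
# full read does not.
sample = pd.read_csv(filepath, dtype=str, sep='|', nrows=SAMPLE_ROWS)

# In-memory size of the sample, counting the Python string objects too.
sample_mem = sample.memory_usage(deep=True).sum()

# Estimate the total row count from the on-disk size and the average
# on-disk row length within the sample.
with open(filepath, 'rb') as f:
    header = f.readline()
    sample_disk = sum(len(f.readline()) for _ in range(SAMPLE_ROWS))
total_rows = (os.path.getsize(filepath) - len(header)) / (sample_disk / SAMPLE_ROWS)

print('Estimated full-file memory: {:.1f} GB'.format(
    sample_mem / SAMPLE_ROWS * total_rows / 1024**3))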
Googling this specific error only turns up the source of the error handling in the pandas Tokenizer code, lines 583-612:
static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
    int status;
    size_t bytes_read;

    status = 0;
    self->datapos = 0;
    self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
    TRACE((
        "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
        nbytes, bytes_read, status));
    self->datalen = bytes_read;

    if (status != REACHED_EOF && self->data == NULL) {
        int64_t bufsize = 200;

        self->error_msg = (char *)malloc(bufsize);
        if (status == CALLING_READ_FAILED) {
            snprintf(self->error_msg, bufsize,
                     "Calling read(nbytes) on source failed. "
                     "Try engine='python'.");
        } else {
            snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
        }
        return -1;
    }

    TRACE(("datalen: %d\n", self->datalen));

    return status;
}
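Note that the CALLING_READ_FAILED branch above suggests falling back to the Python parsing engine. That wasn't the branch I hit, but for completeness the fallback would look like this; whether it helps depends on the root cause, since the Python engine still has to hold the full result in memory:

# Slower, but sidesteps the C tokenizer's IO callback entirely
df_data = pd.read_csv(filepath, dtype=str, sep='|', engine='python')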
After testing on a more powerful server, I now realize this error was evidently caused by my 4 GB file, with its 114 columns, needing 25 to 35 GB of available RAM. That really should raise an out-of-memory error instead, but I suppose memory usage grew faster than the Tokenizer code could detect how close it was to running out.
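Given that, chunking is the workable approach on the original machine. A minimal sketch, assuming the same pipe-delimited layout; process_chunk is a hypothetical placeholder for whatever per-chunk work you actually need:

import pandas as pd

CHUNK_ROWS = 1_000_000  # tune so one chunk fits comfortably in RAM

reader = pd.read_csv(r"C:\Temp\datafile.txt", dtype=str, sep='|',
                     chunksize=CHUNK_ROWS)
for chunk in reader:
    # Reduce each chunk before keeping it (filter rows, select columns,
    # write to a database, ...); accumulating all 114 str columns in full
    # just recreates the memory problem.
    process_chunk(chunk)  # hypothetical placeholder for the real work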