手动复制 dict underlying code:the value can't be object 的问题
A problem in manually reproducing dict underlying code:the value can't be object
我尝试手动复现pythondict底层代码(只能实现setitem
和getitem
),但是我遇到了一个问题:我写的hashmap只有当值类型是基本数据类型(如int,str等)时才能正常工作。如果值的类型是object,hashmap只能设置item,python获取object类型的值会崩溃
错误信息是“
致命错误:已跟踪 GC 对象
我猜可能有几个问题:
- something wrong in the (PyTypeObject)PyHashMap_Type definition
- The getitem method return's PyObject*, but the python cannot resolve the object's pointer
以下是我测试过的一些案例
case1:the key and value types are all basic data types, it works well
case2:the key's type is object,but the value type is basic data type ,
it works well
case3:the key'type is basic data type, but the value type is object,
the python crash when getting item, even though the value is the an
empty object
case4:the key and value types are all object, the result is the same
as case3
// the PyTypeObject
static PyTypeObject PyHashMap_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"hashmap",
sizeof(PyMapObject),
0,
(destructor)PyHashMap_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)repr_func, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE | Py_TPFLAGS_DICT_SUBCLASS, /* tp_flags */
0, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
_HashMap_New, /* tp_new */
PyObject_GC_Del, /* tp_free */
};
// the hashmap struct
typedef struct _mapkeysobject PyMapKeysObject;
typedef struct {
Py_hash_t me_hash;
PyObject *me_key;
PyObject *me_value;
} PyMapKeyEntry;
struct _mapkeysobject {
Py_ssize_t dk_size;
Py_ssize_t dk_usable;
PyMapKeyEntry dk_entries[1];
};
typedef struct {
PyObject_HEAD
Py_ssize_t ma_used;
PyMapKeysObject *ma_keys;
PyObject **ma_values;
} PyMapObject;
// get item methods, the python call GET_ITEM_WRAPPER
static PyObject* GET_ITEM_WRAPPER(PyObject* self, PyObject* args){
PyObject * o = NULL;
PyObject * key = NULL;
if (!PyArg_ParseTuple(args, "OO",&o,&key)) {
printf("error: arg list error");
Py_RETURN_NONE;
}
PyObject* value = PyMap_GetItem(o, key);
if (value == NULL) Py_RETURN_NONE;
return value;
}
static PyObject* PyMap_GetItem(PyObject* o, PyObject* key){
PyMapObject *mp;
Py_hash_t hash;
mp = (PyMapObject *)o;
hash = PyObject_Hash(key);
if (hash == -1)
return NULL;
return searchmap(mp, key, hash);
}
static PyObject* searchmap(PyMapObject* mp, PyObject* key, Py_hash_t hash){
PyObject **value_addr;
PyMapKeyEntry *ep;
ep = lookup_function(mp, key, hash, &value_addr);
if (ep == NULL) return NULL;
return ep->me_value;
}
PyMapKeyEntry* lookup_function(PyMapObject* mp, PyObject* key, Py_hash_t hash, PyObject ***value_addr){
size_t i;
size_t perturb;
size_t mask = DK_MASK(mp->ma_keys);
PyMapKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
PyMapKeyEntry *ep;
i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key) {
*value_addr = &ep->me_value;
return ep;
}
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL || ep->me_key == key) {
*value_addr = &ep->me_value;
return ep;
}
}
assert(0); /* NOT REACHED */
return 0;
}
我希望值类型可以是对象
我相信你搞砸了引用计数(虽然很难绝对确定代码不完整)。
您对存储值的引用计数应至少为 1(以表示您将对它的引用作为字典的一部分)。但是,在您 return 时,它应该是两个(一个用于您持有的参考,一个用于参考 return 值)。
我怀疑 PyMap_GetItem
你应该这样做:
PyObject *ret_val = searchmap(mp, key, hash);
Py_XINCREF(ret_val);
return ret_val;
(虽然我似乎并没有在 PyHashMap_Type
中实际设置 sequence/mapping 方法所以谁知道 PyMap_GetItem
是否实际使用...)
这似乎适用于整数和字符串的原因是 Python "interns" 小整数和短字符串,本质上保持对它们的恒定引用。如果你一直查找它们,那么它最终会崩溃(每次你这样做都会丢失一个参考)。
我尝试手动复现pythondict底层代码(只能实现setitem
和getitem
),但是我遇到了一个问题:我写的hashmap只有当值类型是基本数据类型(如int,str等)时才能正常工作。如果值的类型是object,hashmap只能设置item,python获取object类型的值会崩溃
错误信息是“ 致命错误:已跟踪 GC 对象
我猜可能有几个问题:
- something wrong in the (PyTypeObject)PyHashMap_Type definition
- The getitem method return's PyObject*, but the python cannot resolve the object's pointer
以下是我测试过的一些案例
case1:the key and value types are all basic data types, it works well
case2:the key's type is object,but the value type is basic data type , it works well
case3:the key'type is basic data type, but the value type is object, the python crash when getting item, even though the value is the an empty object
case4:the key and value types are all object, the result is the same as case3
// the PyTypeObject
static PyTypeObject PyHashMap_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"hashmap",
sizeof(PyMapObject),
0,
(destructor)PyHashMap_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)repr_func, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE | Py_TPFLAGS_DICT_SUBCLASS, /* tp_flags */
0, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
_HashMap_New, /* tp_new */
PyObject_GC_Del, /* tp_free */
};
// the hashmap struct
typedef struct _mapkeysobject PyMapKeysObject;
typedef struct {
Py_hash_t me_hash;
PyObject *me_key;
PyObject *me_value;
} PyMapKeyEntry;
struct _mapkeysobject {
Py_ssize_t dk_size;
Py_ssize_t dk_usable;
PyMapKeyEntry dk_entries[1];
};
typedef struct {
PyObject_HEAD
Py_ssize_t ma_used;
PyMapKeysObject *ma_keys;
PyObject **ma_values;
} PyMapObject;
// get item methods, the python call GET_ITEM_WRAPPER
static PyObject* GET_ITEM_WRAPPER(PyObject* self, PyObject* args){
PyObject * o = NULL;
PyObject * key = NULL;
if (!PyArg_ParseTuple(args, "OO",&o,&key)) {
printf("error: arg list error");
Py_RETURN_NONE;
}
PyObject* value = PyMap_GetItem(o, key);
if (value == NULL) Py_RETURN_NONE;
return value;
}
static PyObject* PyMap_GetItem(PyObject* o, PyObject* key){
PyMapObject *mp;
Py_hash_t hash;
mp = (PyMapObject *)o;
hash = PyObject_Hash(key);
if (hash == -1)
return NULL;
return searchmap(mp, key, hash);
}
static PyObject* searchmap(PyMapObject* mp, PyObject* key, Py_hash_t hash){
PyObject **value_addr;
PyMapKeyEntry *ep;
ep = lookup_function(mp, key, hash, &value_addr);
if (ep == NULL) return NULL;
return ep->me_value;
}
PyMapKeyEntry* lookup_function(PyMapObject* mp, PyObject* key, Py_hash_t hash, PyObject ***value_addr){
size_t i;
size_t perturb;
size_t mask = DK_MASK(mp->ma_keys);
PyMapKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
PyMapKeyEntry *ep;
i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key) {
*value_addr = &ep->me_value;
return ep;
}
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL || ep->me_key == key) {
*value_addr = &ep->me_value;
return ep;
}
}
assert(0); /* NOT REACHED */
return 0;
}
我希望值类型可以是对象
我相信你搞砸了引用计数(虽然很难绝对确定代码不完整)。
您对存储值的引用计数应至少为 1(以表示您将对它的引用作为字典的一部分)。但是,在您 return 时,它应该是两个(一个用于您持有的参考,一个用于参考 return 值)。
我怀疑 PyMap_GetItem
你应该这样做:
PyObject *ret_val = searchmap(mp, key, hash);
Py_XINCREF(ret_val);
return ret_val;
(虽然我似乎并没有在 PyHashMap_Type
中实际设置 sequence/mapping 方法所以谁知道 PyMap_GetItem
是否实际使用...)
这似乎适用于整数和字符串的原因是 Python "interns" 小整数和短字符串,本质上保持对它们的恒定引用。如果你一直查找它们,那么它最终会崩溃(每次你这样做都会丢失一个参考)。