如何从数据集中删除无用的元素
How to remove not useful elements from a dataset
我有一个数据集,如下所示:
{0: {"address": 0,
"ctag": "TOP",
"deps": defaultdict(<class "list">, {"ROOT": [6, 51]}),
"feats": "",
"head": "",
"lemma": "",
"rel": "",
"tag": "TOP",
"word": ""},
1: {"address": 1,
"ctag": "Ne",
"deps": defaultdict(<class "list">, {"NPOSTMOD": [2]}),
"feats": "_",
"head": 6,
"lemma": "اشرف",
"rel": "SBJ",
"tag": "Ne",
"word": "اشرف"},
我想从这个数据集中删除 "deps": ... 这一项。我试过下面这段代码但没有用,因为 "deps": 的值在字典的每个元素中都不同。
import re
import simplejson as simplejson
# Attempt from the question: read the dump and try to evaluate it as Python.
with open("../data/cleaned.txt", 'r') as fp:
    lines = fp.readlines()

# str() of a list is the repr of that list, not the original text of the file.
k = str(lines)
a = re.sub(r'\d:', '', k) # this is for removing numbers like `1:{..`
json_data = simplejson.dumps(a)
#print(json_data)
# NOTE(review): eval executes whatever expression it is given; only use it on
# trusted input.
n = eval(k.replace('defaultdict(<class "list">', 'list'))
print(n)
尝试
import json
# Load the file as JSON, then strip the "deps" entry from every record.
# NOTE: json.load only succeeds when the file content is valid JSON.
# The sample data contains non-ASCII text; this assumes the file is UTF-8.
with open("../data/cleaned.txt", 'r', encoding="utf-8") as fp:
    data = json.load(fp)

# pop() with a default does nothing for records that have no "deps" key.
for value in data.values():
    value.pop("deps", None)
现在您将拥有不含 deps
的数据。如果您想将记录转储到新文件
# json.dump writes to an open file object, not to a file name.
with open("output.json", "w") as out:
    json.dump(data, out)
怎么样
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Two sample records, each carrying a "deps" entry that should be removed.
data = {
    0: {
        "address": 0,
        "ctag": "TOP",
        "deps": 'something',
        "feats": "",
        "head": "",
        "lemma": "",
        "rel": "",
        "tag": "TOP",
        "word": "",
    },
    1: {
        "address": 1,
        "ctag": "Ne",
        "deps": 'something',
        "feats": "_",
        "head": 6,
        "lemma": "اشرف",
        "rel": "SBJ",
        "tag": "Ne",
        "word": "اشرف",
    },
}

# Remove "deps" from every record in place; pop() with a default is a no-op
# when the key is absent, so no membership test is needed.
for value in data.values():
    value.pop('deps', None)
正确的方法是修复生成该文本文件的代码。文件中出现的 defaultdict(<class "list">, {"ROOT": [6, 51]}) 表明,它在本应使用更合适的序列化格式的地方只是简单地用了 repr。
如果无法真正修复,以下只是一个穷人的解决方法。
去掉 "deps": ... 很容易:逐行读取文件,并丢弃所有以 "deps" 开头的行(忽略行首的空白)即可。但这还不够,因为 JSON 要求键只能是字符串,而文件中包含数字键。因此必须识别出数字键并为它们加上引号。
这可能允许加载文件:
import json
import re

# The sample data contains non-ASCII text; this assumes the file is UTF-8.
with open("../data/cleaned.txt", 'r', encoding="utf-8") as fp:
    # Skip every line that starts with "deps" (leading whitespace ignored) and
    # wrap each run of digits that is not preceded by a word character in
    # double quotes, so the numeric keys become valid JSON keys.  The pattern
    # also matches bare numeric values such as `"head": 6`, which therefore
    # load as strings.
    k = ''.join(re.sub(r'(?<!\w)(\d+)', r'"\1"', line)
                for line in fp if not line.strip().startswith('"deps"'))

# remove an eventual last comma
k = re.sub(r',\s*$', '', k)
# uncomment if the file does not contain the last }
# k += '}'
js = json.loads(k)
我有一个数据集,如下所示:
{0: {"address": 0,
"ctag": "TOP",
"deps": defaultdict(<class "list">, {"ROOT": [6, 51]}),
"feats": "",
"head": "",
"lemma": "",
"rel": "",
"tag": "TOP",
"word": ""},
1: {"address": 1,
"ctag": "Ne",
"deps": defaultdict(<class "list">, {"NPOSTMOD": [2]}),
"feats": "_",
"head": 6,
"lemma": "اشرف",
"rel": "SBJ",
"tag": "Ne",
"word": "اشرف"},
我想从这个数据集中删除 "deps": ... 这一项。我试过下面这段代码但没有用,因为 "deps": 的值在字典的每个元素中都不同。
import re
import simplejson as simplejson
# Attempt from the question: read the dump and try to evaluate it as Python.
with open("../data/cleaned.txt", 'r') as fp:
    lines = fp.readlines()

# str() of a list is the repr of that list, not the original text of the file.
k = str(lines)
a = re.sub(r'\d:', '', k) # this is for removing numbers like `1:{..`
json_data = simplejson.dumps(a)
#print(json_data)
# NOTE(review): eval executes whatever expression it is given; only use it on
# trusted input.
n = eval(k.replace('defaultdict(<class "list">', 'list'))
print(n)
尝试
import json
# Load the file as JSON, then strip the "deps" entry from every record.
# NOTE: json.load only succeeds when the file content is valid JSON.
# The sample data contains non-ASCII text; this assumes the file is UTF-8.
with open("../data/cleaned.txt", 'r', encoding="utf-8") as fp:
    data = json.load(fp)

# pop() with a default does nothing for records that have no "deps" key.
for value in data.values():
    value.pop("deps", None)
现在您将拥有不含 deps
的数据。如果您想将记录转储到新文件
# json.dump writes to an open file object, not to a file name.
with open("output.json", "w") as out:
    json.dump(data, out)
怎么样
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Two sample records, each carrying a "deps" entry that should be removed.
data = {
    0: {
        "address": 0,
        "ctag": "TOP",
        "deps": 'something',
        "feats": "",
        "head": "",
        "lemma": "",
        "rel": "",
        "tag": "TOP",
        "word": "",
    },
    1: {
        "address": 1,
        "ctag": "Ne",
        "deps": 'something',
        "feats": "_",
        "head": 6,
        "lemma": "اشرف",
        "rel": "SBJ",
        "tag": "Ne",
        "word": "اشرف",
    },
}

# Remove "deps" from every record in place; pop() with a default is a no-op
# when the key is absent, so no membership test is needed.
for value in data.values():
    value.pop('deps', None)
正确的方法是修复生成该文本文件的代码。文件中出现的 defaultdict(<class "list">, {"ROOT": [6, 51]}) 表明,它在本应使用更合适的序列化格式的地方只是简单地用了 repr。
如果无法真正修复,以下只是一个穷人的解决方法。
去掉 "deps": ... 很容易:逐行读取文件,并丢弃所有以 "deps" 开头的行(忽略行首的空白)即可。但这还不够,因为 JSON 要求键只能是字符串,而文件中包含数字键。因此必须识别出数字键并为它们加上引号。
这可能允许加载文件:
import json
import re

# The sample data contains non-ASCII text; this assumes the file is UTF-8.
with open("../data/cleaned.txt", 'r', encoding="utf-8") as fp:
    # Skip every line that starts with "deps" (leading whitespace ignored) and
    # wrap each run of digits that is not preceded by a word character in
    # double quotes, so the numeric keys become valid JSON keys.  The pattern
    # also matches bare numeric values such as `"head": 6`, which therefore
    # load as strings.
    k = ''.join(re.sub(r'(?<!\w)(\d+)', r'"\1"', line)
                for line in fp if not line.strip().startswith('"deps"'))

# remove an eventual last comma
k = re.sub(r',\s*$', '', k)
# uncomment if the file does not contain the last }
# k += '}'
js = json.loads(k)