Python unicode issues in both Python 2 and Python 3
I have a set of Python scripts (https://github.com/hvdwolf/wikiscripts) that parse wikidumps into gpx/osm/csv/sql/sqlite dumps to be used as POI files in navigation applications. I only parse the articles that have coordinates. For that I use the externallinks dump, which contains SQL insert statements; the statements containing the substring "geohack.php" do contain coordinates. I import these into a SQLite database that serves as a reference for the article dumps.
These are all UTF-8 dumps, and parsing all the "western type" files works fine, but Arabic, Farsi, Russian, Japanese, Greek, Chinese and other languages do not. Obviously I am doing something wrong.
The title strings I get look like:
%D9%85%D8%A7%D9%81%D8%B8%D8%A9_%D8%A7%D9%84%D8%A8%D8%AF%D8%A7%D8%A6%D8%B9
%D8%A3%D9%88%D8%B1%D9%8A%D9%88%D9%8A%D9%84%D8%A7
Battle_of_Nicopolis
Qingdao
So some of the normal characters come through fine. The rest is gibberish (to me, anyway).
I did some tests in which I simply read the dump and wrote it back out to a UTF-8 encoded text file (line in => line out), and that works fine, but somewhere in the string-handling functions and the "re." functions my unicode text gets changed.
Edit: My Python scripts start with: # -*- coding: utf-8 -*-
My code (the relevant part, with both Python 2 and Python 3 statements, and some comments showing what I have already tried):
with gzip.open(externallinks_file, 'r') as single_externallinksfile:
    #reader = codecs.getreader("utf-8")
    #single_externallinksfile = reader(single_externallinksfile)
    #with codecs.getreader('utf-8')gzip.open(externallinks_file, 'r') as single_externallinksfile:
    linecounter = 0
    totlinecounter = 0
    filelinecounter = 0
    # We need to read line by line as we have massive files, sometimes multiple GBs
    for line in single_externallinksfile:
        if sys.version_info<(3,0,0):
            line = unicode(line, 'utf-8')
        else:
            line = line.decode("utf-8")
        if "INSERT INTO" in line:
            insert_statements = line.split("),(")
            for statement in insert_statements:
                #statement = statement.decode("utf-8")
                filelinecounter += 1
                #if ("geohack.php?" in statement) and (("pagename" in statement) or ("src=" in statement)):
                # src can also be in the line, but is different and we leave it out for now
                if ("geohack.php?" in statement) and ("pagename" in statement) and ("params" in statement):
                    language = ""
                    region = ""
                    poitype = ""
                    content = re.findall(r'.*?pagename=(.*?)\'\,\'',statement,flags=re.IGNORECASE)
                    if len(content) > 0: # We even need this check due to corrupted lines
                        splitcontent = content[0].split("&")
                        title = splitcontent[0]
                        #title = title.decode('utf8')
                        for subcontent in splitcontent:
                            if "language=" in subcontent:
                                language = subcontent.replace("language=","")
                                #print('taal is: ' + language)
                            if "params=" in subcontent:
                                params_string = subcontent.replace("params=","").split("_")
                                latitude,longitude,poitype,region = get_coordinates_type_region(params_string)
                        if ( str(latitude) != "" and str(longitude) != "" and (str(latitude) != "0") or (str(longitude) != "0")):
                            if GENERATE_SQL == "YES":
                                sql_file.write('insert into ' + file_prefix + '_externallinks values ("' + title + '","' + str(latitude) + '","' + str(longitude) + '","' + language + '","' + poitype + '","' + region + '");\n')
                            if CREATE_SQLITE == "YES":
                                sqlcommand = 'insert into ' + file_prefix + '_externallinks values ("' + title + '","' + str(latitude) + '","' + str(longitude) + '","' + language + '","' + poitype + '","' + region +'");'
                                #print(sqlcommand)
                                cursor.execute(sqlcommand)
                            linecounter += 1
                            if linecounter == 10000:
                                if CREATE_SQLITE == "YES":
                                    # Do a database commit every 10000 rows
                                    wikidb.commit()
                                totlinecounter += linecounter
                                linecounter = 0

print('\nProcessed ' + str(totlinecounter) + ' lines out of ' + str(filelinecounter) + ' sql line statements. Elapsed time: ' + str(datetime.datetime.now().replace(microsecond=0) - start_time))
The title appears to be percent-encoded.
try:
    # Python 3
    from urllib.parse import unquote
except ImportError:
    # Python 2
    from urllib import unquote
percent_encoded = '''
%D9%85%D8%A7%D9%81%D8%B8%D8%A9_%D8%A7%D9%84%D8%A8%D8%AF%D8%A7%D8%A6%D8%B9
%D8%A3%D9%88%D8%B1%D9%8A%D9%88%D9%8A%D9%84%D8%A7
Battle_of_Nicopolis
Qingdao
'''
print(unquote(percent_encoded))
yields
مافظة_البدائع
أوريويلا
Battle_of_Nicopolis
Qingdao
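Applied to the parsing loop in the question, a minimal sketch could look like the following (assuming `title` still holds the percent-encoded string taken from `splitcontent[0]`, and that the escapes in the dump are UTF-8, as the samples suggest):

import sys

if sys.version_info < (3, 0, 0):
    from urllib import unquote            # Python 2
else:
    from urllib.parse import unquote      # Python 3

# ... later, right after `title = splitcontent[0]`:
if sys.version_info < (3, 0, 0):
    # Python 2's unquote works on byte strings, so round-trip through UTF-8
    title = unquote(title.encode('utf-8')).decode('utf-8')
else:
    # Python 3's unquote decodes the percent-escapes as UTF-8 by default
    title = unquote(title)

The string functions and `re` are not mangling anything; the %XX sequences are already in the dump itself, so the titles need to be unquoted explicitly before being written to the SQL/SQLite output.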