Selenium Python 网页抓取 UTF-8
Selenium Python web scraping UTF-8
也许这个问题以前有人问过,但由于找不到合适的答案,所以我敢问一个类似的问题。我的问题是,我一直在尝试抓取一个名为 'Sahibinden' 的土耳其汽车销售网站。我使用 jupyter notebook 和 sublime editors.Once 我尝试获取写入 csv 文件的数据,土耳其字母更改为不同的字符。我试过。 'UTF-8 Encoding', '# -- coding: utf-8 --', ISO 8859-9 等,但我无法解决问题。另一个问题是 Sublime 编辑器不会创建 csv 文件,尽管我在 jupyter notebook 上没有遇到任何问题。您将在图像 link 中找到 csv 文件输出。如果有人能回复我,我将不胜感激。
注意:一旦我 运行 在编辑器上打印命令,程序就可以正常工作了。
非常感谢。
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import unicodedata
with open ('result1.csv','w') as f:
f.write('brand, model, year, oil_type, gear, odometer, body, hp,
eng_dim, color, warranty, condition, price, safe,
in_fea, outs_fea, mul_fea,pai_fea, rep_fea, acklm \n')
chrome_path = r"C:\Users\Mike\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
def final_page(fn_20):
for lur in fn_20:
driver.get(lur)
brand = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[3]/span''')
brand = brand.text
brand = brand.encode("utf-8")
print (brand)
model = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[5]/span''')
model = model.text
model = model.encode("utf-8")
print (model)
year = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[6]/span''')
year = year.text
year = year.encode("utf-8")
print (year)
oil_type = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[7]/span''')
oil_type = oil_type.text
oil_type = oil_type.encode("utf-8")
print (oil_type)
gear = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[8]/span''')
gear = gear.text
gear = gear.encode("utf-8")
print (gear)
odometer = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[9]/span''')
odometer = odometer.text
odometer = odometer.encode("utf-8")
print (odometer)
body = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[10]/span''')
body = body.text
body = body.encode("utf-8")
print (body)
hp = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[11]/span''')
hp = hp.text
hp = hp.encode("utf-8")
print (hp)
eng_dim = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[12]/span''')
eng_dim = eng_dim.text
eng_dim = eng_dim.encode("utf-8")
print (eng_dim)
color = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[14]/span''')
color = color.text
color = color.encode("utf-8")
print (color)
warranty = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[15]/span''')
warranty = warranty.text
warranty = warranty.encode("utf-8")
print (warranty)
condition = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[19]/span''')
condition = condition.text
condition = condition.encode("utf-8")
print (condition)
price = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/h3''')
price = price.text
price = price.encode("utf-8")
print (price)
safe = ''
safety1 = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[1]/li[@class='selected']''')
for ur in safety1:
ur1 = ur.text
ur1 = ur1.encode("utf-8")
safe +=ur1 + ', '
print (safe)
in_fea = ''
in_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[2]/li[@class='selected']''')
for ins in in_features:
ins1 = ins.text
ins1 = ins1.encode("utf-8")
in_fea += ins1 + ', '
print (in_fea)
outs_fea = ''
out_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[3]/li[@class='selected']''')
for outs in out_features:
out1 = outs.text
out1 = out1.encode("utf-8")
outs_fea += out1 + ', '
print (outs_fea)
mul_fea = ''
mult_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[4]/li[@class='selected']''')
for mults in mult_features:
mul = mults.text
mul = mul.encode("utf-8")
mul_fea += mul + ', '
print (mul_fea)
pai_fea = ''
paint = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area ']/ul[1]/li[@class='selected']''')
for pai in paint:
pain = pai.text
pain = pain.encode("utf-8")
pai_fea += pain + ', '
print (pai_fea)
rep_fea = ''
replcd = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area']/ul[2]/li[@class='selected']''')
for rep in replcd:
repa = rep.text
repa = repa.encode("utf-8")
rep_fea += rep + ', '
print (rep_fea)
acklm = driver.find_element_by_xpath('''//div[@id='classified-detail']/div[@class='uiBox'][1]/div[@id='classifiedDescription']''')
acklm = acklm.text
acklm = acklm.encode("utf-8")
print (acklm)
try:
with open ('result1.csv', 'a') as f:
f.write (brand + ',' [enter image description here][1]+ model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n')
except Exception as e:
print (e)
driver.close
import codecs
file = codecs.open("utf_test", "w", "utf-8")
file.write(u'\ufeff')
file.write("test with utf-8")
file.write("字符")
file.close()
或者这也适用于我
with codecs.open("utf_test", "w", "utf-8-sig") as temp:
temp.write("this is a utf-test\n")
temp.write(u"test")
也许这个问题以前有人问过,但由于找不到合适的答案,所以我敢问一个类似的问题。我的问题是,我一直在尝试抓取一个名为 'Sahibinden' 的土耳其汽车销售网站。我使用 jupyter notebook 和 sublime editors.Once 我尝试获取写入 csv 文件的数据,土耳其字母更改为不同的字符。我试过。 'UTF-8 Encoding', '# -- coding: utf-8 --', ISO 8859-9 等,但我无法解决问题。另一个问题是 Sublime 编辑器不会创建 csv 文件,尽管我在 jupyter notebook 上没有遇到任何问题。您将在图像 link 中找到 csv 文件输出。如果有人能回复我,我将不胜感激。
注意:一旦我 运行 在编辑器上打印命令,程序就可以正常工作了。
非常感谢。
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import unicodedata
with open ('result1.csv','w') as f:
f.write('brand, model, year, oil_type, gear, odometer, body, hp,
eng_dim, color, warranty, condition, price, safe,
in_fea, outs_fea, mul_fea,pai_fea, rep_fea, acklm \n')
chrome_path = r"C:\Users\Mike\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
def final_page(fn_20):
for lur in fn_20:
driver.get(lur)
brand = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[3]/span''')
brand = brand.text
brand = brand.encode("utf-8")
print (brand)
model = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[5]/span''')
model = model.text
model = model.encode("utf-8")
print (model)
year = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[6]/span''')
year = year.text
year = year.encode("utf-8")
print (year)
oil_type = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[7]/span''')
oil_type = oil_type.text
oil_type = oil_type.encode("utf-8")
print (oil_type)
gear = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[8]/span''')
gear = gear.text
gear = gear.encode("utf-8")
print (gear)
odometer = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[9]/span''')
odometer = odometer.text
odometer = odometer.encode("utf-8")
print (odometer)
body = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[10]/span''')
body = body.text
body = body.encode("utf-8")
print (body)
hp = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[11]/span''')
hp = hp.text
hp = hp.encode("utf-8")
print (hp)
eng_dim = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[12]/span''')
eng_dim = eng_dim.text
eng_dim = eng_dim.encode("utf-8")
print (eng_dim)
color = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[14]/span''')
color = color.text
color = color.encode("utf-8")
print (color)
warranty = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[15]/span''')
warranty = warranty.text
warranty = warranty.encode("utf-8")
print (warranty)
condition = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[19]/span''')
condition = condition.text
condition = condition.encode("utf-8")
print (condition)
price = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/h3''')
price = price.text
price = price.encode("utf-8")
print (price)
safe = ''
safety1 = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[1]/li[@class='selected']''')
for ur in safety1:
ur1 = ur.text
ur1 = ur1.encode("utf-8")
safe +=ur1 + ', '
print (safe)
in_fea = ''
in_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[2]/li[@class='selected']''')
for ins in in_features:
ins1 = ins.text
ins1 = ins1.encode("utf-8")
in_fea += ins1 + ', '
print (in_fea)
outs_fea = ''
out_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[3]/li[@class='selected']''')
for outs in out_features:
out1 = outs.text
out1 = out1.encode("utf-8")
outs_fea += out1 + ', '
print (outs_fea)
mul_fea = ''
mult_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[4]/li[@class='selected']''')
for mults in mult_features:
mul = mults.text
mul = mul.encode("utf-8")
mul_fea += mul + ', '
print (mul_fea)
pai_fea = ''
paint = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area ']/ul[1]/li[@class='selected']''')
for pai in paint:
pain = pai.text
pain = pain.encode("utf-8")
pai_fea += pain + ', '
print (pai_fea)
rep_fea = ''
replcd = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area']/ul[2]/li[@class='selected']''')
for rep in replcd:
repa = rep.text
repa = repa.encode("utf-8")
rep_fea += rep + ', '
print (rep_fea)
acklm = driver.find_element_by_xpath('''//div[@id='classified-detail']/div[@class='uiBox'][1]/div[@id='classifiedDescription']''')
acklm = acklm.text
acklm = acklm.encode("utf-8")
print (acklm)
try:
with open ('result1.csv', 'a') as f:
f.write (brand + ',' [enter image description here][1]+ model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n')
except Exception as e:
print (e)
driver.close
import codecs
file = codecs.open("utf_test", "w", "utf-8")
file.write(u'\ufeff')
file.write("test with utf-8")
file.write("字符")
file.close()
或者这也适用于我
with codecs.open("utf_test", "w", "utf-8-sig") as temp:
temp.write("this is a utf-test\n")
temp.write(u"test")