我无法在 Aliexpress 中获得每个产品页面的评论
I can't get reviews of each product page in Aliexpress
我正在尝试从此网站抓取数据:https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth+earphones 。特别是,我想从每个产品页面获取所有评论。主要问题是:我无法定位到页面底部(图中圈出的)评论区域,因而抓不到每条评论及买家所在国家:
这是一张照片显示:
enter image description here
这是我的代码:
from selenium import webdriver
from lxml import html
from time import sleep
import csv

# NOTE(review): `executable_path` is deprecated in Selenium 4 — prefer
# webdriver.Edge(service=Service(path)); kept here for compatibility with
# the author's setup.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")

# BUG FIX: the pasted URL contained "<ype=" — an HTML-entity mangling of the
# original "&ltype=" query parameter.
url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
       '&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}')


def scroll_to_bottom(drv):
    """Scroll one viewport at a time so lazy-loaded items get rendered.

    Items are only added to the DOM while they are inside the viewport, so
    jumping straight to the end of the page would miss most of them.
    """
    current_offset = 0
    while True:
        drv.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = drv.execute_script("return window.pageYOffset;")
        if new_offset <= current_offset:  # page no longer moves: bottom reached
            break
        current_offset = new_offset


# BUG FIX: newline="" prevents the csv module from writing blank rows on
# Windows.
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders",
                 "Shipping Cost", "Product links", "Country", "Comments"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)
        scroll_to_bottom(driver)
        sleep(3)

        # Parse AFTER scrolling so the HTML contains all loaded products.
        tree = html.fromstring(driver.page_source)

        # First collect everything from the list page, THEN visit each product
        # page — navigating away while still extracting list data is fragile.
        products = []
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            title = title[0] if title else ''

            # Price is split over several <span>s, e.g. ['$', '35', '.', '00'].
            price = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
            currency = price[0] if price else ''  # guard empty match
            price = ''.join(price[1:])

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            review = review[0] if review else ''

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else ''

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            ship_cost = ship_cost[0] if ship_cost else ''

            # BUG FIX: use the product's OWN href (relative xpath), not every
            # anchor on the page; the original then iterated the URL string
            # character by character (`for link in links` on a str).
            link = product.xpath('./@href')
            link = link[0] if link else ''
            if link.startswith('//'):  # protocol-relative link
                link = 'https:' + link

            products.append([title, price, currency, review, nb_sold,
                             ship_cost, link])

        results = []
        for row in products:
            link = row[6]
            if not link:
                results.append(row + ['', ''])
                continue
            driver.get(link)
            sleep(2)
            scroll_to_bottom(driver)
            sleep(3)

            # NOTE(review): on AliExpress the feedback section may live inside
            # an iframe ("product-evaluation"); if nothing matches below you
            # likely need driver.switch_to.frame(...) first — TODO confirm.
            page_tree = html.fromstring(driver.page_source)
            feedback = page_tree.xpath(
                '//div[@id="transction-feedback"]'
                '//div[contains(@class, "feedback-item")]')
            if not feedback:
                results.append(row + ['', ''])
            for cmt in feedback:
                country = cmt.xpath('.//div[@class="user-country"]//b/text()')
                country = country[0] if country else ''
                # BUG FIX: do not rely on the auto-generated span id
                # ("0.0.0.i4.5dc4sSFDsSFD5B") — it changes per session;
                # select the comment text structurally instead.
                comment = cmt.xpath('.//dt[@class="buyer-feedback"]/span[1]/text()')
                comment = comment[0] if comment else ''
                results.append(row + [country, comment])

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
有两个问题:
第一个:
您必须在向下滚动之后再调用 html.fromstring(driver.page_source)。
第二个:
只有显示在窗口(viewport)中的商品才会被加入页面,因此不能直接跳到页面末尾,必须利用 window.innerHeight 分步滚动。
部分滚动(循环)
# Scroll one viewport at a time so that JavaScript can lazy-load items;
# stop once the page no longer moves (bottom reached).
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # JavaScript has time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset,current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
下面是完整的可运行代码(在 xpath 中还做了一些其他改动)。它在每一页上给我 60 个商品。
from selenium import webdriver
from lxml import html
from time import sleep
import csv

# NOTE(review): `executable_path` is deprecated in Selenium 4 — prefer
# webdriver.Edge(service=Service(path)); kept for compatibility with the
# author's setup.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

# BUG FIX: the pasted URL contained "<ype=" — an HTML-entity mangling of the
# original "&ltype=" query parameter.
url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
       '&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}')

# BUG FIX: newline="" prevents the csv module from writing blank rows on
# Windows.
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll one viewport at a time: items are lazy-loaded only while
        # inside the viewport, so jumping straight to the end of the page
        # would miss most of them.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:  # bottom reached
                break
            current_offset = new_offset
        sleep(3)

        # Parse AFTER scrolling so the HTML contains all loaded products.
        tree = html.fromstring(driver.page_source)

        results = []
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            title = title[0] if title else ''

            # Price is split over several <span>s, e.g. ['$', '35', '.', '00'].
            price = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
            # BUG FIX: guard against an empty match (IndexError on price[0]).
            currency = price[0] if price else ''
            price = ''.join(price[1:])
            # for a `35.00 zł`-style layout use instead:
            #currency = price[-1]
            #price = ''.join(price[:-1])

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            review = review[0] if review else ''

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else ''

            results.append([title, price, currency, review, nb_sold])

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
我正在尝试从此网站抓取数据:https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth+earphones 。特别是,我想从每个产品页面获取所有评论。主要问题是:我无法定位到页面底部(图中圈出的)评论区域,因而抓不到每条评论及买家所在国家: 这是一张截图: enter image description here
这是我的代码:
from selenium import webdriver
from lxml import html
from time import sleep
import csv

# NOTE(review): `executable_path` is deprecated in Selenium 4 — prefer
# webdriver.Edge(service=Service(path)); kept here for compatibility with
# the author's setup.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")

# BUG FIX: the pasted URL contained "<ype=" — an HTML-entity mangling of the
# original "&ltype=" query parameter.
url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
       '&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}')


def scroll_to_bottom(drv):
    """Scroll one viewport at a time so lazy-loaded items get rendered.

    Items are only added to the DOM while they are inside the viewport, so
    jumping straight to the end of the page would miss most of them.
    """
    current_offset = 0
    while True:
        drv.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = drv.execute_script("return window.pageYOffset;")
        if new_offset <= current_offset:  # page no longer moves: bottom reached
            break
        current_offset = new_offset


# BUG FIX: newline="" prevents the csv module from writing blank rows on
# Windows.
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders",
                 "Shipping Cost", "Product links", "Country", "Comments"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)
        scroll_to_bottom(driver)
        sleep(3)

        # Parse AFTER scrolling so the HTML contains all loaded products.
        tree = html.fromstring(driver.page_source)

        # First collect everything from the list page, THEN visit each product
        # page — navigating away while still extracting list data is fragile.
        products = []
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            title = title[0] if title else ''

            # Price is split over several <span>s, e.g. ['$', '35', '.', '00'].
            price = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
            currency = price[0] if price else ''  # guard empty match
            price = ''.join(price[1:])

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            review = review[0] if review else ''

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else ''

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            ship_cost = ship_cost[0] if ship_cost else ''

            # BUG FIX: use the product's OWN href (relative xpath), not every
            # anchor on the page; the original then iterated the URL string
            # character by character (`for link in links` on a str).
            link = product.xpath('./@href')
            link = link[0] if link else ''
            if link.startswith('//'):  # protocol-relative link
                link = 'https:' + link

            products.append([title, price, currency, review, nb_sold,
                             ship_cost, link])

        results = []
        for row in products:
            link = row[6]
            if not link:
                results.append(row + ['', ''])
                continue
            driver.get(link)
            sleep(2)
            scroll_to_bottom(driver)
            sleep(3)

            # NOTE(review): on AliExpress the feedback section may live inside
            # an iframe ("product-evaluation"); if nothing matches below you
            # likely need driver.switch_to.frame(...) first — TODO confirm.
            page_tree = html.fromstring(driver.page_source)
            feedback = page_tree.xpath(
                '//div[@id="transction-feedback"]'
                '//div[contains(@class, "feedback-item")]')
            if not feedback:
                results.append(row + ['', ''])
            for cmt in feedback:
                country = cmt.xpath('.//div[@class="user-country"]//b/text()')
                country = country[0] if country else ''
                # BUG FIX: do not rely on the auto-generated span id
                # ("0.0.0.i4.5dc4sSFDsSFD5B") — it changes per session;
                # select the comment text structurally instead.
                comment = cmt.xpath('.//dt[@class="buyer-feedback"]/span[1]/text()')
                comment = comment[0] if comment else ''
                results.append(row + [country, comment])

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
有两个问题:
第一个:
您必须在向下滚动之后再调用 html.fromstring(driver.page_source)。
第二个:
只有显示在窗口(viewport)中的商品才会被加入页面,因此不能直接跳到页面末尾,必须利用 window.innerHeight 分步滚动。
# Scroll one viewport at a time so that JavaScript can lazy-load items;
# stop once the page no longer moves (bottom reached).
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # JavaScript has time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset,current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
下面是完整的可运行代码(在 xpath 中还做了一些其他改动)。它在每一页上给我 60 个商品。
from selenium import webdriver
from lxml import html
from time import sleep
import csv

# NOTE(review): `executable_path` is deprecated in Selenium 4 — prefer
# webdriver.Edge(service=Service(path)); kept for compatibility with the
# author's setup.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

# BUG FIX: the pasted URL contained "<ype=" — an HTML-entity mangling of the
# original "&ltype=" query parameter.
url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
       '&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}')

# BUG FIX: newline="" prevents the csv module from writing blank rows on
# Windows.
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll one viewport at a time: items are lazy-loaded only while
        # inside the viewport, so jumping straight to the end of the page
        # would miss most of them.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:  # bottom reached
                break
            current_offset = new_offset
        sleep(3)

        # Parse AFTER scrolling so the HTML contains all loaded products.
        tree = html.fromstring(driver.page_source)

        results = []
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            title = title[0] if title else ''

            # Price is split over several <span>s, e.g. ['$', '35', '.', '00'].
            price = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
            # BUG FIX: guard against an empty match (IndexError on price[0]).
            currency = price[0] if price else ''
            price = ''.join(price[1:])
            # for a `35.00 zł`-style layout use instead:
            #currency = price[-1]
            #price = ''.join(price[:-1])

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            review = review[0] if review else ''

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else ''

            results.append([title, price, currency, review, nb_sold])

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()