无法使用 Scrapy 获取 table 数据
Cannot get table data with Scrapy
这是网站,我想抓取与所选 Category 和 SubCategory 对应的 table 数据。
https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US
我使用 Scrapy 和 Selenium 进行抓取。这是我的代码:
导入一些库:
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.support.ui import Select
主要代码:
class AdbcSpiderSelenium(scrapy.Spider):
    """Scrape the ADBC business-activity table for one category pair.

    The category and sub-category drop-downs are driven with headless
    Chrome through Selenium. The rendered page source is captured once in
    ``__init__`` and turned into items in ``parse``.
    """

    name = 'adbc_selenium'
    # allowed_domains expects bare host names, not URLs.
    allowed_domains = ['www.adbc.gov.ae']
    start_urls = [
        'https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US'
    ]

    def __init__(self, *args, **kwargs):
        """Render the page in headless Chrome and keep its HTML in ``self.html``.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that Scrapy can still construct the spider with its own arguments.
        """
        super().__init__(*args, **kwargs)

        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Raw string keeps the Windows backslashes literal.
        chrome_path = r'D:\work\crawl data\selenium_project\chromedriver.exe'
        # NOTE: executable_path and find_element_by_id are Selenium 3 APIs.
        driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
        try:
            driver.get("https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US")
            # NOTE(review): select_by_value matches the <option value="...">
            # attribute, not the visible label - confirm against the page.
            # If the sub-category list is filled in asynchronously after the
            # category is chosen, an explicit wait may be needed here.
            category = Select(driver.find_element_by_id("ddlNatureId"))
            category.select_by_value('ADVOCATE OFFICES')
            sub_category = Select(driver.find_element_by_id("ddlSubCategId"))
            sub_category.select_by_value('Advertising Agent')
            self.html = driver.page_source
        finally:
            # quit() also shuts down the chromedriver process; close() only
            # closes the current window. The finally block prevents a leaked
            # browser when any step above raises.
            driver.quit()

    def parse(self, response):
        """Yield one ``{'code': ..., 'name': ...}`` item per table data row.

        ``response`` is not used: rows are read from the Selenium-rendered
        HTML stored in ``self.html``.
        """
        resp = Selector(text=self.html)
        # "gvActivities" is the id of the <table> element itself. The earlier
        # XPath '//*[@id="gvActivities"]/table//tr' looked for a table nested
        # inside it and therefore matched nothing.
        for row in resp.xpath('//table[@id="gvActivities"]//tr'):
            # Rows without <td> cells (e.g. a header row) carry no data.
            if not row.xpath('td'):
                continue
            yield {
                'code': row.xpath('td[1]//text()').get(),
                'name': row.xpath('td[2]//text()').get(),
            }
我的代码试图抓取 Business Category = ADVOCATE OFFICES、Business SubCategory = Advertising Agent 时页面上显示的 table 数据(即上图所示的表格)。
代码可以运行,但不会返回 table 中的任何数据。我的代码有什么问题?
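问题出在 xpath 上:id 为 gvActivities 的元素本身就是 table,因此 //*[@id="gvActivities"]/table//tr 是在这个 table 内部再找一个子 table,匹配不到任何节点。这是要使用的正确 xpath:
//table[@id="gvActivities"]//tr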
这是网站,我想抓取与所选 Category 和 SubCategory 对应的 table 数据。
https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US
我使用 Scrapy 和 Selenium 进行抓取。这是我的代码:
导入一些库:
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.support.ui import Select
主要代码:
class AdbcSpiderSelenium(scrapy.Spider):
    """Scrape the ADBC business-activity table for one category pair.

    The category and sub-category drop-downs are driven with headless
    Chrome through Selenium. The rendered page source is captured once in
    ``__init__`` and turned into items in ``parse``.
    """

    name = 'adbc_selenium'
    # allowed_domains expects bare host names, not URLs.
    allowed_domains = ['www.adbc.gov.ae']
    start_urls = [
        'https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US'
    ]

    def __init__(self, *args, **kwargs):
        """Render the page in headless Chrome and keep its HTML in ``self.html``.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that Scrapy can still construct the spider with its own arguments.
        """
        super().__init__(*args, **kwargs)

        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Raw string keeps the Windows backslashes literal.
        chrome_path = r'D:\work\crawl data\selenium_project\chromedriver.exe'
        # NOTE: executable_path and find_element_by_id are Selenium 3 APIs.
        driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
        try:
            driver.get("https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US")
            # NOTE(review): select_by_value matches the <option value="...">
            # attribute, not the visible label - confirm against the page.
            # If the sub-category list is filled in asynchronously after the
            # category is chosen, an explicit wait may be needed here.
            category = Select(driver.find_element_by_id("ddlNatureId"))
            category.select_by_value('ADVOCATE OFFICES')
            sub_category = Select(driver.find_element_by_id("ddlSubCategId"))
            sub_category.select_by_value('Advertising Agent')
            self.html = driver.page_source
        finally:
            # quit() also shuts down the chromedriver process; close() only
            # closes the current window. The finally block prevents a leaked
            # browser when any step above raises.
            driver.quit()

    def parse(self, response):
        """Yield one ``{'code': ..., 'name': ...}`` item per table data row.

        ``response`` is not used: rows are read from the Selenium-rendered
        HTML stored in ``self.html``.
        """
        resp = Selector(text=self.html)
        # "gvActivities" is the id of the <table> element itself. The earlier
        # XPath '//*[@id="gvActivities"]/table//tr' looked for a table nested
        # inside it and therefore matched nothing.
        for row in resp.xpath('//table[@id="gvActivities"]//tr'):
            # Rows without <td> cells (e.g. a header row) carry no data.
            if not row.xpath('td'):
                continue
            yield {
                'code': row.xpath('td[1]//text()').get(),
                'name': row.xpath('td[2]//text()').get(),
            }
我的代码试图抓取 Business Category = ADVOCATE OFFICES、Business SubCategory = Advertising Agent 时页面上显示的 table 数据(即上图所示的表格)。
代码可以运行,但不会返回 table 中的任何数据。我的代码有什么问题?
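问题出在 xpath 上:id 为 gvActivities 的元素本身就是 table,因此 //*[@id="gvActivities"]/table//tr 是在这个 table 内部再找一个子 table,匹配不到任何节点。这是要使用的正确 xpath:
//table[@id="gvActivities"]//tr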