想要使用 selenium-webdriver 从站点中提取数据
Want to extract data from a site using selenium-webdriver
我想提取网站上每家公司的名称、网站、电话和电子邮件,但代码一直反复打印页面上第一家公司的名称;而且当我尝试查找网站、电话和电子邮件时,代码就会崩溃。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
# Question code, reproduced as posted.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the two loops below cannot be recovered with certainty; the code
# is therefore left untouched and only annotated.
url='https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b'
driver = webdriver.Firefox()
driver.get(url)
# Every explicit wait below may block for up to 50 seconds.
wait=WebDriverWait(driver,50)
# Click the element with id "hs-eu-confirmation-button" once it is clickable
# (presumably a consent banner), then switch into the iframe under #pym-0.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#hs-eu-confirmation-button"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,'#pym-0 > iframe')))
# NOTE: `list` rebinds the built-in name for the rest of the script.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
# The paging link is located once, by position (li[13]), and reused for
# every page.
button = wait.until(EC.element_to_be_clickable((By.XPATH,'./html/body/div[5]/div/ul/li[13]/a')))
# Hard-coded number of iterations of the outer loop.
numOfPages=1161
# Seconds passed to time.sleep(); incremented whenever i % 40 == 0.
counter=4
for i in range(numOfPages):
driver.execute_script("arguments[0].scrollIntoView();", button)
for e in list:
# NOTE(review): every XPath below starts with "/", i.e. it is an absolute
# path evaluated from the document root rather than relative to `e`. Each
# iteration therefore resolves to the same node, which matches the reported
# symptom of the first company name being printed repeatedly. A lookup that
# finds no node raises instead of returning an empty value.
name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
print(name)
website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
print(website)
phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
print(phone)
email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
print(email)
time.sleep(counter)
button.click()
# Re-read the result containers after clicking the paging link.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
if i%40==0:
counter+=1
我的问题出在这些代码行上
# Excerpt of the lines the asker identifies as the problem.
# NOTE(review): indentation has been lost in this listing; the statements
# after `for e in list:` are presumably the loop body.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
for e in list:
# Each XPath starts with "/", so it is resolved from the document root and
# not from `e`; the same node is returned on every iteration.
name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
print(name)
website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
print(website)
phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
print(phone)
email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
print(email)
您的 XPath 以 “/” 开头,属于绝对路径,会从文档根节点而不是从当前元素 e 开始查找,所以每次循环得到的都是页面上的第一家公司。我建议您改用其他查找元素的函数,这样既能把查找范围限定在当前元素内,也能使代码更具可读性。我对您的代码做了一些更改,希望能帮助您获取数据:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Open the business-search page in a fresh Firefox session.
url = "https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b"
driver = webdriver.Firefox()
driver.get(url)

# Shared explicit wait: every lookup below may block for up to 50 seconds.
wait = WebDriverWait(driver, 50)

# Click the "#hs-eu-confirmation-button" element as soon as it is clickable
# (presumably the consent banner).
confirm_locator = (By.CSS_SELECTOR, "#hs-eu-confirmation-button")
confirm_button = wait.until(EC.element_to_be_clickable(confirm_locator))
confirm_button.click()

# The results live inside an iframe; switch the driver context into it.
frame_locator = (By.CSS_SELECTOR, "#pym-0 > iframe")
wait.until(EC.frame_to_be_available_and_switch_to_it(frame_locator))

# Result rows of the first page (the name `list` is kept because the code
# that follows reads it).
rows_locator = (By.CLASS_NAME, "list-title ")
list = wait.until(EC.visibility_of_all_elements_located(rows_locator))

# Link that advances the pager by one page.
next_locator = (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
button = wait.until(EC.element_to_be_clickable(next_locator))

# Seconds to sleep after each page change.
counter = 4
def getText(element):
    """Return the text of *element*, or ``"---"`` when there is none.

    Args:
        element: Any object exposing a ``text`` attribute (for example a
            Selenium ``WebElement``), or ``None`` when the element could not
            be located.

    Returns:
        ``element.text`` when it is non-empty, otherwise the placeholder
        string ``"---"``.
    """
    # A missing element is reported the same way as an empty one.
    if element is None:
        return "---"
    return element.text or "---"
def getContactInfo(parent):
    """Return the first ``contact-info`` element under *parent*, or ``None``.

    Args:
        parent: Element to search in; it must provide
            ``find_elements_by_class_name``.

    Returns:
        The first descendant with class ``contact-info``, or ``None`` when
        there is no such descendant.
    """
    # The plural lookup yields an empty list instead of raising, so a missing
    # block needs no try/except. The previous bare ``except: pass`` also hid
    # unrelated failures (including KeyboardInterrupt).
    matches = parent.find_elements_by_class_name("contact-info")
    return matches[0] if matches else None
def _field_text(parent, css_selector):
    """Return the text of the first match under *parent*, or "---" if absent."""
    # Plural lookup: an entry without this link yields [] instead of raising.
    matches = parent.find_elements_by_css_selector(css_selector)
    return getText(matches[0]) if matches else "---"


# NOTE(review): the find_element(s)_by_* helpers used here follow the rest of
# this script; newer Selenium releases only offer find_element(By..., ...).
try:
    while True:
        # Re-read the rows on every page so no element from a previous page
        # is reused after the list has been re-rendered.
        entries = wait.until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "list-title "))
        )
        for entry in entries:
            print(getText(entry.find_element_by_tag_name("h4")))
            account_info = entry.find_element_by_css_selector(
                "div.account-Info.large-12.columns.ng-scope"
            )
            contact_info = getContactInfo(account_info)
            if contact_info is not None:
                print(_field_text(contact_info, "a.website.ng-binding.ng-scope"))
                print(_field_text(contact_info, "a.telephone.ng-binding"))
                print(_field_text(contact_info, "a.emailid.ng-binding"))
            print("*******\n")

        # The pager's "next" <li> carries the class "disabled" on the last
        # page. Checking it AFTER printing makes sure the last page is
        # processed too, instead of being skipped when the loop ends.
        next_item = driver.find_element_by_css_selector(
            "li[ng-class='{disabled:pager.currentPage === pager.totalPages}']"
        )
        if "disabled" in (next_item.get_attribute("class") or "").split():
            break

        # Locate the link again on each page rather than reusing a reference
        # captured before the first page change.
        next_link = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
            )
        )
        driver.execute_script("arguments[0].scrollIntoView();", next_link)
        next_link.click()
        # Give the page time to swap in the next batch of rows.
        time.sleep(counter)
finally:
    # Close the browser even when a lookup fails part-way through.
    driver.quit()
我想提取网站上每家公司的名称、网站、电话和电子邮件,但代码一直反复打印页面上第一家公司的名称;而且当我尝试查找网站、电话和电子邮件时,代码就会崩溃。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
# Question code, reproduced as posted.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the two loops below cannot be recovered with certainty; the code
# is therefore left untouched and only annotated.
url='https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b'
driver = webdriver.Firefox()
driver.get(url)
# Every explicit wait below may block for up to 50 seconds.
wait=WebDriverWait(driver,50)
# Click the element with id "hs-eu-confirmation-button" once it is clickable
# (presumably a consent banner), then switch into the iframe under #pym-0.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#hs-eu-confirmation-button"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,'#pym-0 > iframe')))
# NOTE: `list` rebinds the built-in name for the rest of the script.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
# The paging link is located once, by position (li[13]), and reused for
# every page.
button = wait.until(EC.element_to_be_clickable((By.XPATH,'./html/body/div[5]/div/ul/li[13]/a')))
# Hard-coded number of iterations of the outer loop.
numOfPages=1161
# Seconds passed to time.sleep(); incremented whenever i % 40 == 0.
counter=4
for i in range(numOfPages):
driver.execute_script("arguments[0].scrollIntoView();", button)
for e in list:
# NOTE(review): every XPath below starts with "/", i.e. it is an absolute
# path evaluated from the document root rather than relative to `e`. Each
# iteration therefore resolves to the same node, which matches the reported
# symptom of the first company name being printed repeatedly. A lookup that
# finds no node raises instead of returning an empty value.
name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
print(name)
website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
print(website)
phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
print(phone)
email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
print(email)
time.sleep(counter)
button.click()
# Re-read the result containers after clicking the paging link.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
if i%40==0:
counter+=1
我的问题出在这些代码行上
# Excerpt of the lines the asker identifies as the problem.
# NOTE(review): indentation has been lost in this listing; the statements
# after `for e in list:` are presumably the loop body.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
for e in list:
# Each XPath starts with "/", so it is resolved from the document root and
# not from `e`; the same node is returned on every iteration.
name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
print(name)
website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
print(website)
phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
print(phone)
email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
print(email)
您的 XPath 以 “/” 开头,属于绝对路径,会从文档根节点而不是从当前元素 e 开始查找,所以每次循环得到的都是页面上的第一家公司。我建议您改用其他查找元素的函数,这样既能把查找范围限定在当前元素内,也能使代码更具可读性。我对您的代码做了一些更改,希望能帮助您获取数据:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Open the business-search page in a fresh Firefox session.
url = "https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b"
driver = webdriver.Firefox()
driver.get(url)

# Shared explicit wait: every lookup below may block for up to 50 seconds.
wait = WebDriverWait(driver, 50)

# Click the "#hs-eu-confirmation-button" element as soon as it is clickable
# (presumably the consent banner).
confirm_locator = (By.CSS_SELECTOR, "#hs-eu-confirmation-button")
confirm_button = wait.until(EC.element_to_be_clickable(confirm_locator))
confirm_button.click()

# The results live inside an iframe; switch the driver context into it.
frame_locator = (By.CSS_SELECTOR, "#pym-0 > iframe")
wait.until(EC.frame_to_be_available_and_switch_to_it(frame_locator))

# Result rows of the first page (the name `list` is kept because the code
# that follows reads it).
rows_locator = (By.CLASS_NAME, "list-title ")
list = wait.until(EC.visibility_of_all_elements_located(rows_locator))

# Link that advances the pager by one page.
next_locator = (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
button = wait.until(EC.element_to_be_clickable(next_locator))

# Seconds to sleep after each page change.
counter = 4
def getText(element):
    """Return the text of *element*, or ``"---"`` when there is none.

    Args:
        element: Any object exposing a ``text`` attribute (for example a
            Selenium ``WebElement``), or ``None`` when the element could not
            be located.

    Returns:
        ``element.text`` when it is non-empty, otherwise the placeholder
        string ``"---"``.
    """
    # A missing element is reported the same way as an empty one.
    if element is None:
        return "---"
    return element.text or "---"
def getContactInfo(parent):
    """Return the first ``contact-info`` element under *parent*, or ``None``.

    Args:
        parent: Element to search in; it must provide
            ``find_elements_by_class_name``.

    Returns:
        The first descendant with class ``contact-info``, or ``None`` when
        there is no such descendant.
    """
    # The plural lookup yields an empty list instead of raising, so a missing
    # block needs no try/except. The previous bare ``except: pass`` also hid
    # unrelated failures (including KeyboardInterrupt).
    matches = parent.find_elements_by_class_name("contact-info")
    return matches[0] if matches else None
def _field_text(parent, css_selector):
    """Return the text of the first match under *parent*, or "---" if absent."""
    # Plural lookup: an entry without this link yields [] instead of raising.
    matches = parent.find_elements_by_css_selector(css_selector)
    return getText(matches[0]) if matches else "---"


# NOTE(review): the find_element(s)_by_* helpers used here follow the rest of
# this script; newer Selenium releases only offer find_element(By..., ...).
try:
    while True:
        # Re-read the rows on every page so no element from a previous page
        # is reused after the list has been re-rendered.
        entries = wait.until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "list-title "))
        )
        for entry in entries:
            print(getText(entry.find_element_by_tag_name("h4")))
            account_info = entry.find_element_by_css_selector(
                "div.account-Info.large-12.columns.ng-scope"
            )
            contact_info = getContactInfo(account_info)
            if contact_info is not None:
                print(_field_text(contact_info, "a.website.ng-binding.ng-scope"))
                print(_field_text(contact_info, "a.telephone.ng-binding"))
                print(_field_text(contact_info, "a.emailid.ng-binding"))
            print("*******\n")

        # The pager's "next" <li> carries the class "disabled" on the last
        # page. Checking it AFTER printing makes sure the last page is
        # processed too, instead of being skipped when the loop ends.
        next_item = driver.find_element_by_css_selector(
            "li[ng-class='{disabled:pager.currentPage === pager.totalPages}']"
        )
        if "disabled" in (next_item.get_attribute("class") or "").split():
            break

        # Locate the link again on each page rather than reusing a reference
        # captured before the first page change.
        next_link = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
            )
        )
        driver.execute_script("arguments[0].scrollIntoView();", next_link)
        next_link.click()
        # Give the page time to swap in the next batch of rows.
        time.sleep(counter)
finally:
    # Close the browser even when a lookup fails part-way through.
    driver.quit()