如何在 windows 10 上使用 Selenium 和 Python 3.6 抓取 Table
How to scrape a Table with Selenium and Python 3.6 on windows 10
我正在尝试从网站上抓取数据并将其写入 CSV 文件。
写入文件后切换到下一页,并重复该过程直到最后一页。
但我只能获取到第一行的数据。我哪里做错了?
请指正。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Question script: read 100 table rows from the district listing and append
# each one to a semicolon-delimited CSV file.
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

# Switch the table's page-length dropdown to 100 entries.
length_dropdown = driver.find_element_by_xpath(
    ".//div[@id='DataTables_Table_0_length']/label/div/select")
Select(length_dropdown).select_by_visible_text("100")

CSV_PATH = r"D:\python36_files\censusDistrictData.csv"
COLUMNS = ["DistrictName", "State", "Population", "Growth", "SexRatio", "DistrictLink"]

# NOTE: '+str(i)+' sits inside the double-quoted literal, so the XPath that is
# sent contains the predicate ['+str(i)+'] verbatim instead of the row number.
# A non-empty string predicate holds for every <tr>, and find_element_by_xpath
# returns the first match, so each pass reads the same (first) row. This is the
# symptom the question describes and is intentionally kept as posted.
ROW_XPATH = "//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']"

for i in range(1, 101):
    # Serial-number cell is located but its value is never used.
    sNo = driver.find_element_by_xpath(ROW_XPATH + "/td[1]")
    anchor = driver.find_element_by_xpath(ROW_XPATH + "/td[2]/a")
    # Columns 2..6: district name, state, population, growth, sex ratio.
    values = [
        driver.find_element_by_xpath(ROW_XPATH + "/td[%d]" % col).text
        for col in range(2, 7)
    ]
    values.append(anchor.get_attribute("href"))
    print(*values)
    # The file is reopened in append mode for every row.
    with open(CSV_PATH, 'a', newline="\n", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=COLUMNS, restval='none', delimiter=';')
        writer.writerow(dict(zip(COLUMNS, values)))
        print("OK")

driver.quit()
还有一个问题:
我尝试通过 CSS 选择器抓取,但无法在每一列之后放置分隔符。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Question script: dump the text of the district table into a one-column CSV.
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

# Switch the table's page-length dropdown to 100 entries before reading it.
length_dropdown = driver.find_element_by_xpath(
    ".//div[@id='DataTables_Table_0_length']/label/div/select")
Select(length_dropdown).select_by_visible_text("100")

OUTPUT_PATH = r"D:\python36_files\censusDistrictData1.csv"

# The selector targets the table's id, so each matched element is the table
# itself and .text is the text of the whole table, not of a single cell.
for table in driver.find_elements_by_css_selector("#DataTables_Table_0"):
    print(table.text)
    with open(OUTPUT_PATH, 'a', newline="\n", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Row"], restval='none', delimiter=';')
        # Author's open question: how to get ';' between the values. Everything
        # is written as the single "Row" field, so the delimiter never appears
        # between them.
        writer.writerow({"Row": table.text})
        print("OK")

driver.quit()
您似乎只是遗漏了 for 循环下的缩进。另外,行号 str(i) 必须在引号之外拼接进 XPath,否则它只是字符串的一部分,每次匹配到的都是第一行:
# Walk rows 1..100 of the table. The row number is formatted into the XPath,
# so every iteration addresses a different <tr>. Expects `driver` to be an
# already-open WebDriver session showing the table.
for i in range(1, 101):
    trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[{}]".format(i)
    # Serial-number cell is located but its value is not used below.
    sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
    districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
    # Cells 2..6 in order: district name, state, population, growth, sex ratio.
    districtName, state, population, growth, sexRatio = [
        driver.find_element_by_xpath("{}/td[{}]".format(trXpath, col)).text
        for col in range(2, 7)
    ]
    print(districtName, state, population, growth, sexRatio, districtLink)
我稍微简化了代码,使其更易于阅读和维护。
这个问题我自己已经解决了,但我的写法可能不符合 Python 的惯用方式,请给予详细指导。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Scrape 100 rows of the district table and append them to a
# semicolon-delimited CSV file.
driver = webdriver.PhantomJS()
try:
    driver.get('http://www.census2011.co.in/district.php')
    driver.maximize_window()

    # Switch the table's page-length dropdown to 100 entries.
    select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select"))
    select.select_by_visible_text("100")

    fieldnames = ["DistrictName", "State", "Population", "Growth", "SexRatio", "DistrictLink"]

    # Open the output once for the whole run instead of once per row.
    # newline="" is what the csv module expects; the writer emits its own
    # line terminators.
    with open(r"D:\python36_files\censusDistrictData.csv", 'a', newline="", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='none', delimiter=';')

        # XPath positions are 1-based, so iterate 1..100 directly.
        for i in range(1, 101):
            trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[" + str(i) + "]"
            districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
            districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
            state = driver.find_element_by_xpath(trXpath + "/td[3]").text
            population = driver.find_element_by_xpath(trXpath + "/td[4]").text
            growth = driver.find_element_by_xpath(trXpath + "/td[5]").text
            sexRatio = driver.find_element_by_xpath(trXpath + "/td[6]").text
            print(districtName, ';', state, ';', population, ';', growth, ';', sexRatio)
            writer.writerow({
                'DistrictName': districtName,
                'State': state,
                'Population': population,
                'Growth': growth,
                'SexRatio': sexRatio,
                'DistrictLink': districtLink,
            })
            print("OK")
finally:
    # Always shut the browser process down, even if a lookup above raised.
    driver.quit()
我正在尝试从网站上抓取数据并将其写入 CSV 文件。写入文件后切换到下一页,并重复该过程直到最后一页。但我只能获取到第一行的数据。我哪里做错了?请指正。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Question script: read 100 table rows from the district listing and append
# each one to a semicolon-delimited CSV file.
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

# Switch the table's page-length dropdown to 100 entries.
length_dropdown = driver.find_element_by_xpath(
    ".//div[@id='DataTables_Table_0_length']/label/div/select")
Select(length_dropdown).select_by_visible_text("100")

CSV_PATH = r"D:\python36_files\censusDistrictData.csv"
COLUMNS = ["DistrictName", "State", "Population", "Growth", "SexRatio", "DistrictLink"]

# NOTE: '+str(i)+' sits inside the double-quoted literal, so the XPath that is
# sent contains the predicate ['+str(i)+'] verbatim instead of the row number.
# A non-empty string predicate holds for every <tr>, and find_element_by_xpath
# returns the first match, so each pass reads the same (first) row. This is the
# symptom the question describes and is intentionally kept as posted.
ROW_XPATH = "//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']"

for i in range(1, 101):
    # Serial-number cell is located but its value is never used.
    sNo = driver.find_element_by_xpath(ROW_XPATH + "/td[1]")
    anchor = driver.find_element_by_xpath(ROW_XPATH + "/td[2]/a")
    # Columns 2..6: district name, state, population, growth, sex ratio.
    values = [
        driver.find_element_by_xpath(ROW_XPATH + "/td[%d]" % col).text
        for col in range(2, 7)
    ]
    values.append(anchor.get_attribute("href"))
    print(*values)
    # The file is reopened in append mode for every row.
    with open(CSV_PATH, 'a', newline="\n", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=COLUMNS, restval='none', delimiter=';')
        writer.writerow(dict(zip(COLUMNS, values)))
        print("OK")

driver.quit()
还有一个问题:我尝试通过 CSS 选择器抓取,但无法在每一列之后放置分隔符。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Question script: dump the text of the district table into a one-column CSV.
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

# Switch the table's page-length dropdown to 100 entries before reading it.
length_dropdown = driver.find_element_by_xpath(
    ".//div[@id='DataTables_Table_0_length']/label/div/select")
Select(length_dropdown).select_by_visible_text("100")

OUTPUT_PATH = r"D:\python36_files\censusDistrictData1.csv"

# The selector targets the table's id, so each matched element is the table
# itself and .text is the text of the whole table, not of a single cell.
for table in driver.find_elements_by_css_selector("#DataTables_Table_0"):
    print(table.text)
    with open(OUTPUT_PATH, 'a', newline="\n", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Row"], restval='none', delimiter=';')
        # Author's open question: how to get ';' between the values. Everything
        # is written as the single "Row" field, so the delimiter never appears
        # between them.
        writer.writerow({"Row": table.text})
        print("OK")

driver.quit()
您似乎只是遗漏了 for 循环下的缩进。另外,行号 str(i) 必须在引号之外拼接进 XPath,否则它只是字符串的一部分,每次匹配到的都是第一行:
# Walk rows 1..100 of the table. The row number is formatted into the XPath,
# so every iteration addresses a different <tr>. Expects `driver` to be an
# already-open WebDriver session showing the table.
for i in range(1, 101):
    trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[{}]".format(i)
    # Serial-number cell is located but its value is not used below.
    sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
    districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
    # Cells 2..6 in order: district name, state, population, growth, sex ratio.
    districtName, state, population, growth, sexRatio = [
        driver.find_element_by_xpath("{}/td[{}]".format(trXpath, col)).text
        for col in range(2, 7)
    ]
    print(districtName, state, population, growth, sexRatio, districtLink)
我稍微简化了代码,使其更易于阅读和维护。
这个问题我自己已经解决了,但我的写法可能不符合 Python 的惯用方式,请给予详细指导。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Scrape 100 rows of the district table and append them to a
# semicolon-delimited CSV file.
driver = webdriver.PhantomJS()
try:
    driver.get('http://www.census2011.co.in/district.php')
    driver.maximize_window()

    # Switch the table's page-length dropdown to 100 entries.
    select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select"))
    select.select_by_visible_text("100")

    fieldnames = ["DistrictName", "State", "Population", "Growth", "SexRatio", "DistrictLink"]

    # Open the output once for the whole run instead of once per row.
    # newline="" is what the csv module expects; the writer emits its own
    # line terminators.
    with open(r"D:\python36_files\censusDistrictData.csv", 'a', newline="", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='none', delimiter=';')

        # XPath positions are 1-based, so iterate 1..100 directly.
        for i in range(1, 101):
            trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[" + str(i) + "]"
            districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
            districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
            state = driver.find_element_by_xpath(trXpath + "/td[3]").text
            population = driver.find_element_by_xpath(trXpath + "/td[4]").text
            growth = driver.find_element_by_xpath(trXpath + "/td[5]").text
            sexRatio = driver.find_element_by_xpath(trXpath + "/td[6]").text
            print(districtName, ';', state, ';', population, ';', growth, ';', sexRatio)
            writer.writerow({
                'DistrictName': districtName,
                'State': state,
                'Population': population,
                'Growth': growth,
                'SexRatio': sexRatio,
                'DistrictLink': districtLink,
            })
            print("OK")
finally:
    # Always shut the browser process down, even if a lookup above raised.
    driver.quit()