Scraping with Beautiful Soup and Python to CSV
Trying to extract floor size (in square feet) and lot size (in hectares) from listings on a real-estate website, using Beautiful Soup and Selenium.
The floor size prints fine to the console,
but when the results are written to the CSV file, the 'sq ft' information is missing from the floor-size column.
It seems that 'sq ft' is returned only when BS4 finds it right after the stipulated ID element, while for every other URL the 'sq ft' text goes missing when written to the CSV. As you can see on the (image), two of the listings have this despite both links also having Hectares:
http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG
http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ
Can anyone explain why the sq ft prints to the console but is not written to the CSV? Any help would be appreciated.
Relevant HTML, where CP2_CPContent_conDetails1_divDetails is the relevant locator for both floor size and lot size:
<div id="CP2_CPContent_conDetails1_divDetails">
0.3 Acres <br>(0.12 Hectares)
<div class="clear"></div>
<div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
Potential building size of 6,458 sq ft (600 sq m)<br>
</div>
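For reference, a minimal standalone sketch (the outer </div> is added here so the fragment parses on its own; otherwise this is the HTML above) of what find_next() returns against this fragment:

import re
from bs4 import BeautifulSoup

html = '''
<div id="CP2_CPContent_conDetails1_divDetails">
0.3 Acres <br>(0.12 Hectares)
<div class="clear"></div>
<div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
Potential building size of 6,458 sq ft (600 sq m)<br>
</div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
details = soup.find('div', id='CP2_CPContent_conDetails1_divDetails')
# the first text node after the div's start that matches each pattern:
print(details.find_next(text=re.compile('Hectares')))  # (0.12 Hectares)
print(details.find_next(text=re.compile('sq ft')))     # Potential building size of 6,458 sq ft (600 sq m)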
The code is as follows:
from selenium import webdriver
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import re

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)

#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0, 1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            lot_size = lot_size.replace("(", "").replace(")", "")
            print(lot_size)
            return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            print(floor_size)
            return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
    return house_data

house_links_3pages = get_house_links(shw_search_url, driver, pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver, house_links_flat)

#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                               str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns=columns).to_csv(
    file_name, index=False, encoding="UTF-8"
)
I can get Hectares with your code.
The problem I had was with sq ft - it didn't even show up. That is all because you used find() instead of find_all() in for element in soup.find(...). find() doesn't return a list of elements but a single element, so the for loop doesn't take that element from a list - it iterates over the element's children instead, and it ends up searching for sq ft in the wrong place.
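A minimal sketch of that difference, using a shortened version of the fragment from the question:

import re
from bs4 import BeautifulSoup

html = '<div id="d">0.3 Acres <br>(0.12 Hectares)<div class="clear"></div></div>'
soup = BeautifulSoup(html, 'html.parser')

# find() returns one Tag; iterating over it walks the div's CHILDREN
for child in soup.find('div', id='d'):
    print(repr(child))     # '0.3 Acres ', <br/>, '(0.12 Hectares)', <div class="clear"></div>

# find_all() returns a list; iterating over it yields the matching div ITSELF
for div in soup.find_all('div', id='d'):
    print(div.find_next(text=re.compile('Hectares')))   # (0.12 Hectares)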
from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)

#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0, 1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            if lot_size:
                lot_size = lot_size.replace("(", "").replace(")", "")
                lot_size = lot_size.strip()
            print('lot_size:', lot_size)
            return lot_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def get_floor_size(soup):
    try:
        # find_all() yields the matching div itself, not its children
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size:
                floor_size = floor_size.strip()
            print('floor_size:', floor_size)
            return floor_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
        print('-------------------')
    return house_data

house_links_3pages = get_house_links(shw_search_url, driver, pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver, house_links_flat)

#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                               str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns=columns).to_csv(
    file_name, index=False, encoding="UTF-8"
)
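One small portability note: "%H:%M%S" embeds a colon, which is not allowed in Windows file names, and has no separator between minutes and seconds. If that matters, a hypothetical tweak:

# colon-free, fully separated timestamp, valid on Windows as well as Linux/macOS
file_name = "SHW %s_%s.csv" % (time.strftime("%Y-%m-%d"),
                               time.strftime("%H-%M-%S"))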
CSV:
Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"* Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,
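As the output shows, find_next() sometimes returns a whole descriptive paragraph rather than a bare figure. If only the numeric part is wanted, a post-processing pass along these lines could extract it (a sketch against the CSV above; the regex keeps ranges like "4672 to 20302 sq ft" and pulls "24,649 sq ft" out of the long description):

import pandas as pd

df = pd.read_csv(file_name)
# first "N sq ft" figure, optionally an "N to M sq ft" range
pattern = r'([\d,]+(?:\s+to\s+[\d,]+)?\s+sq ft)'
df['Floor_Size'] = df['Floor_Size'].str.extract(pattern, expand=False)
df.to_csv(file_name, index=False, encoding='UTF-8')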