How to scrape two URLs and put the elements of each URL in a single table?
I have two URLs on the same site that I want to scrape to get New York City room prices. I use BeautifulSoup to get the address, price, and availability of each room, and then build a dictionary so I can create a DataFrame.

I end up with two separate DataFrames, one per URL, but I want the information from both URLs in a single DataFrame.

Once I have the information I need, I append it to a list that I later use for the dictionary:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def getRoomInfo(startingPage):
    html = requests.get(startingPage)
    bs1 = BeautifulSoup(html.text, "html.parser")
    # Base URL (scheme + host), used to turn relative links into absolute ones
    url = "{}://{}".format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
    # Collect links to the individual room pages
    href_links = []
    for link in bs1.find_all("a", href=re.compile(r"/new-york-apartment/roommate-share/\d+")):
        href_links.append(link["href"])
    room_link = []
    for links in href_links:
        room_link.append(url + links)
    addressList = []
    priceList = []
    availabilityList = []
    # Visit each room page and pull out its address, price and availability
    for page in room_link:
        html_page = requests.get(page)
        bs_page = BeautifulSoup(html_page.text, "html.parser")
        address = bs_page.find("div", {"class": "ap-info-address"})
        addressList.append(address.get_text())
        price = bs_page.find("div", {"class": "apt-price price-cur-1"})
        priceList.append(price.get_text())
        availability = bs_page.find("td")
        availabilityList.append(availability.get_text())
    infoDataFrame = pd.DataFrame(
        {"Address": addressList,
         "Price": priceList,
         "Availability": availabilityList,
         })
    print(infoDataFrame)
links_rooms = ("https://www.nyhabitat.com/new-york-apartment/roommate-share",
               "https://www.nyhabitat.com/new-york-apartment/list.php?page=2&dep=SH&lev=3&price=400;2400&guest=1&sort=new&cll=1&searchMapLeft=40.60484725779594&searchMapTop=-73.81336257537379&searchMapRight=40.90185344223534&searchMapBottom=-74.14810226043238&searchMapZoom=11&div_code=ny&lang=en")
The built-in Python strip() function removes all leading and trailing whitespace from a string.
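For example:

raw = "  $1,395 / month \n"
print(repr(raw.strip()))   # '$1,395 / month' -- leading/trailing whitespace removed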
rooms = []
# Loop over both listing URLs so every room ends up in the same list
for page in links_rooms:
    html_page = requests.get(page)
    soup = BeautifulSoup(html_page.text, "html.parser")
    # Each listing on the page sits in a "grid-col" div
    for row in soup.select('div[class*="grid-col"]'):
        room = {}
        priceDiv = row.find("div", {"class": "slider-descr-wrap"})
        room['price'] = priceDiv.find("div", {"class": "apt-price price-cur-1"}).text.strip()
        addrDiv = row.find("div", {"class": "slider-descr-bottom"})
        room['address'] = addrDiv.find("span", {"class": "slider-descr-2-row"}).text.strip()
        room['availability'] = addrDiv.find("span", {"class": "search-aval"}).text.strip()
        rooms.append(room)

print(rooms)

df = pd.DataFrame(rooms, columns=['price', 'address', 'availability'])
print(df)
Output:
[{'price': ',395', 'address': 'Bushwick, Brooklyn', 'availability': 'Available Aug 01 2019'}, {'price': ',350', 'address': 'Fort Greene, Brooklyn', 'availability': 'Available Jun 15 2019'}, {'price': ',055', 'address': 'Kips Bay, Manhattan', 'availability': 'Available Jun 30 2019'}, {'price': ',350', 'address': 'Duplex, Brooklyn', 'availability': 'Available Jun 08 2019'}, {'price': '0', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Aug 10 2019'}, {'price': ',100', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Aug 10 2019'}, {'price': '5', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Aug 31 2019'}, {'price': '0', 'address': 'Duplex, Ridgewood, Queens', 'availability': 'Available Jun 08 2019'}, {'price': '3', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Jun 12 2020'}, {'price': ',150', 'address': 'Triplex, Ridgewood, Queens', 'availability': 'Available Jun 08 2019'}, {'price': ',317', 'address': 'Stuyvesant Town, Manhattan', 'availability': 'Available Dec 31 2019'}, {'price': '0', 'address': 'Jamaica, Queens', 'availability': 'Available Jun 08 2019'}, {'price': ',700', 'address': 'Chelsea, Manhattan', 'availability': 'Available Sep 01 2019'}, {'price': '0', 'address': 'Astoria, Queens', 'availability': 'Available Jul 22 2019'}, {'price': ',750', 'address': 'Chelsea, Manhattan', 'availability': 'Available Jun 08 2019'}, {'price': ',375', 'address': 'Harlem, Manhattan', 'availability': 'Available Oct 01 2019'}, {'price': '1', 'address': 'Forest Hills, Queens', 'availability': 'Available Aug 01 2019'}, {'price': '0', 'address': 'Brooklyn', 'availability': 'Available Jun 08 2019'}, {'price': '8', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Jun 08 2019'}, {'price': ',200', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Dec 01 2019'}]
price address availability
0 ,395 Bushwick, Brooklyn Available Aug 01 2019
1 ,350 Fort Greene, Brooklyn Available Jun 15 2019
2 ,055 Kips Bay, Manhattan Available Jun 30 2019
3 ,350 Duplex, Brooklyn Available Jun 08 2019
4 0 Flatbush, Brooklyn Available Aug 10 2019
5 ,100 Flatbush, Brooklyn Available Aug 10 2019
6 5 Washington Heights, Manhattan Available Aug 31 2019
7 0 Duplex, Ridgewood, Queens Available Jun 08 2019
8 3 Washington Heights, Manhattan Available Jun 12 2020
9 ,150 Triplex, Ridgewood, Queens Available Jun 08 2019
10 ,317 Stuyvesant Town, Manhattan Available Dec 31 2019
11 0 Jamaica, Queens Available Jun 08 2019
12 ,700 Chelsea, Manhattan Available Sep 01 2019
13 0 Astoria, Queens Available Jul 22 2019
14 ,750 Chelsea, Manhattan Available Jun 08 2019
15 ,375 Harlem, Manhattan Available Oct 01 2019
16 1 Forest Hills, Queens Available Aug 01 2019
17 0 Brooklyn Available Jun 08 2019
18 8 Washington Heights, Manhattan Available Jun 08 2019
19 ,200 Flatbush, Brooklyn Available Dec 01 2019
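The price values come back as raw text. If you want to work with them numerically, a possible follow-up (a sketch, not part of the original answer, assuming the scraped text contains digits plus symbols such as "$" and ",") is to strip the non-digit characters and convert:

# Sketch: derive a numeric price column from the scraped text
df['price_num'] = (
    df['price']
    .str.replace(r'[^0-9]', '', regex=True)  # keep digits only
    .astype(float)
)
print(df[['price', 'price_num']].head())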