如何抓取两个 URL 并将每个 url 的元素放在一个 table 中?

How to scrape two URLs and put the elements of each URL in one single table?

我有同一网站上的两个 URL,想抓取它们以获取纽约市的房价。我使用 BeautifulSoup 获取每个房间的地址、价格和可入住时间。之后,我创建了一个字典,以便用它生成 DataFrame。

我为每个 URL 获得两个不同的 DataFrame,但我希望每个 URL 的信息都在一个 DataFrame 中。

获取到所需信息后,我把它追加到对应的列表中,稍后再用这些列表构建字典:

def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.get_text() if tag is not None else None


def getRoomInfo(startingPage):
    """Scrape the rooms linked from one listing page into a DataFrame.

    The listing page at ``startingPage`` is downloaded, every anchor whose
    ``href`` matches ``/new-york-apartment/roommate-share/<digits>`` is
    followed, and the address, price and availability found on each room
    page are collected.

    Args:
        startingPage: URL of the listing page to start from.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Address``, ``Price`` and
        ``Availability``, one row per room page. A cell is ``None`` when the
        corresponding element was not found on the room page. Because the
        frame is returned, the results of several listing pages can be
        combined by the caller, e.g.
        ``pd.concat([getRoomInfo(u) for u in urls], ignore_index=True)``.
    """
    html = requests.get(startingPage)
    bs1 = BeautifulSoup(html.text, "html.parser")

    # Room links on the page are site-relative, so rebuild the site root
    # (scheme + host) to turn them into absolute URLs.
    parsed = urlparse(startingPage)
    url = "{}://{}".format(parsed.scheme, parsed.netloc)

    # Raw string: "\d" in a normal string literal is an invalid escape.
    room_pattern = re.compile(r"/new-york-apartment/roommate-share/\d+")
    room_link = [
        url + link["href"] for link in bs1.find_all("a", href=room_pattern)
    ]

    addressList = []
    priceList = []
    availabilityList = []

    for page in room_link:
        html_page = requests.get(page)
        bs_page = BeautifulSoup(html_page.text, "html.parser")

        # Always append exactly one value per list for each page so the three
        # columns stay the same length even when an element is missing.
        address = bs_page.find("div", {"class": "ap-info-address"})
        addressList.append(_text_or_none(address))

        price = bs_page.find("div", {"class": "apt-price price-cur-1"})
        priceList.append(_text_or_none(price))

        # The availability is read from the first <td> on the room page.
        availability = bs_page.find("td")
        availabilityList.append(_text_or_none(availability))

    infoDataFrame = pd.DataFrame(
        {
            "Address": addressList,
            "Price": priceList,
            "Availability": availabilityList,
        }
    )

    print(infoDataFrame)
    return infoDataFrame

# Listing pages to scrape: the roommate-share landing page and a `page=2`
# search-results URL on the same site.
links_rooms = (
    "https://www.nyhabitat.com/new-york-apartment/roommate-share",
    "https://www.nyhabitat.com/new-york-apartment/list.php?page=2&dep=SH&lev=3&price=400;2400&guest=1&sort=new&cll=1&searchMapLeft=40.60484725779594&searchMapTop=-73.81336257537379&searchMapRight=40.90185344223534&searchMapBottom=-74.14810226043238&searchMapZoom=11&div_code=ny&lang=en",
)

Python 内置的字符串方法 strip() 用于删除字符串开头和结尾的所有空白字符。

# Collect one dict per listing card across ALL pages into a single list, so
# that every URL ends up in the same DataFrame.
rooms = []
# Iterate the listing pages in links_rooms: the selectors below appear to
# target listing-grid cards rather than individual room pages.
for page in links_rooms:
    html_page = requests.get(page)
    soup = BeautifulSoup(html_page.text, "html.parser")

    for row in soup.select('div[class*="grid-col"]'):
        priceDiv = row.find("div", {"class": "slider-descr-wrap"})
        addrDiv = row.find("div", {"class": "slider-descr-bottom"})
        # The substring selector can also match divs that are not room
        # cards; skip them instead of failing on a None lookup.
        if priceDiv is None or addrDiv is None:
            continue

        price = priceDiv.find("div", {"class": "apt-price price-cur-1"})
        address = addrDiv.find("span", {"class": "slider-descr-2-row"})
        availability = addrDiv.find("span", {"class": "search-aval"})
        if price is None or address is None or availability is None:
            continue

        # strip() removes the leading/trailing whitespace around the text.
        rooms.append(
            {
                "price": price.text.strip(),
                "address": address.text.strip(),
                "availability": availability.text.strip(),
            }
        )

print(rooms)
df = pd.DataFrame(rooms, columns=['price', 'address', 'availability'])
print(df)

输出(注意:下面的价格值看起来被截断了,开头的 `$` 符号和前几位数字在文本提取过程中丢失):

[{'price': ',395', 'address': 'Bushwick, Brooklyn', 'availability': 'Available Aug 01 2019'}, {'price': ',350', 'address': 'Fort Greene, Brooklyn', 'availability': 'Available Jun 15 2019'}, {'price': ',055', 'address': 'Kips Bay, Manhattan', 'availability': 'Available Jun 30 2019'}, {'price': ',350', 'address': 'Duplex, Brooklyn', 'availability': 'Available Jun 08 2019'}, {'price': '0', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Aug 10 2019'}, {'price': ',100', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Aug 10 2019'}, {'price': '5', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Aug 31 2019'}, {'price': '0', 'address': 'Duplex, Ridgewood, Queens', 'availability': 'Available Jun 08 2019'}, {'price': '3', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Jun 12 2020'}, {'price': ',150', 'address': 'Triplex, Ridgewood, Queens', 'availability': 'Available Jun 08 2019'}, {'price': ',317', 'address': 'Stuyvesant Town, Manhattan', 'availability': 'Available Dec 31 2019'}, {'price': '0', 'address': 'Jamaica, Queens', 'availability': 'Available Jun 08 2019'}, {'price': ',700', 'address': 'Chelsea, Manhattan', 'availability': 'Available Sep 01 2019'}, {'price': '0', 'address': 'Astoria, Queens', 'availability': 'Available Jul 22 2019'}, {'price': ',750', 'address': 'Chelsea, Manhattan', 'availability': 'Available Jun 08 2019'}, {'price': ',375', 'address': 'Harlem, Manhattan', 'availability': 'Available Oct 01 2019'}, {'price': '1', 'address': 'Forest Hills, Queens', 'availability': 'Available Aug 01 2019'}, {'price': '0', 'address': 'Brooklyn', 'availability': 'Available Jun 08 2019'}, {'price': '8', 'address': 'Washington Heights, Manhattan', 'availability': 'Available Jun 08 2019'}, {'price': ',200', 'address': 'Flatbush, Brooklyn', 'availability': 'Available Dec 01 2019'}]
     price                        address           availability
0   ,395             Bushwick, Brooklyn  Available Aug 01 2019
1   ,350          Fort Greene, Brooklyn  Available Jun 15 2019
2   ,055            Kips Bay, Manhattan  Available Jun 30 2019
3   ,350               Duplex, Brooklyn  Available Jun 08 2019
4     0             Flatbush, Brooklyn  Available Aug 10 2019
5   ,100             Flatbush, Brooklyn  Available Aug 10 2019
6     5  Washington Heights, Manhattan  Available Aug 31 2019
7     0      Duplex, Ridgewood, Queens  Available Jun 08 2019
8     3  Washington Heights, Manhattan  Available Jun 12 2020
9   ,150     Triplex, Ridgewood, Queens  Available Jun 08 2019
10  ,317     Stuyvesant Town, Manhattan  Available Dec 31 2019
11    0                Jamaica, Queens  Available Jun 08 2019
12  ,700             Chelsea, Manhattan  Available Sep 01 2019
13    0                Astoria, Queens  Available Jul 22 2019
14  ,750             Chelsea, Manhattan  Available Jun 08 2019
15  ,375              Harlem, Manhattan  Available Oct 01 2019
16    1           Forest Hills, Queens  Available Aug 01 2019
17    0                       Brooklyn  Available Jun 08 2019
18    8  Washington Heights, Manhattan  Available Jun 08 2019
19  ,200             Flatbush, Brooklyn  Available Dec 01 2019