How to Scrape a web-page into a dictionary compatible with Pandas Dataframe?
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)

g_data = soup.find_all("div", {"class": "tile-content"})
g_price = soup.find_all("div", {"class": "item-price-container"})
g_star = soup.find_all("div", {"class": "stars stars-small tile-row"})

data = defaultdict(list)

for product_title in g_data:
    a_product_title = product_title.find_all("a", "js-product-title")
    for text_product_title in a_product_title:
        data['Product Title'].append(text_product_title.text)

for row in g_price:
    price = row.find('span', 'price price-display').text.strip()
    data['Price'].append(price)

for allstar in g_star:
    star = allstar.find('span', 'visuallyhidden').text.strip()
    data['Stars'].append(star)

dd_starring = soup.find_all('dd', {"class": "media-details-artist-dd module"})

for dd in dd_starring:
    actors = dd.text
    #data['Actors'].append(actors)

df = pd.DataFrame(data)
df
If I try to add it with the data['Stars'].append(star) line, I get the following error:

ValueError: arrays must all be the same length

How should I append it so that rows with no stars end up with NA? Any suggestions? Please help.
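For context, pd.DataFrame raises that error whenever the lists in the dictionary have different lengths. A minimal sketch with made-up values (not the scraped data) that reproduces the error and shows a None-padding workaround; note that padding only fixes the length mismatch, it cannot tell you which product the missing star belongs to, which is why the answers below restructure the loop instead:

import pandas as pd

data = {'Product Title': ['A', 'B', 'C'], 'Stars': ['4.5', '3.0']}  # 3 titles, only 2 star values

# pd.DataFrame(data)  # would raise: ValueError: arrays must all be the same length

# Pad every shorter list with None so all columns have equal length;
# pandas shows None as NaN in the resulting DataFrame.
max_len = max(len(v) for v in data.values())
for values in data.values():
    values.extend([None] * (max_len - len(values)))

df = pd.DataFrame(data)
print(df)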
You don't need to build separate lists of content to loop over. You can just iterate over g_data, which means you won't end up with result sets of different lengths.
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)

g_data = soup.find_all("div", {"class": "tile-content"})
data = defaultdict(list)

for content in g_data:
    title = content.find("a", "js-product-title")
    data['Product Title'].append(title.text)
    try:
        stars = content.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
        data['Stars'].append(stars)
    except:
        data['Stars'].append(None)
    price = content.find('span', 'price price-display').text.strip()
    data['Price'].append(price)
    #data['Actors'].append(actors)

df = pd.DataFrame(data)
df
As far as I can see, the inner loop is not necessary either, since each product has only one name, one price and one rating.
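To illustrate that point (the snippet below uses a toy HTML fragment, not Walmart's actual markup): BeautifulSoup's find returns the first matching element or None, while find_all always returns a list, so when each tile contains exactly one title there is nothing to loop over.

from bs4 import BeautifulSoup

html = '<div class="tile-content"><a class="js-product-title">Iron Man</a></div>'
tile = BeautifulSoup(html, "html.parser").find("div", "tile-content")

print(tile.find("a", "js-product-title").text)   # 'Iron Man' - a single element, no inner loop needed
print(tile.find_all("a", "js-product-title"))    # [<a class="js-product-title">Iron Man</a>] - a one-element list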
Your original problem occurred because each of your separate loops captured a different number of elements than the others (e.g. 15 stars vs. 20 prices). The best way to avoid this kind of problem is to have a single loop and wrap each item you extract in a try/except. That way, if any of the items you want is missing, you can still collect the ones that are present.
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)

g_data = soup.find_all("div", {"class": "tile-content"})
data = defaultdict(list)

# One loop to rule them all
for tile in g_data:
    # the "tile" value in g_data contains what you are looking for...
    # find the product titles
    try:
        title = tile.find("a", "js-product-title")
        data['Product Title'].append(title.text)
    except:
        data['Product Title'].append("")
    # find the prices
    try:
        price = tile.find('span', 'price price-display').text.strip()
        data['Price'].append(price)
    except:
        data['Price'].append("")
    # find the stars
    try:
        g_star = tile.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
        data['Stars'].append(g_star)
    except:
        data['Stars'].append("")

df = pd.DataFrame(data)
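Since the question asks for NA in rows without stars, an optional follow-up (a small sketch, assuming you keep the empty-string placeholders from the loop above) is to convert those blanks to NaN after the DataFrame is built:

import numpy as np

df = df.replace("", np.nan)   # empty placeholders become NaN, pandas' missing-value marker
df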