在 jupyter 中通过 python 进行网络爬行时无法获取完整的 html table 内容
unable to get entire html table content while web crawling through python in jupyter
from bs4 import BeautifulSoup
import requests, time
class CrawledArticle():
    """Simple value object pairing a table heading with its reading."""

    def __init__(self, heading, message):
        # Store both fields in one tuple assignment.
        self.heading, self.message = heading, message
class ArticleFetcher():
    """Scrapes the current Stuttgart weather table from agrarwetter.net."""

    def fetch(self):
        """Return a list of CrawledArticle(heading, message) pairs.

        Fixes over the original: requests.get now has a timeout so a dead
        server cannot hang the crawl forever, and cells whose selectors
        find no match are skipped instead of raising AttributeError on
        None (the likely cause of the incomplete-table symptom).
        """
        url = "https://www.agrarwetter.net/Agrarwetter-Aktuell/Stuttgart.Ist.html"
        articles = []
        time.sleep(1)  # be polite: pause before hitting the server
        r = requests.get(url, timeout=10)
        doc = BeautifulSoup(r.text, "lxml")
        for item in doc.select('.td_INHALT_MITTE'):
            heading_el = item.select_one('.SCHRIFT_FORMULARFELDBESCHRIFTUNG_MITTE_WETTER')
            # NOTE(review): this selector looks for a .td_INHALT_MITTE
            # *descendant* of an element that itself matched
            # .td_INHALT_MITTE — it only hits nested tables. Kept as-is to
            # preserve behavior; verify against the live markup.
            message_el = item.select_one('.td_INHALT_MITTE .SCHRIFT_FORMULAR_WERTE_MITTE:not(.pull-right)')
            if heading_el is None or message_el is None:
                continue  # cell lacks the expected markup; skip, don't crash
            articles.append(CrawledArticle(heading_el.text.strip(),
                                           message_el.text.strip()))
        return articles
# Fetch the weather readings and print each heading/value pair.
fetcher = ArticleFetcher()
articles = fetcher.fetch()
for article in articles:
    print(article.heading, '\n', article.message)
我想这就是你想要的:
from pprint import pprint  # the snippet calls pprint() below but never imported it

# Fix: the original referenced an undefined `user_agent` (NameError).
# Define a minimal header dict so the request actually runs.
user_agent = {'User-Agent': 'Mozilla/5.0'}

r = requests.get('https://www.agrarwetter.net/Agrarwetter-Aktuell/Stuttgart.Ist.html',
                 headers=user_agent, timeout=10)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.select_one('.FARBE_FORMULAR_MITTE')  # the table containing the weather data
# Header cells and value cells come out in the same document order,
# so zipping them pairs each label with its reading.
headers = [i.text for i in table.select('.SCHRIFT_FORMULARFELDBESCHRIFTUNG_MITTE_WETTER')]
infos = [i.text for i in table.select('.SCHRIFT_FORMULAR_WERTE_MITTE')]
data = dict(zip(headers, infos))  # {'Temperatur': '8,6 °C', ...}
pprint(data)
输出是:
{'Datum': '14.11.2019Donnerstag',
'Luftdruck': '1001,7 hPa',
'Sichtweite': 'über 70km',
'Taupunkt': '-2,1 °C',
'Temperatur': '8,6 °C',
'Uhrzeit': '15.00 Uhr',
'Wetterzustand': '',
'Windgeschwindigkeit': '10,8 km/h',
'Windrichtung': '',
'relative Feuchte': '47 %'}
from bs4 import BeautifulSoup
import requests, time
class CrawledArticle():
    """Simple value object pairing a table heading with its reading."""

    def __init__(self, heading, message):
        # Store both fields in one tuple assignment.
        self.heading, self.message = heading, message
class ArticleFetcher():
    """Scrapes the current Stuttgart weather table from agrarwetter.net."""

    def fetch(self):
        """Return a list of CrawledArticle(heading, message) pairs.

        Fixes over the original: requests.get now has a timeout so a dead
        server cannot hang the crawl forever, and cells whose selectors
        find no match are skipped instead of raising AttributeError on
        None (the likely cause of the incomplete-table symptom).
        """
        url = "https://www.agrarwetter.net/Agrarwetter-Aktuell/Stuttgart.Ist.html"
        articles = []
        time.sleep(1)  # be polite: pause before hitting the server
        r = requests.get(url, timeout=10)
        doc = BeautifulSoup(r.text, "lxml")
        for item in doc.select('.td_INHALT_MITTE'):
            heading_el = item.select_one('.SCHRIFT_FORMULARFELDBESCHRIFTUNG_MITTE_WETTER')
            # NOTE(review): this selector looks for a .td_INHALT_MITTE
            # *descendant* of an element that itself matched
            # .td_INHALT_MITTE — it only hits nested tables. Kept as-is to
            # preserve behavior; verify against the live markup.
            message_el = item.select_one('.td_INHALT_MITTE .SCHRIFT_FORMULAR_WERTE_MITTE:not(.pull-right)')
            if heading_el is None or message_el is None:
                continue  # cell lacks the expected markup; skip, don't crash
            articles.append(CrawledArticle(heading_el.text.strip(),
                                           message_el.text.strip()))
        return articles
# Fetch the weather readings and print each heading/value pair.
fetcher = ArticleFetcher()
articles = fetcher.fetch()
for article in articles:
    print(article.heading, '\n', article.message)
我想这就是你想要的:
from pprint import pprint  # the snippet calls pprint() below but never imported it

# Fix: the original referenced an undefined `user_agent` (NameError).
# Define a minimal header dict so the request actually runs.
user_agent = {'User-Agent': 'Mozilla/5.0'}

r = requests.get('https://www.agrarwetter.net/Agrarwetter-Aktuell/Stuttgart.Ist.html',
                 headers=user_agent, timeout=10)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.select_one('.FARBE_FORMULAR_MITTE')  # the table containing the weather data
# Header cells and value cells come out in the same document order,
# so zipping them pairs each label with its reading.
headers = [i.text for i in table.select('.SCHRIFT_FORMULARFELDBESCHRIFTUNG_MITTE_WETTER')]
infos = [i.text for i in table.select('.SCHRIFT_FORMULAR_WERTE_MITTE')]
data = dict(zip(headers, infos))  # {'Temperatur': '8,6 °C', ...}
pprint(data)
输出是:
{'Datum': '14.11.2019Donnerstag',
'Luftdruck': '1001,7 hPa',
'Sichtweite': 'über 70km',
'Taupunkt': '-2,1 °C',
'Temperatur': '8,6 °C',
'Uhrzeit': '15.00 Uhr',
'Wetterzustand': '',
'Windgeschwindigkeit': '10,8 km/h',
'Windrichtung': '',
'relative Feuchte': '47 %'}