访问隐藏在 JavaScript 背后的数据

Question

我想从这个页面上每场比赛的下拉列表中提取进球时间：http://www.bbc.co.uk/sport/football/league-one/results

但我在搜索时似乎找不到这些数据——有人知道为什么吗？

import requests
from bs4 import BeautifulSoup

# Fetch the League One results page and dump the parsed markup.
response = requests.get("http://www.bbc.co.uk/sport/football/league-one/results")
page = BeautifulSoup(response.content)
print(page.prettify())

# Print the text of every anchor (<a>) element found on the page.
for anchor in page.find_all("a"):
    print(anchor.text)

# Print the text of every abbreviation (<abbr>) element found on the page.
for abbr in page.find_all("abbr"):
    print(abbr.text)

ff

Answer 1

这样做需要处理的数据量非常大（更不用说还要逐个加载每个单独的页面，速度会很慢），而且对方最终可能会因为请求过多而屏蔽你，但这是我能想到的唯一办法。我的做法是：遍历并取出与每个 Results 按钮关联的 href，加载对应的页面并对其进行解析，从中提取比分信息。

import requests
from bs4 import BeautifulSoup

def _team_summary(team_details, heading):
    """Return ``(name, score, scorers_text)`` for one team's details element.

    Args:
        team_details: the ``div.team-match-details`` element of one team.
        heading: first line of the scorers text (e.g. "Home Goal Scorers:").

    Returns:
        A tuple of the team name, its score text, and a newline-joined
        string made of ``heading`` followed by one line per scorer span.
    """
    # Both the team link and the score are located relative to the first
    # <span> that follows the details element, so look it up only once.
    first_span = team_details.find_next('span')
    name = first_span.find_next('a').text
    score = first_span.find_next('span').text

    lines = [heading]
    # NOTE(review): find_next() scans forward through the rest of the
    # document, not only inside this element -- if a team has no scorer
    # paragraph this may pick up a later <p>; confirm against the markup.
    for scorer in team_details.find_next('p').find_all('span'):
        # Swap the Unicode prime (U+2032) for a plain ASCII apostrophe.
        lines.append(scorer.text.replace(u'\u2032', "'"))
    return name, score, "\n".join(lines)


def parse_page(data):
    """Parse one match page and print its score line and goal scorers.

    Prints a separator line, then "<home> <score> - <away> <score>",
    then the home scorers block and the away scorers block.

    Args:
        data: HTML of a single match page, as accepted by BeautifulSoup.

    Returns:
        None. All output goes to stdout.

    Raises:
        AttributeError: if an expected element is missing, because the
            corresponding ``find``/``find_next`` call returns ``None``.
    """
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever
    # one is installed; kept as-is to avoid changing the parse tree.
    subsoup = BeautifulSoup(data)
    matchoverview = subsoup.find('div', attrs={'id': 'match-overview'})
    print('--------------')

    # The first team-match-details div inside the overview is taken as the
    # home side; the away side is looked up through its own container.
    home_details = matchoverview.find('div', attrs={'class': 'team-match-details'})
    home_team, home_score, home_goals = _team_summary(
        home_details, "Home Goal Scorers:")

    away_details = matchoverview.find('div', attrs={'id': 'away-team'}).find(
        'div', attrs={'class': 'team-match-details'})
    away_team, away_score, away_goals = _team_summary(
        away_details, "Away Goal Scorers:")

    print('{0} {1} - {2} {3}'.format(home_team, home_score, away_team, away_score))
    print(home_goals)
    print(away_goals)

def all_league_results():
    """Print the result of every match linked from the League One results page.

    Downloads the results page, finds every anchor with class ``report``
    and hands each linked match page to ``specific_game_results``, which
    fetches and prints it. This issues one HTTP request per match.

    Returns:
        None. All output goes to stdout.
    """
    # Function-scope import so the rest of the file is unaffected; the
    # fallback keeps the snippet usable on Python 2 as well as Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    r = requests.get("http://www.bbc.co.uk/sport/football/league-one/results")
    soup = BeautifulSoup(r.content)

    # Follow each match-report link found on the results page.
    for link in soup.find_all("a", attrs={'class': 'report'}):
        # urljoin yields the same URL as plain concatenation for hrefs that
        # start with "/", and additionally copes with absolute hrefs or
        # hrefs lacking the leading slash.
        fullLink = urljoin('http://www.bbc.com', link['href'])
        specific_game_results(fullLink)

def specific_game_results(url):
    """Download the match page at ``url`` and print its result via ``parse_page``."""
    page_html = requests.get(url).text
    parse_page(page_html)

# Print the result of one specific match, identified by its page URL.
specific_game_results('http://www.bbc.co.uk/sport/0/football/32460049')
# Then print the results of all matches linked from the league results page.
all_league_results()

访问隐藏在 JavaScript 背后的数据

Access Data from Behind JavaScript

python

beautifulsoup