使用 Simple HTML DOM Parser 按 ID 查找 table
Find Tables by ID using Simple HTML DOM Parser
我去年写了一个数据库填充脚本(seeder),用于抓取一个统计网站。最近重新查看这段代码时,发现它似乎已经不能正常工作了,我不太明白原因。 $html->find()
应该返回一个包含所有匹配元素的数组,但实际使用时它似乎只找到了第一个 table。
于是我按照文档改用 find() 并指定每个 table 的 ID,但这似乎同样失败了。
$table_passing = $html->find('table[id=passing]');
谁能帮我弄清楚这里出了什么问题?我不明白为什么这两种方法都不起作用:页面源代码清楚地显示了多个 table 及其 ID,按理说这两种方法都应该有效。
/**
 * Scrapes the 2016 opponent/defense statistics page and collects per-team
 * figures into the local $teams array, keyed by team name.
 *
 * The page is loaded with file_get_html() and its first three <table>
 * elements are read, in order, as the overall defense, passing defense and
 * rushing defense tables. For every row whose first <td> is non-empty the
 * following keys are written to $teams[$name]: rank, games, tydag (yards
 * divided by games), pass_rank, paydag, rush_rank and ruydag.
 *
 * NOTE(review): this excerpt stops before the method's closing brace, so what
 * (if anything) is returned or stored afterwards is not visible here.
 */
private function getTeamStats()
{
$url = 'http://www.pro-football-reference.com/years/2016/opp.htm';
// NOTE(review): the result of file_get_html() is used without any failure check.
$html = file_get_html($url);
// Every <table> element the parser can see, in document order.
$tables = $html->find('table');
// NOTE(review): indexes 1 and 2 are not checked. If the parser exposes fewer
// than three tables, these are null and the ->find() calls below fail.
// Reportedly the later tables sit inside HTML comments in the page source - confirm.
$table_defense = $tables[0];
$table_passing = $tables[1];
$table_rushing = $tables[2];
// Disabled alternative that looks the passing table up by its id attribute.
//$table_passing = $html->find('table[id=passing]');
// Accumulates the per-team values from all three tables, keyed by team name.
$teams = array();
# OVERALL DEFENSIVE STATISTICS #
foreach ($table_defense->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 2 = games, 4 = yards.
$name = $stats[1]->plaintext;
$rank = $stats[0]->plaintext;
$games = $stats[2]->plaintext;
$yards = $stats[4]->plaintext;
// Calculate the Yards Allowed per Game by dividing Total / Games
// NOTE(review): both operands are plaintext strings, so this relies on PHP's
// numeric-string conversion; a $games value of 0 would divide by zero.
$tydag = $yards / $games;
$teams[$name]['rank'] = $rank;
$teams[$name]['games'] = $games;
$teams[$name]['tydag'] = $tydag;
}
}
# PASSING DEFENSIVE STATISTICS #
foreach ($table_passing->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 14 = passing yards value
// (presumably already a per-game figure, since it is stored undivided - confirm).
$name = $stats[1]->plaintext;
$pass_rank = $stats[0]->plaintext;
$pass_yards = $stats[14]->plaintext;
$teams[$name]['pass_rank'] = $pass_rank;
$teams[$name]['paydag'] = $pass_yards;
}
}
# RUSHING DEFENSIVE STATISTICS #
foreach ($table_rushing->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 7 = rushing yards value
// (presumably already a per-game figure, since it is stored undivided - confirm).
$name = $stats[1]->plaintext;
$rush_rank = $stats[0]->plaintext;
$rush_yards = $stats[7]->plaintext;
$teams[$name]['rush_rank'] = $rush_rank;
$teams[$name]['ruydag'] = $rush_yards;
}
}
我从未用过 simplexml
或其衍生库,不过在使用 XPath
查询按 ID 之类的属性进行查找时,通常要在属性名前加上 @
作为前缀,并且属性值应当用引号括起来——所以对于你的情况,可以试试:
$table_passing = $html->find('table[@id="passing"]');
改用标准的 DOMDocument 和 DOMXPath 方法后发现,问题在于实际的 table 在页面源代码中被 "commented out"(注释掉)了
- 因此只要用简单的字符串替换去掉 HTML 注释标记,下面的代码就能正常工作 - 这种做法也很容易移植到原始代码中。
// Demonstrates locating the "passing" table with DOMDocument + DOMXPath once
// the HTML comment markers that hide it have been stripped from the page.
$url = 'http://www.pro-football-reference.com/years/2016/opp.htm';

// file_get_contents() returns false on failure; stop here rather than feeding
// a non-string / empty document into the parser further down.
$html = file_get_contents($url);
if ($html === false || $html === '') {
    throw new RuntimeException('Unable to download ' . $url);
}

/* remove the html comment markers so that markup wrapped in comments becomes part of the parsed document */
$html = str_replace(array('<!--', '-->'), '', $html);

// Collect libxml parse errors internally instead of emitting warnings, and
// remember the previous setting so it can be restored after parsing.
$previousErrorMode = libxml_use_internal_errors(true);

$dom = new DOMDocument;
$dom->validateOnParse = false;
$dom->standalone = true;
$dom->strictErrorChecking = false;
$dom->recover = true;
$dom->formatOutput = false;
$dom->loadHTML($html);

// Discard the collected parse errors and put the global libxml flag back.
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

// Select every <table> whose id attribute is exactly "passing" and print
// "<tag name> > <id>" for each match.
$xp = new DOMXPath($dom);
$tbl = $xp->query('//table[@id="passing"]');
foreach ($tbl as $n) {
    echo $n->tagName . ' > ' . $n->getAttribute('id');
}
/* outputs */
table > passing
我去年写了一个数据库填充脚本(seeder),用于抓取一个统计网站。最近重新查看这段代码时,发现它似乎已经不能正常工作了,我不太明白原因。 $html->find()
应该返回一个包含所有匹配元素的数组,但实际使用时它似乎只找到了第一个 table。
于是我按照文档改用 find() 并指定每个 table 的 ID,但这似乎同样失败了。
$table_passing = $html->find('table[id=passing]');
谁能帮我弄清楚这里出了什么问题?我不明白为什么这两种方法都不起作用:页面源代码清楚地显示了多个 table 及其 ID,按理说这两种方法都应该有效。
/**
 * Scrapes the 2016 opponent/defense statistics page and collects per-team
 * figures into the local $teams array, keyed by team name.
 *
 * The page is loaded with file_get_html() and its first three <table>
 * elements are read, in order, as the overall defense, passing defense and
 * rushing defense tables. For every row whose first <td> is non-empty the
 * following keys are written to $teams[$name]: rank, games, tydag (yards
 * divided by games), pass_rank, paydag, rush_rank and ruydag.
 *
 * NOTE(review): this excerpt stops before the method's closing brace, so what
 * (if anything) is returned or stored afterwards is not visible here.
 */
private function getTeamStats()
{
$url = 'http://www.pro-football-reference.com/years/2016/opp.htm';
// NOTE(review): the result of file_get_html() is used without any failure check.
$html = file_get_html($url);
// Every <table> element the parser can see, in document order.
$tables = $html->find('table');
// NOTE(review): indexes 1 and 2 are not checked. If the parser exposes fewer
// than three tables, these are null and the ->find() calls below fail.
// Reportedly the later tables sit inside HTML comments in the page source - confirm.
$table_defense = $tables[0];
$table_passing = $tables[1];
$table_rushing = $tables[2];
// Disabled alternative that looks the passing table up by its id attribute.
//$table_passing = $html->find('table[id=passing]');
// Accumulates the per-team values from all three tables, keyed by team name.
$teams = array();
# OVERALL DEFENSIVE STATISTICS #
foreach ($table_defense->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 2 = games, 4 = yards.
$name = $stats[1]->plaintext;
$rank = $stats[0]->plaintext;
$games = $stats[2]->plaintext;
$yards = $stats[4]->plaintext;
// Calculate the Yards Allowed per Game by dividing Total / Games
// NOTE(review): both operands are plaintext strings, so this relies on PHP's
// numeric-string conversion; a $games value of 0 would divide by zero.
$tydag = $yards / $games;
$teams[$name]['rank'] = $rank;
$teams[$name]['games'] = $games;
$teams[$name]['tydag'] = $tydag;
}
}
# PASSING DEFENSIVE STATISTICS #
foreach ($table_passing->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 14 = passing yards value
// (presumably already a per-game figure, since it is stored undivided - confirm).
$name = $stats[1]->plaintext;
$pass_rank = $stats[0]->plaintext;
$pass_yards = $stats[14]->plaintext;
$teams[$name]['pass_rank'] = $pass_rank;
$teams[$name]['paydag'] = $pass_yards;
}
}
# RUSHING DEFENSIVE STATISTICS #
foreach ($table_rushing->find('tr') as $row)
{
$stats = $row->find('td');
// Ignore the lines that don't have ranks, these aren't teams
if (isset($stats[0]) && !empty($stats[0]->plaintext))
{
// Cell positions used here: 0 = rank, 1 = team name, 7 = rushing yards value
// (presumably already a per-game figure, since it is stored undivided - confirm).
$name = $stats[1]->plaintext;
$rush_rank = $stats[0]->plaintext;
$rush_yards = $stats[7]->plaintext;
$teams[$name]['rush_rank'] = $rush_rank;
$teams[$name]['ruydag'] = $rush_yards;
}
}
我从未用过 simplexml
或其衍生库,不过在使用 XPath
查询按 ID 之类的属性进行查找时,通常要在属性名前加上 @
作为前缀,并且属性值应当用引号括起来——所以对于你的情况,可以试试:
$table_passing = $html->find('table[@id="passing"]');
改用标准的 DOMDocument 和 DOMXPath 方法后发现,问题在于实际的 table 在页面源代码中被 "commented out"(注释掉)了
- 因此只要用简单的字符串替换去掉 HTML 注释标记,下面的代码就能正常工作 - 这种做法也很容易移植到原始代码中。
// Demonstrates locating the "passing" table with DOMDocument + DOMXPath once
// the HTML comment markers that hide it have been stripped from the page.
$url = 'http://www.pro-football-reference.com/years/2016/opp.htm';

// file_get_contents() returns false on failure; stop here rather than feeding
// a non-string / empty document into the parser further down.
$html = file_get_contents($url);
if ($html === false || $html === '') {
    throw new RuntimeException('Unable to download ' . $url);
}

/* remove the html comment markers so that markup wrapped in comments becomes part of the parsed document */
$html = str_replace(array('<!--', '-->'), '', $html);

// Collect libxml parse errors internally instead of emitting warnings, and
// remember the previous setting so it can be restored after parsing.
$previousErrorMode = libxml_use_internal_errors(true);

$dom = new DOMDocument;
$dom->validateOnParse = false;
$dom->standalone = true;
$dom->strictErrorChecking = false;
$dom->recover = true;
$dom->formatOutput = false;
$dom->loadHTML($html);

// Discard the collected parse errors and put the global libxml flag back.
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

// Select every <table> whose id attribute is exactly "passing" and print
// "<tag name> > <id>" for each match.
$xp = new DOMXPath($dom);
$tbl = $xp->query('//table[@id="passing"]');
foreach ($tbl as $n) {
    echo $n->tagName . ' > ' . $n->getAttribute('id');
}
/* outputs */
table > passing