如何在使用 rvest::html_table() 解析 table 时根据单元格属性操作单个单元格值
How to manipulate single cell value based on cell attribute while parsing table with rvest::html_table()
请考虑下面这个 table:
# Sample HTML used by the snippets below: one table with a header row of nine
# columns and two data rows. Every cell value is wrapped in a <span>; some of
# those spans carry class="red".
html_source <-
'
<table>
<thead>
<tr>
<th width="271"><abbr title="Nome stazione">Stazione</abbr></th>
<th width="60"><abbr title="Provincia">Prov.</abbr></th>
<th width="60"><abbr title="Temp. Min ">Temp. Min</abbr></th>
<th width="60"><abbr title="Temp. Max">Temp. Max</abbr></th>
<th width="60"><abbr title="Umid. Min">Umid. Min</abbr></th>
<th width="70"><abbr title="Umid. Max">Umid. Max</abbr></th>
<th width="60"><abbr title="Raffica Max">Raffica Max</abbr></th>
<th width="90"><abbr title="Pioggia">Pioggia</abbr></th>
<th width="90"><abbr title="Pressione">Pressione</abbr></th>
</tr>
</thead>
<tbody>
<tr>
<td><span class="nome">Dorno</span></td>
<td><span class="provincia">PV</span></td>
<td><span class="active">6.4°C</span></td>
<td><span class="active">9.4°C</span></td>
<td><span class="red">86%</span></td>
<td><span class="active">94%</span></td>
<td><span class="red">11.3 km/h</span></td>
<td><span class="active">2.4 mm/h</span></td>
<td><span class="active">1019.7 hPa</span></td>
</tr>
<tr>
<td><span class="nome">Pomezia</span></td>
<td><span class="provincia">RM</span></td>
<td><span class="active">11.7°C</span></td>
<td><span class="red">14.8°C</span></td>
<td><span class="active">82%</span></td>
<td><span class="active">92%</span></td>
<td><span class="active">14.5 km/h</span></td>
<td><span class="red">0 mm/h</span></td>
<td><span class="active">1022.3 hPa</span></td>
</tr>
</tbody>
</table>
'
我想解析 table 中的值,同时将带有属性 class="red" 的单元格中的值替换为 NA。
我看到过一个相关的问题,但它针对的是整行;如何才能独立地操作每个单元格?
# Parse the table as-is, then build the desired result by hand so that the
# expected output is explicit.
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a less clear message.
library(rvest)
library(dplyr)

# html_table() on the whole document yields a list; the table of interest is
# its first element.
res <-
  xml2::read_html(html_source) %>%
  html_table()
View(res[[1]])

# Expected result: each cell whose <span> has class="red" in the source HTML
# is replaced by NA, all other cells are left untouched.
res_expected <- res[[1]]
res_expected$`Umid. Min`[1] <- NA
res_expected$`Raffica Max`[1] <- NA
res_expected$`Temp. Max`[2] <- NA
res_expected$Pioggia[2] <- NA
View(res_expected)
您可以像这样删除 class 为 "red" 的 span 节点:
# Drop every <span> carrying the "red" class from the parsed document, then
# convert the table; the affected cells come out as empty strings.
# library() is used instead of require() so a missing package fails loudly.
library(rvest)
library(dplyr)

parsed <- xml2::read_html(html_source)

# Match "red" as a whitespace-separated class token so that spans such as
# class="red bold" are caught as well, not only the exact value "red".
# The removal acts on `parsed` itself, so no reassignment is needed.
xml2::xml_find_all(
  parsed,
  './/td/span[contains(concat(" ", normalize-space(@class), " "), " red ")]'
) %>%
  xml2::xml_remove()

res <- parsed %>%
  xml2::xml_find_first(".//table") %>%
  html_table()
这不会将它们设置为 <NA>,但至少会删除这些值。
如果需要,您仍然可以使用 res %>% mutate_all(~ifelse(.=='', NA, .)) 将那些空字符串替换为 NA。
请考虑下面这个 table:
# Sample HTML used by the snippets below: one table with a header row of nine
# columns and two data rows. Every cell value is wrapped in a <span>; some of
# those spans carry class="red".
html_source <-
'
<table>
<thead>
<tr>
<th width="271"><abbr title="Nome stazione">Stazione</abbr></th>
<th width="60"><abbr title="Provincia">Prov.</abbr></th>
<th width="60"><abbr title="Temp. Min ">Temp. Min</abbr></th>
<th width="60"><abbr title="Temp. Max">Temp. Max</abbr></th>
<th width="60"><abbr title="Umid. Min">Umid. Min</abbr></th>
<th width="70"><abbr title="Umid. Max">Umid. Max</abbr></th>
<th width="60"><abbr title="Raffica Max">Raffica Max</abbr></th>
<th width="90"><abbr title="Pioggia">Pioggia</abbr></th>
<th width="90"><abbr title="Pressione">Pressione</abbr></th>
</tr>
</thead>
<tbody>
<tr>
<td><span class="nome">Dorno</span></td>
<td><span class="provincia">PV</span></td>
<td><span class="active">6.4°C</span></td>
<td><span class="active">9.4°C</span></td>
<td><span class="red">86%</span></td>
<td><span class="active">94%</span></td>
<td><span class="red">11.3 km/h</span></td>
<td><span class="active">2.4 mm/h</span></td>
<td><span class="active">1019.7 hPa</span></td>
</tr>
<tr>
<td><span class="nome">Pomezia</span></td>
<td><span class="provincia">RM</span></td>
<td><span class="active">11.7°C</span></td>
<td><span class="red">14.8°C</span></td>
<td><span class="active">82%</span></td>
<td><span class="active">92%</span></td>
<td><span class="active">14.5 km/h</span></td>
<td><span class="red">0 mm/h</span></td>
<td><span class="active">1022.3 hPa</span></td>
</tr>
</tbody>
</table>
'
我想解析 table 中的值,同时将带有属性 class="red" 的单元格中的值替换为 NA。
我看到过一个相关的问题,但它针对的是整行;如何才能独立地操作每个单元格?
# Parse the table as-is, then build the desired result by hand so that the
# expected output is explicit.
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a less clear message.
library(rvest)
library(dplyr)

# html_table() on the whole document yields a list; the table of interest is
# its first element.
res <-
  xml2::read_html(html_source) %>%
  html_table()
View(res[[1]])

# Expected result: each cell whose <span> has class="red" in the source HTML
# is replaced by NA, all other cells are left untouched.
res_expected <- res[[1]]
res_expected$`Umid. Min`[1] <- NA
res_expected$`Raffica Max`[1] <- NA
res_expected$`Temp. Max`[2] <- NA
res_expected$Pioggia[2] <- NA
View(res_expected)
您可以像这样删除 class 为 "red" 的 span 节点:
# Drop every <span> carrying the "red" class from the parsed document, then
# convert the table; the affected cells come out as empty strings.
# library() is used instead of require() so a missing package fails loudly.
library(rvest)
library(dplyr)

parsed <- xml2::read_html(html_source)

# Match "red" as a whitespace-separated class token so that spans such as
# class="red bold" are caught as well, not only the exact value "red".
# The removal acts on `parsed` itself, so no reassignment is needed.
xml2::xml_find_all(
  parsed,
  './/td/span[contains(concat(" ", normalize-space(@class), " "), " red ")]'
) %>%
  xml2::xml_remove()

res <- parsed %>%
  xml2::xml_find_first(".//table") %>%
  html_table()
这不会将它们设置为 <NA>,但至少会删除这些值。
如果需要,您仍然可以使用 res %>% mutate_all(~ifelse(.=='', NA, .)) 将那些空字符串替换为 NA。