如何使用jsoup按顺序提取数据
How to extract data in sequence using jsoup
我正在尝试使用 jsoup 从这个链接 https://orderup.com/some/phoenix/delivery/featured 获取数据,但遇到了一些问题:结果数据的格式不正确,而且带有描述的类别没有显示出来。这是我的代码:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches one restaurant page with jsoup and prints the restaurant name, the
 * opening-hours labels and, per menu category, the item titles with their
 * prices and descriptions.
 *
 * <p>Known limitation: the items of a category are looked up only inside the
 * element that immediately follows the category's {@code <h3>}. When that
 * element is not the item list (for example a category description
 * paragraph), no items are printed for that category.
 */
public class grabber {

    /** Debug marker printed when the element after the heading contains exactly one {@code <p>}. */
    private static final String SINGLE_P_MARKER =
            "ifffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff statement";

    /** Debug marker printed in every other case. */
    private static final String OTHER_MARKER =
            "elssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss statement";

    /**
     * Downloads the page and prints the extracted data to standard output.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        String url = "https://orderup.com/restaurants/bella-pizza-r3834/delivery";
        Document document = Jsoup.connect(url).get();

        Elements restname = document.select("h1.urbana");
        System.out.println("restname: " + restname.text());

        // One select on the region is enough; re-selecting the same class on the
        // result yields the same elements.
        Elements hourLabels = document.select("div.restaurant-hours-region").select("dt");
        System.out.println("restauranthosssurs: " + hourLabels.size());
        for (Element hourLabel : hourLabels) {
            System.out.println("restauranthours: " + hourLabel.text());
        }

        Elements headings = document.select("div.menu-category").select("h3");
        for (Element heading : headings) {
            System.out.println("Category: " + heading.text());

            // nextElementSibling() returns null when the heading is the last child;
            // skip the category instead of failing with a NullPointerException.
            Element afterHeading = heading.nextElementSibling();
            if (afterHeading == null) {
                continue;
            }

            // Both cases print the same data; only the debug marker differs.
            String marker = afterHeading.select("p").size() == 1 ? SINGLE_P_MARKER : OTHER_MARKER;
            Elements titles = afterHeading.select("span.item-title");
            System.out.println(titles.size());
            System.out.println(marker);
            for (Element title : titles) {
                printItem(title);
            }
        }
    }

    /**
     * Prints one menu item: its title, then every {@code .item-price} found in
     * the element following the title, then every {@code <p>} found inside the
     * title element.
     *
     * @param title the {@code span.item-title} element of the item
     */
    private static void printItem(Element title) {
        System.out.println("Items: " + title.text());

        // The title may have no following sibling; in that case there is no price to print.
        Element afterTitle = title.nextElementSibling();
        if (afterTitle != null) {
            for (Element price : afterTitle.select(".item-price")) {
                System.out.println("price: " + price.text());
            }
        }

        // NOTE(review): this searches for <p> inside the title span itself; confirm
        // against the page markup that the description really lives there.
        for (Element description : title.getElementsByTag("p")) {
            System.out.println("itemdesc: " + description.text());
        }
    }
}
问题在于,您访问的页面的 HTML 结构比您预期的更灵活。例如,某些类别在标题下还带有一段说明文字,它是 h3 标签的下一个兄弟元素;而您的代码只在 h3 的下一个兄弟元素中查找菜单项,所以这些类别的菜单项就找不到了。一种更健壮、也更易读的写法如下:
// Select every category heading inside a menu category block.
Elements elh3s = document.select("div.menu-category h3");
for (Element elh3 : elh3s) {
    System.out.println("Category: " + elh3.text());
    // Get the list by stepping up to the heading's parent and CSS-selecting the
    // list items from there, so it does not matter what sits between the
    // heading and the <ul>.
    Elements ellis = elh3.parent().select("ul>li");
    for (Element elli : ellis) {
        // first() returns null when nothing matched; print an empty value for
        // such an item instead of failing with a NullPointerException.
        Element title = elli.select("span.item-title").first();
        Element price = elli.select("span.item-price").first();
        System.out.println("title: " + (title != null ? title.text() : ""));
        System.out.println("price: " + (price != null ? price.text() : ""));
        System.out.println("--");
    }
}
建议:
建议深入了解一下 jsoup 的 CSS 选择器(CSS selectors)。它们非常强大,而且由于您已经用 jsoup 解析了整个页面,因此可以放心地使用它们,几乎不会带来性能问题。
我正在尝试使用 jsoup 从这个链接 https://orderup.com/some/phoenix/delivery/featured 获取数据,但遇到了一些问题:结果数据的格式不正确,而且带有描述的类别没有显示出来。这是我的代码:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches one restaurant page with jsoup and prints the restaurant name, the
 * opening-hours labels and, per menu category, the item titles with their
 * prices and descriptions.
 *
 * <p>Known limitation: the items of a category are looked up only inside the
 * element that immediately follows the category's {@code <h3>}. When that
 * element is not the item list (for example a category description
 * paragraph), no items are printed for that category.
 */
public class grabber {

    /** Debug marker printed when the element after the heading contains exactly one {@code <p>}. */
    private static final String SINGLE_P_MARKER =
            "ifffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff statement";

    /** Debug marker printed in every other case. */
    private static final String OTHER_MARKER =
            "elssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss statement";

    /**
     * Downloads the page and prints the extracted data to standard output.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        String url = "https://orderup.com/restaurants/bella-pizza-r3834/delivery";
        Document document = Jsoup.connect(url).get();

        Elements restname = document.select("h1.urbana");
        System.out.println("restname: " + restname.text());

        // One select on the region is enough; re-selecting the same class on the
        // result yields the same elements.
        Elements hourLabels = document.select("div.restaurant-hours-region").select("dt");
        System.out.println("restauranthosssurs: " + hourLabels.size());
        for (Element hourLabel : hourLabels) {
            System.out.println("restauranthours: " + hourLabel.text());
        }

        Elements headings = document.select("div.menu-category").select("h3");
        for (Element heading : headings) {
            System.out.println("Category: " + heading.text());

            // nextElementSibling() returns null when the heading is the last child;
            // skip the category instead of failing with a NullPointerException.
            Element afterHeading = heading.nextElementSibling();
            if (afterHeading == null) {
                continue;
            }

            // Both cases print the same data; only the debug marker differs.
            String marker = afterHeading.select("p").size() == 1 ? SINGLE_P_MARKER : OTHER_MARKER;
            Elements titles = afterHeading.select("span.item-title");
            System.out.println(titles.size());
            System.out.println(marker);
            for (Element title : titles) {
                printItem(title);
            }
        }
    }

    /**
     * Prints one menu item: its title, then every {@code .item-price} found in
     * the element following the title, then every {@code <p>} found inside the
     * title element.
     *
     * @param title the {@code span.item-title} element of the item
     */
    private static void printItem(Element title) {
        System.out.println("Items: " + title.text());

        // The title may have no following sibling; in that case there is no price to print.
        Element afterTitle = title.nextElementSibling();
        if (afterTitle != null) {
            for (Element price : afterTitle.select(".item-price")) {
                System.out.println("price: " + price.text());
            }
        }

        // NOTE(review): this searches for <p> inside the title span itself; confirm
        // against the page markup that the description really lives there.
        for (Element description : title.getElementsByTag("p")) {
            System.out.println("itemdesc: " + description.text());
        }
    }
}
问题在于,您访问的页面的 HTML 结构比您预期的更灵活。例如,某些类别在标题下还带有一段说明文字,它是 h3 标签的下一个兄弟元素;而您的代码只在 h3 的下一个兄弟元素中查找菜单项,所以这些类别的菜单项就找不到了。一种更健壮、也更易读的写法如下:
// Select every category heading inside a menu category block.
Elements elh3s = document.select("div.menu-category h3");
for (Element elh3 : elh3s) {
    System.out.println("Category: " + elh3.text());
    // Get the list by stepping up to the heading's parent and CSS-selecting the
    // list items from there, so it does not matter what sits between the
    // heading and the <ul>.
    Elements ellis = elh3.parent().select("ul>li");
    for (Element elli : ellis) {
        // first() returns null when nothing matched; print an empty value for
        // such an item instead of failing with a NullPointerException.
        Element title = elli.select("span.item-title").first();
        Element price = elli.select("span.item-price").first();
        System.out.println("title: " + (title != null ? title.text() : ""));
        System.out.println("price: " + (price != null ? price.text() : ""));
        System.out.println("--");
    }
}
建议:
建议深入了解一下 jsoup 的 CSS 选择器(CSS selectors)。它们非常强大,而且由于您已经用 jsoup 解析了整个页面,因此可以放心地使用它们,几乎不会带来性能问题。