如何抓取多个选择器并将它们分组
How to scrape multiple selectors and group them
我想抓取此页面:https://www.g2crowd.com/products/google-analytics/reviews(为了我自己的教育)
// @nuget: HtmlAgilityPack
using System;
using HtmlAgilityPack;
/// <summary>
/// Console program that downloads a review page and prints the review headings.
/// </summary>
public class Program
{
    /// <summary>
    /// Loads the reviews page and writes the inner text of every <c>h3</c> element whose
    /// <c>class</c> attribute contains <c>review-list-heading</c>, one per line.
    /// </summary>
    public static void Main()
    {
        var client = new HtmlWeb();
        var page = client.Load("https://www.g2crowd.com/products/google-analytics/reviews");

        var headings = page.DocumentNode.SelectNodes("//h3[contains(@class,'review-list-heading')]");
        if (headings == null)
        {
            // Nothing matched the query, so there is nothing to print.
            return;
        }

        foreach (var heading in headings)
        {
            Console.WriteLine(heading.InnerText);
        }
    }
}
这是我目前写出的代码,它能完美地抓取每条评论的标题。但是,我究竟该如何同时抓取标题和评论正文(body),并确保每条评论彼此独立、分组清晰呢?
评论"body"(意思是文字)是:
//*[@id="pjax-container"]/div[2]/div[2]/div[6]/div[3]/div/div/div[2]/div[2]/div/div
在 xpath 中。
或 <div itemprop="reviewBody">
纯 html。
这是我目前拥有的 dotnetfiddle:https://dotnetfiddle.net/30Y0M6
如果我说得不够清楚,请告诉我。
先选择每条评论的父容器 <div class="mb-2 border-bottom">,
然后在每个父容器内选择所需的子节点:
// @nuget: HtmlAgilityPack
using System;
using HtmlAgilityPack;
/// <summary>
/// Console program that downloads a review page and prints the text of each review,
/// grouped per review container.
/// </summary>
public class Program
{
    /// <summary>
    /// Loads the reviews page, selects every <c>div</c> whose <c>class</c> attribute is exactly
    /// <c>mb-2 border-bottom</c>, and for each one prints the inner text of its descendant
    /// <c>h3</c>, <c>h5</c> and <c>p</c> elements followed by a separator line.
    /// </summary>
    public static void Main()
    {
        HtmlWeb web = new HtmlWeb();
        HtmlDocument html = web.Load("https://www.g2crowd.com/products/google-analytics/reviews");

        var divNodes = html.DocumentNode.SelectNodes("//div[@class='mb-2 border-bottom']");
        if (divNodes == null)
        {
            // No review containers found; nothing to print.
            return;
        }

        foreach (var child in divNodes)
        {
            // The leading "." keeps the query relative to this container, so only
            // this review's own descendants are returned.
            var allowedTags = child.SelectNodes(".//h3 | .//h5 | .//p");

            // SelectNodes returns null (not an empty collection) when nothing matches;
            // without this guard a container with no h3/h5/p would throw.
            if (allowedTags != null)
            {
                foreach (var tag in allowedTags)
                {
                    Console.WriteLine(tag.InnerText);
                }
            }

            // Separator marks the end of one review's group.
            Console.WriteLine("======================================");
        }
    }
}
我想抓取此页面:https://www.g2crowd.com/products/google-analytics/reviews(为了我自己的教育)
// @nuget: HtmlAgilityPack
using System;
using HtmlAgilityPack;
/// <summary>
/// Console program that downloads a review page and prints the review headings.
/// </summary>
public class Program
{
    /// <summary>
    /// Loads the reviews page and writes the inner text of every <c>h3</c> element whose
    /// <c>class</c> attribute contains <c>review-list-heading</c>, one per line.
    /// </summary>
    public static void Main()
    {
        var client = new HtmlWeb();
        var page = client.Load("https://www.g2crowd.com/products/google-analytics/reviews");

        var headings = page.DocumentNode.SelectNodes("//h3[contains(@class,'review-list-heading')]");
        if (headings == null)
        {
            // Nothing matched the query, so there is nothing to print.
            return;
        }

        foreach (var heading in headings)
        {
            Console.WriteLine(heading.InnerText);
        }
    }
}
这是我目前写出的代码,它能完美地抓取每条评论的标题。但是,我究竟该如何同时抓取标题和评论正文(body),并确保每条评论彼此独立、分组清晰呢?
评论"body"(意思是文字)是:
//*[@id="pjax-container"]/div[2]/div[2]/div[6]/div[3]/div/div/div[2]/div[2]/div/div
在 xpath 中。
或 <div itemprop="reviewBody">
纯 html。
这是我目前拥有的 dotnetfiddle:https://dotnetfiddle.net/30Y0M6
如果我说得不够清楚,请告诉我。
先选择每条评论的父容器 <div class="mb-2 border-bottom">,
然后在每个父容器内选择所需的子节点:
// @nuget: HtmlAgilityPack
using System;
using HtmlAgilityPack;
/// <summary>
/// Console program that downloads a review page and prints the text of each review,
/// grouped per review container.
/// </summary>
public class Program
{
    /// <summary>
    /// Loads the reviews page, selects every <c>div</c> whose <c>class</c> attribute is exactly
    /// <c>mb-2 border-bottom</c>, and for each one prints the inner text of its descendant
    /// <c>h3</c>, <c>h5</c> and <c>p</c> elements followed by a separator line.
    /// </summary>
    public static void Main()
    {
        HtmlWeb web = new HtmlWeb();
        HtmlDocument html = web.Load("https://www.g2crowd.com/products/google-analytics/reviews");

        var divNodes = html.DocumentNode.SelectNodes("//div[@class='mb-2 border-bottom']");
        if (divNodes == null)
        {
            // No review containers found; nothing to print.
            return;
        }

        foreach (var child in divNodes)
        {
            // The leading "." keeps the query relative to this container, so only
            // this review's own descendants are returned.
            var allowedTags = child.SelectNodes(".//h3 | .//h5 | .//p");

            // SelectNodes returns null (not an empty collection) when nothing matches;
            // without this guard a container with no h3/h5/p would throw.
            if (allowedTags != null)
            {
                foreach (var tag in allowedTags)
                {
                    Console.WriteLine(tag.InnerText);
                }
            }

            // Separator marks the end of one review's group.
            Console.WriteLine("======================================");
        }
    }
}