我如何通过 REGEX 从字符串中删除 HTML 标签?
How can I remove HTML tags from a string using regex?
我正在从 MySQL 获取数据,但问题是其中的 HTML 标签,例如
<p>LARGE</p><p>Lamb;<br>;li;ul;
也随数据一起被取了出来,而我只需要上面这一行中的 "LARGE" 和 "Lamb"。我怎样才能从字符串中分离/删除 HTML 标签?
我假设 HTML 完好无损,可能如下所示:
<ul><li><p>LARGE</p><p>Lamb<br></li></ul>
在这种情况下,我会使用 HtmlAgilityPack 来获取内容,而不必求助于正则表达式。
// Load the markup into an HtmlAgilityPack document instead of pattern-matching it.
string html = "<ul><li><p>LARGE</p><p>Lamb</p><br></li></ul> ";
HtmlDocument hap = new HtmlDocument();
hap.LoadHtml(html);

// The text of the whole document, with HTML entities decoded.
string rawText = hap.DocumentNode.InnerText;
string text = HtmlEntity.DeEntitize(rawText);
// text is now "LARGELamb "

// One array entry per text node, each with HTML entities decoded.
var textNodes = hap.DocumentNode.SelectNodes("//text()");
string[] lines = textNodes
    .Select(node => HtmlEntity.DeEntitize(node.InnerText))
    .ToArray();
// lines is { "LARGE", "Lamb", " " }
如果我们假设您会先修复好您的 HTML 元素。
/// <summary>
/// Parses a hard-coded HTML fragment and writes the inner HTML of each
/// <c>p</c> element to the console, one per line.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Run the input through the entity decoder before handing it to the parser.
    string html = WebUtility.HtmlDecode("<p>LARGE</p><p>Lamb</p>");

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Walk every descendant node and keep only those named "p".
    var paragraphs = doc.DocumentNode
        .Descendants()
        .Where(candidate => candidate.Name == "p")
        .ToList();

    paragraphs.ForEach(paragraph => Console.WriteLine(paragraph.InnerHtml));
}
你需要使用 HTML Agility Pack。你可以这样添加引用:
Install-Package HtmlAgilityPack
假设:
- 原始字符串始终采用该特定格式,并且
- 您无法添加 HtmlAgilityPack,
那么下面是获得所需内容的一种快速而粗糙的方法:
/// <summary>
/// Quick-and-dirty extraction of the text items from a hard-coded string of one
/// fixed shape: splits it on the paragraph boundary, strips the leading and
/// trailing leftover markup from each piece, and prints each piece on its own line.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Split original string on the 'separator' string.
    string originalString = "<p>LARGE</p><p>Lamb;<br>;li;ul; ";
    string[] sSeparator = new string[] { "</p><p>" };
    string[] splitString = originalString.Split(sSeparator, StringSplitOptions.None);

    // Leftover markup to strip from the pieces.
    string prefix = "<p>";
    string postfix = ";<br>;li;ul; ";

    // Iterate over the split string and clean up.
    for (int i = 0; i < splitString.Length; i++)
    {
        string s = splitString[i];

        // Look each marker up once, with an ordinal comparison, and reuse the
        // index. Pairing Contains (ordinal) with IndexOf(string) (culture-sensitive)
        // can disagree and hand Remove an index of -1.
        int prefixIndex = s.IndexOf(prefix, StringComparison.Ordinal);
        if (prefixIndex >= 0)
        {
            s = s.Remove(prefixIndex, prefix.Length);
        }

        int postfixIndex = s.IndexOf(postfix, StringComparison.Ordinal);
        if (postfixIndex >= 0)
        {
            s = s.Remove(postfixIndex, postfix.Length);
        }

        splitString[i] = s;
        Console.WriteLine(s);
    }

    // Block until a line is read from standard input (presumably to keep the
    // console window open).
    Console.ReadLine();
}
试试这个:
// erase html tags from a string
/// <summary>
/// Erases HTML tags from a string.
/// </summary>
/// <param name="target">The text to strip tags from.</param>
/// <returns>
/// <paramref name="target"/> with every match of the tag pattern removed.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
public static string StripHtml(string target)
{
    if (target == null)
    {
        throw new ArgumentNullException(nameof(target));
    }

    // A tag is '<', one non-whitespace character, any run of characters other
    // than '<' or '>', then '>'. The literal must be verbatim (@"..."): in a
    // regular string literal "\S" is an unrecognized escape and does not compile.
    // The static Regex.Replace overload reuses a cached regex, so the pattern is
    // not rebuilt on every call.
    return Regex.Replace(
        target,
        @"<\S[^><]*>",
        string.Empty,
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.Compiled);
}
调用方式:
// Example call: pass a small HTML snippet through StripHtml.
var htmlString = "<div><span>hello world!</span></div>";
var strippedString = StripHtml(htmlString);
// Decode HTML entities (&lt; &gt; etc.) back into the characters they stand for.
String sResult = HttpUtility.HtmlDecode(sData);
// Remove HTML tags delimited by <>. The pattern has to begin at '<'; with any
// literal text in front of it (such as pasted editor placeholder text) ordinary
// tags would never match and nothing would be removed.
String result = Regex.Replace(sResult, @"<[^>]*>", String.Empty);
我正在从 MySQL 获取数据,但问题是其中的 HTML 标签,例如
<p>LARGE</p><p>Lamb;<br>;li;ul;
也随数据一起被取了出来,而我只需要上面这一行中的 "LARGE" 和 "Lamb"。我怎样才能从字符串中分离/删除 HTML 标签?
我假设 HTML 完好无损,可能如下所示:
<ul><li><p>LARGE</p><p>Lamb<br></li></ul>
在这种情况下,我会使用 HtmlAgilityPack 来获取内容,而不必求助于正则表达式。
// Load the markup into an HtmlAgilityPack document instead of pattern-matching it.
string html = "<ul><li><p>LARGE</p><p>Lamb</p><br></li></ul> ";
HtmlDocument hap = new HtmlDocument();
hap.LoadHtml(html);

// The text of the whole document, with HTML entities decoded.
string rawText = hap.DocumentNode.InnerText;
string text = HtmlEntity.DeEntitize(rawText);
// text is now "LARGELamb "

// One array entry per text node, each with HTML entities decoded.
var textNodes = hap.DocumentNode.SelectNodes("//text()");
string[] lines = textNodes
    .Select(node => HtmlEntity.DeEntitize(node.InnerText))
    .ToArray();
// lines is { "LARGE", "Lamb", " " }
如果我们假设您会先修复好您的 HTML 元素。
/// <summary>
/// Parses a hard-coded HTML fragment and writes the inner HTML of each
/// <c>p</c> element to the console, one per line.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Run the input through the entity decoder before handing it to the parser.
    string html = WebUtility.HtmlDecode("<p>LARGE</p><p>Lamb</p>");

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Walk every descendant node and keep only those named "p".
    var paragraphs = doc.DocumentNode
        .Descendants()
        .Where(candidate => candidate.Name == "p")
        .ToList();

    paragraphs.ForEach(paragraph => Console.WriteLine(paragraph.InnerHtml));
}
你需要使用 HTML Agility Pack。你可以这样添加引用:
Install-Package HtmlAgilityPack
假设:
- 原始字符串始终采用该特定格式,并且
- 您无法添加 HtmlAgilityPack,
那么下面是获得所需内容的一种快速而粗糙的方法:
/// <summary>
/// Quick-and-dirty extraction of the text items from a hard-coded string of one
/// fixed shape: splits it on the paragraph boundary, strips the leading and
/// trailing leftover markup from each piece, and prints each piece on its own line.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Split original string on the 'separator' string.
    string originalString = "<p>LARGE</p><p>Lamb;<br>;li;ul; ";
    string[] sSeparator = new string[] { "</p><p>" };
    string[] splitString = originalString.Split(sSeparator, StringSplitOptions.None);

    // Leftover markup to strip from the pieces.
    string prefix = "<p>";
    string postfix = ";<br>;li;ul; ";

    // Iterate over the split string and clean up.
    for (int i = 0; i < splitString.Length; i++)
    {
        string s = splitString[i];

        // Look each marker up once, with an ordinal comparison, and reuse the
        // index. Pairing Contains (ordinal) with IndexOf(string) (culture-sensitive)
        // can disagree and hand Remove an index of -1.
        int prefixIndex = s.IndexOf(prefix, StringComparison.Ordinal);
        if (prefixIndex >= 0)
        {
            s = s.Remove(prefixIndex, prefix.Length);
        }

        int postfixIndex = s.IndexOf(postfix, StringComparison.Ordinal);
        if (postfixIndex >= 0)
        {
            s = s.Remove(postfixIndex, postfix.Length);
        }

        splitString[i] = s;
        Console.WriteLine(s);
    }

    // Block until a line is read from standard input (presumably to keep the
    // console window open).
    Console.ReadLine();
}
试试这个:
// erase html tags from a string
/// <summary>
/// Erases HTML tags from a string.
/// </summary>
/// <param name="target">The text to strip tags from.</param>
/// <returns>
/// <paramref name="target"/> with every match of the tag pattern removed.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
public static string StripHtml(string target)
{
    if (target == null)
    {
        throw new ArgumentNullException(nameof(target));
    }

    // A tag is '<', one non-whitespace character, any run of characters other
    // than '<' or '>', then '>'. The literal must be verbatim (@"..."): in a
    // regular string literal "\S" is an unrecognized escape and does not compile.
    // The static Regex.Replace overload reuses a cached regex, so the pattern is
    // not rebuilt on every call.
    return Regex.Replace(
        target,
        @"<\S[^><]*>",
        string.Empty,
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.Compiled);
}
调用方式:
// Example call: pass a small HTML snippet through StripHtml.
var htmlString = "<div><span>hello world!</span></div>";
var strippedString = StripHtml(htmlString);
// Decode HTML entities (&lt; &gt; etc.) back into the characters they stand for.
String sResult = HttpUtility.HtmlDecode(sData);
// Remove HTML tags delimited by <>. The pattern has to begin at '<'; with any
// literal text in front of it (such as pasted editor placeholder text) ordinary
// tags would never match and nothing would be removed.
String result = Regex.Replace(sResult, @"<[^>]*>", String.Empty);