有没有比我尝试编写代码更简单的方法来擦除为 html 节点设置的所有固定大小?
Is there an easier way to wipe all fixed sizes set on HTML nodes than the code I am trying to write?
我有一个这样的输入字符串:
<span id = 'RTF_Text_101' style="font-family:'Arial';font-size:12pt;text-align:left;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">aeqw</span></p>
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">qwe</span></p>
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">qwe</span></p>
<table cellspacing="0" cellpadding="0pt" style="width:498.2pt;border-collapse:collapse;">
<colgroup>
<col width="332" />
<col width="332" />
</colgroup>
<tr align="left" valign="top">
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd1</span></p>
</td>
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd3</span></p>
</td>
</tr>
<tr align="left" valign="top">
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd2</span></p>
</td>
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd4</span></p>
</td>
</tr>
</table>
</span>
这些是由 rtf 到 html 转换工具生成的。问题是它使 tables 具有固定大小,而我需要它是动态的。
因此,我需要替换此字符串中属于 table 相关标签(table、tr、td)的所有尺寸(宽度、高度)。
我开始编写代码:
/// <summary>
/// Strips <c>width</c> and <c>height</c> settings from every opening tag named
/// <paramref name="tag"/> that occurs in <paramref name="str"/>.
/// </summary>
/// <param name="str">
/// The (possibly large) HTML fragment to clean. It is replaced with the cleaned text;
/// a null or empty value is left untouched.
/// </param>
/// <param name="tag">Tag name without angle brackets, e.g. <c>table</c> or <c>td</c>.</param>
/// <remarks>
/// The end of an opening tag is taken to be the first closing angle bracket after the tag
/// name, so a closing angle bracket inside a quoted attribute value is not supported.
/// The actual removal is delegated to <c>RemoveAttributeFromString</c>.
/// </remarks>
public void RemoveSizesFromStringForTag(ref string str, string tag)
{
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(tag))
    {
        return;
    }

    string opening = "<" + tag;
    int searchPos = 0; // position from which the next tag is searched

    while (searchPos < str.Length)
    {
        // HTML tag names are case-insensitive; ordinal comparison avoids culture surprises.
        int tagStart = str.IndexOf(opening, searchPos, System.StringComparison.OrdinalIgnoreCase);
        if (tagStart < 0)
        {
            break;
        }

        // Require a whole tag name: "<tr" must not match "<track", "<t" must not match "<table".
        int nameEnd = tagStart + opening.Length;
        if (nameEnd < str.Length)
        {
            char next = str[nameEnd];
            if (char.IsLetterOrDigit(next) || next == '-' || next == '_' || next == ':')
            {
                searchPos = nameEnd;
                continue;
            }
        }

        int tagEnd = str.IndexOf('>', nameEnd);
        if (tagEnd < 0)
        {
            // Unterminated tag: nothing sensible left to process.
            break;
        }

        // Text of the opening tag without its closing angle bracket, e.g. "<table ... ".
        string originalTag = str.Substring(tagStart, tagEnd - tagStart);
        string cleanedTag = originalTag;

        if (cleanedTag.IndexOf("width", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            RemoveAttributeFromString(ref cleanedTag, "width");
        }

        if (cleanedTag.IndexOf("height", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            RemoveAttributeFromString(ref cleanedTag, "height");
        }

        if (!string.Equals(cleanedTag, originalTag, System.StringComparison.Ordinal))
        {
            // Strings are immutable: Remove/Insert return a new string that must be assigned back.
            str = str.Remove(tagStart, originalTag.Length).Insert(tagStart, cleanedTag);
        }

        // Continue right after the (possibly shortened) tag text.
        searchPos = tagStart + cleanedTag.Length;
    }
}
/// <summary>
/// Removes every setting of <paramref name="attr"/> from the text of a single opening tag.
/// </summary>
/// <param name="str">
/// Text of one opening tag (e.g. <c>&lt;td id='a'width='10%' style="width:5pt;"</c>).
/// It is replaced with the cleaned text; a null or empty value is left untouched.
/// </param>
/// <param name="attr">Name of the attribute / CSS property to remove, e.g. <c>width</c>.</param>
/// <remarks>
/// Two forms are removed, case-insensitively:
/// the HTML attribute (<c>attr=10</c>, <c>attr = '10'</c>, <c>attr="10"</c>, with or without
/// surrounding spaces) and the CSS declaration (<c>attr: value;</c>) as found inside a
/// <c>style</c> attribute. Longer names that merely end with <paramref name="attr"/>,
/// such as <c>border-width</c> or <c>data-width</c>, are left alone.
/// The match is purely textual, so the same patterns inside unrelated attribute values
/// (e.g. a <c>title</c>) would be removed as well.
/// </remarks>
public void RemoveAttributeFromString(ref string str, string attr)
{
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(attr))
    {
        return;
    }

    string name = System.Text.RegularExpressions.Regex.Escape(attr);
    System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant;

    // The look-behind rejects names that only end with attr (border-width, min-height, ...).
    // Value alternatives: "double quoted", 'single quoted', or an unquoted run.
    string attributePattern =
        @"(?<![\w-])" + name + @"\s*=\s*(?:""[^""]*""|'[^']*'|[^\s""'>]*)\s*";

    // CSS declaration up to and including its terminating ';' (or the end of the style value).
    string declarationPattern =
        @"(?<![\w-])" + name + @"\s*:[^;""'>]*;?\s*";

    str = System.Text.RegularExpressions.Regex.Replace(str, attributePattern, string.Empty, options);
    str = System.Text.RegularExpressions.Regex.Replace(str, declarationPattern, string.Empty, options);
}
写到一半时,我发现这种方法非常困难,因为尺寸可以通过多种方式设置 (width=10px, width = 10px, width = '10px', id='asd'width="10px"style='...')。
情况太多,很难确定要删除的属性的起止边界。
那么,有没有更简单的方法呢?
这段输入可以当作格式良好的 XML 来解析(至少您发布的示例是这样):
编辑:
下面是更易读的版本:
/// <summary>
/// Parses <paramref name="original"/> as XML and removes all fixed sizes (the
/// <c>width</c>/<c>height</c> attributes and the <c>width</c>/<c>height</c> declarations
/// inside <c>style</c>) from table-related elements.
/// </summary>
/// <param name="original">Well-formed XML/XHTML fragment with a single root element.</param>
/// <returns>The serialized fragment (as produced by <c>XElement.ToString()</c>) without the sizes.</returns>
/// <remarks>
/// <c>XElement.Parse</c> throws if the input is null or not well-formed XML, so this only
/// works for HTML that is also valid XML.
/// </remarks>
private static string RemoveHeightsAndWidths(string original)
{
    XElement element = XElement.Parse(original);

    // Add more element names here to strip height and width from them in the same manner.
    XName[] sizedElementNames = { "table", "tr", "td", "th" };

    // DescendantsAndSelf: the root itself may be a <table>.
    var tableRelatedElements = element.DescendantsAndSelf()
        .Where(e => sizedElementNames.Contains(e.Name))
        .ToList();

    // The look-behind keeps "border-width", "line-height", "min-width", ... intact.
    // "[^;]*;?" also covers a final declaration that has no terminating ';'.
    Regex reg = new Regex(
        @"(?<![\w-])(?:width|height)\s*:[^;]*;?\s*",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    foreach (XElement item in tableRelatedElements)
    {
        XAttribute style = item.Attribute("style");
        if (style != null)
        {
            string cleaned = reg.Replace(style.Value, string.Empty).Trim();
            if (cleaned.Length == 0)
            {
                // Nothing left but sizes: drop the now-empty style attribute.
                style.Remove();
            }
            else
            {
                style.Value = cleaned;
            }
        }

        // Passing null deletes the attribute if present and is a no-op otherwise.
        item.SetAttributeValue("height", null);
        item.SetAttributeValue("width", null);
    }

    return element.ToString();
}
我有一个这样的输入字符串:
<span id = 'RTF_Text_101' style="font-family:'Arial';font-size:12pt;text-align:left;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">aeqw</span></p>
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">qwe</span></p>
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">qwe</span></p>
<table cellspacing="0" cellpadding="0pt" style="width:498.2pt;border-collapse:collapse;">
<colgroup>
<col width="332" />
<col width="332" />
</colgroup>
<tr align="left" valign="top">
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd1</span></p>
</td>
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd3</span></p>
</td>
</tr>
<tr align="left" valign="top">
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd2</span></p>
</td>
<td style="width:244pt; padding-right:1.8pt; padding-left:1.8pt; border-top: 1pt solid #000000; border-right: 1pt solid #000000; border-bottom: 1pt solid #000000; border-left: 1pt solid #000000;">
<p lang="en-US" style="margin-top:0pt;margin-bottom:0pt;"><span style="font-size:10pt;">asd4</span></p>
</td>
</tr>
</table>
</span>
这些是由 rtf 到 html 转换工具生成的。问题是它使 tables 具有固定大小,而我需要它是动态的。
因此,我需要替换此字符串中属于 table 相关标签(table、tr、td)的所有尺寸(宽度、高度)。
我开始编写代码:
/// <summary>
/// Strips <c>width</c> and <c>height</c> settings from every opening tag named
/// <paramref name="tag"/> that occurs in <paramref name="str"/>.
/// </summary>
/// <param name="str">
/// The (possibly large) HTML fragment to clean. It is replaced with the cleaned text;
/// a null or empty value is left untouched.
/// </param>
/// <param name="tag">Tag name without angle brackets, e.g. <c>table</c> or <c>td</c>.</param>
/// <remarks>
/// The end of an opening tag is taken to be the first closing angle bracket after the tag
/// name, so a closing angle bracket inside a quoted attribute value is not supported.
/// The actual removal is delegated to <c>RemoveAttributeFromString</c>.
/// </remarks>
public void RemoveSizesFromStringForTag(ref string str, string tag)
{
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(tag))
    {
        return;
    }

    string opening = "<" + tag;
    int searchPos = 0; // position from which the next tag is searched

    while (searchPos < str.Length)
    {
        // HTML tag names are case-insensitive; ordinal comparison avoids culture surprises.
        int tagStart = str.IndexOf(opening, searchPos, System.StringComparison.OrdinalIgnoreCase);
        if (tagStart < 0)
        {
            break;
        }

        // Require a whole tag name: "<tr" must not match "<track", "<t" must not match "<table".
        int nameEnd = tagStart + opening.Length;
        if (nameEnd < str.Length)
        {
            char next = str[nameEnd];
            if (char.IsLetterOrDigit(next) || next == '-' || next == '_' || next == ':')
            {
                searchPos = nameEnd;
                continue;
            }
        }

        int tagEnd = str.IndexOf('>', nameEnd);
        if (tagEnd < 0)
        {
            // Unterminated tag: nothing sensible left to process.
            break;
        }

        // Text of the opening tag without its closing angle bracket, e.g. "<table ... ".
        string originalTag = str.Substring(tagStart, tagEnd - tagStart);
        string cleanedTag = originalTag;

        if (cleanedTag.IndexOf("width", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            RemoveAttributeFromString(ref cleanedTag, "width");
        }

        if (cleanedTag.IndexOf("height", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            RemoveAttributeFromString(ref cleanedTag, "height");
        }

        if (!string.Equals(cleanedTag, originalTag, System.StringComparison.Ordinal))
        {
            // Strings are immutable: Remove/Insert return a new string that must be assigned back.
            str = str.Remove(tagStart, originalTag.Length).Insert(tagStart, cleanedTag);
        }

        // Continue right after the (possibly shortened) tag text.
        searchPos = tagStart + cleanedTag.Length;
    }
}
/// <summary>
/// Removes every setting of <paramref name="attr"/> from the text of a single opening tag.
/// </summary>
/// <param name="str">
/// Text of one opening tag (e.g. <c>&lt;td id='a'width='10%' style="width:5pt;"</c>).
/// It is replaced with the cleaned text; a null or empty value is left untouched.
/// </param>
/// <param name="attr">Name of the attribute / CSS property to remove, e.g. <c>width</c>.</param>
/// <remarks>
/// Two forms are removed, case-insensitively:
/// the HTML attribute (<c>attr=10</c>, <c>attr = '10'</c>, <c>attr="10"</c>, with or without
/// surrounding spaces) and the CSS declaration (<c>attr: value;</c>) as found inside a
/// <c>style</c> attribute. Longer names that merely end with <paramref name="attr"/>,
/// such as <c>border-width</c> or <c>data-width</c>, are left alone.
/// The match is purely textual, so the same patterns inside unrelated attribute values
/// (e.g. a <c>title</c>) would be removed as well.
/// </remarks>
public void RemoveAttributeFromString(ref string str, string attr)
{
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(attr))
    {
        return;
    }

    string name = System.Text.RegularExpressions.Regex.Escape(attr);
    System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant;

    // The look-behind rejects names that only end with attr (border-width, min-height, ...).
    // Value alternatives: "double quoted", 'single quoted', or an unquoted run.
    string attributePattern =
        @"(?<![\w-])" + name + @"\s*=\s*(?:""[^""]*""|'[^']*'|[^\s""'>]*)\s*";

    // CSS declaration up to and including its terminating ';' (or the end of the style value).
    string declarationPattern =
        @"(?<![\w-])" + name + @"\s*:[^;""'>]*;?\s*";

    str = System.Text.RegularExpressions.Regex.Replace(str, attributePattern, string.Empty, options);
    str = System.Text.RegularExpressions.Regex.Replace(str, declarationPattern, string.Empty, options);
}
写到一半时,我发现这种方法非常困难,因为尺寸可以通过多种方式设置 (width=10px, width = 10px, width = '10px', id='asd'width="10px"style='...')。
情况太多,很难确定要删除的属性的起止边界。
那么,有没有更简单的方法呢?
这段输入可以当作格式良好的 XML 来解析(至少您发布的示例是这样):
编辑:
下面是更易读的版本:
/// <summary>
/// Parses <paramref name="original"/> as XML and removes all fixed sizes (the
/// <c>width</c>/<c>height</c> attributes and the <c>width</c>/<c>height</c> declarations
/// inside <c>style</c>) from table-related elements.
/// </summary>
/// <param name="original">Well-formed XML/XHTML fragment with a single root element.</param>
/// <returns>The serialized fragment (as produced by <c>XElement.ToString()</c>) without the sizes.</returns>
/// <remarks>
/// <c>XElement.Parse</c> throws if the input is null or not well-formed XML, so this only
/// works for HTML that is also valid XML.
/// </remarks>
private static string RemoveHeightsAndWidths(string original)
{
    XElement element = XElement.Parse(original);

    // Add more element names here to strip height and width from them in the same manner.
    XName[] sizedElementNames = { "table", "tr", "td", "th" };

    // DescendantsAndSelf: the root itself may be a <table>.
    var tableRelatedElements = element.DescendantsAndSelf()
        .Where(e => sizedElementNames.Contains(e.Name))
        .ToList();

    // The look-behind keeps "border-width", "line-height", "min-width", ... intact.
    // "[^;]*;?" also covers a final declaration that has no terminating ';'.
    Regex reg = new Regex(
        @"(?<![\w-])(?:width|height)\s*:[^;]*;?\s*",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    foreach (XElement item in tableRelatedElements)
    {
        XAttribute style = item.Attribute("style");
        if (style != null)
        {
            string cleaned = reg.Replace(style.Value, string.Empty).Trim();
            if (cleaned.Length == 0)
            {
                // Nothing left but sizes: drop the now-empty style attribute.
                style.Remove();
            }
            else
            {
                style.Value = cleaned;
            }
        }

        // Passing null deletes the attribute if present and is a no-op otherwise.
        item.SetAttributeValue("height", null);
        item.SetAttributeValue("width", null);
    }

    return element.ToString();
}