DOMDocument::loadHTML() 正在转换 HTML 实体(htmlentities)
DOMDocument::loadHTML() is converting htmlentities
有一个相关的问题“Preventing DOMDocument::loadHTML() from converting entities”,但它没有给出解决方案。
此代码:
// Reproduction: parse the markup with DOMDocument, then dump each <span>'s
// text three ways — raw, through htmlentities(), and through htmlentities()
// after an iconv() conversion from UTF-8 to ISO-8859-1.
$html = "<span>🆃🅴🆂🆃</span>";
$doc = new DOMDocument();
$doc->resolveExternals = false;
$doc->substituteEntities = false;
$doc->loadHTML($html);
$spans = $doc->getElementsByTagName('span');
for ($i = 0; $i < $spans->length; $i++) {
    $node = $spans->item($i);
    var_dump($node->nodeValue);
    var_dump(htmlentities($node->nodeValue));
    var_dump(htmlentities(iconv('UTF-8', 'ISO-8859-1', $node->nodeValue)));
}
产生这个 HTML:
string(16) "🆃🅴🆂🆃"
string(16) "🆃🅴🆂🆃"
string(0) ""
但我想要的是 &#127363;&#127348;&#127362;&#127363;
我运行的是 PHP 5.6.29,ini_get("default_charset") 返回 UTF-8。
在 http://php.net/manual/en/function.htmlentities.php 上阅读了更多内容后,我注意到 htmlentities() 并不会对所有 Unicode 字符进行编码。有人在评论中写了 superentities 函数,但那个函数似乎对我不起作用;UTF8entities 函数则可以做到。
下面是我根据评论区修改的两个函数以及调用代码,虽然不完全是我想要的,但它确实给出了 HTML 实体编码后的值。
// Parse the sample markup and dump the entity-encoded text of every <span>.
$html = "<span>🆃🅴🆂🆃</span>";
$doc = new DOMDocument();
$doc->resolveExternals = false;
$doc->substituteEntities = false;
$doc->loadHTML($html);
$spans = $doc->getElementsByTagName('span');
for ($i = 0; $i < $spans->length; $i++) {
    $node = $spans->item($i);
    var_dump(UTF8entities($node->nodeValue));
}
/**
 * Encode a UTF-8 string by passing each character through
 * unicode_entity_replace() and concatenating the results.
 *
 * @param string $content UTF-8 encoded text.
 * @return string The re-assembled string; an empty string for empty input.
 *                If $content is not valid UTF-8 it cannot be split into
 *                characters and is returned unchanged.
 */
function UTF8entities($content="") {
    // An empty pattern with the /u modifier splits between every code point.
    // PREG_SPLIT_NO_EMPTY drops the empty leading/trailing pieces, so a
    // trailing newline becomes its own element instead of staying attached
    // to the character before it.
    $characters = preg_split('//u', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($characters === false) {
        // preg_split() fails on malformed UTF-8; leave the input untouched.
        return $content;
    }

    $rv = '';
    foreach ($characters as $character) {
        $rv .= unicode_entity_replace($character);
    }
    return $rv;
}
/**
 * Convert one UTF-8 encoded character into a decimal numeric HTML entity.
 *
 * Only the first character of $c is examined. ASCII characters, bytes that
 * cannot start a multi-byte sequence (0x80-0xC1 and anything above 0xF4),
 * empty input and truncated sequences are returned unchanged.
 *
 * @param string $c A single UTF-8 character (1 to 4 bytes).
 * @return string "&#NNN;" for a multi-byte character, otherwise $c itself.
 */
function unicode_entity_replace($c) { //m. perez
    if (!is_string($c) || $c === '') {
        return $c;
    }

    $lead = ord($c[0]);
    if ($lead < 0xC2 || $lead > 0xF4) {
        // ASCII, a stray continuation byte, an overlong lead byte, or a byte
        // outside the UTF-8 range: nothing to encode.
        return $c;
    }

    // The lead byte determines the sequence length and how many of its own
    // low bits carry code point data.
    if ($lead <= 0xDF) {
        $length = 2;
        $codePoint = $lead & 0x1F;
    } elseif ($lead <= 0xEF) {
        $length = 3;
        $codePoint = $lead & 0x0F;
    } else {
        $length = 4;
        $codePoint = $lead & 0x07;
    }

    if (strlen($c) < $length) {
        // Truncated sequence: do not read past the end of the string.
        return $c;
    }

    // Each continuation byte contributes its low six bits.
    for ($i = 1; $i < $length; $i++) {
        $codePoint = ($codePoint << 6) | (ord($c[$i]) & 0x3F);
    }

    return "&#" . $codePoint . ";";
}
返回如下结果:
string(36) "&#127363;&#127348;&#127362;&#127363;"
有一个相关的问题“Preventing DOMDocument::loadHTML() from converting entities”,但它没有给出解决方案。
此代码:
// Reproduction: parse the markup with DOMDocument, then dump each <span>'s
// text three ways — raw, through htmlentities(), and through htmlentities()
// after an iconv() conversion from UTF-8 to ISO-8859-1.
$html = "<span>🆃🅴🆂🆃</span>";
$doc = new DOMDocument();
$doc->resolveExternals = false;
$doc->substituteEntities = false;
$doc->loadHTML($html);
$spans = $doc->getElementsByTagName('span');
for ($i = 0; $i < $spans->length; $i++) {
    $node = $spans->item($i);
    var_dump($node->nodeValue);
    var_dump(htmlentities($node->nodeValue));
    var_dump(htmlentities(iconv('UTF-8', 'ISO-8859-1', $node->nodeValue)));
}
产生这个 HTML:
string(16) "🆃🅴🆂🆃"
string(16) "🆃🅴🆂🆃"
string(0) ""
但我想要的是 &#127363;&#127348;&#127362;&#127363;
我运行的是 PHP 5.6.29,ini_get("default_charset") 返回 UTF-8。
在 http://php.net/manual/en/function.htmlentities.php 上阅读了更多内容后,我注意到 htmlentities() 并不会对所有 Unicode 字符进行编码。有人在评论中写了 superentities 函数,但那个函数似乎对我不起作用;UTF8entities 函数则可以做到。
下面是我根据评论区修改的两个函数以及调用代码,虽然不完全是我想要的,但它确实给出了 HTML 实体编码后的值。
// Parse the sample markup and dump the entity-encoded text of every <span>.
$html = "<span>🆃🅴🆂🆃</span>";
$doc = new DOMDocument();
$doc->resolveExternals = false;
$doc->substituteEntities = false;
$doc->loadHTML($html);
$spans = $doc->getElementsByTagName('span');
for ($i = 0; $i < $spans->length; $i++) {
    $node = $spans->item($i);
    var_dump(UTF8entities($node->nodeValue));
}
/**
 * Encode a UTF-8 string by passing each character through
 * unicode_entity_replace() and concatenating the results.
 *
 * @param string $content UTF-8 encoded text.
 * @return string The re-assembled string; an empty string for empty input.
 *                If $content is not valid UTF-8 it cannot be split into
 *                characters and is returned unchanged.
 */
function UTF8entities($content="") {
    // An empty pattern with the /u modifier splits between every code point.
    // PREG_SPLIT_NO_EMPTY drops the empty leading/trailing pieces, so a
    // trailing newline becomes its own element instead of staying attached
    // to the character before it.
    $characters = preg_split('//u', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($characters === false) {
        // preg_split() fails on malformed UTF-8; leave the input untouched.
        return $content;
    }

    $rv = '';
    foreach ($characters as $character) {
        $rv .= unicode_entity_replace($character);
    }
    return $rv;
}
/**
 * Convert one UTF-8 encoded character into a decimal numeric HTML entity.
 *
 * Only the first character of $c is examined. ASCII characters, bytes that
 * cannot start a multi-byte sequence (0x80-0xC1 and anything above 0xF4),
 * empty input and truncated sequences are returned unchanged.
 *
 * @param string $c A single UTF-8 character (1 to 4 bytes).
 * @return string "&#NNN;" for a multi-byte character, otherwise $c itself.
 */
function unicode_entity_replace($c) { //m. perez
    if (!is_string($c) || $c === '') {
        return $c;
    }

    $lead = ord($c[0]);
    if ($lead < 0xC2 || $lead > 0xF4) {
        // ASCII, a stray continuation byte, an overlong lead byte, or a byte
        // outside the UTF-8 range: nothing to encode.
        return $c;
    }

    // The lead byte determines the sequence length and how many of its own
    // low bits carry code point data.
    if ($lead <= 0xDF) {
        $length = 2;
        $codePoint = $lead & 0x1F;
    } elseif ($lead <= 0xEF) {
        $length = 3;
        $codePoint = $lead & 0x0F;
    } else {
        $length = 4;
        $codePoint = $lead & 0x07;
    }

    if (strlen($c) < $length) {
        // Truncated sequence: do not read past the end of the string.
        return $c;
    }

    // Each continuation byte contributes its low six bits.
    for ($i = 1; $i < $length; $i++) {
        $codePoint = ($codePoint << 6) | (ord($c[$i]) & 0x3F);
    }

    return "&#" . $codePoint . ";";
}
返回如下结果:
string(36) "&#127363;&#127348;&#127362;&#127363;"