PHP: 将 HTML href 解析器函数更改为仅在 url 中找到静态字符串时才匹配
PHP: Change HTML href Parser function to only match if it finds a static string in url
我正在尝试修改一些代码来解析 html 超链接的文本并将它们放入数据库中。
我想做的更改是仅在 html 超链接包含特定文本时才匹配,例如:
<a href="http://example.com/images/test1.jpg">my image</a>
不会匹配但是
<a href="http://example.com/thisismyunique/string/test2.jpg">my image2</a>
将被匹配,因为它的 url 中包含“/thisismyunique/string”。
有什么想法吗?
class blcHTMLLink extends blcParser {
var $supported_formats = array('html');
/**
* Parse a string for HTML links - <a href="URL">anchor text</a>
*
* @param string $content The text to parse.
* @param string $base_url The base URL to use for normalizing relative URLs. If omitted, the blog's root URL will be used.
* @param string $default_link_text
* @return array An array of new blcLinkInstance objects. The objects will include info about the links found, but not about the corresponding container entity.
*/
function parse($content, $base_url = '', $default_link_text = ''){
    // Replace every <code>...</code> block with a single space before scanning for links.
    $without_code = preg_replace('/<code[^>]*>.+?<\/code>/si', ' ', $content);

    // Extra values passed along to the callback for each link that map() reports.
    $callback_args = array(
        'base_url'          => $base_url,
        'default_link_text' => $default_link_text,
    );
    $callback = array($this, 'parser_callback');

    // parser_callback() returns null for links it rejects; array_filter() drops
    // those entries (original array keys are kept as-is).
    return array_filter($this->map($without_code, $callback, $callback_args));
}
/**
* blcHTMLLink::parser_callback()
*
* @access private
*
* @param array $link
* @param array $params
* @return blcLinkInstance|null
*/
function parser_callback($link, $params){
    global $blclog;

    // Every log message below starts with "<class>:<function>".
    $log_prefix = __CLASS__ .':' . __FUNCTION__;

    $base_url = $params['base_url'];
    $raw_url  = $link['href'];

    // Strip surrounding whitespace, then execute any shortcodes embedded in the URL.
    $url = do_shortcode(trim($raw_url));

    // Nothing left after trimming/shortcode expansion - reject the link.
    if ( empty($url) ){
        $blclog->warn($log_prefix . ' Skipping the link (empty URL)');
        return null;
    }

    // Reject URLs that parse_url() cannot make sense of.
    $parts = @parse_url($url);
    if ( !$parts ) {
        $blclog->warn($log_prefix . ' Skipping the link (parse_url failed)', $url);
        return null;
    }

    // A URL without a scheme is treated as relative and resolved against the base URL.
    //TODO: Also log the original URL and base URL.
    if ( !isset($parts['scheme']) ){
        $url = $this->relative2absolute($url, $base_url);
        $blclog->info($log_prefix . ' Convert relative URL to absolute. Absolute URL = "' . $url . '"');
    }

    // Second validity check: the (possibly rewritten) URL must be non-empty
    // and at least 6 bytes long.
    $is_usable = $url && (strlen($url) >= 6);
    if ( !$is_usable ) {
        $blclog->info($log_prefix . ' Skipping the link (invalid/short URL)', $url);
        return null;
    }

    // Strip left-to-right marks (U+200E). See: https://en.wikipedia.org/wiki/Left-to-right_mark
    $url = str_replace(json_decode('"\u200E"'), '', $url);

    $anchor_text = $link['#link_text'];

    // The URL passed all checks: build the link instance and attach the link record.
    $instance = new blcLinkInstance();
    $instance->set_parser($this);
    $instance->raw_url   = $raw_url;
    $instance->link_text = $anchor_text;

    $instance->set_link(new blcLink($url)); //Creates or loads the link

    return $instance;
}
如果您的 $link 参数中已经有一个 href 索引,并且它包含 URL,您可以轻松地这样做:
// Substring that the link URL is tested for.
$blockedWord = '/thisismyunique/string';
// strpos() gives the integer offset of the first occurrence, or false when there is none.
$blockedWordPosition = strpos($link['href'], $blockedWord);
// Strict comparison is required: offset 0 (match at the very start) is falsy but still a match.
$hasBlockedWord = $blockedWordPosition !== false;
请注意:如果 needle 出现在 haystack 字符串的开头,strpos 会返回 0,因此必须使用 !== false 进行严格比较。
在此处了解更多信息:
我正在尝试修改一些代码来解析 html 超链接的文本并将它们放入数据库中。
我想做的更改是仅在 html 超链接包含特定文本时才匹配,例如:
<a href="http://example.com/images/test1.jpg">my image</a>
不会匹配但是
<a href="http://example.com/thisismyunique/string/test2.jpg">my image2</a>
将被匹配,因为它的 url 中包含“/thisismyunique/string”。
有什么想法吗?
class blcHTMLLink extends blcParser {
var $supported_formats = array('html');
/**
* Parse a string for HTML links - <a href="URL">anchor text</a>
*
* @param string $content The text to parse.
* @param string $base_url The base URL to use for normalizing relative URLs. If omitted, the blog's root URL will be used.
* @param string $default_link_text
* @return array An array of new blcLinkInstance objects. The objects will include info about the links found, but not about the corresponding container entity.
*/
function parse($content, $base_url = '', $default_link_text = ''){
    // Replace every <code>...</code> block with a single space before scanning for links.
    $without_code = preg_replace('/<code[^>]*>.+?<\/code>/si', ' ', $content);

    // Extra values passed along to the callback for each link that map() reports.
    $callback_args = array(
        'base_url'          => $base_url,
        'default_link_text' => $default_link_text,
    );
    $callback = array($this, 'parser_callback');

    // parser_callback() returns null for links it rejects; array_filter() drops
    // those entries (original array keys are kept as-is).
    return array_filter($this->map($without_code, $callback, $callback_args));
}
/**
* blcHTMLLink::parser_callback()
*
* @access private
*
* @param array $link
* @param array $params
* @return blcLinkInstance|null
*/
function parser_callback($link, $params){
    global $blclog;

    // Every log message below starts with "<class>:<function>".
    $log_prefix = __CLASS__ .':' . __FUNCTION__;

    $base_url = $params['base_url'];
    $raw_url  = $link['href'];

    // Strip surrounding whitespace, then execute any shortcodes embedded in the URL.
    $url = do_shortcode(trim($raw_url));

    // Nothing left after trimming/shortcode expansion - reject the link.
    if ( empty($url) ){
        $blclog->warn($log_prefix . ' Skipping the link (empty URL)');
        return null;
    }

    // Reject URLs that parse_url() cannot make sense of.
    $parts = @parse_url($url);
    if ( !$parts ) {
        $blclog->warn($log_prefix . ' Skipping the link (parse_url failed)', $url);
        return null;
    }

    // A URL without a scheme is treated as relative and resolved against the base URL.
    //TODO: Also log the original URL and base URL.
    if ( !isset($parts['scheme']) ){
        $url = $this->relative2absolute($url, $base_url);
        $blclog->info($log_prefix . ' Convert relative URL to absolute. Absolute URL = "' . $url . '"');
    }

    // Second validity check: the (possibly rewritten) URL must be non-empty
    // and at least 6 bytes long.
    $is_usable = $url && (strlen($url) >= 6);
    if ( !$is_usable ) {
        $blclog->info($log_prefix . ' Skipping the link (invalid/short URL)', $url);
        return null;
    }

    // Strip left-to-right marks (U+200E). See: https://en.wikipedia.org/wiki/Left-to-right_mark
    $url = str_replace(json_decode('"\u200E"'), '', $url);

    $anchor_text = $link['#link_text'];

    // The URL passed all checks: build the link instance and attach the link record.
    $instance = new blcLinkInstance();
    $instance->set_parser($this);
    $instance->raw_url   = $raw_url;
    $instance->link_text = $anchor_text;

    $instance->set_link(new blcLink($url)); //Creates or loads the link

    return $instance;
}
如果您的 $link 参数中已经有一个 href 索引,并且它包含 URL,您可以轻松地这样做:
// Substring that the link URL is tested for.
$blockedWord = '/thisismyunique/string';
// strpos() gives the integer offset of the first occurrence, or false when there is none.
$blockedWordPosition = strpos($link['href'], $blockedWord);
// Strict comparison is required: offset 0 (match at the very start) is falsy but still a match.
$hasBlockedWord = $blockedWordPosition !== false;
请注意:如果 needle 出现在 haystack 字符串的开头,strpos 会返回 0,因此必须使用 !== false 进行严格比较。
在此处了解更多信息: