Hashtags, Mentions: Use a tag stack instead of regex for protecting tags (#455)

* Use a tag stack instead of regex for protecting tags

* Use the placeholder in the test

* Add comments

* Update comment

* ignor html comments

thanks @marcS0H

---------

Co-authored-by: Matthias Pfefferle <pfefferle@users.noreply.github.com>
This commit is contained in:
Alex Kirk 2023-09-21 17:03:57 +02:00 committed by GitHub
parent addd7dd8a1
commit 008ae52a53
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 62 deletions

View file

@ -43,38 +43,56 @@ class Hashtag {
* @return string the filtered post-content
*/
public static function the_content( $the_content ) {
$protected_tags = array();
$protect = function( $m ) use ( &$protected_tags ) {
$c = \wp_rand( 100000, 999999 );
$protect = '!#!#PROTECT' . $c . '#!#!';
while ( isset( $protected_tags[ $protect ] ) ) {
$c = \wp_rand( 100000, 999999 );
$protect = '!#!#PROTECT' . $c . '#!#!';
$tag_stack = array();
$protected_tags = array(
'pre',
'code',
'textarea',
'style',
'a',
);
$content_with_links = '';
$in_protected_tag = false;
foreach ( wp_html_split( $the_content ) as $chunk ) {
if ( preg_match( '#^<!--[\s\S]*-->$#i', $chunk, $m ) ) {
$content_with_links .= $chunk;
continue;
}
$protected_tags[ $protect ] = $m[0];
return $protect;
};
$the_content = preg_replace_callback(
'#<!\[CDATA\[.*?\]\]>#is',
$protect,
$the_content
);
$the_content = preg_replace_callback(
'#<(pre|code|textarea|style)\b[^>]*>.*?</\1[^>]*>#is',
$protect,
$the_content
);
$the_content = preg_replace_callback(
'#<[^>]+>#i',
$protect,
$the_content
);
$the_content = \preg_replace_callback( '/' . ACTIVITYPUB_HASHTAGS_REGEXP . '/i', array( '\Activitypub\Hashtag', 'replace_with_links' ), $the_content );
if ( preg_match( '#^<(/)?([a-z-]+)\b[^>]*>$#i', $chunk, $m ) ) {
$tag = strtolower( $m[2] );
if ( '/' === $m[1] ) {
// Closing tag.
$i = array_search( $tag, $tag_stack );
// We can only remove the tag from the stack if it is in the stack.
if ( false !== $i ) {
$tag_stack = array_slice( $tag_stack, 0, $i );
}
} else {
// Opening tag, add it to the stack.
$tag_stack[] = $tag;
}
$the_content = str_replace( array_reverse( array_keys( $protected_tags ) ), array_reverse( array_values( $protected_tags ) ), $the_content );
// If we're in a protected tag, the tag_stack contains at least one protected tag string.
// The protected tag state can only change when we encounter a start or end tag.
$in_protected_tag = array_intersect( $tag_stack, $protected_tags );
return $the_content;
// Never inspect tags.
$content_with_links .= $chunk;
continue;
}
if ( $in_protected_tag ) {
// Don't inspect a chunk inside an inspected tag.
$content_with_links .= $chunk;
continue;
}
// Only reachable when there is no protected tag in the stack.
$content_with_links .= \preg_replace_callback( '/' . ACTIVITYPUB_HASHTAGS_REGEXP . '/i', array( '\Activitypub\Hashtag', 'replace_with_links' ), $chunk );
}
return $content_with_links;
}
/**

View file

@ -25,43 +25,56 @@ class Mention {
* @return string the filtered post-content
*/
public static function the_content( $the_content ) {
$protected_tags = array();
$protect = function( $m ) use ( &$protected_tags ) {
$c = \wp_rand( 100000, 999999 );
$protect = '!#!#PROTECT' . $c . '#!#!';
while ( isset( $protected_tags[ $protect ] ) ) {
$c = \wp_rand( 100000, 999999 );
$protect = '!#!#PROTECT' . $c . '#!#!';
$tag_stack = array();
$protected_tags = array(
'pre',
'code',
'textarea',
'style',
'a',
);
$content_with_links = '';
$in_protected_tag = false;
foreach ( wp_html_split( $the_content ) as $chunk ) {
if ( preg_match( '#^<!--[\s\S]*-->$#i', $chunk, $m ) ) {
$content_with_links .= $chunk;
continue;
}
$protected_tags[ $protect ] = $m[0];
return $protect;
};
$the_content = preg_replace_callback(
'#<!\[CDATA\[.*?\]\]>#is',
$protect,
$the_content
);
$the_content = preg_replace_callback(
'#<(pre|code|textarea|style)\b[^>]*>.*?</\1[^>]*>#is',
$protect,
$the_content
);
$the_content = preg_replace_callback(
'#<a.*?href=[^>]+>.*?</a>#i',
$protect,
$the_content
);
$the_content = preg_replace_callback(
'#<img.*?[^>]+>#i',
$protect,
$the_content
);
if ( preg_match( '#^<(/)?([a-z-]+)\b[^>]*>$#i', $chunk, $m ) ) {
$tag = strtolower( $m[2] );
if ( '/' === $m[1] ) {
// Closing tag.
$i = array_search( $tag, $tag_stack );
// We can only remove the tag from the stack if it is in the stack.
if ( false !== $i ) {
$tag_stack = array_slice( $tag_stack, 0, $i );
}
} else {
// Opening tag, add it to the stack.
$tag_stack[] = $tag;
}
$the_content = \preg_replace_callback( '/@' . ACTIVITYPUB_USERNAME_REGEXP . '/', array( self::class, 'replace_with_links' ), $the_content );
$the_content = \str_replace( array_reverse( array_keys( $protected_tags ) ), array_reverse( array_values( $protected_tags ) ), $the_content );
// If we're in a protected tag, the tag_stack contains at least one protected tag string.
// The protected tag state can only change when we encounter a start or end tag.
$in_protected_tag = array_intersect( $tag_stack, $protected_tags );
return $the_content;
// Never inspect tags.
$content_with_links .= $chunk;
continue;
}
if ( $in_protected_tag ) {
// Don't inspect a chunk inside an inspected tag.
$content_with_links .= $chunk;
continue;
}
// Only reachable when there is no protected tag in the stack.
$content_with_links .= \preg_replace_callback( '/@' . ACTIVITYPUB_USERNAME_REGEXP . '/', array( self::class, 'replace_with_links' ), $chunk );
}
return $content_with_links;
}
/**

View file

@ -41,8 +41,9 @@ ENDPRE;
array( 'hallo <a href="http://test.test/#object">#test</a> test', 'hallo <a href="http://test.test/#object">#test</a> test' ),
array( '<div>hallo #object test</div>', '<div>hallo <a rel="tag" class="hashtag u-tag u-category" href="%s">#object</a> test</div>' ),
array( '<div>hallo #object</div>', '<div>hallo <a rel="tag" class="hashtag u-tag u-category" href="%s">#object</a></div>' ),
array( '<div>#object</div>', '<div>#object</div>' ),
array( '<div>#object</div>', '<div><a rel="tag" class="hashtag u-tag u-category" href="%s">#object</a></div>' ),
array( '<a>#object</a>', '<a>#object</a>' ),
array( '<!-- #object -->', '<!-- #object -->' ),
array( '<div style="color: #ccc;">object</a>', '<div style="color: #ccc;">object</a>' ),
array( $code, $code ),
array( $style, $style ),

View file

@ -33,6 +33,7 @@ ENDPRE;
array( 'hallo <a rel="mention" class="u-url mention" href="https://notiz.blog/author/matthias-pfefferle/">@pfefferle@notiz.blog</a> test', 'hallo <a rel="mention" class="u-url mention" href="https://notiz.blog/author/matthias-pfefferle/">@pfefferle@notiz.blog</a> test' ),
array( 'hallo <a rel="mention" class="u-url mention" href="https://notiz.blog/@pfefferle/">@pfefferle@notiz.blog</a> test', 'hallo <a rel="mention" class="u-url mention" href="https://notiz.blog/@pfefferle/">@pfefferle@notiz.blog</a> test' ),
array( 'hallo <img src="abc" alt="https://notiz.blog/@pfefferle/" title="@pfefferle@notiz.blog"/> test', 'hallo <img src="abc" alt="https://notiz.blog/@pfefferle/" title="@pfefferle@notiz.blog"/> test' ),
array( '<!-- @pfefferle@notiz.blog -->', '<!-- @pfefferle@notiz.blog -->' ),
array( $code, $code ),
array( $pre, $pre ),
);