Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
cee0661
HTML API: Add tests for attribute value input preprocessing.
sirreal Jun 11, 2026
82a26aa
HTML API: Apply input preprocessing in get_attribute().
sirreal Jun 11, 2026
48d8fb4
HTML API: Add tests for class updates over preprocessed values.
sirreal Jun 11, 2026
d1f852c
HTML API: Apply input preprocessing when flushing class updates.
sirreal Jun 11, 2026
020155a
HTML API: Add tests for NULL bytes in attribute names.
sirreal Jun 11, 2026
442e820
HTML API: Replace NULL bytes in comparable attribute names.
sirreal Jun 11, 2026
135157f
HTML API: Add tests for NULL bytes in tag names.
sirreal Jun 11, 2026
5b8ad27
HTML API: Replace NULL bytes in tag names at the read boundary.
sirreal Jun 11, 2026
f6f58fd
HTML API: Add test for NULL bytes in API-supplied class values.
sirreal Jun 11, 2026
ba93ef4
HTML API: Stop replacing NULL bytes in API-supplied class values.
sirreal Jun 11, 2026
9baceb6
HTML API: Avoid re-scanning attribute values without CR or NULL bytes.
sirreal Jun 11, 2026
3b415d1
HTML API: Add tests for character references preceding replaced bytes.
sirreal Jun 11, 2026
8f5e8b2
HTML API: Replace NULL bytes after decoding attribute values.
sirreal Jun 11, 2026
e18f389
HTML API: Detect ambiguous character reference followers by ASCII only.
sirreal Jun 11, 2026
449bf72
HTML API: Add tests for tag-name queries over replaced names.
sirreal Jun 11, 2026
5c52634
HTML API: Match tag-name queries against replaced names.
sirreal Jun 11, 2026
5292c7d
HTML API: Add test for case-insensitive class update flushing.
sirreal Jun 11, 2026
8c26adf
HTML API: Flush class updates for any case spelling of "class".
sirreal Jun 11, 2026
e41d168
HTML API: Pin edge cases of replaced names and document boundaries.
sirreal Jun 11, 2026
88a4d52
Merge remote-tracking branch 'upstream/trunk' into HEAD
sirreal Jun 15, 2026
7f64468
Merge branch 'trunk' into spec-compliant-getters
sirreal Jul 1, 2026
62d682f
Revert irrelevant doc changes
sirreal Jul 1, 2026
b7d4b39
Remove redundant tests
sirreal Jul 1, 2026
30b17da
Remove excessive docs
sirreal Jul 1, 2026
7e451fe
Simplify tag name matching logic
sirreal Jul 1, 2026
b3d15f5
Remove excessive documentation
sirreal Jul 1, 2026
624fe63
simplify comment
sirreal Jul 1, 2026
5293caa
Remove excessive documentation
sirreal Jul 1, 2026
7f77e93
Simplify attribute value getter
sirreal Jul 1, 2026
908f4b3
Ignore new private method
sirreal Jul 1, 2026
fed8e18
Rework new function docs
sirreal Jul 1, 2026
3c61f03
Remove excessive docs
sirreal Jul 1, 2026
480bce8
Reformat comment
sirreal Jul 1, 2026
d701662
Test function types
sirreal Jul 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 39 additions & 20 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,7 @@ public function class_list() {
return;
}

$name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
$name = substr( $class, $at, $length );
if ( $is_quirks ) {
$name = strtolower( $name );
}
Expand Down Expand Up @@ -2261,8 +2261,13 @@ private function parse_next_attribute(): bool {
* - HTML 5 spec
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
* The tokenizer replaces U+0000 NULL bytes in
* attribute names with U+FFFD.
*
* @see https://html.spec.whatwg.org/#attribute-name-state
*/
$comparable_name = strtolower( $attribute_name );
$comparable_name = strtolower( str_replace( "\x00", "\u{FFFD}", $attribute_name ) );

// If an attribute is listed many times, only use the first declaration and ignore the rest.
if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
Expand Down Expand Up @@ -2389,13 +2394,7 @@ private function class_name_updates_to_attributes_updates(): void {
}

if ( false === $existing_class && isset( $this->attributes['class'] ) ) {
$existing_class = WP_HTML_Decoder::decode_attribute(
substr(
$this->html,
$this->attributes['class']->value_starts_at,
$this->attributes['class']->value_length
)
);
$existing_class = $this->get_decoded_attribute_value( $this->attributes['class'] );
}

if ( false === $existing_class ) {
Expand Down Expand Up @@ -2823,7 +2822,7 @@ public function get_attribute( $name ) {
* attribute values. If any exist, those enqueued class changes must first be flushed out
* into an attribute value update.
*/
if ( 'class' === $name ) {
if ( 'class' === $comparable ) {
$this->class_name_updates_to_attributes_updates();
}

Expand Down Expand Up @@ -2854,8 +2853,28 @@ public function get_attribute( $name ) {
return true;
}

$raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );
return $this->get_decoded_attribute_value( $attribute );
}

/**
 * Reads an attribute's value out of the input document and decodes it.
 *
 * The processor stores only the byte offsets of an attribute value, so the
 * steps it postponed while scanning are performed here, in this order:
 *  - CRLF pairs and lone carriage returns become a single newline.
 *  - NULL bytes become U+FFFD.
 *  - Character references are decoded.
 *
 * @since 7.1.0
 * @ignore
 *
 * @param WP_HTML_Attribute_Token $attribute Attribute token from the input document.
 * @return string Decoded attribute value.
 */
private function get_decoded_attribute_value( WP_HTML_Attribute_Token $attribute ): string {
	$source_text = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );

	/*
	 * Array search terms are applied one after another over the whole string,
	 * so CRLF pairs are collapsed before any remaining lone CR is rewritten.
	 */
	$preprocessed = str_replace(
		array( "\r\n", "\r", "\x00" ),
		array( "\n", "\n", "\u{FFFD}" ),
		$source_text
	);

	return WP_HTML_Decoder::decode_attribute( $preprocessed );
}

Expand Down Expand Up @@ -2936,7 +2955,7 @@ public function get_tag(): ?string {
return null;
}

$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
$tag_name = str_replace( "\x00", "\u{FFFD}", substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) );

if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
return strtoupper( $tag_name );
Expand Down Expand Up @@ -4798,14 +4817,14 @@ private function matches(): bool {
}

// Does the tag name match the requested tag name in a case-insensitive manner?
if (
isset( $this->sought_tag_name ) &&
(
strlen( $this->sought_tag_name ) !== $this->tag_name_length ||
0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
)
) {
return false;
if ( isset( $this->sought_tag_name ) ) {
$tag_name = $this->get_tag();
if (
strlen( $this->sought_tag_name ) !== strlen( $tag_name ) ||
0 !== substr_compare( $tag_name, $this->sought_tag_name, 0, null, true )
) {
return false;
}
}

if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
Expand Down
37 changes: 37 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDecoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,43 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}

/**
 * Verifies how the attribute decoder treats U+0000 in its two forms.
 *
 * Each dataset passes a raw attribute value straight to
 * WP_HTML_Decoder::decode_attribute() and compares the result against the
 * expected string. Numeric character references for U+0000 are expected to
 * come back as U+FFFD, while a raw NULL byte is expected to come back
 * unchanged, because replacing raw NULL bytes is left to the code that
 * reads values out of the input document rather than to the decoder.
 *
 * @ticket 65372
 *
 * @dataProvider data_null_code_points
 *
 * @param string $raw_value     Raw attribute value.
 * @param string $decoded_value The expected decoded attribute value.
 */
public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) {
	$actual_value = WP_HTML_Decoder::decode_attribute( $raw_value );

	$this->assertSame( $decoded_value, $actual_value, 'Improperly decoded raw attribute value.' );
}

/**
 * Data provider.
 *
 * Each dataset is a pair of raw attribute value and expected decoded value.
 * The first three spell U+0000 as a numeric character reference (decimal,
 * hexadecimal, and zero-padded), each of which must decode to U+FFFD. The
 * last one contains a raw NULL byte, which the decoder must leave as-is.
 *
 * The references are written out as ASCII text on purpose: a literal U+FFFD
 * in the input would make the first three datasets identical and would not
 * exercise character reference decoding at all.
 *
 * @return array[]
 */
public static function data_null_code_points() {
	return array(
		'Decimal zero'                 => array( 'a&#0;b', "a\u{FFFD}b" ),
		'Hexadecimal zero'             => array( 'a&#x0;b', "a\u{FFFD}b" ),
		'Multiple zeros'               => array( 'a&#0000;b', "a\u{FFFD}b" ),
		'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ),
	);
}

/**
* Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
*
Expand Down
Loading
Loading