' );
	 *
	 * @since 6.7.0
	 *
	 * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
	 *
	 * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
	 *                                   provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
	 */
	public static function from_doctype_token( string $doctype_html ): ?self {
		/*
		 * Every `new self( ... )` below passes the name, public identifier, and
		 * system identifier parsed so far, followed by a boolean. Judging by the
		 * quoted specification text next to each return, that boolean is the
		 * token's force-quirks flag (the constructor is defined outside this method).
		 */
		$doctype_name      = null;
		$doctype_public_id = null;
		$doctype_system_id = null;

		// Byte offset of the final byte, which must be the terminating `>`.
		$end = strlen( $doctype_html ) - 1;

		/*
		 * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
		 * specification for the DOCTYPE related tokenizer states.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-state
		 */

		/*
		 * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
		 *   ending in end-of-file.
		 * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
		 * - The only occurrence of `>` must be the final byte in the HTML string.
		 */
		if ( $end < 9 || 0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true ) ) {
			return null;
		}

		// Byte offset of the parsing cursor; starts just past the `<!DOCTYPE` prefix.
		$at = 9;

		// Is there one and only one `>`?
		if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
			return null;
		}

		/*
		 * Perform newline normalization and ensure the $end value is correct after normalization.
		 *
		 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
		 * @see https://infra.spec.whatwg.org/#normalize-newlines
		 */
		$doctype_html = str_replace( "\r\n", "\n", $doctype_html );
		$doctype_html = str_replace( "\r", "\n", $doctype_html );
		$end          = strlen( $doctype_html ) - 1;

		/*
		 * In this state, the doctype token has been found and its "content" optionally including the
		 * name, public identifier, and system identifier is between the current position and the end.
		 *
		 *     "<!DOCTYPE...declaration...>"
		 *               ╰─ $at           ╰─ $end
		 *
		 * It's also possible that the declaration part is empty.
		 *
		 *               ╭─ $at
		 *     "<!DOCTYPE>"
		 *               ╰─ $end
		 *
		 * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
		 * have been handled above in the condition that the provided DOCTYPE HTML must contain
		 * exactly one ">" character in the final position.
		 */

		/*
		 *
		 * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
		 * proceed to the next state.
		 *
		 * @see https://html.spec.whatwg.org/#before-doctype-name-state
		 */
		$at += strspn( $doctype_html, " \t\n\f\r", $at );

		// Nothing but whitespace before the closing `>`: there is no name at all.
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		// The name runs up to the next whitespace byte; it is lower-cased and NULL bytes become U+FFFD.
		$name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		$doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );

		$at += $name_length;
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );

		// Only a name was given, e.g. `<!DOCTYPE html>`.
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
		}

		/*
		 * "After DOCTYPE name state"
		 *
		 * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
		 * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
		 *
		 * @see https://html.spec.whatwg.org/#after-doctype-name-state
		 */
		if ( $at + 6 >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		/*
		 * > If the six characters starting from the current input character are an ASCII
		 * > case-insensitive match for the word "PUBLIC", then consume those characters
		 * > and switch to the after DOCTYPE public keyword state.
		 */
		if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
			$at += 6;
			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
			if ( $at >= $end ) {
				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
			}
			goto parse_doctype_public_identifier;
		}

		/*
		 * > Otherwise, if the six characters starting from the current input character are an ASCII
		 * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
		 * > to the after DOCTYPE system keyword state.
		 */
		if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
			$at += 6;
			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
			if ( $at >= $end ) {
				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
			}
			goto parse_doctype_system_identifier;
		}

		/*
		 * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
		 * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
		 * > DOCTYPE state.
		 */
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );

		parse_doctype_public_identifier:
		/*
		 * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
		 * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
		 * Anything else forces quirks mode and ignores the rest of the contents.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
		 */
		$closer_quote = $doctype_html[ $at ];

		/*
		 * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
		 */
		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		// The identifier is everything up to the matching quote; NULL bytes become U+FFFD.
		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
		$doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

		$at += $identifier_length;

		// Reaching the final `>` without finding the closing quote leaves the identifier unterminated.
		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		/*
		 * "Between DOCTYPE public and system identifiers state"
		 *
		 * Advance through whitespace between public and system identifiers.
		 *
		 * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
		 */
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );

		// A public identifier with no system identifier following it.
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
		}

		parse_doctype_system_identifier:
		/*
		 * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
		 * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
		 * Anything else forces quirks mode and ignores the rest of the contents.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
		 */
		$closer_quote = $doctype_html[ $at ];

		/*
		 * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
		 */
		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		// The identifier is everything up to the matching quote; NULL bytes become U+FFFD.
		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
		$doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

		$at += $identifier_length;

		// Reaching the final `>` without finding the closing quote leaves the identifier unterminated.
		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		// Anything between the closing quote and the final `>` is ignored.
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
	}
}