From b6688433adc78b90328794b21e91e5ce0eeaf457 Mon Sep 17 00:00:00 2001
From: Laurent Lyaudet
Date: Fri, 13 Feb 2026 14:19:15 +0100
Subject: [PATCH 1/2] LL: add class InvalidEncodingException and flag
 VALIDATE_UTF8_ENCODING to call validateUTF8Encoding() function.

---
 .../JsonLint/InvalidEncodingException.php     |  47 +++++
 src/Seld/JsonLint/JsonParser.php              | 168 ++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 src/Seld/JsonLint/InvalidEncodingException.php

diff --git a/src/Seld/JsonLint/InvalidEncodingException.php b/src/Seld/JsonLint/InvalidEncodingException.php
new file mode 100644
--- /dev/null
+++ b/src/Seld/JsonLint/InvalidEncodingException.php
@@ -0,0 +1,47 @@
+<?php
+
+/*
+ * This file is part of the JSON Lint package.
+ *
+ * (c) Jordi Boggiano <j.boggiano@seld.be>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Seld\JsonLint;
+
+class InvalidEncodingException extends ParsingException
+{
+    /**
+     * @var array{key: int, line: int}
+     */
+    protected $details;
+
+    /**
+     * @param string $message
+     * @param int $key value of the octet the error is reported for
+     * @phpstan-param array{line: int} $details
+     */
+    public function __construct($message, $key, array $details)
+    {
+        $details['key'] = $key;
+        parent::__construct($message, $details);
+    }
+
+    /**
+     * @return int value of the octet the error is reported for
+     */
+    public function getKey()
+    {
+        return $this->details['key'];
+    }
+
+    /**
+     * @phpstan-return array{key: int, line: int}
+     */
+    public function getDetails()
+    {
+        return $this->details;
+    }
+}
diff --git a/src/Seld/JsonLint/JsonParser.php b/src/Seld/JsonLint/JsonParser.php
--- a/src/Seld/JsonLint/JsonParser.php
+++ b/src/Seld/JsonLint/JsonParser.php
@@ -32,6 +32,7 @@ class JsonParser
     const PARSE_TO_ASSOC = 4;
     const ALLOW_COMMENTS = 8;
     const ALLOW_DUPLICATE_KEYS_TO_ARRAY = 16;
+    const VALIDATE_UTF8_ENCODING = 32;
 
     /** @var Lexer */
     private $lexer;
@@ -184,6 +185,13 @@ class JsonParser
      */
     public function lint($input, $flags = 0)
     {
+        if ($flags & self::VALIDATE_UTF8_ENCODING) {
+            try {
+                $this->validateUTF8Encoding($input);
+            } catch (InvalidEncodingException $e) {
+                return $e;
+            }
+        }
         try {
             $this->parse($input, $flags);
         } catch (ParsingException $e) {
@@ -605,4 +613,164 @@ private function failOnBOM($input)
             $this->parseError("BOM detected, make sure your input does not include a Unicode Byte-Order-Mark");
         }
     }
+
+    /**
+     * Scans $input octet by octet and checks the UTF-8 framing: each lead
+     * octet must be followed by the number of continuation octets it
+     * announces, and the input must not end in the middle of a sequence.
+     * Overlong forms, surrogates and code points above U+10FFFF are not
+     * rejected.
+     *
+     * @param string $input
+     * @return void
+     * @throws InvalidEncodingException at the first offending octet
+     */
+    private function validateUTF8Encoding($input)
+    {
+        $iContinuationOctetNeeded = 0;
+        $iCharacterStartPosition = 0;
+        $iCurrentLineNumber = 1;
+        $iOffsetInOctetsFromLineStart = -1;
+        $iOffsetInCharactersFromLineStart = -1;
+        for ($i = 0, $iMax = strlen($input); $i < $iMax; ++$i) {
+            $iCurrentOctet = ord($input[$i]);
+            $iOffsetInOctetsFromLineStart += 1;
+            if ($iContinuationOctetNeeded > 0) {
+                if ($iCurrentOctet < 128 || $iCurrentOctet >= 192) {
+                    throw new InvalidEncodingException(
+                        "Non-UTF8 character found on line "
+                        .$iCurrentLineNumber
+                        ."; the octet "
+                        .($iOffsetInOctetssFromLineStart + 1)
+                        .", part of the character "
+                        .($iOffsetInCharactersFromLineStart + 1)
+                        .", has value "
+                        .$iCurrentOctet
+                        ." which is not a continuation octet."
+                        ." (Sequential positions in octets without line splitting:"
+                        ." character start position "
+                        .$iCharacterStartPosition
+                        .", octet position "
+                        .$i
+                        .")",
+                        $iCurrentOctet,
+                        array(
+                            'line' => $iCurrentLineNumber,
+                            'offset_in_octets_from_line_start' => $iOffsetInOctetsFromLineStart,
+                            'offset_in_characters_from_line_start' => $iOffsetInCharactersFromLineStart,
+                            'octet_position' => $i,
+                            'character_start_position' => $iCharacterStartPosition,
+                        )
+                    );
+                }
+                --$iContinuationOctetNeeded;
+                continue;
+            }
+
+            $iCharacterStartPosition = $i;
+            $iOffsetInCharactersFromLineStart += 1;
+            if ($iCurrentOctet < 128) { // 0xxxxxxx ASCII
+                if ($input[$i] === "\n") {
+                    $iCurrentLineNumber += 1;
+                    $iOffsetInOctetsFromLineStart = -1;
+                    $iOffsetInCharactersFromLineStart = -1;
+                }
+                continue;
+            }
+            if ($iCurrentOctet >= 128 && $iCurrentOctet < 192) {
+                throw new InvalidEncodingException(
+                    "Non-UTF8 character found on line "
+                    .$iCurrentLineNumber
+                    ."; the octet "
+                    .($iOffsetInOctetssFromLineStart + 1)
+                    .", part of the character "
+                    .($iOffsetInCharactersFromLineStart + 1)
+                    .", has value "
+                    .$iCurrentOctet
+                    ." which is a continuation octet."
+                    ." (Sequential positions in octets without line splitting:"
+                    ." character start position "
+                    .$iCharacterStartPosition
+                    .", octet position "
+                    .$i
+                    .")",
+                    $iCurrentOctet,
+                    array(
+                        'line' => $iCurrentLineNumber,
+                        'offset_in_octets_from_line_start' => $iOffsetInOctetsFromLineStart,
+                        'offset_in_characters_from_line_start' => $iOffsetInCharactersFromLineStart,
+                        'octet_position' => $i,
+                        'character_start_position' => $iCharacterStartPosition,
+                    )
+                );
+            }
+            if ($iCurrentOctet >= 192 && $iCurrentOctet < 224) {
+                // 110xxxxx 10xxxxxx
+                $iContinuationOctetNeeded = 1;
+                continue;
+            }
+            if ($iCurrentOctet >= 224 && $iCurrentOctet < 240) {
+                // 1110xxxx 10xxxxxx 10xxxxxx
+                $iContinuationOctetNeeded = 2;
+                continue;
+            }
+            if ($iCurrentOctet >= 240 && $iCurrentOctet < 248) {
+                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                $iContinuationOctetNeeded = 3;
+                continue;
+            }
+            throw new InvalidEncodingException(
+                "Non-UTF8 character found on line "
+                .$iCurrentLineNumber
+                ."; the octet "
+                .($iOffsetInOctetssFromLineStart + 1)
+                .", part of the character "
+                .($iOffsetInCharactersFromLineStart + 1)
+                .", has value "
+                .$iCurrentOctet
+                ." which is invalid."
+                ." (Sequential positions in octets without line splitting:"
+                ." character start position "
+                .$iCharacterStartPosition
+                .", octet position "
+                .$i
+                .")",
+                $iCurrentOctet,
+                array(
+                    'line' => $iCurrentLineNumber,
+                    'offset_in_octets_from_line_start' => $iOffsetInOctetsFromLineStart,
+                    'offset_in_characters_from_line_start' => $iOffsetInCharactersFromLineStart,
+                    'octet_position' => $i,
+                    'character_start_position' => $iCharacterStartPosition,
+                )
+            );
+        }
+
+        if ($iContinuationOctetNeeded > 0) {
+            // The input ended in the middle of a multi-octet sequence.
+            throw new InvalidEncodingException(
+                "Non-UTF8 character found on line "
+                .$iCurrentLineNumber
+                ."; the character "
+                .($iOffsetInCharactersFromLineStart + 1)
+                ." is truncated, "
+                .$iContinuationOctetNeeded
+                ." continuation octet(s) missing at end of input."
+                ." (Sequential positions in octets without line splitting:"
+                ." character start position "
+                .$iCharacterStartPosition
+                .", octet position "
+                .$iMax
+                .")",
+                ord($input[$iCharacterStartPosition]),
+                array(
+                    'line' => $iCurrentLineNumber,
+                    'offset_in_octets_from_line_start' => $iOffsetInOctetsFromLineStart + 1,
+                    'offset_in_characters_from_line_start' => $iOffsetInCharactersFromLineStart,
+                    'octet_position' => $iMax,
+                    'character_start_position' => $iCharacterStartPosition,
+                )
+            );
+        }
+    }
 }
From c359e2531e1e615dd823969952bd986c8322060d Mon Sep 17 00:00:00 2001
From: Laurent Lyaudet
Date: Wed, 18 Mar 2026 13:30:29 +0100
Subject: [PATCH 2/2] JsonParser.php correction Octetss -> Octets

---
 src/Seld/JsonLint/JsonParser.php | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Seld/JsonLint/JsonParser.php b/src/Seld/JsonLint/JsonParser.php
--- a/src/Seld/JsonLint/JsonParser.php
+++ b/src/Seld/JsonLint/JsonParser.php
@@ -641,7 +641,7 @@ private function validateUTF8Encoding($input)
                         "Non-UTF8 character found on line "
                         .$iCurrentLineNumber
                         ."; the octet "
-                        .($iOffsetInOctetssFromLineStart + 1)
+                        .($iOffsetInOctetsFromLineStart + 1)
                         .", part of the character "
                         .($iOffsetInCharactersFromLineStart + 1)
                         .", has value "
@@ -682,7 +682,7 @@ private function validateUTF8Encoding($input)
                     "Non-UTF8 character found on line "
                     .$iCurrentLineNumber
                     ."; the octet "
-                    .($iOffsetInOctetssFromLineStart + 1)
+                    .($iOffsetInOctetsFromLineStart + 1)
                     .", part of the character "
                     .($iOffsetInCharactersFromLineStart + 1)
                     .", has value "
@@ -723,7 +723,7 @@ private function validateUTF8Encoding($input)
                 "Non-UTF8 character found on line "
                 .$iCurrentLineNumber
                 ."; the octet "
-                .($iOffsetInOctetssFromLineStart + 1)
+                .($iOffsetInOctetsFromLineStart + 1)
                 .", part of the character "
                 .($iOffsetInCharactersFromLineStart + 1)
                 .", has value "