diff --git a/NEWS.md b/NEWS.md index 3ba44ddeb87e75..c666338ffafbe0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -92,6 +92,7 @@ releases. * prism 1.9.0 * 1.7.0 to [v1.8.0][prism-v1.8.0], [v1.8.1][prism-v1.8.1], [v1.9.0][prism-v1.9.0] * psych 5.4.0 + * 5.3.1 to [v5.4.0][psych-v5.4.0] * resolv 0.7.1 * 0.7.0 to [v0.7.1][resolv-v0.7.1] * stringio 3.2.1.dev @@ -250,6 +251,7 @@ A lot of work has gone into making Ractors more stable, performant, and usable. [prism-v1.8.0]: https://github.com/ruby/prism/releases/tag/v1.8.0 [prism-v1.8.1]: https://github.com/ruby/prism/releases/tag/v1.8.1 [prism-v1.9.0]: https://github.com/ruby/prism/releases/tag/v1.9.0 +[psych-v5.4.0]: https://github.com/ruby/psych/releases/tag/v5.4.0 [resolv-v0.7.1]: https://github.com/ruby/resolv/releases/tag/v0.7.1 [strscan-v3.1.7]: https://github.com/ruby/strscan/releases/tag/v3.1.7 [strscan-v3.1.8]: https://github.com/ruby/strscan/releases/tag/v3.1.8 diff --git a/ext/json/lib/json/ext.rb b/ext/json/lib/json/ext.rb index 5bacc5e371212f..28500f1492396d 100644 --- a/ext/json/lib/json/ext.rb +++ b/ext/json/lib/json/ext.rb @@ -41,5 +41,31 @@ def parse end end + if defined?(ResumableParser) # Not yet available on JRuby + class ResumableParser + # Returns whether the parser is entirely done: no unconsumed bytes in + # the buffer, no document under construction and no parsed value + # awaiting retrieval. + # + # The main use case is detecting a truncated stream once the input is + # exhausted: + # + # loop do + # begin + # parser << socket.readpartial(4096) + # rescue EOFError + # break + # end + # while parser.parse + # process(parser.value) + # end + # end + # warn "stream was truncated" unless parser.empty? + def empty? + eos? && !partial_value? && !value? + end + end + end + JSON_LOADED = true unless defined?(JSON::JSON_LOADED) end diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c index a60f6a3c95511b..58dd281884f711 100644 --- a/ext/json/parser/parser.c +++ b/ext/json/parser/parser.c @@ -779,11 +779,18 @@ json_eat_comments(JSON_ParserState *state, JSON_ParserConfig *config) switch (peek(state)) { case '/': { - state->cursor = memchr(state->cursor, '\n', state->end - state->cursor); - if (!state->cursor) { + const char *newline = memchr(state->cursor, '\n', state->end - state->cursor); + if (!newline) { + // state->parser marks resumable mode, where the buffer end is only a + // chunk boundary: the terminating newline may still arrive, so leave + // the comment unterminated instead of consuming to end as a one-shot + // parse would. + if (state->parser) { + raise_eos_error_at("unterminated comment, expected end of line", state, start); + } state->cursor = state->end; } else { - state->cursor++; + state->cursor = newline + 1; } break; } @@ -2737,6 +2744,41 @@ static VALUE cResumableParser_eos_p(VALUE self) return eos(&parser->state) ? Qtrue : Qfalse; } +/* + * call-seq: partial_value? -> true or false + * + * Returns whether a document is currently under construction: an unclosed + * container, a key awaiting its value, etc. + * + * It answers the same question as !partial_value.nil?, but as a + * cheap predicate on the parser's internal state, without materializing the + * partially parsed Ruby objects: + * parser << '{"a":1,' + * parser.parse # => false + * parser.partial_value? # => true + * + * A fully parsed document whose value hasn't been retrieved yet is not under + * construction: #value? returns true and #partial_value? returns false. + */ +static VALUE cResumableParser_partial_value_p(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + + // Mirror of #value?: values on the stack while the document isn't DONE + // belong to a partially built document. A container whose first key or + // element hasn't been parsed yet has no frame nor value registered (the + // tokenizer rewinds to the container start on EOS), so that state is + // observable through the buffer (#eos?/#rest) instead, keeping this + // predicate consistent with #partial_value returning nil. + if (parser->value_stack.head > 0) { + json_frame *frame = json_frame_stack_peek(&parser->frames); + if (frame->phase != JSON_PHASE_DONE) { + return Qtrue; + } + } + return Qfalse; +} + /* * call-seq: parsed_bytes -> integer * @@ -2793,6 +2835,7 @@ void Init_parser(void) rb_define_method(cResumableParser, "value", cResumableParser_value, 0); rb_define_method(cResumableParser, "value?", cResumableParser_value_p, 0); rb_define_method(cResumableParser, "partial_value", cResumableParser_partial_value, 0); + rb_define_method(cResumableParser, "partial_value?", cResumableParser_partial_value_p, 0); rb_define_method(cResumableParser, "clear", cResumableParser_clear, 0); rb_define_method(cResumableParser, "rest", cResumableParser_rest, 0); rb_define_method(cResumableParser, "eos?", cResumableParser_eos_p, 0); diff --git a/set.c b/set.c index 2827df6c4f298e..0ea48a17e19ce7 100644 --- a/set.c +++ b/set.c @@ -2411,11 +2411,10 @@ rb_set_size(VALUE set) * === Methods for Creating a \Set * * - ::[]: - * Returns a new set containing the given objects. + * Returns a new set populated with the given objects. * - ::new: - * Returns a new set containing either the given objects - * (if no block given) or the return values from the called block - * (if a block given). + * Returns a new set based on the given object (if no block given), + * or on the return values from the called block (if a block given). * * === Methods for \Set Operations * diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb index 9000d1152f7ea8..2f79b87cc028e6 100644 --- a/test/json/json_parser_test.rb +++ b/test/json/json_parser_test.rb @@ -526,6 +526,7 @@ def test_parse_comments JSON assert_equal({ "key1" => "value1" }, parse(json, allow_comments: true)) assert_equal({}, parse('{} /**/', allow_comments: true)) + assert_equal({}, parse('{} // eol comment ending at eof', allow_comments: true)) assert_raise(ParserError) { parse('{} /* comment not closed', allow_comments: true) } assert_raise(ParserError) { parse('{} /*/', allow_comments: true) } assert_raise(ParserError) { parse('{} /x wrong comment', allow_comments: true) } diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb index 734d6e220b769d..357052c9a4035a 100644 --- a/test/json/resumable_parser_test.rb +++ b/test/json/resumable_parser_test.rb @@ -199,6 +199,48 @@ def test_incomplete_input_at_structural_positions_resumes assert_incomplete "{\"a\":1," end + def test_line_comment_spanning_feed_boundary_is_not_terminated_early + # A `//` line comment is only terminated by a newline. When the newline + # has not arrived yet, the comment must stay incomplete rather than being + # treated as consumed -- otherwise its body, delivered in a later chunk, + # leaks out as parsed values. + values = [] + parser = new_parser(allow_comments: true) + parser << '[1] //' + values << parser.value while parser.parse + + parser << "[2]\n[3]" # [2] belongs to the comment, [3] is a real document + values << parser.value while parser.parse + + assert_equal [[1], [3]], values + end + + def test_line_comment_terminated_by_newline_across_feeds + values = [] + parser = new_parser(allow_comments: true) + parser << '[1] //co' + values << parser.value while parser.parse + + parser << "mment\n[2]" + values << parser.value while parser.parse + + assert_equal [[1], [2]], values + end + + def test_block_comment_spanning_feed_boundary_is_not_terminated_early + # A `/* */` block comment whose closing `*/` has not arrived yet must stay + # incomplete, mirroring the line-comment behaviour above. + values = [] + parser = new_parser(allow_comments: true) + parser << '[1] /*' + values << parser.value while parser.parse + + parser << '[2]*/[3]' # [2] belongs to the comment, [3] is a real document + values << parser.value while parser.parse + + assert_equal [[1], [3]], values + end + def test_rest @parser << '[1, 2, 3, "unterminated string' refute @parser.parse @@ -239,6 +281,89 @@ def test_eos assert_predicate @parser, :eos? end + def test_empty_predicate + # empty? is defined on the state left after parsing everything that + # could be parsed from the fed bytes, so drain with parse/value first. + { + '' => true, # nothing fed: vacuously empty + '{"a":1}' => true, + '{"a":1}{"b":2}' => true, + '{"a":1} ' => true, # trailing whitespace + '{"a":1}{"b":2' => false, # inside a number token + '{"a":1}{"b":' => false, # right after a colon (token boundary) + '{"a":1}{' => false, # right after an object open + '{"a":1,' => false, # right after a comma (token boundary) + '"abc' => false, # inside a string token + '[1,2' => false, # unclosed array + }.each do |json, expected| + parser = new_parser + parser << json + parser.value while parser.parse + assert_equal expected, parser.empty?, "expected #{json.inspect} to be empty? == #{expected}" + end + end + + def test_empty_predicate_with_undrained_buffer + @parser << '{"a":1}{"b":2}' + assert @parser.parse + refute_predicate @parser, :empty? # second document still in the buffer + assert_equal({ "a" => 1 }, @parser.value) + assert @parser.parse + assert_equal({ "b" => 2 }, @parser.value) + assert_predicate @parser, :empty? + end + + def test_empty_predicate_with_pending_value + # A fully parsed document awaiting retrieval with #value is not empty. + @parser << '{"a":1}' + assert @parser.parse + refute_predicate @parser, :empty? + assert_equal({ "a" => 1 }, @parser.value) + assert_predicate @parser, :empty? + end + + def test_empty_predicate_across_feeds + @parser << '{"a' # chunk boundary inside a string literal + refute @parser.parse + refute_predicate @parser, :empty? + + @parser << '":1' + refute @parser.parse + refute_predicate @parser, :empty? + + @parser << '}' + assert @parser.parse + refute_predicate @parser, :empty? # value not retrieved yet + assert_equal({ "a" => 1 }, @parser.value) + assert_predicate @parser, :empty? + end + + def test_partial_value_predicate + { + '' => false, + '{"a":1}' => false, + '{"a":1}{"b":2}' => false, + '{"a":1} ' => false, + '{"a":1}{"b":2' => true, # inside a number token + '{"a":1}{"b":' => true, # right after a colon (token boundary) + # The tokenizer rewinds to the token start on EOS, so nothing is + # registered yet for a lone '{' or an unterminated top-level string: + # partial_value returns nil and partial_value? agrees. The truncation + # is still observable through the buffer: eos? is false, rest isn't + # empty. + '{"a":1}{' => false, # right after an object open + '"abc' => false, # inside a string token + '{"a":1,' => true, # right after a comma (token boundary) + '[1,2' => true, # unclosed array + }.each do |json, expected| + parser = new_parser + parser << json + parser.value while parser.parse + assert_equal expected, parser.partial_value?, "expected #{json.inspect} to be partial_value? == #{expected}" + assert_equal !parser.partial_value.nil?, parser.partial_value?, "partial_value?/partial_value mismatch for #{json.inspect}" + end + end + def test_partial_value assert_nil @parser.partial_value assert_partial_value [1, 2, 3], '[1, 2, 3, "unterminated string'