Skip to content

Commit fcd2100

Browse files
committed
Allow sharing middle substrings if the terminator is present
Sharing middle substrings has been behind a compilation flag for a very long time, and it's unclear if we'll ever make it the default. However, we can still share middle substrings without breaking the zero-terminated contract if the source string happens to have the necessary NUL characters at that specific offset. This is the case for some file formats such as FlatBuffers, BSON keys and a few others.
1 parent 1392600 commit fcd2100

2 files changed

Lines changed: 41 additions & 14 deletions

File tree

string.c

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -198,15 +198,30 @@ VALUE rb_cSymbol;
198198

199199
#define STR_ENC_GET(str) get_encoding(str)
200200

201+
/*
 * Reports whether the n bytes starting at s are all zero.
 *
 * s: first byte to inspect.
 * n: number of bytes to inspect; when n <= 0 the loop body never runs
 *    and the result is true.
 *
 * Returns false as soon as a non-zero byte is encountered, true otherwise.
 */
static inline bool
202+
zero_filled(const char *s, int n)
203+
{
204+
for (; n > 0; --n) {
205+
if (*s++) return false;
206+
}
207+
return true;
208+
}
209+
201210
#if !defined SHARABLE_MIDDLE_SUBSTRING
202211
# define SHARABLE_MIDDLE_SUBSTRING 0
203212
#endif
204-
#if !SHARABLE_MIDDLE_SUBSTRING
205-
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
213+
214+
/*
 * Decides whether the byte range [beg, beg + len) of str may be shared
 * by a substring.
 *
 * When SHARABLE_MIDDLE_SUBSTRING is enabled at compile time the answer is
 * always true. Otherwise the range is sharable when it ends exactly at
 * RSTRING_LEN(str), or when the TERM_LEN(str) bytes that follow the range
 * inside the source buffer are all zero, i.e. the source already holds
 * the terminator the substring would need at that offset.
 */
static inline bool
215+
SHARABLE_SUBSTRING_P(VALUE str, long beg, long len)
216+
{
217+
#if SHARABLE_MIDDLE_SUBSTRING
218+
return true;
206219
#else
207-
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
220+
/* Offset of the first byte after the requested range. */
long end = beg + len;
221+
long source_len = RSTRING_LEN(str);
222+
/* Either a tail substring, or the bytes after the range are already zero. */
return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
208223
#endif
209-
224+
}
210225

211226
static inline long
212227
str_embed_capa(VALUE str)
@@ -2810,15 +2825,6 @@ rb_string_value_ptr(volatile VALUE *ptr)
28102825
return RSTRING_PTR(str);
28112826
}
28122827

2813-
/*
 * Returns 1 when the n bytes starting at s are all zero (including the
 * case n <= 0, where nothing is inspected), and 0 as soon as a non-zero
 * byte is found.
 */
static int
2814-
zero_filled(const char *s, int n)
2815-
{
2816-
for (; n > 0; --n) {
2817-
if (*s++) return 0;
2818-
}
2819-
return 1;
2820-
}
2821-
28222828
static const char *
28232829
str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
28242830
{
@@ -3138,7 +3144,7 @@ str_subseq(VALUE str, long beg, long len)
31383144
RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
31393145

31403146
const int termlen = TERM_LEN(str);
3141-
if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3147+
if (!SHARABLE_SUBSTRING_P(str, beg, len)) {
31423148
str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
31433149
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
31443150
ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);

test/ruby/test_string.rb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3461,6 +3461,27 @@ def test_byteslice
34613461
assert_equal(false, ("\u3042"*10).byteslice(0, 20).valid_encoding?, bug7954)
34623462
end
34633463

3464+
# Checks that a byteslice taken from the middle of a string is reported as
# shared by ObjectSpace.dump only when the source bytes right after the slice
# can serve as its terminator.
def test_shared_middle_string_terminator
3465+
ten = "0123456789"
3466+
hundred = ten * 10
3467+
# Source layout: 100 digits, one NUL byte, then 100 more digits.
str = "#{hundred}\0#{hundred}".freeze
3468+
3469+
require 'objspace'
3470+
3471+
# The first 100 bytes are immediately followed by the NUL byte in str,
# so this slice is expected to be shared.
substr = str.byteslice(0, hundred.bytesize)
3472+
assert_equal hundred, substr
3473+
assert_includes ObjectSpace.dump(substr), ' "shared":true,'
3474+
3475+
# Larger terminator: after switching the slice to UTF-16BE the test
# expects it to no longer be reported as shared.
3476+
substr.force_encoding(Encoding::UTF_16BE)
3477+
assert_equal hundred.dup.force_encoding(Encoding::UTF_16BE), substr
3478+
refute_includes ObjectSpace.dump(substr), ' "shared":true,'
3479+
3480+
# This slice includes the NUL itself; the next byte in str is the digit
# "0", not a NUL, so the slice is expected not to be shared.
substr = str.byteslice(0, hundred.bytesize + 1)
3481+
assert_equal hundred + "\0", substr
3482+
refute_includes ObjectSpace.dump(substr), ' "shared":true,'
3483+
end
3484+
34643485
def test_unknown_string_option
34653486
str = nil
34663487
assert_nothing_raised(SyntaxError) do

0 commit comments

Comments
 (0)