Skip to content

Commit 120c9ed

Browse files
committed
[ruby/prism] Include string in constant pool entry to avoid chasing pointer
ruby/prism@dcb2e8c924
1 parent c746def commit 120c9ed

3 files changed

Lines changed: 30 additions & 30 deletions

File tree

prism/prism.c

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22233,34 +22233,25 @@ pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, si
2223322233
.warn_mismatched_indentation = true
2223422234
};
2223522235

22236-
// Pre-size the arenas based on input size to reduce the number of block
22237-
// allocations (and the kernel page zeroing they trigger). The ratios were
22238-
// measured empirically: AST arena ~3.3x input, metadata arena ~1.1x input.
22239-
// The reserve call is a no-op when the capacity is at or below the default
22240-
// arena block size, so small inputs don't waste an extra allocation.
22236+
/* Pre-size the arenas based on input size to reduce the number of block
22237+
* allocations (and the kernel page zeroing they trigger). The ratios were
22238+
* measured empirically: AST arena ~3.3x input, metadata arena ~1.1x input.
22239+
* The reserve call is a no-op when the capacity is at or below the default
22240+
* arena block size, so small inputs don't waste an extra allocation. */
2224122241
if (size <= SIZE_MAX / 4) pm_arena_reserve(arena, size * 4);
2224222242
if (size <= SIZE_MAX / 5 * 4) pm_arena_reserve(&parser->metadata_arena, size + size / 4);
2224322243

22244-
// Initialize the constant pool. We're going to completely guess as to the
22245-
// number of constants that we'll need based on the size of the input. The
22246-
// ratio we chose here is actually less arbitrary than you might think.
22247-
//
22248-
// We took ~50K Ruby files and measured the size of the file versus the
22249-
// number of constants that were found in those files. Then we found the
22250-
// average and standard deviation of the ratios of constants/bytesize. Then
22251-
// we added 1.34 standard deviations to the average to get a ratio that
22252-
// would fit 75% of the files (for a two-tailed distribution). This works
22253-
// because there was about a 0.77 correlation and the distribution was
22254-
// roughly normal.
22255-
//
22256-
// This ratio will need to change if we add more constants to the constant
22257-
// pool for another node type.
22258-
uint32_t constant_size = ((uint32_t) size) / 95;
22244+
/* Initialize the constant pool. Measured across 1532 Ruby stdlib files, the
22245+
* bytes/constant ratio has a median of ~56 and a 90th percentile of ~135.
22246+
* We use 120 as a balance between over-allocation waste and resize
22247+
* frequency. Resizes are cheap with arena allocation, so we lean toward
22248+
* under-estimating. */
22249+
uint32_t constant_size = ((uint32_t) size) / 120;
2225922250
pm_constant_pool_init(&parser->metadata_arena, &parser->constant_pool, constant_size < 4 ? 4 : constant_size);
2226022251

22261-
// Initialize the newline list. Similar to the constant pool, we're going to
22262-
// guess at the number of newlines that we'll need based on the size of the
22263-
// input.
22252+
/* Initialize the line offset list. Similar to the constant pool, we are
22253+
* going to estimate the number of newlines that we will need based on the
22254+
* size of the input. */
2226422255
size_t newline_size = size / 22;
2226522256
pm_line_offset_list_init(&parser->metadata_arena, &parser->line_offsets, newline_size < 4 ? 4 : newline_size);
2226622257

prism/util/pm_constant_pool.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,7 @@ pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size
239239
pm_constant_pool_bucket_t *bucket;
240240

241241
while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) {
242-
pm_constant_t *constant = &pool->constants[bucket->id - 1];
243-
if ((constant->length == length) && memcmp(constant->start, start, length) == 0) {
242+
if ((bucket->length == length) && memcmp(bucket->start, start, length) == 0) {
244243
return bucket->id;
245244
}
246245

@@ -270,18 +269,17 @@ pm_constant_pool_insert(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8
270269
// If there is a collision, then we need to check if the content is the
271270
// same as the content we are trying to insert. If it is, then we can
272271
// return the id of the existing constant.
273-
pm_constant_t *constant = &pool->constants[bucket->id - 1];
274-
275-
if ((constant->length == length) && memcmp(constant->start, start, length) == 0) {
272+
if ((bucket->length == length) && memcmp(bucket->start, start, length) == 0) {
276273
// Since we have found a match, we need to check if this is
277274
// attempting to insert a shared or an owned constant. We want to
278275
// prefer shared constants since they don't require allocations.
279276
if (type != PM_CONSTANT_POOL_BUCKET_OWNED && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) {
280277
// If we're attempting to insert a shared constant and the
281278
// existing constant is owned, then we can replace it with the
282279
// shared constant to prefer non-owned references.
283-
constant->start = start;
280+
bucket->start = start;
284281
bucket->type = (unsigned int) (type & 0x3);
282+
pool->constants[bucket->id - 1].start = start;
285283
}
286284

287285
return bucket->id;
@@ -298,7 +296,9 @@ pm_constant_pool_insert(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8
298296
*bucket = (pm_constant_pool_bucket_t) {
299297
.id = (unsigned int) (id & 0x3fffffff),
300298
.type = (unsigned int) (type & 0x3),
301-
.hash = hash
299+
.hash = hash,
300+
.start = start,
301+
.length = length
302302
};
303303

304304
pool->constants[id - 1] = (pm_constant_t) {

prism/util/pm_constant_pool.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,15 @@ typedef struct {
113113

114114
/** The hash of the bucket. */
115115
uint32_t hash;
116+
117+
/**
118+
* A pointer to the start of the string, stored directly in the bucket to
119+
* avoid a pointer chase to the constants array during probing.
120+
*/
121+
const uint8_t *start;
122+
123+
/** The length of the string. */
124+
size_t length;
116125
} pm_constant_pool_bucket_t;
117126

118127
/** A constant in the pool which effectively stores a string. */

0 commit comments

Comments
 (0)