Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions src/commands/manifest/bazel/bazel-workspace-walk.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/**
* Walk the directory tree rooted at `cwd` and return every directory that
* looks like a Bazel workspace root — i.e. contains `MODULE.bazel`,
* `WORKSPACE`, or `WORKSPACE.bazel`. Real monorepos host multiple roots
* (e.g. `envoy/mobile/MODULE.bazel`, rules_kotlin's per-example
* `examples/<name>/MODULE.bazel`); the per-workspace extraction runs once
* per discovered root.
*
* The walker is dependency-injected with the directory-prune policy:
* callers pass the set of basenames and basename prefixes the walk must
* refuse to descend into. This module intentionally hardcodes none of
* the "common" prunes (`.git`, `node_modules`, …) — Bazel callers compose
* the codebase-wide `IGNORED_DIRS` list (`src/utils/glob.mts`) with the
* Bazel-specific bits (`bazel-*` output_base symlinks,
* `.socket-auto-manifest`).
*
* Discovery is bounded-but-complete: the walk visits directories in
* deterministic (sorted) order under a single visited-directory budget
* (`DEFAULT_MAX_WALK_DIRS`, overridable via `maxWalkDirs`) as the only
* pathological-input / symlink-loop guard. There is no depth cap: the deepest
* workspace marker observed across the OSS corpus sat at depth 9, deeper than
* the old depth-8 ceiling, so that ceiling silently dropped real first-party
* modules. All roots found within the budget are collected, sorted, then
* capped to `MAX_WORKSPACE_ROOTS`. Hitting the cap and exhausting the budget
* each trigger an unconditional `logger.warn` (a missed module silently drops
* its Maven hub, so truncation must never be silent).
*/

import { readdirSync } from 'node:fs'
import path from 'node:path'

import { logger } from '@socketsecurity/registry/lib/logger'

// Upper bound on how many workspace roots a single walk returns. 16 is
// comfortably above realistic monorepo counts while still bounding
// pathological inputs.
const MAX_WORKSPACE_ROOTS = 16
// Default visited-directory budget for one walk. This is the only guard
// against pathological inputs and symlink loops (a loop would burn through
// the budget and stop). A few thousand directories is far more than any
// realistic first-party tree holds once the prune policy has removed
// vendored/output dirs.
const DEFAULT_MAX_WALK_DIRS = 5_000
// Basenames of the files that mark a directory as a Bazel workspace root.
const WORKSPACE_MARKER_FILES = new Set<string>([
  'MODULE.bazel',
  'WORKSPACE',
  'WORKSPACE.bazel',
])

/** Options accepted by `findWorkspaceRoots`. */
export type FindWorkspaceRootsOptions = {
  /** Directory the walk starts from. */
  cwd: string
  /**
   * When true, extra diagnostics are logged (for example the list of roots
   * dropped by the `MAX_WORKSPACE_ROOTS` cap).
   */
  verbose?: boolean
  /**
   * Visited-directory budget override (intended for tests); falls back to
   * `DEFAULT_MAX_WALK_DIRS` when omitted.
   */
  maxWalkDirs?: number
  /**
   * Directory basenames that are skipped outright (exact match). Callers pass
   * the union of the codebase-wide ignore set (`IGNORED_DIRS` in
   * `src/utils/glob.mts`) and any caller-specific additions (e.g.
   * `.socket-auto-manifest`).
   */
  ignoreDirNames?: ReadonlySet<string>
  /**
   * Directory basename prefixes that are skipped. Bazel callers pass
   * `['bazel-']` so the walk never descends into Bazel's output_base
   * symlinks.
   */
  ignoreDirPrefixes?: ReadonlyArray<string>
}

// Shared empty defaults for the optional prune options, so a walk with no
// prune policy does not allocate fresh collections on every call.
const EMPTY_SET: ReadonlySet<string> = new Set<string>()
const EMPTY_ARRAY: ReadonlyArray<string> = []

/**
 * Finds every Bazel workspace root beneath `opts.cwd`.
 *
 * A directory counts as a root when it directly contains a regular file whose
 * name is in `WORKSPACE_MARKER_FILES`. The walk descends into every real
 * (non-symlink) subdirectory that is not excluded by `opts.ignoreDirNames`
 * (exact basename) or `opts.ignoreDirPrefixes` (basename prefix). It keeps
 * descending below a root, so nested workspaces are reported as well.
 *
 * Truncation is never silent: exhausting the visited-directory budget
 * (`opts.maxWalkDirs`, default `DEFAULT_MAX_WALK_DIRS`) and dropping roots
 * beyond `MAX_WORKSPACE_ROOTS` each emit a `logger.warn`. Directories that
 * cannot be read are skipped without throwing; when `opts.verbose` is set,
 * each skipped directory is logged so a missed subtree can be diagnosed.
 *
 * @param opts - Walk root, prune policy, budget override and verbosity.
 * @returns Sorted root paths, at most `MAX_WORKSPACE_ROOTS` of them. Each
 *   path is `opts.cwd` joined with a relative path, so the results are
 *   absolute whenever `opts.cwd` is absolute.
 */
export function findWorkspaceRoots(opts: FindWorkspaceRootsOptions): string[] {
  const { cwd, verbose } = opts
  const ignoreDirNames = opts.ignoreDirNames ?? EMPTY_SET
  const ignoreDirPrefixes = opts.ignoreDirPrefixes ?? EMPTY_ARRAY
  const maxWalkDirs = opts.maxWalkDirs ?? DEFAULT_MAX_WALK_DIRS
  const roots: string[] = []
  // LIFO stack; children are pushed in reverse-sorted order so they pop in
  // ascending order, giving a deterministic traversal.
  const stack: string[] = [cwd]
  let dirsVisited = 0
  let budgetHit = false
  while (stack.length) {
    // The budget is only checked while work remains, so a tree with exactly
    // `maxWalkDirs` directories finishes without reporting exhaustion.
    if (dirsVisited >= maxWalkDirs) {
      budgetHit = true
      break
    }
    const dir = stack.pop()
    if (dir === undefined) {
      break
    }
    dirsVisited += 1
    let entries
    try {
      entries = readdirSync(dir, { withFileTypes: true })
    } catch (e) {
      // Best-effort: an unreadable (or vanished) directory must not abort the
      // walk, but it can hide workspaces, so leave a trace in verbose mode.
      if (verbose) {
        logger.log(
          '[VERBOSE] workspace walker: skipping unreadable directory:',
          dir,
          e instanceof Error ? e.message : String(e),
        )
      }
      continue
    }
    let isWorkspaceRoot = false
    const childNames: string[] = []
    for (const entry of entries) {
      // NOTE(review): `Dirent.isFile()` is false for symlinks, so a marker
      // that is itself a symlink does not promote the directory — confirm
      // this is intended.
      if (entry.isFile()) {
        if (WORKSPACE_MARKER_FILES.has(entry.name)) {
          isWorkspaceRoot = true
        }
        continue
      }
      if (!entry.isDirectory()) {
        continue
      }
      const name = entry.name
      if (ignoreDirNames.has(name)) {
        continue
      }
      // `Dirent.isDirectory()` does not follow symlinks, so Bazel's `bazel-*`
      // output symlinks are already excluded by the gate above. The prefix
      // prune is what catches a REAL directory named `bazel-*` (and is cheap
      // defense-in-depth for the symlink case).
      if (ignoreDirPrefixes.some(prefix => name.startsWith(prefix))) {
        continue
      }
      childNames.push(name)
    }
    if (isWorkspaceRoot) {
      roots.push(dir)
    }
    // Descend regardless of whether this dir is itself a root — nested
    // workspaces are common (root MODULE.bazel + examples/*/MODULE.bazel).
    childNames.sort()
    for (let i = childNames.length - 1; i >= 0; i -= 1) {
      stack.push(path.join(dir, childNames[i]!))
    }
  }
  roots.sort()
  const kept = roots.slice(0, MAX_WORKSPACE_ROOTS)
  const droppedCount = roots.length - kept.length
  if (budgetHit) {
    // The dir budget was exhausted, so an unknown number of roots may be
    // undiscovered — surface it unconditionally.
    logger.warn(
      `Bazel workspace walk hit the ${maxWalkDirs}-directory budget; some workspaces beneath ${cwd} may be undiscovered (found ${roots.length}, kept ${kept.length}).`,
    )
  }
  if (droppedCount > 0) {
    // The cap dropped roots. Exact count when the full tree was walked; "≥"
    // when the budget cut the walk short (more roots may exist).
    const qualifier = budgetHit ? '≥' : ''
    logger.warn(
      `Bazel workspace walk found ${roots.length} workspace root(s); capping at ${MAX_WORKSPACE_ROOTS} and dropping ${qualifier}${droppedCount}.`,
    )
    if (verbose) {
      logger.log(
        '[VERBOSE] workspace walker: dropped roots:',
        roots.slice(MAX_WORKSPACE_ROOTS),
      )
    }
  }
  return kept
}
224 changes: 224 additions & 0 deletions src/commands/manifest/bazel/bazel-workspace-walk.test.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import {
mkdirSync,
mkdtempSync,
rmSync,
symlinkSync,
writeFileSync,
} from 'node:fs'
import os from 'node:os'
import path from 'node:path'

import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'

import { logger } from '@socketsecurity/registry/lib/logger'

import { findWorkspaceRoots } from './bazel-workspace-walk.mts'

/**
 * Creates (or truncates to empty) the file at `file`, first creating any
 * missing parent directories.
 */
function touch(file: string): void {
  const parentDir = path.dirname(file)
  mkdirSync(parentDir, { recursive: true })
  writeFileSync(file, '')
}

// Representative prune policy injected by these tests to exercise the
// walker's generic name/prefix pruning. The walker itself hardcodes none of
// these entries. Per the production default
// (DEFAULT_BAZEL_WALKER_IGNORE_DIR_* in extract_bazel_to_maven.mts), names are
// IGNORED_DIRS + VCS/IDE dirs and prefixes are just `['bazel-']`. `dist`
// appears below only as an extra, arbitrary prefix to prove that multiple
// prefixes are honoured — callers do not pass it.
const BAZEL_IGNORE_NAMES: ReadonlySet<string> = new Set<string>([
  '.git',
  '.hg',
  '.idea',
  '.pnpm-store',
  '.socket-auto-manifest',
  '.svn',
  '.vscode',
  'node_modules',
])
const BAZEL_IGNORE_PREFIXES: ReadonlyArray<string> = ['bazel-', 'dist']

// Filesystem-backed tests for findWorkspaceRoots: every case builds a real
// directory tree under a fresh temp directory and runs the walker against it.
describe('bazel-workspace-walk', () => {
// Per-test scratch directory; recreated in beforeEach, removed in afterEach.
let tmp: string

beforeEach(() => {
tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-walk-'))
})

afterEach(() => {
rmSync(tmp, { recursive: true, force: true })
})

// Discovery and pruning behaviour. Results are usually mapped through
// path.relative(tmp, …), so '' stands for tmp itself.
// NOTE(review): several expectations spell nested paths with '/' separators,
// while path.relative uses the platform separator — confirm these tests are
// only expected to run on POSIX hosts.
describe('findWorkspaceRoots', () => {
it('returns the root when only the root has MODULE.bazel', () => {
touch(path.join(tmp, 'MODULE.bazel'))
expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
})

it('detects WORKSPACE and WORKSPACE.bazel as root markers', () => {
touch(path.join(tmp, 'WORKSPACE'))
expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
// Remove the first marker so the second assertion can only be satisfied
// by WORKSPACE.bazel.
rmSync(path.join(tmp, 'WORKSPACE'))
touch(path.join(tmp, 'WORKSPACE.bazel'))
expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
})

it('finds nested workspaces at arbitrary depth', () => {
touch(path.join(tmp, 'MODULE.bazel'))
touch(path.join(tmp, 'examples', 'dagger', 'MODULE.bazel'))
touch(path.join(tmp, 'examples', 'android', 'nested', 'WORKSPACE.bazel'))
const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
path.relative(tmp, p),
)
// Expected order follows the walker's sort of the full paths.
expect(found).toEqual(['', 'examples/android/nested', 'examples/dagger'])
})

it('returns [] when there is no workspace root', () => {
writeFileSync(path.join(tmp, 'README.md'), '')
expect(findWorkspaceRoots({ cwd: tmp })).toEqual([])
})

it('does NOT prune by default — pruning policy is caller-supplied', () => {
touch(path.join(tmp, 'MODULE.bazel'))
touch(path.join(tmp, 'node_modules', 'MODULE.bazel'))
// No ignore options are passed, so node_modules is walked like any other
// directory.
const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
path.relative(tmp, p),
)
expect(found).toEqual(['', 'node_modules'])
})

it('prunes injected ignoreDirNames', () => {
touch(path.join(tmp, 'MODULE.bazel'))
// Markers are placed one level below each ignored directory, so finding
// any of them would mean the walker descended into an ignored name.
for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) {
touch(path.join(tmp, dir, 'sub', 'MODULE.bazel'))
}
const found = findWorkspaceRoots({
cwd: tmp,
ignoreDirNames: BAZEL_IGNORE_NAMES,
}).map(p => path.relative(tmp, p))
expect(found).toEqual([''])
})

it('prunes injected ignoreDirPrefixes (bazel-* symlinks)', () => {
// A second temp dir outside `tmp` stands in for Bazel's output base; it is
// exposed inside `tmp` only through the `bazel-out` symlink.
// NOTE(review): the walker's own comments say Dirent.isDirectory() does not
// follow symlinks, so this case would presumably pass even without the
// prefix prune; the prefix logic on real directories is covered by the
// `dist*` case below.
const fakeOutputBase = mkdtempSync(
path.join(os.tmpdir(), 'sock-fake-outbase-'),
)
try {
mkdirSync(path.join(fakeOutputBase, 'external', 'maven'), {
recursive: true,
})
touch(path.join(fakeOutputBase, 'external', 'maven', 'MODULE.bazel'))
symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out'))
touch(path.join(tmp, 'MODULE.bazel'))
const found = findWorkspaceRoots({
cwd: tmp,
ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES,
}).map(p => path.relative(tmp, p))
expect(found).toEqual([''])
} finally {
// The fake output base lives outside `tmp`, so afterEach will not remove
// it.
rmSync(fakeOutputBase, { recursive: true, force: true })
}
})

it('prunes injected dist* prefix', () => {
touch(path.join(tmp, 'MODULE.bazel'))
// Both `dist` and `distribution` start with the injected `dist` prefix.
touch(path.join(tmp, 'dist', 'MODULE.bazel'))
touch(path.join(tmp, 'distribution', 'MODULE.bazel'))
const found = findWorkspaceRoots({
cwd: tmp,
ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES,
}).map(p => path.relative(tmp, p))
expect(found).toEqual([''])
})

it('returns absolute, sorted paths', () => {
// Created out of order to show that the result order does not depend on
// creation order.
touch(path.join(tmp, 'z', 'MODULE.bazel'))
touch(path.join(tmp, 'a', 'MODULE.bazel'))
touch(path.join(tmp, 'm', 'MODULE.bazel'))
const found = findWorkspaceRoots({ cwd: tmp })
expect(found).toEqual([
path.join(tmp, 'a'),
path.join(tmp, 'm'),
path.join(tmp, 'z'),
])
for (const p of found) {
expect(path.isAbsolute(p)).toBe(true)
}
})

it('handles an unreadable directory by skipping it (no throw)', () => {
touch(path.join(tmp, 'MODULE.bazel'))
// `nope` is never created, so the walk starts at a path that cannot be
// read.
expect(findWorkspaceRoots({ cwd: path.join(tmp, 'nope') })).toEqual([])
})

it('finds a workspace marker deeper than the old depth-8 cap (depth 9)', () => {
// Nine nested levels below tmp.
const deep = path.join(
tmp,
'l1',
'l2',
'l3',
'l4',
'l5',
'l6',
'l7',
'l8',
'l9',
)
touch(path.join(deep, 'MODULE.bazel'))
const found = findWorkspaceRoots({ cwd: tmp })
expect(found).toEqual([deep])
})
})

// Root-cap and directory-budget behaviour; logger.warn is stubbed so the
// emitted warnings can be inspected without printing.
describe('findWorkspaceRoots truncation', () => {
let warnSpy: ReturnType<typeof vi.spyOn>

beforeEach(() => {
warnSpy = vi.spyOn(logger, 'warn').mockImplementation(() => logger)
})

afterEach(() => {
warnSpy.mockRestore()
})

it('caps at 16 roots, warns unconditionally, and keeps the sorted survivors', () => {
// 18 sibling roots; only the 16 lexicographically smallest survive.
// Names are zero-padded (r00 … r17) so string order matches numeric order.
const names = Array.from(
{ length: 18 },
(_, i) => `r${String(i).padStart(2, '0')}`,
)
for (const name of names) {
touch(path.join(tmp, name, 'MODULE.bazel'))
}
const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
path.relative(tmp, p),
)
expect(found).toHaveLength(16)
expect(found).toEqual(names.slice(0, 16))
expect(warnSpy).toHaveBeenCalled()
// Join the first argument of every warn call and look for the cap message.
expect(warnSpy.mock.calls.map(c => String(c[0])).join('\n')).toMatch(
/capping at 16 and dropping 2/,
)
})

it('warns unconditionally when the visited-directory budget is exhausted', () => {
for (const name of ['a', 'b', 'c']) {
touch(path.join(tmp, name, 'MODULE.bazel'))
}
// Budget of 3 visits tmp + a + b, then stops before c.
const found = findWorkspaceRoots({ cwd: tmp, maxWalkDirs: 3 }).map(p =>
path.relative(tmp, p),
)
expect(found).toEqual(['a', 'b'])
expect(warnSpy.mock.calls.map(c => String(c[0])).join('\n')).toMatch(
/directory budget/,
)
})

it('does not warn on a normal small tree', () => {
touch(path.join(tmp, 'MODULE.bazel'))
touch(path.join(tmp, 'examples', 'dagger', 'MODULE.bazel'))
findWorkspaceRoots({ cwd: tmp })
expect(warnSpy).not.toHaveBeenCalled()
})
})
})
Loading