From bd885cc97dfafb8ba8a940b434324e57ce827827 Mon Sep 17 00:00:00 2001 From: safishamsi Date: Tue, 30 Jun 2026 16:58:23 +0100 Subject: [PATCH 1/6] fix(hyperedge): accept members/node_ids alias keys for the member list (#1561) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hyperedge's member list is canonically keyed `nodes`, but producers (LLM/subagent drift, externally-supplied graph.json) sometimes emit `members` or `node_ids` — graphify only read `nodes`, so those hyperedges silently lost their members, and semantic_cleanup's prune dropped them entirely. Normalize the member key to `nodes` at one ingest chokepoint in build_from_json (and in semantic_cleanup, which runs pre-build), deduping and warning, so every downstream consumer sees the canonical key. Mirrors the existing from/to edge-endpoint aliasing. Reported by @askalot-io. Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/build.py | 58 +++++++++++++++++++++ graphify/export.py | 2 +- graphify/semantic_cleanup.py | 10 ++++ tests/test_hypergraph.py | 93 ++++++++++++++++++++++++++++++++++ tests/test_semantic_cleanup.py | 33 ++++++++++++ 5 files changed, 195 insertions(+), 1 deletion(-) diff --git a/graphify/build.py b/graphify/build.py index a3ef6f307..951ea12e7 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -53,6 +53,56 @@ } +# Hyperedge member lists are canonically keyed `nodes` (see graphify/llm.py +# extraction spec), but LLM/subagent drift and externally-supplied graph.json +# sometimes emit `members` or `node_ids`. _normalize_hyperedge_members folds +# those aliases into `nodes` at ingest so every downstream consumer reads one +# canonical key — mirroring the `from`/`to` edge-endpoint tolerance below. +_HE_MEMBER_ALIASES = ("members", "node_ids") + + +def _normalize_hyperedge_members(he: object) -> None: + """Canonicalize a hyperedge's member list onto the `nodes` key, in place. 
+ + If `nodes` is already a list it wins (canonical), and only stray alias keys + are dropped. Otherwise the first alias (`members`, then `node_ids`) that is a + list is moved to `nodes`, deduped preserving order, with a single stderr + WARNING naming the hyperedge id and alias used. Leftover alias keys are + always removed so downstream code never re-reads them. + """ + if not isinstance(he, dict): + return + if not isinstance(he.get("nodes"), list): + for alias in _HE_MEMBER_ALIASES: + val = he.get(alias) + if isinstance(val, list): + seen: set = set() + deduped: list = [] + for ref in val: + try: + is_dupe = ref in seen + except TypeError: + is_dupe = False # unhashable ref: keep it, validator flags it + if is_dupe: + continue + try: + seen.add(ref) + except TypeError: + pass + deduped.append(ref) + he["nodes"] = deduped + print( + f"[graphify] WARNING: hyperedge " + f"'{he.get('id', '?')}' uses field '{alias}' instead of " + f"'nodes'; normalizing.", + file=sys.stderr, + ) + break + # Drop any leftover alias keys regardless of which branch ran above. + for alias in _HE_MEMBER_ALIASES: + he.pop(alias, None) + + def _norm_source_file(p: str | None, root: str | None = None) -> str | None: """Normalize path separators and relativize absolute paths. @@ -279,6 +329,14 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}: node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept") + # Canonicalize hyperedge member lists (#1561): producers sometimes key the + # member list `members`/`node_ids` instead of `nodes`. Fold aliases onto + # `nodes` here — BEFORE validation and the semantic-rekey loop below — so + # every downstream consumer (rekey, source_file relativize, to_json) reads + # one canonical key, the same way edge endpoints alias from/to at build. 
+ for he in extraction.get("hyperedges", []) or []: + _normalize_hyperedge_members(he) + errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. real_errors = [e for e in errors if "does not match any node id" not in e] diff --git a/graphify/export.py b/graphify/export.py index a281422e3..176b17909 100644 --- a/graphify/export.py +++ b/graphify/export.py @@ -679,7 +679,7 @@ def to_html( if raw_hyperedges: remapped = [] for he in raw_hyperedges: - he_members = he.get("nodes") or he.get("members") or [] + he_members = he.get("nodes", []) comm_ids, seen = [], set() for nid in he_members: c = node_to_community.get(nid) diff --git a/graphify/semantic_cleanup.py b/graphify/semantic_cleanup.py index 6bac6b0d5..0dd74dc76 100644 --- a/graphify/semantic_cleanup.py +++ b/graphify/semantic_cleanup.py @@ -12,6 +12,8 @@ import re from pathlib import Path +from .build import _normalize_hyperedge_members + # Labels longer than this many characters, or containing >= this many words, # are candidates for being sentence-like rationale text rather than entity names. _RATIONALE_MIN_CHARS = 80 @@ -101,6 +103,10 @@ def validate_semantic_fragment(fragment: object) -> list[str]: if not isinstance(he, dict): errors.append(f"hyperedges[{i}] must be an object") continue + # Fold alias member keys (members/node_ids) onto `nodes` (#1561) so + # an alias-keyed hyperedge isn't rejected here for "nodes must be a + # list" before it ever reaches build's normalization. 
+ _normalize_hyperedge_members(he) _validate_semantic_id(errors, f"hyperedges[{i}].id", he.get("id")) he_nodes = he.get("nodes") if not isinstance(he_nodes, list): @@ -265,6 +271,10 @@ def sanitize_semantic_fragment(fragment: dict) -> dict: for he in hyperedges: if not isinstance(he, dict): continue + # Fold alias member keys (members/node_ids) onto `nodes` (#1561) so an + # alias-keyed hyperedge isn't silently dropped below for a missing + # `nodes` list before build can canonicalize it. + _normalize_hyperedge_members(he) he_nodes = he.get("nodes") if not isinstance(he_nodes, list): continue diff --git a/tests/test_hypergraph.py b/tests/test_hypergraph.py index ac4ceb546..20e79e679 100644 --- a/tests/test_hypergraph.py +++ b/tests/test_hypergraph.py @@ -233,3 +233,96 @@ def test_report_skips_hyperedges_section_when_key_missing(): G = build_from_json(extraction) report = _make_report(G) assert "## Hyperedges" not in report + + +# --------------------------------------------------------------------------- +# 7. 
Hyperedge member-key alias normalization (#1561) +# --------------------------------------------------------------------------- + +def _alias_extraction(): + """Three hyperedges, one per member-key spelling: nodes / members / node_ids.""" + return { + "nodes": [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "m.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "m.py"}, + {"id": "c", "label": "C", "file_type": "code", "source_file": "m.py"}, + ], + "edges": [], + "hyperedges": [ + {"id": "he_nodes", "label": "canon", "nodes": ["a", "b", "c"]}, + {"id": "he_members", "label": "alias1", "members": ["a", "b", "c"]}, + {"id": "he_node_ids", "label": "alias2", "node_ids": ["a", "b", "c"]}, + ], + } + + +def test_build_normalizes_member_aliases_to_nodes(): + G = build_from_json(_alias_extraction()) + hes = {he["id"]: he for he in G.graph["hyperedges"]} + for hid in ("he_nodes", "he_members", "he_node_ids"): + assert hes[hid]["nodes"] == ["a", "b", "c"], hid + # alias keys are dropped post-normalization + assert "members" not in hes[hid] + assert "node_ids" not in hes[hid] + + +def test_build_dedups_alias_members_preserving_order(): + extraction = { + "nodes": [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "m.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "m.py"}, + ], + "edges": [], + "hyperedges": [{"id": "h", "label": "x", "members": ["a", "a", "b"]}], + } + G = build_from_json(extraction) + assert G.graph["hyperedges"][0]["nodes"] == ["a", "b"] + assert "members" not in G.graph["hyperedges"][0] + + +def test_build_canonical_nodes_wins_over_alias(): + extraction = { + "nodes": [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "m.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "m.py"}, + {"id": "x", "label": "X", "file_type": "code", "source_file": "m.py"}, + ], + "edges": [], + "hyperedges": [ + {"id": "h", "label": "x", "nodes": ["a", "b"], "members": 
["x"]}, + ], + } + G = build_from_json(extraction) + he = G.graph["hyperedges"][0] + assert he["nodes"] == ["a", "b"] # canonical untouched + assert "members" not in he # stray alias dropped + + +def test_build_rekeys_alias_keyed_hyperedge_members(): + """Alias normalization must run BEFORE the semantic id-remap loop so a + `members`-keyed hyperedge's refs get rekeyed alongside `nodes`-keyed ones.""" + # Non-AST node whose id uses the OLD short stem (`mod_foo`) for source_file + # pkg/mod.py -> new canonical stem pkg_mod -> remap mod_foo => pkg_mod_foo. + extraction = { + "nodes": [ + {"id": "mod_foo", "label": "foo", "file_type": "code", "source_file": "pkg/mod.py"}, + {"id": "mod_bar", "label": "bar", "file_type": "code", "source_file": "pkg/mod.py"}, + ], + "edges": [], + "hyperedges": [ + {"id": "h", "label": "x", "members": ["mod_foo", "mod_bar"]}, + ], + } + G = build_from_json(extraction) + he = G.graph["hyperedges"][0] + assert he["nodes"] == ["pkg_mod_foo", "pkg_mod_bar"] + + +def test_build_warns_once_per_aliased_hyperedge(capsys): + build_from_json(_alias_extraction()) + err = capsys.readouterr().err + # one warning each for the two alias hyperedges, none for the nodes-keyed one + assert err.count("normalizing") == 2 + assert "he_members" in err and "members" in err + assert "he_node_ids" in err and "node_ids" in err + assert "he_nodes" not in err diff --git a/tests/test_semantic_cleanup.py b/tests/test_semantic_cleanup.py index 7feda127e..49fd03398 100644 --- a/tests/test_semantic_cleanup.py +++ b/tests/test_semantic_cleanup.py @@ -342,3 +342,36 @@ def test_sanitize_rationale_only_propagates_through_rationale_for_edges(): assert "tree-sitter" in ids["rationale_target"].get("rationale", "") # unrelated_target should NOT have rationale leaked from the `references` edge assert "rationale" not in ids["unrelated_target"] + + +def test_sanitize_keeps_members_keyed_hyperedge(capsys): + """#1561: a `members`-keyed hyperedge with >=2 surviving members must be + 
KEPT (normalized to `nodes`), not silently dropped before build.""" + fragment = { + "nodes": [ + {"id": "real_a", "label": "A", "file_type": "code"}, + {"id": "real_b", "label": "B", "file_type": "code"}, + ], + "edges": [], + "hyperedges": [ + {"id": "grp", "label": "Group", "members": ["real_a", "real_b"]}, + ], + } + out = sc.sanitize_semantic_fragment(fragment) + assert len(out["hyperedges"]) == 1 + he = out["hyperedges"][0] + assert he["id"] == "grp" + assert he["nodes"] == ["real_a", "real_b"] + assert "members" not in he + + +def test_validate_accepts_node_ids_keyed_hyperedge(): + """#1561: an alias-keyed hyperedge must not be rejected for a missing + `nodes` list — validate normalizes first.""" + fragment = _valid_fragment() + fragment["nodes"].append({"id": "second", "label": "Second", "file_type": "code"}) + fragment["hyperedges"] = [ + {"id": "grp", "label": "G", "node_ids": ["module_func", "second"]} + ] + errors = sc.validate_semantic_fragment(fragment) + assert errors == [] From bee3849810fb10853ac736f772cc15f8efaf580e Mon Sep 17 00:00:00 2001 From: safishamsi Date: Tue, 30 Jun 2026 16:59:16 +0100 Subject: [PATCH 2/6] fix(resolve): test mocks no longer erase the real cross-file call graph (#1553) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cross-file call resolver bailed (#543/#1219 god-node guard) whenever a bare callee name had 2+ definitions without unique import evidence — so a single same-named test mock (or any same-named symbol) dropped the real `calls` edge, erasing the call graph wherever a mock existed (the reporter saw a 76-stub Pester suite wipe everything). Replace the blunt bail with a smarter guard: when a name is ambiguous and import evidence doesn't resolve it, apply tie-breakers — non-test preference (a shared, segment-aware _is_test_path classifier) then path proximity — and emit an INFERRED edge ONLY if exactly one candidate survives, else keep bailing. 
A real def + a test mock resolves to the real def; two genuine non-test defs still bail (god-node guard intact, no fan-out). Wired into both the extract.py pass and the symbol_resolution.py copy via the shared classifier. Reported by @Schweinehund. Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/extract.py | 24 +++- graphify/paths.py | 190 +++++++++++++++++++++++++++++++- graphify/symbol_resolution.py | 24 +++- tests/test_extract.py | 85 ++++++++++++++ tests/test_paths.py | 99 +++++++++++++++++ tests/test_symbol_resolution.py | 94 +++++++++++++++- 6 files changed, 507 insertions(+), 9 deletions(-) create mode 100644 tests/test_paths.py diff --git a/graphify/extract.py b/graphify/extract.py index 7accab315..9761a1d20 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -33,6 +33,7 @@ from graphify.extractors.elixir import extract_elixir # noqa: F401 from graphify.extractors.razor import extract_razor # noqa: F401 from graphify.extractors.zig import extract_zig # noqa: F401 +from graphify.paths import disambiguate_ambiguous_candidates _RECURSION_LIMIT = 10_000 @@ -13976,10 +13977,15 @@ def extract( # "did the caller's file import the callee's file?" # Use relativized paths to match how file node IDs were remapped above (#502). nid_to_file_nid: dict[str, str] = {} + # nid -> raw source_file string, for the ambiguous-name tie-breakers below + # (test/non-test classification + path proximity). Kept separate from the + # file-node-id map because tie-breaking compares the actual file paths. 
+ nid_to_source_file: dict[str, str] = {} for n in all_nodes: sf = n.get("source_file") if not sf: continue + nid_to_source_file[n["id"]] = str(sf) sf_path = Path(sf) try: sf_rel = sf_path.relative_to(root) if sf_path.is_absolute() else sf_path @@ -14031,6 +14037,7 @@ def _has_import_evidence(candidate_id: str) -> bool: symbol_matches = [c for c in candidates if c in imported_symbols] if len(symbol_matches) == 1: tgt = symbol_matches[0] + has_import_evidence = True else: module_matches = [ c for c in candidates @@ -14038,9 +14045,22 @@ def _has_import_evidence(candidate_id: str) -> bool: ] if len(module_matches) == 1: tgt = module_matches[0] + has_import_evidence = True else: - continue - has_import_evidence = True + # No unique import evidence. Instead of dropping the edge + # outright (which let a single same-named test mock erase the + # real call graph, #1553), apply the shared god-node + # tie-breakers (non-test preference, then path proximity). + # Resolve only if exactly one candidate survives; otherwise + # the #543/#1219 guard still holds and we skip. + tgt = disambiguate_ambiguous_candidates( + candidates, + {c: nid_to_source_file.get(c, "") for c in candidates}, + rc.get("source_file", ""), + ) + if tgt is None: + continue + has_import_evidence = False if tgt != caller and (caller, tgt) not in existing_pairs: existing_pairs.add((caller, tgt)) # Promote to EXTRACTED when there's a direct import edge from the diff --git a/graphify/paths.py b/graphify/paths.py index 3700f2958..d2bfdd9f5 100644 --- a/graphify/paths.py +++ b/graphify/paths.py @@ -17,10 +17,198 @@ from __future__ import annotations import os -from pathlib import Path +import re +from pathlib import Path, PurePosixPath GRAPHIFY_OUT = os.environ.get("GRAPHIFY_OUT", "graphify-out") +# Directory segments that, when they appear as a whole path component, mark the +# whole path as a test location. 
Matched against path *segments* (not raw +# substrings) so "src/contest.py" / "latest/x.py" / "src/greatest/x.py" do NOT +# match — only a segment that *equals* one of these names (case-insensitively). +_TEST_DIR_SEGMENTS = frozenset({"tests", "test", "spec", "specs", "__tests__"}) + +# Filename patterns marking a file as a test, matched against the *filename* +# only (case-insensitive). These are conventions across ecosystems: +# test_*.py pytest / unittest +# *_test.* Go / Python / Rust +# *.test.* JS/TS (jest, vitest) +# *.spec.* / *_spec.* Jasmine / RSpec / Karma +# *.Tests.ps1 PowerShell Pester +# *Test.java / *Tests.cs (case-sensitive convention, handled below) +_TEST_FILENAME_PATTERNS = ( + re.compile(r"^test_.*", re.IGNORECASE), + re.compile(r".*_test\..+$", re.IGNORECASE), + re.compile(r".*\.test\..+$", re.IGNORECASE), + re.compile(r".*\.spec\..+$", re.IGNORECASE), + re.compile(r".*_spec\..+$", re.IGNORECASE), + re.compile(r".*\.tests\.ps1$", re.IGNORECASE), + # Java `FooTest.java` / `FooTests.java`, C# `FooTests.cs` style. Require an + # uppercase-led `Test`/`Tests` immediately before the extension so plain + # words like "greatest"/"contest.cs" do not match. + re.compile(r".*Test\.java$"), + re.compile(r".*Tests\.java$"), + re.compile(r".*Tests\.cs$"), +) + + +def _is_test_path(path: str) -> bool: + """Classify a source path as a test path (case-insensitive, segment-aware). + + Shared by extract.py and symbol_resolution.py so cross-file call resolution + treats test mocks/stubs identically. A path is a test path when: + * any whole path segment equals a known test dir name + (``tests``/``test``/``spec``/``specs``/``__tests__``), or + * the filename matches a known test-file naming convention. + + Conservative on purpose: matches segments/filenames, never raw substrings, + so ``latest.py``, ``src/contest.py`` and ``src/greatest/x.py`` are NON-test. 
+ """ + if not path: + return False + # Accept both POSIX and Windows separators regardless of host OS so the + # classifier is stable across the mixed paths that flow through extraction. + norm = str(path).replace("\\", "/") + pure = PurePosixPath(norm) + segments = list(pure.parts) + # No drive/anchor stripping is needed here: after the "\\"->"/" swap above, + # PurePosixPath treats a Windows drive such as "C:" as an ordinary segment, + # and a drive segment can never equal a test dir name, so the loop below + # simply skips over it. + for segment in segments: + if segment.lower() in _TEST_DIR_SEGMENTS: + return True + # No path segment matched a test dir name; fall back to filename conventions. + filename = pure.name + if not filename: + return False + for pattern in _TEST_FILENAME_PATTERNS: + if pattern.match(filename): + return True + return False + + +def _path_proximity_winner(call_site_file: str, candidate_files: dict[str, str]) -> str | None: + """Pick the candidate whose source file is closest to the call site. + + ``candidate_files`` maps candidate id -> its source_file. Returns a single + winning candidate id, or ``None`` when no proximity tier yields a unique + winner. Tiers, in order: + + 1. same file as the call site, + 2. same directory, + 3. longest common path-prefix (must be a strict, unique maximum). + + Used only as a secondary tie-break after the test/non-test filter, so the + god-node guard still holds when proximity is genuinely ambiguous. + """ + if not call_site_file: + return None + call_norm = str(call_site_file).replace("\\", "/") + call_dir = PurePosixPath(call_norm).parent + + # Tier 1: exact same file. + same_file = [cid for cid, f in candidate_files.items() + if str(f).replace("\\", "/") == call_norm] + if len(same_file) == 1: + return same_file[0] + if len(same_file) > 1: + return None # genuinely ambiguous within one file; bail + + # Tier 2: same directory.
+ same_dir = [cid for cid, f in candidate_files.items() + if PurePosixPath(str(f).replace("\\", "/")).parent == call_dir] + if len(same_dir) == 1: + return same_dir[0] + if len(same_dir) > 1: + return None + + # Tier 3: longest common path-prefix, computed over path segments. The + # winner must be a strict unique maximum, else we bail (guard holds). + call_parts = call_dir.parts + + def _common_prefix_len(f: str) -> int: + parts = PurePosixPath(str(f).replace("\\", "/")).parent.parts + n = 0 + for a, b in zip(call_parts, parts): + if a != b: + break + n += 1 + return n + + scored = sorted( + ((cid, _common_prefix_len(f)) for cid, f in candidate_files.items()), + key=lambda kv: kv[1], + reverse=True, + ) + if not scored: + return None + best = scored[0][1] + winners = [cid for cid, score in scored if score == best] + if len(winners) == 1 and best > 0: + return winners[0] + return None + + +def disambiguate_ambiguous_candidates( + candidates: list[str], + candidate_files: dict[str, str], + call_site_file: str, +) -> str | None: + """Resolve an ambiguous bare-name call to one candidate, or ``None``. + + Shared god-node tie-breaker (#1553) used by both the inline cross-file call + pass in ``extract.py`` and ``symbol_resolution.resolve_cross_file_raw_calls`` + so the heuristics stay aligned across languages. ``candidates`` is the list + of node ids sharing the callee's name; ``candidate_files`` maps each id -> + its source_file. Returns the surviving candidate id only when exactly one + survives; otherwise ``None`` (caller keeps the god-node guard / ``continue``). + + Tie-breakers, in order: + 1. NON-TEST preference. Classify the call site and each candidate as + test/non-test. When the call site is NON-test, drop test candidates. + When the call site IS a test file, prefer test-local candidates + (same file first, then any test candidate); fall back to the full set + only if no test candidate exists. + 2. PATH PROXIMITY over whatever survived step 1. 
+ """ + if not candidates: + return None + if len(candidates) == 1: + return candidates[0] + + call_is_test = _is_test_path(call_site_file) + test_cands = [c for c in candidates if _is_test_path(candidate_files.get(c, ""))] + nontest_cands = [c for c in candidates if c not in set(test_cands)] + + if call_is_test: + # Prefer a test-local definition (same file) first. + call_norm = str(call_site_file).replace("\\", "/") + same_file_test = [ + c for c in test_cands + if str(candidate_files.get(c, "")).replace("\\", "/") == call_norm + ] + if len(same_file_test) == 1: + return same_file_test[0] + if test_cands: + survivors = test_cands + else: + survivors = nontest_cands or candidates + else: + # Non-test call site: drop test mocks/stubs entirely. + survivors = nontest_cands + + if len(survivors) == 1: + return survivors[0] + if not survivors: + return None + + # Step 2: path proximity over the survivors. + return _path_proximity_winner( + call_site_file, + {c: candidate_files.get(c, "") for c in survivors}, + ) + # Bare directory name even when GRAPHIFY_OUT is an absolute path. Used by the # path guards that walk parents looking for the output dir by name, and by the # detect scan-exclude so a custom output dir is never re-ingested as source. diff --git a/graphify/symbol_resolution.py b/graphify/symbol_resolution.py index 14c022438..892f31065 100644 --- a/graphify/symbol_resolution.py +++ b/graphify/symbol_resolution.py @@ -11,6 +11,7 @@ from typing import Any from graphify.ids import make_id as _shared_make_id +from graphify.paths import disambiguate_ambiguous_candidates from graphify.security import sanitize_metadata @@ -319,6 +320,13 @@ def resolve_cross_file_raw_calls( label_index = build_label_index(all_nodes) known_pairs = existing_edge_pairs(all_edges) + # nid -> source_file, for the shared god-node tie-breakers (#1553) so a + # same-named test mock no longer erases a real cross-file call. 
+ nid_to_source_file = { + str(n.get("id")): str(n.get("source_file", "")) + for n in all_nodes + if n.get("id") + } resolved: list[dict[str, Any]] = [] for raw_call in iter_raw_calls(per_file): @@ -328,9 +336,21 @@ def resolve_cross_file_raw_calls( if raw_call.get("is_member_call"): continue candidates = label_index.get(callee.lower(), []) - if len(candidates) != 1: + if not candidates: continue - target = candidates[0] + if len(candidates) == 1: + target: str | None = candidates[0] + else: + # Ambiguous bare name. Apply the shared tie-breakers (non-test + # preference, then path proximity); resolve only if exactly one + # candidate survives, else preserve the god-node guard and skip. + target = disambiguate_ambiguous_candidates( + candidates, + {c: nid_to_source_file.get(c, "") for c in candidates}, + str(raw_call.get("source_file", "")), + ) + if target is None: + continue caller = str(raw_call.get("caller_nid", "")) if not caller: continue diff --git a/tests/test_extract.py b/tests/test_extract.py index 2f01bc0fd..1d70cd9c5 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -558,6 +558,91 @@ def test_cross_file_calls_skip_ambiguous_duplicate_labels(tmp_path): ) +def test_cross_file_call_survives_same_named_test_mock(tmp_path): + """A real cross-file call must NOT be erased by a same-named test mock. + + src/caller.py calls save(); src/service.py defines the real save(); a test + mock save() lives in tests/test_service.py. Before #1553 the ambiguous-name + god-node guard dropped the edge entirely. Now the non-test tie-breaker keeps + exactly one caller->save edge pointing at the SRC definition. 
+ """ + src = tmp_path / "src" + tests = tmp_path / "tests" + src.mkdir() + tests.mkdir() + (src / "service.py").write_text("def save():\n return 'real'\n") + (src / "caller.py").write_text("def run():\n save()\n") + (tests / "test_service.py").write_text("def save():\n return 'mock'\n") + + result = extract( + [src / "caller.py", src / "service.py", tests / "test_service.py"], + cache_root=tmp_path, + ) + nodes = {n["id"]: n for n in result["nodes"]} + save_calls = [ + e for e in result["edges"] + if e["relation"] == "calls" + and nodes[e["source"]]["label"] == "run()" + and nodes[e["target"]]["label"] == "save()" + ] + assert len(save_calls) == 1, f"expected exactly one run->save edge, got {save_calls}" + target_sf = (nodes[save_calls[0]["target"]].get("source_file") or "") + assert "service.py" in target_sf and "test_service.py" not in target_sf, target_sf + + +def test_cross_file_call_god_node_guard_two_real_defs(tmp_path): + """Two genuine NON-test defs of the same name + one caller => ZERO edges. + + Proves #543/#1219 is not reopened by the #1553 tie-breakers: with no test + candidate to drop and no proximity winner, the guard still bails. 
+ """ + pkg_a = tmp_path / "a" + pkg_b = tmp_path / "b" + pkg_c = tmp_path / "c" + for d in (pkg_a, pkg_b, pkg_c): + d.mkdir() + (pkg_a / "svc.py").write_text("def save():\n return 'a'\n") + (pkg_b / "svc.py").write_text("def save():\n return 'b'\n") + (pkg_c / "caller.py").write_text("def run():\n save()\n") + + result = extract( + [pkg_c / "caller.py", pkg_a / "svc.py", pkg_b / "svc.py"], + cache_root=tmp_path, + ) + nodes = {n["id"]: n for n in result["nodes"]} + save_calls = [ + e for e in result["edges"] + if e["relation"] == "calls" + and nodes[e["source"]]["label"] == "run()" + and nodes[e["target"]]["label"] == "save()" + ] + assert save_calls == [], f"god-node guard must bail, got {save_calls}" + + +def test_cross_file_call_survives_many_test_mocks(tmp_path): + """One src def + many same-named test stubs + caller => exactly one src edge.""" + src = tmp_path / "src" + tests = tmp_path / "tests" + src.mkdir() + tests.mkdir() + (src / "service.py").write_text("def save():\n return 'real'\n") + (src / "caller.py").write_text("def run():\n save()\n") + for i in range(5): + (tests / f"thing{i}_test.py").write_text("def save():\n return 'mock'\n") + + paths = [src / "caller.py", src / "service.py"] + sorted(tests.glob("*_test.py")) + result = extract(paths, cache_root=tmp_path) + nodes = {n["id"]: n for n in result["nodes"]} + save_calls = [ + e for e in result["edges"] + if e["relation"] == "calls" + and nodes[e["source"]]["label"] == "run()" + and nodes[e["target"]]["label"] == "save()" + ] + assert len(save_calls) == 1, f"expected one run->save edge, got {save_calls}" + assert "service.py" in (nodes[save_calls[0]["target"]].get("source_file") or "") + + def test_extract_generic_surfaces_tree_sitter_version_mismatch_hint(monkeypatch): """When Language() raises TypeError (e.g. 
old tree-sitter binding meets a new tree-sitter API), the error message should point users at the upgrade diff --git a/tests/test_paths.py b/tests/test_paths.py new file mode 100644 index 000000000..e0e1a2f00 --- /dev/null +++ b/tests/test_paths.py @@ -0,0 +1,99 @@ +"""Tests for graphify.paths — the shared test-path classifier (#1553).""" + +from __future__ import annotations + +import pytest + +from graphify.paths import ( + _is_test_path, + disambiguate_ambiguous_candidates, +) + + +@pytest.mark.parametrize( + "path", + [ + # test dir segments + "tests/foo.py", + "src/tests/foo.py", + "test/foo.go", + "spec/foo.rb", + "specs/foo.rb", + "app/__tests__/foo.js", + "a/b/TESTS/foo.py", # case-insensitive segment + # test filename conventions + "src/test_service.py", + "pkg/service_test.go", + "src/service.test.ts", + "src/service.spec.ts", + "src/service_spec.rb", + "ps/Module.Tests.ps1", + "java/FooTest.java", + "java/FooTests.java", + "cs/FooTests.cs", + # windows separators + "src\\tests\\foo.py", + "src\\service_test.py", + ], +) +def test_is_test_path_positive(path: str) -> None: + assert _is_test_path(path) is True, path + + +@pytest.mark.parametrize( + "path", + [ + "", + "latest.py", + "contest.py", + "src/contest.py", + "src/greatest/x.py", + "src/service.py", + "lib/helper.go", + "src/attestation.py", # "test" only as substring, not a segment + "src/testimony.py", # filename starts with "test" but no underscore + "src/contest/x.py", # "contest" is not "test" + "src/greatest.cs", # ends with "test" but not "Tests.cs" + "src/protest.java", # not "*Test.java" + "config/manifest.json", + ], +) +def test_is_test_path_negative(path: str) -> None: + assert _is_test_path(path) is False, path + + +def test_disambiguate_drops_test_candidate_for_nontest_call_site() -> None: + winner = disambiguate_ambiguous_candidates( + ["src", "mock"], + {"src": "src/service.py", "mock": "tests/test_service.py"}, + "src/caller.py", + ) + assert winner == "src" + + +def 
test_disambiguate_bails_on_two_nontest_candidates() -> None: + winner = disambiguate_ambiguous_candidates( + ["a", "b"], + {"a": "alpha/a.py", "b": "beta/b.py"}, + "pkg/caller.py", + ) + assert winner is None + + +def test_disambiguate_test_call_site_prefers_test_local() -> None: + winner = disambiguate_ambiguous_candidates( + ["src", "local"], + {"src": "src/service.py", "local": "tests/test_service.py"}, + "tests/test_service.py", + ) + assert winner == "local" + + +def test_disambiguate_path_proximity_same_dir() -> None: + # Two non-test candidates; the one in the call site's directory wins. + winner = disambiguate_ambiguous_candidates( + ["near", "far"], + {"near": "pkg/a/service.py", "far": "pkg/b/service.py"}, + "pkg/a/caller.py", + ) + assert winner == "near" diff --git a/tests/test_symbol_resolution.py b/tests/test_symbol_resolution.py index 44f62690a..faa3e52a9 100644 --- a/tests/test_symbol_resolution.py +++ b/tests/test_symbol_resolution.py @@ -100,6 +100,9 @@ def test_resolve_cross_file_raw_calls_skips_member_calls() -> None: def test_resolve_cross_file_raw_calls_skips_ambiguous_duplicate_labels() -> None: + """Two genuine NON-test defs of the same name: the god-node guard must still + hold even with the #1553 tie-breakers, because neither the non-test filter + nor path proximity yields a unique winner (#543/#1219 stays closed).""" per_file = [ { "raw_calls": [ @@ -107,20 +110,103 @@ def test_resolve_cross_file_raw_calls_skips_ambiguous_duplicate_labels() -> None "caller_nid": "caller_run", "callee": "log", "is_member_call": False, - "source_file": "caller.py", + "source_file": "pkg/caller.py", "source_location": "L2", } ] } ] nodes = [ - {"id": "caller_run", "label": "run()", "file_type": "code"}, - {"id": "a_log", "label": "log()", "file_type": "code"}, - {"id": "b_log", "label": "log()", "file_type": "code"}, + {"id": "caller_run", "label": "run()", "file_type": "code", "source_file": "pkg/caller.py"}, + {"id": "a_log", "label": "log()", "file_type": 
"code", "source_file": "alpha/a.py"}, + {"id": "b_log", "label": "log()", "file_type": "code", "source_file": "beta/b.py"}, ] assert resolve_cross_file_raw_calls(per_file, nodes, []) == [] +def test_resolve_cross_file_raw_calls_real_edge_survives_test_mock() -> None: + """A real cross-file call must resolve to the SRC definition even when a + same-named TEST mock exists in the corpus (#1553).""" + per_file = [ + { + "raw_calls": [ + { + "caller_nid": "caller_run", + "callee": "save", + "is_member_call": False, + "source_file": "src/caller.py", + "source_location": "L2", + } + ] + } + ] + nodes = [ + {"id": "caller_run", "label": "run()", "file_type": "code", "source_file": "src/caller.py"}, + {"id": "src_save", "label": "save()", "file_type": "code", "source_file": "src/service.py"}, + {"id": "mock_save", "label": "save()", "file_type": "code", + "source_file": "tests/test_service.py"}, + ] + resolved = resolve_cross_file_raw_calls(per_file, nodes, []) + assert [(e["source"], e["target"]) for e in resolved] == [("caller_run", "src_save")] + assert all(e["target"] != "mock_save" for e in resolved) + + +def test_resolve_cross_file_raw_calls_n_mock_scale() -> None: + """One src def plus many same-named test stubs: exactly one edge to src.""" + per_file = [ + { + "raw_calls": [ + { + "caller_nid": "caller_run", + "callee": "save", + "is_member_call": False, + "source_file": "src/caller.py", + "source_location": "L2", + } + ] + } + ] + nodes = [ + {"id": "caller_run", "label": "run()", "file_type": "code", "source_file": "src/caller.py"}, + {"id": "src_save", "label": "save()", "file_type": "code", "source_file": "src/service.py"}, + {"id": "m1", "label": "save()", "file_type": "code", "source_file": "tests/foo_test.py"}, + {"id": "m2", "label": "save()", "file_type": "code", "source_file": "spec/bar.Tests.ps1"}, + {"id": "m3", "label": "save()", "file_type": "code", "source_file": "test/baz_test.go"}, + {"id": "m4", "label": "save()", "file_type": "code", 
"source_file": "__tests__/q.test.js"}, + ] + resolved = resolve_cross_file_raw_calls(per_file, nodes, []) + assert [(e["source"], e["target"]) for e in resolved] == [("caller_run", "src_save")] + + +def test_resolve_cross_file_raw_calls_call_site_is_test_prefers_test_local() -> None: + """A test file calling save() with both a src def and a test-local def present + resolves to the test-local def (call-site-is-test symmetry, #1553).""" + per_file = [ + { + "raw_calls": [ + { + "caller_nid": "test_caller", + "callee": "save", + "is_member_call": False, + "source_file": "tests/test_service.py", + "source_location": "L5", + } + ] + } + ] + nodes = [ + {"id": "test_caller", "label": "test_it()", "file_type": "code", + "source_file": "tests/test_service.py"}, + {"id": "src_save", "label": "save()", "file_type": "code", "source_file": "src/service.py"}, + {"id": "test_save", "label": "save()", "file_type": "code", + "source_file": "tests/test_service.py"}, + ] + resolved = resolve_cross_file_raw_calls(per_file, nodes, []) + targets = [e["target"] for e in resolved] + assert targets == ["test_save"] + assert "src_save" not in targets + + def test_resolve_cross_file_raw_calls_skips_existing_pair() -> None: per_file = [ { From 3bc3feed545d635a8eec0472109966b05a955636 Mon Sep 17 00:00:00 2001 From: safishamsi Date: Tue, 30 Jun 2026 17:23:46 +0100 Subject: [PATCH 3/6] fix(extract): merge header/impl class fragmentation + C++/ObjC header routing (#1556, #1547) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A class declared in a header (Foo.h/@interface) and defined in its impl (Foo.cpp/Foo.m/@implementation) fragmented into two nodes: _file_stem drops the extension so Foo.h and Foo.cpp share a node id, which _disambiguate_colliding_node_ids then split apart by path — and the two "defs" tripped every resolver's single-definition god-node guard, cascading into missing .h<->.m/.cpp linkage and cross-file/cross-language edges. 
- Routing: a `.h` using `#import` now routes to extract_objc (#1556 bridging headers — extract_c drops `#import` as a preproc_call), and a `.h` with C++-only signals (class/namespace/template/::/access-specifiers) routes to extract_cpp (#1547 — the C grammar has no class_specifier, so a C++ header previously yielded a junk node and lost every method). ObjC sniff keeps priority; a plain C header still routes to extract_c. - Merge: a new _merge_decl_def_classes post-pass collapses the header/impl id-collision onto the header (declaration) variant, modeled on _merge_swift_extensions, gated so it fires ONLY for a clean sibling header/impl pair (same dir, same base stem, exactly one header) — two same-named classes in different directories have different stems and never collide, so they are never merged (god-node guard verified). C++ method definitions retain their `Foo::` qualifier so a `Foo::bar` def keys onto the header declaration (one method node, not two); free functions keep their bare-name ids. Result: one canonical class node per .h/.m or .h/.cpp pair with methods unified, which unblocks the existing member-call resolvers (verified Swift->ObjC calls and Swift `extension` folding now resolve). Strict improvement over v8 (which produced junk/fragmented nodes here, verified). Still open as follow-ups: cross-file C++ #include edge resolution and a C++/ObjC cross-file member-call resolver (a pre-existing gap, not a regression). Reported by @JabberYQ (#1556) and @c0dezer019 (#1547). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/extract.py | 207 ++++++++++++++++++- tests/fixtures/cpp_logger/a/Logger.cpp | 3 + tests/fixtures/cpp_logger/a/Logger.h | 4 + tests/fixtures/cpp_logger/b/Logger.cpp | 3 + tests/fixtures/cpp_logger/b/Logger.h | 4 + tests/fixtures/cpp_paired/Foo.cpp | 5 + tests/fixtures/cpp_paired/Foo.h | 10 + tests/fixtures/cpp_paired/Main.cpp | 7 + tests/fixtures/cpp_samedir/Alpha.h | 4 + tests/fixtures/cpp_samedir/Beta.h | 4 + tests/fixtures/cpp_samedir/plain.h | 7 + tests/fixtures/objc_mixed/Bridging-Header.h | 1 + tests/fixtures/objc_mixed/Widget.h | 4 + tests/fixtures/objc_mixed/Widget.m | 9 + tests/fixtures/objc_mixed/WidgetExtras.swift | 5 + tests/test_languages.py | 165 +++++++++++++++ 16 files changed, 436 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/cpp_logger/a/Logger.cpp create mode 100644 tests/fixtures/cpp_logger/a/Logger.h create mode 100644 tests/fixtures/cpp_logger/b/Logger.cpp create mode 100644 tests/fixtures/cpp_logger/b/Logger.h create mode 100644 tests/fixtures/cpp_paired/Foo.cpp create mode 100644 tests/fixtures/cpp_paired/Foo.h create mode 100644 tests/fixtures/cpp_paired/Main.cpp create mode 100644 tests/fixtures/cpp_samedir/Alpha.h create mode 100644 tests/fixtures/cpp_samedir/Beta.h create mode 100644 tests/fixtures/cpp_samedir/plain.h create mode 100644 tests/fixtures/objc_mixed/Bridging-Header.h create mode 100644 tests/fixtures/objc_mixed/Widget.h create mode 100644 tests/fixtures/objc_mixed/Widget.m create mode 100644 tests/fixtures/objc_mixed/WidgetExtras.swift diff --git a/graphify/extract.py b/graphify/extract.py index 9761a1d20..cb2fcc8e9 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -1846,9 +1846,15 @@ def _get_cpp_func_name(node, source: bytes) -> str | None: if node.type in ("field_identifier", "destructor_name", "operator_name"): return _read_text(node, source) if node.type == "qualified_identifier": - name_node = node.child_by_field_name("name") - if 
name_node: - return _read_text(name_node, source) + # An out-of-class DEFINITION (`void Foo::bar() {}`) carries a + # qualified_identifier declarator. Retaining the `Foo::` qualifier makes + # _make_id(stem, "Foo::bar") normalize to the same id as the in-class + # member _make_id(class_nid, "bar"), so the decl in Foo.h and the def in + # Foo.cpp resolve to ONE method node instead of two (#1547). The full + # qualified text also handles nested scopes (`A::B::bar`). Free functions + # never have a qualified_identifier here, so their bare-name ids are + # unchanged; only qualified definitions shift onto their owning class. + return _read_text(node, source) decl = node.child_by_field_name("declarator") if decl: return _get_cpp_func_name(decl, source) @@ -9345,6 +9351,144 @@ def walk_imports(node) -> None: return new_edges +# Header / implementation file-extension pairing for the decl/def class merge. +_DECLDEF_HEADER_SUFFIXES = frozenset({".h", ".hpp", ".hh", ".hxx"}) +_DECLDEF_IMPL_SUFFIXES = frozenset({".m", ".mm", ".cpp", ".cc", ".cxx", ".c"}) + + +def _decldef_class_stem(source_file: str) -> tuple[str, str] | None: + """Return ``(dir, base_stem)`` for a header/impl source file, else None. + + The base stem strips an ObjC category suffix (``Foo+Cat.m`` -> ``Foo``) so a + category implementation pairs with its ``Foo.h`` declaration. Files with an + extension that is neither a header nor an impl extension return None and are + never considered for the merge. 
+ """ + if not source_file: + return None + p = Path(source_file) + suffix = p.suffix.lower() + if suffix not in _DECLDEF_HEADER_SUFFIXES and suffix not in _DECLDEF_IMPL_SUFFIXES: + return None + stem = p.stem.split("+", 1)[0] # ObjC category: Foo+Cat -> Foo + if not stem: + return None + return (str(p.parent), stem) + + +def _merge_decl_def_classes( + all_nodes: list[dict], + all_edges: list[dict], +) -> None: + """Merge a class (and its methods) declared in a header with its definition in + a sibling impl file into ONE node, for C/C++/ObjC (#1547, #1556). + + A class declared in ``Foo.h`` (``class Foo`` / ``@interface Foo``) and defined + in the sibling ``Foo.cpp`` / ``Foo.m`` (``@implementation Foo``, plus — after + the C++ qualified-name fix — out-of-class method definitions ``Foo::bar``) + produces TWO nodes per symbol. Both are keyed off the file *stem*, and + ``_file_stem`` drops the extension, so the header symbol and its impl + counterpart get the IDENTICAL id and differ only in ``source_file`` and label + (the C++ def label is ``Foo::bar()`` vs the decl's ``bar``; the ObjC impl class + label equals the interface's). Left alone, ``_disambiguate_colliding_node_ids`` + SPLITS those id-collisions apart by path, fragmenting one class into two def + nodes — which then trips every resolver's single-definition god-node guard + (``len(defs) != 1`` -> bail), cascading into lost .h<->.m/.cpp linkage and dead + cross-file calls. + + This pass runs BEFORE disambiguation and collapses each such id-collision to + ONE node — the header (declaration) variant, consistent with the #1475 + header_remaps direction — so disambiguation sees a single source_file per id + and leaves it alone, and the downstream resolvers see ONE definition. Because + the colliding nodes already share an id, no edge re-pointing is needed: every + edge that referenced the impl symbol already points at the surviving id. We + only drop the redundant duplicate node and prefer the header's label. 
+ + GOD-NODE GUARDS (false merges are the main risk): + + * Collapse fires ONLY when every node in an id-collision group comes from a + SIBLING header/impl set — same directory, same base stem (ObjC categories + ``Foo+Cat.m`` compare by the stem before ``+``), header extension paired + with impl extension — AND the group contains exactly ONE header file. + * Two unrelated ``class Logger`` in DIFFERENT directories never collide on id + (the id embeds the full file stem / directory path), so they are never + grouped and never merge. Two same-named classes in the SAME directory but + different base stems likewise key to different ids. Any id-collision that + is NOT a clean single-header sibling set is left untouched for + disambiguation to split (the conservative default). + + The class and its method/field members fold in together: members are keyed + ``_make_id(class_id, name)`` (ObjC) or, for an out-of-class C++ definition, + ``_make_id(stem, "Foo::bar")`` which normalizes to the same id as the in-class + member ``_make_id(class_id, "bar")``. So every decl/def member pair is itself an + id-collision across the same sibling file set and collapses by the same rule. + """ + # Group every code node by id, recording the distinct source files involved. + by_id: dict[str, list[dict]] = {} + for n in all_nodes: + if n.get("file_type") != "code": + continue + nid = n.get("id") + sf = str(n.get("source_file", "")) + if not isinstance(nid, str) or not nid or not sf: + continue + by_id.setdefault(nid, []).append(n) + + # Identify, per surviving id, which node to keep (header preferred). We can't + # mutate all_nodes mid-scan, so collect a set of node object ids to drop. + drop_objs: set[int] = set() + for nid, group in by_id.items(): + if len(group) < 2: + continue + # The distinct source files of this collision must form a clean sibling + # header/impl set with exactly one header. 
Each file must parse as a + # header/impl file (others -> bail), share one directory + base stem. + sibling_keys: set[tuple[str, str]] = set() + headers: list[dict] = [] + ok = True + for node in group: + sf = str(node.get("source_file", "")) + ds = _decldef_class_stem(sf) + if ds is None: + ok = False + break + sibling_keys.add(ds) + if Path(sf).suffix.lower() in _DECLDEF_HEADER_SUFFIXES: + headers.append(node) + if not ok: + continue + # All from one (dir, base_stem) sibling family, with a UNIQUE header. + if len(sibling_keys) != 1 or len(headers) != 1: + continue + keeper = headers[0] + for node in group: + if node is not keeper: + drop_objs.add(id(node)) + + if not drop_objs: + return + + # Drop the redundant duplicate nodes. The surviving (header) node keeps its + # own label/source_file; edges are unchanged because the id is identical. Then + # de-dup any now-identical edges (e.g. the impl file's `contains`/`method` + # edge that duplicates the header's after the collapse). + all_nodes[:] = [n for n in all_nodes if id(n) not in drop_objs] + + seen_keys: set[tuple] = set() + rewritten: list[dict] = [] + for e in all_edges: + src = e.get("source") + tgt = e.get("target") + if src == tgt: + continue + k = (src, tgt, e.get("relation"), e.get("context")) + if k in seen_keys: + continue + seen_keys.add(k) + rewritten.append(e) + all_edges[:] = rewritten + + def _merge_swift_extensions( per_file: list[dict], all_nodes: list[dict], @@ -13486,7 +13630,14 @@ def _body_of(block): # belongs to extract_objc, not extract_c). `@property` is deliberately excluded: it # doubles as a Doxygen comment command and ObjC properties only ever live inside an # @interface/@protocol anyway, so the stronger directives already cover them. -_OBJC_HEADER_MARKERS = (b"@interface", b"@protocol", b"@implementation", b"@import") +# +# `#import` is included because an ObjC *bridging* header is often nothing but +# `#import "X.h"` lines with no @interface (#1556). 
Routed to extract_c it parses +# `#import` as a `preproc_call` (not `preproc_include`), so every import edge is +# dropped and the header is isolated. `#import` is an ObjC-only directive (illegal +# in C and C++), so this won't hijack genuine C/C++ headers, and extract_objc +# resolves quoted imports via _resolve_c_include_path. +_OBJC_HEADER_MARKERS = (b"@interface", b"@protocol", b"@implementation", b"@import", b"#import") def _is_objc_header(path: Path) -> bool: @@ -13504,6 +13655,35 @@ def _is_objc_header(path: Path) -> bool: return any(marker in head for marker in _OBJC_HEADER_MARKERS) +# C++-only signals. None of these are valid in a plain C header, so finding one +# in a `.h` is a high-confidence signal the header is C++ (#1547). The C grammar +# has no class_specifier, so a `class Foo { ... };` header routed to extract_c +# loses the class and its method prototypes (a junk `foo_foo` node + a sourceless +# `class` stub); routing to extract_cpp recovers the real type. Kept CONSERVATIVE: +# a plain C header with none of these stays on extract_c. ObjC sniffing keeps +# priority (an ObjC header can legitimately contain `::`/`class` inside an inline +# C++ block when compiled as Objective-C++). +_CPP_HEADER_MARKERS = ( + b"class ", b"namespace ", b"template", b"::", + b"public:", b"private:", b"protected:", +) + + +def _is_cpp_header(path: Path) -> bool: + """Whether a `.h` file is C++ rather than plain C (#1547). + + Mirrors `_is_objc_header`: sniffs for a C++-only token. Used only to reroute + a `.h` from extract_c to extract_cpp when no ObjC marker is present (ObjC has + priority). Conservative by construction — a plain C header matches nothing + here and keeps its existing extract_c routing. 
+ """ + try: + head = path.read_bytes()[:256 * 1024] + except OSError: + return False + return any(marker in head for marker in _CPP_HEADER_MARKERS) + + def _get_extractor(path: Path) -> Any | None: """Return the correct extractor function for a file, or None if unsupported.""" if path.name.endswith(".blade.php"): @@ -13520,8 +13700,15 @@ def _get_extractor(path: Path) -> Any | None: return extract_package_manifest # `.h` is C/C++/ObjC-ambiguous; route Objective-C headers to extract_objc # (the suffix map sends `.h` to extract_c, which can't read @interface etc.). - if path.suffix == ".h" and _is_objc_header(path): - return extract_objc + # ObjC sniffing has priority over the C++ sniff: an Objective-C++ header can + # contain both `@interface` and inline C++ (`::`), and it must parse as ObjC. + if path.suffix == ".h": + if _is_objc_header(path): + return extract_objc + # A C++ class header routed to extract_c loses the class entirely (the C + # grammar has no class_specifier). Reroute to extract_cpp (#1547). + if _is_cpp_header(path): + return extract_cpp return _DISPATCH.get(path.suffix) @@ -13794,6 +13981,14 @@ def extract( _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root) + # Merge a header-declared class (and its methods) with its sibling-impl + # definition into ONE node (C/C++/ObjC #1547/#1556). Runs BEFORE the id-remap + # below: a header symbol and its impl counterpart share an id only while both + # still carry the raw file-stem prefix; the per-file prefix remap then diverges + # them (foo_h vs foo_cpp), so the collapse must happen first. Collapsing here + # also means disambiguation sees one source_file per id and won't split them. 
+ _merge_decl_def_classes(all_nodes, all_edges) + # Remap file node IDs from absolute-path-derived to the canonical # {parent_dir}_{stem} spec form so (a) graph.json edge endpoints are stable # across machines (#502) and (b) AST file nodes match the IDs semantic diff --git a/tests/fixtures/cpp_logger/a/Logger.cpp b/tests/fixtures/cpp_logger/a/Logger.cpp new file mode 100644 index 000000000..ee35493ee --- /dev/null +++ b/tests/fixtures/cpp_logger/a/Logger.cpp @@ -0,0 +1,3 @@ +#include "Logger.h" + +void Logger::log() {} diff --git a/tests/fixtures/cpp_logger/a/Logger.h b/tests/fixtures/cpp_logger/a/Logger.h new file mode 100644 index 000000000..d858d8007 --- /dev/null +++ b/tests/fixtures/cpp_logger/a/Logger.h @@ -0,0 +1,4 @@ +class Logger { +public: + void log(); +}; diff --git a/tests/fixtures/cpp_logger/b/Logger.cpp b/tests/fixtures/cpp_logger/b/Logger.cpp new file mode 100644 index 000000000..ee35493ee --- /dev/null +++ b/tests/fixtures/cpp_logger/b/Logger.cpp @@ -0,0 +1,3 @@ +#include "Logger.h" + +void Logger::log() {} diff --git a/tests/fixtures/cpp_logger/b/Logger.h b/tests/fixtures/cpp_logger/b/Logger.h new file mode 100644 index 000000000..d858d8007 --- /dev/null +++ b/tests/fixtures/cpp_logger/b/Logger.h @@ -0,0 +1,4 @@ +class Logger { +public: + void log(); +}; diff --git a/tests/fixtures/cpp_paired/Foo.cpp b/tests/fixtures/cpp_paired/Foo.cpp new file mode 100644 index 000000000..0fc283e05 --- /dev/null +++ b/tests/fixtures/cpp_paired/Foo.cpp @@ -0,0 +1,5 @@ +#include "Foo.h" + +void Foo::bar() { + value = 1; +} diff --git a/tests/fixtures/cpp_paired/Foo.h b/tests/fixtures/cpp_paired/Foo.h new file mode 100644 index 000000000..60998e935 --- /dev/null +++ b/tests/fixtures/cpp_paired/Foo.h @@ -0,0 +1,10 @@ +#ifndef FOO_H +#define FOO_H + +class Foo { +public: + void bar(); + int value; +}; + +#endif diff --git a/tests/fixtures/cpp_paired/Main.cpp b/tests/fixtures/cpp_paired/Main.cpp new file mode 100644 index 000000000..265c19368 --- /dev/null +++ 
b/tests/fixtures/cpp_paired/Main.cpp @@ -0,0 +1,7 @@ +#include "Foo.h" + +int main() { + Foo f; + f.bar(); + return 0; +} diff --git a/tests/fixtures/cpp_samedir/Alpha.h b/tests/fixtures/cpp_samedir/Alpha.h new file mode 100644 index 000000000..c9875916a --- /dev/null +++ b/tests/fixtures/cpp_samedir/Alpha.h @@ -0,0 +1,4 @@ +class Dup { +public: + void a(); +}; diff --git a/tests/fixtures/cpp_samedir/Beta.h b/tests/fixtures/cpp_samedir/Beta.h new file mode 100644 index 000000000..7ba422cd9 --- /dev/null +++ b/tests/fixtures/cpp_samedir/Beta.h @@ -0,0 +1,4 @@ +class Dup { +public: + void b(); +}; diff --git a/tests/fixtures/cpp_samedir/plain.h b/tests/fixtures/cpp_samedir/plain.h new file mode 100644 index 000000000..ef9517466 --- /dev/null +++ b/tests/fixtures/cpp_samedir/plain.h @@ -0,0 +1,7 @@ +#ifndef PLAIN_H +#define PLAIN_H + +int add(int a, int b); +struct Point { int x; int y; }; + +#endif diff --git a/tests/fixtures/objc_mixed/Bridging-Header.h b/tests/fixtures/objc_mixed/Bridging-Header.h new file mode 100644 index 000000000..a46f910ed --- /dev/null +++ b/tests/fixtures/objc_mixed/Bridging-Header.h @@ -0,0 +1 @@ +#import "Widget.h" diff --git a/tests/fixtures/objc_mixed/Widget.h b/tests/fixtures/objc_mixed/Widget.h new file mode 100644 index 000000000..6544b015b --- /dev/null +++ b/tests/fixtures/objc_mixed/Widget.h @@ -0,0 +1,4 @@ +@interface Widget +- (void)render; +- (void)refresh; +@end diff --git a/tests/fixtures/objc_mixed/Widget.m b/tests/fixtures/objc_mixed/Widget.m new file mode 100644 index 000000000..fe4fff3be --- /dev/null +++ b/tests/fixtures/objc_mixed/Widget.m @@ -0,0 +1,9 @@ +#import "Widget.h" + +@implementation Widget +- (void)render { + [self refresh]; +} +- (void)refresh { +} +@end diff --git a/tests/fixtures/objc_mixed/WidgetExtras.swift b/tests/fixtures/objc_mixed/WidgetExtras.swift new file mode 100644 index 000000000..39501b819 --- /dev/null +++ b/tests/fixtures/objc_mixed/WidgetExtras.swift @@ -0,0 +1,5 @@ +extension Widget { + 
func describe() -> String { + return "widget" + } +} diff --git a/tests/test_languages.py b/tests/test_languages.py index fe000d8ee..3393ab164 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -2582,3 +2582,168 @@ def test_systemverilog_no_dangling_edges(): for e in r["edges"]: assert e["source"] in node_ids, f"dangling source: {e}" assert e["target"] in node_ids, f"dangling target: {e}" + + +# ── Header/impl class merge + .h routing (#1547 C++, #1556 ObjC/Swift) ───────── +from graphify.extract import ( + extract as _extract_corpus, + _get_extractor, + _is_cpp_header, + _is_objc_header, +) + + +def _corpus(*relpaths): + """Run the full extract() pipeline on fixture files (absolute, resolved + paths so the per-file id-remap behaves like real usage), no shared cache.""" + import tempfile + paths = [(FIXTURES / rp).resolve() for rp in relpaths] + with tempfile.TemporaryDirectory() as td: + return _extract_corpus(paths, cache_root=Path(td)) + + +def _nodes_with_label(r, label): + return [n for n in r["nodes"] if n["label"] == label] + + +def _assert_no_dangling(r): + ids = {n["id"] for n in r["nodes"]} + for e in r["edges"]: + assert e["source"] in ids, f"dangling source: {e}" + assert e["target"] in ids, f"dangling target: {e}" + + +# --- #1547: C++ paired header/impl -------------------------------------------- + +def test_cpp_header_routes_to_cpp_extractor(): + """A `.h` with a C++ class must route to extract_cpp, not extract_c (which has + no class_specifier and would drop the class entirely).""" + p = (FIXTURES / "cpp_paired" / "Foo.h").resolve() + assert _get_extractor(p).__name__ == "extract_cpp" + assert _is_cpp_header(p) + + +def test_plain_c_header_stays_on_c_extractor(): + """A plain C header (no C++ signal) must keep its extract_c routing.""" + p = (FIXTURES / "cpp_samedir" / "plain.h").resolve() + assert not _is_cpp_header(p) + assert _get_extractor(p).__name__ == "extract_c" + + +def test_cpp_paired_single_class_node(): + """Foo.h 
(class) + Foo.cpp (Foo::bar def) + Main.cpp must yield exactly ONE + Foo class node — not a foo_h + foo_cpp pair, and no junk `class` stub.""" + r = _corpus("cpp_paired/Foo.h", "cpp_paired/Foo.cpp", "cpp_paired/Main.cpp") + foos = _nodes_with_label(r, "Foo") + assert len(foos) == 1, f"expected one Foo, got {[n['id'] for n in foos]}" + assert not _nodes_with_label(r, "class"), "no sourceless `class` stub should exist" + assert not _nodes_with_label(r, "foo_foo") + + +def test_cpp_paired_method_decl_and_def_are_one_node(): + """`void bar();` in Foo.h and `void Foo::bar() {}` in Foo.cpp must collapse to + ONE method node owned by the single Foo class.""" + r = _corpus("cpp_paired/Foo.h", "cpp_paired/Foo.cpp", "cpp_paired/Main.cpp") + foo = _nodes_with_label(r, "Foo")[0]["id"] + method_targets = { + e["target"] for e in r["edges"] + if e["source"] == foo and e["relation"] in ("method", "defines", "contains") + } + bar_nodes = [n for n in r["nodes"] if n["id"] in method_targets and n["label"] in ("bar", "Foo::bar()")] + # There must be exactly one node representing bar (decl and def merged). 
+ bar_ids = {n["id"] for n in r["nodes"] if n["label"] in ("bar", "Foo::bar()")} + assert len(bar_ids) == 1, f"bar decl/def should be one node, got {bar_ids}" + assert bar_nodes, "the merged bar node should be a member of Foo" + + +def test_cpp_paired_includes_resolve_to_real_header(): + """Foo.cpp and Main.cpp `#include "Foo.h"` must resolve to the real Foo.h file + node (no dangling import).""" + r = _corpus("cpp_paired/Foo.h", "cpp_paired/Foo.cpp", "cpp_paired/Main.cpp") + ids = {n["id"] for n in r["nodes"]} + foo_h = _nodes_with_label(r, "Foo.h")[0]["id"] + imports = [e for e in r["edges"] if e["relation"] == "imports"] + assert len(imports) >= 2 + for e in imports: + assert e["target"] in ids, f"dangling import target: {e}" + assert any(e["target"] == foo_h for e in imports), "includes should target Foo.h" + + +def test_cpp_paired_no_dangling_edges(): + r = _corpus("cpp_paired/Foo.h", "cpp_paired/Foo.cpp", "cpp_paired/Main.cpp") + _assert_no_dangling(r) + + +# --- #1556: ObjC paired header/impl + bridging header ------------------------- + +def test_objc_header_with_import_routes_to_objc(): + """A bridging header that is only `#import "X.h"` (no @interface) must route to + extract_objc; extract_c parses `#import` as preproc_call and drops the edge.""" + p = (FIXTURES / "objc_mixed" / "Bridging-Header.h").resolve() + assert _is_objc_header(p) + assert _get_extractor(p).__name__ == "extract_objc" + + +def test_objc_paired_single_class_methods_not_duplicated(): + """Widget.h (@interface) + Widget.m (@implementation) -> ONE Widget class node + with its methods present once each.""" + r = _corpus("objc_mixed/Widget.h", "objc_mixed/Widget.m") + widgets = _nodes_with_label(r, "Widget") + assert len(widgets) == 1, f"expected one Widget, got {[n['id'] for n in widgets]}" + render = _nodes_with_label(r, "-render") + refresh = _nodes_with_label(r, "-refresh") + assert len(render) == 1, f"-render duplicated: {render}" + assert len(refresh) == 1, f"-refresh duplicated: 
{refresh}" + + +def test_objc_bridging_header_not_isolated(): + """A bridging header of only `#import "Widget.h"` must produce an imports edge + to the real Widget.h node (not be an isolated node).""" + r = _corpus("objc_mixed/Widget.h", "objc_mixed/Widget.m", "objc_mixed/Bridging-Header.h") + bridge = _nodes_with_label(r, "Bridging-Header.h")[0]["id"] + widget_h = _nodes_with_label(r, "Widget.h")[0]["id"] + out = [e for e in r["edges"] if e["source"] == bridge and e["relation"] == "imports"] + assert out, "bridging header should emit an imports edge" + assert any(e["target"] == widget_h for e in out), "bridging import should target Widget.h" + + +def test_objc_paired_no_dangling_edges(): + r = _corpus("objc_mixed/Widget.h", "objc_mixed/Widget.m", "objc_mixed/Bridging-Header.h") + _assert_no_dangling(r) + + +# --- #1556: Swift extension folds onto canonical ObjC class ------------------- + +def test_swift_extension_folds_onto_objc_class(): + """`extension Widget` in Swift over an ObjC `Widget` must fold onto the single + canonical Widget node, with its members anchored there.""" + r = _corpus("objc_mixed/Widget.h", "objc_mixed/Widget.m", "objc_mixed/WidgetExtras.swift") + widgets = _nodes_with_label(r, "Widget") + assert len(widgets) == 1, f"expected one Widget, got {[n['id'] for n in widgets]}" + wid = widgets[0]["id"] + method_targets = {e["target"] for e in r["edges"] if e["relation"] == "method" and e["source"] == wid} + labels = {n["label"] for n in r["nodes"] if n["id"] in method_targets} + assert any("describe" in l for l in labels), f"Swift extension method should anchor on Widget, got {labels}" + _assert_no_dangling(r) + + +# --- god-node guard negatives ------------------------------------------------- + +def test_decldef_merge_does_not_merge_across_directories(): + """Two unrelated `class Logger` in DIFFERENT directories (each its own .h/.cpp) + must NOT merge — assert TWO distinct Logger nodes.""" + r = _corpus( + "cpp_logger/a/Logger.h", 
"cpp_logger/a/Logger.cpp", + "cpp_logger/b/Logger.h", "cpp_logger/b/Logger.cpp", + ) + loggers = _nodes_with_label(r, "Logger") + assert len(loggers) == 2, f"cross-dir Loggers must stay distinct, got {[n['id'] for n in loggers]}" + assert len({n["id"] for n in loggers}) == 2 + + +def test_decldef_merge_does_not_merge_same_name_same_dir_distinct_files(): + """Two same-named `class Dup` in the SAME dir but different base stems + (Alpha.h, Beta.h) must stay distinct (no unique header/impl sibling pair).""" + r = _corpus("cpp_samedir/Alpha.h", "cpp_samedir/Beta.h") + dups = _nodes_with_label(r, "Dup") + assert len(dups) == 2, f"same-dir distinct Dups must stay distinct, got {[n['id'] for n in dups]}" From 49252d3cb754e7d6e0241483ba732c4bfa7989e1 Mon Sep 17 00:00:00 2001 From: safishamsi Date: Tue, 30 Jun 2026 18:30:05 +0100 Subject: [PATCH 4/6] feat(extract): cross-file member-call resolution for C++ and ObjC (#1547, #1556) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Connects paired classes across files: Main.cpp's `Foo f; f.bar()` now resolves to Foo::bar, and ObjC `Foo *f = [[Foo alloc] init]; [f doThing]` to Foo's doThing — the "connect with other classes" goal of #1547/#1556. Design grounded in prior-art research (ctags qualified-name matching, Doxygen's name-keyed false-edge failure modes, PAIGE's receiver-type approach, Clang USR): resolve by RECEIVER TYPE, never bare name, and skip when the type can't be inferred rather than guess (a false call edge / god-node is worse than a missing one). Mirrors the existing Swift/Python/Ruby/TS member-call resolvers. - C++ extractor now captures the member-call receiver (field_expression / qualified_identifier / pointer access) and builds a per-file type table from local declarations (`Foo f;`, `Foo* f;`, `Foo *f = ...;`); emits raw_calls. 
- ObjC extractor emits raw_calls for message sends with the receiver + selector and a type table from `Foo *f = ...;` locals (existing in-file selector / alloc-init / dot-syntax / @selector matching preserved). - New _resolve_cpp_member_calls / _resolve_objc_member_calls, registered for their suffixes. Receiver tiers: `Foo::bar()` / capitalized ObjC receiver and this/self/super (enclosing class) -> EXTRACTED; local-var-typed -> INFERRED. Single-definition god-node guard (skip unless exactly one type def matches); the just-shipped decl/def class merge makes a paired class one def so the guard resolves it. Verified: a.run() -> A::run only (not a same-named B::run); an uninferable receiver with run() in two classes emits zero edges (no fan-out); ObjC [f doThing] -> Foo only. - build.py: the cross-language INFERRED-call prune treated .h/.cpp/.m as different families and dropped header/impl interop calls; unified the C family (.c .h .cc .cpp .hpp .cxx .hh .hxx .cu .cuh .metal .m .mm) so a .cpp/.m call to a .h-declared method survives. Still open (tracked on #1547/#1556): the file-level `#include` edge can stay uncanonicalized when the project root isn't symlink-resolved (the extract() id-remap `continue`s on a /var-vs-/private/var mismatch) — the class connection above is robust to it; include-reachability candidate narrowing and ObjC dynamic-dispatch/id-typed receivers also deferred (expected low ObjC recall, per the research). Reported by @c0dezer019 (#1547) and @JabberYQ (#1556). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/build.py | 11 +- graphify/extract.py | 419 +++++++++++++++++++++++- tests/test_cpp_objc_cross_file_calls.py | 263 +++++++++++++++ 3 files changed, 688 insertions(+), 5 deletions(-) create mode 100644 tests/test_cpp_objc_cross_file_calls.py diff --git a/graphify/build.py b/graphify/build.py index 951ea12e7..eb7de419f 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -547,8 +547,15 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat ".ts": "js", ".tsx": "js", ".go": "go", ".rs": "rs", ".java": "jvm", ".kt": "jvm", ".scala": "jvm", ".groovy": "jvm", - ".c": "c", ".h": "c", ".cc": "cpp", ".cpp": "cpp", ".hpp": "cpp", - ".cu": "cpp", ".cuh": "cpp", ".metal": "cpp", + # C, C++, and ObjC interoperate within one compilation unit: a method + # declared in a shared `.h` is defined/called from a `.c`/`.cpp`/`.m` + # sibling, so a cross-file INFERRED call from impl to its header decl + # is legitimate, not a phantom name-collision across languages. Treat + # the whole C family as one so the receiver-typed C++/ObjC member-call + # resolvers' header-targeting edges survive build (#1547/#1556). + ".c": "c", ".h": "c", ".cc": "c", ".cpp": "c", ".hpp": "c", + ".cxx": "c", ".hh": "c", ".hxx": "c", + ".cu": "c", ".cuh": "c", ".metal": "c", ".m": "c", ".mm": "c", ".rb": "rb", ".php": "php", ".cs": "cs", ".swift": "swift", ".lua": "lua", } src_ext = Path(G.nodes[src].get("source_file") or "").suffix.lower() diff --git a/graphify/extract.py b/graphify/extract.py index cb2fcc8e9..e0e97ef73 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -1864,6 +1864,100 @@ def _get_cpp_func_name(node, source: bytes) -> str | None: return None +def _cpp_declarator_name(node, source: bytes) -> str | None: + """Return the bare variable name from a C++ declaration declarator, unwrapping + pointer/reference/init wrappers (``*f``, ``&r``, ``f = Foo()``). 
Returns None + for anything that isn't a plain named local (arrays, function pointers, + structured bindings) so the type table never records a guessed receiver.""" + t = node.type + if t == "identifier": + return _read_text(node, source) + if t in ("pointer_declarator", "reference_declarator", "init_declarator"): + inner = node.child_by_field_name("declarator") + if inner is None: + for c in node.children: + if c.type in ("identifier", "pointer_declarator", + "reference_declarator"): + inner = c + break + if inner is not None: + return _cpp_declarator_name(inner, source) + return None + + +def _cpp_local_var_types(body_node, source: bytes, table: dict[str, str]) -> None: + """Collect ``var -> ClassName`` from local variable declarations in a C++ + function body, for receiver-type inference in the cross-file member-call pass + (#1547). Handles ``Foo f;``, ``Foo* f;``, ``Foo *f = ...;``, ``Foo f = Foo();``. + + Only a class-like (``type_identifier``/``qualified_identifier``) type with a + single named declarator is recorded — PRECISION over recall: a built-in type + (``int x``), an ambiguous multi-declarator line, or an un-nameable declarator + contributes nothing rather than a guess. A qualified type ``ns::Foo`` records + its simple tail ``Foo`` so it keys to the type's definition node label. + """ + stack = [body_node] + while stack: + n = stack.pop() + if n.type in ("function_definition", "lambda_expression"): + # Don't descend into a nested function/lambda: its locals are scoped + # away and would pollute this body's table. 
+ if n is not body_node: + continue + if n.type == "declaration": + type_node = n.child_by_field_name("type") + if type_node is not None and type_node.type in ( + "type_identifier", "qualified_identifier" + ): + type_name = _read_text(type_node, source).split("::")[-1].strip() + declarators = [ + c for c in n.children + if c.type in ("identifier", "pointer_declarator", + "reference_declarator", "init_declarator") + ] + # A single declarator only: `Foo a, b;` is ambiguous to attribute + # to one receiver name cleanly, so skip multi-declarator lines. + if type_name and type_name[:1].isupper() and len(declarators) == 1: + var = _cpp_declarator_name(declarators[0], source) + if var and var not in table: + table[var] = type_name + for c in n.children: + stack.append(c) + + +def _objc_local_var_types(body_node, source: bytes, table: dict[str, str]) -> None: + """Collect ``var -> ClassName`` from ObjC local declarations (``Foo *f = ...;``) + in a method body, for receiver typing in the cross-file message-send pass + (#1556). Only a capitalized ``type_identifier`` with a single named declarator + is recorded; a built-in/lower-cased type or an un-nameable declarator is skipped + (precision over recall). Reuses the C++ declarator unwrapper (identical grammar). 
+ """ + stack = [body_node] + while stack: + n = stack.pop() + if n.type == "method_definition" and n is not body_node: + continue + if n.type == "declaration": + type_node = n.child_by_field_name("type") + if type_node is None: + for c in n.children: + if c.type == "type_identifier": + type_node = c + break + if type_node is not None and type_node.type == "type_identifier": + type_name = _read_text(type_node, source).strip() + declarators = [ + c for c in n.children + if c.type in ("identifier", "pointer_declarator", "init_declarator") + ] + if type_name and type_name[:1].isupper() and len(declarators) == 1: + var = _cpp_declarator_name(declarators[0], source) + if var and var not in table: + table[var] = type_name + for c in n.children: + stack.append(c) + + # ── JS/TS extra walk for arrow functions ────────────────────────────────────── def _find_require_call(value_node): @@ -3918,11 +4012,31 @@ def walk_calls(node, caller_nid: str) -> None: if func_node: if func_node.type == "identifier": callee_name = _read_text(func_node, source) - elif func_node.type in ("field_expression", "qualified_identifier"): + elif func_node.type == "field_expression": + # `f.bar()` / `f->bar()` / `this->bar()`: receiver is the + # `argument` (object) field, callee is the `field` (#1547). + # Capture a simple-identifier (or `this`) receiver so the + # cross-file pass can resolve it through the file's type + # table; chained receivers (`a.b.method()`) are left to bail. + is_member_call = True + name = func_node.child_by_field_name("field") + if name: + callee_name = _read_text(name, source) + obj = func_node.child_by_field_name("argument") + if obj is not None and obj.type == "identifier": + member_receiver = _read_text(obj, source) + elif obj is not None and obj.type == "this": + member_receiver = "this" + elif func_node.type == "qualified_identifier": + # `Foo::bar()`: the scope (`Foo`) is the receiver type named + # explicitly in source (EXTRACTED), the name is the callee. 
is_member_call = True - name = func_node.child_by_field_name("field") or func_node.child_by_field_name("name") + name = func_node.child_by_field_name("name") if name: callee_name = _read_text(name, source) + scope = func_node.child_by_field_name("scope") + if scope is not None: + member_receiver = _read_text(scope, source) elif config.ts_module == "tree_sitter_java" and node.type == "object_creation_expression": # `new Foo(...)` — the constructed type is in the `type` field, not # `name`, so the generic path misses it (#1373). Reduce a qualified @@ -4023,6 +4137,12 @@ def walk_calls(node, caller_nid: str) -> None: rc_entry["receiver_type"] = ruby_var_types.get( caller_nid, {} ).get(member_receiver) + # Tag the C++ raw_call's language so the cross-file C++ resolver + # claims it unambiguously: a `.h` file routes to extract_cpp or + # extract_objc by content, and both resolvers see `.h` in their + # suffix sets, so a source_file suffix alone can't separate them. + if config.ts_module == "tree_sitter_cpp": + rc_entry["lang"] = "cpp" raw_calls.append(rc_entry) # Helper function calls: config('foo.bar') → uses_config edge to "foo" @@ -4156,6 +4276,14 @@ def walk_calls(node, caller_nid: str) -> None: for caller_nid, body_node in function_bodies: ruby_var_types[caller_nid] = _ruby_local_class_bindings(body_node, source) + # C++: build the per-file `var -> ClassName` table from local declarations in + # every function body so the cross-file member-call pass can type a receiver + # (#1547). File-scoped (not per-body): a later body's `Foo f;` doesn't clobber + # an earlier binding (`var not in table`), keeping resolution conservative. 
+ if config.ts_module == "tree_sitter_cpp": + for _caller_nid, body_node in function_bodies: + _cpp_local_var_types(body_node, source, type_table) + for caller_nid, body_node in function_bodies: walk_calls(body_node, caller_nid) @@ -4203,6 +4331,8 @@ def walk_calls(node, caller_nid: str) -> None: result["swift_type_table"] = {"path": str_path, "table": type_table} elif config.ts_module in ("tree_sitter_javascript", "tree_sitter_typescript"): result["ts_type_table"] = {"path": str_path, "table": type_table} + elif config.ts_module == "tree_sitter_cpp": + result["cpp_type_table"] = {"path": str_path, "table": type_table} return result @@ -10055,6 +10185,244 @@ def _key(label: str) -> str: }) +def _resolve_cpp_member_calls( + per_file: list[dict], + all_nodes: list[dict], + all_edges: list[dict], +) -> None: + """Resolve cross-file C++ member calls (``f.bar()``, ``f->bar()``, + ``Foo::bar()``, ``this->bar()``) to the real definition of the receiver's type + (#1547). + + The shared cross-file pass drops every ``is_member_call`` because a bare method + name (``bar``) collides across the corpus and inflates god-nodes (#543/#1219). + The C++ extractor records each member call's receiver and a per-file + ``var -> ClassName`` table (``cpp_type_table``) built from local declarations. + This pass types the receiver, then emits an edge ONLY when that type resolves + to exactly ONE definition (the god-node guard). + + Receiver typing, by precision tier: + * ``Foo::bar()`` — the scope ``Foo`` names the type explicitly -> EXTRACTED. + * ``this->bar()`` — the receiver is the caller's own enclosing class -> EXTRACTED. + * ``f.bar()`` / ``f->bar()`` — ``f`` typed via the file's local table -> INFERRED. + A receiver whose type can't be inferred locally is SKIPPED (no guess): a false + call edge is worse than a missing one. 
The ``_merge_decl_def_classes`` pass has + already folded each header/impl class pair into one node, so a paired class is a + single definition and clears the single-definition guard. + + Must run after id-disambiguation so node ids and caller_nids are final. + """ + type_table_by_file: dict[str, dict[str, str]] = {} + for result in per_file: + tt = result.get("cpp_type_table") + if tt and tt.get("path"): + type_table_by_file[tt["path"]] = tt.get("table", {}) + + def _key(label: str) -> str: + return re.sub(r"[^a-zA-Z0-9]+", "", str(label)).lower() + + # A genuine C++ type is the target of a `contains` edge from its file node; + # bare-reference shadow nodes (ensure_named_node stubs) are not contained, so + # excluding non-contained nodes keeps them from making a real type ambiguous. + contained = {e.get("target") for e in all_edges if e.get("relation") == "contains"} + + type_def_nids: dict[str, list[str]] = {} + node_by_id: dict[str, dict] = {} + for n in all_nodes: + node_by_id[n.get("id")] = n + if n.get("source_file") and n.get("id") in contained and _is_type_like_definition(n): + type_def_nids.setdefault(_key(n.get("label", "")), []).append(n["id"]) + + # (type_node_id, method_key) -> method_node_id, and caller -> enclosing type + # (the owning class) for `this->` calls. A C++ class owns its members via + # `method` edges (out-of-line definitions) AND `defines` edges (in-class + # declarations, which the extractor models as fields); index both so a header- + # declared `void bar();` resolves. `method` wins when a key has both. 
+ method_index: dict[tuple[str, str], str] = {} + enclosing_type: dict[str, str] = {} + for rel in ("defines", "method"): + for e in all_edges: + if e.get("relation") != rel: + continue + src, tgt = e.get("source"), e.get("target") + tnode = node_by_id.get(tgt) + if tnode is None: + continue + enclosing_type.setdefault(tgt, src) + method_index[(src, _key(tnode.get("label", "")))] = tgt + + all_raw_calls: list[dict] = [] + for result in per_file: + all_raw_calls.extend(result.get("raw_calls", [])) + + existing_pairs = {(e.get("source"), e.get("target")) for e in all_edges} + for rc in all_raw_calls: + if not rc.get("is_member_call"): + continue + receiver = rc.get("receiver") + callee = rc.get("callee") + caller = rc.get("caller_nid") + if not receiver or not callee or not caller: + continue + src_file = rc.get("source_file", "") + # Only resolve C++ raw_calls (other languages share the raw_calls list; + # a `.h` may route to either extract_cpp or extract_objc by content, so the + # extractor-stamped `lang` tag — not the suffix — is the unambiguous gate). + if rc.get("lang") != "cpp": + continue + # Determine the receiver's type and the resulting confidence. + if receiver == "this": + # this->bar(): receiver is the caller's own enclosing class. + type_nid = enclosing_type.get(caller) + if not type_nid: + continue + type_qualified = True + elif receiver[:1].isupper(): + # Foo::bar(): the type is named explicitly in source. + type_defs = type_def_nids.get(_key(receiver), []) + if len(type_defs) != 1: # ambiguous or absent -> bail (god-node guard) + continue + type_nid = type_defs[0] + type_qualified = True + else: + # f.bar() / f->bar(): type the receiver via the file's local table. 
+ type_name = type_table_by_file.get(src_file, {}).get(receiver) + if not type_name: + continue + type_defs = type_def_nids.get(_key(type_name), []) + if len(type_defs) != 1: # ambiguous or absent -> bail (god-node guard) + continue + type_nid = type_defs[0] + type_qualified = False + method_nid = method_index.get((type_nid, _key(callee))) + target = method_nid or type_nid + relation = "calls" if method_nid else "references" + if target == caller or (caller, target) in existing_pairs: + continue + existing_pairs.add((caller, target)) + all_edges.append({ + "source": caller, + "target": target, + "relation": relation, + "context": "call", + "confidence": "EXTRACTED" if type_qualified else "INFERRED", + "confidence_score": 1.0 if type_qualified else 0.8, + "source_file": src_file, + "source_location": rc.get("source_location"), + "weight": 1.0, + }) + + +def _resolve_objc_member_calls( + per_file: list[dict], + all_nodes: list[dict], + all_edges: list[dict], +) -> None: + """Resolve cross-file Objective-C message sends (``[recv sel]``) to the real + definition of the receiver's type (#1556). + + The ObjC extractor keeps its same-file selector matching (alloc/init refs, + dot-syntax accesses, @selector) and additionally emits ``raw_calls`` for every + message send, with the receiver and the reconstructed selector as the callee. + This pass types the receiver and emits a cross-file ``calls`` edge ONLY when the + type resolves to exactly ONE definition (the god-node guard). + + Receiver typing: + * ``self`` / ``super`` — the caller's own enclosing class -> EXTRACTED. + * Capitalized receiver (``[Foo new]``) — the type named explicitly -> EXTRACTED. + * ``[f doThing]`` — ``f`` typed via the file's ``Foo *f`` local table -> INFERRED. + An uninferable receiver is SKIPPED (no guess), so an ambiguous selector across + classes never fans out. ``_merge_decl_def_classes`` folds each @interface/@impl + pair into one node, so a paired class clears the single-definition guard. 
+ + Must run after id-disambiguation so node ids and caller_nids are final. + """ + type_table_by_file: dict[str, dict[str, str]] = {} + for result in per_file: + tt = result.get("objc_type_table") + if tt and tt.get("path"): + type_table_by_file[tt["path"]] = tt.get("table", {}) + + def _key(label: str) -> str: + return re.sub(r"[^a-zA-Z0-9]+", "", str(label)).lower() + + contained = {e.get("target") for e in all_edges if e.get("relation") == "contains"} + + type_def_nids: dict[str, list[str]] = {} + node_by_id: dict[str, dict] = {} + for n in all_nodes: + node_by_id[n.get("id")] = n + if n.get("source_file") and n.get("id") in contained and _is_type_like_definition(n): + type_def_nids.setdefault(_key(n.get("label", "")), []).append(n["id"]) + + method_index: dict[tuple[str, str], str] = {} + enclosing_type: dict[str, str] = {} + for e in all_edges: + if e.get("relation") != "method": + continue + src, tgt = e.get("source"), e.get("target") + enclosing_type.setdefault(tgt, src) + tnode = node_by_id.get(tgt) + if tnode is not None: + # ObjC method labels carry a +/- sigil (`-doThing`); strip it so the + # selector `doThing` keys to the method. 
+ method_index[(src, _key(tnode.get("label", "")))] = tgt + + all_raw_calls: list[dict] = [] + for result in per_file: + all_raw_calls.extend(result.get("raw_calls", [])) + + existing_pairs = {(e.get("source"), e.get("target")) for e in all_edges} + for rc in all_raw_calls: + if not rc.get("is_member_call"): + continue + receiver = rc.get("receiver") + callee = rc.get("callee") + caller = rc.get("caller_nid") + if not receiver or not callee or not caller: + continue + src_file = rc.get("source_file", "") + if rc.get("lang") != "objc": + continue + if receiver in ("self", "super"): + type_nid = enclosing_type.get(caller) + if not type_nid: + continue + type_qualified = True + elif receiver[:1].isupper(): + type_defs = type_def_nids.get(_key(receiver), []) + if len(type_defs) != 1: # ambiguous or absent -> bail (god-node guard) + continue + type_nid = type_defs[0] + type_qualified = True + else: + type_name = type_table_by_file.get(src_file, {}).get(receiver) + if not type_name: + continue + type_defs = type_def_nids.get(_key(type_name), []) + if len(type_defs) != 1: # ambiguous or absent -> bail (god-node guard) + continue + type_nid = type_defs[0] + type_qualified = False + method_nid = method_index.get((type_nid, _key(callee))) + target = method_nid or type_nid + relation = "calls" if method_nid else "references" + if target == caller or (caller, target) in existing_pairs: + continue + existing_pairs.add((caller, target)) + all_edges.append({ + "source": caller, + "target": target, + "relation": relation, + "context": "call", + "confidence": "EXTRACTED" if type_qualified else "INFERRED", + "confidence_score": 1.0 if type_qualified else 0.8, + "source_file": src_file, + "source_location": rc.get("source_location"), + "weight": 1.0, + }) + + # Register the cross-file, language-specific member-call resolvers into the shared # registry (framework lives in graphify.resolver_registry). 
A new language plugs in # by adding one register() call below — no edits to extract()'s body. Order @@ -10073,6 +10441,23 @@ def _key(label: str) -> str: register_language_resolver( LanguageResolver("typescript_member_calls", frozenset({".ts", ".tsx", ".js", ".jsx"}), _resolve_typescript_member_calls) ) +# C++ (#1547) and ObjC (#1556) receiver-typed member-call resolution. `.h` is in +# both suffix sets because it routes to extract_cpp or extract_objc by content; the +# resolvers each claim only their own raw_calls via the extractor-stamped `lang`. +register_language_resolver( + LanguageResolver( + "cpp_member_calls", + frozenset({".cpp", ".cc", ".cxx", ".hpp", ".cu", ".cuh", ".metal", ".h"}), + _resolve_cpp_member_calls, + ) +) +register_language_resolver( + LanguageResolver( + "objc_member_calls", + frozenset({".m", ".mm", ".h"}), + _resolve_objc_member_calls, + ) +) def extract_objc(path: Path) -> dict: @@ -10105,6 +10490,10 @@ def extract_objc(path: Path) -> dict: edges: list[dict] = [] seen_ids: set[str] = set() method_bodies: list[tuple[str, Any, str]] = [] + # #1556: unresolved message sends saved for the cross-file ObjC resolver, plus a + # per-file `var -> ClassName` table from `Foo *f = ...;` local declarations. + raw_calls: list[dict] = [] + objc_type_table: dict[str, str] = {} def add_node(nid: str, label: str, line: int) -> None: if nid not in seen_ids: @@ -10332,6 +10721,11 @@ def walk(node, parent_nid: str | None = None) -> None: for m_nid, _, container_nid in method_bodies: class_method_nids.setdefault(container_nid, set()).add(m_nid) seen_calls: set[tuple[str, str]] = set() + # #1556: per-file `var -> ClassName` table from local declarations in every + # method body, so the cross-file resolver can type a `[f doThing]` receiver. 
+ for _m_nid, body_node, _container in method_bodies: + _objc_local_var_types(body_node, source, objc_type_table) + for caller_nid, body_node, container_nid in method_bodies: sibling_nids = class_method_nids.get(container_nid, set()) @@ -10378,6 +10772,21 @@ def walk_calls(n) -> None: seen_calls.add(pair) add_edge(caller_nid, candidate, "calls", n.start_point[0] + 1, confidence="EXTRACTED", weight=1.0, context="call") + # #1556: also emit a raw_call so the cross-file resolver can type + # the receiver and link to a method in ANOTHER file. A bare + # identifier receiver (`f`, `self`, `Foo`) is captured; a nested + # message send (`[[Foo alloc] init]`) has no simple receiver name + # to type, so it is left to the alloc/init `references` edge above. + if recv is not None and recv.type == "identifier": + raw_calls.append({ + "caller_nid": caller_nid, + "callee": method_name, + "is_member_call": True, + "source_file": str_path, + "source_location": f"L{n.start_point[0] + 1}", + "receiver": _read(recv), + "lang": "objc", + }) elif n.type == "field_expression": # self.name / self.product.name — dot-syntax sugar for [self name]. # Resolve to a sibling method of the SAME class, matched by EXACT @@ -10422,7 +10831,11 @@ def walk_calls(n) -> None: walk_calls(child) walk_calls(body_node) - return {"nodes": nodes, "edges": edges, "input_tokens": 0, "output_tokens": 0} + result = {"nodes": nodes, "edges": edges, "raw_calls": raw_calls, + "input_tokens": 0, "output_tokens": 0} + if objc_type_table: + result["objc_type_table"] = {"path": str_path, "table": objc_type_table} + return result diff --git a/tests/test_cpp_objc_cross_file_calls.py b/tests/test_cpp_objc_cross_file_calls.py new file mode 100644 index 000000000..051a24373 --- /dev/null +++ b/tests/test_cpp_objc_cross_file_calls.py @@ -0,0 +1,263 @@ +"""Cross-file member-call and include resolution for C++ (#1547) and ObjC (#1556). + +Mirrors tests/test_swift_cross_file_calls.py. 
The principle under test is PRECISION +over recall: resolution is by RECEIVER TYPE (never a bare method name), guarded by a +single-definition god-node check — an ambiguous or uninferable receiver yields ZERO +edges rather than a fan-out. +""" +from __future__ import annotations + +from pathlib import Path + +from graphify.build import build_from_json +from graphify.extract import extract + + +def _write(path: Path, text: str) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + return path + + +def _label(result: dict, nid: str) -> str: + for n in result["nodes"]: + if n["id"] == nid: + return n.get("label", "") + return f"<{nid}>" + + +def _call_edges(result: dict, relations=("calls",)): + """{(source_label, relation, target_label, confidence)} for the given relations.""" + out = set() + for e in result["edges"]: + if e.get("relation") in relations: + out.add(( + _label(result, e["source"]), + e["relation"], + _label(result, e["target"]), + e.get("confidence"), + )) + return out + + +# ── C++ #include survival (#1547) ───────────────────────────────────────────── + +def test_cpp_cross_file_member_call_connects_with_relative_paths(tmp_path): + """The headline #1547 fix: a paired class no longer islands — Main.cpp's use of + Foo connects to Foo's method across files. Use RELATIVE input paths (the real + `graphify extract .` usage), which is what exposes resolution gaps; an earlier + absolute-path-only test masked them. + + NOTE: the file-level `#include` edge (Main.cpp file -> Foo.h file) is NOT asserted + here. It relies on the extract() file-node id-remap, which `continue`s when the + project `root` isn't symlink-resolved (e.g. macOS /var vs /private/var, worktrees), + leaving the absolute-derived include target uncanonicalized. That's a known + remaining gap tracked on #1547/#1556. 
The class connection below — the actual + "connect with other classes" goal — resolves via the type-def index + the merged + class and is robust to that gap. + """ + import os + base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n void bar();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\n') + _write(base / "Main.cpp", '#include "Foo.h"\nint main() { Foo f; f.bar(); return 0; }\n') + old = os.getcwd() + try: + os.chdir(tmp_path) + result = extract( + [Path("src/Foo.h"), Path("src/Foo.cpp"), Path("src/Main.cpp")], + cache_root=Path(".cache"), parallel=False, + ) + finally: + os.chdir(old) + # Foo is one merged class (decl in .h + def in .cpp), not two fragments. + foo_classes = [n for n in result["nodes"] if n.get("label") == "Foo"] + assert len(foo_classes) == 1, f"Foo should be one node, got {[n['id'] for n in foo_classes]}" + # main() connects to Foo::bar across files (resolved by inferred receiver type `Foo f`). + labels = {n["id"]: n.get("label", "") for n in result["nodes"]} + main_bar = [ + e for e in result["edges"] + if e.get("relation") == "calls" + and "main" in labels.get(e["source"], "") + and e["target"].endswith("_bar") + ] + assert main_bar, "Main.cpp's f.bar() should resolve to Foo::bar across files" + # The resolved target is Foo's bar (id under the Foo class), not some other class. + assert all("foo" in e["target"] for e in main_bar), main_bar + + +# ── C++ member calls (#1547) ────────────────────────────────────────────────── + +def test_cpp_instance_member_call_resolves(tmp_path: Path): + # `Foo f; f.bar();` in Main.cpp resolves to Foo::bar — INFERRED (receiver typed + # from the local declaration), exactly one calls edge. 
+ base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n void bar();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\n') + _write(base / "Main.cpp", '#include "Foo.h"\nint main() { Foo f; f.bar(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("main()", "calls", "bar", "INFERRED") in calls + # Exactly one bar call edge from main (no fan-out, no duplicate). + bar_calls = [c for c in calls if c[0] == "main()" and c[2] == "bar"] + assert len(bar_calls) == 1 + + +def test_cpp_pointer_member_call_resolves(tmp_path: Path): + # `Foo* f; f->bar();` resolves the same way via pointer-arrow access. + base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n void bar();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\n') + _write(base / "Main.cpp", '#include "Foo.h"\nint main() { Foo* f = new Foo(); f->bar(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("main()", "calls", "bar", "INFERRED") in calls + + +def test_cpp_qualified_member_call_is_extracted(tmp_path: Path): + # `Foo::bar()` names the type explicitly in source -> EXTRACTED. + base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n static void bar();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\n') + _write(base / "Main.cpp", '#include "Foo.h"\nint main() { Foo::bar(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("main()", "calls", "bar", "EXTRACTED") in calls + + +def test_cpp_this_member_call_resolves_to_enclosing_class(tmp_path: Path): + # `this->bar()` inside Foo::baz resolves to Foo::bar (the caller's own class) -> + # EXTRACTED. Cross-file: the body lives in Foo.cpp, the decl in Foo.h. 
+ base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n void bar();\n void baz();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\nvoid Foo::baz() { this->bar(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("baz", "calls", "bar", "EXTRACTED") in calls + + +def test_cpp_godnode_guard_ambiguous_and_unknown_receiver(tmp_path: Path): + # Two classes A and B BOTH define run(). An uninferable receiver `x.run()` + # emits ZERO edges (no fan-out). `A a; a.run()` resolves to A::run ONLY. + base = tmp_path / "src" + _write(base / "A.h", "class A {\npublic:\n void run();\n};\n") + _write(base / "A.cpp", '#include "A.h"\nvoid A::run() {}\n') + _write(base / "B.h", "class B {\npublic:\n void run();\n};\n") + _write(base / "B.cpp", '#include "B.h"\nvoid B::run() {}\n') + _write(base / "Main.cpp", + '#include "A.h"\n#include "B.h"\nint main() { x.run(); A a; a.run(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + src_by_id = {n["id"]: n.get("source_file") for n in result["nodes"]} + run_calls = [ + e for e in result["edges"] + if e.get("relation") == "calls" + and _label(result, e["source"]) == "main()" + and _label(result, e["target"]) == "run" + ] + # Exactly one resolved run() call, and it targets A's run (not B's, not both). + assert len(run_calls) == 1 + assert Path(src_by_id[run_calls[0]["target"]]).name == "A.h" + + +def test_cpp_resolved_call_survives_build(tmp_path: Path): + # The receiver-typed call targets the header-declared method node; build_from_json + # must keep it. The cross-language INFERRED-call guard treats C/C++ as one family, + # so a `.cpp` -> `.h`-declared-method edge is not pruned (#1547). 
+ base = tmp_path / "src" + _write(base / "Foo.h", "class Foo {\npublic:\n void bar();\n};\n") + _write(base / "Foo.cpp", '#include "Foo.h"\nvoid Foo::bar() {}\n') + _write(base / "Main.cpp", '#include "Foo.h"\nint main() { Foo f; f.bar(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + g = build_from_json(result) + cross = [ + d for _, _, d in g.edges(data=True) + if d.get("relation") == "calls" and d.get("confidence") == "INFERRED" + ] + assert len(cross) >= 1 + + +def test_cpp_unknown_receiver_emits_no_edge(tmp_path: Path): + # A lowercase receiver absent from the file's local type table is never guessed. + base = tmp_path / "src" + _write(base / "Helper.h", "class Helper {\npublic:\n void help();\n};\n") + _write(base / "Helper.cpp", '#include "Helper.h"\nvoid Helper::help() {}\n') + _write(base / "Main.cpp", '#include "Helper.h"\nint main() { mystery.help(); }\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert not any(c[0] == "main()" and c[2] == "help" for c in calls) + + +# ── ObjC member calls (#1556) ───────────────────────────────────────────────── + +def test_objc_instance_message_send_resolves(tmp_path: Path): + # `Foo *f = [[Foo alloc] init]; [f doThing];` in Bar.m -> cross-file calls edge + # to Foo's -doThing (INFERRED, receiver typed from the `Foo *f` local). 
+ base = tmp_path / "src" + _write(base / "Foo.h", "@interface Foo : NSObject\n- (void)doThing;\n@end\n") + _write(base / "Foo.m", '#import "Foo.h"\n@implementation Foo\n- (void)doThing {}\n@end\n') + _write(base / "Bar.m", + '#import "Foo.h"\n@implementation Bar\n' + '- (void)go {\n Foo *f = [[Foo alloc] init];\n [f doThing];\n}\n@end\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("-go", "calls", "-doThing", "INFERRED") in calls + + +def test_objc_self_message_send_resolves_to_enclosing_class(tmp_path: Path): + # `[self render]` inside Foo resolves to Foo's -render -> EXTRACTED. + base = tmp_path / "src" + _write(base / "Foo.h", "@interface Foo : NSObject\n- (void)render;\n- (void)setup;\n@end\n") + _write(base / "Foo.m", + '#import "Foo.h"\n@implementation Foo\n' + '- (void)setup { [self render]; }\n- (void)render {}\n@end\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + calls = _call_edges(result) + assert ("-setup", "calls", "-render", "EXTRACTED") in calls + + +def test_objc_godnode_guard_ambiguous_selector(tmp_path: Path): + # Two classes A and B BOTH define -doStuff. An uninferable receiver `[thing + # doStuff]` emits ZERO edges across the corpus (no ambiguous fan-out). 
+ base = tmp_path / "src" + _write(base / "A.h", "@interface A : NSObject\n- (void)doStuff;\n@end\n") + _write(base / "A.m", '#import "A.h"\n@implementation A\n- (void)doStuff {}\n@end\n') + _write(base / "B.h", "@interface B : NSObject\n- (void)doStuff;\n@end\n") + _write(base / "B.m", '#import "B.h"\n@implementation B\n- (void)doStuff {}\n@end\n') + _write(base / "C.m", + '#import "A.h"\n#import "B.h"\n@implementation C\n' + '- (void)go { [thing doStuff]; }\n@end\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + go_calls = [ + e for e in result["edges"] + if e.get("relation") == "calls" and _label(result, e["source"]) == "-go" + ] + assert go_calls == [] + + +def test_objc_resolved_calls_survive_build(tmp_path: Path): + # The cross-file ObjC call must land on a real definition node so + # build_from_json keeps it (no dangling target pruned). + base = tmp_path / "src" + _write(base / "Foo.h", "@interface Foo : NSObject\n- (void)doThing;\n@end\n") + _write(base / "Foo.m", '#import "Foo.h"\n@implementation Foo\n- (void)doThing {}\n@end\n') + _write(base / "Bar.m", + '#import "Foo.h"\n@implementation Bar\n' + '- (void)go {\n Foo *f = [[Foo alloc] init];\n [f doThing];\n}\n@end\n') + result = extract(sorted(base.glob("*")), cache_root=tmp_path / "cache") + + g = build_from_json(result) + cross = [ + d for _, _, d in g.edges(data=True) + if d.get("relation") == "calls" and d.get("confidence") == "INFERRED" + ] + assert len(cross) >= 1 From b9d8067d0a658ac986b058cb19c35b4ba755e5fd Mon Sep 17 00:00:00 2001 From: TheFedaikin Date: Tue, 30 Jun 2026 19:32:10 +0100 Subject: [PATCH 5/6] feat(csharp): namespace-aware cross-file type resolution (#1562) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the C# type-reference resolver (#1466) to be namespace-aware, advancing the #1318 shadow-node umbrella for C#: - The namespace is folded into the C# node id (_make_id(stem, namespace, name)), so two 
same-named types in different namespaces in one file no longer collapse — replacing #1466's detect-and-skip workaround for multi-namespace files. - Lexical per-block using-scope: a `using` applies only where it is in scope (file-level, or the enclosing namespace block via a scope chain), so sibling namespace blocks no longer share each other's usings. - Qualified references (`Namespace.Type`) resolve via in-scope aliases (`using Q = X.Y`) then exact known namespaces; generics are stripped. Preserves (and tightens) the refuse-rather-than-guess discipline: a bare reference resolves only when exactly one in-scope namespace provides the type; an ambiguous reference (e.g. `using A; using B;` both defining `Widget`) resolves to nothing rather than fanning out. Verified: `using A` -> A.Widget only; ambiguous -> no edge; qualified `B.Widget` -> B.Widget regardless of usings; sibling-block using-scope isolated; no dangling edges or fan-out. Reconciled onto current v8 (the PR predated the C++/ObjC member-call resolvers); full suite green, the C++/ObjC resolution coexists. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/extract.py | 472 ++++++++++++++++++++------- graphify/extractors/csharp.py | 459 +++++++++++++++++++------- tests/test_csharp_type_resolution.py | 397 ++++++++++++++++++++++ 3 files changed, 1099 insertions(+), 229 deletions(-) diff --git a/graphify/extract.py b/graphify/extract.py index e0e97ef73..9d8001a0f 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -29,10 +29,14 @@ _read_text, ) from graphify.extractors.blade import extract_blade # noqa: F401 -from graphify.extractors.csharp import _resolve_csharp_type_references +from graphify.extractors.csharp import ( + _resolve_cross_file_csharp_imports, + _resolve_csharp_type_references, +) from graphify.extractors.elixir import extract_elixir # noqa: F401 from graphify.extractors.razor import extract_razor # noqa: F401 from graphify.extractors.zig import extract_zig # noqa: F401 +from graphify.security import sanitize_metadata from graphify.paths import disambiguate_ambiguous_candidates _RECURSION_LIMIT = 10_000 @@ -77,6 +81,11 @@ def _file_node_id(rel_path: Path) -> str: return _make_id(_file_stem(rel_path)) +def _csharp_namespace_id(dotted_name: str) -> str: + digest = hashlib.sha1(dotted_name.encode("utf-8")).hexdigest()[:16] + return f"csharp_namespace:{digest}" + + _TSCONFIG_ALIAS_CACHE: dict[str, dict[str, list[str]]] = {} _WORKSPACE_PACKAGE_CACHE: dict[str, dict[str, Path]] = {} _WORKSPACE_MANIFEST_NAMES = ("pnpm-workspace.yaml", "package.json") @@ -714,22 +723,67 @@ def _csharp_classify_base(name: str, interface_names: set[str]) -> str: return "inherits" -def _csharp_collect_type_refs(node, source: bytes, generic: bool, out: list[tuple[str, str]]) -> None: - """Walk a C# type expression; append (name, role) tuples (role is 'type' or 'generic_arg').""" +_CSHARP_TYPE_PARAMETER_SCOPE_DECLARATIONS = frozenset({ + "class_declaration", + "interface_declaration", + "record_declaration", + "struct_declaration", + "method_declaration", +}) + + +def 
_csharp_type_parameters_in_scope(node, source: bytes) -> frozenset[str]: + """Return C# type-parameter names visible from ``node``.""" + names: set[str] = set() + scope = node + while scope is not None: + if scope.type in _CSHARP_TYPE_PARAMETER_SCOPE_DECLARATIONS: + for child in scope.children: + if child.type != "type_parameter_list": + continue + for param in child.children: + if param.type == "type_parameter": + name_node = next( + (sub for sub in param.children if sub.type == "identifier"), + None, + ) + if name_node is not None: + name = _read_text(name_node, source) + if name: + names.add(name) + elif param.type == "identifier": + name = _read_text(param, source) + if name: + names.add(name) + scope = scope.parent + return frozenset(names) + + +def _csharp_collect_type_refs( + node, + source: bytes, + generic: bool, + out: list[tuple[str, str, bool, str]], + skip: frozenset[str] | None = None, +) -> None: + """Walk a C# type expression; append (name, role, qualified, qualifier) tuples.""" if node is None: return + if skip is None: + skip = _csharp_type_parameters_in_scope(node, source) t = node.type if t == "predefined_type": return if t == "identifier": name = _read_text(node, source) - if name: - out.append((name, "generic_arg" if generic else "type")) + if name and name not in skip: + out.append((name, "generic_arg" if generic else "type", False, "")) return if t == "qualified_name": - text = _read_text(node, source).rsplit(".", 1)[-1] - if text: - out.append((text, "generic_arg" if generic else "type")) + prefix, _, text = _read_text(node, source).rpartition(".") + text = text.split("<", 1)[0] + if text and text not in skip: + out.append((text, "generic_arg" if generic else "type", True, prefix)) return if t == "generic_name": name_child = node.child_by_field_name("name") @@ -739,29 +793,31 @@ def _csharp_collect_type_refs(node, source: bytes, generic: bool, out: list[tupl name_child = sub break if name_child is not None: - name = _read_text(name_child, 
source) - if name: - out.append((name, "generic_arg" if generic else "type")) + qualified = name_child.type == "qualified_name" + prefix, _, name = _read_text(name_child, source).rpartition(".") + if name and name not in skip: + out.append((name, "generic_arg" if generic else "type", qualified, prefix if qualified else "")) for sub in node.children: if sub.type == "type_argument_list": for arg in sub.children: if arg.is_named: - _csharp_collect_type_refs(arg, source, True, out) + _csharp_collect_type_refs(arg, source, True, out, skip) return if t in ("nullable_type", "array_type", "pointer_type", "ref_type"): for c in node.children: if c.is_named: - _csharp_collect_type_refs(c, source, generic, out) + _csharp_collect_type_refs(c, source, generic, out, skip) return if node.is_named: for c in node.children: if c.is_named: - _csharp_collect_type_refs(c, source, generic, out) + _csharp_collect_type_refs(c, source, generic, out, skip) -def _csharp_attribute_names(method_node, source: bytes) -> list[str]: +def _csharp_attribute_names(method_node, source: bytes) -> list[tuple[str, bool, str]]: """Collect attribute names from a C# method/declaration's attribute_list children.""" - names: list[str] = [] + names: list[tuple[str, bool, str]] = [] + skip = _csharp_type_parameters_in_scope(method_node, source) for child in method_node.children: if child.type != "attribute_list": continue @@ -775,9 +831,10 @@ def _csharp_attribute_names(method_node, source: bytes) -> list[str]: name_node = sub break if name_node is not None: - text = _read_text(name_node, source).rsplit(".", 1)[-1] - if text: - names.append(text) + qualified = name_node.type == "qualified_name" + prefix, _, text = _read_text(name_node, source).rpartition(".") + if text and text not in skip: + names.append((text, qualified, prefix if qualified else "")) return names @@ -1426,7 +1483,7 @@ def _find_body(node, config: LanguageConfig): # ── Import handlers ─────────────────────────────────────────────────────────── 
-def _import_python(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_python(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: t = node.type if t == "import_statement": for child in node.children: @@ -1489,7 +1546,7 @@ def _resolve_js_import_target(raw: str, str_path: str) -> "tuple[str, Path | Non return _make_id(module_name), None -def _import_js(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_js(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: is_reexport = node.type == "export_statement" # Only handle export_statement if it has a `from` clause (re-export). # Pure exports like `export const x = 1` or `export { localVar }` have no source module. @@ -1638,7 +1695,7 @@ def _dynamic_import_js(node, source: bytes, caller_nid: str, str_path: str, edge return True -def _import_java(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_java(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: def _walk_scoped(n) -> str: parts: list[str] = [] cur = n @@ -1691,7 +1748,7 @@ def _resolve_c_include_path(raw: str, str_path: str) -> "Path | None": return None -def _import_c(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_c(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: for child in node.children: if child.type in ("string_literal", "system_lib_string", "string"): raw = _read_text(child, source).strip('"<> ') @@ -1728,27 +1785,38 @@ def _import_c(node, source: bytes, file_nid: str, stem: str, edges: list, str_pa break -def _import_csharp(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: 
str) -> None: - for child in node.children: - if child.type in ("qualified_name", "identifier", "name_equals"): - raw = _read_text(child, source) - module_name = raw.split(".")[-1].strip() - if module_name: - tgt_nid = _make_id(module_name) - edges.append({ - "source": file_nid, - "target": tgt_nid, - "relation": "imports", - "context": "import", - "confidence": "EXTRACTED", - "source_file": str_path, - "source_location": f"L{node.start_point[0] + 1}", - "weight": 1.0, - }) - break +def _import_csharp(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: + text = _read_text(node, source).strip().rstrip(";") + if text.startswith("global "): + text = text[len("global "):].strip() + if not text.startswith("using"): + return + body = text[len("using"):].strip() + using_kind, alias, target_fqn = "namespace", None, body + if body.startswith("static "): + using_kind, target_fqn = "static", body[len("static "):].strip() + elif "=" in body: + lhs, rhs = body.split("=", 1) + using_kind, alias, target_fqn = "alias", lhs.strip(), rhs.strip() + if not target_fqn: + return + edges.append({ + "source": file_nid, + "target": _make_id(target_fqn), + "relation": "imports", + "context": "import", + "confidence": "EXTRACTED", + "source_file": str_path, + "source_location": f"L{node.start_point[0] + 1}", + "weight": 1.0, + "metadata": sanitize_metadata({k: v for k, v in + {"using_kind": using_kind, "alias": alias, "target_fqn": target_fqn, + "scope_kind": "namespace" if scope_stack else "file", + "scope_id": scope_stack[-1] if scope_stack else None}.items() if v is not None}), + }) -def _import_kotlin(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_kotlin(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: path_node = node.child_by_field_name("path") if path_node: raw = _read_text(path_node, source) @@ 
-1784,7 +1852,7 @@ def _import_kotlin(node, source: bytes, file_nid: str, stem: str, edges: list, s break -def _import_scala(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_scala(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: for child in node.children: if child.type in ("stable_id", "identifier"): raw = _read_text(child, source) @@ -1804,7 +1872,7 @@ def _import_scala(node, source: bytes, file_nid: str, stem: str, edges: list, st break -def _import_php(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_php(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: for child in node.children: if child.type in ("qualified_name", "name", "identifier"): raw = _read_text(child, source) @@ -2230,23 +2298,56 @@ def _js_extra_walk(node, source: bytes, file_nid: str, stem: str, str_path: str, # ── C# extra walk for namespace declarations ────────────────────────────────── +def _csharp_namespace_name(node, source: bytes) -> str: + name_node = node.child_by_field_name("name") + if name_node is not None: + return _read_text(name_node, source).strip() + for child in node.children: + if child.type in ("identifier", "qualified_name"): + return _read_text(child, source).strip() + return "" + + def _csharp_extra_walk(node, source: bytes, file_nid: str, stem: str, str_path: str, nodes: list, edges: list, seen_ids: set, function_bodies: list, parent_class_nid: str | None, add_node_fn, add_edge_fn, - walk_fn) -> bool: - """Handle namespace_declaration for C#. Returns True if handled.""" + walk_fn, namespace_stack: list[str], scope_stack: list[str]) -> bool: + """Handle namespace declarations for C#. 
Returns True if handled.""" if node.type == "namespace_declaration": - name_node = node.child_by_field_name("name") - if name_node: - ns_name = _read_text(name_node, source) - ns_nid = _make_id(stem, ns_name) + ns_name = _csharp_namespace_name(node, source) + pushed = False + if ns_name: + namespace_stack.append(ns_name) + scope_stack.append(f"s{node.start_byte}") + pushed = True + ns_label = ".".join(namespace_stack) + ns_nid = _csharp_namespace_id(ns_label) line = node.start_point[0] + 1 - add_node_fn(ns_nid, ns_name, line) + add_node_fn(ns_nid, ns_label, line, node_type="namespace", metadata={"kind": "csharp_namespace"}) add_edge_fn(file_nid, ns_nid, "contains", line) body = node.child_by_field_name("body") if body: - for child in body.children: - walk_fn(child, parent_class_nid) + try: + for child in body.children: + walk_fn(child, parent_class_nid) + finally: + if pushed: + namespace_stack.pop() + scope_stack.pop() + elif pushed: + namespace_stack.pop() + scope_stack.pop() + return True + if node.type == "file_scoped_namespace_declaration": + ns_name = _csharp_namespace_name(node, source) + if ns_name: + namespace_stack.append(ns_name) + scope_stack.append(f"s{node.start_byte}") + ns_label = ".".join(namespace_stack) + ns_nid = _csharp_namespace_id(ns_label) + line = node.start_point[0] + 1 + add_node_fn(ns_nid, ns_label, line, node_type="namespace", metadata={"kind": "csharp_namespace"}) + add_edge_fn(file_nid, ns_nid, "contains", line) return True return False @@ -2525,7 +2626,7 @@ def _resolve_lua_import_target(raw_module: str, str_path: str) -> str: return _make_id(raw_module) -def _import_lua(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None: +def _import_lua(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> None: """Extract require('module') from Lua variable_declaration nodes.""" text = _read_text(node, source) import re @@ -2565,7 +2666,7 @@ def 
_import_lua(node, source: bytes, file_nid: str, stem: str, edges: list, str_ ) -def _import_swift(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> list[tuple[str, str]]: +def _import_swift(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str, scope_stack: list[str] | None = None) -> list[tuple[str, str]]: """Emit module-level ``imports`` edges and report the imported modules. A Swift ``import CoreKit`` names a module, not a file path, so — unlike the @@ -2594,24 +2695,28 @@ def _import_swift(node, source: bytes, file_nid: str, stem: str, edges: list, st return modules -def _read_csharp_type_name(node, source: bytes) -> str | None: - """Resolve a readable C# type name from a field/type node.""" +def _read_csharp_type_name(node, source: bytes) -> tuple[str, bool, str] | None: + """Resolve a C# type name, whether it was qualified, and its qualifier prefix.""" if node is None: return None if node.type in ("identifier", "predefined_type"): - return _read_text(node, source) + return (_read_text(node, source), False, "") if node.type == "qualified_name": - return _read_text(node, source).split(".")[-1] + prefix, _, tail = _read_text(node, source).rpartition(".") + tail = tail.split("<", 1)[0] + return (tail, True, prefix) if node.type == "generic_name": name_node = node.child_by_field_name("name") if name_node is not None: - return _read_text(name_node, source) + qualified = name_node.type == "qualified_name" + prefix, _, tail = _read_text(name_node, source).rpartition(".") + return (tail, qualified, prefix if qualified else "") for child in node.children: if not child.is_named: continue - name = _read_csharp_type_name(child, source) - if name: - return name + result = _read_csharp_type_name(child, source) + if result: + return result return None @@ -2735,6 +2840,8 @@ def _extract_generic( nodes: list[dict] = [] edges: list[dict] = [] seen_ids: set[str] = set() + namespace_stack: list[str] = [] + scope_stack: list[str] = 
[] function_bodies: list[tuple[str, object]] = [] pending_listen_edges: list[tuple[str, str, int]] = [] # tree-sitter-swift parses both `class Foo` and `extension Foo` as @@ -2760,20 +2867,33 @@ def _extract_generic( if config.ts_module == "tree_sitter_swift": swift_protocol_names, swift_class_names = _swift_pre_scan(root, source) - def add_node(nid: str, label: str, line: int) -> None: - if nid not in seen_ids: - seen_ids.add(nid) - nodes.append({ - "id": nid, - "label": label, - "file_type": "code", - "source_file": str_path, - "source_location": f"L{line}", - }) + def add_node(nid: str, label: str, line: int, *, node_type: str | None = None, + metadata: dict | None = None) -> None: + if nid in seen_ids: + return + seen_ids.add(nid) + merged = dict(metadata or {}) + if namespace_stack: + merged.setdefault("namespace", ".".join(namespace_stack)) + if scope_stack and node_type != "namespace": + merged.setdefault("scope_chain", list(scope_stack)) + node = { + "id": nid, + "label": label, + "file_type": "code", + "source_file": str_path, + "source_location": f"L{line}", + } + if node_type: + node["type"] = node_type + if merged: + node["metadata"] = sanitize_metadata(merged) + nodes.append(node) def add_edge(src: str, tgt: str, relation: str, line: int, confidence: str = "EXTRACTED", weight: float = 1.0, - context: str | None = None) -> None: + context: str | None = None, + metadata: dict | None = None) -> None: edge = { "source": src, "target": tgt, @@ -2785,10 +2905,12 @@ def add_edge(src: str, tgt: str, relation: str, line: int, } if context: edge["context"] = context + if metadata: + edge["metadata"] = sanitize_metadata(metadata) edges.append(edge) def ensure_named_node(name: str, line: int) -> str: - nid = _make_id(stem, name) + nid = _make_id(stem, ".".join(namespace_stack), name) if nid in seen_ids: return nid nid = _make_id(name) @@ -2820,7 +2942,7 @@ def walk(node, parent_class_nid: str | None = None) -> None: # Import types if t in config.import_types: if 
config.import_handler: - imported_modules = config.import_handler(node, source, file_nid, stem, edges, str_path) + imported_modules = config.import_handler(node, source, file_nid, stem, edges, str_path, scope_stack) # Module-level import handlers (Swift) name a module, not a file # path, so there is no pre-existing node to anchor the edge to. # They return (id, label) pairs for which we materialize a @@ -2864,9 +2986,12 @@ def walk(node, parent_class_nid: str | None = None) -> None: if not name_node: return class_name = _read_text(name_node, source) - class_nid = _make_id(stem, class_name) + class_nid = _make_id(stem, ".".join(namespace_stack), class_name) line = node.start_point[0] + 1 - add_node(class_nid, class_name, line) + metadata = None + if config.ts_module == "tree_sitter_c_sharp" and parent_class_nid: + metadata = {"is_nested_type": True} + add_node(class_nid, class_name, line, metadata=metadata) add_edge(file_nid, class_nid, "contains", line) if config.ts_module == "tree_sitter_swift" and any( @@ -3050,25 +3175,20 @@ def _php_emit_base(base_name: str, rel: str, at_line: int) -> None: # C#-specific: inheritance / interface implementation via base_list if config.ts_module == "tree_sitter_c_sharp": + csharp_type_params = _csharp_type_parameters_in_scope(node, source) for child in node.children: if child.type != "base_list": continue for sub in child.children: if sub.type not in ("identifier", "generic_name", "qualified_name"): continue - if sub.type == "generic_name": - name_child = sub.child_by_field_name("name") - base = ( - _read_text(name_child, source) if name_child - else _read_text(sub.children[0], source) - ) - elif sub.type == "qualified_name": - base = _read_text(sub, source).rsplit(".", 1)[-1] - else: - base = _read_text(sub, source) - if not base: + base_info = _read_csharp_type_name(sub, source) + if base_info is None: continue - base_nid = _make_id(stem, base) + base, qualified, qualifier = base_info + if not base or base in 
csharp_type_params: + continue + base_nid = _make_id(stem, ".".join(namespace_stack), base) if base_nid not in seen_ids: base_nid = _make_id(base) if base_nid not in seen_ids: @@ -3081,7 +3201,12 @@ def _php_emit_base(base_name: str, rel: str, at_line: int) -> None: }) seen_ids.add(base_nid) relation = _csharp_classify_base(base, csharp_interface_names) - add_edge(class_nid, base_nid, relation, line) + metadata = {"ref_token": base} + if qualified: + metadata["qualified"] = True + if qualifier: + metadata["ref_qualifier"] = qualifier + add_edge(class_nid, base_nid, relation, line, metadata=metadata) if sub.type == "generic_name": for tal in sub.children: if tal.type != "type_argument_list": @@ -3089,12 +3214,19 @@ def _php_emit_base(base_name: str, rel: str, at_line: int) -> None: for arg in tal.children: if not arg.is_named: continue - refs: list[tuple[str, str]] = [] - _csharp_collect_type_refs(arg, source, True, refs) - for ref_name, _role in refs: + refs: list[tuple[str, str, bool, str]] = [] + _csharp_collect_type_refs( + arg, source, True, refs, csharp_type_params + ) + for ref_name, _role, ref_qualified, ref_qualifier in refs: target = ensure_named_node(ref_name, line) + metadata = {"ref_token": ref_name} + if ref_qualified: + metadata["qualified"] = True + if ref_qualifier: + metadata["ref_qualifier"] = ref_qualifier add_edge(class_nid, target, "references", line, - context="generic_arg") + context="generic_arg", metadata=metadata) # Java-specific: extends (superclass) / implements (interfaces) / interface-extends if config.ts_module == "tree_sitter_java": @@ -3352,11 +3484,22 @@ def _emit_java_parent_type(type_node, rel: str, at_line: int) -> None: type_node = child.child_by_field_name("type") if type_node is not None: break - type_name = _read_csharp_type_name(type_node, source) - if type_name: + type_info = _read_csharp_type_name(type_node, source) + if type_info: + type_name, qualified, qualifier = type_info + csharp_type_params = 
_csharp_type_parameters_in_scope( + type_node if type_node is not None else node, source + ) + if not type_name or type_name in csharp_type_params: + return line = node.start_point[0] + 1 + metadata = {"ref_token": type_name} + if qualified: + metadata["qualified"] = True + if qualifier: + metadata["ref_qualifier"] = qualifier add_edge(parent_class_nid, ensure_named_node(type_name, line), - "references", line, context="field") + "references", line, context="field", metadata=metadata) return if (config.ts_module == "tree_sitter_java" @@ -3551,32 +3694,55 @@ def _emit_java_parent_type(type_node, rel: str, at_line: int) -> None: ) if config.ts_module == "tree_sitter_c_sharp": + csharp_type_params = _csharp_type_parameters_in_scope(node, source) params_node = node.child_by_field_name("parameters") if params_node is not None: for p in params_node.children: if p.type != "parameter": continue type_node = p.child_by_field_name("type") - refs: list[tuple[str, str]] = [] - _csharp_collect_type_refs(type_node, source, False, refs) - for ref_name, role in refs: + refs: list[tuple[str, str, bool, str]] = [] + _csharp_collect_type_refs( + type_node, source, False, refs, csharp_type_params + ) + for ref_name, role, qualified, qualifier in refs: ctx = "generic_arg" if role == "generic_arg" else "parameter_type" target_nid = ensure_named_node(ref_name, line) if target_nid != func_nid: - add_edge(func_nid, target_nid, "references", line, context=ctx) + metadata = {"ref_token": ref_name} + if qualified: + metadata["qualified"] = True + if qualifier: + metadata["ref_qualifier"] = qualifier + add_edge(func_nid, target_nid, "references", line, + context=ctx, metadata=metadata) return_node = node.child_by_field_name("returns") if return_node is not None: - refs = [] - _csharp_collect_type_refs(return_node, source, False, refs) - for ref_name, role in refs: + refs: list[tuple[str, str, bool, str]] = [] + _csharp_collect_type_refs( + return_node, source, False, refs, csharp_type_params + ) 
+ for ref_name, role, qualified, qualifier in refs: ctx = "generic_arg" if role == "generic_arg" else "return_type" target_nid = ensure_named_node(ref_name, line) if target_nid != func_nid: - add_edge(func_nid, target_nid, "references", line, context=ctx) - for attr_name in _csharp_attribute_names(node, source): + metadata = {"ref_token": ref_name} + if qualified: + metadata["qualified"] = True + if qualifier: + metadata["ref_qualifier"] = qualifier + add_edge(func_nid, target_nid, "references", line, + context=ctx, metadata=metadata) + for attr_name, qualified, qualifier in _csharp_attribute_names(node, source): target_nid = ensure_named_node(attr_name, line) if target_nid != func_nid: - add_edge(func_nid, target_nid, "references", line, context="attribute") + metadata = {"ref_token": attr_name} + if qualified: + metadata["qualified"] = True + if qualifier: + metadata["ref_qualifier"] = qualifier + add_edge(func_nid, target_nid, "references", line, + context="attribute", metadata=metadata) if config.ts_module == "tree_sitter_java": params_node = node.child_by_field_name("parameters") @@ -3844,7 +4010,8 @@ def _emit_java_parent_type(type_node, rel: str, at_line: int) -> None: if config.ts_module == "tree_sitter_c_sharp": if _csharp_extra_walk(node, source, file_nid, stem, str_path, nodes, edges, seen_ids, function_bodies, - parent_class_nid, add_node, add_edge, walk): + parent_class_nid, add_node, add_edge, walk, + namespace_stack, scope_stack): return if config.ts_module == "tree_sitter_swift": @@ -3877,6 +4044,8 @@ def _emit_java_parent_type(type_node, rel: str, at_line: int) -> None: label_to_nid: dict[str, str] = {} # case-sensitive (Ruby, C#, Java, Kotlin, etc.) 
label_to_nid_ci: dict[str, str] = {} # case-insensitive (PHP functions/classes) for n in nodes: + if n.get("type") == "namespace": + continue raw = n["label"] normalised = raw.strip("()").lstrip(".") label_to_nid[normalised] = n["id"] @@ -8049,7 +8218,7 @@ def _disambiguate_colliding_node_ids( """ by_id: dict[str, list[dict]] = {} for node in nodes: - if node.get("type") == "module": + if node.get("type") in ("module", "namespace"): continue nid = node.get("id") if isinstance(nid, str) and nid: @@ -8156,12 +8325,57 @@ def _disambiguate_colliding_node_ids( raw_call["caller_nid"] = unambiguous_remaps[str(raw_call["caller_nid"])] +def _canonicalize_csharp_namespace_nodes(all_nodes: list[dict], all_edges: list[dict]) -> None: + """Collapse duplicate C# namespace node entries to one canonical node per label.""" + by_label: dict[str, list[dict]] = {} + for node in all_nodes: + if node.get("type") != "namespace": + continue + label = node.get("label") + if isinstance(label, str): + by_label.setdefault(label, []).append(node) + + remap: dict[str, str] = {} + drop_node_ids: set[int] = set() + for group in by_label.values(): + if len(group) < 2: + continue + canonical = sorted( + group, + key=lambda node: ( + str(node.get("source_file") or ""), + str(node.get("source_location") or ""), + str(node.get("id") or ""), + ), + )[0] + canonical_id = canonical.get("id") + for node in group: + if node is canonical: + continue + drop_node_ids.add(id(node)) + dup_id = node.get("id") + if isinstance(dup_id, str) and isinstance(canonical_id, str): + remap[dup_id] = canonical_id + + if remap: + for edge in all_edges: + if edge.get("source") in remap: + edge["source"] = remap[str(edge["source"])] + if edge.get("target") in remap: + edge["target"] = remap[str(edge["target"])] + + if drop_node_ids: + all_nodes[:] = [node for node in all_nodes if id(node) not in drop_node_ids] + + def _node_label_key(node: dict) -> str: label = str(node.get("label", "")).strip() return 
re.sub(r"[^a-zA-Z0-9]+", "", label).lower() def _is_type_like_definition(node: dict) -> bool: + if node.get("type") == "namespace": + return False label = str(node.get("label", "")).strip() if not label: return False @@ -8188,7 +8402,6 @@ def _rewire_unique_stub_nodes(nodes: list[dict], edges: list[dict]) -> None: stubs.append(node) remap: dict[str, str] = {} - drop_ids: set[str] = set() for stub in stubs: stub_id = str(stub.get("id", "")) if not stub_id: @@ -8199,17 +8412,36 @@ def _rewire_unique_stub_nodes(nodes: list[dict], edges: list[dict]) -> None: target_id = candidates[0].get("id") if isinstance(target_id, str) and target_id and target_id != stub_id: remap[stub_id] = target_id - drop_ids.add(stub_id) if not remap: return + by_id = {node.get("id"): node for node in nodes if node.get("id")} + csharp_scoped_relations = {"inherits", "implements", "references", "imports"} for edge in edges: - if edge.get("source") in remap: - edge["source"] = remap[str(edge["source"])] - if edge.get("target") in remap: - edge["target"] = remap[str(edge["target"])] + is_csharp_scoped_edge = ( + str(edge.get("source_file", "")).endswith(".cs") + and edge.get("relation") in csharp_scoped_relations + ) + source = edge.get("source") + if source in remap: + remapped_source = remap[str(source)] + if not ( + is_csharp_scoped_edge + and str(by_id.get(remapped_source, {}).get("source_file", "")).endswith(".cs") + ): + edge["source"] = remapped_source + target = edge.get("target") + if target in remap: + remapped_target = remap[str(target)] + if not ( + is_csharp_scoped_edge + and str(by_id.get(remapped_target, {}).get("source_file", "")).endswith(".cs") + ): + edge["target"] = remapped_target + referenced = {x for e in edges for x in (e.get("source"), e.get("target"))} + drop_ids = {stub_id for stub_id in remap if stub_id not in referenced} nodes[:] = [node for node in nodes if node.get("id") not in drop_ids] @@ -14506,6 +14738,7 @@ def extract( _merge_swift_extensions(per_file, 
all_nodes, all_edges) _disambiguate_colliding_node_ids(all_nodes, all_edges, all_raw_calls, root) + _canonicalize_csharp_namespace_nodes(all_nodes, all_edges) _rewire_unique_stub_nodes(all_nodes, all_edges) # Add cross-file class-level edges (Python only - uses Python parser internally) @@ -14547,6 +14780,11 @@ def extract( except Exception as exc: import logging logging.getLogger(__name__).warning("C# type-reference resolution failed, skipping: %s", exc) + try: + _resolve_cross_file_csharp_imports(cs_results, cs_paths, all_nodes, all_edges) + except Exception as exc: + import logging + logging.getLogger(__name__).warning("C# cross-file import resolution failed, skipping: %s", exc) # Cross-file call resolution for all languages # Each extractor saved unresolved calls in raw_calls. Now that we have all @@ -14559,7 +14797,7 @@ def extract( # identifiers, and they were polluting matches for short names — #563). global_label_to_nids: dict[str, list[str]] = {} for n in all_nodes: - if n.get("file_type") == "rationale": + if n.get("file_type") == "rationale" or n.get("type") == "namespace": continue raw = n.get("label", "") normalised = raw.strip("()").lstrip(".") diff --git a/graphify/extractors/csharp.py b/graphify/extractors/csharp.py index 4a4fa9137..2cda37263 100644 --- a/graphify/extractors/csharp.py +++ b/graphify/extractors/csharp.py @@ -10,140 +10,375 @@ """ from __future__ import annotations +import html from pathlib import Path -from graphify.extractors.base import _read_text +from graphify.extractors.base import _make_id -def _resolve_csharp_type_references( +def _build_csharp_type_def_index(all_nodes: list[dict]) -> dict[tuple[str, str], str]: + """Return deterministic ``(namespace, name) -> node_id`` C# type definitions.""" + candidates: dict[tuple[str, str], list[dict]] = {} + for node in all_nodes: + if node.get("type") == "namespace": + continue + metadata = node.get("metadata") or {} + if not isinstance(metadata, dict): + metadata = {} + if 
metadata.get("is_nested_type"): + continue + nid = node.get("id") + label = node.get("label") + if not (isinstance(nid, str) and nid and isinstance(label, str) and label): + continue + source_file = node.get("source_file") + if ( + not isinstance(source_file, str) + or not source_file.endswith(".cs") + or node.get("file_type") != "code" + ): + continue + if label.endswith(")") or label.startswith(".") or "." in label: + continue + namespace = metadata.get("namespace", "") + if not isinstance(namespace, str): + namespace = "" + candidates.setdefault((namespace, label), []).append(node) + + return { + key: sorted( + nodes, + key=lambda node: ( + str(node.get("source_file") or ""), + str(node.get("source_location") or ""), + str(node.get("id") or ""), + ), + )[0]["id"] + for key, nodes in candidates.items() + } + + +def _strip_trailing_csharp_generic_args(target_fqn: str) -> str: + target_fqn = target_fqn.strip() + if not target_fqn.endswith(">"): + return target_fqn + depth = 0 + for index in range(len(target_fqn) - 1, -1, -1): + char = target_fqn[index] + if char == ">": + depth += 1 + elif char == "<": + depth -= 1 + if depth == 0: + return target_fqn[:index].strip() + return target_fqn + + +def _resolve_cross_file_csharp_imports( per_file: list[dict], paths: list[Path], all_nodes: list[dict], all_edges: list[dict], ) -> None: - """Re-point dangling C# ``inherits``/``implements``/``references`` edges to the - real definition, using the referencing file's ``using`` directives + enclosing - namespace for exact disambiguation. Mirrors ``_resolve_java_type_references``. - - C# deltas from Java: a plain ``using N;`` is NAMESPACE-WIDE (resolve a bare ``T`` - by trying ``(N, T)`` for each open namespace and accepting only a UNIQUE hit — the - god-node guardrail), while ``using X = N.T;`` is a single-type alias. ``global - using`` is normalized (the ``global`` prefix stripped); ``using static N.T;`` is - ignored (it imports members, not a namespace/type). 
The global namespace is keyed - as the bare label (``""``). A file with MULTIPLE namespace blocks does not register - its defs (which namespace each def belongs to needs source-range tracking) — deferred. - - Mutates ``all_nodes``/``all_edges`` in place. Runs after id-disambiguation and - ``_rewire_unique_stub_nodes`` so target ids are final and only the ambiguous - remainder is left on shadow stubs. + """Re-point resolvable C# ``using`` import edges to canonical internal nodes. + + Namespace imports resolve only to canonical C# namespace nodes. Alias imports + resolve only when the alias target's prefix is a known canonical namespace and + the simple type name exists in the shared C# type-definition index. ``using + static`` and nested type aliases remain deliberate gaps because they need + member/nested-type modeling beyond this import pass. """ - try: - import tree_sitter_c_sharp as tscs - from tree_sitter import Language, Parser - except ImportError: + _ = (per_file, paths) + namespace_id_by_label: dict[str, str] = {} + for node in sorted( + all_nodes, + key=lambda node: ( + str(node.get("source_file") or ""), + str(node.get("source_location") or ""), + str(node.get("id") or ""), + ), + ): + if node.get("type") != "namespace": + continue + label = node.get("label") + nid = node.get("id") + if isinstance(label, str) and label and isinstance(nid, str) and nid: + namespace_id_by_label.setdefault(label, nid) + + type_def_index = _build_csharp_type_def_index(all_nodes) + if not namespace_id_by_label and not type_def_index: return - language = Language(tscs.language()) - parser = Parser(language) - - def _key(ns: str, label: str) -> str: - return label if ns == "" else f"{ns}.{label}" - - own_ns_by_file: dict[str, list[str]] = {} - scope_by_file: dict[str, list[str]] = {} - aliases_by_file: dict[str, dict[str, str]] = {} - for path, result in zip(paths, per_file): - srcs = {n.get("source_file") for n in result.get("nodes", []) if n.get("source_file")} - if not srcs: 
- continue - try: - source = path.read_bytes() - tree = parser.parse(source) - except Exception: - continue - own_ns: list[str] = [] - usings: list[str] = [] - aliases: dict[str, str] = {} - - def walk(n) -> None: - if n.type in ("namespace_declaration", "file_scoped_namespace_declaration"): - nm = n.child_by_field_name("name") - if nm is not None: - own_ns.append(_read_text(nm, source).strip()) - elif n.type == "using_directive": - text = _read_text(n, source).strip().rstrip(";") - if text.startswith("global "): - text = text[len("global "):].strip() - if text.startswith("using"): - body = text[len("using"):].strip() - if body.startswith("static "): - pass # `using static N.T;` imports members, not a type/namespace — skip - elif "=" in body: - lhs, rhs = body.split("=", 1) - if lhs.strip() and rhs.strip(): - aliases[lhs.strip()] = rhs.strip() - elif body: - usings.append(body) - for child in n.children: - walk(child) - - walk(tree.root_node) - scope = list(dict.fromkeys((own_ns or [""]) + usings + [""])) - for s in srcs: - own_ns_by_file[s] = own_ns - scope_by_file[s] = scope - aliases_by_file[s] = aliases - - fqn_to_id: dict[str, str] = {} - for node in all_nodes: - label = node.get("label", "") - src = node.get("source_file", "") - nid = node.get("id", "") - if not (label and src and nid) or src not in own_ns_by_file: - continue - if not label[:1].isupper() or label.endswith(")") or label.endswith(".cs"): - continue - ns_list = own_ns_by_file.get(src, []) - if len(ns_list) == 0: - fqn_to_id.setdefault(_key("", label), nid) - elif len(ns_list) == 1: - fqn_to_id.setdefault(_key(ns_list[0], label), nid) - # len > 1: skip (deferred) - - stub_label: dict[str, str] = { - node["id"]: node.get("label", "") + repointed_from: set[str] = set() + for edge in all_edges: + if edge.get("relation") != "imports": + continue + metadata = edge.get("metadata") or {} + if not isinstance(metadata, dict): + continue + using_kind = metadata.get("using_kind") + target_fqn = 
metadata.get("target_fqn") + if not using_kind or not isinstance(target_fqn, str) or not target_fqn: + continue + + resolved = None + if using_kind == "namespace": + resolved = namespace_id_by_label.get(target_fqn) + elif using_kind == "alias": + base_fqn = _strip_trailing_csharp_generic_args(html.unescape(target_fqn)) + prefix, sep, name = base_fqn.rpartition(".") + if sep and prefix in namespace_id_by_label: + resolved = type_def_index.get((prefix, name)) + + old_target = edge.get("target") + if resolved and resolved != old_target: + edge["target"] = resolved + if isinstance(old_target, str) and old_target: + repointed_from.add(old_target) + + if not repointed_from: + return + + still_referenced: set[str] = set() + for edge in all_edges: + still_referenced.add(edge.get("source")) + still_referenced.add(edge.get("target")) + all_nodes[:] = [ + node for node in all_nodes + if node.get("id") not in repointed_from or node.get("id") in still_referenced + ] + + +def _resolve_csharp_type_references( + per_file: list[dict], + paths: list[Path], + all_nodes: list[dict], + all_edges: list[dict], +) -> None: + """Arbitrate all C# ``inherits``/``implements``/``references`` targets. + + The extractor emits provisional same-file bindings and sourceless stubs. This + pass is the single soundness gate: it uses only graph-stamped namespace/import + facts, keeps a binding only when the referenced simple name resolves to one + in-scope real type definition, and otherwise leaves the edge on a dangling stub. 
+ """ + _ = (per_file, paths) + + def _is_cs_file(value: object) -> bool: + return isinstance(value, str) and value.endswith(".cs") + + def _metadata(value: object) -> dict: + return value if isinstance(value, dict) else {} + + def _namespace(node: dict | None) -> str: + metadata = _metadata((node or {}).get("metadata")) + namespace = metadata.get("namespace", "") + return namespace if isinstance(namespace, str) else "" + + def _append_unique(items: list[str], value: str) -> None: + if value not in items: + items.append(value) + + node_by_id = { + node["id"]: node for node in all_nodes - if node.get("id") and not node.get("source_file") and node.get("label", "")[:1].isupper() + if isinstance(node.get("id"), str) and node.get("id") } - if not stub_label: - return + type_def_index = _build_csharp_type_def_index(all_nodes) + known_namespaces = { + node.get("label") + for node in all_nodes + if node.get("type") == "namespace" and isinstance(node.get("label"), str) + } + + # Each using carries its lexical scope: ("file", None) applies file-wide; + # ("namespace", scope_id) applies only where scope_id is in the ref's scope_chain. 
+ namespace_usings_by_file: dict[str, list[tuple[str, str, str | None]]] = {} + aliases_by_file: dict[str, dict[str, list[tuple[str, str, str | None]]]] = {} + + for edge in all_edges: + if edge.get("relation") != "imports": + continue + source_node = node_by_id.get(edge.get("source")) + if not ( + source_node + and isinstance(source_node.get("label"), str) + and source_node.get("label", "").endswith(".cs") + ): + continue + source_file = source_node.get("source_file") + if not _is_cs_file(source_file): + continue + metadata = _metadata(edge.get("metadata")) + target_fqn = metadata.get("target_fqn") + if not isinstance(target_fqn, str) or not target_fqn: + continue + scope_kind = metadata.get("scope_kind") or "file" + scope_id = metadata.get("scope_id") + using_kind = metadata.get("using_kind") + if using_kind == "namespace": + entry = (target_fqn, scope_kind, scope_id) + bucket = namespace_usings_by_file.setdefault(source_file, []) + if entry not in bucket: + bucket.append(entry) + elif using_kind == "alias": + alias = metadata.get("alias") + if isinstance(alias, str) and alias: + entry = (target_fqn, scope_kind, scope_id) + bucket = aliases_by_file.setdefault(source_file, {}).setdefault(alias, []) + if entry not in bucket: + bucket.append(entry) + + def _scope_chain(node: dict) -> list[str]: + chain = _metadata(node.get("metadata")).get("scope_chain") + return chain if isinstance(chain, list) else [] + + def _using_in_scope(scope_kind: str, scope_id: str | None, source_node: dict) -> bool: + if scope_kind == "file": + return True + return scope_id is not None and scope_id in _scope_chain(source_node) + + def _scopes_for(source_node: dict, source_file: str) -> list[str]: + scopes: list[str] = [] + _append_unique(scopes, _namespace(source_node)) + _append_unique(scopes, "") + for namespace, scope_kind, scope_id in namespace_usings_by_file.get(source_file, []): + if _using_in_scope(scope_kind, scope_id, source_node): + _append_unique(scopes, namespace) + return 
scopes + + def _resolve_alias(label: str, source_node: dict, source_file: str) -> str | None: + hits = set() + for target_fqn, scope_kind, scope_id in aliases_by_file.get(source_file, {}).get(label, []): + if not _using_in_scope(scope_kind, scope_id, source_node): + continue + base_fqn = _strip_trailing_csharp_generic_args(html.unescape(target_fqn)) + namespace, sep, simple_name = base_fqn.rpartition(".") + if not sep: + simple_name = namespace + namespace = "" + if not simple_name: + continue + hit = type_def_index.get((namespace, simple_name)) + if hit: + hits.add(hit) + return next(iter(hits)) if len(hits) == 1 else None + + def _resolve_label(label: str, source_node: dict, source_file: str) -> str | None: + if label in aliases_by_file.get(source_file, {}): + return _resolve_alias(label, source_node, source_file) + candidates: list[str] = [] + for namespace in _scopes_for(source_node, source_file): + hit = type_def_index.get((namespace, label)) + if hit and hit not in candidates: + candidates.append(hit) + return candidates[0] if len(candidates) == 1 else None + + def _resolve_qualified(label: str, qualifier: object, source_node: dict, source_file: str) -> str | None: + # Sound qualified resolution: an in-scope alias for Q shadows the namespace Q. For a qualified + # ref Q.label, look up (alias_target_namespace, label). If no in-scope alias, fall through to an + # exact known namespace. Dangle on ambiguity / no hit / unknown qualifier. 
+ if not isinstance(qualifier, str) or not qualifier: + return None + in_scope = [ + entry for entry in aliases_by_file.get(source_file, {}).get(qualifier, []) + if _using_in_scope(entry[1], entry[2], source_node) + ] + if in_scope: + hits = set() + for target_fqn, _scope_kind, _scope_id in in_scope: + alias_ns = _strip_trailing_csharp_generic_args(html.unescape(target_fqn)) + hit = type_def_index.get((alias_ns, label)) + if hit: + hits.add(hit) + return next(iter(hits)) if len(hits) == 1 else None + if qualifier in known_namespaces: + return type_def_index.get((qualifier, label)) + return None + + def _is_placeholder(node: dict | None) -> bool: + return bool(node) and not node.get("source_file") + + def _is_csharp_relevant_target(node: dict) -> bool: + if node.get("type") == "namespace": + return True + source_file = node.get("source_file") + return not source_file or _is_cs_file(source_file) + + def _label_for_type_ref_target(target_node: dict, source_file: str) -> str | None: + label = target_node.get("label") + if not isinstance(label, str) or not label: + return None + if not label.endswith(".cs"): + return label + + stem = label[:-3] + for alias in aliases_by_file.get(source_file, {}): + if alias.lower() == stem.lower() or _make_id(alias) == _make_id(stem): + return alias + return stem or None + + def _dangling_stub_id(label: str, current_target: object) -> str: + current = node_by_id.get(current_target) + if _is_placeholder(current) and current.get("label") == label: + return str(current_target) + + for node in all_nodes: + nid = node.get("id") + if ( + isinstance(nid, str) + and node.get("label") == label + and _is_placeholder(node) + ): + return nid + + stem = _make_id(label) + stub_id = stem + if stub_id in node_by_id: + stub_id = _make_id("csharp_type_ref", label) + suffix = 2 + while stub_id in node_by_id: + stub_id = _make_id("csharp_type_ref", label, str(suffix)) + suffix += 1 + node = { + "id": stub_id, + "label": label, + "file_type": "code", + 
"source_file": "", + "source_location": "", + } + all_nodes.append(node) + node_by_id[stub_id] = node + return stub_id REPOINT_RELATIONS = {"implements", "inherits", "references"} repointed_from: set[str] = set() for edge in all_edges: if edge.get("relation") not in REPOINT_RELATIONS: continue - tgt = edge.get("target") - label = stub_label.get(tgt) + source_file = edge.get("source_file") + if not _is_cs_file(source_file): + continue + source_node = node_by_id.get(edge.get("source")) + target_node = node_by_id.get(edge.get("target")) + if not source_node or not target_node: + continue + if not _is_csharp_relevant_target(target_node): + continue + metadata = _metadata(edge.get("metadata")) + label = metadata.get("ref_token") or _label_for_type_ref_target(target_node, source_file) if not label: continue - ref_file = edge.get("source_file", "") - resolved = None - alias_fqn = aliases_by_file.get(ref_file, {}).get(label) - if alias_fqn: - ns, _, simple = alias_fqn.rpartition(".") - resolved = fqn_to_id.get(_key(ns, simple)) - if resolved is None: - cands: list[str] = [] - for ns in scope_by_file.get(ref_file, []): - hit = fqn_to_id.get(_key(ns, label)) - if hit and hit not in cands: - cands.append(hit) - if len(cands) == 1: - resolved = cands[0] - if resolved and resolved != tgt: - edge["target"] = resolved - repointed_from.add(tgt) + if metadata.get("qualified"): + resolved = _resolve_qualified(label, metadata.get("ref_qualifier"), source_node, source_file) + else: + resolved = _resolve_label(label, source_node, source_file) + target = edge.get("target") + desired = resolved or _dangling_stub_id(label, target) + if desired != target: + edge["target"] = desired + if isinstance(target, str) and _is_placeholder(target_node): + repointed_from.add(target) if not repointed_from: return diff --git a/tests/test_csharp_type_resolution.py b/tests/test_csharp_type_resolution.py index 54e54821a..694a491d2 100644 --- a/tests/test_csharp_type_resolution.py +++ 
b/tests/test_csharp_type_resolution.py @@ -33,6 +33,27 @@ def _defs(result: dict, label: str) -> list[dict]: ] +def test_csharp_declaration_nodes_carry_enclosing_namespace(tmp_path: Path): + block = _write( + tmp_path / "block.cs", + "namespace Game.Core { public class Damage {} }\n", + ) + nested = _write( + tmp_path / "nested.cs", + "namespace Outer { namespace Inner { public class NestedDamage {} } }\n", + ) + file_scoped = _write( + tmp_path / "file_scoped.cs", + "namespace FileScoped.Core;\npublic class FileScopedDamage {}\n", + ) + result = extract([block, nested, file_scoped], cache_root=tmp_path) + + assert _defs(result, "Damage")[0].get("metadata", {}).get("namespace") == "Game.Core" + assert _defs(result, "NestedDamage")[0].get("metadata", {}).get("namespace") == "Outer.Inner" + assert _defs(result, "FileScopedDamage")[0].get("metadata", {}).get("namespace") == "FileScoped.Core" + assert _defs(result, "Damage")[0]["metadata"].get("scope_chain"), "lexical scope_chain must be stamped" + + def test_csharp_cross_file_inherits_resolves_to_real_def(tmp_path: Path): core = _write(tmp_path / "core.cs", "namespace Game.Core { public class Damage { public int Calc() { return 1; } } }\n") @@ -170,3 +191,379 @@ def test_csharp_using_alias_resolves_to_aliased_type(tmp_path: Path): assert all("core.cs" in d["source_file"] for d in damage), ( "the alias `Dmg` must resolve to the real Game.Core.Damage def, not a shadow stub" ) + + +def test_csharp_namespace_nodes_canonical_and_discriminated(tmp_path: Path): + a = _write(tmp_path / "a.cs", "namespace N { class A {} }\n") + b = _write(tmp_path / "b.cs", "namespace N { class B {} }\n") + nested = _write(tmp_path / "n.cs", "namespace Outer { namespace Inner { class C {} } }\n") + result = extract([a, b, nested], cache_root=tmp_path) + + ns = [n for n in result["nodes"] if n.get("type") == "namespace"] + by_label = {} + for n in ns: + by_label.setdefault(n["label"], []).append(n) + assert len(by_label.get("N", [])) == 1, 
"namespace N must be one canonical node across files" + assert "Outer.Inner" in by_label, sorted(by_label) + assert all(n["id"].startswith("csharp_namespace:") for n in ns), [n["id"] for n in ns] + + +def test_csharp_import_edges_carry_using_kind(tmp_path: Path): + f = _write( + tmp_path / "a.cs", + "using Game.Core;\nusing static System.Math;\nglobal using System;\n" + "using X = Game.Core.Damage;\nclass Z {}\n", + ) + result = extract([f], cache_root=tmp_path) + imports = { + (e["metadata"].get("using_kind"), e["metadata"].get("target_fqn"), e["metadata"].get("alias")) + for e in result["edges"] + if e.get("relation") == "imports" and e.get("metadata") + } + assert ("namespace", "Game.Core", None) in imports, imports + assert ("namespace", "System", None) in imports, imports + assert ("static", "System.Math", None) in imports, imports + assert ("alias", "Game.Core.Damage", "X") in imports, imports + + +def test_csharp_import_edges_resolve_internal_namespace_and_alias(tmp_path: Path): + core = _write( + tmp_path / "core.cs", + "namespace Game.Core { public class Damage {} }\n", + ) + user = _write( + tmp_path / "u.cs", + "using Game.Core;\n" + "using UnityEngine;\n" + "using Dmg = Game.Core.Damage;\n" + "using DMath = System.Math;\n" + "using static Game.Core.Damage;\n" + "class Z {}\n", + ) + result = extract([core, user], cache_root=tmp_path) + by_id = {n["id"]: n for n in result["nodes"]} + imports = [ + (e["metadata"]["using_kind"], e["metadata"].get("target_fqn"), by_id.get(e["target"])) + for e in result["edges"] + if e.get("relation") == "imports" and (e.get("metadata") or {}).get("using_kind") + ] + + assert ("namespace", "Game.Core", "namespace") in [ + (kind, fqn, target.get("type") if target else None) + for kind, fqn, target in imports + ] + assert ("namespace", "UnityEngine", None) in [ + (kind, fqn, target.get("type") if target else None) + for kind, fqn, target in imports + ] + assert ("alias", "Game.Core.Damage", "Damage") in [ + (kind, fqn, 
target.get("label") if target else None) + for kind, fqn, target in imports + ] + assert ("alias", "System.Math", None) in [ + (kind, fqn, target.get("label") if target else None) + for kind, fqn, target in imports + ] + assert ("static", "Game.Core.Damage", None) in [ + (kind, fqn, target.get("label") if target else None) + for kind, fqn, target in imports + ] + assert not [ + n for n in result["nodes"] + if not n.get("source_file") and n.get("label") in {"Game.Core", "Game.Core.Damage"} + ] + + +def test_csharp_qualified_base_ref_is_flagged(tmp_path: Path): + f = _write(tmp_path / "a.cs", "namespace N { class T {} class Use : B.T {} }\n") + result = extract([f], cache_root=tmp_path) + assert any((e.get("metadata") or {}).get("qualified") for e in result["edges"]), \ + "the qualified base ref B.T must carry metadata.qualified" + + +def test_csharp_one_file_same_name_no_collision_flag(tmp_path: Path): + # ns_collision is gone: A.T and B.T are distinct nodes with no ns_collision metadata. 
+ dup = _write(tmp_path / "dup.cs", "namespace A { class T {} } namespace B { class T {} }\n") + result = extract([dup], cache_root=tmp_path) + tnodes = [n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")] + assert len({n["id"] for n in tnodes}) == 2, tnodes + assert not any((n.get("metadata") or {}).get("ns_collision") for n in tnodes), \ + "ns_collision must no longer be stamped" + + +def test_csharp_type_parameter_emits_no_reference(tmp_path: Path): + f = _write(tmp_path / "a.cs", "namespace N { class T {} class Box { T value; } }\n") + result = extract([f], cache_root=tmp_path) + real_t = {n["id"] for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")} + box_to_t = [ + e for e in result["edges"] + if e.get("relation") in ("references", "inherits", "implements") + and e.get("target") in real_t + and "box" in str(e.get("source", "")).lower() + ] + assert not box_to_t, f"type parameter T must not produce a ref to the real N.T: {box_to_t}" + + +def test_csharp_nested_type_carries_metadata(tmp_path: Path): + f = _write(tmp_path / "a.cs", "namespace N { class Outer { class Inner {} } }\n") + result = extract([f], cache_root=tmp_path) + inner = [n for n in result["nodes"] if n.get("label") == "Inner"] + assert inner and inner[0].get("metadata", {}).get("is_nested_type") is True, inner + + +def test_csharp_cross_namespace_ref_not_misbound(tmp_path: Path): + # Use in namespace B must NOT bind to C.T (B never opens C) — even though T is globally unique. + f = _write(tmp_path / "x.cs", "namespace B { class Use : T {} } namespace C { class T {} }\n") + result = extract([f], cache_root=tmp_path) + resolved = [t for t in _targets(result, "inherits", "T") if t.get("source_file")] + assert not resolved, f"Use:T in B must not bind C.T: {resolved}" + + +def test_csharp_same_file_cross_namespace_ref_not_misbound(tmp_path: Path): + # Same file, T defined in B, Use in C : T — must NOT bind B.T (the eager same-file binding case). 
+ f = _write(tmp_path / "x.cs", "namespace B { class T {} } namespace C { class Use : T {} }\n") + result = extract([f], cache_root=tmp_path) + resolved = [t for t in _targets(result, "inherits", "T") if t.get("source_file")] + assert not resolved, f"same-file Use:T in C must not bind B.T: {resolved}" + + +def test_csharp_inherits_does_not_bind_namespace_node(tmp_path: Path): + # class Use : Game where Game is a namespace — must NOT bind the namespace node (Chunk-1 review B1). + f = _write(tmp_path / "y.cs", "namespace Game { class Damage {} class Use : Game {} }\n") + result = extract([f], cache_root=tmp_path) + nsids = {n["id"] for n in result["nodes"] if n.get("type") == "namespace"} + bad = [e for e in result["edges"] if e.get("relation") == "inherits" and e.get("target") in nsids] + assert not bad, f"inherits must not target a namespace node: {bad}" + + +def test_csharp_qualified_ref_unknown_qualifier_dangles(tmp_path: Path): + # B.T where B is neither a known namespace nor an alias -> must NOT bind A.T (sound dangle). 
+ f = _write(tmp_path / "a.cs", "namespace A { class T {} class Use : B.T {} }\n") + result = extract([f], cache_root=tmp_path) + resolved = [t for t in _targets(result, "inherits", "T") if t.get("source_file")] + assert not resolved, f"unknown-qualifier B.T must not bind A.T: {resolved}" + + +def test_csharp_qualified_ref_known_namespace_resolves(tmp_path: Path): + a = _write(tmp_path / "n.cs", "namespace N { class T {} }\n") + b = _write(tmp_path / "m.cs", "namespace M { class Use : N.T {} }\n") + result = extract([a, b], cache_root=tmp_path) + n_t = next(n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")) + use = next(n for n in result["nodes"] if n.get("label") == "Use") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (use["id"], n_t["id"]) in inh, "M.Use : N.T must bind N.T" + + +def test_csharp_qualified_generic_resolves_to_real_def(tmp_path: Path): + # N.Box previously emitted a junk 'B'-style label; it must resolve to the real N.Box def. + f = _write(tmp_path / "g.cs", "namespace N { class Box {} class Use { N.Box b; } }\n") + result = extract([f], cache_root=tmp_path) + box = next(n for n in result["nodes"] if n.get("label") == "Box" and n.get("source_file")) + use = next(n for n in result["nodes"] if n.get("label") == "Use") + refs = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "references"} + assert (use["id"], box["id"]) in refs, "N.Box field must resolve to the real N.Box def" + assert not any("<" in (n.get("label") or "") for n in result["nodes"]), \ + "no node should carry a junk generic label" + + +def test_csharp_qualified_alias_namespace_resolves(tmp_path: Path): + # using B = X.Y (namespace alias) then B.T -> resolves the type T in namespace X.Y. 
+ a = _write(tmp_path / "n.cs", "namespace X.Y { class T {} }\n") + b = _write(tmp_path / "m.cs", "using B = X.Y;\nnamespace M { class Use : B.T {} }\n") + result = extract([a, b], cache_root=tmp_path) + t = next(n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")) + use = next(n for n in result["nodes"] if n.get("label") == "Use") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (use["id"], t["id"]) in inh, "B.T with `using B = X.Y;` must resolve to X.Y.T" + + +def test_csharp_qualified_out_of_scope_alias_falls_through_to_namespace(tmp_path: Path): + # B is a real namespace AND an out-of-scope alias (declared in A, used in M): + # B.T in M must resolve to namespace B's T, not dangle. + a = _write(tmp_path / "b.cs", "namespace B { class T {} }\n") + c = _write(tmp_path / "m.cs", + "namespace A { using B = X.Y; }\nnamespace M { class Use : B.T {} }\n") + result = extract([a, c], cache_root=tmp_path) + b_t = next(n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")) + use = next(n for n in result["nodes"] if n.get("label") == "Use") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (use["id"], b_t["id"]) in inh, "out-of-scope alias B must fall through to namespace B" + + +def test_csharp_qualified_in_scope_alias_shadows_namespace(tmp_path: Path): + # B is both a real namespace AND an in-scope alias (B = X.Y) in A's block; a later out-of-scope + # alias (B = Z.Q in C) must not overwrite it. Good : B.T -> X.Y.T, not namespace B's T. 
+ a = _write(tmp_path / "xy.cs", "namespace X.Y { class T {} }\n") + b = _write(tmp_path / "b.cs", "namespace B { class T {} }\n") + c = _write(tmp_path / "use.cs", + "namespace A { using B = X.Y; class Good : B.T {} }\nnamespace C { using B = Z.Q; }\n") + result = extract([a, b, c], cache_root=tmp_path) + xy_t = next(n for n in result["nodes"] + if n.get("label") == "T" and (n.get("metadata") or {}).get("namespace") == "X.Y") + b_t = next(n for n in result["nodes"] + if n.get("label") == "T" and (n.get("metadata") or {}).get("namespace") == "B") + good = next(n for n in result["nodes"] if n.get("label") == "Good") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (good["id"], xy_t["id"]) in inh, "in-scope alias B=X.Y must resolve B.T to X.Y.T" + assert (good["id"], b_t["id"]) not in inh, "must NOT bind namespace B's T" + + +def test_csharp_one_file_same_name_binds_own_namespace(tmp_path: Path): + # T in both A and B of one file; Use:T in B must bind B.T (its own namespace), not A.T. + f = _write( + tmp_path / "c.cs", + "namespace A { class T {} } namespace B { class T {} class Use : T {} }\n", + ) + result = extract([f], cache_root=tmp_path) + b_t = next(n for n in result["nodes"] + if n.get("label") == "T" and (n.get("metadata") or {}).get("namespace") == "B") + a_t = next(n for n in result["nodes"] + if n.get("label") == "T" and (n.get("metadata") or {}).get("namespace") == "A") + use = next(n for n in result["nodes"] if n.get("label") == "Use") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (use["id"], b_t["id"]) in inh, "Use:T in B must bind B.T" + assert (use["id"], a_t["id"]) not in inh, "Use:T must NOT bind A.T" + + +def test_csharp_nested_type_not_importable_via_using(tmp_path: Path): + # Inner is nested in Outer; `using N;` does not bring Inner into scope as a bare member. 
+ a = _write(tmp_path / "a.cs", "namespace N { class Outer { class Inner {} } }\n") + b = _write(tmp_path / "b.cs", "using N;\nnamespace M { class Use { Inner x; } }\n") + result = extract([a, b], cache_root=tmp_path) + resolved = [t for t in _targets(result, "references", "Inner") if t.get("source_file")] + assert not resolved, f"nested Inner must not resolve via `using N;`: {resolved}" + + +def test_csharp_generic_alias_resolves_to_base_type(tmp_path: Path): + core = _write(tmp_path / "core.cs", "namespace N { class Box {} }\n") + use = _write(tmp_path / "use.cs", "using Bx = N.Box;\nclass Use : Bx {}\n") + result = extract([core, use], cache_root=tmp_path) + resolved = [t for t in _targets(result, "inherits", "Box") if t.get("source_file")] + assert resolved, "generic alias `using Bx = N.Box;` must resolve to the real Box def" + + +def test_csharp_type_ref_never_targets_a_file_label(tmp_path: Path): + core = _write(tmp_path / "core.cs", "namespace N { class Box {} }\n") + b = _write(tmp_path / "b.cs", "using B = N.Box;\nclass Use : B {}\n") + result = extract([core, b], cache_root=tmp_path) + bad = [ + e for e in result["edges"] + if e.get("relation") in ("inherits", "implements", "references") + and str(_node_by_id(result, e.get("target")).get("label", "") if _node_by_id(result, e.get("target")) else "").endswith(".cs") + ] + assert not bad, f"a C# type ref must not target a .cs file-labeled node: {bad}" + + +def test_csharp_type_ref_edges_carry_ref_token(tmp_path: Path): + core = _write(tmp_path / "core.cs", "namespace N { class Base {} }\n") + use = _write(tmp_path / "use.cs", "using N;\nnamespace M { class Use : Base {} }\n") + result = extract([core, use], cache_root=tmp_path) + inh = [ + e for e in result["edges"] + if e.get("relation") == "inherits" + and "use" in str(e.get("source", "")).lower() + ] + assert inh, "expected the Use : Base inherits edge" + assert any((e.get("metadata") or {}).get("ref_token") == "Base" for e in inh), \ + "the inherits edge 
must carry metadata.ref_token == 'Base'" + + +def test_csharp_alias_matching_file_stem_resolves_via_token(tmp_path: Path): + # alias name == file stem (B in b.cs) used to corrupt the target label; the + # ref token makes the arbiter resolve it correctly regardless. + core = _write(tmp_path / "core.cs", "namespace N { class Box {} }\n") + b = _write(tmp_path / "b.cs", "using B = N.Box;\nclass Use : B {}\n") + result = extract([core, b], cache_root=tmp_path) + resolved = [t for t in _targets(result, "inherits", "Box") if t.get("source_file")] + assert resolved, "Use : B (alias B == file stem) must resolve to the real Box def" + + +def test_csharp_same_name_diff_namespace_have_distinct_ids(tmp_path: Path): + # The id now carries the namespace, so A.T and B.T are distinct nodes (resolution unchanged here). + f = _write(tmp_path / "x.cs", "namespace A { class T {} } namespace B { class T {} }\n") + result = extract([f], cache_root=tmp_path) + ids = {n["id"] for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")} + assert len(ids) == 2, f"A.T and B.T must be distinct nodes: {ids}" + + +def test_csharp_global_scope_id_unchanged(tmp_path: Path): + # A C# type at global scope (no namespace) keeps the bare stem+name id (empty namespace dropped by make_id). 
+ from graphify.extractors.base import _make_id, _file_stem + f = _write(tmp_path / "g.cs", "class Glob {}\n") + result = extract([f], cache_root=tmp_path) + glob = next(n for n in result["nodes"] if n.get("label") == "Glob") + stem = _file_stem(tmp_path / "g.cs") + if "/" in stem: + stem = stem.rsplit("/", 1)[-1] + assert glob["id"] == _make_id(stem, "Glob"), glob + assert "namespace" not in (glob.get("metadata") or {}) + + +def test_csharp_namespaced_id_carries_namespace_segment(tmp_path: Path): + f = _write(tmp_path / "n.cs", "namespace Game.Core { class Order {} }\n") + result = extract([f], cache_root=tmp_path) + order = next(n for n in result["nodes"] if n.get("label") == "Order") + assert order["id"].endswith("order") and "game_core" in order["id"], order["id"] + assert (order.get("metadata") or {}).get("namespace") == "Game.Core" + +def test_csharp_two_namespaces_each_resolve_own_type(tmp_path: Path): + f = _write( + tmp_path / "two.cs", + "namespace A { class T {} class UseA : T {} } namespace B { class T {} class UseB : T {} }\n", + ) + result = extract([f], cache_root=tmp_path) + + def _n(label, ns): + return next(x for x in result["nodes"] + if x.get("label") == label and (x.get("metadata") or {}).get("namespace") == ns) + + a_t, b_t, use_a, use_b = _n("T", "A"), _n("T", "B"), _n("UseA", "A"), _n("UseB", "B") + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (use_a["id"], a_t["id"]) in inh and (use_b["id"], b_t["id"]) in inh + assert (use_a["id"], b_t["id"]) not in inh and (use_b["id"], a_t["id"]) not in inh + + +def test_csharp_file_level_using_applies_across_blocks(tmp_path: Path): + a = _write(tmp_path / "n.cs", "namespace N { class T {} }\n") + b = _write(tmp_path / "u.cs", "using N;\nnamespace A { class X : T {} } namespace B { class Y : T {} }\n") + result = extract([a, b], cache_root=tmp_path) + resolved = [t["id"] for t in _targets(result, "inherits", "T") if t.get("source_file")] + assert 
len(resolved) >= 2, f"file-level using N must reach both A.X and B.Y: {resolved}" + + +def test_csharp_namespace_scoped_using_isolated_to_sibling_block(tmp_path: Path): + a = _write(tmp_path / "n.cs", "namespace N { class T {} }\n") + b = _write( + tmp_path / "u.cs", + "namespace A { using N; class Good : T {} }\nnamespace A { class Bad : T {} }\n", + ) + result = extract([a, b], cache_root=tmp_path) + good = next(n for n in result["nodes"] if n.get("label") == "Good") + bad = next(n for n in result["nodes"] if n.get("label") == "Bad") + n_t = next(n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")) + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == "inherits"} + assert (good["id"], n_t["id"]) in inh, "Good (same block as using N) must bind N.T" + assert (bad["id"], n_t["id"]) not in inh, "Bad (sibling block, no using) must NOT bind N.T" + + +def test_csharp_using_flows_into_nested_block(tmp_path: Path): + a = _write(tmp_path / "n.cs", "namespace N { class T {} }\n") + b = _write(tmp_path / "u.cs", "namespace A { using N; namespace B { class Inner : T {} } }\n") + result = extract([a, b], cache_root=tmp_path) + resolved = [t["id"] for t in _targets(result, "inherits", "T") if t.get("source_file")] + assert resolved, "using N in outer block A must flow into nested block B" + + +def test_csharp_alias_using_scoped_to_its_block(tmp_path: Path): + a = _write(tmp_path / "n.cs", "namespace N { class T {} }\n") + b = _write( + tmp_path / "u.cs", + "namespace A { using AliasT = N.T; class Good : AliasT {} }\nnamespace A { class Bad : AliasT {} }\n", + ) + result = extract([a, b], cache_root=tmp_path) + good = next(n for n in result["nodes"] if n.get("label") == "Good") + bad = next(n for n in result["nodes"] if n.get("label") == "Bad") + n_t = next(n for n in result["nodes"] if n.get("label") == "T" and n.get("source_file")) + inh = {(e["source"], e["target"]) for e in result["edges"] if e.get("relation") == 
"inherits"} + assert (good["id"], n_t["id"]) in inh, "Good must bind N.T via the in-block alias" + assert (bad["id"], n_t["id"]) not in inh, "Bad (sibling block) must NOT see the alias" From 0c551ace369a9c82dfde4ac03e904f9da5b8dff4 Mon Sep 17 00:00:00 2001 From: safishamsi Date: Tue, 30 Jun 2026 20:00:00 +0100 Subject: [PATCH 6/6] release: 0.9.3 Cross-file member-call resolution for C++/ObjC (#1547/#1556) and namespace-aware C# type resolution (#1562), the work-memory overlay (#1441), test-mock call-graph fix (#1553), hyperedge member-key aliases (#1561), plus the TS/JS/ObjC resolution fixes (#1316/#1544/#1552/#1475). See CHANGELOG. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 6 +++++- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a918232a4..c480ae64f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,12 @@ Full release notes with details on each version: [GitHub Releases](https://github.com/safishamsi/graphify/releases) -## Unreleased +## 0.9.3 (2026-06-30) +- Feat: cross-file member-call resolution for C++ and Objective-C (#1547, #1556). A class declared in a header and defined in its `.cpp`/`.m` no longer fragments into two nodes (a decl/def merge pass collapses the sibling header/impl pair, gated to same-directory same-name so unrelated classes never merge), and a member call now resolves across files by the receiver's inferred type: C++ `Foo f; f.bar()` / `Foo::bar()` / `this->bar()` and ObjC `Foo *f = [[Foo alloc] init]; [f doThing]` / `[self render]` link to the owning class's method. Resolution is by receiver type, never bare name, with the single-definition god-node guard — an uninferable or ambiguous receiver produces no edge (high precision over recall, grounded in how compiler-free indexers like ctags/Doxygen mis-resolve by name). Also routes C++ headers to the C++ extractor and ObjC `#import` bridging headers to the ObjC extractor. 
Reported by @c0dezer019 and @JabberYQ. (Residual cross-file `#include` edge resolution under symlinked roots and ObjC dynamic-dispatch receivers remain follow-ups.) +- Feat: namespace-aware C# cross-file type resolution (#1562, thanks @TheFedaikin). The namespace is folded into the C# node id (so same-named types in different namespaces stay distinct), `using` directives are honored with lexical per-block scope, and qualified references (`Namespace.Type`, `using` aliases) resolve — disambiguating a bare reference to the one in-scope namespace that provides it, and refusing (no edge) when ambiguous. Advances the #1318 shadow-node umbrella for C#. +- Fix: test mocks no longer erase the real cross-file call graph (#1553, thanks @Schweinehund). When a bare callee name had 2+ definitions without unique import evidence, the god-node guard dropped the edge entirely — so a single same-named test mock wiped the real call graph (a 76-stub Pester suite erased everything). The guard now applies tie-breakers — non-test preference (a shared, segment-aware path classifier) then path proximity — and resolves only when exactly one candidate survives, else still bails. A real def plus a test mock resolves to the real def; two genuine non-test defs still bail (no fan-out). +- Fix: hyperedge member lists keyed `members` or `node_ids` are now accepted, not silently dropped (#1561, thanks @askalot-io). Normalized to the canonical `nodes` at ingest (in build_from_json and semantic_cleanup), deduped, with a warning — mirroring the existing from/to edge-endpoint aliasing. - Feat: work-memory overlay — `graphify reflect` now projects the verdicts it distills (preferred / tentative / contested, recency-weighted) into a `.graphify_learning.json` sidecar next to graph.json, and `graphify explain` / `query` / `GRAPH_REPORT.md` / the HTML viewer surface them where you look (a `Lesson:` hint, a colored node ring). 
Builds on the idea in #1441/#1542 (thanks @TPAteeq), implemented as a sidecar rather than stamping graph.json: structural truth stays separate (no `learning_*` in graph.json or GraphML exports, no rebuild churn). Each verdict carries the source questions that produced it (provenance) and a content fingerprint of the cited code, so a verdict on a file that has changed since is flagged "code changed — re-verify" instead of shown as still-authoritative. Dead-ends stay query-scoped (a report section, never a node attribute). Letting verdicts influence query traversal is deliberately deferred (it needs propensity correction + exploration to avoid a self-reinforcing feedback loop). - Feat: type-aware `this.field.method()` resolution for TypeScript/JS (#1316, thanks @guyoron1). A member call through a constructor-injected dependency (`constructor(private db: Database)` then `this.db.query()`) now produces a `calls` edge to the field type's method, resolved by the field's declared type and gated by the single-definition god-node guard (an ambiguous or untyped field produces no edge — no global name-match fan-out). EXTRACTED confidence; constructor parameter-property injection scope. - Feat: resolve TypeScript wildcard path aliases (#1544, thanks @oleksii-tumanov). A `compilerOptions.paths` pattern like `@app/*` or `@*/interfaces` now captures the matched segment and substitutes it into each target in order, honoring tsc's longest-prefix / exact-wins specificity, baseUrl, and the first-existing-target fallback. Extends the #1531 resolver. 
diff --git a/pyproject.toml b/pyproject.toml index 7e76e4f6f..4bcf34d92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "graphifyy" -version = "0.9.2" +version = "0.9.3" description = "AI coding assistant skill (Claude Code, CodeBuddy, Codex, OpenCode, Kilo Code, Cursor, Gemini CLI, Aider, OpenClaw, Factory Droid, Trae, Hermes, Kiro, Pi, Devin CLI, Google Antigravity) - turn any folder of code, docs, papers, images, or videos into a queryable knowledge graph" readme = "README.md" license = { file = "LICENSE" } diff --git a/uv.lock b/uv.lock index c9fe20699..fcd87008b 100644 --- a/uv.lock +++ b/uv.lock @@ -1090,7 +1090,7 @@ wheels = [ [[package]] name = "graphifyy" -version = "0.9.2" +version = "0.9.3" source = { editable = "." } dependencies = [ { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },