From 4f2147af345141d97ad691fef9aed8a616034042 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 27 Apr 2026 11:37:53 +0800 Subject: [PATCH 1/5] support generating release notes by ai --- scripts/release_notes_ai/__init__.py | 1 + scripts/release_notes_ai/ai_client.py | 296 +++++++ scripts/release_notes_ai/cli.py | 283 ++++++ scripts/release_notes_ai/constants.py | 98 +++ scripts/release_notes_ai/excel_workbook.py | 906 ++++++++++++++++++++ scripts/release_notes_ai/github_client.py | 321 +++++++ scripts/release_notes_ai/markdown_writer.py | 121 +++ scripts/release_notes_ai/models.py | 101 +++ scripts/release_notes_ai/requirements.txt | 3 + scripts/release_notes_ai/scope_filter.py | 366 ++++++++ scripts/release_notes_ai/utils.py | 87 ++ scripts/release_notes_generate_ai.py | 10 + 12 files changed, 2593 insertions(+) create mode 100644 scripts/release_notes_ai/__init__.py create mode 100644 scripts/release_notes_ai/ai_client.py create mode 100644 scripts/release_notes_ai/cli.py create mode 100644 scripts/release_notes_ai/constants.py create mode 100644 scripts/release_notes_ai/excel_workbook.py create mode 100644 scripts/release_notes_ai/github_client.py create mode 100644 scripts/release_notes_ai/markdown_writer.py create mode 100644 scripts/release_notes_ai/models.py create mode 100644 scripts/release_notes_ai/requirements.txt create mode 100644 scripts/release_notes_ai/scope_filter.py create mode 100644 scripts/release_notes_ai/utils.py create mode 100644 scripts/release_notes_generate_ai.py diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py new file mode 100644 index 0000000000000..65f7e128c779b --- /dev/null +++ b/scripts/release_notes_ai/__init__.py @@ -0,0 +1 @@ +"""Helpers for generating TiDB release notes with AI.""" diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py new file mode 100644 index 0000000000000..503e28b63023b --- /dev/null +++ b/scripts/release_notes_ai/ai_client.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import dataclasses +from functools import lru_cache +import json +import os +import shlex +import shutil +import subprocess +import tempfile +import textwrap +from pathlib import Path +from typing import Any + +from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .models import GeneratedNote, RowContext + + +class AIClient: + def __init__(self, command: str, model: str | None, timeout: int): + self.command = shlex.split(command) + self.model = model + self.timeout = timeout + + def generate(self, prompt: str, expected_links: list[str], contributors: list[str]) -> GeneratedNote: + result, errors = self._run_and_validate(prompt, expected_links, contributors) + if result: + return result + + repair_prompt = build_repair_prompt(prompt, errors) + result, repair_errors = self._run_and_validate(repair_prompt, expected_links, contributors) + if result: + return result + raise ValueError("; ".join(repair_errors)) + + def _run_and_validate( + self, prompt: str, expected_links: list[str], contributors: list[str] + ) -> tuple[GeneratedNote | None, list[str]]: + output = self._run(prompt) + try: + data = extract_json_object(output) + except ValueError as exc: + return None, [str(exc)] + return validate_ai_response(data, expected_links, contributors) + + def _run(self, prompt: str) -> str: + command = list(self.command) + if not command: + raise ValueError("AI command is empty. 
Pass a command with --ai-command.") + if not is_executable_available(command[0]): + raise FileNotFoundError( + f"AI command executable not found: {command[0]!r}. " + "Install it or pass a custom command with --ai-command." + ) + + with tempfile.TemporaryDirectory() as temp_dir: + output_path: Path | None = None + if self._is_codex_exec(command): + if self.model: + command.extend(["-m", self.model]) + temp_path = Path(temp_dir) + schema_path = temp_path / "ai-output-schema.json" + output_path = temp_path / "ai-output.txt" + schema_path.write_text(json.dumps(ai_output_schema()), encoding="utf-8") + output_path.touch() + command.extend(["--output-schema", str(schema_path)]) + command.extend(["--output-last-message", str(output_path)]) + + completed = subprocess.run( + command, + input=prompt, + text=True, + capture_output=True, + timeout=self.timeout, + check=False, + ) + if completed.returncode != 0: + raise RuntimeError( + "AI command failed with exit code " + f"{completed.returncode}: {summarize_process_output(completed)}" + ) + if output_path and output_path.exists(): + last_message = output_path.read_text(encoding="utf-8").strip() + if last_message: + return last_message + return completed.stdout.strip() + + @staticmethod + def _is_codex_exec(command: list[str]) -> bool: + if not command: + return False + executable = Path(command[0]).name + return executable == "codex" and "exec" in command[1:] + + +def is_executable_available(executable: str) -> bool: + if os.sep in executable or (os.altsep and os.altsep in executable): + return Path(executable).exists() + return shutil.which(executable) is not None + + +def ai_output_schema() -> dict[str, Any]: + return { + "type": "object", + "additionalProperties": False, + "required": ["type", "release_note", "needs_review", "reason"], + "properties": { + "type": {"type": "string", "enum": ["improvement", "bug_fix"]}, + "release_note": {"type": "string"}, + "needs_review": {"type": "boolean"}, + "reason": {"type": "string"}, + }, + } + + +def summarize_process_output(completed: subprocess.CompletedProcess[str]) -> str: + parts = [] + if completed.stderr.strip(): + parts.append("stderr:\n" + tail_output(completed.stderr)) + if completed.stdout.strip(): + parts.append("stdout:\n" + tail_output(completed.stdout)) + return "\n\n".join(parts) or "no output" + + +def tail_output(text: str, max_lines: int = 40, max_chars: int = 4000) -> str: + tail = "\n".join(text.strip().splitlines()[-max_lines:]) + if len(tail) > max_chars: + tail = "...[truncated]\n" + tail[-max_chars:] + return tail + + +def build_generation_prompt( + row_context: RowContext, + expected_links: list[str], + contributors: list[str], +) -> str: + improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) + bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) + context = { + "row_number": row_context.row_number, + "component": row_context.component, + "raw_component_from_excel": row_context.raw_component, + "issue_type_from_excel": row_context.issue_type, + "pr_title_from_excel": row_context.pr_title, + "formatted_release_note_from_excel": row_context.formatted_release_note, + "expected_links": expected_links, + "contributors": contributors, + "issues": [dataclasses.asdict(issue) for issue in row_context.issues], + "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], + } + return textwrap.dedent( + f""" + You write exactly one English TiDB release note entry. 
+ + Return only a JSON object with exactly these keys: + - type: "improvement" or "bug_fix" + - release_note: one Markdown bullet that starts with "- " + - needs_review: true or false + - reason: a short reason for the type and wording + + Rules: + - Write from the user's perspective. + - Use the Excel issue_type as a strong signal, but decide the final type from the issue, + PR description, and code changes. + - For improvements, follow the Improvements reference below. + - For bug fixes, follow the Bug fixes reference below. + - Do not end the release note with a period. + - Include every expected link in Markdown release-note style. + - Include every contributor as @[user](https://github.com/user). + - If there is no issue URL, use the PR link as the suffix link. + - Do not expose internal function names unless they are the user-visible behavior. + - If the available context is insufficient, still draft the best note and set needs_review + to true. + + Expected links: + {json.dumps(expected_links, ensure_ascii=False, indent=2)} + + Contributors: + {json.dumps(contributors, ensure_ascii=False, indent=2)} + + Row context: + {json.dumps(context, ensure_ascii=False, indent=2)} + + Improvements reference: + {improvements_reference} + + Bug fixes reference: + {bug_fixes_reference} + """ + ).strip() + + +def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: + return textwrap.dedent( + f""" + Your previous answer did not satisfy the required JSON schema or release-note rules. + + Validation errors: + {json.dumps(errors, ensure_ascii=False, indent=2)} + + Rewrite the answer. Return only the corrected JSON object. + + Original task: + {original_prompt} + """ + ).strip() + + +@lru_cache(maxsize=None) +def load_reference_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Cannot find release-note reference file: {path}. " + "Make sure the repo-local write-review-translate-release-notes skill is present." 
+ ) from exc + + +def extract_json_object(output: str) -> dict[str, Any]: + output = output.strip() + if not output: + raise ValueError("AI command returned no output") + try: + data = json.loads(output) + except json.JSONDecodeError: + candidates = extract_json_object_candidates(output) + if not candidates: + raise ValueError("AI output did not contain a JSON object") from None + required_keys = {"type", "release_note", "needs_review", "reason"} + data = next( + (candidate for candidate in candidates if required_keys <= candidate.keys()), + candidates[0], + ) + if not isinstance(data, dict): + raise ValueError("AI output JSON is not an object") + return data + + +def extract_json_object_candidates(output: str) -> list[dict[str, Any]]: + decoder = json.JSONDecoder() + candidates: list[dict[str, Any]] = [] + for index, char in enumerate(output): + if char != "{": + continue + try: + data, _end = decoder.raw_decode(output[index:]) + except json.JSONDecodeError: + continue + if isinstance(data, dict): + candidates.append(data) + return candidates + + +def validate_ai_response( + data: dict[str, Any], + expected_links: list[str], + contributors: list[str], +) -> tuple[GeneratedNote | None, list[str]]: + errors: list[str] = [] + note_type = data.get("type") + release_note = data.get("release_note") + needs_review = data.get("needs_review") + reason = data.get("reason") + + if note_type not in {"improvement", "bug_fix"}: + errors.append('type must be "improvement" or "bug_fix"') + if not isinstance(release_note, str) or not release_note.startswith("- "): + errors.append('release_note must be a string that starts with "- "') + if isinstance(release_note, str) and release_note.rstrip().endswith("."): + errors.append("release_note must not end with a period") + if not isinstance(needs_review, bool): + errors.append("needs_review must be a boolean") + if not isinstance(reason, str): + errors.append("reason must be a string") + + if isinstance(release_note, str): + for link in expected_links: + if link and link not in release_note: + errors.append(f"release_note is missing expected link: {link}") + for contributor in contributors: + expected = f"@[{contributor}](https://github.com/{contributor})" + if contributor and expected not in release_note: + errors.append(f"release_note is missing contributor: {contributor}") + + if errors: + return None, errors + return ( + GeneratedNote( + note_type=str(note_type), + release_note=str(release_note).strip(), + needs_review=bool(needs_review), + reason=str(reason).strip(), + ), + [], + ) diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py new file mode 100644 index 0000000000000..ee1d79a074c4a --- /dev/null +++ b/scripts/release_notes_ai/cli.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +import argparse +import os +import tempfile +from pathlib import Path + +import openpyxl + +from .ai_client import AIClient +from .excel_workbook import ( + clear_output_columns, + generate_notes_without_ai, + generate_notes_for_sheet, + merge_rows_by_issue_and_component, + prepare_sheet_columns, + sort_sheet_rows_by_component, + store_existing_release_notes, + update_pr_authors_and_dup_notes, +) +from .github_client import GitHubClient +from .markdown_writer import write_release_file +from .scope_filter import move_prs_not_in_scope, parse_date_value + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate English release notes with AI from a tirelease workbook." 
+ ) + parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") + parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.") + parser.add_argument( + "--releases-dir", + required=True, + help="Path to the existing English release notes directory.", + ) + parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") + parser.add_argument("--github-token-file", help="Path to a GitHub token file.") + parser.add_argument( + "--ai-command", + default="codex --ask-for-approval never exec --sandbox read-only --ephemeral", + help="Command-line AI command. The prompt is passed through stdin.", + ) + parser.add_argument( + "--ai-model", + default="gpt-5.4", + help="Model name passed to codex exec with -m.", + ) + parser.add_argument( + "--involve-ai-generation", + type=parse_on_off, + default="ON", + help=( + "Whether to use AI for non-dup release notes. Use ON to generate with AI, " + "or OFF to output the original formated_release_note values. Default: ON." + ), + ) + parser.add_argument( + "--output-release-file", + help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.", + ) + parser.add_argument( + "--ai-timeout", + type=int, + default=600, + help="Timeout in seconds for each AI command invocation.", + ) + parser.add_argument( + "--ai-workers", + type=int, + default=3, + help=( + "Number of concurrent AI command invocations. The default is conservative " + "for codex exec subprocesses." + ), + ) + parser.add_argument( + "--github-workers", + type=int, + default=8, + help="Number of concurrent GitHub API prefetch workers.", + ) + parser.add_argument( + "--author-workers", + type=int, + default=3, + help="Number of concurrent workers used to resolve bot-authored cherry-pick PR authors.", + ) + parser.add_argument( + "--checkpoint-interval", + type=int, + default=1, + help=( + "Save the Excel workbook after every N completed AI rows. " + "Default: 1. Use 0 to disable." + ), + ) + parser.add_argument( + "--force-regenerate", + action="store_true", + help="Clear existing AI release notes and regenerate all non-dup rows.", + ) + parser.add_argument( + "--release-date", + default="TBD", + help='Release date text for the Markdown header, for example "August 14, 2025".', + ) + parser.add_argument( + "--skip-scope-preprocess", + action="store_true", + help="Skip moving not-in-scope PR rows to the PRs_not_in_scope sheet.", + ) + parser.add_argument( + "--scope-base-branch-start-date", + help=( + "Override the estimated release-m.n branch start date for x.y.0 scope " + "preprocessing, in YYYY-MM-DD format." 
+ ), + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + validate_positive_int("--ai-workers", args.ai_workers) + validate_positive_int("--github-workers", args.github_workers) + validate_positive_int("--author-workers", args.author_workers) + if args.checkpoint_interval < 0: + raise ValueError("--checkpoint-interval must be greater than or equal to 0") + base_branch_start_date = None + if args.scope_base_branch_start_date: + base_branch_start_date = parse_date_value(args.scope_base_branch_start_date) + if not base_branch_start_date: + raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format") + + token = load_github_token(args.github_token_file) + github = GitHubClient(token) + involve_ai_generation = args.involve_ai_generation == "ON" + ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None + + output_file = ( + Path(args.output_release_file) + if args.output_release_file + else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md" + ) + + excel_path = Path(args.excel) + processed_excel_path = default_processed_excel_path(excel_path) + workbook = openpyxl.load_workbook(excel_path) + if args.sheet not in workbook.sheetnames: + raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}") + sheet = workbook[args.sheet] + if not args.skip_scope_preprocess: + move_prs_not_in_scope( + workbook, + sheet, + args.version, + Path(args.releases_dir), + github, + base_branch_start_date=base_branch_start_date, + ) + sort_sheet_rows_by_component(sheet) + header = prepare_sheet_columns(sheet) + clear_output_columns(sheet, header, clear_ai=args.force_regenerate) + + existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + update_pr_authors_and_dup_notes( + sheet, + header, + existing_notes, + github, + author_workers=args.author_workers, + ) + merge_rows_by_issue_and_component(sheet, header) + + if involve_ai_generation: + checkpoint_callback = build_checkpoint_callback( + workbook, + processed_excel_path, + args.checkpoint_interval, + ) + markdown_entries = generate_notes_for_sheet( + sheet, + header, + github, + ai, + ai_workers=args.ai_workers, + github_workers=args.github_workers, + checkpoint_callback=checkpoint_callback, + ) + else: + markdown_entries = generate_notes_without_ai(sheet, header) + save_workbook_safely(workbook, processed_excel_path) + write_release_file(output_file, args.version, args.release_date, markdown_entries) + + print(f"Original Excel workbook unchanged: {excel_path}", flush=True) + print(f"Processed Excel workbook: {processed_excel_path}", flush=True) + print(f"Generated release note file: {output_file}", flush=True) + return 0 + + +def validate_positive_int(name: str, value: int) -> None: + if value < 1: + raise ValueError(f"{name} must be greater than or equal to 1") + + +def parse_on_off(value: str) -> str: + normalized = value.strip().upper() + if normalized not in {"ON", "OFF"}: + raise argparse.ArgumentTypeError("value must be ON or OFF") + return normalized + + +def default_processed_excel_path(excel_path: Path) -> Path: + return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}") + + +def build_checkpoint_callback( + workbook: openpyxl.Workbook, + excel_path: Path, + checkpoint_interval: int, +): + if checkpoint_interval <= 0: + return None + + def checkpoint(completed: int, total: int) -> None: + if completed % checkpoint_interval != 0 and completed != total: + return + save_workbook_safely(workbook, 
excel_path) + print( + f"Checkpoint saved after {completed}/{total} AI row(s): {excel_path}", + flush=True, + ) + + return checkpoint + + +def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None: + excel_path = excel_path.resolve() + temp_file = tempfile.NamedTemporaryFile( + prefix=f".{excel_path.stem}.", + suffix=excel_path.suffix, + dir=excel_path.parent, + delete=False, + ) + temp_path = Path(temp_file.name) + temp_file.close() + saved_temp = False + try: + workbook.save(temp_path) + saved_temp = True + os.replace(temp_path, excel_path) + except Exception as exc: + if saved_temp and temp_path.exists(): + raise RuntimeError( + f"Failed to replace {excel_path}: {exc}. " + f"A complete temporary workbook remains at {temp_path}." + ) from exc + temp_path.unlink(missing_ok=True) + raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc + + +def load_github_token(token_file: str | None) -> str | None: + import shutil + import subprocess + + if token_file: + return Path(token_file).read_text(encoding="utf-8").strip() + if os.environ.get("GITHUB_TOKEN"): + return os.environ["GITHUB_TOKEN"].strip() + gh = shutil.which("gh") + if not gh: + return None + completed = subprocess.run( + [gh, "auth", "token"], + text=True, + capture_output=True, + timeout=10, + check=False, + ) + if completed.returncode == 0 and completed.stdout.strip(): + return completed.stdout.strip() + return None diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py new file mode 100644 index 0000000000000..c3e947167a23b --- /dev/null +++ b/scripts/release_notes_ai/constants.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import re +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +IMPROVEMENTS_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "improvements.md" +) +BUG_FIXES_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "bug-fixes.md" +) + +BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} +# Keep the misspelled source column name because tirelease exports it this way. 
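+# These header names must match the workbook's first row exactly;
+# prepare_sheet_columns() in excel_workbook.py raises a ValueError listing
+# any missing columns, so a renamed export fails fast instead of silently
+# producing empty notes.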
+REQUIRED_HEADERS = {
+    "pr_author",
+    "pr_link",
+    "pr_title",
+    "formated_release_note",
+    "issue_type",
+}
+COMPONENT_HEADERS = ("component", "components")
+
+GITHUB_ITEM_URL_RE = re.compile(
+    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/"
+    r"(?P<kind>issues|pull)/(?P<number>\d+)"
+)
+ISSUE_URL_RE = re.compile(
+    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/issues/(?P<number>\d+)"
+)
+PR_URL_RE = re.compile(
+    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/pull/(?P<number>\d+)"
+)
+AUTHOR_RE = re.compile(r"@\[([^\]]+)\]")
+
+TOP_LEVEL_COMPONENTS = ["TiDB", "TiKV", "PD", "TiFlash", "TiProxy"]
+TOOL_COMPONENTS = [
+    "Backup & Restore (BR)",
+    "TiCDC",
+    "TiDB Data Migration (DM)",
+    "TiDB Lightning",
+    "Dumpling",
+    "TiUP",
+    "TiDB Binlog",
+    "sync-diff-inspector",
+]
+COMPONENT_ALIASES = {
+    "tidb": "TiDB",
+    "tikv": "TiKV",
+    "pd": "PD",
+    "tiflash": "TiFlash",
+    "tiproxy": "TiProxy",
+    "br": "Backup & Restore (BR)",
+    "backup & restore": "Backup & Restore (BR)",
+    "backup & restore (br)": "Backup & Restore (BR)",
+    "cdc": "TiCDC",
+    "ticdc": "TiCDC",
+    "dm": "TiDB Data Migration (DM)",
+    "tidb data migration": "TiDB Data Migration (DM)",
+    "tidb data migration (dm)": "TiDB Data Migration (DM)",
+    "tidb lightning": "TiDB Lightning",
+    "lightning": "TiDB Lightning",
+    "dumpling": "Dumpling",
+    "tiup": "TiUP",
+    "tidb binlog": "TiDB Binlog",
+    "ng monitoring": "TiDB",
+    "sync_diff": "sync-diff-inspector",
+    "sync-diff-inspector": "sync-diff-inspector",
+    "sync diff inspector": "sync-diff-inspector",
+    "planner": "TiDB",
+    "execution": "TiDB",
+    "sql-infra": "TiDB",
+    "transaction": "TiDB",
+    "engine": "TiDB",
+    "observability": "TiDB",
+    "dxf": "TiDB",
+    "storage": "TiDB",
+    "tidb-dashboard": "TiDB",
+    "tidb dashboard": "TiDB",
+    "ddl": "TiDB",
+    "coprocessor": "TiDB",
+    "compute": "TiDB",
+    "scheduling": "TiDB",
+    "spm": "TiDB",
+    "ng-monitoring": "TiDB",
+}
diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py
new file mode 100644
index 0000000000000..260b4b807d04e
--- /dev/null
+++ b/scripts/release_notes_ai/excel_workbook.py
@@ -0,0 +1,906 @@
+from __future__ import annotations
+
+import copy
+import re
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any, Callable
+
+from openpyxl.styles import PatternFill
+
+from .ai_client import build_generation_prompt
+from .constants import (
+    AUTHOR_RE,
+    BOT_AUTHORS,
+    COMPONENT_HEADERS,
+    GITHUB_ITEM_URL_RE,
+    REQUIRED_HEADERS,
+    TOOL_COMPONENTS,
+    TOP_LEVEL_COMPONENTS,
+)
+from .models import (
+    ExistingNote,
+    GitHubDataCache,
+    MarkdownEntry,
+    RowContext,
+    RowGenerationResult,
+    RowInput,
+)
+from .utils import (
+    extract_issue_urls,
+    extract_pr_urls,
+    normalize_component,
+    normalize_raw_component,
+    normalized_release_component,
+    replace_author_markdown,
+    split_lines,
+    split_multi_value,
+    str_value,
+    unique_ordered,
+)
+
+
+GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid")
+
+
+def prepare_sheet_columns(sheet: Any) -> dict[str, int]:
+    header = get_header(sheet)
+    missing = sorted(REQUIRED_HEADERS - set(header))
+    if missing:
+        raise ValueError(f"Missing required Excel columns: {', '.join(missing)}")
+    get_component_col(header)
+
+    ai_col = header.get("release_notes_written_by_ai")
+    formatted_col = header["formated_release_note"]
+    if not ai_col:
+        sheet.insert_cols(formatted_col + 1)
+        sheet.cell(row=1, column=formatted_col + 1, 
value="release_notes_written_by_ai") + header = get_header(sheet) + + if "published_release_notes" not in header: + last_col = sheet.max_column + sheet.cell(row=1, column=last_col + 1, value="published_release_notes") + header = get_header(sheet) + return header + + +def get_header(sheet: Any) -> dict[str, int]: + header: dict[str, int] = {} + for index, cell in enumerate(sheet[1], start=1): + if cell.value: + header[str(cell.value).strip()] = index + return header + + +def clear_output_columns(sheet: Any, header: dict[str, int], clear_ai: bool = True) -> None: + for row_number in range(2, sheet.max_row + 1): + if clear_ai: + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None + sheet.cell(row=row_number, column=header["published_release_notes"]).value = None + + +def sort_sheet_rows_by_component(sheet: Any) -> None: + header = get_header(sheet) + component_col = get_component_col(header) + if sheet.max_row <= 2: + return + + snapshots = [ + (row_number, component_sort_key(sheet.cell(row=row_number, column=component_col).value), snapshot_row(sheet, row_number)) + for row_number in range(2, sheet.max_row + 1) + ] + sorted_snapshots = sorted(snapshots, key=lambda item: item[1]) + if [row_number for row_number, _key, _snapshot in snapshots] == [ + row_number for row_number, _key, _snapshot in sorted_snapshots + ]: + return + + for target_row, (_source_row, _key, snapshot) in enumerate(sorted_snapshots, start=2): + restore_row(sheet, target_row, snapshot) + + print("Sorted worksheet rows by component before release-note generation", flush=True) + + +def component_sort_key(value: Any) -> tuple[int, str]: + component = normalize_raw_component(value) + if not component: + return (1, "") + return (0, component.casefold()) + + +def snapshot_row(sheet: Any, row_number: int) -> dict[str, Any]: + row_dimension = sheet.row_dimensions[row_number] + return { + "height": row_dimension.height, + "hidden": row_dimension.hidden, + "outline_level": row_dimension.outlineLevel, + "collapsed": row_dimension.collapsed, + "cells": [snapshot_cell(sheet.cell(row=row_number, column=column)) for column in range(1, sheet.max_column + 1)], + } + + +def snapshot_cell(cell: Any) -> dict[str, Any]: + return { + "value": cell.value, + "style": copy.copy(cell._style), + "number_format": cell.number_format, + "hyperlink": copy.copy(cell.hyperlink) if cell.hyperlink else None, + "comment": copy.copy(cell.comment) if cell.comment else None, + } + + +def restore_row(sheet: Any, row_number: int, snapshot: dict[str, Any]) -> None: + row_dimension = sheet.row_dimensions[row_number] + row_dimension.height = snapshot["height"] + row_dimension.hidden = snapshot["hidden"] + row_dimension.outlineLevel = snapshot["outline_level"] + row_dimension.collapsed = snapshot["collapsed"] + for column, cell_snapshot in enumerate(snapshot["cells"], start=1): + cell = sheet.cell(row=row_number, column=column) + cell.value = cell_snapshot["value"] + cell._style = copy.copy(cell_snapshot["style"]) + cell.number_format = cell_snapshot["number_format"] + cell._hyperlink = copy.copy(cell_snapshot["hyperlink"]) if cell_snapshot["hyperlink"] else None + cell.comment = copy.copy(cell_snapshot["comment"]) if cell_snapshot["comment"] else None + + +def get_component_col(header: dict[str, int]) -> int: + for name in COMPONENT_HEADERS: + if name in header: + return header[name] + raise ValueError("Missing required Excel column: component or components") + + +def issue_urls_for_row(sheet: Any, header: dict[str, int], row_number: 
int) -> list[str]:
+    candidates: list[str] = []
+    if "issue_url" in header:
+        candidates.append(str_value(sheet.cell(row=row_number, column=header["issue_url"]).value))
+    candidates.append(str_value(sheet.cell(row=row_number, column=header["formated_release_note"]).value))
+    return unique_ordered(url for text in candidates for url in extract_issue_urls(text))
+
+
+def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str | None:
+    issue_urls = issue_urls_for_row(sheet, header, row_number)
+    return issue_urls[0] if issue_urls else None
+
+
+def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]:
+    existing_notes: list[ExistingNote] = []
+    seen: set[tuple[str, tuple[str, ...]]] = set()
+    target_version = parse_semver_tuple(version)
+
+    for file_path in sorted(releases_dir.rglob("*.md")):
+        if should_skip_release_file(file_path, target_version):
+            continue
+        level1 = level2 = level3 = ""
+        with file_path.open("r", encoding="utf-8") as file:
+            for raw_line in file:
+                line = raw_line.strip()
+                authors = AUTHOR_RE.findall(line)
+                item_url = GITHUB_ITEM_URL_RE.search(line)
+                if item_url:
+                    key = (item_url.group(), tuple(authors))
+                    if key in seen:
+                        continue
+                    seen.add(key)
+                    note_level = level1 + level2 + level3
+                    note_type, component = classify_note_level(note_level)
+                    existing_notes.append(
+                        ExistingNote(
+                            url=item_url.group(),
+                            line=line,
+                            file_name=file_path.name,
+                            note_level=note_level,
+                            authors=authors,
+                            note_type=note_type,
+                            component=component,
+                        )
+                    )
+                    continue
+
+                heading = parse_release_note_heading(raw_line)
+                if not heading:
+                    continue
+                heading_level, label = heading
+                if heading_level == 1:
+                    level1 = "> " + label
+                    level2 = level3 = ""
+                elif heading_level == 2:
+                    level2 = "> " + label
+                    level3 = ""
+                elif heading_level == 3:
+                    level3 = "> " + label
+    return existing_notes
+
+
+def should_skip_release_file(file_path: Path, target_version: tuple[int, int, int]) -> bool:
+    if "updated-by-ai" in file_path.stem:
+        return True
+    file_version = release_file_semver_tuple(file_path)
+    if not file_version:
+        return False
+    return file_version >= target_version
+
+
+def parse_semver_tuple(version: str) -> tuple[int, int, int]:
+    match = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", version)
+    if not match:
+        raise ValueError(f"Invalid TiDB version: {version}")
+    return (
+        int(match.group("major")),
+        int(match.group("minor")),
+        int(match.group("patch")),
+    )
+
+
+def release_file_semver_tuple(file_path: Path) -> tuple[int, int, int] | None:
+    match = re.match(
+        r"^release-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)",
+        file_path.stem,
+    )
+    if not match:
+        return None
+    return (
+        int(match.group("major")),
+        int(match.group("minor")),
+        int(match.group("patch")),
+    )
+
+
+def parse_release_note_heading(raw_line: str) -> tuple[int, str] | None:
+    line = raw_line.rstrip()
+    section = re.match(r"^##\s+(.+?)\s*$", line)
+    if section:
+        return 1, section.group(1).strip()
+
+    top_component = re.match(r"^[+-]\s+(.+?)\s*$", line)
+    if top_component:
+        label = top_component.group(1).strip()
+        if label.lower() == "tools" or normalized_release_component(label):
+            return 2, label
+
+    tool_component = re.match(r"^ {4}[+-]\s+(.+?)\s*$", line)
+    if tool_component:
+        label = tool_component.group(1).strip()
+        if normalized_release_component(label):
+            return 3, label
+    return None
+
+
+def update_pr_authors_and_dup_notes(
+    sheet: Any,
+    header: dict[str, int],
+    existing_notes: list[ExistingNote],
+    github: Any,
+    author_workers: int = 1,
+) -> None:
+    
apply_bot_author_replacements(sheet, header, github, author_workers) + existing_notes_by_url = index_existing_notes_by_url(existing_notes) + + for row_number in range(2, sheet.max_row + 1): + author_cell = sheet.cell(row=row_number, column=header["pr_author"]) + current_author = str_value(author_cell.value) + + issue_url = first_issue_url_for_row(sheet, header, row_number) + if not issue_url: + continue + + current_authors = split_multi_value(current_author) + dup_notes = [] + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) + + if dup_notes: + dup_col = header["published_release_notes"] + sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) + fill_row(sheet, row_number) + print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + + +def apply_bot_author_replacements( + sheet: Any, + header: dict[str, int], + github: Any, + author_workers: int, +) -> None: + requests = bot_author_requests(sheet, header) + if not requests: + return + print( + f"Resolving {len(requests)} bot-authored PR row(s) with {author_workers} worker(s)", + flush=True, + ) + + replacements = resolve_bot_author_replacements(requests, github, author_workers) + for row_number in sorted(replacements): + current_author, actual_author = replacements[row_number] + author_cell = sheet.cell(row=row_number, column=header["pr_author"]) + formatted_cell = sheet.cell(row=row_number, column=header["formated_release_note"]) + formatted_note = str_value(formatted_cell.value) + print( + f"Replacing bot author in row {row_number}: {current_author} -> {actual_author}", + flush=True, + ) + author_cell.value = actual_author + formatted_cell.value = replace_author_markdown( + formatted_note, current_author, actual_author + ) + + +def bot_author_requests(sheet: Any, header: dict[str, int]) -> list[tuple[int, str, str, str]]: + requests = [] + for row_number in range(2, sheet.max_row + 1): + current_author = str_value(sheet.cell(row=row_number, column=header["pr_author"]).value) + pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value) + if current_author not in BOT_AUTHORS or not pr_link: + continue + pr_title = str_value(sheet.cell(row=row_number, column=header["pr_title"]).value) + requests.append((row_number, pr_link, pr_title, current_author)) + return requests + + +def resolve_bot_author_replacements( + requests: list[tuple[int, str, str, str]], + github: Any, + author_workers: int, +) -> dict[int, tuple[str, str]]: + replacements: dict[int, tuple[str, str]] = {} + total = len(requests) + if author_workers == 1: + for completed, request in enumerate(requests, start=1): + row_number, pr_link, pr_title, current_author = request + actual_author = resolve_bot_author(github, request) + print_bot_author_progress(completed, total, row_number, current_author, actual_author) + if actual_author != current_author: + replacements[row_number] = (current_author, actual_author) + return replacements + + with ThreadPoolExecutor(max_workers=author_workers) as executor: + futures = { + executor.submit(resolve_bot_author, github, request): request + for request in requests + } + for completed, future in enumerate(as_completed(futures), start=1): + row_number, _pr_link, _pr_title, current_author = futures[future] + actual_author = future.result() + print_bot_author_progress(completed, total, row_number, current_author, actual_author) 
+ if actual_author != current_author: + replacements[row_number] = (current_author, actual_author) + return replacements + + +def print_bot_author_progress( + completed: int, + total: int, + row_number: int, + current_author: str, + actual_author: str, +) -> None: + status = "unchanged" if actual_author == current_author else f"{current_author} -> {actual_author}" + print( + f"Resolved bot author {completed}/{total}: row {row_number} ({status})", + flush=True, + ) + + +def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str: + row_number, pr_link, pr_title, current_author = request + try: + return github.get_original_author_for_cherry_pick( + row_number, + pr_link, + pr_title, + current_author, + ) + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to resolve bot author for {pr_link}: {exc}", + file=sys.stderr, + flush=True, + ) + return current_author + + +def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]: + indexed: dict[str, list[ExistingNote]] = {} + for existing in existing_notes: + indexed.setdefault(existing.url, []).append(existing) + return indexed + + +def merge_rows_by_issue_and_component(sheet: Any, header: dict[str, int]) -> None: + groups: OrderedDict[tuple[str, str], list[int]] = OrderedDict() + component_col = get_component_col(header) + for row_number in range(2, sheet.max_row + 1): + issue_url = first_issue_url_for_row(sheet, header, row_number) + if not issue_url: + continue + component = normalize_raw_component(sheet.cell(row=row_number, column=component_col).value) + if not component: + continue + groups.setdefault((issue_url, component), []).append(row_number) + + rows_to_delete: list[int] = [] + for (_issue_url, _component), rows in groups.items(): + if len(rows) <= 1: + continue + keep_row = rows[0] + merge_pr_links(sheet, header, keep_row, rows) + merge_authors(sheet, header, keep_row, rows) + merge_dup_notes(sheet, header, keep_row, rows) + fill_first_empty_values(sheet, header, keep_row, rows) + if str_value(sheet.cell(row=keep_row, column=header["published_release_notes"]).value): + fill_row(sheet, keep_row) + rows_to_delete.extend(rows[1:]) + + for row_number in sorted(rows_to_delete, reverse=True): + sheet.delete_rows(row_number, 1) + + +def merge_pr_links(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + links: list[str] = [] + for row in rows: + links.extend(split_multi_value(sheet.cell(row=row, column=header["pr_link"]).value)) + sheet.cell(row=keep_row, column=header["pr_link"], value=", ".join(unique_ordered(links))) + + +def merge_authors(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + authors: list[str] = [] + for row in rows: + authors.extend(split_multi_value(sheet.cell(row=row, column=header["pr_author"]).value)) + sheet.cell(row=keep_row, column=header["pr_author"], value=", ".join(unique_ordered(authors))) + + +def merge_dup_notes(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + notes: list[str] = [] + for row in rows: + notes.extend(split_lines(sheet.cell(row=row, column=header["published_release_notes"]).value)) + if notes: + sheet.cell(row=keep_row, column=header["published_release_notes"], value="\n".join(unique_ordered(notes))) + + +def fill_first_empty_values(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + columns_to_skip = { + header["pr_link"], + header["pr_author"], + header["published_release_notes"], + 
header["release_notes_written_by_ai"], + } + for col in range(1, sheet.max_column + 1): + if col in columns_to_skip: + continue + keep_cell = sheet.cell(row=keep_row, column=col) + if str_value(keep_cell.value): + continue + for row in rows[1:]: + value = sheet.cell(row=row, column=col).value + if str_value(value): + keep_cell.value = value + break + + +def generate_notes_for_sheet( + sheet: Any, + header: dict[str, int], + github: Any, + ai: Any, + ai_workers: int = 1, + github_workers: int = 1, + checkpoint_callback: Callable[[int, int], None] | None = None, +) -> list[MarkdownEntry]: + entries_by_row: dict[int, list[MarkdownEntry]] = {} + row_inputs = [ + build_row_input(sheet, header, row_number) + for row_number in range(2, sheet.max_row + 1) + ] + rows_to_generate: list[RowInput] = [] + + for row_input in row_inputs: + row_number = row_input.row_number + component = row_input.component + dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value) + if dup_text: + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None + entries_by_row[row_number] = dup_entries_for_row(row_input, dup_text) + continue + + ai_cell = sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]) + expected_links = row_input.issue_urls or row_input.pr_urls + if not expected_links: + ai_cell.value = "AI_GENERATION_FAILED: missing issue URL and PR URL" + continue + + existing_note = str_value(ai_cell.value) + if is_reusable_ai_note(existing_note): + note_type = classify_note_type_from_text(existing_note, row_input.issue_type) + entries_by_row[row_number] = [ + MarkdownEntry( + note_type or "improvement", + component, + existing_note, + row_input.raw_component, + ) + ] + print(f"Row {row_number}: skipped existing AI release note", flush=True) + continue + + rows_to_generate.append(row_input) + + github_cache = prefetch_github_data(rows_to_generate, github, github_workers) + total_to_generate = len(rows_to_generate) + if total_to_generate: + print( + f"Generating AI release notes for {total_to_generate} row(s) " + f"with {ai_workers} worker(s)", + flush=True, + ) + + completed = 0 + with ThreadPoolExecutor(max_workers=ai_workers) as executor: + futures = [ + executor.submit(generate_note_for_row, row_input, github_cache, ai) + for row_input in rows_to_generate + ] + for future in as_completed(futures): + result = future.result() + apply_generation_result(sheet, header, result, entries_by_row) + completed += 1 + if checkpoint_callback: + checkpoint_callback(completed, total_to_generate) + + entries: list[MarkdownEntry] = [] + for row_input in row_inputs: + entries.extend(entries_by_row.get(row_input.row_number, [])) + return entries + + +def generate_notes_without_ai(sheet: Any, header: dict[str, int]) -> list[MarkdownEntry]: + entries: list[MarkdownEntry] = [] + for row_number in range(2, sheet.max_row + 1): + row_input = build_row_input(sheet, header, row_number) + dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value) + if dup_text: + entries.extend(dup_entries_for_row(row_input, dup_text)) + continue + + formatted_notes = split_lines(row_input.formatted_release_note) + if not formatted_notes: + print( + f"Row {row_number}: skipped non-dup row because formated_release_note is empty", + file=sys.stderr, + flush=True, + ) + continue + note_type = classify_note_type_from_text( + row_input.formatted_release_note, + row_input.issue_type, + ) + for note in formatted_notes: + entries.append( + 
MarkdownEntry( + note_type or "improvement", + row_input.component, + note, + row_input.raw_component, + ) + ) + + print( + f"AI generation is OFF; generated Markdown from formated_release_note for {len(entries)} note(s)", + flush=True, + ) + return entries + + +def dup_entries_for_row(row_input: RowInput, dup_text: str) -> list[MarkdownEntry]: + entries: list[MarkdownEntry] = [] + for dup_note in split_lines(dup_text): + note_type = classify_note_type_from_text( + dup_note, + row_input.issue_type, + ) + dup_component = parse_component_from_dup(dup_note) or row_input.component + if note_type in {"improvement", "bug_fix"}: + entries.append( + MarkdownEntry( + note_type, + normalize_component(dup_component), + dup_note, + row_input.raw_component, + ) + ) + return entries + + +def build_row_input(sheet: Any, header: dict[str, int], row_number: int) -> RowInput: + raw_component = normalize_raw_component( + sheet.cell(row=row_number, column=get_component_col(header)).value + ) + return RowInput( + row_number=row_number, + component=release_component_for_row(sheet, header, row_number), + raw_component=raw_component, + issue_type=str_value(sheet.cell(row=row_number, column=header["issue_type"]).value), + pr_title=str_value(sheet.cell(row=row_number, column=header["pr_title"]).value), + pr_authors=split_multi_value(sheet.cell(row=row_number, column=header["pr_author"]).value), + pr_urls=extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)), + issue_urls=issue_urls_for_row(sheet, header, row_number), + formatted_release_note=str_value( + sheet.cell(row=row_number, column=header["formated_release_note"]).value + ), + ) + + +def is_reusable_ai_note(note: str) -> bool: + return bool(note) and not note.startswith("AI_GENERATION_FAILED:") + + +def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers: int) -> GitHubDataCache: + issue_urls = unique_ordered(url for row_input in row_inputs for url in row_input.issue_urls) + pr_urls = unique_ordered(url for row_input in row_inputs for url in row_input.pr_urls) + issues = {} + pulls = {} + + if not issue_urls and not pr_urls: + return GitHubDataCache(issues=issues, pulls=pulls) + + print( + f"Prefetching GitHub data: {len(issue_urls)} issue(s), {len(pr_urls)} PR(s) " + f"with {github_workers} worker(s)", + flush=True, + ) + + with ThreadPoolExecutor(max_workers=github_workers) as executor: + futures = { + executor.submit(github.get_issue, issue_url): ("issue", issue_url) + for issue_url in issue_urls + } + futures.update( + { + executor.submit(github.get_pull, pr_url): ("pull", pr_url) + for pr_url in pr_urls + } + ) + for future in as_completed(futures): + item_type, url = futures[future] + try: + data = future.result() + except Exception as exc: # noqa: BLE001 + print(f"Failed to prefetch GitHub {item_type} {url}: {exc}", file=sys.stderr, flush=True) + continue + if item_type == "issue": + issues[url] = data + else: + pulls[url] = data + return GitHubDataCache(issues=issues, pulls=pulls) + + +def generate_note_for_row( + row_input: RowInput, + github_cache: GitHubDataCache, + ai: Any, +) -> RowGenerationResult: + expected_links = row_input.issue_urls or row_input.pr_urls + row_context = build_row_context_from_cache(row_input, github_cache) + contributors = unique_ordered( + [author for author in row_context.pr_authors if author not in BOT_AUTHORS] + ) + try: + prompt = build_generation_prompt(row_context, expected_links, contributors) + generated = ai.generate(prompt, expected_links, contributors) + 
return RowGenerationResult( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + note_type=generated.note_type, + note=generated.release_note, + error=None, + needs_review=generated.needs_review, + reason=generated.reason, + ) + except Exception as exc: # noqa: BLE001 + return RowGenerationResult( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + note_type=None, + note=None, + error=str(exc), + ) + + +def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCache) -> RowContext: + pr_authors = list(row_input.pr_authors) + issues = [ + github_cache.issues[issue_url] + for issue_url in row_input.issue_urls + if issue_url in github_cache.issues + ] + pulls = [] + for pr_url in row_input.pr_urls: + pull = github_cache.pulls.get(pr_url) + if not pull: + continue + pulls.append(pull) + if pull.author: + pr_authors.append(pull.author) + return RowContext( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + issue_type=row_input.issue_type, + pr_title=row_input.pr_title, + pr_authors=unique_ordered(pr_authors), + pr_urls=row_input.pr_urls, + issue_urls=row_input.issue_urls, + formatted_release_note=row_input.formatted_release_note, + issues=issues, + pulls=pulls, + ) + + +def apply_generation_result( + sheet: Any, + header: dict[str, int], + result: RowGenerationResult, + entries_by_row: dict[int, list[MarkdownEntry]], +) -> None: + ai_cell = sheet.cell(row=result.row_number, column=header["release_notes_written_by_ai"]) + if result.error: + ai_cell.value = f"AI_GENERATION_FAILED: {result.error}" + print( + f"Row {result.row_number}: AI generation failed: {result.error}", + file=sys.stderr, + flush=True, + ) + return + if not result.note or not result.note_type: + ai_cell.value = "AI_GENERATION_FAILED: empty AI generation result" + print( + f"Row {result.row_number}: AI generation failed: empty AI generation result", + file=sys.stderr, + flush=True, + ) + return + + ai_cell.value = result.note + entries_by_row[result.row_number] = [ + MarkdownEntry(result.note_type, result.component, result.note, result.raw_component) + ] + review_marker = " (needs review)" if result.needs_review else "" + print( + f"Row {result.row_number}: generated {result.note_type}{review_marker}: {result.reason}", + flush=True, + ) + + +def release_component_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str: + raw_component = normalize_raw_component( + sheet.cell(row=row_number, column=get_component_col(header)).value + ) + raw_lower = raw_component.lower() + raw_release_component = release_component_from_raw(raw_component) + if raw_release_component: + return raw_release_component + + urls = issue_urls_for_row(sheet, header, row_number) + urls.extend(extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value))) + repos = {match.group("repo").lower() for url in urls for match in [GITHUB_ITEM_URL_RE.search(url)] if match} + + if "pd" in repos: + return "PD" + if "tikv" in repos: + return "TiKV" + if "tiflash" in repos: + return "TiFlash" + if "ng-monitoring" in repos: + return "TiDB" + if "tiup" in repos: + return "TiUP" + if repos.intersection({"tiflow", "ticdc"}): + if "dm" in raw_lower and "cdc" not in raw_lower: + return "TiDB Data Migration (DM)" + return "TiCDC" + if "tidb" in repos: + if "br" in raw_lower: + return "Backup & Restore (BR)" + if "lightning" in raw_lower: + return 
"TiDB Lightning" + if "dumpling" in raw_lower: + return "Dumpling" + return "TiDB" + if "tidb-dashboard" in repos: + return "TiDB" + return normalize_component(raw_component) + + +def release_component_from_raw(raw_component: str) -> str: + normalized_raw = normalize_component(raw_component) + if normalized_raw in TOP_LEVEL_COMPONENTS or normalized_raw in TOOL_COMPONENTS: + return normalized_raw + + token_components = [ + normalize_component(token) + for token in split_multi_value(raw_component) + ] + if not token_components: + return "" + + for component in [ + "Backup & Restore (BR)", + "TiDB Lightning", + "Dumpling", + "TiUP", + "sync-diff-inspector", + ]: + if component in token_components: + return component + + for component in TOP_LEVEL_COMPONENTS: + if component in token_components: + return component + + if "TiDB Data Migration (DM)" in token_components: + return "TiDB Data Migration (DM)" + if "TiCDC" in token_components: + return "TiCDC" + + return "" + + +def classify_note_level(note_level: str) -> tuple[str | None, str | None]: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note_level)] + if not labels: + return None, None + section = labels[0].lower() + note_type = None + if "bug fixes" in section or "error fixes" in section: + note_type = "bug_fix" + elif "improvements" in section: + note_type = "improvement" + + component_labels = labels[1:] + if component_labels and component_labels[0].lower() == "tools": + component_labels = component_labels[1:] + for label in reversed(component_labels): + component = normalized_release_component(label) + if component: + return note_type, component + return note_type, None + + +def classify_note_type_from_text(note: str, issue_type: str) -> str | None: + note_lower = note.lower() + issue_type_lower = issue_type.lower() + if "> bug fixes" in note_lower or "> 错误修复" in note_lower: + return "bug_fix" + if "> improvements" in note_lower or "> 改进提升" in note_lower: + return "improvement" + if "bug" in issue_type_lower or "fix" in issue_type_lower: + return "bug_fix" + if "improvement" in issue_type_lower or "enhancement" in issue_type_lower: + return "improvement" + if note.strip().startswith("- Fix "): + return "bug_fix" + return "improvement" + + +def parse_component_from_dup(note: str) -> str | None: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note)] + cleaned: list[str] = [] + for label in labels: + if " - " in label: + label = label.split(" - ", 1)[0] + cleaned.append(label.strip()) + if len(cleaned) < 2: + return None + return normalized_release_component(cleaned[-1]) + + +def fill_row(sheet: Any, row_number: int) -> None: + for column in range(1, sheet.max_column + 1): + sheet.cell(row=row_number, column=column).fill = copy.copy(GRAY_FILL) diff --git a/scripts/release_notes_ai/github_client.py b/scripts/release_notes_ai/github_client.py new file mode 100644 index 0000000000000..f0f4d1b5e2ff2 --- /dev/null +++ b/scripts/release_notes_ai/github_client.py @@ -0,0 +1,321 @@ +from __future__ import annotations + +import re +import sys +import threading +import time +from typing import Any + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +from .constants import GITHUB_ITEM_URL_RE +from .models import IssueInfo, PullInfo +from .utils import parse_github_url + + +def create_retry_policy() -> Retry: + return Retry( + total=3, + connect=3, + read=3, + status=3, + backoff_factor=1, + status_forcelist=(500, 502, 503, 504), + 
allowed_methods=frozenset(["GET"]), + respect_retry_after_header=True, + raise_on_status=False, + ) + + +class GitHubClient: + def __init__( + self, + token: str | None, + max_rate_limit_retries: int = 3, + max_rate_limit_sleep: int = 600, + ): + self.max_rate_limit_retries = max_rate_limit_retries + self.max_rate_limit_sleep = max_rate_limit_sleep + self.headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + if token: + self.headers["Authorization"] = f"Bearer {token}" + self._thread_local = threading.local() + + def get_session(self) -> requests.Session: + session = getattr(self._thread_local, "session", None) + if session is None: + session = requests.Session() + session.headers.update(self.headers) + adapter = HTTPAdapter(max_retries=create_retry_policy()) + session.mount("https://", adapter) + self._thread_local.session = session + return session + + def get_json(self, api_path: str) -> dict[str, Any]: + data = self.get_api_json(api_path) + if not isinstance(data, dict): + raise ValueError(f"Expected object response from {api_path}") + return data + + def get_api_json(self, api_path: str, params: dict[str, Any] | None = None) -> Any: + return self.get_url_json(f"https://api.github.com{api_path}", params=params) + + def get_url_json(self, url: str, params: dict[str, Any] | None = None) -> Any: + last_response: requests.Response | None = None + for attempt in range(self.max_rate_limit_retries + 1): + response = self.get_session().get(url, params=params, timeout=30) + last_response = response + if self.is_rate_limited(response) and attempt < self.max_rate_limit_retries: + sleep_seconds = self.rate_limit_sleep_seconds(response, attempt) + print( + "GitHub API rate limit reached; retrying in " + f"{sleep_seconds} seconds: {url}", + file=sys.stderr, + flush=True, + ) + time.sleep(sleep_seconds) + continue + response.raise_for_status() + return response.json() + if last_response is not None: + last_response.raise_for_status() + raise RuntimeError(f"GitHub API request failed: {url}") + + def is_rate_limited(self, response: requests.Response) -> bool: + if response.status_code == 429: + return True + if response.status_code != 403: + return False + if response.headers.get("x-ratelimit-remaining") == "0": + return True + message = response.text.lower() + return "rate limit" in message or "abuse detection" in message + + def rate_limit_sleep_seconds(self, response: requests.Response, attempt: int) -> int: + retry_after = response.headers.get("retry-after") + if retry_after and retry_after.isdigit(): + return min(max(int(retry_after), 1), self.max_rate_limit_sleep) + reset = response.headers.get("x-ratelimit-reset") + if reset and reset.isdigit(): + wait_seconds = int(reset) - int(time.time()) + 5 + return min(max(wait_seconds, 1), self.max_rate_limit_sleep) + return min(2 ** attempt, self.max_rate_limit_sleep) + + def get_pull(self, pr_url: str) -> PullInfo: + owner, repo, number = parse_github_url(pr_url, "pull") + pull = self.get_json(f"/repos/{owner}/{repo}/pulls/{number}") + files_summary = self.get_pull_files_summary(owner, repo, number) + return PullInfo( + url=pr_url, + title=str(pull.get("title") or ""), + body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary=files_summary, + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + 
+ def get_issue(self, issue_url: str) -> IssueInfo: + owner, repo, number = parse_github_url(issue_url, "issues") + issue = self.get_json(f"/repos/{owner}/{repo}/issues/{number}") + labels = [ + str(label.get("name")) + for label in issue.get("labels", []) + if isinstance(label, dict) and label.get("name") + ] + return IssueInfo( + url=issue_url, + title=str(issue.get("title") or ""), + body=str(issue.get("body") or ""), + labels=labels, + ) + + def get_pull_files_summary( + self, + owner: str, + repo: str, + number: str, + max_files: int = 80, + max_patch_chars: int = 1200, + max_total_chars: int = 60000, + ) -> str: + lines: list[str] = [] + page = 1 + total_chars = 0 + while len(lines) < max_files: + files = self.get_api_json( + f"/repos/{owner}/{repo}/pulls/{number}/files", + params={"per_page": 100, "page": page}, + ) + if not isinstance(files, list) or not files: + break + for item in files: + if len(lines) >= max_files or total_chars >= max_total_chars: + break + if not isinstance(item, dict): + continue + patch = str(item.get("patch") or "") + if len(patch) > max_patch_chars: + patch = patch[:max_patch_chars] + "\n...[patch truncated]" + block = "\n".join( + [ + f"file: {item.get('filename', '')}", + f"status: {item.get('status', '')}", + f"additions: {item.get('additions', 0)}", + f"deletions: {item.get('deletions', 0)}", + "patch:", + patch, + ] + ) + lines.append(block) + total_chars += len(block) + page += 1 + if not lines: + return "No changed-file information is available." + if len(lines) >= max_files: + lines.append("...[file list truncated]") + return "\n\n".join(lines) + + def list_pulls_for_base( + self, + owner: str, + repo: str, + base: str, + state: str = "closed", + max_pages: int = 10, + ) -> list[PullInfo]: + pulls: list[PullInfo] = [] + for page in range(1, max_pages + 1): + data = self.get_api_json( + f"/repos/{owner}/{repo}/pulls", + params={ + "state": state, + "base": base, + "sort": "created", + "direction": "asc", + "per_page": 100, + "page": page, + }, + ) + if not isinstance(data, list) or not data: + break + for pull in data: + if not isinstance(pull, dict): + continue + pulls.append( + PullInfo( + url=str(pull.get("html_url") or ""), + title=str(pull.get("title") or ""), + body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary="", + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + ) + if len(data) < 100: + break + return pulls + + def get_original_author_for_cherry_pick( + self, row_number: int, cp_pr_link: str, cp_pr_title: str, current_author: str + ) -> str: + default_owner, default_repo, _cp_number = parse_github_url(cp_pr_link, "pull") + target_ref = find_original_pr_reference(cp_pr_title, default_owner, default_repo) + if not target_ref: + try: + cp_info = self.get_pull(cp_pr_link) + target_ref = ( + find_original_pr_reference(cp_info.head_ref, default_owner, default_repo) + or find_original_pr_reference(cp_info.title, default_owner, default_repo) + or find_original_pr_reference(cp_info.body, default_owner, default_repo) + ) + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to inspect cherry-pick PR " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + if not target_ref: + print( + f"Row {row_number}: failed to find the original PR for " + f"{cp_pr_link} created by 
{current_author}.", + file=sys.stderr, + ) + return current_author + + target_owner, target_repo, target_number = target_ref + target_pr_link = f"https://github.com/{target_owner}/{target_repo}/pull/{target_number}" + try: + return self.get_pull(target_pr_link).author or current_author + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to find the non-bot author for " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + +def find_original_pr_reference( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + text = text or "" + marker_lines = [ + line + for line in text.splitlines() + if re.search(r"\b(backport|cherry[- ]?pick|original|source|from)\b", line, re.I) + ] + for line in marker_lines: + reference = find_pr_reference_in_text(line, default_owner, default_repo) + if reference: + return reference + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text) + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + branch = re.search(r"(?:^|[/_-])cherry-pick-(?P\d+)(?:\D|$)", text) + if branch: + return default_owner, default_repo, branch.group("number") + + if "\n" not in text and len(text) <= 300: + return find_pr_reference_in_text(text, default_owner, default_repo) + + return None + + +def find_pr_reference_in_text( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + for full_url in GITHUB_ITEM_URL_RE.finditer(text or ""): + if full_url.group("kind") == "pull": + return full_url.group("owner"), full_url.group("repo"), full_url.group("number") + + cross_repo = re.search( + r"(?[\w.-]+)/(?P[\w.-]+)#(?P\d+)\b", + text or "", + ) + if cross_repo: + return cross_repo.group("owner"), cross_repo.group("repo"), cross_repo.group("number") + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text or "") + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + return None diff --git a/scripts/release_notes_ai/markdown_writer.py b/scripts/release_notes_ai/markdown_writer.py new file mode 100644 index 0000000000000..38d02cdf51950 --- /dev/null +++ b/scripts/release_notes_ai/markdown_writer.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path + +from .constants import TOOL_COMPONENTS, TOP_LEVEL_COMPONENTS +from .models import MarkdownEntry +from .utils import normalize_component, str_value + + +def write_release_file( + output_file: Path, + version: str, + release_date: str, + entries: list[MarkdownEntry], +) -> None: + major_minor = ".".join(version.split(".")[:2]) + grouped = group_markdown_entries(entries) + content: list[str] = [ + "---", + f"title: TiDB {version} Release Notes", + f"summary: Learn about the improvements and bug fixes in TiDB {version}.", + "---", + "", + f"# TiDB {version} Release Notes", + "", + f"Release date: {release_date}", + "", + f"TiDB version: {version}", + "", + "Quick access: " + f"[Quick start](https://docs.pingcap.com/tidb/v{major_minor}/quick-start-with-tidb) | " + f"[Production deployment](https://docs.pingcap.com/tidb/v{major_minor}/production-deployment-using-tiup)", + "", + ] + + content.extend(render_section("## Improvements", grouped["improvement"])) + content.append("") + content.extend(render_section("## Bug fixes", grouped["bug_fix"])) + content.append("") + while content and content[-1] == "": + content.pop() + + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text("\n".join(content) + "\n", 
encoding="utf-8")
+
+
+def group_markdown_entries(entries: list[MarkdownEntry]) -> dict[str, dict[str, list[MarkdownEntry]]]:
+    grouped: dict[str, dict[str, list[MarkdownEntry]]] = {
+        "improvement": defaultdict(list),
+        "bug_fix": defaultdict(list),
+    }
+    for entry in entries:
+        if entry.note_type not in grouped:
+            continue
+        component = normalize_component(entry.component) or "Other"
+        grouped[entry.note_type][component].append(entry)
+    return grouped
+
+
+def render_section(title: str, entries_by_component: dict[str, list[MarkdownEntry]]) -> list[str]:
+    lines = [title, ""]
+    top_components = [
+        component
+        for component in TOP_LEVEL_COMPONENTS
+        if component in entries_by_component and entries_by_component[component]
+    ]
+    unknown_top_components = sorted(
+        component
+        for component in entries_by_component
+        if component not in TOP_LEVEL_COMPONENTS
+        and component not in TOOL_COMPONENTS
+        and entries_by_component[component]
+    )
+    tool_components = [
+        component
+        for component in TOOL_COMPONENTS
+        if component in entries_by_component and entries_by_component[component]
+    ]
+
+    for component in top_components + unknown_top_components:
+        lines.append(f"+ {component}")
+        lines.append("")
+        for entry in entries_by_component[component]:
+            lines.append(f"    {note_with_component_marker(entry)}")
+        lines.append("")
+
+    if tool_components:
+        lines.append("+ Tools")
+        lines.append("")
+        for component in tool_components:
+            lines.append(f"    + {component}")
+            lines.append("")
+            for entry in entries_by_component[component]:
+                lines.append(f"        {note_with_component_marker(entry)}")
+            lines.append("")
+
+    while lines and lines[-1] == "":
+        lines.pop()
+    return lines
+
+
+def note_with_component_marker(entry: MarkdownEntry) -> str:
+    note = ensure_release_note_bullet(entry.note)
+    raw_component = sanitize_component_marker(entry.raw_component)
+    if not raw_component or "<!--" in note:
+        return note
+    return f"{note} <!-- {raw_component} -->"
+
+
+def ensure_release_note_bullet(note: str) -> str:
+    note = str_value(note)
+    if note.startswith("- "):
+        return note
+    if note.startswith(("+ ", "* ")):
+        return "- " + note[2:].lstrip()
+    return f"- {note}"
+
+
+def sanitize_component_marker(component: str) -> str:
+    return " ".join(str_value(component).replace("--", "- -").split())
diff --git a/scripts/release_notes_ai/models.py b/scripts/release_notes_ai/models.py
new file mode 100644
index 0000000000000..7e89853cb3202
--- /dev/null
+++ b/scripts/release_notes_ai/models.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import dataclasses
+
+
+@dataclasses.dataclass
+class ExistingNote:
+    url: str
+    line: str
+    file_name: str
+    note_level: str
+    authors: list[str]
+    note_type: str | None
+    component: str | None
+
+    @property
+    def dup_text(self) -> str:
+        return f"- (dup): {self.file_name} {self.note_level} {self.line}"
+
+
+@dataclasses.dataclass
+class PullInfo:
+    url: str
+    title: str
+    body: str
+    author: str
+    head_ref: str
+    base_ref: str
+    files_summary: str
+    merged_at: str = ""
+    created_at: str = ""
+
+
+@dataclasses.dataclass
+class IssueInfo:
+    url: str
+    title: str
+    body: str
+    labels: list[str]
+
+
+@dataclasses.dataclass
+class GeneratedNote:
+    note_type: str
+    release_note: str
+    needs_review: bool
+    reason: str
+
+
+@dataclasses.dataclass
+class RowContext:
+    row_number: int
+    component: str
+    raw_component: str
+    issue_type: str
+    pr_title: str
+    pr_authors: list[str]
+    pr_urls: list[str]
+    issue_urls: list[str]
+    formatted_release_note: str
+    issues: list[IssueInfo]
+    pulls: list[PullInfo]
+
+
+@dataclasses.dataclass
+class RowInput:
+    row_number: int
+
component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + + +@dataclasses.dataclass +class GitHubDataCache: + issues: dict[str, IssueInfo] + pulls: dict[str, PullInfo] + + +@dataclasses.dataclass +class MarkdownEntry: + note_type: str + component: str + note: str + raw_component: str = "" + + +@dataclasses.dataclass +class RowGenerationResult: + row_number: int + component: str + raw_component: str + note_type: str | None + note: str | None + error: str | None + needs_review: bool = False + reason: str = "" diff --git a/scripts/release_notes_ai/requirements.txt b/scripts/release_notes_ai/requirements.txt new file mode 100644 index 0000000000000..89cfc13a2a578 --- /dev/null +++ b/scripts/release_notes_ai/requirements.txt @@ -0,0 +1,3 @@ +openpyxl>=3.1 +requests>=2.31 +urllib3>=1.26 diff --git a/scripts/release_notes_ai/scope_filter.py b/scripts/release_notes_ai/scope_filter.py new file mode 100644 index 0000000000000..019824068d6e1 --- /dev/null +++ b/scripts/release_notes_ai/scope_filter.py @@ -0,0 +1,366 @@ +from __future__ import annotations + +import copy +import re +from dataclasses import dataclass +from datetime import date, datetime +from pathlib import Path +from typing import Any + +from .excel_workbook import get_header +from .models import PullInfo +from .utils import parse_github_url, str_value + + +OUT_OF_SCOPE_SHEET = "PRs_not_in_scope" +REASON_HEADER = "Reason" +SCOPE_REQUIRED_HEADERS = {"pr_status", "pr_merge_time", "pr_link"} + + +@dataclass(frozen=True) +class Version: + major: int + minor: int + patch: int + + @property + def release_branch(self) -> str: + return f"release-{self.major}.{self.minor}" + + @property + def text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch}" + + @property + def previous_patch_text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch - 1}" + + +@dataclass(frozen=True) +class TimelineRelease: + version: Version + display_version: str + release_date: date + + +@dataclass +class ScopeContext: + version: Version + releases_dir: Path + github: Any + base_branch_start_date: date | None = None + timeline: list[TimelineRelease] | None = None + release_branch_pulls: dict[str, list[PullInfo]] | None = None + + def __post_init__(self) -> None: + if self.timeline is None: + self.timeline = parse_release_timeline(self.releases_dir / "release-timeline.md") + if self.release_branch_pulls is None: + self.release_branch_pulls = {} + + +def move_prs_not_in_scope( + workbook: Any, + sheet: Any, + version: str, + releases_dir: Path, + github: Any, + base_branch_start_date: date | None = None, + target_sheet_name: str = OUT_OF_SCOPE_SHEET, +) -> int: + header = get_header(sheet) + missing = sorted(SCOPE_REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError( + "Missing required Excel columns for scope preprocessing: " + + ", ".join(missing) + ) + + context = ScopeContext( + version=parse_version(version), + releases_dir=releases_dir, + github=github, + base_branch_start_date=base_branch_start_date, + ) + target = ensure_out_of_scope_sheet(workbook, sheet, target_sheet_name) + + rows_to_move: list[tuple[int, str]] = [] + for row_number in range(2, sheet.max_row + 1): + reason = out_of_scope_reason(sheet, header, row_number, context) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason) + + for row_number, 
_reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} before release-note generation", + flush=True, + ) + return len(rows_to_move) + + +def ensure_out_of_scope_sheet(workbook: Any, source_sheet: Any, target_sheet_name: str) -> Any: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if target.max_row == 0 or not target.cell(row=1, column=1).value: + copy_header(source_sheet, target) + else: + ensure_reason_header(source_sheet, target) + return target + + target = workbook.create_sheet(target_sheet_name) + copy_header(source_sheet, target) + return target + + +def copy_header(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + copy_cell(source_sheet.cell(row=1, column=column), target_sheet.cell(row=1, column=column)) + ensure_reason_header(source_sheet, target_sheet) + + +def ensure_reason_header(source_sheet: Any, target_sheet: Any) -> None: + target_sheet.cell(row=1, column=source_sheet.max_column + 1, value=REASON_HEADER) + + +def append_row_with_reason(source_sheet: Any, target_sheet: Any, row_number: int, reason: str) -> None: + target_row = target_sheet.max_row + 1 + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=source_sheet.max_column + 1, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if source_cell.hyperlink: + target_cell._hyperlink = copy.copy(source_cell.hyperlink) + if source_cell.comment: + target_cell.comment = copy.copy(source_cell.comment) + + +def out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + context: ScopeContext, +) -> str | None: + status = str_value(sheet.cell(row=row_number, column=header["pr_status"]).value).lower() + if status != "merged": + return f"PR status is {status or 'empty'}, not merged" + + merge_date = parse_date_value(sheet.cell(row=row_number, column=header["pr_merge_time"]).value) + if not merge_date: + return None + + if context.version.patch >= 1: + previous_date = release_date_for_version(context.timeline or [], context.version.previous_patch_text) + if not previous_date: + raise ValueError( + f"Cannot find release date for previous version {context.version.previous_patch_text} " + "in releases/release-timeline.md" + ) + if merge_date < previous_date: + return ( + f"PR merged on {merge_date.isoformat()}, before previous release " + f"{context.version.previous_patch_text} date {previous_date.isoformat()}" + ) + return None + + return major_release_out_of_scope_reason(sheet, header, row_number, merge_date, context) + + +def major_release_out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + merge_date: date, + context: ScopeContext, +) -> str | None: + latest_zero = latest_released_zero_patch(context.timeline or [], context.version.text) + if not latest_zero: + raise ValueError("Cannot find a previously released x.y.0 version in releases/release-timeline.md") + + if merge_date >= latest_zero.release_date: + return None + + branch_start = context.base_branch_start_date or 
estimated_release_branch_start_date(context, latest_zero)
+    if not branch_start:
+        return None
+    if merge_date < branch_start:
+        return (
+            f"PR merged on {merge_date.isoformat()}, before estimated {latest_zero.version.release_branch} "
+            f"branch start date {branch_start.isoformat()}"
+        )
+
+    pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)
+    cherry_pick = find_release_branch_cherry_pick(context, latest_zero, pr_link)
+    if not cherry_pick:
+        return None
+    cherry_pick_date = parse_date_value(cherry_pick.merged_at)
+    if cherry_pick_date and cherry_pick_date < latest_zero.release_date:
+        return (
+            f"Cherry-pick PR {cherry_pick.url} merged on {cherry_pick_date.isoformat()} "
+            f"before {latest_zero.display_version} release date {latest_zero.release_date.isoformat()}"
+        )
+    return None
+
+
+def estimated_release_branch_start_date(
+    context: ScopeContext,
+    latest_zero: TimelineRelease,
+) -> date | None:
+    branch_pulls = release_branch_pulls(context, latest_zero.version.release_branch)
+    created_dates = [parse_date_value(pull.created_at) for pull in branch_pulls]
+    created_dates = [value for value in created_dates if value]
+    return min(created_dates) if created_dates else None
+
+
+def find_release_branch_cherry_pick(
+    context: ScopeContext,
+    latest_zero: TimelineRelease,
+    pr_link: str,
+) -> PullInfo | None:
+    try:
+        owner, repo, number = parse_github_url(pr_link, "pull")
+    except ValueError:
+        return None
+    if (owner, repo) != ("pingcap", "tidb"):
+        return None
+
+    candidates = []
+    for pull in release_branch_pulls(context, latest_zero.version.release_branch):
+        haystack = "\n".join([pull.title, pull.body, pull.head_ref, pull.url])
+        if references_original_pr(haystack, owner, repo, number, pr_link):
+            candidates.append(pull)
+
+    merged_candidates = [
+        pull for pull in candidates if parse_date_value(pull.merged_at)
+    ]
+    if not merged_candidates:
+        return None
+    return min(
+        merged_candidates,
+        key=lambda pull: parse_date_value(pull.merged_at) or date.max,
+    )
+
+
+def references_original_pr(
+    text: str,
+    owner: str,
+    repo: str,
+    number: str,
+    pr_link: str,
+) -> bool:
+    text = text or ""
+    patterns = [
+        re.escape(pr_link),
+        rf"(?<![\w.-]){re.escape(owner)}/{re.escape(repo)}#{number}\b",
+        rf"\(#{number}\)",
+        rf"(?:^|[/_-])cherry-pick-{number}(?:\D|$)",
+    ]
+    return any(re.search(pattern, text, re.I) for pattern in patterns)
+
+
+def release_branch_pulls(context: ScopeContext, branch: str) -> list[PullInfo]:
+    assert context.release_branch_pulls is not None
+    if branch not in context.release_branch_pulls:
+        context.release_branch_pulls[branch] = context.github.list_pulls_for_base(
+            "pingcap",
+            "tidb",
+            branch,
+            state="closed",
+        )
+    return context.release_branch_pulls[branch]
+
+
+def parse_release_timeline(path: Path) -> list[TimelineRelease]:
+    releases: list[TimelineRelease] = []
+    if not path.exists():
+        raise FileNotFoundError(f"Cannot find release timeline: {path}")
+    pattern = re.compile(
+        r"\|\s*\[(?P<version>[^\]]+)\]\([^)]+\)\s*\|\s*(?P<date>\d{4}-\d{2}-\d{2})\s*\|"
+    )
+    for line in path.read_text(encoding="utf-8").splitlines():
+        match = pattern.search(line)
+        if not match:
+            continue
+        try:
+            version = parse_version(match.group("version"))
+        except ValueError:
+            continue
+        release_date = date.fromisoformat(match.group("date"))
+        releases.append(TimelineRelease(version, match.group("version"), release_date))
+    return releases
+
+
+def release_date_for_version(timeline: list[TimelineRelease], version_text: str) -> date | None:
+    for release in timeline:
+        if release.version.text == version_text:
+            return release.release_date
+    return None
+
+
+def latest_released_zero_patch(
+    timeline: list[TimelineRelease],
+    target_version_text: str,
+) -> TimelineRelease | None:
+    zero_patch_releases = [
+        release
+        for release in timeline
+        if release.version.patch == 0 and release.version.text != target_version_text
+    ]
+    if not zero_patch_releases:
+        return None
+    return max(zero_patch_releases, key=lambda release: release.release_date)
+
+
+def parse_version(version: str) -> Version:
+    match = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", version)
+    if not match:
+        raise ValueError(f"Invalid TiDB version: {version}")
+    return Version(
+        major=int(match.group("major")),
+        minor=int(match.group("minor")),
+        patch=int(match.group("patch")),
+    )
+
+
+def parse_date_value(value: Any) -> date | None:
+    if value is None:
+        return None
+    if isinstance(value, datetime):
+        return value.date()
+    if isinstance(value, date):
+        return value
+    text = str_value(value)
+    if not text:
+        return None
+    text = text.replace("Z", "+00:00")
+    try:
+        return datetime.fromisoformat(text).date()
+    except ValueError:
+        pass
+    match = re.search(r"\d{4}-\d{2}-\d{2}", text)
+    if match:
+        return date.fromisoformat(match.group())
+    return None
diff --git a/scripts/release_notes_ai/utils.py b/scripts/release_notes_ai/utils.py
new file mode 100644
index 0000000000000..1c0641787019c
--- /dev/null
+++ b/scripts/release_notes_ai/utils.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+from .constants import (
+    COMPONENT_ALIASES,
+    GITHUB_ITEM_URL_RE,
+    ISSUE_URL_RE,
+    PR_URL_RE,
+    TOOL_COMPONENTS,
+    TOP_LEVEL_COMPONENTS,
+)
+
+
+def parse_github_url(url: str, expected_kind: str) -> tuple[str, str, str]:
+    match = GITHUB_ITEM_URL_RE.search(url)
+    if not match:
+        raise ValueError(f"Invalid GitHub URL: {url}")
+    if match.group("kind") != expected_kind:
+        raise ValueError(f"Expected a GitHub {expected_kind} URL, got: {url}")
+    return match.group("owner"), match.group("repo"), match.group("number")
+
+
+def extract_issue_urls(text: str) -> list[str]:
+    return unique_ordered(match.group() for match in ISSUE_URL_RE.finditer(text or ""))
+
+
+def extract_pr_urls(text: str) -> list[str]:
+    return unique_ordered(match.group() for match in PR_URL_RE.finditer(text or ""))
+
+
+def replace_author_markdown(text: str, old_author: str, new_author: str) -> str:
+    text = text or ""
+    return
text.replace( + f"[{old_author}](https://github.com/{old_author}", + f"[{new_author}](https://github.com/{new_author}", + ) + + +def normalize_component(component: str) -> str: + cleaned = " ".join(str_value(component).split()) + if not cleaned: + return "" + return COMPONENT_ALIASES.get(cleaned.lower(), cleaned) + + +def normalize_raw_component(component: Any) -> str: + return " ".join(str_value(component).split()) + + +def normalized_release_component(component: str) -> str | None: + normalized = normalize_component(component) + if normalized in TOP_LEVEL_COMPONENTS or normalized in TOOL_COMPONENTS: + return normalized + return None + + +def split_multi_value(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [item.strip() for item in text.replace("\n", ",").split(",") if item.strip()] + + +def split_lines(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [line.strip() for line in text.splitlines() if line.strip()] + + +def unique_ordered(values: Iterable[str]) -> list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + cleaned = str_value(value) + if not cleaned or cleaned in seen: + continue + seen.add(cleaned) + result.append(cleaned) + return result + + +def str_value(value: Any) -> str: + if value is None: + return "" + return str(value).strip() diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py new file mode 100644 index 0000000000000..5d1e701f56cec --- /dev/null +++ b/scripts/release_notes_generate_ai.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""CLI entry point for generating English TiDB release notes with AI.""" + +from release_notes_ai.cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) From 4583453ca945f965df345d18bbc7941e4ae045fb Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 29 Apr 2026 17:55:48 +0800 Subject: [PATCH 2/5] improve the filter logic and move the prompt to an independent file --- scripts/release_notes_ai/ai_client.py | 83 +++--- scripts/release_notes_ai/cli.py | 8 + scripts/release_notes_ai/constants.py | 3 + scripts/release_notes_ai/excel_workbook.py | 240 ++++++++++++++++-- .../release_notes_ai/prompts/generation.md | 40 +++ scripts/release_notes_generate_ai.py | 37 ++- 6 files changed, 345 insertions(+), 66 deletions(-) create mode 100644 scripts/release_notes_ai/prompts/generation.md diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py index 503e28b63023b..d2770e3fbc56c 100644 --- a/scripts/release_notes_ai/ai_client.py +++ b/scripts/release_notes_ai/ai_client.py @@ -12,7 +12,11 @@ from pathlib import Path from typing import Any -from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .constants import ( + BUG_FIXES_REFERENCE, + GENERATION_PROMPT_TEMPLATE, + IMPROVEMENTS_REFERENCE, +) from .models import GeneratedNote, RowContext @@ -134,6 +138,7 @@ def build_generation_prompt( expected_links: list[str], contributors: list[str], ) -> str: + prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE) improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) context = { @@ -148,46 +153,16 @@ def build_generation_prompt( "issues": [dataclasses.asdict(issue) for issue in row_context.issues], "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], } - return textwrap.dedent( - f""" - You write exactly one English 
TiDB release note entry. - - Return only a JSON object with exactly these keys: - - type: "improvement" or "bug_fix" - - release_note: one Markdown bullet that starts with "- " - - needs_review: true or false - - reason: a short reason for the type and wording - - Rules: - - Write from the user's perspective. - - Use the Excel issue_type as a strong signal, but decide the final type from the issue, - PR description, and code changes. - - For improvements, follow the Improvements reference below. - - For bug fixes, follow the Bug fixes reference below. - - Do not end the release note with a period. - - Include every expected link in Markdown release-note style. - - Include every contributor as @[user](https://github.com/user). - - If there is no issue URL, use the PR link as the suffix link. - - Do not expose internal function names unless they are the user-visible behavior. - - If the available context is insufficient, still draft the best note and set needs_review - to true. - - Expected links: - {json.dumps(expected_links, ensure_ascii=False, indent=2)} - - Contributors: - {json.dumps(contributors, ensure_ascii=False, indent=2)} - - Row context: - {json.dumps(context, ensure_ascii=False, indent=2)} - - Improvements reference: - {improvements_reference} - - Bug fixes reference: - {bug_fixes_reference} - """ - ).strip() + return render_prompt_template( + prompt_template, + { + "EXPECTED_LINKS": json.dumps(expected_links, ensure_ascii=False, indent=2), + "CONTRIBUTORS": json.dumps(contributors, ensure_ascii=False, indent=2), + "ROW_CONTEXT": json.dumps(context, ensure_ascii=False, indent=2), + "IMPROVEMENTS_REFERENCE": improvements_reference, + "BUG_FIXES_REFERENCE": bug_fixes_reference, + }, + ) def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: @@ -206,6 +181,32 @@ def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: ).strip() +def render_prompt_template(template: str, values: dict[str, str]) -> str: + for key, value in values.items(): + template = template.replace(f"{{{{{key}}}}}", value) + return template.strip() + + +@lru_cache(maxsize=None) +def load_prompt_template(path: Path) -> str: + try: + return strip_prompt_template_heading(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Cannot find release-note prompt template: {path}. " + "Make sure scripts/release_notes_ai/prompts/generation.md exists." 
+ ) from exc + + +def strip_prompt_template_heading(template: str) -> str: + lines = template.splitlines() + if lines and lines[0].startswith("# "): + lines = lines[1:] + if lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + @lru_cache(maxsize=None) def load_reference_file(path: Path) -> str: try: diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index ee1d79a074c4a..fdeaccfda3efb 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -13,6 +13,7 @@ generate_notes_without_ai, generate_notes_for_sheet, merge_rows_by_issue_and_component, + move_rows_with_issues_already_in_same_series, prepare_sheet_columns, sort_sheet_rows_by_component, store_existing_release_notes, @@ -164,6 +165,13 @@ def main() -> int: clear_output_columns(sheet, header, clear_ai=args.force_regenerate) existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + move_rows_with_issues_already_in_same_series( + workbook, + sheet, + header, + existing_notes, + args.version, + ) update_pr_authors_and_dup_notes( sheet, header, diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py index c3e947167a23b..89cca90e52d2b 100644 --- a/scripts/release_notes_ai/constants.py +++ b/scripts/release_notes_ai/constants.py @@ -21,6 +21,9 @@ / "references" / "bug-fixes.md" ) +GENERATION_PROMPT_TEMPLATE = ( + REPO_ROOT / "scripts" / "release_notes_ai" / "prompts" / "generation.md" +) BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} # Keep the misspelled source column name because tirelease exports it this way. diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py index 260b4b807d04e..177f28fd64c9f 100644 --- a/scripts/release_notes_ai/excel_workbook.py +++ b/scripts/release_notes_ai/excel_workbook.py @@ -43,6 +43,7 @@ GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") +SAME_SERIES_REASON_HEADER = "reason" def prepare_sheet_columns(sheet: Any) -> dict[str, int]: @@ -168,7 +169,7 @@ def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]: existing_notes: list[ExistingNote] = [] - seen: set[tuple[str, tuple[str, ...]]] = set() + seen: set[tuple[str, tuple[str, ...], str]] = set() target_version = parse_semver_tuple(version) for file_path in sorted(releases_dir.rglob("*.md")): @@ -179,25 +180,26 @@ def store_existing_release_notes(releases_dir: Path, version: str) -> list[Exist for raw_line in file: line = raw_line.strip() authors = AUTHOR_RE.findall(line) - item_url = GITHUB_ITEM_URL_RE.search(line) - if item_url: - key = (item_url.group(), tuple(authors)) - if key in seen: - continue - seen.add(key) + item_urls = [match.group() for match in GITHUB_ITEM_URL_RE.finditer(line)] + if item_urls: note_level = level1 + level2 + level3 note_type, component = classify_note_level(note_level) - existing_notes.append( - ExistingNote( - url=item_url.group(), - line=line, - file_name=file_path.name, - note_level=note_level, - authors=authors, - note_type=note_type, - component=component, + for item_url in item_urls: + key = (item_url, tuple(authors), file_path.name) + if key in seen: + continue + seen.add(key) + existing_notes.append( + ExistingNote( + url=item_url, + line=line, + file_name=file_path.name, + note_level=note_level, + authors=authors, + note_type=note_type, + component=component, + ) ) - ) continue heading = 
parse_release_note_heading(raw_line) @@ -283,22 +285,207 @@ def update_pr_authors_and_dup_notes( author_cell = sheet.cell(row=row_number, column=header["pr_author"]) current_author = str_value(author_cell.value) - issue_url = first_issue_url_for_row(sheet, header, row_number) - if not issue_url: + issue_urls = issue_urls_for_row(sheet, header, row_number) + if not issue_urls: continue current_authors = split_multi_value(current_author) dup_notes = [] - for existing in existing_notes_by_url.get(issue_url, []): - if existing.authors and not set(current_authors).intersection(existing.authors): - continue - dup_notes.append(existing.dup_text) + for issue_url in issue_urls: + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) if dup_notes: dup_col = header["published_release_notes"] sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) fill_row(sheet, row_number) - print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + print( + f"Row {row_number}: found duplicated release note for {', '.join(issue_urls)}", + flush=True, + ) + + +def move_rows_with_issues_already_in_same_series( + workbook: Any, + sheet: Any, + header: dict[str, int], + existing_notes: list[ExistingNote], + version: str, +) -> int: + files_by_issue_url = same_series_release_files_by_issue_url(existing_notes, version) + if not files_by_issue_url: + return 0 + + target_sheet_name = same_series_issues_sheet_name(version) + target, reason_col = ensure_sheet_with_reason(workbook, sheet, target_sheet_name) + rows_to_move: list[tuple[int, str]] = [] + + for row_number in range(2, sheet.max_row + 1): + issue_urls = issue_urls_for_row(sheet, header, row_number) + reason = same_series_issue_reason(issue_urls, files_by_issue_url) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason, reason_col) + + for row_number, _reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} because their issues " + "already appear in earlier release notes from the same major.minor series", + flush=True, + ) + return len(rows_to_move) + + +def same_series_release_files_by_issue_url( + existing_notes: list[ExistingNote], + version: str, +) -> dict[str, list[str]]: + target_version = parse_semver_tuple(version) + files_by_issue_url: dict[str, list[str]] = {} + + for existing in existing_notes: + match = GITHUB_ITEM_URL_RE.search(existing.url) + if not match or match.group("kind") != "issues": + continue + + file_version = release_file_semver_tuple(Path(existing.file_name)) + if not file_version: + continue + if file_version[:2] != target_version[:2] or file_version >= target_version: + continue + + files = files_by_issue_url.setdefault(existing.url, []) + if existing.file_name not in files: + files.append(existing.file_name) + + for issue_url, files in list(files_by_issue_url.items()): + files_by_issue_url[issue_url] = sorted(files, key=release_file_name_sort_key) + return files_by_issue_url + + +def same_series_issues_sheet_name(version: str) -> str: + major, minor, _patch = parse_semver_tuple(version) + return f"issues_already_in_earlier_v{major}.{minor}_notes" + + +def same_series_issue_reason( + issue_urls: list[str], + files_by_issue_url: dict[str, 
list[str]], +) -> str | None: + reasons = [] + for issue_url in issue_urls: + files = files_by_issue_url.get(issue_url) + if files: + reasons.append(f"{issue_url} appears in {', '.join(files)}") + return "; ".join(reasons) if reasons else None + + +def release_file_name_sort_key(file_name: str) -> tuple[int, int, int, str]: + version = release_file_semver_tuple(Path(file_name)) + if not version: + return (sys.maxsize, sys.maxsize, sys.maxsize, file_name) + return (*version, file_name) + + +def ensure_sheet_with_reason( + workbook: Any, + source_sheet: Any, + target_sheet_name: str, +) -> tuple[Any, int]: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if not str_value(target.cell(row=1, column=1).value): + reason_col = copy_header_with_reason(source_sheet, target) + else: + reason_col = ensure_same_series_reason_header(source_sheet, target) + return target, reason_col + + target = workbook.create_sheet(target_sheet_name) + reason_col = copy_header_with_reason(source_sheet, target) + return target, reason_col + + +def copy_header_with_reason(source_sheet: Any, target_sheet: Any) -> int: + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + return ensure_same_series_reason_header(source_sheet, target_sheet) + + +def ensure_same_series_reason_header(source_sheet: Any, target_sheet: Any) -> int: + reason_col = find_header_column(target_sheet, SAME_SERIES_REASON_HEADER) + if not reason_col: + reason_col = max(source_sheet.max_column, target_sheet.max_column) + 1 + copy_missing_header_cells(source_sheet, target_sheet) + target_sheet.cell(row=1, column=reason_col, value=SAME_SERIES_REASON_HEADER) + return reason_col + + while reason_col <= source_sheet.max_column: + target_sheet.insert_cols(reason_col) + reason_col += 1 + + copy_missing_header_cells(source_sheet, target_sheet) + return reason_col + + +def copy_missing_header_cells(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + if not str_value(target_sheet.cell(row=1, column=column).value): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + + +def find_header_column(sheet: Any, header_name: str) -> int | None: + for column in range(1, sheet.max_column + 1): + if str_value(sheet.cell(row=1, column=column).value) == header_name: + return column + return None + + +def append_row_with_reason( + source_sheet: Any, + target_sheet: Any, + row_number: int, + reason: str, + reason_col: int, +) -> None: + target_row = target_sheet.max_row + 1 + source_dimension = source_sheet.row_dimensions[row_number] + target_dimension = target_sheet.row_dimensions[target_row] + target_dimension.height = source_dimension.height + target_dimension.hidden = source_dimension.hidden + target_dimension.outlineLevel = source_dimension.outlineLevel + target_dimension.collapsed = source_dimension.collapsed + + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=reason_col, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if 
source_cell.hyperlink:
+        target_cell._hyperlink = copy.copy(source_cell.hyperlink)
+    if source_cell.comment:
+        target_cell.comment = copy.copy(source_cell.comment)
 
 
 def apply_bot_author_replacements(
@@ -407,7 +594,12 @@ def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str:
 
 def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]:
     indexed: dict[str, list[ExistingNote]] = {}
+    seen: set[tuple[str, tuple[str, ...]]] = set()
     for existing in existing_notes:
+        key = (existing.url, tuple(existing.authors))
+        if key in seen:
+            continue
+        seen.add(key)
         indexed.setdefault(existing.url, []).append(existing)
     return indexed
diff --git a/scripts/release_notes_ai/prompts/generation.md b/scripts/release_notes_ai/prompts/generation.md
new file mode 100644
index 0000000000000..8eb5b1e993381
--- /dev/null
+++ b/scripts/release_notes_ai/prompts/generation.md
@@ -0,0 +1,40 @@
+# Generation Prompt
+
+You are a senior technical writer who has profound knowledge of TiDB.
+
+Your task is to write exactly one English release note entry for a TiDB issue or PR.
+
+Return only a JSON object with exactly these keys:
+
+- type: "improvement" or "bug_fix"
+- release_note: one Markdown bullet that starts with "- "
+- needs_review: true or false
+- reason: a short reason for the type and wording
+
+Rules:
+
+- Write from the user's perspective.
+- Use the Excel issue_type as a strong signal, but decide the final type from the issue, PR description, and code changes.
+- For improvements, follow the Improvements reference below.
+- For bug fixes, follow the Bug fixes reference below.
+- Do not end the release note with a period.
+- Include every expected link in Markdown release-note style.
+- Include every contributor as @[user](https://github.com/user).
+- If there is no issue URL, use the PR link as the suffix link.
+- Do not expose internal function names unless they are the user-visible behavior.
+- If the available context is insufficient, still draft the best note and set needs_review to true.
+
+Expected links:
+{{EXPECTED_LINKS}}
+
+Contributors:
+{{CONTRIBUTORS}}
+
+Row context:
+{{ROW_CONTEXT}}
+
+Improvements reference:
+{{IMPROVEMENTS_REFERENCE}}
+
+Bug fixes reference:
+{{BUG_FIXES_REFERENCE}}
diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py
index 5d1e701f56cec..bdcb30ba8433b 100644
--- a/scripts/release_notes_generate_ai.py
+++ b/scripts/release_notes_generate_ai.py
@@ -1,7 +1,42 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-"""CLI entry point for generating English TiDB release notes with AI."""
+"""This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release.
+
+What does this script do?
+
+ - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before the previous patch release.
+ - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet.
+ - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue.
+ - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, and the descriptions of the issue.
+ - Map components in the workbook to the corresponding release note components.
+ - Generate the release note file for the target release according to the release note template file.
+
+Typical usage:
+
+    python3 scripts/release_notes_generate_ai.py \
+        --version 8.5.7 \
+        --excel /path/to/tirelease.xlsx \
+        --releases-dir releases \
+        --github-token-file /path/to/github-token.txt
+
+Useful options:
+
+    --involve-ai-generation OFF
+        Skip AI generation and use the source ``formated_release_note`` values
+        for non-duplicate rows.
+
+    --force-regenerate
+        Clear existing AI-generated notes in the processed workbook and generate
+        them again.
+
+    --output-release-file /path/to/release-8.5.7.md
+        Write the generated Markdown to a custom path. By default, the output is
+        ``release-<version>-updated-by-ai.md`` under ``--releases-dir``.
+
+Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option
+list.
+"""
 
 from release_notes_ai.cli import main
 
From 195da0b95b759c8511ec1130213d4ee50ec68224 Mon Sep 17 00:00:00 2001
From: qiancai
Date: Wed, 6 May 2026 14:59:38 +0800
Subject: [PATCH 3/5] update the naming rule of the release note file

---
 scripts/release_notes_ai/cli.py      | 14 ++++++++++++--
 scripts/release_notes_generate_ai.py |  6 ++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py
index fdeaccfda3efb..7aea9b9ee43db 100644
--- a/scripts/release_notes_ai/cli.py
+++ b/scripts/release_notes_ai/cli.py
@@ -58,7 +58,10 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--output-release-file",
-        help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.",
+        help=(
+            "Output Markdown file. Defaults to release-{version}-updated-by-ai.md "
+            "if release-{version}.md already exists, otherwise release-{version}.md."
+        ),
     )
     parser.add_argument(
         "--ai-timeout",
@@ -142,7 +145,7 @@ def main() -> int:
     output_file = (
         Path(args.output_release_file)
         if args.output_release_file
-        else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md"
+        else default_output_release_file(Path(args.releases_dir), args.version)
     )
 
     excel_path = Path(args.excel)
@@ -219,6 +222,13 @@ def parse_on_off(value: str) -> str:
     return normalized
 
 
+def default_output_release_file(releases_dir: Path, version: str) -> Path:
+    release_file = releases_dir / f"release-{version}.md"
+    if release_file.is_file():
+        return releases_dir / f"release-{version}-updated-by-ai.md"
+    return release_file
+
+
 def default_processed_excel_path(excel_path: Path) -> Path:
     return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}")
 
diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py
index bdcb30ba8433b..03d1ec7f8a59b 100644
--- a/scripts/release_notes_generate_ai.py
+++ b/scripts/release_notes_generate_ai.py
@@ -31,8 +31,10 @@
         them again.
 
     --output-release-file /path/to/release-8.5.7.md
-        Write the generated Markdown to a custom path. By default, the output is
-        ``release-<version>-updated-by-ai.md`` under ``--releases-dir``.
+        Write the generated Markdown to a custom path. By default, the output
+        under ``--releases-dir`` is ``release-<version>-updated-by-ai.md`` if
+        ``release-<version>.md`` already exists, otherwise
+        ``release-<version>.md``.
 
 Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option
 list.
From 259870d8186173c3a079e96f33c9e28a169d75e1 Mon Sep 17 00:00:00 2001
From: qiancai
Date: Wed, 6 May 2026 16:24:10 +0800
Subject: [PATCH 4/5] add the usage descriptions for the scripts

---
 scripts/release-notes-generator-readme.md | 79 +++++++++++++++++++++++
 scripts/release_notes_ai/__init__.py      |  1 -
 scripts/release_notes_ai/cli.py           | 34 +++-------
 scripts/release_notes_ai/constants.py     |  2 +-
 scripts/release_notes_generate_ai.py      | 35 ++--------
 5 files changed, 94 insertions(+), 57 deletions(-)
 create mode 100644 scripts/release-notes-generator-readme.md
 delete mode 100644 scripts/release_notes_ai/__init__.py

diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md
new file mode 100644
index 0000000000000..d966eaf3b83c4
--- /dev/null
+++ b/scripts/release-notes-generator-readme.md
@@ -0,0 +1,79 @@
+# Release notes generator
+
+`scripts/release_notes_generate_ai.py` generates English TiDB release notes according to PRs and issues in a specified excel file.
+
+## What it does
+
+**Scope filtering:**
+
+- Filters out PRs and issues that are not in the target release scope. For example, it filters out PRs that were merged before the previous patch release.
+- Moves issues that already appeared in earlier notes from the same major.minor series to a separate worksheet.
+
+**Duplicate handling:**
+
+- Marks release notes that are already published in other series as `(dup)` and reuses the release notes for the same issue.
+
+**Release note generation:**
+
+- Generates English release notes using AI according to the release note draft provided in the PR, the PR description and code changes, and the issue description.
+- Maps components in the workbook to the corresponding release note components.
+
+**File output in Markdown:**
+
+- Generates the release note file for the target release according to the release note template file.
+- Adds the improvements and bug fixes of each component to the corresponding sections of the release note file.
+
+## Prerequisites
+
+- Install Python dependencies:
+
+    ```bash
+    python3 -m pip install -r scripts/release_notes_ai/requirements.txt
+    ```
+
+- Prepare a GitHub token with access to public repositories and set it in the `GITHUB_TOKEN` environment variable:
+
+    ```bash
+    export GITHUB_TOKEN=<your-token>
+    ```
+
+- Install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m <model>`.
+
+## Typical usage
+
+```bash
+python3 scripts/release_notes_generate_ai.py \
+    --version 8.5.7 \
+    --excel /path/to/release-note-excel.xlsx \
+    --releases-dir releases
+```
+
+## Option descriptions
+
+| Option | Required | Default value | Usage example | Description |
+| --- | --- | --- | --- | --- |
+| `--version <version>` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. |
+| `--excel <path>` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note excel file. The source workbook is not overwritten. The processed workbook is written to `<excel-file-name>_processed.xlsx`. |
+| `--releases-dir <dir>` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. |
+| `--sheet <name>` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. |
+| `--ai-command <command>` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. |
+| `--ai-model <model>` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name passed to `codex exec` with `-m`. |
+| `--involve-ai-generation <ON\|OFF>` | No | `ON` | `--involve-ai-generation OFF` | Whether to generate non-duplicate release notes with AI. Use `ON` to invoke AI, or `OFF` to use the source `formated_release_note` values. |
+| `--output-release-file <path>` | No | Conditional | `--output-release-file /path/to/release-8.5.7.md` | Write the generated Markdown to a custom path. By default, the output under `--releases-dir` is `release-<version>-updated-by-ai.md` if `release-<version>.md` already exists, otherwise `release-<version>.md`. |
+| `--ai-timeout <seconds>` | No | `600` | `--ai-timeout 600` | Timeout in seconds for each AI command invocation. |
+| `--ai-workers <N>` | No | `3` | `--ai-workers 3` | Number of concurrent AI command invocations. |
+| `--github-workers <N>` | No | `8` | `--github-workers 8` | Number of concurrent GitHub API prefetch workers. |
+| `--author-workers <N>` | No | `3` | `--author-workers 3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. |
+| `--checkpoint-interval <N>` | No | `1` | `--checkpoint-interval 1` | Save the processed workbook after every N completed AI rows. Use `0` to disable checkpoint saves. |
+| `--force-regenerate` | No | Disabled | `--force-regenerate` | Clear existing AI-generated notes in the processed workbook and generate all non-duplicate rows again. |
+| `--release-date <date>` | No | `TBD` | `--release-date "August 14, 2025"` | Release date text for the generated Markdown header. |
+| `--skip-scope-preprocess` | No | Disabled | `--skip-scope-preprocess` | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. |
+| `--scope-base-branch-start-date <date>` | No | Estimated from release history | `--scope-base-branch-start-date 2025-01-01` | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. The value must use the `YYYY-MM-DD` format. |
+
+## Generated files
+
+- The source excel file passed to `--excel` is not overwritten.
+- The processed excel file is written to `<excel-file-name>_processed.xlsx` next to the source workbook.
+- The generated Markdown file is written to `--output-release-file` when that option is specified.
+- If `--output-release-file` is omitted and `release-<version>.md` already exists under `--releases-dir`, the generated Markdown file is written to `release-<version>-updated-by-ai.md`.
+- If `--output-release-file` is omitted and `release-<version>.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-<version>.md`.
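+
+The default naming rule mirrors `default_output_release_file` in `scripts/release_notes_ai/cli.py`. The following minimal sketch restates that rule outside the CLI for quick reference:
+
+```python
+from pathlib import Path
+
+def default_output_release_file(releases_dir: Path, version: str) -> Path:
+    # Keep an existing formal note untouched and write a separate AI draft;
+    # otherwise claim the formal file name directly.
+    release_file = releases_dir / f"release-{version}.md"
+    if release_file.is_file():
+        return releases_dir / f"release-{version}-updated-by-ai.md"
+    return release_file
+```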
diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py
deleted file mode 100644
index 65f7e128c779b..0000000000000
--- a/scripts/release_notes_ai/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Helpers for generating TiDB release notes with AI."""
diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py
index 7aea9b9ee43db..b1e913a7bca0d 100644
--- a/scripts/release_notes_ai/cli.py
+++ b/scripts/release_notes_ai/cli.py
@@ -26,7 +26,7 @@
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Generate English release notes with AI from a tirelease workbook."
+        description="Generate English release notes with AI according to PRs and issues in a specified Excel file."
     )
     parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.")
     parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.")
@@ -36,7 +36,6 @@ def parse_args() -> argparse.Namespace:
         help="Path to the existing English release notes directory.",
     )
     parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.")
-    parser.add_argument("--github-token-file", help="Path to a GitHub token file.")
     parser.add_argument(
         "--ai-command",
         default="codex --ask-for-approval never exec --sandbox read-only --ephemeral",
@@ -137,7 +136,10 @@ def main() -> int:
         if not base_branch_start_date:
             raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format")
 
-    token = load_github_token(args.github_token_file)
+    try:
+        token = load_github_token()
+    except ValueError as exc:
+        raise SystemExit(f"error: {exc}") from None
     github = GitHubClient(token)
     involve_ai_generation = args.involve_ai_generation == "ON"
     ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None
@@ -278,24 +280,8 @@ def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None:
         raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc
 
 
-def load_github_token(token_file: str | None) -> str | None:
-    import shutil
-    import subprocess
-
-    if token_file:
-        return Path(token_file).read_text(encoding="utf-8").strip()
-    if os.environ.get("GITHUB_TOKEN"):
-        return os.environ["GITHUB_TOKEN"].strip()
-    gh = shutil.which("gh")
-    if not gh:
-        return None
-    completed = subprocess.run(
-        [gh, "auth", "token"],
-        text=True,
-        capture_output=True,
-        timeout=10,
-        check=False,
-    )
-    if completed.returncode == 0 and completed.stdout.strip():
-        return completed.stdout.strip()
-    return None
+def load_github_token() -> str:
+    token = os.environ.get("GITHUB_TOKEN", "").strip()
+    if not token:
+        raise ValueError("GITHUB_TOKEN environment variable is required")
+    return token
diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py
index 89cca90e52d2b..12d0d9ca8dcee 100644
--- a/scripts/release_notes_ai/constants.py
+++ b/scripts/release_notes_ai/constants.py
@@ -26,7 +26,7 @@
 )
 BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"}
 
-# Keep the misspelled source column name because tirelease exports it this way.
+# Keep the misspelled source column name because the release note Excel file exports it this way.
REQUIRED_HEADERS = { "pr_author", "pr_link", diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py index 03d1ec7f8a59b..0a6d3eb761268 100644 --- a/scripts/release_notes_generate_ai.py +++ b/scripts/release_notes_generate_ai.py @@ -1,43 +1,16 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release. - -What does this script do? - - - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before this previous path release. - - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. - - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue. - - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, the descriptions of the issue - - Map components in the workbook to the corresponding release note components. - - Generate the release note file for the target release according to the release note template file. +"""Generate TiDB improvements and bug fixes for release notes according to PRs and issues in a specified excel file. Typical usage: python3 scripts/release_notes_generate_ai.py \ --version 8.5.7 \ - --excel /path/to/tirelease.xlsx \ - --releases-dir releases \ - --github-token-file /path/to/github-token.txt - -Useful options: - - --involve-ai-generation OFF - Skip AI generation and use the source ``formated_release_note`` values - for non-duplicate rows. - - --force-regenerate - Clear existing AI-generated notes in the processed workbook and generate - them again. - - --output-release-file /path/to/release-8.5.7.md - Write the generated Markdown to a custom path. By default, the output - under ``--releases-dir`` is ``release--updated-by-ai.md`` if - ``release-.md`` already exists, otherwise - ``release-.md``. + --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases -Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option -list. +For detailed usage and options, see scripts/release-notes-generator-readme.md. """ from release_notes_ai.cli import main From c47192b1936baf717e6ff1e6b04fef71b9bb1305 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 11 May 2026 16:55:36 +0800 Subject: [PATCH 5/5] Update release-notes-generator-readme.md --- scripts/release-notes-generator-readme.md | 327 +++++++++++++++++++++- 1 file changed, 312 insertions(+), 15 deletions(-) diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md index d966eaf3b83c4..d377f4da3f689 100644 --- a/scripts/release-notes-generator-readme.md +++ b/scripts/release-notes-generator-readme.md @@ -1,27 +1,38 @@ # Release notes generator -`scripts/release_notes_generate_ai.py` generates English TiDB release notes according to PRs and issues in a specified excel file. +`scripts/release_notes_generate_ai.py` generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in a Excel workbook. + +The generator keeps the source workbook unchanged, writes all processing results to a processed workbook, and renders the generated entries to a Markdown release note file. ## What it does -**Scope filtering:** +**Scope filtering** + +- Filters out rows of PRs and issues that are not in the target release scope. 
+- Moves issues that already appeared in earlier release notes from the same major.minor series to a separate worksheet for review. + +**Author correction** + +- Resolves bot-authored cherry-pick rows to the original PR author when possible. + +**Duplicate handling** + +- Reuses already-published release note entries as `(dup)` entries when appropriate. -- Filters out PRs and issues that are not in the target release scope. For example, it filters out PRs that were merged before the previous patch release. -- Moves issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. +**Release note generation** -**Duplicate handling:** +- Generates English release notes with AI from workbook data, GitHub PR and issue context, changed-file summaries, and repo-local release note writing references. -- Marks release notes that are already published in other series as `(dup)` and reuses the release notes for the same issue. +**Component mapping** -**Release note generation:** +- Maps workbook components to the corresponding release note Markdown components. -- Generates English release notes using AI according to the release note draft provided in the PR, the PR description and code changes, and the issue description. -- Maps components in the workbook to the corresponding release note components. +**Markdown generation** -**File output in Markdown:** +- Writes `Improvements` and `Bug fixes` entries to a Markdown release note draft. + +The generator does not create a complete formal release note. It does not generate sections such as compatibility changes, known issues, deprecations, or upgrade notes. -- Generates the release note file for the target release according to the release note template file. -- Add the improvements and bug fixes of each component to the corresponding sections of the release note file. ## Prerequisites @@ -53,7 +64,7 @@ python3 scripts/release_notes_generate_ai.py \ | Option | Required | Default value | Usage example | Description | | --- | --- | --- | --- | --- | | `--version ` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. | -| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | +| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | | `--releases-dir ` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. | | `--sheet ` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. | | `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. 
@@ -72,8 +83,294 @@ python3 scripts/release_notes_generate_ai.py \
 
 ## Generated files
 
-- The source excel file passed to `--excel` is not overwritten.
-- The processed excel file is written to `<source-file-name>_processed.xlsx` next to the source workbook.
+- The source Excel file passed to `--excel` is not overwritten.
+- The processed Excel file is written to `<source-file-name>_processed.xlsx` next to the source workbook.
 - The generated Markdown file is written to `--output-release-file` when that option is specified.
 - If `--output-release-file` is omitted and `release-<version>.md` already exists under `--releases-dir`, the generated Markdown file is written to `release-<version>-updated-by-ai.md`.
 - If `--output-release-file` is omitted and `release-<version>.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-<version>.md`.

## Reference: processing rules

The following sections describe the main processing logic and rules used by the generator.

### Processing pipeline

| Stage | What happens | Review value |
| --- | --- | --- |
| Scope filtering | Out-of-scope rows are moved to `PRs_not_in_scope` with a reason. | Reviewers can see why a row was excluded. |
| Workbook setup | Rows are sorted by component, and output columns are added or reset. | Related rows are easier to inspect, and generated data stays separate from source data. |
| Historical scan | Existing release notes are indexed by GitHub URL, contributor, section, and component. | The generator can reuse published wording instead of drafting duplicate text. |
| Same-series quarantine | Issues already published in the same major.minor series are moved to a separate sheet. | Repeated issues in the same series are visible for manual review. |
| Duplicate marking | Reusable historical entries are written to `published_release_notes` and rendered as `(dup)` entries. | The output keeps the reviewed published note and its source location. |
| Author replacement | Bot-authored cherry-pick rows are resolved to the original PR author when possible. | Contributor suffixes and duplicate matching use the real author. |
| Row merging | Rows with the same first issue URL and raw Excel component are merged. | Multiple PRs for one issue produce one release note entry. |
| Entry generation | Non-duplicate rows are generated by AI or copied from `formated_release_note` in non-AI mode. | The same preprocessing works for both drafting and dry-run workflows. |
| Markdown rendering | Entries are grouped by type and Markdown component. | The draft follows the expected release note structure. |

### Scope filtering

Scope filtering removes rows that should not appear in the target release note. Removed rows are copied to `PRs_not_in_scope`, receive a `Reason` value, and are deleted from the main sheet in the processed workbook.

General rules:

| Condition | Result | Why |
| --- | --- | --- |
| `pr_status` is not `merged` | Move the row to `PRs_not_in_scope`. | Unmerged changes should not be documented as released. |
| `pr_merge_time` is empty or cannot be parsed | Keep the row. | The generator cannot prove that the row is out of scope. |

Patch-release rules:

For a patch release such as `8.5.7`, the generator finds the previous patch release date in `releases/release-timeline.md` and applies the window check shown in the table and sketch below. When parsing `release-timeline.md`, the generator skips non-semver entries such as `Pre-GA`.

| Condition | Result | Why |
| --- | --- | --- |
| The PR was merged before the previous patch release date. | Move the row to `PRs_not_in_scope`. | The PR should already have been considered for the previous patch release. |
| The PR was merged on or after the previous patch release date. | Keep the row. | The PR falls into the target patch-release window. |
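A condensed sketch of this window check follows. The function and argument names are assumptions for illustration, and ISO date strings are assumed; the real script reads the dates from the workbook and `release-timeline.md`.

```python
from datetime import datetime

def in_patch_scope(pr_merge_time: str, previous_release_date: str) -> bool:
    """Illustrative patch-release window check."""
    try:
        merged = datetime.fromisoformat(pr_merge_time)
    except (TypeError, ValueError):
        return True  # empty or unparseable merge time: keep the row
    # Keep the row only when the PR was merged on or after the
    # previous patch release date.
    return merged >= datetime.fromisoformat(previous_release_date)
```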
`x.y.0` release rules:

For an `x.y.0` release, the generator uses `releases/release-timeline.md` and release-branch PR data to avoid including changes that were already shipped in the latest previous major.minor release.

| Condition | Result | Why |
| --- | --- | --- |
| The PR was merged on or after the latest previously released `x.y.0` date. | Keep the row. | The PR is newer than that previous release boundary. |
| The PR was merged before the estimated start date of the previous release branch. | Move the row to `PRs_not_in_scope`. | The PR is older than the branch window for the previous major.minor release. |
| The PR was merged during the previous release-branch window, and a cherry-pick PR for the previous release branch was merged before that previous release date. | Move the row to `PRs_not_in_scope`. | The change was already included through that cherry-pick. |
| No earlier-release evidence is found. | Keep the row. | The generator keeps the row when it cannot prove that the change is out of scope. |

The estimated release-branch start date comes from the earliest closed PR that targets the previous release branch. You can override it with `--scope-base-branch-start-date`.

When matching a cherry-pick PR to the original PR, the generator recognizes:

- The full original PR URL.
- A cross-repository reference such as `pingcap/tidb#12345`.
- A same-repository suffix such as `(#12345)`.
- A branch or text pattern such as `cherry-pick-12345`.
- A line that contains `backport`, `cherry-pick`, `original`, `source`, or `from` together with `#12345`.

### Historical release note index

The generator scans existing Markdown files under `--releases-dir` before it decides whether a workbook row is a duplicate.

The scanner:

- Ignores generated drafts whose file name contains `updated-by-ai`.
- Ignores release-note files whose version is greater than or equal to the target version.
- Tracks the current release-note section and component from headings and component bullets.
- Extracts every GitHub issue or PR URL from a release note line.
- Extracts contributors from `@[user](https://github.com/user)` suffixes.
- Classifies each historical line as `improvement` or `bug_fix` from its surrounding section.
- Records the surrounding Markdown component when possible.

Each historical entry can later be reused in this format:

```markdown
- (dup): <published release note line>
```

This preserves the published wording and shows the source file and component path.
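As a rough model of this index (the field and variable names here are illustrative, not the script's internal structures), each URL found in a historical line can map to the reusable entry data:

```python
from dataclasses import dataclass, field

@dataclass
class HistoricalEntry:
    text: str                      # the published release-note line
    source_file: str               # for example, "release-8.1.2.md"
    section: str                   # "improvement" or "bug_fix"
    component: str | None = None   # surrounding Markdown component, if known
    contributors: list[str] = field(default_factory=list)

# One historical line can mention several URLs, and one URL can appear in
# several files, so the index maps each URL to a list of entries.
index: dict[str, list[HistoricalEntry]] = {}
```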
### Repeated issues and duplicates

The generator handles repeated issues in two different ways:

- Same-series repeats are moved to a separate worksheet for review.
- Reusable duplicates from other series are rendered as `(dup)` entries.

This separation is intentional. If the same issue appears again in the same major.minor series, it is often a sign that the row needs human judgment. If the issue has already been documented elsewhere and the author check passes, reusing the published note is usually safer than drafting a new sentence.

For target version `8.5.7`, the same-series quarantine sheet is named:

```text
issues_already_in_earlier_v8.5_notes
```

A row is moved to this sheet when all of the following are true:

- The row has an issue URL in `issue_url` or `formated_release_note`.
- The same issue URL appears in an existing release-note file.
- The existing release-note file is from the same major.minor series.
- The existing release-note file version is earlier than the target version.

Rows in this sheet are not rendered to Markdown.

After same-series rows are moved out, the generator marks remaining rows as duplicates when their issue URLs match reusable historical entries.

| Rule | Behavior |
| --- | --- |
| Issue URL source | The generator reads issue URLs from `issue_url`, if present, and from `formated_release_note`. |
| PR URL source | PR URLs are not used for duplicate matching. They are used for AI context and component inference. |
| Author check | If a historical note has contributors, at least one current row author must match a historical contributor. If the historical note has no contributors, the URL match is enough. |
| Workbook output | Matching historical notes are written to `published_release_notes`, and the row is filled in gray. |
| Markdown output | Duplicate rows are rendered from `published_release_notes`; they do not go through AI generation. |
| Type selection | The generator uses the historical section when possible. Otherwise, it falls back to the current row `issue_type`. |
| Component selection | The generator uses the historical component path when possible. Otherwise, it falls back to the current row component. |

### Author and row normalization

Cherry-pick PRs are often authored by `ti-chi-bot` or `ti-srebot`. For rows with those authors, the generator tries to find the original PR from the cherry-pick PR title, branch name, or body.

When the original PR is found, the generator:

- Replaces `pr_author` with the original PR author.
- Updates author Markdown in `formated_release_note` from the bot account to the original author.

If the original PR cannot be found, the row keeps the bot author. This avoids blocking the whole run because of one incomplete cherry-pick reference.

Rows are then merged when they have the same first issue URL and the same raw Excel component. For each merged group, the first row is kept. The kept row receives:

- The union of `pr_link` values.
- The union of `pr_author` values.
- The union of duplicate notes from `published_release_notes`.
- The first available non-empty value for other empty cells.

Rows are grouped by the raw Excel component, not the normalized Markdown component. This keeps workbook distinctions intact until the final component mapping stage.

### Entry generation

With `--involve-ai-generation ON`, the generator calls the configured AI command for non-duplicate rows that do not already have reusable text in `release_notes_written_by_ai`.

The prompt includes:

- The raw Excel component and normalized Markdown component.
- Workbook fields such as `issue_type`, `pr_title`, `formated_release_note`, expected links, and contributors.
- GitHub issue titles, bodies, and labels.
- GitHub PR titles, bodies, authors, branches, merge times, and changed-file summaries.
- The repository-local writing references for improvements and bug fixes.
- The prompt template in `scripts/release_notes_ai/prompts/generation.md`.

The AI command must return a JSON object with these fields:

| Field | Rule |
| --- | --- |
| `type` | Must be `improvement` or `bug_fix`. |
| `release_note` | Must be one Markdown bullet that starts with a hyphen followed by a space. |
| `needs_review` | Must be a boolean. |
| `reason` | Must explain the type and wording choice. |
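For example, a response of the following shape would satisfy the field rules (all values here are invented for illustration):

```json
{
  "type": "bug_fix",
  "release_note": "- Fix the issue that TiDB might return wrong results in certain corner cases [#12345](https://github.com/pingcap/tidb/issues/12345) @[user](https://github.com/user)",
  "needs_review": false,
  "reason": "The linked issue reports incorrect results, so this entry is a bug fix."
}
```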
The generator validates that the release note:

- Starts with a hyphen followed by a space.
- Does not end with a period.
- Includes every expected issue or PR link.
- Includes every non-bot contributor as `@[user](https://github.com/user)`.

If validation fails, the generator sends one repair prompt. If the repaired output still fails, the row is marked as:

```text
AI_GENERATION_FAILED: <reason>
```

Failed rows are not rendered to Markdown.

If `release_notes_written_by_ai` already contains a value and does not start with `AI_GENERATION_FAILED:`, the generator reuses it instead of calling AI again. Use `--force-regenerate` to clear existing AI output and regenerate all non-duplicate rows.

With `--involve-ai-generation OFF`, the generator does not call the AI command. For non-duplicate rows, it splits `formated_release_note` into non-empty lines and renders those lines as Markdown entries. The preprocessing pipeline still runs in non-AI mode.

### Component mapping

The generator maps each workbook component to a Markdown release-note component before rendering. It also keeps the original workbook component in an HTML comment after each generated entry:

```markdown
- Improve ... [#12345](https://github.com/pingcap/tidb/issues/12345) @[user](https://github.com/user) <!-- <raw workbook component> -->
```

This marker lets reviewers trace the generated component back to the workbook value without changing the visible release-note text.

The generator resolves components in this order (a condensed sketch follows the list):

1. If the raw workbook value is already a known release-note component or alias, use that value.
2. If the raw workbook value contains multiple comma-separated or newline-separated values, apply the multi-value priority rules.
3. If the workbook value still cannot be resolved, infer the component from the GitHub repositories in the issue and PR URLs.
4. If no rule matches, use the normalized raw workbook value.
5. If the final value is empty, render the entry under `Other`.
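The sketch below condenses this resolution order into runnable form. The alias table is a small excerpt and the repository inference step is omitted, so it is an approximation of the logic, not a drop-in implementation; the full alias tables follow.

```python
import re

# Excerpt only; see the alias tables below for the full mapping.
ALIASES = {"tidb": "TiDB", "tikv": "TiKV", "pd": "PD", "tiflash": "TiFlash",
           "cdc": "TiCDC", "ticdc": "TiCDC", "lightning": "TiDB Lightning"}
# Multi-value priority: source-specific tools, then top-level components,
# then TiDB Data Migration (DM), then TiCDC.
PRIORITY = ["Backup & Restore (BR)", "TiDB Lightning", "Dumpling", "TiUP",
            "sync-diff-inspector", "TiDB", "TiKV", "PD", "TiFlash", "TiProxy",
            "TiDB Data Migration (DM)", "TiCDC"]

def resolve_component(raw: str) -> str:
    values = [v.strip().lower() for v in re.split(r"[,\n]", raw) if v.strip()]
    mapped = [ALIASES[v] for v in values if v in ALIASES]
    if len(mapped) == 1:
        return mapped[0]                        # rule 1: direct alias
    if len(mapped) > 1:
        return min(mapped, key=PRIORITY.index)  # rule 2: multi-value priority
    # rule 3 (repository inference) is omitted in this sketch
    return raw.strip() or "Other"               # rules 4 and 5
```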
Direct aliases:

| Excel component value | Markdown component |
| --- | --- |
| `tidb` | `TiDB` |
| `tikv` | `TiKV` |
| `pd` | `PD` |
| `tiflash` | `TiFlash` |
| `tiproxy` | `TiProxy` |
| `br`, `backup & restore`, `backup & restore (br)` | `Backup & Restore (BR)` |
| `cdc`, `ticdc` | `TiCDC` |
| `dm`, `tidb data migration`, `tidb data migration (dm)` | `TiDB Data Migration (DM)` |
| `tidb lightning`, `lightning` | `TiDB Lightning` |
| `dumpling` | `Dumpling` |
| `tiup` | `TiUP` |
| `tidb binlog` | `TiDB Binlog` |
| `sync_diff`, `sync-diff-inspector`, `sync diff inspector` | `sync-diff-inspector` |

TiDB subcomponent aliases:

| Excel component value | Markdown component |
| --- | --- |
| `ng monitoring`, `ng-monitoring` | `TiDB` |
| `planner` | `TiDB` |
| `execution` | `TiDB` |
| `sql-infra` | `TiDB` |
| `transaction` | `TiDB` |
| `engine` | `TiDB` |
| `observability` | `TiDB` |
| `dxf` | `TiDB` |
| `storage` | `TiDB` |
| `tidb-dashboard`, `tidb dashboard` | `TiDB` |
| `ddl` | `TiDB` |
| `coprocessor` | `TiDB` |
| `compute` | `TiDB` |
| `scheduling` | `TiDB` |
| `spm` | `TiDB` |

When a workbook cell contains multiple component values, the generator applies this priority:

1. Tool components with stronger source meaning: `Backup & Restore (BR)`, `TiDB Lightning`, `Dumpling`, `TiUP`, and `sync-diff-inspector`.
2. Top-level components: `TiDB`, `TiKV`, `PD`, `TiFlash`, and `TiProxy`.
3. `TiDB Data Migration (DM)`.
4. `TiCDC`.

Repository fallback rules:

| Repository evidence | Markdown component |
| --- | --- |
| `pd` | `PD` |
| `tikv` | `TiKV` |
| `tiflash` | `TiFlash` |
| `ng-monitoring` | `TiDB` |
| `tiup` | `TiUP` |
| `tiflow` or `ticdc`, and the raw component contains `dm` but not `cdc` | `TiDB Data Migration (DM)` |
| `tiflow` or `ticdc`, otherwise | `TiCDC` |
| `tidb`, and the raw component contains `br` | `Backup & Restore (BR)` |
| `tidb`, and the raw component contains `lightning` | `TiDB Lightning` |
| `tidb`, and the raw component contains `dumpling` | `Dumpling` |
| `tidb`, otherwise | `TiDB` |
| `tidb-dashboard` | `TiDB` |

### Markdown rendering and safe saving

The generated file contains front matter, the `# TiDB Release Notes` heading, release metadata, quick access links, `## Improvements`, and `## Bug fixes`.

Entries are grouped by type and component. Top-level components are rendered in this order:

```text
TiDB, TiKV, PD, TiFlash, TiProxy
```

Tool components are rendered under `+ Tools` in this order:

```text
Backup & Restore (BR), TiCDC, TiDB Data Migration (DM), TiDB Lightning, Dumpling, TiUP, TiDB Binlog, sync-diff-inspector
```

Known top-level components are rendered first. Unknown non-tool components are rendered next in alphabetical order. Tool components are rendered last under `Tools`.

Before writing an entry, the renderer normalizes its bullet marker to a hyphen followed by a space. If the entry does not already contain a component marker, the renderer appends the raw workbook component as an HTML comment.

The processed workbook is saved to `<source-file-name>_processed.xlsx`. During AI generation, `--checkpoint-interval` controls how often the processed workbook is saved:

- The default value `1` saves after every completed AI row.
- `0` disables checkpoint saves.

Workbook saves are atomic. The generator first writes a temporary file in the target directory and then replaces the processed workbook.
If replacement fails after a complete temporary workbook has been written, the error message includes the temporary file path. +
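A minimal sketch of this save pattern, assuming an openpyxl-style workbook object (the function name is illustrative):

```python
import os
import tempfile

def save_workbook_atomically(workbook, target_path: str) -> None:
    directory = os.path.dirname(target_path) or "."
    fd, temp_path = tempfile.mkstemp(suffix=".xlsx", dir=directory)
    os.close(fd)
    workbook.save(temp_path)  # write the complete workbook to a temporary file
    try:
        os.replace(temp_path, target_path)  # atomic swap on the same filesystem
    except OSError as exc:
        # keep the complete temporary workbook and report where it is
        raise RuntimeError(
            f"Could not replace {target_path}; temporary workbook kept at {temp_path}"
        ) from exc
```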