From 3cc047ada1f81adc598b0a11ae1dd61e0bb286c4 Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 14:59:22 +0900 Subject: [PATCH 1/6] feat: strip common prefix from ZIP archives GitHub-style ZIP archives nest all files under a single directory (e.g. `repo-main/`). Strip this prefix in both ZipMemoryFileSystem (analysis) and materialize_zip_workspace (extraction) so entry detection and asset resolution work correctly. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- crates/marknest-core/src/lib.rs | 31 +++++++++ crates/marknest-core/tests/analyze_zip.rs | 76 +++++++++++++++++++++++ crates/marknest/src/lib.rs | 47 ++++++++++++-- 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/crates/marknest-core/src/lib.rs b/crates/marknest-core/src/lib.rs index 635e992..a1c03fb 100644 --- a/crates/marknest-core/src/lib.rs +++ b/crates/marknest-core/src/lib.rs @@ -431,11 +431,42 @@ impl ZipMemoryFileSystem { }); } + strip_common_prefix(&mut files); files.sort_by(|left, right| left.normalized_path.cmp(&right.normalized_path)); Ok(Self { files }) } } +/// If every file shares the same first path segment (e.g. `repo-main/`), +/// strip that segment from all paths. This handles GitHub-style archives +/// that nest everything under `{repo}-{ref}/`. +fn strip_common_prefix(files: &mut [IndexedFile]) { + if files.is_empty() { + return; + } + + let common: String = match files[0].normalized_path.split('/').next() { + Some(segment) => segment.to_string(), + None => return, + }; + + // All files must share the same first segment AND have content after it + let all_share_prefix = files.iter().all(|file| { + file.normalized_path.starts_with(&common) + && file.normalized_path.len() > common.len() + && file.normalized_path.as_bytes()[common.len()] == b'/' + }); + + if !all_share_prefix { + return; + } + + let strip_len: usize = common.len() + 1; // include the '/' + for file in files.iter_mut() { + file.normalized_path = file.normalized_path[strip_len..].to_string(); + } +} + impl IndexedFileSystem for ZipMemoryFileSystem { fn source_kind(&self) -> ProjectSourceKind { ProjectSourceKind::Zip diff --git a/crates/marknest-core/tests/analyze_zip.rs b/crates/marknest-core/tests/analyze_zip.rs index 3c60d64..943ab08 100644 --- a/crates/marknest-core/tests/analyze_zip.rs +++ b/crates/marknest-core/tests/analyze_zip.rs @@ -69,3 +69,79 @@ fn rejects_windows_drive_paths_inside_zip() { } ); } + +#[test] +fn strips_common_prefix_from_github_style_zip() { + let bytes = build_zip(&[ + ( + "repo-main/README.md", + "# Hello\n\n![Logo](./images/logo.png)\n", + ), + ("repo-main/images/logo.png", "fake-png-bytes"), + ]); + + let index = analyze_zip(&bytes).expect("github-style zip should analyze"); + + assert_eq!(index.selected_entry.as_deref(), Some("README.md")); + assert_eq!(index.entry_selection_reason, EntrySelectionReason::Readme); + + let candidate_paths: Vec<&str> = index + .entry_candidates + .iter() + .map(|candidate| candidate.path.as_str()) + .collect(); + assert_eq!(candidate_paths, vec!["README.md"]); + + let resolved_asset_paths: Vec> = index + .assets + .iter() + .map(|asset| asset.resolved_path.as_deref()) + .collect(); + assert_eq!(resolved_asset_paths, vec![Some("images/logo.png")]); +} + +#[test] +fn preserves_paths_when_no_common_prefix() { + let bytes = build_zip(&[ + ("README.md", "# Root readme\n"), + ("docs/guide.md", "# Guide\n"), + ]); + + let index = analyze_zip(&bytes).expect("zip without common prefix should analyze"); + + let candidate_paths: Vec<&str> = index + .entry_candidates + .iter() + .map(|candidate| candidate.path.as_str()) + .collect(); + assert!(candidate_paths.contains(&"README.md")); + assert!(candidate_paths.contains(&"docs/guide.md")); +} + +#[test] +fn preserves_paths_when_multiple_top_level_directories() { + let bytes = build_zip(&[("dir-a/README.md", "# A\n"), ("dir-b/README.md", "# B\n")]); + + let index = analyze_zip(&bytes).expect("zip with multiple top dirs should analyze"); + + let candidate_paths: Vec<&str> = index + .entry_candidates + .iter() + .map(|candidate| candidate.path.as_str()) + .collect(); + assert!(candidate_paths.contains(&"dir-a/README.md")); + assert!(candidate_paths.contains(&"dir-b/README.md")); +} + +#[test] +fn strips_common_prefix_single_nested_file() { + let bytes = build_zip(&[("only-dir/file.md", "# Single\n")]); + + let index = analyze_zip(&bytes).expect("single nested file zip should analyze"); + + assert_eq!(index.selected_entry.as_deref(), Some("file.md")); + assert_eq!( + index.entry_selection_reason, + EntrySelectionReason::SingleMarkdownFile + ); +} diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index 2f0031b..c184977 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -293,6 +293,8 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { )) })?; + // Collect all entries with normalized paths first to detect a common prefix + let mut collected_entries: Vec<(String, Vec)> = Vec::new(); for index in 0..archive.len() { let mut entry = archive.by_index(index).map_err(|error| { AppFailure::validation(format!("Failed to read ZIP entry {index}: {error}")) @@ -306,7 +308,21 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { let normalized_path = normalize_relative_string(&raw_path).map_err(|_| { AppFailure::validation(format!("Unsafe ZIP entry path detected: {raw_path}")) })?; - let output_path = normalized_path_to_filesystem_path(temp_dir.path(), &normalized_path); + + let mut contents: Vec = Vec::new(); + entry.read_to_end(&mut contents).map_err(|error| { + AppFailure::validation(format!("Failed to extract ZIP entry {raw_path}: {error}")) + })?; + + collected_entries.push((normalized_path, contents)); + } + + // Strip common prefix (e.g. GitHub archive `repo-main/` wrapper) + let prefix_len = detect_common_prefix_len(&collected_entries); + + for (normalized_path, contents) in &collected_entries { + let stripped_path = &normalized_path[prefix_len..]; + let output_path = normalized_path_to_filesystem_path(temp_dir.path(), stripped_path); if let Some(parent) = output_path.parent() { fs::create_dir_all(parent).map_err(|error| { @@ -317,10 +333,6 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { })?; } - let mut contents: Vec = Vec::new(); - entry.read_to_end(&mut contents).map_err(|error| { - AppFailure::validation(format!("Failed to extract ZIP entry {raw_path}: {error}")) - })?; fs::write(&output_path, contents).map_err(|error| { AppFailure::system(format!( "Failed to write the extracted ZIP entry {}: {error}", @@ -332,6 +344,31 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { Ok(temp_dir) } +/// Returns the length (including trailing `/`) of the common first path segment +/// shared by all entries, or 0 if no common prefix exists. +fn detect_common_prefix_len(entries: &[(String, Vec)]) -> usize { + if entries.is_empty() { + return 0; + } + + let common = match entries[0].0.split('/').next() { + Some(segment) => segment, + None => return 0, + }; + + let all_share_prefix = entries.iter().all(|(path, _)| { + path.starts_with(common) + && path.len() > common.len() + && path.as_bytes()[common.len()] == b'/' + }); + + if all_share_prefix { + common.len() + 1 + } else { + 0 + } +} + fn run_single_convert( args: &ConvertArgs, analyzed_input: &AnalyzedInput, From ccb820255fd294c935ba6a68389ed2f12d9ce1d1 Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 15:00:40 +0900 Subject: [PATCH 2/6] feat: add GitHub URL parser for CLI input Parse GitHub URLs into owner, repo, ref, subpath, and blob/tree type. Supports bare repo URLs, branch/tag refs, blob paths, and tree paths. Handles .git suffix and http/https schemes. Rejects non-GitHub URLs. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- crates/marknest/src/lib.rs | 206 +++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index c184977..7768a17 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -3650,6 +3650,88 @@ impl ParseFailure { } } +#[derive(Debug, Clone, PartialEq, Eq)] +struct ParsedGitHubUrl { + owner: String, + repo: String, + git_ref: Option, + subpath: Option, + is_file_reference: bool, +} + +/// Parse a GitHub URL into its components. Returns `None` for non-GitHub URLs +/// or malformed input. +fn parse_github_url(input: &str) -> Option { + let trimmed: &str = input.trim(); + + // Must start with http:// or https:// + let after_scheme: &str = trimmed + .strip_prefix("https://") + .or_else(|| trimmed.strip_prefix("http://"))?; + + // Must be github.com host (with optional www.) + let after_host: &str = after_scheme + .strip_prefix("github.com/") + .or_else(|| after_scheme.strip_prefix("www.github.com/"))?; + + // Split remaining path segments + let segments: Vec<&str> = after_host + .trim_end_matches('/') + .split('/') + .filter(|segment| !segment.is_empty()) + .collect(); + + if segments.len() < 2 { + return None; + } + + let owner: String = segments[0].to_string(); + let repo: String = segments[1].trim_end_matches(".git").to_string(); + + if owner.is_empty() || repo.is_empty() { + return None; + } + + // Bare repo URL: https://github.com/owner/repo + if segments.len() == 2 { + return Some(ParsedGitHubUrl { + owner, + repo, + git_ref: None, + subpath: None, + is_file_reference: false, + }); + } + + // Must have /tree/ or /blob/ as the third segment + let path_type: &str = segments[2]; + let is_file_reference: bool = match path_type { + "blob" => true, + "tree" => false, + _ => return None, + }; + + // Must have a ref after /tree/ or /blob/ + if segments.len() < 4 { + return None; + } + + let git_ref: String = segments[3].to_string(); + let subpath: Option = if segments.len() > 4 { + Some(segments[4..].join("/")) + } else { + None + }; + + Some(ParsedGitHubUrl { + owner, + repo, + git_ref: Some(git_ref), + subpath, + is_file_reference, + }) +} + #[cfg(test)] mod tests { use super::*; @@ -5038,4 +5120,128 @@ mod tests { None => unsafe { env::remove_var(key) }, } } + + // --- GitHub URL parsing tests --- + + #[test] + fn parses_bare_github_repo_url() { + let result = parse_github_url("https://github.com/user/repo"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: None, + subpath: None, + is_file_reference: false, + }) + ); + } + + #[test] + fn parses_github_tree_url_with_branch() { + let result = parse_github_url("https://github.com/user/repo/tree/main"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: Some("main".to_string()), + subpath: None, + is_file_reference: false, + }) + ); + } + + #[test] + fn parses_github_blob_url_with_file_path() { + let result = parse_github_url("https://github.com/user/repo/blob/main/docs/guide.md"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: Some("main".to_string()), + subpath: Some("docs/guide.md".to_string()), + is_file_reference: true, + }) + ); + } + + #[test] + fn parses_github_tree_url_with_tag_and_directory() { + let result = parse_github_url("https://github.com/user/repo/tree/v2.0/src"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: Some("v2.0".to_string()), + subpath: Some("src".to_string()), + is_file_reference: false, + }) + ); + } + + #[test] + fn parses_github_url_with_dot_git_suffix() { + let result = parse_github_url("https://github.com/user/repo.git"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: None, + subpath: None, + is_file_reference: false, + }) + ); + } + + #[test] + fn parses_http_github_url() { + let result = parse_github_url("http://github.com/user/repo"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: None, + subpath: None, + is_file_reference: false, + }) + ); + } + + #[test] + fn rejects_non_github_url() { + assert_eq!(parse_github_url("https://gitlab.com/user/repo"), None); + } + + #[test] + fn rejects_malformed_github_url_missing_repo() { + assert_eq!(parse_github_url("https://github.com/user"), None); + } + + #[test] + fn rejects_non_url_input() { + assert_eq!(parse_github_url("README.md"), None); + assert_eq!(parse_github_url("./docs.zip"), None); + assert_eq!(parse_github_url("/some/path"), None); + } + + #[test] + fn parses_github_url_with_trailing_slash() { + let result = parse_github_url("https://github.com/user/repo/"); + assert_eq!( + result, + Some(ParsedGitHubUrl { + owner: "user".to_string(), + repo: "repo".to_string(), + git_ref: None, + subpath: None, + is_file_reference: false, + }) + ); + } } From 58248be6380b27eaa266048c3a61ea717d0c58ee Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 15:02:52 +0900 Subject: [PATCH 3/6] feat: add GitHub archive download functions Add resolve_github_auth_token (GITHUB_TOKEN/GH_TOKEN env vars), resolve_github_default_branch (GitHub API), and download_github_archive (zipball endpoint with 256 MB limit). Reuses existing ureq HTTP patterns. Includes descriptive error messages for 404, 403/rate-limit, timeout, and size exceeded. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- crates/marknest/src/lib.rs | 184 +++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index 7768a17..5c6169a 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -35,6 +35,9 @@ const REMOTE_ASSET_TIMEOUT_SECONDS: u64 = 15; const REMOTE_ASSET_MAX_REDIRECTS: u32 = 5; const REMOTE_ASSET_MAX_BYTES: usize = 16 * 1024 * 1024; const REMOTE_ASSET_MAX_TOTAL_BYTES: usize = 64 * 1024 * 1024; +const GITHUB_ARCHIVE_MAX_BYTES: usize = 256 * 1024 * 1024; +const GITHUB_API_TIMEOUT_SECONDS: u64 = 30; +const GITHUB_API_MAX_REDIRECTS: u32 = 5; pub fn run(args: I) -> i32 where @@ -3732,6 +3735,141 @@ fn parse_github_url(input: &str) -> Option { }) } +/// Resolve GitHub auth token from environment variables. +/// Checks GITHUB_TOKEN first, then falls back to GH_TOKEN. +fn resolve_github_auth_token() -> Option { + env::var("GITHUB_TOKEN") + .ok() + .or_else(|| env::var("GH_TOKEN").ok()) + .filter(|token| !token.is_empty()) +} + +fn build_github_api_agent() -> ureq::Agent { + ureq::AgentBuilder::new() + .timeout_connect(Duration::from_secs(GITHUB_API_TIMEOUT_SECONDS)) + .timeout_read(Duration::from_secs(GITHUB_API_TIMEOUT_SECONDS)) + .timeout_write(Duration::from_secs(GITHUB_API_TIMEOUT_SECONDS)) + .redirects(GITHUB_API_MAX_REDIRECTS) + .build() +} + +/// Query the GitHub API for the default branch of a repository. +fn resolve_github_default_branch( + owner: &str, + repo: &str, + token: Option<&str>, +) -> Result { + let url: String = format!("https://api.github.com/repos/{owner}/{repo}"); + let agent: ureq::Agent = build_github_api_agent(); + let mut request = agent + .get(&url) + .set("Accept", "application/vnd.github+json") + .set("User-Agent", "marknest"); + + if let Some(token) = token { + request = request.set("Authorization", &format!("Bearer {token}")); + } + + let response = request.call().map_err(|error| match &error { + ureq::Error::Status(404, _) => AppFailure::validation( + "GitHub repository not found or access denied. Use GITHUB_TOKEN or GH_TOKEN for private repositories.".to_string(), + ), + ureq::Error::Status(403, response) => { + if response + .header("X-RateLimit-Remaining") + .map(|value| value == "0") + .unwrap_or(false) + { + AppFailure::validation( + "GitHub API rate limit exceeded. Set GITHUB_TOKEN or GH_TOKEN to increase the limit.".to_string(), + ) + } else { + AppFailure::system(format!("Failed to query the GitHub API: {error}")) + } + } + _ => AppFailure::system(format!("Failed to query the GitHub API: {error}")), + })?; + + let body: String = response.into_string().map_err(|error| { + AppFailure::system(format!("Failed to read the GitHub API response: {error}")) + })?; + + let json: serde_json::Value = serde_json::from_str(&body).map_err(|error| { + AppFailure::system(format!("Failed to parse the GitHub API response: {error}")) + })?; + + json["default_branch"] + .as_str() + .map(|value| value.to_string()) + .ok_or_else(|| { + AppFailure::system( + "GitHub API response did not include a default_branch field.".to_string(), + ) + }) +} + +/// Download a GitHub repository archive as a ZIP file. +fn download_github_archive( + owner: &str, + repo: &str, + git_ref: &str, + token: Option<&str>, +) -> Result, AppFailure> { + let url: String = format!("https://api.github.com/repos/{owner}/{repo}/zipball/{git_ref}"); + let agent: ureq::Agent = build_github_api_agent(); + let mut request = agent + .get(&url) + .set("Accept", "application/vnd.github+json") + .set("User-Agent", "marknest"); + + if let Some(token) = token { + request = request.set("Authorization", &format!("Bearer {token}")); + } + + let response = request.call().map_err(|error| match &error { + ureq::Error::Status(404, _) => AppFailure::validation( + "GitHub repository not found or access denied. Use GITHUB_TOKEN or GH_TOKEN for private repositories.".to_string(), + ), + ureq::Error::Status(403, response) => { + if response + .header("X-RateLimit-Remaining") + .map(|value| value == "0") + .unwrap_or(false) + { + AppFailure::validation( + "GitHub API rate limit exceeded. Set GITHUB_TOKEN or GH_TOKEN to increase the limit.".to_string(), + ) + } else { + AppFailure::system(format!("Failed to download the GitHub archive: {error}")) + } + } + _ => AppFailure::system(format!("Failed to download the GitHub archive: {error}")), + })?; + + let mut reader = response.into_reader(); + let mut bytes: Vec = Vec::new(); + let mut buffer = [0_u8; 8192]; + + loop { + let bytes_read: usize = reader.read(&mut buffer).map_err(|error| { + AppFailure::system(format!("Failed to download the GitHub archive: {error}")) + })?; + if bytes_read == 0 { + break; + } + + if bytes.len() + bytes_read > GITHUB_ARCHIVE_MAX_BYTES { + return Err(AppFailure::validation( + "GitHub archive download exceeded the 256 MB limit.".to_string(), + )); + } + + bytes.extend_from_slice(&buffer[..bytes_read]); + } + + Ok(bytes) +} + #[cfg(test)] mod tests { use super::*; @@ -5121,6 +5259,52 @@ mod tests { } } + // --- GitHub auth token resolution tests --- + // Combined into one test to avoid env var race conditions in parallel test execution + + #[test] + fn resolves_github_auth_token_from_environment() { + let original_github = env::var_os("GITHUB_TOKEN"); + let original_gh = env::var_os("GH_TOKEN"); + + // GITHUB_TOKEN takes priority + unsafe { + env::set_var("GITHUB_TOKEN", "token-from-github"); + env::remove_var("GH_TOKEN"); + } + assert_eq!( + resolve_github_auth_token(), + Some("token-from-github".to_string()) + ); + + // Falls back to GH_TOKEN + unsafe { + env::remove_var("GITHUB_TOKEN"); + env::set_var("GH_TOKEN", "token-from-gh"); + } + assert_eq!( + resolve_github_auth_token(), + Some("token-from-gh".to_string()) + ); + + // Returns None when neither is set + unsafe { + env::remove_var("GITHUB_TOKEN"); + env::remove_var("GH_TOKEN"); + } + assert_eq!(resolve_github_auth_token(), None); + + // Ignores empty values + unsafe { + env::set_var("GITHUB_TOKEN", ""); + env::set_var("GH_TOKEN", ""); + } + assert_eq!(resolve_github_auth_token(), None); + + restore_env_var("GITHUB_TOKEN", original_github); + restore_env_var("GH_TOKEN", original_gh); + } + // --- GitHub URL parsing tests --- #[test] From 1bc0e26237f22670d6d017403fd2cf297acf74ea Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 15:06:15 +0900 Subject: [PATCH 4/6] feat: integrate GitHub URL support into CLI pipeline Add GitHubUrl variant to ResolvedInput. resolve_input() detects GitHub URLs before filesystem access and returns the parsed URL. analyze_input_path() downloads the archive, saves to a temp file, and feeds it into the existing ZIP analysis pipeline. Blob URLs set the implicit entry. Temp directory kept alive via _temp_dir field on AnalyzedInput. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- crates/marknest/src/lib.rs | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index 5c6169a..a3b758b 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -811,6 +811,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: Some(workspace_root.clone()), default_output_directory: Some(workspace_root), project_index, + _temp_dir: None, }) } ResolvedInput::Zip { path, display_path } => { @@ -842,6 +843,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: None, default_output_directory, project_index, + _temp_dir: None, }) } ResolvedInput::Folder { @@ -867,6 +869,58 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: Some(canonical_root.clone()), default_output_directory: Some(canonical_root), project_index, + _temp_dir: None, + }) + } + ResolvedInput::GitHubUrl { + display_path, + parsed, + } => { + let token: Option = resolve_github_auth_token(); + + let git_ref: String = match &parsed.git_ref { + Some(r) => r.clone(), + None => { + resolve_github_default_branch(&parsed.owner, &parsed.repo, token.as_deref())? + } + }; + + eprintln!( + "Downloading GitHub archive: {}/{} @ {} ...", + parsed.owner, parsed.repo, git_ref + ); + let zip_bytes: Vec = + download_github_archive(&parsed.owner, &parsed.repo, &git_ref, token.as_deref())?; + + // Save to temp file so the existing ZIP pipeline can process it + let temp_dir: TempDir = TempDir::new().map_err(|error| { + AppFailure::system(format!("Failed to create temp directory: {error}")) + })?; + let temp_zip_path: PathBuf = temp_dir.path().join("github-archive.zip"); + fs::write(&temp_zip_path, &zip_bytes).map_err(|error| { + AppFailure::system(format!("Failed to write temp archive: {error}")) + })?; + + let project_index: ProjectIndex = analyze_zip(&zip_bytes).map_err(map_analyze_error)?; + + // If URL pointed to a specific file (/blob/), use it as implicit entry + let explicit_entry: Option = if parsed.is_file_reference { + parsed.subpath.clone() + } else { + None + }; + + Ok(AnalyzedInput { + resolved_input_path: temp_zip_path, + input_kind: ValidationInputKind::Zip, + input_path: display_path, + is_default_input: false, + uses_implicit_all: false, + explicit_entry, + workspace_root: None, + default_output_directory: Some(env::current_dir().unwrap_or_default()), + project_index, + _temp_dir: Some(temp_dir), }) } } @@ -880,6 +934,17 @@ fn resolve_input(input: Option<&Path>) -> Result { AppFailure::system(format!("Failed to read the current directory: {error}")) })?, }; + + // Check for GitHub URL before filesystem access + if let Some(path_str) = path.to_str() { + if let Some(parsed) = parse_github_url(path_str) { + return Ok(ResolvedInput::GitHubUrl { + display_path: path_str.to_string(), + parsed, + }); + } + } + let display_path = path.display().to_string(); let metadata = fs::metadata(&path).map_err(|error| { AppFailure::validation(format!( @@ -2395,6 +2460,10 @@ enum ResolvedInput { display_path: String, is_default_input: bool, }, + GitHubUrl { + display_path: String, + parsed: ParsedGitHubUrl, + }, } #[derive(Debug)] @@ -2408,6 +2477,9 @@ struct AnalyzedInput { workspace_root: Option, default_output_directory: Option, project_index: ProjectIndex, + /// Keeps temporary directory alive for the duration of analysis/conversion. + /// Used by GitHub URL downloads to hold the temp archive file. + _temp_dir: Option, } #[derive(Debug, Clone)] @@ -5259,6 +5331,38 @@ mod tests { } } + // --- GitHub URL resolve_input tests --- + + #[test] + fn resolve_input_returns_github_url_variant_for_github_urls() { + let path = Path::new("https://github.com/user/repo"); + let result = resolve_input(Some(path)).expect("should resolve GitHub URL"); + match result { + ResolvedInput::GitHubUrl { + display_path, + parsed, + } => { + assert_eq!(display_path, "https://github.com/user/repo"); + assert_eq!(parsed.owner, "user"); + assert_eq!(parsed.repo, "repo"); + } + _ => panic!("expected GitHubUrl variant"), + } + } + + #[test] + fn resolve_input_returns_local_type_for_non_url_paths() { + let temp_dir = TempDir::new().expect("temp dir"); + let md_path = temp_dir.path().join("test.md"); + fs::write(&md_path, "# Test").expect("write"); + + let result = resolve_input(Some(&md_path)).expect("should resolve markdown file"); + match result { + ResolvedInput::MarkdownFile { .. } => {} + _ => panic!("expected MarkdownFile variant"), + } + } + // --- GitHub auth token resolution tests --- // Combined into one test to avoid env var race conditions in parallel test execution From 86aea19a62fbbcf271e08cc00195aea2abab6a2e Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 15:09:53 +0900 Subject: [PATCH 5/6] docs: add GitHub URL usage to help text and README Update convert, validate, and root help messages to document GitHub URL input support and GITHUB_TOKEN/GH_TOKEN env vars. Add GitHub URL examples to README development section. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- README.md | 11 +++++++++++ crates/marknest/src/lib.rs | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ad2a13..f83486a 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,17 @@ cargo run -p marknest -- convert ./docs.zip --all --out-dir ./pdf cargo run -p marknest -- convert ./docs --out-dir ./pdf --render-report ./out/render-report.json ``` +Convert directly from a GitHub URL: + +```bash +cargo run -p marknest -- convert https://github.com/user/repo -o output.pdf +cargo run -p marknest -- convert https://github.com/user/repo/blob/main/docs/guide.md -o guide.pdf +cargo run -p marknest -- convert https://github.com/user/repo/tree/v2.0 --all --out-dir ./pdf +cargo run -p marknest -- validate https://github.com/user/repo +``` + +GitHub URL support downloads the repository as a ZIP archive through the GitHub API and processes it through the existing ZIP pipeline. Set `GITHUB_TOKEN` or `GH_TOKEN` for private repositories or to avoid API rate limits. + `convert` requires `node`, `npm ci --prefix crates/marknest/playwright-runtime`, and a local Chrome, Edge, or Chromium installation for Playwright headless PDF generation. `--mermaid auto|on` and `--math auto|on` use vendored local Mermaid and MathJax runtime assets; when `--debug-html` is written with those modes enabled, a sibling `runtime-assets/` directory is emitted for offline reproduction. Supported defaults can come from `.marknest.toml`, `marknest.toml`, `MARKNEST_CONFIG`, `MARKNEST_THEME`, `MARKNEST_CSS`, `MARKNEST_TOC`, and `MARKNEST_SANITIZE_HTML`. diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index a3b758b..541512a 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -2311,19 +2311,19 @@ fn parse_convert_args(binary_name: &str, args: &[String]) -> Result String { format!( - "Convert and validate Markdown workspaces.\n\nUsage:\n {binary_name} convert [INPUT] [--entry | --all] [-o | --out-dir ] [--config ] [--render-report ] [--debug-html ] [--asset-manifest ] [--css ] [--header-template ] [--footer-template ] [--page-size ] [--margin ] [--margin-top ] [--margin-right ] [--margin-bottom ] [--margin-left ] [--theme ] [--landscape] [--toc | --no-toc] [--sanitize-html | --no-sanitize-html] [--title ] [--author ] [--subject ] [--mermaid ] [--math ] [--mermaid-timeout-ms ] [--math-timeout-ms ]\n {binary_name} validate [INPUT] [--entry | --all] [--strict] [--report ]\n {binary_name} --help\n" + "Convert and validate Markdown workspaces.\n\nUsage:\n {binary_name} convert [INPUT] [--entry | --all] [-o | --out-dir ] [--config ] [--render-report ] [--debug-html ] [--asset-manifest ] [--css ] [--header-template ] [--footer-template ] [--page-size ] [--margin ] [--margin-top ] [--margin-right ] [--margin-bottom ] [--margin-left ] [--theme ] [--landscape] [--toc | --no-toc] [--sanitize-html | --no-sanitize-html] [--title ] [--author ] [--subject ] [--mermaid ] [--math ] [--mermaid-timeout-ms ] [--math-timeout-ms ]\n {binary_name} validate [INPUT] [--entry | --all] [--strict] [--report ]\n {binary_name} --help\n\nINPUT can be a Markdown file, ZIP archive, folder, or GitHub URL.\n\nGitHub URL examples:\n {binary_name} convert https://github.com/user/repo -o output.pdf\n {binary_name} convert https://github.com/user/repo/blob/main/guide.md -o guide.pdf\n {binary_name} convert https://github.com/user/repo --all --out-dir ./pdf\n\nEnvironment:\n GITHUB_TOKEN / GH_TOKEN GitHub auth token for private repos and higher rate limits\n" ) } fn validate_help(binary_name: &str) -> String { format!( - "Validate Markdown workspaces and ZIP inputs.\n\nUsage:\n {binary_name} validate [INPUT] [OPTIONS]\n\nOptions:\n --entry Validate a single Markdown entry inside a folder or ZIP input.\n --all Validate all Markdown entries.\n --strict Treat warnings as validation failures.\n --report Write a JSON validation report.\n -h, --help Show this help message.\n" + "Validate Markdown workspaces and ZIP inputs.\n\nUsage:\n {binary_name} validate [INPUT] [OPTIONS]\n\nINPUT can be a Markdown file, ZIP archive, folder, or GitHub URL.\n\nOptions:\n --entry Validate a single Markdown entry inside a folder or ZIP input.\n --all Validate all Markdown entries.\n --strict Treat warnings as validation failures.\n --report Write a JSON validation report.\n -h, --help Show this help message.\n\nEnvironment:\n GITHUB_TOKEN / GH_TOKEN GitHub auth token for private repos and higher rate limits\n" ) } fn convert_help(binary_name: &str) -> String { format!( - "Convert Markdown entries into PDF files.\n\nUsage:\n {binary_name} convert [INPUT] [OPTIONS]\n\nOptions:\n --entry Convert one Markdown entry inside a folder or ZIP input.\n --all Convert all Markdown entries.\n -o, --output Write a single PDF to a specific path.\n --out-dir Write batch PDF output under a directory.\n --config Load conversion defaults from a TOML config file.\n --render-report Write a JSON conversion report.\n --debug-html Write the rendered HTML used for PDF generation.\n --asset-manifest Write the selected entry asset manifest as JSON.\n --css Append a custom CSS file after the theme stylesheet.\n --header-template Load an HTML header template for Chromium print output.\n --footer-template Load an HTML footer template for Chromium print output.\n --page-size Set the output page size.\n --margin Set the same margin on all sides in millimeters.\n --margin-top Override the top page margin in millimeters.\n --margin-right Override the right page margin in millimeters.\n --margin-bottom Override the bottom page margin in millimeters.\n --margin-left Override the left page margin in millimeters.\n --theme \n Apply a built-in document theme.\n --landscape Render the PDF in landscape orientation.\n --toc Insert a generated table of contents near the top of the document.\n --no-toc Skip the generated table of contents.\n --sanitize-html Sanitize rendered document HTML before PDF generation.\n --no-sanitize-html Trust document HTML and skip sanitization.\n --title Override the document title.\n --author Set the PDF author metadata.\n --subject Set the PDF subject metadata.\n --mermaid Control Mermaid rendering.\n --math Control Math rendering.\n --mermaid-timeout-ms Set the per-diagram Mermaid render timeout.\n --math-timeout-ms Set the per-expression Math render timeout.\n -h, --help Show this help message.\n" + "Convert Markdown entries into PDF files.\n\nUsage:\n {binary_name} convert [INPUT] [OPTIONS]\n\nINPUT can be a Markdown file, ZIP archive, folder, or GitHub URL.\n\nGitHub URL examples:\n {binary_name} convert https://github.com/user/repo -o output.pdf\n {binary_name} convert https://github.com/user/repo/blob/main/guide.md -o guide.pdf\n {binary_name} convert https://github.com/user/repo --all --out-dir ./pdf\n\nOptions:\n --entry Convert one Markdown entry inside a folder or ZIP input.\n --all Convert all Markdown entries.\n -o, --output Write a single PDF to a specific path.\n --out-dir Write batch PDF output under a directory.\n --config Load conversion defaults from a TOML config file.\n --render-report Write a JSON conversion report.\n --debug-html Write the rendered HTML used for PDF generation.\n --asset-manifest Write the selected entry asset manifest as JSON.\n --css Append a custom CSS file after the theme stylesheet.\n --header-template Load an HTML header template for Chromium print output.\n --footer-template Load an HTML footer template for Chromium print output.\n --page-size Set the output page size.\n --margin Set the same margin on all sides in millimeters.\n --margin-top Override the top page margin in millimeters.\n --margin-right Override the right page margin in millimeters.\n --margin-bottom Override the bottom page margin in millimeters.\n --margin-left Override the left page margin in millimeters.\n --theme \n Apply a built-in document theme.\n --landscape Render the PDF in landscape orientation.\n --toc Insert a generated table of contents near the top of the document.\n --no-toc Skip the generated table of contents.\n --sanitize-html Sanitize rendered document HTML before PDF generation.\n --no-sanitize-html Trust document HTML and skip sanitization.\n --title Override the document title.\n --author Set the PDF author metadata.\n --subject Set the PDF subject metadata.\n --mermaid Control Mermaid rendering.\n --math Control Math rendering.\n --mermaid-timeout-ms Set the per-diagram Mermaid render timeout.\n --math-timeout-ms Set the per-expression Math render timeout.\n -h, --help Show this help message.\n\nEnvironment:\n GITHUB_TOKEN / GH_TOKEN GitHub auth token for private repos and higher rate limits\n" ) } From 60f60e86e0b50b034b62133680c3ddbb407173ea Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Mon, 9 Mar 2026 15:20:52 +0900 Subject: [PATCH 6/6] fix: make ZIP prefix stripping opt-in for GitHub archives only Regular analyze_zip() no longer strips common prefixes, preserving existing behavior for user-created ZIPs. New analyze_zip_strip_prefix() applies stripping only when explicitly requested. The GitHub URL flow uses the strip variant; regular ZIP inputs are unchanged. This fixes WASM test failures where intentional subdirectory structure was being incorrectly stripped. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Yonghye Kwon --- crates/marknest-core/src/lib.rs | 59 +++++++++++++---------- crates/marknest-core/tests/analyze_zip.rs | 31 +++++++++--- crates/marknest/src/lib.rs | 30 +++++++++--- 3 files changed, 80 insertions(+), 40 deletions(-) diff --git a/crates/marknest-core/src/lib.rs b/crates/marknest-core/src/lib.rs index a1c03fb..daf5b16 100644 --- a/crates/marknest-core/src/lib.rs +++ b/crates/marknest-core/src/lib.rs @@ -269,6 +269,15 @@ pub fn analyze_zip(bytes: &[u8]) -> Result { analyze_project(&ZipMemoryFileSystem::new(bytes)?) } +/// Analyze a ZIP archive, stripping the common top-level directory prefix +/// from all paths before analysis. Use this for GitHub-style archives where +/// files are nested under a single `{repo}-{ref}/` directory. +pub fn analyze_zip_strip_prefix(bytes: &[u8]) -> Result { + let mut fs = ZipMemoryFileSystem::new(bytes)?; + fs.strip_common_prefix(); + analyze_project(&fs) +} + fn remote_fetch_url(reference: &str) -> Option { if !is_http_reference(reference) { return None; @@ -431,39 +440,39 @@ impl ZipMemoryFileSystem { }); } - strip_common_prefix(&mut files); files.sort_by(|left, right| left.normalized_path.cmp(&right.normalized_path)); Ok(Self { files }) } -} -/// If every file shares the same first path segment (e.g. `repo-main/`), -/// strip that segment from all paths. This handles GitHub-style archives -/// that nest everything under `{repo}-{ref}/`. -fn strip_common_prefix(files: &mut [IndexedFile]) { - if files.is_empty() { - return; - } + /// Strip the common first path segment from all files if every file shares + /// the same top-level directory. Used for GitHub-style archives that nest + /// everything under `{repo}-{ref}/`. + fn strip_common_prefix(&mut self) { + if self.files.is_empty() { + return; + } - let common: String = match files[0].normalized_path.split('/').next() { - Some(segment) => segment.to_string(), - None => return, - }; + let common: String = match self.files[0].normalized_path.split('/').next() { + Some(segment) => segment.to_string(), + None => return, + }; - // All files must share the same first segment AND have content after it - let all_share_prefix = files.iter().all(|file| { - file.normalized_path.starts_with(&common) - && file.normalized_path.len() > common.len() - && file.normalized_path.as_bytes()[common.len()] == b'/' - }); + let all_share_prefix = self.files.iter().all(|file| { + file.normalized_path.starts_with(&common) + && file.normalized_path.len() > common.len() + && file.normalized_path.as_bytes()[common.len()] == b'/' + }); - if !all_share_prefix { - return; - } + if !all_share_prefix { + return; + } - let strip_len: usize = common.len() + 1; // include the '/' - for file in files.iter_mut() { - file.normalized_path = file.normalized_path[strip_len..].to_string(); + let strip_len: usize = common.len() + 1; + for file in self.files.iter_mut() { + file.normalized_path = file.normalized_path[strip_len..].to_string(); + } + self.files + .sort_by(|left, right| left.normalized_path.cmp(&right.normalized_path)); } } diff --git a/crates/marknest-core/tests/analyze_zip.rs b/crates/marknest-core/tests/analyze_zip.rs index 943ab08..574b0a5 100644 --- a/crates/marknest-core/tests/analyze_zip.rs +++ b/crates/marknest-core/tests/analyze_zip.rs @@ -1,6 +1,8 @@ use std::io::{Cursor, Write}; -use marknest_core::{AnalyzeError, EntrySelectionReason, ProjectSourceKind, analyze_zip}; +use marknest_core::{ + AnalyzeError, EntrySelectionReason, ProjectSourceKind, analyze_zip, analyze_zip_strip_prefix, +}; use zip::write::SimpleFileOptions; fn build_zip(entries: &[(&str, &str)]) -> Vec { @@ -80,7 +82,7 @@ fn strips_common_prefix_from_github_style_zip() { ("repo-main/images/logo.png", "fake-png-bytes"), ]); - let index = analyze_zip(&bytes).expect("github-style zip should analyze"); + let index = analyze_zip_strip_prefix(&bytes).expect("github-style zip should analyze"); assert_eq!(index.selected_entry.as_deref(), Some("README.md")); assert_eq!(index.entry_selection_reason, EntrySelectionReason::Readme); @@ -101,13 +103,13 @@ fn strips_common_prefix_from_github_style_zip() { } #[test] -fn preserves_paths_when_no_common_prefix() { +fn strip_prefix_preserves_paths_when_no_common_prefix() { let bytes = build_zip(&[ ("README.md", "# Root readme\n"), ("docs/guide.md", "# Guide\n"), ]); - let index = analyze_zip(&bytes).expect("zip without common prefix should analyze"); + let index = analyze_zip_strip_prefix(&bytes).expect("zip without common prefix should analyze"); let candidate_paths: Vec<&str> = index .entry_candidates @@ -119,10 +121,11 @@ fn preserves_paths_when_no_common_prefix() { } #[test] -fn preserves_paths_when_multiple_top_level_directories() { +fn strip_prefix_preserves_paths_when_multiple_top_level_directories() { let bytes = build_zip(&[("dir-a/README.md", "# A\n"), ("dir-b/README.md", "# B\n")]); - let index = analyze_zip(&bytes).expect("zip with multiple top dirs should analyze"); + let index = + analyze_zip_strip_prefix(&bytes).expect("zip with multiple top dirs should analyze"); let candidate_paths: Vec<&str> = index .entry_candidates @@ -134,10 +137,10 @@ fn preserves_paths_when_multiple_top_level_directories() { } #[test] -fn strips_common_prefix_single_nested_file() { +fn strip_prefix_strips_single_nested_file() { let bytes = build_zip(&[("only-dir/file.md", "# Single\n")]); - let index = analyze_zip(&bytes).expect("single nested file zip should analyze"); + let index = analyze_zip_strip_prefix(&bytes).expect("single nested file zip should analyze"); assert_eq!(index.selected_entry.as_deref(), Some("file.md")); assert_eq!( @@ -145,3 +148,15 @@ fn strips_common_prefix_single_nested_file() { EntrySelectionReason::SingleMarkdownFile ); } + +#[test] +fn regular_analyze_zip_does_not_strip_common_prefix() { + let bytes = build_zip(&[ + ("repo-main/README.md", "# Hello\n"), + ("repo-main/images/logo.png", "fake-png-bytes"), + ]); + + let index = analyze_zip(&bytes).expect("should analyze without stripping"); + + assert_eq!(index.selected_entry.as_deref(), Some("repo-main/README.md")); +} diff --git a/crates/marknest/src/lib.rs b/crates/marknest/src/lib.rs index 541512a..3df4376 100644 --- a/crates/marknest/src/lib.rs +++ b/crates/marknest/src/lib.rs @@ -12,7 +12,7 @@ use marknest_core::{ EntrySelectionReason, MATHJAX_SCRIPT_URL, MATHJAX_VERSION, MERMAID_SCRIPT_URL, MERMAID_VERSION, MathMode, MermaidMode, PdfMetadata, ProjectIndex, ProjectSourceKind, RUNTIME_ASSET_MODE, RenderHtmlError, RenderOptions, ThemePreset, analyze_workspace, analyze_zip, - render_workspace_entry_with_options, rewrite_html_img_sources, + analyze_zip_strip_prefix, render_workspace_entry_with_options, rewrite_html_img_sources, }; use serde::{Deserialize, Serialize}; use tempfile::TempDir; @@ -268,7 +268,10 @@ fn prepare_render_workspace( } if matches!(analyzed_input.input_kind, ValidationInputKind::Zip) { - let temp_dir = materialize_zip_workspace(&analyzed_input.resolved_input_path)?; + let temp_dir = materialize_zip_workspace( + &analyzed_input.resolved_input_path, + analyzed_input.strip_zip_prefix, + )?; let root = temp_dir.path().to_path_buf(); return Ok(PreparedWorkspace { root, @@ -281,7 +284,7 @@ fn prepare_render_workspace( )) } -fn materialize_zip_workspace(zip_path: &Path) -> Result { +fn materialize_zip_workspace(zip_path: &Path, strip_prefix: bool) -> Result { let file = fs::File::open(zip_path).map_err(|error| { AppFailure::system(format!( "Failed to open ZIP input {}: {error}", @@ -296,7 +299,7 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { )) })?; - // Collect all entries with normalized paths first to detect a common prefix + // Collect all entries with normalized paths first (needed for prefix detection) let mut collected_entries: Vec<(String, Vec)> = Vec::new(); for index in 0..archive.len() { let mut entry = archive.by_index(index).map_err(|error| { @@ -320,8 +323,12 @@ fn materialize_zip_workspace(zip_path: &Path) -> Result { collected_entries.push((normalized_path, contents)); } - // Strip common prefix (e.g. GitHub archive `repo-main/` wrapper) - let prefix_len = detect_common_prefix_len(&collected_entries); + // Only strip the common prefix for GitHub-style archives + let prefix_len: usize = if strip_prefix { + detect_common_prefix_len(&collected_entries) + } else { + 0 + }; for (normalized_path, contents) in &collected_entries { let stripped_path = &normalized_path[prefix_len..]; @@ -811,6 +818,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: Some(workspace_root.clone()), default_output_directory: Some(workspace_root), project_index, + strip_zip_prefix: false, _temp_dir: None, }) } @@ -843,6 +851,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: None, default_output_directory, project_index, + strip_zip_prefix: false, _temp_dir: None, }) } @@ -869,6 +878,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: Some(canonical_root.clone()), default_output_directory: Some(canonical_root), project_index, + strip_zip_prefix: false, _temp_dir: None, }) } @@ -901,7 +911,9 @@ fn analyze_input_path(input: Option<&Path>) -> Result AppFailure::system(format!("Failed to write temp archive: {error}")) })?; - let project_index: ProjectIndex = analyze_zip(&zip_bytes).map_err(map_analyze_error)?; + // GitHub archives nest files under {repo}-{ref}/, strip that prefix + let project_index: ProjectIndex = + analyze_zip_strip_prefix(&zip_bytes).map_err(map_analyze_error)?; // If URL pointed to a specific file (/blob/), use it as implicit entry let explicit_entry: Option = if parsed.is_file_reference { @@ -920,6 +932,7 @@ fn analyze_input_path(input: Option<&Path>) -> Result workspace_root: None, default_output_directory: Some(env::current_dir().unwrap_or_default()), project_index, + strip_zip_prefix: true, _temp_dir: Some(temp_dir), }) } @@ -2477,6 +2490,9 @@ struct AnalyzedInput { workspace_root: Option, default_output_directory: Option, project_index: ProjectIndex, + /// Strip common prefix from ZIP paths during materialization. + /// Enabled for GitHub archive downloads where files are nested under `{repo}-{ref}/`. + strip_zip_prefix: bool, /// Keeps temporary directory alive for the duration of analysis/conversion. /// Used by GitHub URL downloads to hold the temp archive file. _temp_dir: Option,