diff --git a/server.js b/server.js index cddb5b7..13ff2ee 100644 --- a/server.js +++ b/server.js @@ -12,6 +12,8 @@ const { rateLimit } = require("express-rate-limit"); const slowDown = require("express-slow-down"); const helmet = require("helmet"); const jwt = require("jsonwebtoken"); +const { pipeline } = require("stream/promises"); +const { createReadStream, createWriteStream } = require("fs"); const { askSchema, askCredentialSchema, @@ -526,6 +528,11 @@ const FILE_RETENTION_MS = parseInt(process.env.FILE_RETENTION_MS || "3600000", 1 const CLEANUP_INTERVAL_MS = parseInt(process.env.CLEANUP_INTERVAL_MS || "3600000", 10); const startUploadsCleanup = () => { + // Skip cleanup interval during tests to prevent ENOENT errors after test completion + if (process.env.NODE_TEST === "1") { + return; + } + const intervalId = setInterval(async () => { try { const files = await fsPromises.readdir(UPLOADS_DIR); @@ -1193,8 +1200,9 @@ app.post( } // All validation passed — safe to open the file stream for forwarding. + const fileStream = fs.createReadStream(absoluteFilePath); const formData = { - file: fs.createReadStream(absoluteFilePath), + file: fileStream, original_filename: req.file.originalname, }; if (sessionId && sessionSecret) { @@ -1205,7 +1213,8 @@ app.post( const controller = new AbortController(); const onClientDisconnect = () => { controller.abort(); - cleanupFile(uploadedFilePath); + fileStream.destroy(); + void cleanupFile(uploadedFilePath); }; req.on("close", onClientDisconnect); @@ -1323,57 +1332,122 @@ app.post("/process-from-url", uploadLimiter, requireSupabaseAuth, async (req, re .replace(/[^a-zA-Z0-9._\- ]/g, "_") .slice(0, 200); + let tempFilePath = null; + let downloadController = null; + try { - // Download the PDF from the remote URL into a Buffer - let pdfBuffer; - try { - const downloadUrl = new URL(trustedSupabaseOrigin); - downloadUrl.pathname = parsedUrl.pathname; - downloadUrl.search = parsedUrl.search; - - const dlResponse = await axios.get(downloadUrl.toString(), { - responseType: "arraybuffer", - timeout: 30000, - maxContentLength: 50 * 1024 * 1024, // 50 MB cap - }); - pdfBuffer = Buffer.from(dlResponse.data); - } catch (dlErr) { - console.error("Failed to download PDF from URL:", dlErr.message); - return res.status(502).json({ error: "Could not download PDF from the provided URL." }); + // Generate a unique temporary file path + const tempId = crypto.randomUUID(); + tempFilePath = path.join(UPLOADS_DIR, `${tempId}.pdf`); + + // Download the PDF from the remote URL using streaming + const downloadUrl = new URL(trustedSupabaseOrigin); + downloadUrl.pathname = parsedUrl.pathname; + downloadUrl.search = parsedUrl.search; + + downloadController = new AbortController(); + const downloadTimeout = setTimeout(() => { + downloadController.abort(); + }, 30000); // 30 second timeout for download + + let bytesDownloaded = 0; + const MAX_DOWNLOAD_SIZE = MAX_PDF_SIZE_BYTES; + + // Create a size-tracking transform stream + const { Transform } = require('stream'); + const sizeTracker = new Transform({ + transform(chunk, encoding, callback) { + bytesDownloaded += chunk.length; + if (bytesDownloaded > MAX_DOWNLOAD_SIZE) { + callback(new Error(`File size exceeds maximum allowed size of ${MAX_DOWNLOAD_SIZE} bytes`)); + return; + } + callback(null, chunk); + } + }); + + // Stream the download to a temporary file + const dlResponse = await axios.get(downloadUrl.toString(), { + responseType: "stream", + signal: downloadController.signal, + timeout: 30000, + maxRedirects: 5, + }); + + clearTimeout(downloadTimeout); + + // Verify PDF magic bytes from the first chunk + const magicByteValidator = new Transform({ + transform(chunk, encoding, callback) { + // Only check the first chunk + if (!this._validated) { + this._validated = true; + if (chunk.length < 4 || chunk.slice(0, 4).toString() !== "%PDF") { + callback(new Error("The file at the provided URL is not a valid PDF.")); + return; + } + } + callback(null, chunk); + } + }); + + // Pipeline: download stream -> magic byte validator -> size tracker -> file + await pipeline( + dlResponse.data, + magicByteValidator, + sizeTracker, + createWriteStream(tempFilePath), + { end: true } + ); + + // Verify the file was downloaded successfully + const stats = await fsPromises.stat(tempFilePath); + if (stats.size === 0) { + throw new Error("Downloaded file is empty"); } - // Verify PDF magic bytes - if (pdfBuffer.slice(0, 4).toString() !== "%PDF") { - return res.status(415).json({ error: "The file at the provided URL is not a valid PDF." }); + if (stats.size > MAX_DOWNLOAD_SIZE) { + throw new Error(`File size exceeds maximum allowed size of ${MAX_DOWNLOAD_SIZE} bytes`); } // Build multipart form and forward to RAG service - // Uses axios.postForm with a FormData blob — no extra form-data package needed - const form = new FormData(); - const pdfBlob = new Blob([pdfBuffer], { type: 'application/pdf' }); - form.append("file", pdfBlob, safeFilename); - form.append("original_filename", safeFilename); + // Use streaming approach with file path instead of loading into memory + const formData = { + file: createReadStream(tempFilePath), + original_filename: safeFilename, + }; // Optionally extend an existing session const resolvedSessionSecret = await resolveSessionSecret(req, session_id, session_secret); if (session_id && resolvedSessionSecret) { - form.append("session_id", session_id); - form.append("session_secret", resolvedSessionSecret); + formData.session_id = session_id; + formData.session_secret = resolvedSessionSecret; } - const ragResponse = await axios.post( + const ragController = new AbortController(); + const ragTimeout = setTimeout(() => { + ragController.abort(); + }, 120000); // 2 min timeout for RAG processing + + const ragResponse = await axios.postForm( `${RAG_SERVICE_URL}/process-pdf`, - form, + formData, { headers: ragAuthHeaders(), - timeout: 120000, // 2 min — embedding generation can be slow + timeout: 120000, + signal: ragController.signal, maxContentLength: Infinity, maxBodyLength: Infinity, } ); + clearTimeout(ragTimeout); + await setSessionSecretCookie(res, ragResponse.data.session_id || session_id, ragResponse.data.session_secret || resolvedSessionSecret); + // Clean up the temporary file + await cleanupFile(tempFilePath); + return res.json({ message: "PDF processed and indexed successfully.", session_id: ragResponse.data.session_id, @@ -1381,8 +1455,52 @@ app.post("/process-from-url", uploadLimiter, requireSupabaseAuth, async (req, re documents: ragResponse.data.documents || [], }); } catch (err) { - const statusCode = - err.response?.status || (err.code === "ECONNREFUSED" ? 502 : 500); + // Clean up the temporary file on error + if (tempFilePath) { + try { + await cleanupFile(tempFilePath); + } catch (cleanupErr) { + console.error("Failed to clean up temporary file:", cleanupErr.message); + } + } + + // Handle specific error cases + if (err.code === 'ABORTED') { + console.error("process-from-url failed: Request timed out"); + return res.status(504).json({ error: "Request timed out during download or processing." }); + } + + if (err.message && err.message.includes('File size exceeds')) { + console.error("process-from-url failed: File too large"); + return res.status(413).json({ error: err.message }); + } + + if (err.message && err.message.includes('not a valid PDF')) { + console.error("process-from-url failed: Invalid PDF format"); + return res.status(415).json({ error: "The file at the provided URL is not a valid PDF." }); + } + + if (err.code === 'ECONNABORTED' || err.code === 'ETIMEDOUT') { + console.error("process-from-url failed: Connection timeout"); + return res.status(504).json({ error: "Connection timeout while downloading PDF." }); + } + + if (err.code === 'ENOTFOUND' || err.code === 'ECONNREFUSED') { + console.error("process-from-url failed: Cannot reach remote server"); + return res.status(502).json({ error: "Could not download PDF from the provided URL." }); + } + + if (err.response) { + const statusCode = err.response.status; + const details = extractServiceDetails(err, "RAG processing failed"); + console.error("process-from-url failed:", details); + return res.status(statusCode).json({ + error: typeof details === "string" ? details : "PDF processing failed", + details: isDevelopment ? details : "Internal processing error", + }); + } + + const statusCode = err.code === "ECONNREFUSED" ? 502 : 500; const details = extractServiceDetails(err, "RAG processing failed"); console.error("process-from-url failed:", details); diff --git a/server.test.js b/server.test.js index 2454893..70e509f 100644 --- a/server.test.js +++ b/server.test.js @@ -494,14 +494,17 @@ describe("route error responses", () => { test("POST /process-from-url keeps protocol-relative paths on the trusted host", async () => { const originalGet = axios.get; - const originalPost = axios.post; + const originalPostForm = axios.postForm; let requestedDownloadUrl = null; axios.get = async (url) => { requestedDownloadUrl = url; - return { data: Buffer.from("%PDF-1.4\n%%EOF") }; + const { PassThrough } = require("node:stream"); + const stream = new PassThrough(); + stream.end(Buffer.from("%PDF-1.4\n%%EOF")); + return { data: stream }; }; - axios.post = async () => ({ + axios.postForm = async () => ({ data: { session_id: "550e8400-e29b-41d4-a716-446655440000", session_secret: "session-secret-123", @@ -534,20 +537,23 @@ describe("route error responses", () => { assert.equal(downloadUrl.search, "?download=1"); } finally { axios.get = originalGet; - axios.post = originalPost; + axios.postForm = originalPostForm; } }); test("POST /process-from-url accepts whitespace-trimmed Supabase URLs", async () => { const originalGet = axios.get; - const originalPost = axios.post; + const originalPostForm = axios.postForm; let requestedDownloadUrl = null; axios.get = async (url) => { requestedDownloadUrl = url; - return { data: Buffer.from("%PDF-1.4\n%%EOF") }; + const { PassThrough } = require("node:stream"); + const stream = new PassThrough(); + stream.end(Buffer.from("%PDF-1.4\n%%EOF")); + return { data: stream }; }; - axios.post = async () => ({ + axios.postForm = async () => ({ data: { session_id: "550e8400-e29b-41d4-a716-446655440000", session_secret: "session-secret-123", @@ -578,10 +584,194 @@ describe("route error responses", () => { assert.equal(downloadUrl.pathname, "/storage/v1/object/public/docs/trimmed.pdf"); } finally { axios.get = originalGet; - axios.post = originalPost; + axios.postForm = originalPostForm; + } + }); + + test("POST /process-from-url rejects non-PDF files via magic byte validation", async () => { + const originalGet = axios.get; + const originalPostForm = axios.postForm; + + axios.get = async (url) => { + const { PassThrough } = require("node:stream"); + const stream = new PassThrough(); + stream.end(Buffer.from("NOT-A-PDF-FILE-CONTENT")); + return { data: stream }; + }; + axios.postForm = async () => ({ + data: { + session_id: "550e8400-e29b-41d4-a716-446655440000", + session_secret: "session-secret-123", + document: { filename: "test.pdf" }, + documents: [], + }, + }); + + const jwt = require("jsonwebtoken"); + const validToken = jwt.sign({ role: "authenticated" }, process.env.SUPABASE_JWT_SECRET); + + try { + const res = await fetch(`${baseUrl}/process-from-url`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${validToken}`, + }, + body: JSON.stringify({ + url: "https://xyz.supabase.co/storage/v1/object/public/docs/test.pdf", + filename: "test.pdf", + }), + }); + + assert.equal(res.status, 415); + const data = await res.json(); + assert.match(data.error, /not a valid PDF/i); + } finally { + axios.get = originalGet; + axios.postForm = originalPostForm; + } + }); + + test("POST /process-from-url rejects oversized files during streaming", async () => { + const originalGet = axios.get; + const originalPostForm = axios.postForm; + + axios.get = async (url) => { + const { PassThrough } = require("node:stream"); + const stream = new PassThrough(); + // Create a PDF that exceeds the default 20MB limit + const largePdf = Buffer.concat([ + Buffer.from("%PDF-1.4\n"), + Buffer.alloc(25 * 1024 * 1024, "X"), // 25MB of data + Buffer.from("%%EOF"), + ]); + stream.end(largePdf); + return { data: stream }; + }; + axios.postForm = async () => ({ + data: { + session_id: "550e8400-e29b-41d4-a716-446655440000", + session_secret: "session-secret-123", + document: { filename: "large.pdf" }, + documents: [], + }, + }); + + const jwt = require("jsonwebtoken"); + const validToken = jwt.sign({ role: "authenticated" }, process.env.SUPABASE_JWT_SECRET); + + try { + const res = await fetch(`${baseUrl}/process-from-url`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${validToken}`, + }, + body: JSON.stringify({ + url: "https://xyz.supabase.co/storage/v1/object/public/docs/large.pdf", + filename: "large.pdf", + }), + }); + + assert.equal(res.status, 413); + const data = await res.json(); + assert.match(data.error, /File size exceeds/i); + } finally { + axios.get = originalGet; + axios.postForm = originalPostForm; } }); + test("POST /process-from-url handles download timeout", async () => { + const originalGet = axios.get; + const originalPostForm = axios.postForm; + + axios.get = async (url) => { + const { PassThrough } = require("node:stream"); + const stream = new PassThrough(); + // Simulate a timeout by aborting the request + const error = new Error("ECONNABORTED"); + error.code = "ECONNABORTED"; + throw error; + }; + axios.postForm = async () => ({ + data: { + session_id: "550e8400-e29b-41d4-a716-446655440000", + session_secret: "session-secret-123", + document: { filename: "test.pdf" }, + documents: [], + }, + }); + + const jwt = require("jsonwebtoken"); + const validToken = jwt.sign({ role: "authenticated" }, process.env.SUPABASE_JWT_SECRET); + + try { + const res = await fetch(`${baseUrl}/process-from-url`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${validToken}`, + }, + body: JSON.stringify({ + url: "https://xyz.supabase.co/storage/v1/object/public/docs/test.pdf", + filename: "test.pdf", + }), + }); + + assert.equal(res.status, 504); + const data = await res.json(); + assert.match(data.error, /timeout/i); + } finally { + axios.get = originalGet; + axios.postForm = originalPostForm; + } + }); + + test("POST /process-from-url handles connection errors", async () => { + const originalGet = axios.get; + const originalPostForm = axios.postForm; + + axios.get = async (url) => { + const error = new Error("connect ECONNREFUSED"); + error.code = "ECONNREFUSED"; + throw error; + }; + axios.postForm = async () => ({ + data: { + session_id: "550e8400-e29b-41d4-a716-446655440000", + session_secret: "session-secret-123", + document: { filename: "test.pdf" }, + documents: [], + }, + }); + + const jwt = require("jsonwebtoken"); + const validToken = jwt.sign({ role: "authenticated" }, process.env.SUPABASE_JWT_SECRET); + + try { + const res = await fetch(`${baseUrl}/process-from-url`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${validToken}`, + }, + body: JSON.stringify({ + url: "https://xyz.supabase.co/storage/v1/object/public/docs/test.pdf", + filename: "test.pdf", + }), + }); + + assert.equal(res.status, 502); + const data = await res.json(); + assert.match(data.error, /Could not download PDF/i); + } finally { + axios.get = originalGet; + axios.postForm = originalPostForm; + } + }); + + test("POST /summarize with empty body returns 400", async () => { const res = await fetch(`${baseUrl}/summarize`, { method: "POST", diff --git a/src/data/users.json b/src/data/users.json index adb2d28..9fdac28 100644 --- a/src/data/users.json +++ b/src/data/users.json @@ -62,5 +62,53 @@ { "email": "testuser2-1780859135237@example.com", "password": "$2b$10$URdT8Tm/Xg7Kwi/UnSiinOWzjsqFc8QCUgpY/TeURTj2T3R1wYW8S" + }, + { + "email": "testuser-1781558159466@example.com", + "password": "$2b$10$jjIGQwRcukiTZbrPn7c8SO2ndGxWeP/P7hvwHC/i3hPSEwoK3lAdO" + }, + { + "email": "testuser2-1781558159577@example.com", + "password": "$2b$10$zSp3FrouzlCK5q2nRjxZ/.xYjdYGi5dOZGhK.enCHN2ygXjrG21FK" + }, + { + "email": "testuser-1781558718112@example.com", + "password": "$2b$10$LMw5n7lZQNO0yHYmyoYYMuvqyR4g3/ACo3./bAxFqHEH7hprcEx6a" + }, + { + "email": "testuser2-1781558718188@example.com", + "password": "$2b$10$.N4syzi3IPeubiVJmyBPXOhhVIdlYzJQ98qmRJ7bJ.Yjw9quf62xO" + }, + { + "email": "testuser-1781558853098@example.com", + "password": "$2b$10$GSTE7bRwEhGszzKt2OUoU.AAA24Fj/JSEWsRE2ROACbuUGZEsugHu" + }, + { + "email": "testuser2-1781558853174@example.com", + "password": "$2b$10$IC./adkiaR9gn/Sxwkfeq.IxX.4EPQgRoPjaJUce044dSFZvlyOdS" + }, + { + "email": "testuser-1781560500052@example.com", + "password": "$2b$10$AVJDmTmKShyy8qYd4iyEQ.FeT4kB/TDya.oe80s9hPOm0CX9tcHNq" + }, + { + "email": "testuser2-1781560500119@example.com", + "password": "$2b$10$piV18JLcapUP0suPAQ/rO.TKB0POk/N2fRCoO0aIsJqrbXli.uSTC" + }, + { + "email": "testuser-1781561062629@example.com", + "password": "$2b$10$.b6OhsYVfAZllwAenkmhSeD3gtLIwU9dLeRdPyRsvFcT0CRC286Iu" + }, + { + "email": "testuser2-1781561062693@example.com", + "password": "$2b$10$okv3K5Kt.YYpCDFn9yZwp.FrjJOXKUqQWKufliZgz/8i.X611eq0q" + }, + { + "email": "testuser-1781566160894@example.com", + "password": "$2b$10$mIoSrZo3mf2d8QAvd9aZpemRjr5HkNyXTti18fJwnvhISJwiHvzyC" + }, + { + "email": "testuser2-1781566160967@example.com", + "password": "$2b$10$G9usikE2dL4ahvEEsMavZ.BGIXA5jFXI0j.aQkeJ/LvjMNJj8B/Dm" } ] \ No newline at end of file