From d14a98828d32479c07a983a0580d78df6b97b9fa Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 4 Jul 2026 11:11:16 +0200 Subject: [PATCH] Enforce the -E time limit inside the transfer wait cycle -E was only evaluated at per-link boundaries, so a slow or throttling server starved the check for minutes, and the smooth stop it finally requested drained the remaining transfers at server pace with no bound. back_wait now checks the deadline every cycle and, once a short grace period expires, aborts the in-flight HTTP transfers like the -T timeout path does (FTP slots stay with their owning thread). back_checkmirror's 0 return, previously dead, now carries the hard stop. Co-Authored-By: Claude Fable 5 Signed-off-by: Xavier Roche --- src/htsback.c | 71 +++++++++++++++++++++++++++++++++---- src/htsback.h | 2 ++ src/htsname.c | 54 +++++++++++++++------------- src/htsparse.c | 3 ++ src/htsparse.h | 50 ++++++++++++++------------ tests/34_local-maxtime.test | 21 +++++++++++ tests/Makefile.am | 3 +- tests/local-crawl.sh | 19 ++++++---- tests/local-server.py | 33 +++++++++++++++++ 9 files changed, 196 insertions(+), 60 deletions(-) create mode 100644 tests/34_local-maxtime.test diff --git a/src/htsback.c b/src/htsback.c index 70bf8ce6..d2676da9 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -1359,6 +1359,18 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback, } // effacer entrée +/* Discard a cancelled mid-write .delayed placeholder (unusable across runs). 
*/ +static void back_delayed_discard(httrackp *opt, lien_back *back) { + if (back->r.out != NULL) { + fclose(back->r.out); + back->r.out = NULL; + } + back->r.is_write = 0; + if (opt != NULL) + url_savename_refname_remove(opt, back->url_adr, back->url_fil); + (void) UNLINK(back->url_sav); +} + int back_delete(httrackp * opt, cache_back * cache, struct_back * sback, const int p) { lien_back *const back = sback->lnk; @@ -1366,6 +1378,12 @@ int back_delete(httrackp * opt, cache_back * cache, struct_back * sback, assertf(p >= 0 && p < back_max); if (p >= 0 && p < sback->count) { // on sait jamais.. + /* mid-write cancel: drop a .delayed placeholder; real-named partials + survive for resume (--continue) */ + if (back[p].r.is_write && IS_DELAYED_EXT(back[p].url_sav) && + (back[p].status != STATUS_READY || back[p].r.statuscode <= 0)) { + back_delayed_discard(opt, &back[p]); + } // Vérificateur d'intégrité #if DEBUG_CHECKINT _CHECKINT(&back[p], "Appel back_delete") @@ -2419,6 +2437,34 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache, back_clean(opt, cache, sback); #endif + /* Time limit exceeded past grace: abort in-flight transfers so no wait loop + starves (#481). FTP slots stay, their thread owns the socket. 
*/ + if (!back_checkmirror(opt)) { + int aborted = 0; + unsigned int i; + + for (i = 0; i < (unsigned int) back_max; i++) { + if (back[i].status > 0 && back[i].status < STATUS_FTP_TRANSFER) { + if (back[i].r.soc != INVALID_SOCKET) { + deletehttp(&back[i].r); + } + back[i].r.soc = INVALID_SOCKET; + /* drop a .delayed placeholder; real partials survive for resume */ + if (back[i].r.is_write && IS_DELAYED_EXT(back[i].url_sav)) + back_delayed_discard(opt, &back[i]); + back[i].r.statuscode = STATUSCODE_TIMEOUT; + strcpybuff(back[i].r.msg, "Mirror Time Out"); + back[i].status = STATUS_READY; + back_set_finished(sback, i); + aborted++; + } + } + if (aborted > 0) + hts_log_print(opt, LOG_WARNING, + "time limit reached, %d transfer(s) aborted", aborted); + return; + } + // recevoir tant qu'il y a des données (avec un maximum de max_loop boucles) do_wait = 0; gestion_timeout = 0; @@ -4164,6 +4210,11 @@ int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize) return 1; } +/* Grace left to the smooth stop before in-flight transfers are aborted. */ +static int back_maxtime_grace(const int maxtime) { + return maximum(5, minimum(30, maxtime / 10)); +} + int back_checkmirror(httrackp * opt) { // Check max size if ((opt->maxsite > 0) && (HTS_STAT.stat_bytes >= opt->maxsite)) { @@ -4180,13 +4231,19 @@ int back_checkmirror(httrackp * opt) { */ } // Check max time - if ((opt->maxtime > 0) - && ((time_local() - HTS_STAT.stat_timestart) >= opt->maxtime)) { - if (!opt->state.stop) { /* not yet stopped */ - hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up", - opt->maxtime); - /* cancel mirror smoothly */ - hts_request_stop(opt, 0); + if (opt->maxtime > 0) { + const TStamp elapsed = time_local() - HTS_STAT.stat_timestart; + + if (elapsed >= opt->maxtime) { + if (!opt->state.stop) { /* not yet stopped */ + hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. 
giving up", + opt->maxtime); + /* cancel mirror smoothly */ + hts_request_stop(opt, 0); + } + /* smooth stop starved past the grace period: stop waiting (#481) */ + if (elapsed - opt->maxtime >= back_maxtime_grace(opt->maxtime)) + return 0; } } return 1; /* Ok, go on */ diff --git a/src/htsback.h b/src/htsback.h index 618d0490..db59ba51 100644 --- a/src/htsback.h +++ b/src/htsback.h @@ -136,6 +136,8 @@ void back_solve(httrackp * opt, lien_back * sback); int host_wait(httrackp * opt, lien_back * sback); #endif int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize); +/* Enforce -M/-E quotas: requests a smooth stop when reached; returns 0 once + the -E deadline overran its grace period (callers must stop waiting). */ int back_checkmirror(httrackp * opt); #endif diff --git a/src/htsname.c b/src/htsname.c index fa865380..7b0c2f5d 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -74,30 +74,36 @@ static const char *hts_tbdev[] = { "" }; -#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \ - int prev = opt->state._hts_in_html_parsing; \ - while(back_pluggable_sockets_strict(sback, opt) <= 0) { \ - opt->state. _hts_in_html_parsing = 6; \ - /* Wait .. 
*/ \ - back_wait(sback,opt,cache,0); \ - /* Transfer rate */ \ - engine_stats(); \ - /* Refresh various stats */ \ - HTS_STAT.stat_nsocket=back_nsoc(sback); \ - HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \ - HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \ - HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \ - HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \ - HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \ - /* Check */ \ - { \ - if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count,-1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \ - return -1; \ - } \ - } \ - } \ - opt->state._hts_in_html_parsing = prev; \ -} while(0) +#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() \ + do { \ + int prev = opt->state._hts_in_html_parsing; \ + while (back_pluggable_sockets_strict(sback, opt) <= 0) { \ + opt->state._hts_in_html_parsing = 6; \ + /* Wait .. */ \ + back_wait(sback, opt, cache, 0); \ + /* time limit (-E) exceeded: stop waiting for a socket (#481) */ \ + if (!back_checkmirror(opt)) \ + break; \ + /* Transfer rate */ \ + engine_stats(); \ + /* Refresh various stats */ \ + HTS_STAT.stat_nsocket = back_nsoc(sback); \ + HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \ + HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \ + HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \ + HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \ + HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \ + /* Check */ \ + { \ + if (!RUN_CALLBACK7( \ + opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \ + (int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \ + return -1; \ + } \ + } \ + } \ + opt->state._hts_in_html_parsing = prev; \ + } while (0) /* Strip all // */ static void cleanDoubleSlash(char *s) { diff --git a/src/htsparse.c b/src/htsparse.c index c1d645c2..33f5c522 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -4077,6 +4077,9 @@ void 
hts_mirror_process_user_interaction(htsmoduleStruct * str, while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) { // on fait la pause.. opt->state._hts_in_html_parsing = 6; back_wait(sback, opt, cache, HTS_STAT.stat_timestart); + /* time limit (-E) exceeded: stop waiting for a socket (#481) */ + if (!back_checkmirror(opt)) + break; // Transfer rate engine_stats(); diff --git a/src/htsparse.h b/src/htsparse.h index 30688e86..3071c7be 100644 --- a/src/htsparse.h +++ b/src/htsparse.h @@ -175,27 +175,33 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs, /* Apply changes */ \ * str->ptr_ = ptr -#define WAIT_FOR_AVAILABLE_SOCKET() do { \ - int prev = opt->state._hts_in_html_parsing; \ - while(back_pluggable_sockets_strict(sback, opt) <= 0) { \ - opt->state._hts_in_html_parsing = 6; \ - /* Wait .. */ \ - back_wait(sback,opt,cache,0); \ - /* Transfer rate */ \ - engine_stats(); \ - /* Refresh various stats */ \ - HTS_STAT.stat_nsocket=back_nsoc(sback); \ - HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \ - HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \ - HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \ - HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \ - HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \ - /* Check */ \ - if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \ - return -1; \ - } \ - } \ - opt->state._hts_in_html_parsing = prev; \ -} while(0) +#define WAIT_FOR_AVAILABLE_SOCKET() \ + do { \ + int prev = opt->state._hts_in_html_parsing; \ + while (back_pluggable_sockets_strict(sback, opt) <= 0) { \ + opt->state._hts_in_html_parsing = 6; \ + /* Wait .. 
*/ \ + back_wait(sback, opt, cache, 0); \ + /* time limit (-E) exceeded: stop waiting for a socket (#481) */ \ + if (!back_checkmirror(opt)) \ + break; \ + /* Transfer rate */ \ + engine_stats(); \ + /* Refresh various stats */ \ + HTS_STAT.stat_nsocket = back_nsoc(sback); \ + HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \ + HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \ + HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \ + HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \ + HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \ + /* Check */ \ + if (!RUN_CALLBACK7( \ + opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \ + (int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \ + return -1; \ + } \ + } \ + opt->state._hts_in_html_parsing = prev; \ + } while (0) #endif diff --git a/tests/34_local-maxtime.test b/tests/34_local-maxtime.test new file mode 100644 index 00000000..92ed3fe3 --- /dev/null +++ b/tests/34_local-maxtime.test @@ -0,0 +1,21 @@ +#!/bin/bash +# +# -E time limit (#481): server pages trickle for minutes; the engine must stop +# on its own at -E plus grace, aborting the in-flight transfers. 
+ +set -euo pipefail + +: "${top_srcdir:=..}" + +# cancelled crawls can orphan .delayed placeholders (#483): skip that audit +start=$(date +%s) +bash "$top_srcdir/tests/local-crawl.sh" \ + --skip-delayed-audit \ + --log-found 'More than 2 seconds passed' \ + httrack 'BASEURL/trickle/index.html' -E2 -c4 +wall=$(($(date +%s) - start)) +# hard stop is due at -E2 + 5s grace; near TRICKLE_SECONDS means it never fired +if [ "$wall" -ge 30 ]; then + echo "crawl took ${wall}s, -E hard stop did not engage" >&2 + exit 1 +fi diff --git a/tests/Makefile.am b/tests/Makefile.am index b309726b..4074fd94 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -96,6 +96,7 @@ TESTS = \ 30_local-fragment-link.test \ 31_local-javaclass.test \ 32_local-cdispo.test \ - 33_local-delayed.test + 33_local-delayed.test \ + 34_local-maxtime.test CLEANFILES = check-network_sh.cache diff --git a/tests/local-crawl.sh b/tests/local-crawl.sh index 5ecac78e..581df608 100755 --- a/tests/local-crawl.sh +++ b/tests/local-crawl.sh @@ -92,6 +92,7 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create # --- parse leading control flags -------------------------------------------- declare -a audit=() declare -a cookies=() +skip_delayed_audit="" scheme=http pos=0 args=("$@") @@ -116,6 +117,9 @@ while test "$pos" -lt "$nargs"; do pos=$((pos + 1)) cookies+=("${args[$pos]}") ;; + --skip-delayed-audit) + skip_delayed_audit=1 + ;; --errors | --files) audit+=("${args[$pos]}" "${args[$((pos + 1))]}") pos=$((pos + 1)) @@ -246,12 +250,15 @@ done test -n "$hostroot" || die "could not find host root under $out" debug "host root: $hostroot" -# A completed crawl must leave no .delayed temporaries (issue #107) -info "checking for leftover .delayed files" -leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5) -if test -z "$leftovers"; then result "OK"; else - result "leftover: $leftovers" - exit 1 +# A completed crawl must leave no .delayed temporaries (issue #107). 
+# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483) +if test -z "$skip_delayed_audit"; then + info "checking for leftover .delayed files" + leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5) + if test -z "$leftovers"; then result "OK"; else + result "leftover: $leftovers" + exit 1 + fi fi # --- audit ------------------------------------------------------------------- diff --git a/tests/local-server.py b/tests/local-server.py index 7fa9503d..60eb9576 100755 --- a/tests/local-server.py +++ b/tests/local-server.py @@ -464,6 +464,30 @@ def route_delayed_notype(self): def route_delayed_empty(self): self.send_raw(b"", "text/html") # 200 + Content-Length: 0 + # -E time-limit (#481): pages that trickle far longer than any -E budget, + # so only an engine-side abort can end the crawl. + TRICKLE_SECONDS = 60 + + def route_trickle_index(self): + self.send_html( + "".join('\t<a href="p%d.bin">p%d</a>\n' % (i, i) for i in range(8)) + ) + + def route_trickle_page(self): + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(2 * self.TRICKLE_SECONDS)) + self.end_headers() + if self.command == "HEAD": + return + try: + for _ in range(self.TRICKLE_SECONDS): + self.wfile.write(b"xy") + self.wfile.flush() + time.sleep(1.0) + except OSError: + pass + ROUTES = { "/cookies/entrance.php": route_entrance, "/cookies/second.php": route_second, @@ -509,6 +533,15 @@ def route_delayed_empty(self): "/cdispo/fetch.php": route_cdispo, "/cdispo/evil.php": route_cdispo, "/delayed/index.html": route_delayed_index, + "/trickle/index.html": route_trickle_index, + "/trickle/p0.bin": route_trickle_page, + "/trickle/p1.bin": route_trickle_page, + "/trickle/p2.bin": route_trickle_page, + "/trickle/p3.bin": route_trickle_page, + "/trickle/p4.bin": route_trickle_page, + "/trickle/p5.bin": route_trickle_page, + "/trickle/p6.bin": route_trickle_page, + "/trickle/p7.bin": route_trickle_page, "/delayed/noloc.php": 
route_delayed_noloc, "/delayed/selfloop.php": route_delayed_selfloop, "/delayed/redir.php": route_delayed_redir,