Skip to content

Commit 6b83aea

Browse files
committed
Add test timeouts and fix error propagation
Prevent indefinite test hangs by adding 120-second timeout and duration reporting. Fix silent error drops by propagating errors from background write goroutines to PanicAbort channel. Check for abort in sleepWhileTrue loop and handle its error in cutOver.
1 parent b2d0f6a commit 6b83aea

3 files changed

Lines changed: 83 additions & 18 deletions

File tree

go/logic/migrator.go

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,10 @@ func NewMigrator(context *base.MigrationContext, appVersion string) *Migrator {
124124
// (or fails with error)
125125
func (this *Migrator) sleepWhileTrue(operation func() (bool, error)) error {
126126
for {
127+
// Check for abort before continuing
128+
if err := this.checkAbort(); err != nil {
129+
return err
130+
}
127131
shouldSleep, err := operation()
128132
if err != nil {
129133
return err
@@ -539,7 +543,12 @@ func (this *Migrator) Migrate() (err error) {
539543
if err := this.hooksExecutor.onBeforeRowCopy(); err != nil {
540544
return err
541545
}
542-
go this.executeWriteFuncs()
546+
go func() {
547+
if err := this.executeWriteFuncs(); err != nil {
548+
// Send error to PanicAbort to trigger abort
549+
_ = base.SendWithContext(this.migrationContext.GetContext(), this.migrationContext.PanicAbort, err)
550+
}
551+
}()
543552
go this.iterateChunks()
544553
this.migrationContext.MarkRowCopyStartTime()
545554
go this.initiateStatus()
@@ -679,7 +688,12 @@ func (this *Migrator) Revert() error {
679688

680689
this.initiateThrottler()
681690
go this.initiateStatus()
682-
go this.executeDMLWriteFuncs()
691+
go func() {
692+
if err := this.executeDMLWriteFuncs(); err != nil {
693+
// Send error to PanicAbort to trigger abort
694+
_ = base.SendWithContext(this.migrationContext.GetContext(), this.migrationContext.PanicAbort, err)
695+
}
696+
}()
683697

684698
this.printStatus(ForcePrintStatusRule)
685699
var retrier func(func() error, ...bool) error
@@ -755,7 +769,7 @@ func (this *Migrator) cutOver() (err error) {
755769

756770
this.migrationContext.MarkPointOfInterest()
757771
this.migrationContext.Log.Debugf("checking for cut-over postpone")
758-
this.sleepWhileTrue(
772+
if err := this.sleepWhileTrue(
759773
func() (bool, error) {
760774
heartbeatLag := this.migrationContext.TimeSinceLastHeartbeatOnChangelog()
761775
maxLagMillisecondsThrottle := time.Duration(atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) * time.Millisecond
@@ -783,7 +797,9 @@ func (this *Migrator) cutOver() (err error) {
783797
}
784798
return false, nil
785799
},
786-
)
800+
); err != nil {
801+
return err
802+
}
787803
atomic.StoreInt64(&this.migrationContext.IsPostponingCutOver, 0)
788804
this.migrationContext.MarkPointOfInterest()
789805
this.migrationContext.Log.Debugf("checking for cut-over postpone: complete")

localtests/panic-on-warnings-update-pk-with-duplicate-on-new-unique-index/test.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,22 @@ ghost_pid=$!
1818

1919
# Wait for row copy to complete
2020
echo_dot
21+
row_copy_complete=false
2122
for i in {1..30}; do
22-
grep -q "Row copy complete" $test_logfile && break
23+
if grep -q "Row copy complete" $test_logfile; then
24+
row_copy_complete=true
25+
break
26+
fi
2327
ps -p $ghost_pid > /dev/null || { echo; echo "ERROR gh-ost exited early"; rm -f $postpone_flag_file; return 1; }
2428
sleep 1; echo_dot
2529
done
2630

31+
if ! $row_copy_complete; then
32+
echo; echo "ERROR row copy did not complete within expected time"
33+
rm -f $postpone_flag_file
34+
return 1
35+
fi
36+
2737
# Inject conflicting SQL after row copy (UPDATE with PK change creates DELETE+INSERT in binlog)
2838
echo_dot
2939
gh-ost-test-mysql-master test -e "update gh_ost_test set id = 200, email = 'alice@example.com' where id = 2"

localtests/test.sh

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ replica_port=
3030
original_sql_mode=
3131
current_gtid_mode=
3232
sysbench_pid=
33+
test_timeout=120
34+
test_failure_log_tail_lines=50
3335

3436
OPTIND=1
3537
while getopts "b:s:dtg" OPTION; do
@@ -168,16 +170,20 @@ build_ghost_command() {
168170
--execute ${extra_args[@]}"
169171
}
170172

173+
print_log_excerpt() {
174+
echo "=== Last $test_failure_log_tail_lines lines of $test_logfile ==="
175+
tail -n $test_failure_log_tail_lines $test_logfile
176+
echo "=== End log excerpt ==="
177+
}
178+
171179
validate_expected_failure() {
172180
# Check if test expected to fail and validate error message
173181
# Expects: tests_path, test_name, execution_result, test_logfile
174182
if [ -f $tests_path/$test_name/expect_failure ]; then
175183
if [ $execution_result -eq 0 ]; then
176184
echo
177185
echo "ERROR $test_name execution was expected to exit on error but did not."
178-
echo "=== Last 50 lines of $test_logfile ==="
179-
tail -n 50 $test_logfile
180-
echo "=== End log excerpt ==="
186+
print_log_excerpt
181187
return 1
182188
fi
183189
if [ -s $tests_path/$test_name/expect_failure ]; then
@@ -188,9 +194,7 @@ validate_expected_failure() {
188194
fi
189195
echo
190196
echo "ERROR $test_name execution was expected to exit with error message '${expected_error_message}' but did not."
191-
echo "=== Last 50 lines of $test_logfile ==="
192-
tail -n 50 $test_logfile
193-
echo "=== End log excerpt ==="
197+
print_log_excerpt
194198
return 1
195199
fi
196200
# 'expect_failure' file has no content. We generally agree that the failure is correct
@@ -327,10 +331,32 @@ test_single() {
327331

328332
# Check for custom test script
329333
if [ -f $tests_path/$test_name/test.sh ]; then
330-
# Source the custom test script which can override default behavior
331-
# It has access to all variables and functions from this script
332-
source $tests_path/$test_name/test.sh
333-
return $?
334+
# Run the custom test script in a subshell with timeout monitoring
335+
# The subshell inherits all functions and variables from the current shell
336+
(source $tests_path/$test_name/test.sh) &
337+
test_pid=$!
338+
339+
# Monitor the test with timeout
340+
timeout_counter=0
341+
while kill -0 $test_pid 2>/dev/null; do
342+
if [ $timeout_counter -ge $test_timeout ]; then
343+
kill -TERM $test_pid 2>/dev/null
344+
sleep 1
345+
kill -KILL $test_pid 2>/dev/null
346+
wait $test_pid 2>/dev/null
347+
echo
348+
echo "ERROR $test_name execution timed out"
349+
print_log_excerpt
350+
return 1
351+
fi
352+
sleep 1
353+
((timeout_counter++))
354+
done
355+
356+
# Get the exit code
357+
wait $test_pid 2>/dev/null
358+
execution_result=$?
359+
return $execution_result
334360
fi
335361

336362
# test with sysbench oltp write load
@@ -358,11 +384,19 @@ test_single() {
358384
echo_dot
359385
echo $cmd >$exec_command_file
360386
echo_dot
361-
bash $exec_command_file >$test_logfile 2>&1
387+
timeout $test_timeout bash $exec_command_file >$test_logfile 2>&1
362388

363389
execution_result=$?
364390
cleanup
365391

392+
# Check for timeout (exit code 124)
393+
if [ $execution_result -eq 124 ]; then
394+
echo
395+
echo "ERROR $test_name execution timed out"
396+
print_log_excerpt
397+
return 1
398+
fi
399+
366400
if [ -f $tests_path/$test_name/sql_mode ]; then
367401
gh-ost-test-mysql-master --default-character-set=utf8mb4 test -e "set @@global.sql_mode='${original_sql_mode}'"
368402
gh-ost-test-mysql-replica --default-character-set=utf8mb4 test -e "set @@global.sql_mode='${original_sql_mode}'"
@@ -441,14 +475,19 @@ test_all() {
441475
test_dirs=$(find "$tests_path" -mindepth 1 -maxdepth 1 ! -path . -type d | grep "$test_pattern" | sort)
442476
while read -r test_dir; do
443477
test_name=$(basename "$test_dir")
478+
local test_start_time=$(date +%s)
444479
if ! test_single "$test_name"; then
480+
local test_end_time=$(date +%s)
481+
local test_duration=$((test_end_time - test_start_time))
445482
create_statement=$(gh-ost-test-mysql-replica test -t -e "show create table ${ghost_table_name} \G")
446483
echo "$create_statement" >>$test_logfile
447-
echo "+ FAIL"
484+
echo "+ FAIL (${test_duration}s)"
448485
return 1
449486
else
487+
local test_end_time=$(date +%s)
488+
local test_duration=$((test_end_time - test_start_time))
450489
echo
451-
echo "+ pass"
490+
echo "+ pass (${test_duration}s)"
452491
fi
453492
mysql_version="$(gh-ost-test-mysql-replica -e "select @@version")"
454493
replica_terminology="slave"

0 commit comments

Comments
 (0)