Skip to content

Commit b9652c3

Browse files
yosefbs and ybs-me authored
Add retry logic for instant DDL on lock wait timeout (#1651)
* Add retry logic for instant DDL on lock wait timeout

  When attempting instant DDL, a lock wait timeout (errno 1205) may occur if a long-running transaction holds a metadata lock. Rather than failing immediately, attempt the operation up to 5 times in total (the initial try plus up to 4 retries) with linear backoff. Non-timeout errors (e.g. ALGORITHM=INSTANT not supported) still return immediately without retrying.

* Fix int-to-Duration type mismatch in retry backoff

---------

Co-authored-by: ybs-me <yosef.bensimchon@melio.com>
1 parent 67cc636 commit b9652c3

2 files changed

Lines changed: 94 additions & 1 deletion

File tree

go/logic/applier.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,32 @@ func (this *Applier) AttemptInstantDDL() error {
305305
return err
306306
}
307307
// We don't need a trx, because for instant DDL the SQL mode doesn't matter.
308-
_, err := this.db.Exec(query)
308+
return retryOnLockWaitTimeout(func() error {
309+
_, err := this.db.Exec(query)
310+
return err
311+
}, this.migrationContext.Log)
312+
}
313+
314+
// retryOnLockWaitTimeout retries the given operation on MySQL lock wait timeout
315+
// (errno 1205). Non-timeout errors return immediately. This is used for instant
316+
// DDL attempts where the operation may be blocked by a long-running transaction.
317+
func retryOnLockWaitTimeout(operation func() error, logger base.Logger) error {
318+
const maxRetries = 5
319+
var err error
320+
for i := 0; i < maxRetries; i++ {
321+
if i != 0 {
322+
logger.Infof("Retrying after lock wait timeout (attempt %d/%d)", i+1, maxRetries)
323+
RetrySleepFn(time.Duration(i) * 5 * time.Second)
324+
}
325+
err = operation()
326+
if err == nil {
327+
return nil
328+
}
329+
var mysqlErr *drivermysql.MySQLError
330+
if !errors.As(err, &mysqlErr) || mysqlErr.Number != 1205 {
331+
return err
332+
}
333+
}
309334
return err
310335
}
311336

go/logic/applier_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ package logic
88
import (
99
"context"
1010
gosql "database/sql"
11+
"errors"
1112
"strings"
1213
"testing"
14+
"time"
1315

16+
drivermysql "github.com/go-sql-driver/mysql"
1417
"github.com/stretchr/testify/require"
1518
"github.com/stretchr/testify/suite"
1619

@@ -198,6 +201,71 @@ func TestApplierInstantDDL(t *testing.T) {
198201
})
199202
}
200203

204+
func TestRetryOnLockWaitTimeout(t *testing.T) {
205+
oldRetrySleepFn := RetrySleepFn
206+
defer func() { RetrySleepFn = oldRetrySleepFn }()
207+
RetrySleepFn = func(d time.Duration) {} // no-op for tests
208+
209+
logger := base.NewMigrationContext().Log
210+
211+
lockWaitTimeoutErr := &drivermysql.MySQLError{Number: 1205, Message: "Lock wait timeout exceeded"}
212+
nonRetryableErr := &drivermysql.MySQLError{Number: 1845, Message: "ALGORITHM=INSTANT is not supported"}
213+
214+
t.Run("success on first attempt", func(t *testing.T) {
215+
calls := 0
216+
err := retryOnLockWaitTimeout(func() error {
217+
calls++
218+
return nil
219+
}, logger)
220+
require.NoError(t, err)
221+
require.Equal(t, 1, calls)
222+
})
223+
224+
t.Run("retry on lock wait timeout then succeed", func(t *testing.T) {
225+
calls := 0
226+
err := retryOnLockWaitTimeout(func() error {
227+
calls++
228+
if calls < 3 {
229+
return lockWaitTimeoutErr
230+
}
231+
return nil
232+
}, logger)
233+
require.NoError(t, err)
234+
require.Equal(t, 3, calls)
235+
})
236+
237+
t.Run("non-retryable error returns immediately", func(t *testing.T) {
238+
calls := 0
239+
err := retryOnLockWaitTimeout(func() error {
240+
calls++
241+
return nonRetryableErr
242+
}, logger)
243+
require.ErrorIs(t, err, nonRetryableErr)
244+
require.Equal(t, 1, calls)
245+
})
246+
247+
t.Run("non-mysql error returns immediately", func(t *testing.T) {
248+
calls := 0
249+
genericErr := errors.New("connection refused")
250+
err := retryOnLockWaitTimeout(func() error {
251+
calls++
252+
return genericErr
253+
}, logger)
254+
require.ErrorIs(t, err, genericErr)
255+
require.Equal(t, 1, calls)
256+
})
257+
258+
t.Run("exhausts all retries", func(t *testing.T) {
259+
calls := 0
260+
err := retryOnLockWaitTimeout(func() error {
261+
calls++
262+
return lockWaitTimeoutErr
263+
}, logger)
264+
require.ErrorIs(t, err, lockWaitTimeoutErr)
265+
require.Equal(t, 5, calls)
266+
})
267+
}
268+
201269
type ApplierTestSuite struct {
202270
suite.Suite
203271

0 commit comments

Comments
 (0)