Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ const (
// DefaultDatabaseCacheTTL is the default value for the LocalTTL parameter for databases if not specified.
DefaultDatabaseCacheTTL = time.Second * 30

// DefaultDataStoreHealthCheckInterval is the default interval for checking whether the persistent
// data store still contains its initialization data. If data loss is detected (e.g. after a Redis
// restart), the relay will automatically repopulate the store from its in-memory snapshot.
DefaultDataStoreHealthCheckInterval = time.Second * 30

// DefaultPrometheusPort is the default value for PrometheusConfig.Port if not specified.
DefaultPrometheusPort = 8031

Expand Down Expand Up @@ -212,13 +217,14 @@ type EventsConfig struct {
// variables, individual fields are not documented here; instead, see the `README.md` section on
// configuration.
type RedisConfig struct {
Host string `conf:"REDIS_HOST"`
Port ct.OptIntGreaterThanZero
URL ct.OptURLAbsolute `conf:"REDIS_URL"`
LocalTTL ct.OptDuration `conf:"CACHE_TTL"`
TLS bool `conf:"REDIS_TLS"`
Username string `conf:"REDIS_USERNAME"`
Password string `conf:"REDIS_PASSWORD"`
Host string `conf:"REDIS_HOST"`
Port ct.OptIntGreaterThanZero
URL ct.OptURLAbsolute `conf:"REDIS_URL"`
LocalTTL ct.OptDuration `conf:"CACHE_TTL"`
TLS bool `conf:"REDIS_TLS"`
Username string `conf:"REDIS_USERNAME"`
Password string `conf:"REDIS_PASSWORD"`
HealthCheckInterval ct.OptDuration `conf:"REDIS_HEALTH_CHECK_INTERVAL"`
}

// ConsulConfig configures the optional Consul integration.
Expand Down
35 changes: 35 additions & 0 deletions docs/persistent-storage.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,41 @@ note over Relay Proxy: TTL fresh, serve from memory
Relay Proxy-->>SDK2: Streaming response
```

## Data Store Health Check and Automatic Repopulation

The Relay Proxy includes a health check mechanism that detects when a persistent store loses its data (for example, when Redis restarts without persistence enabled). Without this, SDKs using daemon mode (such as PHP) would receive empty flag evaluations until the Relay Proxy is manually restarted.

### How it works

When using Redis, the Relay Proxy periodically checks for the presence of a sentinel key (`$inited`, stored under the environment's key prefix) that the SDK writes when it first populates the store. If this key is missing but the Relay Proxy has valid data in memory, the Relay Proxy automatically repopulates the store from its in-memory snapshot.

The health check also includes a **circuit breaker**: if a read from the persistent store fails with a connection error, subsequent reads are served directly from an in-memory snapshot, avoiding connection pool exhaustion and timeout cascades. The circuit breaker is cleared automatically when the health check confirms the store is available again.

### Configuration

The health check interval is configurable via the `REDIS_HEALTH_CHECK_INTERVAL` environment variable or the `healthCheckInterval` option in the `[Redis]` configuration section. The default is 30 seconds.

```
# Configuration file
[Redis]
host = "localhost"
port = 6379
localTtl = 30s
healthCheckInterval = 30s
```

```
# Environment variable
REDIS_HEALTH_CHECK_INTERVAL=30s
```

### Behavior summary

- **Store read error (e.g. connection failure):** Circuit breaker activates immediately. Proxy-mode SDKs are served from the in-memory snapshot. The health check probes for recovery at the configured interval.
- **Store data loss (e.g. Redis restart):** Detected within one health check interval. The store is automatically repopulated from the in-memory snapshot.
- **Store recovered:** Circuit breaker is cleared. Normal read path resumes.
- **No snapshot available (e.g. Relay just started):** Health check cannot repopulate. Errors pass through normally.

## Example: Persistent Store during LaunchDarkly Outage - Cold Relay

In this example, LaunchDarkly SaaS is down. Additionally, the Relay in this diagram is starting up **during** the
Expand Down
68 changes: 68 additions & 0 deletions internal/relayenv/env_context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ import (
"github.com/launchdarkly/go-server-sdk/v7/subsystems"
"github.com/launchdarkly/go-server-sdk/v7/subsystems/ldstoreimpl"
"github.com/launchdarkly/go-server-sdk/v7/subsystems/ldstoretypes"

redigo "github.com/gomodule/redigo/redis"
)

// LogNameMode is used in NewEnvContext to determine whether the environment's log messages should be
Expand Down Expand Up @@ -120,6 +122,8 @@ type envContextImpl struct {
stopMonitoringCredentials chan struct{}
doneMonitoringCredentials chan struct{}
connectionMapper ConnectionMapper
storeHealthCheck *store.StoreHealthCheck
storeInitChecker *store.RedisInitChecker
offline bool
closed bool
}
Expand Down Expand Up @@ -401,6 +405,25 @@ func NewEnvContext(
// Connecting may take time, so do this in parallel
go envContext.startSDKClient(envConfig.SDKKey, readyCh, allConfig.Main.IgnoreConnectionErrors)

// Start the persistent store health check if using Redis
if allConfig.Redis.URL.IsDefined() {
healthCheckInterval := allConfig.Redis.HealthCheckInterval.GetOrElse(config.DefaultDataStoreHealthCheckInterval)
redisURL, prefix := sdks.GetRedisBasicProperties(allConfig.Redis, envConfig)
var dialOptions []redigo.DialOption
if allConfig.Redis.Password != "" {
dialOptions = append(dialOptions, redigo.DialPassword(allConfig.Redis.Password))
}
if allConfig.Redis.Username != "" {
dialOptions = append(dialOptions, redigo.DialUsername(allConfig.Redis.Username))
}
initChecker := store.NewRedisInitChecker(redisURL, prefix, dialOptions)
envContext.storeInitChecker = initChecker
thingsToCleanUp.AddFunc(func() { _ = initChecker.Close() })
// The health check cannot be created yet because the store adapter has not built the actual store.
// deferredHealthCheckStart polls in the background until the snapshot store is available, then starts it.
envContext.deferredHealthCheckStart(initChecker, healthCheckInterval, envLoggers)
}

cleanupInterval := params.ExpiredCredentialCleanupInterval
if cleanupInterval == 0 { // 0 means it wasn't specified; the config system disallows 0 as a valid value.
cleanupInterval = defaultCredentialCleanupInterval
Expand All @@ -412,6 +435,42 @@ func NewEnvContext(
return envContext, nil
}

// deferredHealthCheckStart launches a background goroutine that waits for the store adapter to
// expose a snapshot store and then creates and starts the data store health check.
//
// The goroutine polls c.storeAdapter.GetSnapshotStore() every 100 milliseconds. It exits when
// the health check has been started, when store.NewStoreHealthCheck returns nil, when the
// environment is already marked closed, or when c.stopMonitoringCredentials is signaled.
//
// initChecker, interval, and loggers are passed through to store.NewStoreHealthCheck.
func (c *envContextImpl) deferredHealthCheckStart(
	initChecker *store.RedisInitChecker,
	interval time.Duration,
	loggers ldlog.Loggers,
) {
	go func() {
		ticker := time.NewTicker(100 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-c.stopMonitoringCredentials:
				return
			case <-ticker.C:
			}

			ss := c.storeAdapter.GetSnapshotStore()
			if ss == nil {
				// The store has not been built yet; try again on the next tick.
				continue
			}
			hc := store.NewStoreHealthCheck(ss, initChecker, interval, loggers)
			if hc == nil {
				return
			}

			// Publish and start the health check in one critical section. Close reads
			// c.storeHealthCheck under this same mutex and then calls Stop; if Start ran
			// after the unlock, Close could call Stop first and the check would be started
			// afterwards and keep running for a closed environment.
			// NOTE(review): assumes Start returns promptly and never acquires c.mu — confirm.
			c.mu.Lock()
			started := !c.closed
			if started {
				c.storeHealthCheck = hc
				hc.Start()
			}
			c.mu.Unlock()

			if started {
				loggers.Info("Data store health check started")
			}
			return
		}
	}()
}

func (c *envContextImpl) cleanupExpiredCredentials(interval time.Duration) {
ticker := time.NewTicker(interval)
defer ticker.Stop()
Expand Down Expand Up @@ -748,6 +807,15 @@ func (c *envContextImpl) Close() error {
if c.sdkBigSegments != nil {
c.sdkBigSegments.Close()
}
c.mu.RLock()
hc := c.storeHealthCheck
c.mu.RUnlock()
if hc != nil {
hc.Stop()
}
Comment thread
cursor[bot] marked this conversation as resolved.
if c.storeInitChecker != nil {
_ = c.storeInitChecker.Close()
}
return nil
}

Expand Down
51 changes: 51 additions & 0 deletions internal/store/redis_init_checker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package store

import (
	"fmt"
	"time"

	redigo "github.com/gomodule/redigo/redis"
)

// RedisInitChecker is a StoreInitChecker implementation that asks Redis directly whether the
// $inited sentinel key exists, rather than going through the SDK's caching layer.
type RedisInitChecker struct {
	// pool supplies the Redis connection used for each check.
	pool *redigo.Pool
	// prefix is joined with ":$inited" to form the sentinel key name.
	prefix string
}

// NewRedisInitChecker creates a checker that connects to Redis using the given URL and
// dial options. The prefix should match the store prefix used by the SDK (e.g. "launchdarkly").
//
// Connect, read, and write timeouts are applied by default so that a check against an
// unresponsive server fails instead of blocking indefinitely. The defaults are placed before
// the caller's dialOptions, so a caller-supplied timeout option takes precedence.
//
// The pool is limited to a single connection (MaxIdle and MaxActive are both 1).
func NewRedisInitChecker(redisURL string, prefix string, dialOptions []redigo.DialOption) *RedisInitChecker {
	// NOTE(review): 5 seconds is a conservative choice, not a value taken from existing configuration.
	const ioTimeout = 5 * time.Second

	// Build a fresh slice so the caller's slice is never aliased or appended into.
	opts := make([]redigo.DialOption, 0, len(dialOptions)+3)
	opts = append(opts,
		redigo.DialConnectTimeout(ioTimeout),
		redigo.DialReadTimeout(ioTimeout),
		redigo.DialWriteTimeout(ioTimeout),
	)
	opts = append(opts, dialOptions...)

	pool := &redigo.Pool{
		MaxIdle:   1,
		MaxActive: 1,
		Dial: func() (redigo.Conn, error) {
			return redigo.DialURL(redisURL, opts...)
		},
	}
	return &RedisInitChecker{
		pool:   pool,
		prefix: prefix,
	}
}

// initedKey returns the name of the sentinel key: the configured prefix, a colon, and "$inited".
func (r *RedisInitChecker) initedKey() string {
	const sentinel = "$inited"
	return fmt.Sprintf("%s:%s", r.prefix, sentinel)
}

// CheckInitialized issues an EXISTS command for the $inited sentinel key.
//
// On success it returns available=true and initialized set to whether the key exists.
// If the command or the reply conversion fails, it returns false, false and the error.
func (r *RedisInitChecker) CheckInitialized() (available bool, initialized bool, err error) {
	c := r.pool.Get()
	// The error from returning the connection to the pool is deliberately ignored.
	defer func() { _ = c.Close() }()

	reply, doErr := c.Do("EXISTS", r.initedKey())
	found, convErr := redigo.Bool(reply, doErr)
	if convErr == nil {
		return true, found, nil
	}
	return false, false, convErr
}

// Close shuts down the underlying Redis connection pool and returns whatever error the pool reports.
func (r *RedisInitChecker) Close() error {
	closeErr := r.pool.Close()
	return closeErr
}
Loading
Loading