diff --git a/cmd/drone-autoscaler/main.go b/cmd/drone-autoscaler/main.go
index fad0bff1..f6116a49 100644
--- a/cmd/drone-autoscaler/main.go
+++ b/cmd/drone-autoscaler/main.go
@@ -62,9 +62,11 @@ func main() {
 			Fatalln("Invalid or missing hosting provider")
 	}
 
+	collector := metrics.New(conf)
+
 	// instruments the provider with prometheus metrics.
 	provider = metrics.ServerCreate(provider)
-	provider = metrics.ServerDelete(provider)
+	provider = metrics.ServerDelete(provider, collector)
 
 	db, err := store.Connect(
 		conf.Database.Driver,
@@ -95,7 +97,7 @@ func main() {
 		conf,
 		servers,
 		provider,
-		metrics.New(),
+		collector,
 	)
 
 	//
diff --git a/config/config.go b/config/config.go
index bdd34bcb..ab81f981 100644
--- a/config/config.go
+++ b/config/config.go
@@ -99,6 +99,12 @@ type (
 			Root string `envconfig:"DRONE_HTTP_ROOT" default:"/"`
 		}
 
+		Metrics struct {
+			// RegisterKnownServers enables the drone_server_known_instance
+			// gauge, which exports one series per known server.
+			RegisterKnownServers bool `envconfig:"DRONE_METRICS_REGISTER_KNOWN_SERVERS" default:"false"`
+		}
+
 		UI struct {
 			Username string `envconfig:"DRONE_UI_USERNAME"`
 			Password string `envconfig:"DRONE_UI_PASSWORD"`
diff --git a/engine/alloc.go b/engine/alloc.go
index 22dff6db..97bc4838 100644
--- a/engine/alloc.go
+++ b/engine/alloc.go
@@ -123,5 +123,7 @@ func (a *allocator) allocate(ctx context.Context, server *autoscaler.Server) err
 		return err
 	}
 
+	a.metrics.RegisterKnownInstance(instance)
+
 	return nil
 }
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 08bcfe37..ac70467b 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -8,6 +8,9 @@ import (
 	"context"
 	"time"
 
+	"github.com/drone/autoscaler"
+	"github.com/drone/autoscaler/config"
+
 	"github.com/prometheus/client_golang/prometheus"
 )
 
@@ -40,6 +43,12 @@ type Collector interface {
 	// IncrServerSetupError keeps a count of errors encountered
 	// when installing software on servers.
 	IncrServerSetupError()
+
+	// RegisterKnownInstance registers that we know about a server.
+	RegisterKnownInstance(instance *autoscaler.Instance)
+
+	// UnregisterKnownInstance forgets a server we once knew.
+	UnregisterKnownInstance(instance *autoscaler.Instance)
 }
 
 // Prometheus is a Prometheus metrics collector.
@@ -50,11 +59,17 @@ type Prometheus struct {
 	countServerCreateErr  prometheus.Counter
 	countServerInitErr    prometheus.Counter
 	countServerSetupErr   prometheus.Counter
+	knownInstance         *prometheus.GaugeVec
+
+	registerKnownServers bool
 }
 
 // New returns a new Prometheus metrics provider.
-func New() *Prometheus {
+func New(c config.Config) *Prometheus {
 	p := new(Prometheus)
+
+	p.registerKnownServers = c.Metrics.RegisterKnownServers
+
 	p.trackServerCreateTime = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Name: "drone_server_create_time_seconds",
 		Help: "Elapsed time creating a server.",
@@ -82,12 +97,25 @@ func New() *Prometheus {
 		Name: "drone_server_install_errors_total",
 		Help: "Total number of errors installing software on a server.",
 	})
+	p.knownInstance = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "drone_server_known_instance",
+		Help: "Known server instances.",
+	},
+		[]string{
+			"name",
+			"provider",
+			"region",
+			"size",
+		})
 	prometheus.MustRegister(p.trackServerCreateTime)
 	prometheus.MustRegister(p.trackServerInitTime)
 	prometheus.MustRegister(p.trackServerSetupTime)
 	prometheus.MustRegister(p.countServerCreateErr)
 	prometheus.MustRegister(p.countServerInitErr)
 	prometheus.MustRegister(p.countServerSetupErr)
+	if p.registerKnownServers {
+		prometheus.MustRegister(p.knownInstance)
+	}
 	return p
 }
 
@@ -135,6 +163,36 @@ func (m *Prometheus) IncrServerSetupError() {
 	m.countServerSetupErr.Inc()
 }
 
+// RegisterKnownInstance registers that we know about a server. It is a
+// no-op unless the collector was built with RegisterKnownServers enabled.
+func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) {
+	if !m.registerKnownServers {
+		return
+	}
+	m.knownInstance.With(knownInstanceLabels(instance)).Set(1)
+}
+
+// UnregisterKnownInstance forgets a server we once knew. It is a no-op
+// unless the collector was built with RegisterKnownServers enabled.
+func (m *Prometheus) UnregisterKnownInstance(instance *autoscaler.Instance) {
+	if !m.registerKnownServers {
+		return
+	}
+	m.knownInstance.Delete(knownInstanceLabels(instance))
+}
+
+// knownInstanceLabels builds the label set identifying an instance in
+// the known-instance gauge. Register and unregister share it so that
+// the series deleted is always exactly the series that was set.
+func knownInstanceLabels(instance *autoscaler.Instance) prometheus.Labels {
+	return prometheus.Labels{
+		"name":     instance.Name,
+		"provider": string(instance.Provider),
+		"region":   instance.Region,
+		"size":     instance.Size,
+	}
+}
+
 // NopCollector provides a no-op metrics collector.
 type NopCollector struct{}
 
@@ -163,3 +221,9 @@ func (*NopCollector) IncrServerInitError() {}
 // IncrServerSetupError keeps a count of errors encountered
 // when installing software on servers.
 func (*NopCollector) IncrServerSetupError() {}
+
+// RegisterKnownInstance registers that we know about a server.
+func (*NopCollector) RegisterKnownInstance(instance *autoscaler.Instance) {}
+
+// UnregisterKnownInstance forgets a server we once knew.
+func (*NopCollector) UnregisterKnownInstance(instance *autoscaler.Instance) {}
diff --git a/metrics/server_delete.go b/metrics/server_delete.go
index 3d60bb33..ccab6d17 100644
--- a/metrics/server_delete.go
+++ b/metrics/server_delete.go
@@ -12,7 +12,13 @@ import (
 )
 
 // ServerDelete provides metrics for servers deleted.
-func ServerDelete(provider autoscaler.Provider) autoscaler.Provider {
+//
+// A nil collector is replaced with a NopCollector so that Destroy can
+// always call UnregisterKnownInstance on it.
+func ServerDelete(provider autoscaler.Provider, collector Collector) autoscaler.Provider {
+	if collector == nil {
+		collector = &NopCollector{}
+	}
 	created := prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "drone_servers_deleted",
 		Help: "Total number of servers deleted.",
@@ -24,17 +30,19 @@ func ServerDelete(provider autoscaler.Provider) autoscaler.Provider {
 	prometheus.MustRegister(created)
 	prometheus.MustRegister(errors)
 	return &providerWrapDestroy{
-		Provider: provider,
-		created:  created,
-		errors:   errors,
+		Provider:  provider,
+		collector: collector,
+		created:   created,
+		errors:    errors,
 	}
 }
 
 // instruments the Provider to count server destroy events.
 type providerWrapDestroy struct {
 	autoscaler.Provider
-	created prometheus.Counter
-	errors  prometheus.Counter
+	collector Collector
+	created   prometheus.Counter
+	errors    prometheus.Counter
 }
 
 func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler.Instance) error {
@@ -44,5 +52,7 @@ func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler.
 	} else {
 		p.errors.Add(1)
 	}
+	// Unregister unconditionally: this runs after both branches above.
+	p.collector.UnregisterKnownInstance(instance)
 	return err
 }
diff --git a/metrics/server_delete_test.go b/metrics/server_delete_test.go
index 8aee546e..46dac9f3 100644
--- a/metrics/server_delete_test.go
+++ b/metrics/server_delete_test.go
@@ -9,6 +9,7 @@ import (
 	"testing"
 
 	"github.com/drone/autoscaler"
+	"github.com/drone/autoscaler/config"
 	"github.com/drone/autoscaler/mocks"
 	"github.com/golang/mock/gomock"
 	"github.com/prometheus/client_golang/prometheus"
@@ -35,7 +36,8 @@ func TestServerDelete(t *testing.T) {
 	provider.EXPECT().Destroy(noContext, instance).Times(3).Return(nil)
 	provider.EXPECT().Destroy(noContext, instance).Return(errors.New("error"))
 
-	providerInst := ServerDelete(provider)
+	collector := New(config.Config{})
+	providerInst := ServerDelete(provider, collector)
 	for i := 0; i < 3; i++ {
 		err := providerInst.Destroy(noContext, instance)
 		if err != nil {
@@ -52,20 +54,20 @@ func TestServerDelete(t *testing.T) {
 		t.Error(err)
 		return
 	}
-	if want, got := len(metrics), 2; want != got {
-		t.Errorf("Expect registered metric")
+	if got, want := len(metrics), 8; want != got {
+		t.Errorf("Expect registered metric %d, got %d", want, got)
 		return
 	}
-	if got, want := metrics[0].GetName(), "drone_servers_deleted"; want != got {
+	if got, want := metrics[6].GetName(), "drone_servers_deleted"; want != got {
 		t.Errorf("Expect metric name %s, got %s", want, got)
 	}
-	if got, want := metrics[0].Metric[0].Counter.GetValue(), float64(3); want != got {
+	if got, want := metrics[6].Metric[0].Counter.GetValue(), float64(3); want != got {
 		t.Errorf("Expect metric value %f, got %f", want, got)
 	}
-	if got, want := metrics[1].GetName(), "drone_servers_deleted_err"; want != got {
+	if got, want := metrics[7].GetName(), "drone_servers_deleted_err"; want != got {
 		t.Errorf("Expect metric name %s, got %s", want, got)
 	}
-	if got, want := metrics[1].Metric[0].Counter.GetValue(), float64(1); want != got {
+	if got, want := metrics[7].Metric[0].Counter.GetValue(), float64(1); want != got {
 		t.Errorf("Expect metric value %f, got %f", want, got)
 	}
 }