Recently I encountered an issue where a task tried to load its startup configuration when starting, but the configuration did not exist. The task kept retrying the load every second, which caused system load and filled the logs with errors. To mitigate this, I added a back-off mechanism that exponentially increases the delay between retries, which is a common practice for such tasks.
The implementation is below.
// Backoff holds the state of a retry back-off: the configured bounds of the
// wait period, the wait period currently in effect, and the time of the most
// recent failure.
type Backoff struct {
	// nowTime is the clock used to timestamp failures and measure elapsed time.
	nowTime kittime.NowTime

	// lastFailure is the time of the most recent failure, or nil when no
	// failure has been recorded since creation or the last success.
	lastFailure *time.Time

	// currentPeriod is the wait required after lastFailure before running again.
	currentPeriod time.Duration

	// minPeriod is the wait applied after the first failure in a streak.
	minPeriod time.Duration

	// maxPeriod is the upper bound applied when the wait period is doubled.
	maxPeriod time.Duration
}
// ProduceBackoff returns a new Backoff that reads time from nowTime, starts
// with a wait period of minPeriod, and caps the doubled wait period at
// maxPeriod. No failure is recorded initially.
func ProduceBackoff(
	nowTime kittime.NowTime,
	minPeriod time.Duration,
	maxPeriod time.Duration,
) *Backoff {
	b := new(Backoff)
	b.nowTime = nowTime
	b.minPeriod = minPeriod
	b.maxPeriod = maxPeriod
	// The first wait period is always the minimum.
	b.currentPeriod = minPeriod
	return b
}
// MarkSuccess resets the back-off after a successful attempt: the wait period
// returns to minPeriod and the recorded failure is cleared.
func (b *Backoff) MarkSuccess() {
	b.currentPeriod = b.minPeriod
	b.lastFailure = nil
}
// MarkFail records a failed attempt at the current time and updates the wait
// period: the first failure in a streak (no failure recorded yet) uses
// minPeriod, and every further failure doubles the previous period, capped at
// maxPeriod.
func (b *Backoff) MarkFail() {
	if b.lastFailure == nil {
		b.currentPeriod = b.minPeriod
	} else {
		doubled := 2 * b.currentPeriod
		// time.Duration is an int64, so doubling a large positive period wraps
		// around to a negative value. That value would slip past the maxPeriod
		// cap and make CanRun return true immediately, so treat the wrap-around
		// as "exceeds the cap" as well.
		overflowed := b.currentPeriod > 0 && doubled < b.currentPeriod
		if overflowed || doubled > b.maxPeriod {
			doubled = b.maxPeriod
		}
		b.currentPeriod = doubled
	}
	b.lastFailure = b.nowTime.NowPointer()
}
// CanRun reports whether a new attempt is allowed: either no failure is
// recorded, or at least currentPeriod has elapsed since the last failure.
func (b *Backoff) CanRun() bool {
	// The clock is only consulted when a failure is actually pending.
	return b.lastFailure == nil ||
		b.nowTime.NowPointer().Sub(*b.lastFailure) >= b.currentPeriod
}
An example of its usage is shown in the short unit test below.
// check creates a back-off with a one-second minimum and a ten-second maximum
// period, then runs the failure sequence three times, marking success after
// each run so the next run starts from a reset back-off.
func (t *Test) check() {
	const rounds = 3

	t.backoff = backoff.ProduceBackoff(t.NowTime, time.Second, 10*time.Second)
	for range rounds {
		t.checkBackoffLoop()

		t.Log("success")
		t.backoff.MarkSuccess()
	}
}
// checkBackoffLoop drives one streak of consecutive failures and verifies the
// wait after each: 1, 2, 4 and 8 seconds for the first four failures, then
// 10 seconds for each of five further failures.
func (t *Test) checkBackoffLoop() {
	// Failures #1-#4: the expected wait doubles each time.
	rampUp := []struct {
		label string
		wait  time.Duration
	}{
		{"failure #1", time.Second},
		{"failure #2", 2 * time.Second},
		{"failure #3", 4 * time.Second},
		{"failure #4", 8 * time.Second},
	}
	for _, step := range rampUp {
		t.Log(step.label)
		t.Assert(t.backoff.CanRun())
		t.backoff.MarkFail()
		t.verifyCannotRunUntil(step.wait)
	}

	// Failure #5 onwards: the expected wait no longer grows.
	t.Log("failure #5")
	for attempt := 0; attempt < 5; attempt++ {
		t.Assert(t.backoff.CanRun())
		t.backoff.MarkFail()
		t.verifyCannotRunUntil(10 * time.Second)
	}
}
// verifyCannotRunUntil asserts that the back-off refuses to run for the whole
// given period and allows running once it has passed. The fake clock is
// advanced one second at a time, so only the whole seconds of period are
// checked.
func (t *Test) verifyCannotRunUntil(
	period time.Duration,
) {
	for remaining := int(period.Seconds()); remaining > 0; remaining-- {
		// Still inside the wait period: running must be refused.
		t.AssertFalse(t.backoff.CanRun())
		t.NowTime.IncrementFakeTime(time.Second)
	}

	t.Assert(t.backoff.CanRun())
}
Final Note
Since we started using the back-off mechanism, our logs are cleaner and the system load is lower. By the way, Kubernetes uses a similar mechanism for its pod crash back-off.
No comments:
Post a Comment