run KISS: Error Backoff

Recently I encountered an issue where a task tried to load its startup configuration, but the configuration did not exist. The task kept retrying the load every second, which added system load and filled the logs with errors. To mitigate this, I added a back-off mechanism that exponentially increases the delay between retries (up to a configured maximum). This is a common practice for tasks that retry on failure.

The implementation is below.


// Backoff tracks consecutive failures and tells the caller when the next
// attempt is allowed. The required delay starts at minPeriod, doubles on each
// consecutive failure and is capped at maxPeriod.
type Backoff struct {
	// nowTime is the clock used to timestamp failures and measure elapsed time.
	nowTime kittime.NowTime

	// minPeriod and maxPeriod bound the delay between attempts.
	minPeriod, maxPeriod time.Duration

	// currentPeriod is the delay that must pass after lastFailure before the
	// next attempt may run.
	currentPeriod time.Duration

	// lastFailure is the time of the most recent failure; nil means there has
	// been no failure since construction or since the last success.
	lastFailure *time.Time
}

// ProduceBackoff returns a Backoff that reads the time from nowTime and keeps
// its delay between minPeriod and maxPeriod. The new instance has no recorded
// failure and its current delay is minPeriod.
func ProduceBackoff(
	nowTime kittime.NowTime,
	minPeriod time.Duration,
	maxPeriod time.Duration,
) *Backoff {
	b := new(Backoff)
	b.nowTime = nowTime
	b.minPeriod = minPeriod
	b.maxPeriod = maxPeriod
	b.currentPeriod = minPeriod
	return b
}

// MarkSuccess resets the back-off after a successful attempt: the delay goes
// back to minPeriod and the recorded failure time is cleared.
func (b *Backoff) MarkSuccess() {
	b.currentPeriod = b.minPeriod
	b.lastFailure = nil
}

// MarkFail records a failed attempt at the current time and updates the delay
// required before the next attempt.
//
// The first failure (no failure currently recorded) sets the delay to
// minPeriod. Every further consecutive failure doubles the delay, capped at
// maxPeriod.
func (b *Backoff) MarkFail() {
	switch {
	case b.lastFailure == nil:
		b.currentPeriod = b.minPeriod
	case b.currentPeriod > b.maxPeriod/2:
		// Doubling would exceed maxPeriod. Comparing against maxPeriod/2
		// instead of computing 2*currentPeriod first avoids a silent int64
		// overflow when maxPeriod is very large (e.g. used as "no cap").
		b.currentPeriod = b.maxPeriod
	default:
		b.currentPeriod *= 2
	}
	b.lastFailure = b.nowTime.NowPointer()
}

// CanRun reports whether a new attempt is allowed right now. It is always
// allowed when no failure is recorded; otherwise at least currentPeriod must
// have elapsed since the last recorded failure.
func (b *Backoff) CanRun() bool {
	last := b.lastFailure
	if last != nil {
		elapsed := b.nowTime.NowPointer().Sub(*last)
		return elapsed >= b.currentPeriod
	}

	return true
}

Example usage is shown in the short unit test below.


// check creates a back-off with a 1s minimum and 10s maximum delay and runs
// the full failure sequence three times, marking a success after each run so
// the next run starts again from the minimum delay.
func (t *Test) check() {
	const (
		minDelay = time.Second
		maxDelay = 10 * time.Second
		rounds   = 3
	)

	t.backoff = backoff.ProduceBackoff(t.NowTime, minDelay, maxDelay)

	for range rounds {
		t.checkBackoffLoop()
		t.Log("success")
		t.backoff.MarkSuccess()
	}
}

// checkBackoffLoop drives a sequence of consecutive failures and verifies the
// wait enforced after each one: 1s, 2s, 4s and 8s for the first four
// failures, then 10s for each of five further failures.
func (t *Test) checkBackoffLoop() {
	// Failures during which the delay is still doubling.
	growing := []struct {
		label string
		wait  time.Duration
	}{
		{"failure #1", time.Second},
		{"failure #2", 2 * time.Second},
		{"failure #3", 4 * time.Second},
		{"failure #4", 8 * time.Second},
	}
	for _, step := range growing {
		t.Log(step.label)
		t.Assert(t.backoff.CanRun())
		t.backoff.MarkFail()
		t.verifyCannotRunUntil(step.wait)
	}

	// From here on every failure is expected to require the same 10s wait.
	t.Log("failure #5")
	const cappedWait = 10 * time.Second
	for range 5 {
		t.Assert(t.backoff.CanRun())
		t.backoff.MarkFail()
		t.verifyCannotRunUntil(cappedWait)
	}
}

// verifyCannotRunUntil advances the fake clock one second at a time for the
// whole-second part of period, asserting before each step that the back-off
// still refuses to run, and finally asserts that running is allowed.
func (t *Test) verifyCannotRunUntil(
	period time.Duration,
) {
	for remaining := int(period.Seconds()); remaining > 0; remaining-- {
		t.AssertFalse(t.backoff.CanRun())
		t.NowTime.IncrementFakeTime(time.Second)
	}

	t.Assert(t.backoff.CanRun())
}

Final Note

Since adding the back-off mechanism, our logs are cleaner and the system load is lower. By the way, Kubernetes uses a similar mechanism for restarting crashing pods (CrashLoopBackOff).

Full Blog TOC

Full Blog Table Of Content with Keywords Available HERE

Sunday, June 29, 2025

Error Backoff

Final Note

No comments:

Post a Comment