Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VAULT-25341 Address issue where having no permissions to renew caused auto-auth to attempt to renew with no backoff #26844

Merged
merged 11 commits into from
May 9, 2024
9 changes: 8 additions & 1 deletion api/lifetime_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package api
import (
"errors"
"math/rand"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -289,12 +290,18 @@ func (r *LifetimeWatcher) doRenewWithOptions(tokenMode bool, nonRenewable bool,
switch {
case nonRenewable || r.renewBehavior == RenewBehaviorRenewDisabled:
// Can't or won't renew, just keep the same expiration so we exit
// when it's reauthentication time
// when it's re-authentication time
remainingLeaseDuration = fallbackLeaseDuration

default:
// Renew the token
renewal, err = renew(credString, r.increment)
if err != nil && strings.Contains(err.Error(), "permission denied") {
// We can't renew since the token doesn't have permission to. Fall back
// to the code path for non-renewable tokens.
nonRenewable = true
continue
}
if err != nil || renewal == nil || (tokenMode && renewal.Auth == nil) {
if r.renewBehavior == RenewBehaviorErrorOnErrors {
if err != nil {
Expand Down
18 changes: 17 additions & 1 deletion api/renewer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,20 @@ func TestLifetimeWatcher(t *testing.T) {
expectError: nil,
expectRenewal: true,
},
{
maxTestTime: time.Second,
name: "permission_denied_error",
leaseDurationSeconds: 60,
incrementSeconds: 10,
// This should cause the lifetime watcher to behave just
// like a non-renewable secret, i.e. wait until its lifetime
// then be done.
renew: func(_ string, _ int) (*Secret, error) {
return nil, fmt.Errorf("permission denied")
},
expectError: nil,
expectRenewal: false,
},
}

for _, tc := range cases {
Expand Down Expand Up @@ -204,7 +218,9 @@ func TestLifetimeWatcher(t *testing.T) {
for {
select {
case <-time.After(tc.maxTestTime):
t.Fatalf("renewal didn't happen")
if tc.expectRenewal || tc.expectError != nil {
t.Fatalf("expected error or renewal, and neither happened")
}
case r := <-v.RenewCh():
if !tc.expectRenewal {
t.Fatal("expected no renewals")
Expand Down
3 changes: 3 additions & 0 deletions changelog/26844.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
auto-auth: Addressed issue where having no permissions to renew a renewable token caused auto-auth to attempt to renew constantly with no backoff
```
138 changes: 138 additions & 0 deletions command/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3179,6 +3179,144 @@ vault {
}
}

// TestAgent_TokenRenewal tests that the token renewal process by the LifetimeWatcher
// for a token without the policy to allow it to renew itself doesn't result in many,
// many unnecessary requests to attempt to renew itself.
// Prior to a bug fix in the PR that added this test, this would have resulted
// in hundreds of token renewal requests with no backoff.
func TestAgent_TokenRenewal(t *testing.T) {
logger := logging.NewVaultLogger(hclog.Trace)
cluster := vault.NewTestCluster(t, nil, &vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()

serverClient := cluster.Cores[0].Client

auditLogFileName := makeTempFile(t, "audit-log", "")
err := serverClient.Sys().EnableAuditWithOptions("file", &api.EnableAuditOptions{
Type: "file",
Options: map[string]string{
"file_path": auditLogFileName,
},
})
require.NoError(t, err)

// Unset the environment variable so that agent picks up the right test
// cluster address
defer os.Setenv(api.EnvVaultAddress, os.Getenv(api.EnvVaultAddress))
os.Unsetenv(api.EnvVaultAddress)

policyName := "less-than-default"
// Has a subset of the default policy's permissions
// Specifically removing renew-self.
err = serverClient.Sys().PutPolicy(policyName, `
path "auth/token/lookup-self" {
capabilities = ["read"]
}

# Allow tokens to revoke themselves
path "auth/token/revoke-self" {
capabilities = ["update"]
}

# Allow a token to look up its own capabilities on a path
path "sys/capabilities-self" {
capabilities = ["update"]
}
`)
require.NoError(t, err)

renewable := true
// Make the token renewable but give it no permissions
// (e.g. the permission to renew itself)
tokenCreateRequest := &api.TokenCreateRequest{
Policies: []string{policyName},
TTL: "10s",
Renewable: &renewable,
NoDefaultPolicy: true,
}

secret, err := serverClient.Auth().Token().CreateOrphan(tokenCreateRequest)
require.NoError(t, err)
lowPermissionToken := secret.Auth.ClientToken

tokenFileName := makeTempFile(t, "token-file", lowPermissionToken)

sinkFileName := makeTempFile(t, "sink-file", "")

autoAuthConfig := fmt.Sprintf(`
auto_auth {
method {
type = "token_file"
config = {
token_file_path = "%s"
}
}

sink "file" {
config = {
path = "%s"
}
}
}`, tokenFileName, sinkFileName)

config := fmt.Sprintf(`
vault {
address = "%s"
tls_skip_verify = true
}

log_level = "trace"

%s
`, serverClient.Address(), autoAuthConfig)
configPath := makeTempFile(t, "config.hcl", config)

// Start the agent
ui, cmd := testAgentCommand(t, logger)

cmd.startedCh = make(chan struct{})

wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
cmd.Run([]string{"-config", configPath})
wg.Done()
}()

select {
case <-cmd.startedCh:
case <-time.After(5 * time.Second):
t.Errorf("timeout")
t.Errorf("stdout: %s", ui.OutputWriter.String())
t.Errorf("stderr: %s", ui.ErrorWriter.String())
}

// Sleep, to allow the renewal/auth process to work and ensure that it doesn't
// go crazy with renewals.
time.Sleep(30 * time.Second)

fileBytes, err := os.ReadFile(auditLogFileName)
require.NoError(t, err)
stringAudit := string(fileBytes)

// This is a bit of an imperfect way to test things, but we want to make sure
// that a token like this doesn't keep triggering retries.
// Due to the fact this is an auto-auth specific thing, unit tests for the
// LifetimeWatcher wouldn't be sufficient here.
// Prior to the fix made in the same PR this test was added, it would trigger many, many
// retries (hundreds to thousands in less than a minute).
// We really want to make sure that doesn't happen.
numberOfRenewSelves := strings.Count(stringAudit, "auth/token/renew-self")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool strategy!

// We actually expect ~6, but I added some buffer for CI weirdness. It can also vary
// due to the grace added/removed from the sleep in LifetimeWatcher too.
if numberOfRenewSelves > 10 {
t.Fatalf("did too many renews -- Vault received %d renew-self requests", numberOfRenewSelves)
}
}

// TestAgent_Logging_ConsulTemplate attempts to ensure two things about Vault Agent logs:
// 1. When -log-format command line arg is set to JSON, it is honored as the output format
// for messages generated from within the consul-template library.
Expand Down
50 changes: 44 additions & 6 deletions command/agentproxyshared/auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"math/rand"
"net/http"
Expand Down Expand Up @@ -478,10 +479,9 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
}

metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "success"}, 1)
// We don't want to trigger the renewal process for tokens with
// unlimited TTL, such as the root token.
if leaseDuration == 0 && isTokenFileMethod {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, this was a bug, and could occur for non-renewable tokens that happened to check it as their lease ticked down, causing re-authentication to not happen.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to keep a check doesn't trigger lifetime watcher if the secret.Renewable == false?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the secret's not renewable, then the LifetimeWatcher still needs to get triggered given the current architecture, so that auth gets re-triggered at the correct time. If we need to authenticate again for any reason, we should run the lifetimewatcher -- the root token is the only token where we don't need to reauthenticate (since it never expires)

e.g. for a 10s TTL non-renewable token, it might look like this

  • LifetimeWatcher triggered
  • It waits for ~8.5 seconds, does not try to renew
  • Triggers the done channel with no error
  • Auth gets re-triggered

We're essentially making renewable tokens without the permission to renew follow the same path as the above ^

ah.logger.Info("not starting token renewal process, as token has unlimited TTL")
// We don't want to trigger the renewal process for the root token
if isRootToken(leaseDuration, isTokenFileMethod, secret) {
ah.logger.Info("not starting token renewal process, as token is root token")
} else {
ah.logger.Info("starting renewal process")
go watcher.Renew()
Expand All @@ -496,11 +496,31 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
break LifetimeWatcherLoop

case err := <-watcher.DoneCh():
ah.logger.Info("lifetime watcher done channel triggered")
ah.logger.Info("lifetime watcher done channel triggered, re-authenticating")
if err != nil {
ah.logger.Error("error renewing token", "error", err, "backoff", backoffCfg)
metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "failure"}, 1)
ah.logger.Error("error renewing token", "error", err)

// Add some exponential backoff so that if auth is successful
// but the watcher errors, we won't go into an immediate
// aggressive retry loop.
// This might be quite a small sleep, since if we have a successful
// auth, we reset the backoff. Still, some backoff is important, and
// ensuring we follow the normal flow is important:
// auth -> try to renew
if !backoffSleep(ctx, backoffCfg) {
// We're at max retries. Return an error.
return fmt.Errorf("exceeded max retries failing to renew auth token")
}
}

// If the lease duration is 0, wait a second before re-authenticating
// so that we don't go into a loop, as the LifetimeWatcher will immediately
// return for tokens like this.
if leaseDuration == 0 {
time.Sleep(1 * time.Second)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without this sleep, we'd reauthenticate then receive something on the DoneCh immediately a few times. It's a bit of an edge case but it seemed important. I wanted to avoid using backoffSleep since that's intended to be for errors only.

}

break LifetimeWatcherLoop

case <-watcher.RenewCh():
Expand All @@ -523,6 +543,24 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
}
}

// isRootToken checks if the secret in the argument is the root token
// This is determinable without leaseDuration and isTokenFileMethod,
// but those make it easier to rule out other tokens cheaply.
func isRootToken(leaseDuration int, isTokenFileMethod bool, secret *api.Secret) bool {
// This check is cheaper than the others, so we do this first.
if leaseDuration == 0 && isTokenFileMethod && !secret.Renewable {
if secret != nil {
policies, err := secret.TokenPolicies()
if err == nil {
if len(policies) == 1 && policies[0] == "root" {
return true
}
}
}
}
return false
}

// autoAuthBackoff tracks exponential backoff state.
type autoAuthBackoff struct {
backoff *backoff.Backoff
Expand Down
Loading