fix(deploy): fix rollback when updating an app instance with an invalid manifest
All checks were successful
test / test (push) Successful in 16s
parent 26ba07200e
commit f3cbfa3723
5 changed files with 374 additions and 1 deletion
@@ -4,7 +4,9 @@ package v2
import (
    "context"
    "errors"
    "fmt"
    "strings"
    "time"

    "edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config"
@@ -204,7 +206,8 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re
    rollbackErrors := []error{}

    // Rollback completed instances (in reverse order)
    // Phase 1: Delete resources that were created in this deployment attempt (in reverse order)
    rm.logf("Phase 1: Rolling back created resources")
    for i := len(result.CompletedActions) - 1; i >= 0; i-- {
        action := result.CompletedActions[i]
@@ -218,6 +221,32 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re
        }
    }

    // Phase 2: Restore resources that were deleted before the failed deployment
    // This is critical for RecreateStrategy which deletes everything before recreating
    if result.DeletedAppBackup != nil || len(result.DeletedInstancesBackup) > 0 {
        rm.logf("Phase 2: Restoring deleted resources")

        // Restore app first (must exist before instances can be created)
        if result.DeletedAppBackup != nil {
            if err := rm.restoreApp(ctx, result.DeletedAppBackup); err != nil {
                rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore app: %w", err))
                rm.logf("Failed to restore app: %v", err)
            } else {
                rm.logf("Successfully restored app: %s", result.DeletedAppBackup.App.Key.Name)
            }
        }

        // Restore instances
        for _, backup := range result.DeletedInstancesBackup {
            if err := rm.restoreInstance(ctx, &backup); err != nil {
                rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore instance %s: %w", backup.Instance.Key.Name, err))
                rm.logf("Failed to restore instance %s: %v", backup.Instance.Key.Name, err)
            } else {
                rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
            }
        }
    }

    if len(rollbackErrors) > 0 {
        return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors)
    }
@@ -278,6 +307,125 @@ func (rm *EdgeConnectResourceManager) rollbackInstance(ctx context.Context, acti
    return fmt.Errorf("instance action not found for rollback: %s", action.Target)
}

// restoreApp recreates an app that was deleted during deployment
func (rm *EdgeConnectResourceManager) restoreApp(ctx context.Context, backup *AppBackup) error {
    rm.logf("Restoring app: %s/%s version %s",
        backup.App.Key.Organization, backup.App.Key.Name, backup.App.Key.Version)

    // Build a clean app input with only creation-safe fields
    // We must exclude read-only fields like CreatedAt, UpdatedAt, etc.
    appInput := &v2.NewAppInput{
        Region: backup.Region,
        App: v2.App{
            Key: backup.App.Key,
            Deployment: backup.App.Deployment,
            ImageType: backup.App.ImageType,
            ImagePath: backup.App.ImagePath,
            AllowServerless: backup.App.AllowServerless,
            DefaultFlavor: backup.App.DefaultFlavor,
            ServerlessConfig: backup.App.ServerlessConfig,
            DeploymentManifest: backup.App.DeploymentManifest,
            DeploymentGenerator: backup.App.DeploymentGenerator,
            RequiredOutboundConnections: backup.App.RequiredOutboundConnections,
            // Explicitly omit read-only fields like CreatedAt, UpdatedAt, Fields, etc.
        },
    }

    if err := rm.client.CreateApp(ctx, appInput); err != nil {
        return fmt.Errorf("failed to restore app: %w", err)
    }

    rm.logf("Successfully restored app: %s", backup.App.Key.Name)
    return nil
}

// restoreInstance recreates an instance that was deleted during deployment
func (rm *EdgeConnectResourceManager) restoreInstance(ctx context.Context, backup *InstanceBackup) error {
    rm.logf("Restoring instance: %s on %s:%s",
        backup.Instance.Key.Name,
        backup.Instance.Key.CloudletKey.Organization,
        backup.Instance.Key.CloudletKey.Name)

    // Build a clean instance input with only creation-safe fields
    // We must exclude read-only fields like CloudletLoc, CreatedAt, etc.
    instanceInput := &v2.NewAppInstanceInput{
        Region: backup.Region,
        AppInst: v2.AppInstance{
            Key: backup.Instance.Key,
            AppKey: backup.Instance.AppKey,
            Flavor: backup.Instance.Flavor,
            // Explicitly omit read-only fields like CloudletLoc, State, PowerState, CreatedAt, etc.
        },
    }

    // Retry logic to handle namespace termination race conditions
    maxRetries := 5
    retryDelay := 10 * time.Second

    var lastErr error
    for attempt := 0; attempt <= maxRetries; attempt++ {
        if attempt > 0 {
            rm.logf("Retrying instance restore %s (attempt %d/%d)", backup.Instance.Key.Name, attempt, maxRetries)
            select {
            case <-time.After(retryDelay):
            case <-ctx.Done():
                return ctx.Err()
            }
        }

        err := rm.client.CreateAppInstance(ctx, instanceInput)
        if err == nil {
            rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
            return nil
        }

        lastErr = err

        // Check if error is retryable
        if !rm.isRetryableError(err) {
            rm.logf("Failed to restore instance %s: %v (non-retryable error, giving up)", backup.Instance.Key.Name, err)
            return fmt.Errorf("failed to restore instance: %w", err)
        }

        if attempt < maxRetries {
            rm.logf("Failed to restore instance %s: %v (will retry)", backup.Instance.Key.Name, err)
        }
    }

    return fmt.Errorf("failed to restore instance after %d attempts: %w", maxRetries+1, lastErr)
}

// isRetryableError determines if an error should be retried
func (rm *EdgeConnectResourceManager) isRetryableError(err error) bool {
    if err == nil {
        return false
    }

    errStr := strings.ToLower(err.Error())

    // Special case: Kubernetes namespace termination race condition
    // This is a transient 400 error that should be retried
    if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") {
        return true
    }

    // Check if it's an APIError with a status code
    var apiErr *v2.APIError
    if errors.As(err, &apiErr) {
        // Don't retry client errors (4xx)
        if apiErr.StatusCode >= 400 && apiErr.StatusCode < 500 {
            return false
        }
        // Retry server errors (5xx)
        if apiErr.StatusCode >= 500 {
            return true
        }
    }

    // Retry all other errors (network issues, timeouts, etc.)
    return true
}

// logf logs a message if a logger is configured
func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) {
    if rm.logger != nil {
@@ -7,6 +7,7 @@ import (
    "fmt"
    "os"
    "path/filepath"
    "strings"
    "testing"
    "time"
@@ -464,6 +465,111 @@ func TestRollbackDeploymentFailure(t *testing.T) {
    mockClient.AssertExpectations(t)
}

func TestRollbackDeploymentWithRestore(t *testing.T) {
    mockClient := &MockResourceClient{}
    logger := &TestLogger{}
    manager := NewResourceManager(mockClient, WithLogger(logger), WithStrategyConfig(createTestStrategyConfig()))

    plan := createTestDeploymentPlan()

    // Simulate a RecreateStrategy scenario:
    // 1. Old app and instance were deleted and backed up
    // 2. New app was created successfully
    // 3. New instance creation failed
    // 4. Rollback should: delete new app, restore old app, restore old instance
    oldApp := v2.App{
        Key: v2.AppKey{
            Organization: "test-org",
            Name: "test-app",
            Version: "1.0.0",
        },
        Deployment: "kubernetes",
        DeploymentManifest: "old-manifest-content",
    }

    oldInstance := v2.AppInstance{
        Key: v2.AppInstanceKey{
            Organization: "test-org",
            Name: "test-app-1.0.0-instance",
            CloudletKey: v2.CloudletKey{
                Organization: "test-cloudlet-org",
                Name: "test-cloudlet",
            },
        },
        AppKey: v2.AppKey{
            Organization: "test-org",
            Name: "test-app",
            Version: "1.0.0",
        },
        Flavor: v2.Flavor{Name: "small"},
    }

    result := &ExecutionResult{
        Plan: plan,
        // Completed actions: new app was created before failure
        CompletedActions: []ActionResult{
            {
                Type: ActionCreate,
                Target: "test-app",
                Success: true,
            },
        },
        // Failed action: new instance creation failed
        FailedActions: []ActionResult{
            {
                Type: ActionCreate,
                Target: "test-app-1.0.0-instance",
                Success: false,
            },
        },
        // Backup of deleted resources
        DeletedAppBackup: &AppBackup{
            App: oldApp,
            Region: "US",
            ManifestContent: "old-manifest-content",
        },
        DeletedInstancesBackup: []InstanceBackup{
            {
                Instance: oldInstance,
                Region: "US",
            },
        },
    }

    // Mock rollback operations in order:
    // 1. Delete newly created app (rollback create)
    mockClient.On("DeleteApp", mock.Anything, mock.AnythingOfType("v2.AppKey"), "US").
        Return(nil).Once()

    // 2. Restore old app (from backup)
    mockClient.On("CreateApp", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInput) bool {
        return input.App.Key.Name == "test-app" && input.App.DeploymentManifest == "old-manifest-content"
    })).Return(nil).Once()

    // 3. Restore old instance (from backup)
    mockClient.On("CreateAppInstance", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInstanceInput) bool {
        return input.AppInst.Key.Name == "test-app-1.0.0-instance"
    })).Return(nil).Once()

    ctx := context.Background()
    err := manager.RollbackDeployment(ctx, result)

    require.NoError(t, err)
    mockClient.AssertExpectations(t)

    // Verify rollback was logged
    assert.Greater(t, len(logger.messages), 0)
    // Should have messages about rolling back created resources and restoring deleted resources
    hasRestoreLog := false
    for _, msg := range logger.messages {
        if strings.Contains(msg, "Restoring deleted resources") {
            hasRestoreLog = true
            break
        }
    }
    assert.True(t, hasRestoreLog, "Should log restoration of deleted resources")
}

func TestConvertNetworkRules(t *testing.T) {
    network := &config.NetworkConfig{
        OutboundConnections: []config.OutboundConnection{
@@ -159,6 +159,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo
        return nil
    }

    // Backup instances before deleting them (for rollback restoration)
    r.logf("Backing up %d existing instances before deletion", len(instancesToDelete))
    for _, action := range instancesToDelete {
        backup, err := r.backupInstance(ctx, action, config)
        if err != nil {
            r.logf("Warning: failed to backup instance %s before deletion: %v", action.InstanceName, err)
            // Continue with deletion even if backup fails - this is best effort
        } else {
            result.DeletedInstancesBackup = append(result.DeletedInstancesBackup, *backup)
            r.logf("Backed up instance: %s", action.InstanceName)
        }
    }

    deleteResults := r.executeInstanceActionsWithRetry(ctx, instancesToDelete, "delete", config)

    for _, deleteResult := range deleteResults {
@@ -172,6 +185,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo
    }

    r.logf("Phase 1 complete: deleted %d instances", len(deleteResults))

    // Wait for Kubernetes namespace termination to complete
    // This prevents "namespace is being terminated" errors when recreating instances
    if len(deleteResults) > 0 {
        waitTime := 5 * time.Second
        r.logf("Waiting %v for namespace termination to complete...", waitTime)
        select {
        case <-time.After(waitTime):
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    return nil
}
@@ -184,6 +210,17 @@ func (r *RecreateStrategy) deleteAppPhase(ctx context.Context, plan *DeploymentP
    r.logf("Phase 2: Deleting existing application")

    // Backup app before deleting it (for rollback restoration)
    r.logf("Backing up existing app before deletion")
    backup, err := r.backupApp(ctx, plan, config)
    if err != nil {
        r.logf("Warning: failed to backup app before deletion: %v", err)
        // Continue with deletion even if backup fails - this is best effort
    } else {
        result.DeletedAppBackup = backup
        r.logf("Backed up app: %s", plan.AppAction.Desired.Name)
    }

    appKey := v2.AppKey{
        Organization: plan.AppAction.Desired.Organization,
        Name: plan.AppAction.Desired.Name,
@@ -516,6 +553,52 @@ func (r *RecreateStrategy) updateApplication(ctx context.Context, action AppActi
    return true, nil
}

// backupApp fetches and stores the current app state before deletion
func (r *RecreateStrategy) backupApp(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig) (*AppBackup, error) {
    appKey := v2.AppKey{
        Organization: plan.AppAction.Desired.Organization,
        Name: plan.AppAction.Desired.Name,
        Version: plan.AppAction.Desired.Version,
    }

    app, err := r.client.ShowApp(ctx, appKey, plan.AppAction.Desired.Region)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch app for backup: %w", err)
    }

    backup := &AppBackup{
        App: app,
        Region: plan.AppAction.Desired.Region,
        ManifestContent: app.DeploymentManifest,
    }

    return backup, nil
}

// backupInstance fetches and stores the current instance state before deletion
func (r *RecreateStrategy) backupInstance(ctx context.Context, action InstanceAction, config *config.EdgeConnectConfig) (*InstanceBackup, error) {
    instanceKey := v2.AppInstanceKey{
        Organization: action.Desired.Organization,
        Name: action.InstanceName,
        CloudletKey: v2.CloudletKey{
            Organization: action.Target.CloudletOrg,
            Name: action.Target.CloudletName,
        },
    }

    instance, err := r.client.ShowAppInstance(ctx, instanceKey, action.Target.Region)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch instance for backup: %w", err)
    }

    backup := &InstanceBackup{
        Instance: instance,
        Region: action.Target.Region,
    }

    return backup, nil
}

// logf logs a message if a logger is configured
func (r *RecreateStrategy) logf(format string, v ...interface{}) {
    if r.logger != nil {
@@ -530,6 +613,14 @@ func isRetryableError(err error) bool {
        return false
    }

    errStr := strings.ToLower(err.Error())

    // Special case: Kubernetes namespace termination race condition
    // This is a transient 400 error that should be retried
    if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") {
        return true
    }

    // Check if it's an APIError with a status code
    var apiErr *v2.APIError
    if errors.As(err, &apiErr) {
@@ -271,6 +271,12 @@ type ExecutionResult struct {
    // RollbackSuccess indicates if rollback was successful
    RollbackSuccess bool

    // DeletedAppBackup stores the app that was deleted (for rollback restoration)
    DeletedAppBackup *AppBackup

    // DeletedInstancesBackup stores instances that were deleted (for rollback restoration)
    DeletedInstancesBackup []InstanceBackup
}

// ActionResult represents the result of executing a single action
@@ -294,6 +300,27 @@ type ActionResult struct {
    Details string
}

// AppBackup stores a deleted app's complete state for rollback restoration
type AppBackup struct {
    // App is the full app object that was deleted
    App v2.App

    // Region where the app was deployed
    Region string

    // ManifestContent is the deployment manifest content
    ManifestContent string
}

// InstanceBackup stores a deleted instance's complete state for rollback restoration
type InstanceBackup struct {
    // Instance is the full instance object that was deleted
    Instance v2.AppInstance

    // Region where the instance was deployed
    Region string
}

// IsEmpty returns true if the deployment plan has no actions to perform
func (dp *DeploymentPlan) IsEmpty() bool {
    if dp.AppAction.Type != ActionNone {