diff --git a/internal/apply/v2/manager.go b/internal/apply/v2/manager.go index 4866129..f43e933 100644 --- a/internal/apply/v2/manager.go +++ b/internal/apply/v2/manager.go @@ -4,7 +4,9 @@ package v2 import ( "context" + "errors" "fmt" + "strings" "time" "edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config" @@ -204,7 +206,8 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re rollbackErrors := []error{} - // Rollback completed instances (in reverse order) + // Phase 1: Delete resources that were created in this deployment attempt (in reverse order) + rm.logf("Phase 1: Rolling back created resources") for i := len(result.CompletedActions) - 1; i >= 0; i-- { action := result.CompletedActions[i] @@ -218,6 +221,32 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re } } + // Phase 2: Restore resources that were deleted before the failed deployment + // This is critical for RecreateStrategy which deletes everything before recreating + if result.DeletedAppBackup != nil || len(result.DeletedInstancesBackup) > 0 { + rm.logf("Phase 2: Restoring deleted resources") + + // Restore app first (must exist before instances can be created) + if result.DeletedAppBackup != nil { + if err := rm.restoreApp(ctx, result.DeletedAppBackup); err != nil { + rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore app: %w", err)) + rm.logf("Failed to restore app: %v", err) + } else { + rm.logf("Successfully restored app: %s", result.DeletedAppBackup.App.Key.Name) + } + } + + // Restore instances + for _, backup := range result.DeletedInstancesBackup { + if err := rm.restoreInstance(ctx, &backup); err != nil { + rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore instance %s: %w", backup.Instance.Key.Name, err)) + rm.logf("Failed to restore instance %s: %v", backup.Instance.Key.Name, err) + } else { + rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name) + } + } + } + if len(rollbackErrors) > 0 { return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors) } @@ -278,6 +307,125 @@ func (rm *EdgeConnectResourceManager) rollbackInstance(ctx context.Context, acti return fmt.Errorf("instance action not found for rollback: %s", action.Target) } +// restoreApp recreates an app that was deleted during deployment +func (rm *EdgeConnectResourceManager) restoreApp(ctx context.Context, backup *AppBackup) error { + rm.logf("Restoring app: %s/%s version %s", + backup.App.Key.Organization, backup.App.Key.Name, backup.App.Key.Version) + + // Build a clean app input with only creation-safe fields + // We must exclude read-only fields like CreatedAt, UpdatedAt, etc. + appInput := &v2.NewAppInput{ + Region: backup.Region, + App: v2.App{ + Key: backup.App.Key, + Deployment: backup.App.Deployment, + ImageType: backup.App.ImageType, + ImagePath: backup.App.ImagePath, + AllowServerless: backup.App.AllowServerless, + DefaultFlavor: backup.App.DefaultFlavor, + ServerlessConfig: backup.App.ServerlessConfig, + DeploymentManifest: backup.App.DeploymentManifest, + DeploymentGenerator: backup.App.DeploymentGenerator, + RequiredOutboundConnections: backup.App.RequiredOutboundConnections, + // Explicitly omit read-only fields like CreatedAt, UpdatedAt, Fields, etc. + }, + } + + if err := rm.client.CreateApp(ctx, appInput); err != nil { + return fmt.Errorf("failed to restore app: %w", err) + } + + rm.logf("Successfully restored app: %s", backup.App.Key.Name) + return nil +} + +// restoreInstance recreates an instance that was deleted during deployment +func (rm *EdgeConnectResourceManager) restoreInstance(ctx context.Context, backup *InstanceBackup) error { + rm.logf("Restoring instance: %s on %s:%s", + backup.Instance.Key.Name, + backup.Instance.Key.CloudletKey.Organization, + backup.Instance.Key.CloudletKey.Name) + + // Build a clean instance input with only creation-safe fields + // We must exclude read-only fields like CloudletLoc, CreatedAt, etc. + instanceInput := &v2.NewAppInstanceInput{ + Region: backup.Region, + AppInst: v2.AppInstance{ + Key: backup.Instance.Key, + AppKey: backup.Instance.AppKey, + Flavor: backup.Instance.Flavor, + // Explicitly omit read-only fields like CloudletLoc, State, PowerState, CreatedAt, etc. + }, + } + + // Retry logic to handle namespace termination race conditions + maxRetries := 5 + retryDelay := 10 * time.Second + + var lastErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + rm.logf("Retrying instance restore %s (attempt %d/%d)", backup.Instance.Key.Name, attempt, maxRetries) + select { + case <-time.After(retryDelay): + case <-ctx.Done(): + return ctx.Err() + } + } + + err := rm.client.CreateAppInstance(ctx, instanceInput) + if err == nil { + rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name) + return nil + } + + lastErr = err + + // Check if error is retryable + if !rm.isRetryableError(err) { + rm.logf("Failed to restore instance %s: %v (non-retryable error, giving up)", backup.Instance.Key.Name, err) + return fmt.Errorf("failed to restore instance: %w", err) + } + + if attempt < maxRetries { + rm.logf("Failed to restore instance %s: %v (will retry)", backup.Instance.Key.Name, err) + } + } + + return fmt.Errorf("failed to restore instance after %d attempts: %w", maxRetries+1, lastErr) +} + +// isRetryableError determines if an error should be retried +func (rm *EdgeConnectResourceManager) isRetryableError(err error) bool { + if err == nil { + return false + } + + errStr := strings.ToLower(err.Error()) + + // Special case: Kubernetes namespace termination race condition + // This is a transient 400 error that should be retried + if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") { + return true + } + + // Check if it's an APIError with a status code + var apiErr *v2.APIError + if errors.As(err, &apiErr) { + // Don't retry client errors (4xx) + if apiErr.StatusCode >= 400 && apiErr.StatusCode < 500 { + return false + } + // Retry server errors (5xx) + if apiErr.StatusCode >= 500 { + return true + } + } + + // Retry all other errors (network issues, timeouts, etc.) + return true +} + // logf logs a message if a logger is configured func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) { if rm.logger != nil { diff --git a/internal/apply/v2/manager_test.go b/internal/apply/v2/manager_test.go index dd2fc55..6d5ef18 100644 --- a/internal/apply/v2/manager_test.go +++ b/internal/apply/v2/manager_test.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "testing" "time" @@ -464,6 +465,111 @@ func TestRollbackDeploymentFailure(t *testing.T) { mockClient.AssertExpectations(t) } +func TestRollbackDeploymentWithRestore(t *testing.T) { + mockClient := &MockResourceClient{} + logger := &TestLogger{} + manager := NewResourceManager(mockClient, WithLogger(logger), WithStrategyConfig(createTestStrategyConfig())) + + plan := createTestDeploymentPlan() + + // Simulate a RecreateStrategy scenario: + // 1. Old app and instance were deleted and backed up + // 2. New app was created successfully + // 3. New instance creation failed + // 4. Rollback should: delete new app, restore old app, restore old instance + oldApp := v2.App{ + Key: v2.AppKey{ + Organization: "test-org", + Name: "test-app", + Version: "1.0.0", + }, + Deployment: "kubernetes", + DeploymentManifest: "old-manifest-content", + } + + oldInstance := v2.AppInstance{ + Key: v2.AppInstanceKey{ + Organization: "test-org", + Name: "test-app-1.0.0-instance", + CloudletKey: v2.CloudletKey{ + Organization: "test-cloudlet-org", + Name: "test-cloudlet", + }, + }, + AppKey: v2.AppKey{ + Organization: "test-org", + Name: "test-app", + Version: "1.0.0", + }, + Flavor: v2.Flavor{Name: "small"}, + } + + result := &ExecutionResult{ + Plan: plan, + // Completed actions: new app was created before failure + CompletedActions: []ActionResult{ + { + Type: ActionCreate, + Target: "test-app", + Success: true, + }, + }, + // Failed action: new instance creation failed + FailedActions: []ActionResult{ + { + Type: ActionCreate, + Target: "test-app-1.0.0-instance", + Success: false, + }, + }, + // Backup of deleted resources + DeletedAppBackup: &AppBackup{ + App: oldApp, + Region: "US", + ManifestContent: "old-manifest-content", + }, + DeletedInstancesBackup: []InstanceBackup{ + { + Instance: oldInstance, + Region: "US", + }, + }, + } + + // Mock rollback operations in order: + // 1. Delete newly created app (rollback create) + mockClient.On("DeleteApp", mock.Anything, mock.AnythingOfType("v2.AppKey"), "US"). + Return(nil).Once() + + // 2. Restore old app (from backup) + mockClient.On("CreateApp", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInput) bool { + return input.App.Key.Name == "test-app" && input.App.DeploymentManifest == "old-manifest-content" + })).Return(nil).Once() + + // 3. Restore old instance (from backup) + mockClient.On("CreateAppInstance", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInstanceInput) bool { + return input.AppInst.Key.Name == "test-app-1.0.0-instance" + })).Return(nil).Once() + + ctx := context.Background() + err := manager.RollbackDeployment(ctx, result) + + require.NoError(t, err) + mockClient.AssertExpectations(t) + + // Verify rollback was logged + assert.Greater(t, len(logger.messages), 0) + // Should have messages about rolling back created resources and restoring deleted resources + hasRestoreLog := false + for _, msg := range logger.messages { + if strings.Contains(msg, "Restoring deleted resources") { + hasRestoreLog = true + break + } + } + assert.True(t, hasRestoreLog, "Should log restoration of deleted resources") +} + func TestConvertNetworkRules(t *testing.T) { network := &config.NetworkConfig{ OutboundConnections: []config.OutboundConnection{ diff --git a/internal/apply/v2/strategy_recreate.go b/internal/apply/v2/strategy_recreate.go index 89c9c56..4d81029 100644 --- a/internal/apply/v2/strategy_recreate.go +++ b/internal/apply/v2/strategy_recreate.go @@ -159,6 +159,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo return nil } + // Backup instances before deleting them (for rollback restoration) + r.logf("Backing up %d existing instances before deletion", len(instancesToDelete)) + for _, action := range instancesToDelete { + backup, err := r.backupInstance(ctx, action, config) + if err != nil { + r.logf("Warning: failed to backup instance %s before deletion: %v", action.InstanceName, err) + // Continue with deletion even if backup fails - this is best effort + } else { + result.DeletedInstancesBackup = append(result.DeletedInstancesBackup, *backup) + r.logf("Backed up instance: %s", action.InstanceName) + } + } + deleteResults := r.executeInstanceActionsWithRetry(ctx, instancesToDelete, "delete", config) for _, deleteResult := range deleteResults { @@ -172,6 +185,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo } r.logf("Phase 1 complete: deleted %d instances", len(deleteResults)) + + // Wait for Kubernetes namespace termination to complete + // This prevents "namespace is being terminated" errors when recreating instances + if len(deleteResults) > 0 { + waitTime := 5 * time.Second + r.logf("Waiting %v for namespace termination to complete...", waitTime) + select { + case <-time.After(waitTime): + case <-ctx.Done(): + return ctx.Err() + } + } + return nil } @@ -184,6 +210,17 @@ func (r *RecreateStrategy) deleteAppPhase(ctx context.Context, plan *DeploymentP r.logf("Phase 2: Deleting existing application") + // Backup app before deleting it (for rollback restoration) + r.logf("Backing up existing app before deletion") + backup, err := r.backupApp(ctx, plan, config) + if err != nil { + r.logf("Warning: failed to backup app before deletion: %v", err) + // Continue with deletion even if backup fails - this is best effort + } else { + result.DeletedAppBackup = backup + r.logf("Backed up app: %s", plan.AppAction.Desired.Name) + } + appKey := v2.AppKey{ Organization: plan.AppAction.Desired.Organization, Name: plan.AppAction.Desired.Name, @@ -516,6 +553,52 @@ func (r *RecreateStrategy) updateApplication(ctx context.Context, action AppActi return true, nil } +// backupApp fetches and stores the current app state before deletion +func (r *RecreateStrategy) backupApp(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig) (*AppBackup, error) { + appKey := v2.AppKey{ + Organization: plan.AppAction.Desired.Organization, + Name: plan.AppAction.Desired.Name, + Version: plan.AppAction.Desired.Version, + } + + app, err := r.client.ShowApp(ctx, appKey, plan.AppAction.Desired.Region) + if err != nil { + return nil, fmt.Errorf("failed to fetch app for backup: %w", err) + } + + backup := &AppBackup{ + App: app, + Region: plan.AppAction.Desired.Region, + ManifestContent: app.DeploymentManifest, + } + + return backup, nil +} + +// backupInstance fetches and stores the current instance state before deletion +func (r *RecreateStrategy) backupInstance(ctx context.Context, action InstanceAction, config *config.EdgeConnectConfig) (*InstanceBackup, error) { + instanceKey := v2.AppInstanceKey{ + Organization: action.Desired.Organization, + Name: action.InstanceName, + CloudletKey: v2.CloudletKey{ + Organization: action.Target.CloudletOrg, + Name: action.Target.CloudletName, + }, + } + + instance, err := r.client.ShowAppInstance(ctx, instanceKey, action.Target.Region) + if err != nil { + return nil, fmt.Errorf("failed to fetch instance for backup: %w", err) + } + + backup := &InstanceBackup{ + Instance: instance, + Region: action.Target.Region, + } + + return backup, nil +} + // logf logs a message if a logger is configured func (r *RecreateStrategy) logf(format string, v ...interface{}) { if r.logger != nil { @@ -530,6 +613,14 @@ func isRetryableError(err error) bool { return false } + errStr := strings.ToLower(err.Error()) + + // Special case: Kubernetes namespace termination race condition + // This is a transient 400 error that should be retried + if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") { + return true + } + // Check if it's an APIError with a status code var apiErr *v2.APIError if errors.As(err, &apiErr) { diff --git a/internal/apply/v2/types.go b/internal/apply/v2/types.go index ae52420..26d998e 100644 --- a/internal/apply/v2/types.go +++ b/internal/apply/v2/types.go @@ -271,6 +271,12 @@ type ExecutionResult struct { // RollbackSuccess indicates if rollback was successful RollbackSuccess bool + + // DeletedAppBackup stores the app that was deleted (for rollback restoration) + DeletedAppBackup *AppBackup + + // DeletedInstancesBackup stores instances that were deleted (for rollback restoration) + DeletedInstancesBackup []InstanceBackup } // ActionResult represents the result of executing a single action @@ -294,6 +300,27 @@ type ActionResult struct { Details string } +// AppBackup stores a deleted app's complete state for rollback restoration +type AppBackup struct { + // App is the full app object that was deleted + App v2.App + + // Region where the app was deployed + Region string + + // ManifestContent is the deployment manifest content + ManifestContent string +} + +// InstanceBackup stores a deleted instance's complete state for rollback restoration +type InstanceBackup struct { + // Instance is the full instance object that was deleted + Instance v2.AppInstance + + // Region where the instance was deployed + Region string +} + // IsEmpty returns true if the deployment plan has no actions to perform func (dp *DeploymentPlan) IsEmpty() bool { if dp.AppAction.Type != ActionNone { diff --git a/sdk/examples/comprehensive/k8s-deployment.yaml b/sdk/examples/comprehensive/k8s-deployment.yaml index 2a0a741..dff3649 100644 --- a/sdk/examples/comprehensive/k8s-deployment.yaml +++ b/sdk/examples/comprehensive/k8s-deployment.yaml @@ -18,6 +18,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: edgeconnect-coder-deployment + #namespace: gitea spec: replicas: 1 selector: