fix(deploy): Fixed glitch when updating an app inst with an invalid manifest
All checks were successful
test / test (push) Successful in 16s

This commit is contained in:
Richard Robert Reitz 2025-10-22 10:31:03 +02:00
parent 26ba07200e
commit f3cbfa3723
5 changed files with 374 additions and 1 deletions

View file

@ -4,7 +4,9 @@ package v2
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"strings"
"time" "time"
"edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config" "edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config"
@ -204,7 +206,8 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re
rollbackErrors := []error{} rollbackErrors := []error{}
// Rollback completed instances (in reverse order) // Phase 1: Delete resources that were created in this deployment attempt (in reverse order)
rm.logf("Phase 1: Rolling back created resources")
for i := len(result.CompletedActions) - 1; i >= 0; i-- { for i := len(result.CompletedActions) - 1; i >= 0; i-- {
action := result.CompletedActions[i] action := result.CompletedActions[i]
@ -218,6 +221,32 @@ func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, re
} }
} }
// Phase 2: Restore resources that were deleted before the failed deployment
// This is critical for RecreateStrategy which deletes everything before recreating
if result.DeletedAppBackup != nil || len(result.DeletedInstancesBackup) > 0 {
rm.logf("Phase 2: Restoring deleted resources")
// Restore app first (must exist before instances can be created)
if result.DeletedAppBackup != nil {
if err := rm.restoreApp(ctx, result.DeletedAppBackup); err != nil {
rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore app: %w", err))
rm.logf("Failed to restore app: %v", err)
} else {
rm.logf("Successfully restored app: %s", result.DeletedAppBackup.App.Key.Name)
}
}
// Restore instances
for _, backup := range result.DeletedInstancesBackup {
if err := rm.restoreInstance(ctx, &backup); err != nil {
rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore instance %s: %w", backup.Instance.Key.Name, err))
rm.logf("Failed to restore instance %s: %v", backup.Instance.Key.Name, err)
} else {
rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
}
}
}
if len(rollbackErrors) > 0 { if len(rollbackErrors) > 0 {
return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors) return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors)
} }
@ -278,6 +307,125 @@ func (rm *EdgeConnectResourceManager) rollbackInstance(ctx context.Context, acti
return fmt.Errorf("instance action not found for rollback: %s", action.Target) return fmt.Errorf("instance action not found for rollback: %s", action.Target)
} }
// restoreApp recreates an app that was deleted during deployment
func (rm *EdgeConnectResourceManager) restoreApp(ctx context.Context, backup *AppBackup) error {
rm.logf("Restoring app: %s/%s version %s",
backup.App.Key.Organization, backup.App.Key.Name, backup.App.Key.Version)
// Build a clean app input with only creation-safe fields
// We must exclude read-only fields like CreatedAt, UpdatedAt, etc.
appInput := &v2.NewAppInput{
Region: backup.Region,
App: v2.App{
Key: backup.App.Key,
Deployment: backup.App.Deployment,
ImageType: backup.App.ImageType,
ImagePath: backup.App.ImagePath,
AllowServerless: backup.App.AllowServerless,
DefaultFlavor: backup.App.DefaultFlavor,
ServerlessConfig: backup.App.ServerlessConfig,
DeploymentManifest: backup.App.DeploymentManifest,
DeploymentGenerator: backup.App.DeploymentGenerator,
RequiredOutboundConnections: backup.App.RequiredOutboundConnections,
// Explicitly omit read-only fields like CreatedAt, UpdatedAt, Fields, etc.
},
}
if err := rm.client.CreateApp(ctx, appInput); err != nil {
return fmt.Errorf("failed to restore app: %w", err)
}
rm.logf("Successfully restored app: %s", backup.App.Key.Name)
return nil
}
// restoreInstance recreates an instance that was deleted during deployment
func (rm *EdgeConnectResourceManager) restoreInstance(ctx context.Context, backup *InstanceBackup) error {
rm.logf("Restoring instance: %s on %s:%s",
backup.Instance.Key.Name,
backup.Instance.Key.CloudletKey.Organization,
backup.Instance.Key.CloudletKey.Name)
// Build a clean instance input with only creation-safe fields
// We must exclude read-only fields like CloudletLoc, CreatedAt, etc.
instanceInput := &v2.NewAppInstanceInput{
Region: backup.Region,
AppInst: v2.AppInstance{
Key: backup.Instance.Key,
AppKey: backup.Instance.AppKey,
Flavor: backup.Instance.Flavor,
// Explicitly omit read-only fields like CloudletLoc, State, PowerState, CreatedAt, etc.
},
}
// Retry logic to handle namespace termination race conditions
maxRetries := 5
retryDelay := 10 * time.Second
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
rm.logf("Retrying instance restore %s (attempt %d/%d)", backup.Instance.Key.Name, attempt, maxRetries)
select {
case <-time.After(retryDelay):
case <-ctx.Done():
return ctx.Err()
}
}
err := rm.client.CreateAppInstance(ctx, instanceInput)
if err == nil {
rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
return nil
}
lastErr = err
// Check if error is retryable
if !rm.isRetryableError(err) {
rm.logf("Failed to restore instance %s: %v (non-retryable error, giving up)", backup.Instance.Key.Name, err)
return fmt.Errorf("failed to restore instance: %w", err)
}
if attempt < maxRetries {
rm.logf("Failed to restore instance %s: %v (will retry)", backup.Instance.Key.Name, err)
}
}
return fmt.Errorf("failed to restore instance after %d attempts: %w", maxRetries+1, lastErr)
}
// isRetryableError determines if an error should be retried
func (rm *EdgeConnectResourceManager) isRetryableError(err error) bool {
if err == nil {
return false
}
errStr := strings.ToLower(err.Error())
// Special case: Kubernetes namespace termination race condition
// This is a transient 400 error that should be retried
if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") {
return true
}
// Check if it's an APIError with a status code
var apiErr *v2.APIError
if errors.As(err, &apiErr) {
// Don't retry client errors (4xx)
if apiErr.StatusCode >= 400 && apiErr.StatusCode < 500 {
return false
}
// Retry server errors (5xx)
if apiErr.StatusCode >= 500 {
return true
}
}
// Retry all other errors (network issues, timeouts, etc.)
return true
}
// logf logs a message if a logger is configured // logf logs a message if a logger is configured
func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) { func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) {
if rm.logger != nil { if rm.logger != nil {

View file

@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"testing" "testing"
"time" "time"
@ -464,6 +465,111 @@ func TestRollbackDeploymentFailure(t *testing.T) {
mockClient.AssertExpectations(t) mockClient.AssertExpectations(t)
} }
func TestRollbackDeploymentWithRestore(t *testing.T) {
mockClient := &MockResourceClient{}
logger := &TestLogger{}
manager := NewResourceManager(mockClient, WithLogger(logger), WithStrategyConfig(createTestStrategyConfig()))
plan := createTestDeploymentPlan()
// Simulate a RecreateStrategy scenario:
// 1. Old app and instance were deleted and backed up
// 2. New app was created successfully
// 3. New instance creation failed
// 4. Rollback should: delete new app, restore old app, restore old instance
oldApp := v2.App{
Key: v2.AppKey{
Organization: "test-org",
Name: "test-app",
Version: "1.0.0",
},
Deployment: "kubernetes",
DeploymentManifest: "old-manifest-content",
}
oldInstance := v2.AppInstance{
Key: v2.AppInstanceKey{
Organization: "test-org",
Name: "test-app-1.0.0-instance",
CloudletKey: v2.CloudletKey{
Organization: "test-cloudlet-org",
Name: "test-cloudlet",
},
},
AppKey: v2.AppKey{
Organization: "test-org",
Name: "test-app",
Version: "1.0.0",
},
Flavor: v2.Flavor{Name: "small"},
}
result := &ExecutionResult{
Plan: plan,
// Completed actions: new app was created before failure
CompletedActions: []ActionResult{
{
Type: ActionCreate,
Target: "test-app",
Success: true,
},
},
// Failed action: new instance creation failed
FailedActions: []ActionResult{
{
Type: ActionCreate,
Target: "test-app-1.0.0-instance",
Success: false,
},
},
// Backup of deleted resources
DeletedAppBackup: &AppBackup{
App: oldApp,
Region: "US",
ManifestContent: "old-manifest-content",
},
DeletedInstancesBackup: []InstanceBackup{
{
Instance: oldInstance,
Region: "US",
},
},
}
// Mock rollback operations in order:
// 1. Delete newly created app (rollback create)
mockClient.On("DeleteApp", mock.Anything, mock.AnythingOfType("v2.AppKey"), "US").
Return(nil).Once()
// 2. Restore old app (from backup)
mockClient.On("CreateApp", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInput) bool {
return input.App.Key.Name == "test-app" && input.App.DeploymentManifest == "old-manifest-content"
})).Return(nil).Once()
// 3. Restore old instance (from backup)
mockClient.On("CreateAppInstance", mock.Anything, mock.MatchedBy(func(input *v2.NewAppInstanceInput) bool {
return input.AppInst.Key.Name == "test-app-1.0.0-instance"
})).Return(nil).Once()
ctx := context.Background()
err := manager.RollbackDeployment(ctx, result)
require.NoError(t, err)
mockClient.AssertExpectations(t)
// Verify rollback was logged
assert.Greater(t, len(logger.messages), 0)
// Should have messages about rolling back created resources and restoring deleted resources
hasRestoreLog := false
for _, msg := range logger.messages {
if strings.Contains(msg, "Restoring deleted resources") {
hasRestoreLog = true
break
}
}
assert.True(t, hasRestoreLog, "Should log restoration of deleted resources")
}
func TestConvertNetworkRules(t *testing.T) { func TestConvertNetworkRules(t *testing.T) {
network := &config.NetworkConfig{ network := &config.NetworkConfig{
OutboundConnections: []config.OutboundConnection{ OutboundConnections: []config.OutboundConnection{

View file

@ -159,6 +159,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo
return nil return nil
} }
// Backup instances before deleting them (for rollback restoration)
r.logf("Backing up %d existing instances before deletion", len(instancesToDelete))
for _, action := range instancesToDelete {
backup, err := r.backupInstance(ctx, action, config)
if err != nil {
r.logf("Warning: failed to backup instance %s before deletion: %v", action.InstanceName, err)
// Continue with deletion even if backup fails - this is best effort
} else {
result.DeletedInstancesBackup = append(result.DeletedInstancesBackup, *backup)
r.logf("Backed up instance: %s", action.InstanceName)
}
}
deleteResults := r.executeInstanceActionsWithRetry(ctx, instancesToDelete, "delete", config) deleteResults := r.executeInstanceActionsWithRetry(ctx, instancesToDelete, "delete", config)
for _, deleteResult := range deleteResults { for _, deleteResult := range deleteResults {
@ -172,6 +185,19 @@ func (r *RecreateStrategy) deleteInstancesPhase(ctx context.Context, plan *Deplo
} }
r.logf("Phase 1 complete: deleted %d instances", len(deleteResults)) r.logf("Phase 1 complete: deleted %d instances", len(deleteResults))
// Wait for Kubernetes namespace termination to complete
// This prevents "namespace is being terminated" errors when recreating instances
if len(deleteResults) > 0 {
waitTime := 5 * time.Second
r.logf("Waiting %v for namespace termination to complete...", waitTime)
select {
case <-time.After(waitTime):
case <-ctx.Done():
return ctx.Err()
}
}
return nil return nil
} }
@ -184,6 +210,17 @@ func (r *RecreateStrategy) deleteAppPhase(ctx context.Context, plan *DeploymentP
r.logf("Phase 2: Deleting existing application") r.logf("Phase 2: Deleting existing application")
// Backup app before deleting it (for rollback restoration)
r.logf("Backing up existing app before deletion")
backup, err := r.backupApp(ctx, plan, config)
if err != nil {
r.logf("Warning: failed to backup app before deletion: %v", err)
// Continue with deletion even if backup fails - this is best effort
} else {
result.DeletedAppBackup = backup
r.logf("Backed up app: %s", plan.AppAction.Desired.Name)
}
appKey := v2.AppKey{ appKey := v2.AppKey{
Organization: plan.AppAction.Desired.Organization, Organization: plan.AppAction.Desired.Organization,
Name: plan.AppAction.Desired.Name, Name: plan.AppAction.Desired.Name,
@ -516,6 +553,52 @@ func (r *RecreateStrategy) updateApplication(ctx context.Context, action AppActi
return true, nil return true, nil
} }
// backupApp fetches and stores the current app state before deletion
func (r *RecreateStrategy) backupApp(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig) (*AppBackup, error) {
appKey := v2.AppKey{
Organization: plan.AppAction.Desired.Organization,
Name: plan.AppAction.Desired.Name,
Version: plan.AppAction.Desired.Version,
}
app, err := r.client.ShowApp(ctx, appKey, plan.AppAction.Desired.Region)
if err != nil {
return nil, fmt.Errorf("failed to fetch app for backup: %w", err)
}
backup := &AppBackup{
App: app,
Region: plan.AppAction.Desired.Region,
ManifestContent: app.DeploymentManifest,
}
return backup, nil
}
// backupInstance fetches and stores the current instance state before deletion
func (r *RecreateStrategy) backupInstance(ctx context.Context, action InstanceAction, config *config.EdgeConnectConfig) (*InstanceBackup, error) {
instanceKey := v2.AppInstanceKey{
Organization: action.Desired.Organization,
Name: action.InstanceName,
CloudletKey: v2.CloudletKey{
Organization: action.Target.CloudletOrg,
Name: action.Target.CloudletName,
},
}
instance, err := r.client.ShowAppInstance(ctx, instanceKey, action.Target.Region)
if err != nil {
return nil, fmt.Errorf("failed to fetch instance for backup: %w", err)
}
backup := &InstanceBackup{
Instance: instance,
Region: action.Target.Region,
}
return backup, nil
}
// logf logs a message if a logger is configured // logf logs a message if a logger is configured
func (r *RecreateStrategy) logf(format string, v ...interface{}) { func (r *RecreateStrategy) logf(format string, v ...interface{}) {
if r.logger != nil { if r.logger != nil {
@ -530,6 +613,14 @@ func isRetryableError(err error) bool {
return false return false
} }
errStr := strings.ToLower(err.Error())
// Special case: Kubernetes namespace termination race condition
// This is a transient 400 error that should be retried
if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") {
return true
}
// Check if it's an APIError with a status code // Check if it's an APIError with a status code
var apiErr *v2.APIError var apiErr *v2.APIError
if errors.As(err, &apiErr) { if errors.As(err, &apiErr) {

View file

@ -271,6 +271,12 @@ type ExecutionResult struct {
// RollbackSuccess indicates if rollback was successful // RollbackSuccess indicates if rollback was successful
RollbackSuccess bool RollbackSuccess bool
// DeletedAppBackup stores the app that was deleted (for rollback restoration)
DeletedAppBackup *AppBackup
// DeletedInstancesBackup stores instances that were deleted (for rollback restoration)
DeletedInstancesBackup []InstanceBackup
} }
// ActionResult represents the result of executing a single action // ActionResult represents the result of executing a single action
@ -294,6 +300,27 @@ type ActionResult struct {
Details string Details string
} }
// AppBackup stores a deleted app's complete state for rollback restoration
type AppBackup struct {
// App is the full app object that was deleted
App v2.App
// Region where the app was deployed
Region string
// ManifestContent is the deployment manifest content
ManifestContent string
}
// InstanceBackup stores a deleted instance's complete state for rollback restoration
type InstanceBackup struct {
// Instance is the full instance object that was deleted
Instance v2.AppInstance
// Region where the instance was deployed
Region string
}
// IsEmpty returns true if the deployment plan has no actions to perform // IsEmpty returns true if the deployment plan has no actions to perform
func (dp *DeploymentPlan) IsEmpty() bool { func (dp *DeploymentPlan) IsEmpty() bool {
if dp.AppAction.Type != ActionNone { if dp.AppAction.Type != ActionNone {

View file

@ -18,6 +18,7 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
name: edgeconnect-coder-deployment name: edgeconnect-coder-deployment
#namespace: gitea
spec: spec:
replicas: 1 replicas: 1
selector: selector: