edge-connect-client/internal/apply/v2/manager.go
2025-11-17 14:40:47 +01:00

434 lines
14 KiB
Go

// ABOUTME: Resource management for EdgeConnect apply command with deployment execution and rollback
// ABOUTME: Handles actual deployment operations, manifest processing, and error recovery with parallel execution
package v2
import (
"context"
"errors"
"fmt"
"strings"
"time"
"edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config"
v2 "edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/sdk/edgeconnect/v2"
)
// ResourceManagerInterface defines the operations needed to apply a deployment
// plan and to undo a failed attempt. NewResourceManager returns a value of this
// interface type backed by EdgeConnectResourceManager.
type ResourceManagerInterface interface {
// ApplyDeployment executes a deployment plan using the given configuration
// and manifest content, returning the execution result and any error.
ApplyDeployment(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig, manifestContent string) (*ExecutionResult, error)
// RollbackDeployment attempts to rollback the failed deployment described
// by result.
RollbackDeployment(ctx context.Context, result *ExecutionResult) error
// ValidatePrerequisites checks if the prerequisites for executing plan are met.
ValidatePrerequisites(ctx context.Context, plan *DeploymentPlan) error
}
// EdgeConnectResourceManager implements resource management for EdgeConnect.
// Instances are built by NewResourceManager from a client and the resolved
// ResourceManagerOptions.
type EdgeConnectResourceManager struct {
// client performs the EdgeConnect API calls (creating and deleting apps and
// app instances).
client EdgeConnectClientInterface
// parallelLimit comes from ResourceManagerOptions.ParallelLimit; a value
// greater than 1 turns on ParallelOperations in the strategy configuration.
parallelLimit int
// rollbackOnFail makes ApplyDeployment attempt a rollback when the strategy
// execution returns an error.
rollbackOnFail bool
// logger receives progress messages; when nil, logging is skipped.
logger Logger
// strategyConfig is the base configuration passed to the strategy factory.
strategyConfig StrategyConfig
}
// Logger is the minimal logging interface used for deployment logging.
// Its single method has the same signature as the standard library's
// (*log.Logger).Printf.
type Logger interface {
// Printf logs a message built from format and its arguments.
Printf(format string, v ...interface{})
}
// ResourceManagerOptions configures the resource manager behavior.
// Start from DefaultResourceManagerOptions and adjust individual fields through
// the With* functional options passed to NewResourceManager.
type ResourceManagerOptions struct {
// ParallelLimit controls how many operations run concurrently
ParallelLimit int
// RollbackOnFail automatically rolls back on deployment failure
RollbackOnFail bool
// Logger for deployment operations; leaving it nil disables logging
Logger Logger
// Timeout for individual operations
// NOTE(review): NewResourceManager in this file does not copy this value
// into the manager - confirm whether it is consumed anywhere else.
OperationTimeout time.Duration
// StrategyConfig for deployment strategies
StrategyConfig StrategyConfig
}
// DefaultResourceManagerOptions returns the baseline options that
// NewResourceManager starts from: a parallel limit of 5, rollback on failure
// enabled, a two minute operation timeout, the default strategy configuration
// and no logger.
func DefaultResourceManagerOptions() ResourceManagerOptions {
	var defaults ResourceManagerOptions

	defaults.ParallelLimit = 5 // Conservative parallel limit
	defaults.RollbackOnFail = true
	defaults.OperationTimeout = 2 * time.Minute
	defaults.StrategyConfig = DefaultStrategyConfig()

	return defaults
}
// NewResourceManager creates a new EdgeConnect resource manager.
//
// The defaults from DefaultResourceManagerOptions are taken first and every
// functional option in opts is then applied in the order given, so later
// options override earlier ones.
func NewResourceManager(client EdgeConnectClientInterface, opts ...func(*ResourceManagerOptions)) ResourceManagerInterface {
	settings := DefaultResourceManagerOptions()
	for i := range opts {
		opts[i](&settings)
	}

	manager := new(EdgeConnectResourceManager)
	manager.client = client
	manager.parallelLimit = settings.ParallelLimit
	manager.rollbackOnFail = settings.RollbackOnFail
	manager.logger = settings.Logger
	manager.strategyConfig = settings.StrategyConfig
	return manager
}
// WithParallelLimit returns an option that sets the parallel execution limit
// to limit.
func WithParallelLimit(limit int) func(*ResourceManagerOptions) {
	apply := func(target *ResourceManagerOptions) {
		target.ParallelLimit = limit
	}
	return apply
}
// WithRollbackOnFail returns an option that enables or disables automatic
// rollback after a failed deployment.
func WithRollbackOnFail(rollback bool) func(*ResourceManagerOptions) {
	apply := func(target *ResourceManagerOptions) {
		target.RollbackOnFail = rollback
	}
	return apply
}
// WithLogger returns an option that installs logger as the destination for
// deployment log messages.
func WithLogger(logger Logger) func(*ResourceManagerOptions) {
	apply := func(target *ResourceManagerOptions) {
		target.Logger = logger
	}
	return apply
}
// WithStrategyConfig returns an option that replaces the strategy
// configuration with config.
func WithStrategyConfig(config StrategyConfig) func(*ResourceManagerOptions) {
	apply := func(target *ResourceManagerOptions) {
		target.StrategyConfig = config
	}
	return apply
}
// ApplyDeployment executes a deployment plan using the deployment strategy
// named by config.Spec.GetDeploymentStrategy().
//
// The call validates prerequisites, creates the strategy through a strategy
// factory, lets the strategy validate the plan and then executes it. When a
// step before execution fails, the returned ExecutionResult carries empty
// action lists and an Error that wraps the cause with a step description; the
// returned error is the unwrapped cause.
//
// If execution fails, automatic rollback is enabled and the strategy returned
// a result, RollbackDeployment is invoked on that result. RollbackPerformed is
// set whenever a rollback was attempted, and RollbackSuccess reports whether
// it finished without error, so callers can tell "not attempted" apart from
// "attempted but failed".
//
// A nil plan or nil config is reported as an error instead of panicking.
func (rm *EdgeConnectResourceManager) ApplyDeployment(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig, manifestContent string) (*ExecutionResult, error) {
	// aborted builds the return values for a deployment that stops before any
	// action has been executed.
	aborted := func(step string, cause error) (*ExecutionResult, error) {
		return &ExecutionResult{
			Plan:             plan,
			CompletedActions: []ActionResult{},
			FailedActions:    []ActionResult{},
			Error:            fmt.Errorf("%s: %w", step, cause),
			Duration:         0,
		}, cause
	}

	// Guard before the first dereference of plan (the log line below).
	if plan == nil {
		return aborted("prerequisites validation failed", fmt.Errorf("deployment plan is nil"))
	}

	rm.logf("Starting deployment: %s", plan.ConfigName)

	// Step 1: Validate prerequisites
	if err := rm.ValidatePrerequisites(ctx, plan); err != nil {
		return aborted("prerequisites validation failed", err)
	}

	// The strategy name is read from the configuration, so it must be present.
	if config == nil {
		return aborted("prerequisites validation failed", fmt.Errorf("deployment configuration is nil"))
	}

	// Step 2: Determine deployment strategy
	strategyName := DeploymentStrategy(config.Spec.GetDeploymentStrategy())
	rm.logf("Using deployment strategy: %s", strategyName)

	// Step 3: Create strategy executor
	strategyConfig := rm.strategyConfig
	strategyConfig.ParallelOperations = rm.parallelLimit > 1
	factory := NewStrategyFactory(rm.client, strategyConfig, rm.logger)
	strategy, err := factory.CreateStrategy(strategyName)
	if err != nil {
		return aborted("failed to create deployment strategy", err)
	}

	// Step 4: Validate strategy can handle this deployment
	if err := strategy.Validate(plan); err != nil {
		return aborted("strategy validation failed", err)
	}

	// Step 5: Execute the deployment strategy
	rm.logf("Estimated deployment duration: %v", strategy.EstimateDuration(plan))
	result, err := strategy.Execute(ctx, plan, config, manifestContent)

	// Step 6: Handle rollback if needed
	if err != nil && rm.rollbackOnFail && result != nil {
		rm.logf("Deployment failed, attempting rollback...")
		rollbackErr := rm.RollbackDeployment(ctx, result)

		// Record the attempt regardless of its outcome; previously a failed
		// rollback left both flags false and looked like no rollback at all.
		result.RollbackPerformed = true
		result.RollbackSuccess = rollbackErr == nil
		if rollbackErr != nil {
			rm.logf("Rollback failed: %v", rollbackErr)
		}
	}

	if result != nil && result.Success {
		rm.logf("Deployment completed successfully in %v", result.Duration)
	}
	return result, err
}
// ValidatePrerequisites checks if deployment prerequisites are met.
//
// It returns an error when plan is nil, when the plan contains no actions, or
// when the manager has no EdgeConnect client. ctx is accepted to satisfy
// ResourceManagerInterface and is not used by the current checks.
func (rm *EdgeConnectResourceManager) ValidatePrerequisites(ctx context.Context, plan *DeploymentPlan) error {
	// Reject a nil plan up front; the log line below dereferences it.
	if plan == nil {
		return fmt.Errorf("deployment plan is nil")
	}

	rm.logf("Validating deployment prerequisites for: %s", plan.ConfigName)

	// Check if we have any actions to perform
	if plan.IsEmpty() {
		return fmt.Errorf("deployment plan is empty - no actions to perform")
	}

	// Validate that we have required client capabilities
	if rm.client == nil {
		return fmt.Errorf("EdgeConnect client is not configured")
	}

	rm.logf("Prerequisites validation passed")
	return nil
}
// RollbackDeployment attempts to rollback a failed deployment.
//
// Phase 1 walks result.CompletedActions in reverse order and deletes every
// resource that a completed create action produced. Phase 2 recreates the
// resources recorded in result.DeletedAppBackup and
// result.DeletedInstancesBackup, app first because instances depend on it.
//
// Individual failures do not stop the rollback; they are collected and
// reported together in the returned error. A nil result, or a result without
// a plan, is rejected with an error because phase 1 needs the plan to locate
// the created resources.
func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, result *ExecutionResult) error {
	if result == nil || result.Plan == nil {
		return fmt.Errorf("cannot rollback: execution result or its plan is nil")
	}

	rm.logf("Starting rollback for deployment: %s", result.Plan.ConfigName)

	var rollbackErrors []error

	// Phase 1: Delete resources that were created in this deployment attempt (in reverse order)
	rm.logf("Phase 1: Rolling back created resources")
	for i := len(result.CompletedActions) - 1; i >= 0; i-- {
		action := result.CompletedActions[i]
		if action.Type != ActionCreate {
			continue
		}
		if err := rm.rollbackCreateAction(ctx, action, result.Plan); err != nil {
			rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to rollback %s: %w", action.Target, err))
			continue
		}
		rm.logf("Successfully rolled back: %s", action.Target)
	}

	// Phase 2: Restore resources that were deleted before the failed deployment
	// This is critical for RecreateStrategy which deletes everything before recreating
	if result.DeletedAppBackup != nil || len(result.DeletedInstancesBackup) > 0 {
		rm.logf("Phase 2: Restoring deleted resources")

		// Restore app first (must exist before instances can be created)
		if result.DeletedAppBackup != nil {
			if err := rm.restoreApp(ctx, result.DeletedAppBackup); err != nil {
				rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore app: %w", err))
				rm.logf("Failed to restore app: %v", err)
			} else {
				rm.logf("Successfully restored app: %s", result.DeletedAppBackup.App.Key.Name)
			}
		}

		// Restore instances. Index into the slice instead of taking the
		// address of a range variable, which avoids aliasing the loop
		// variable and copying each backup.
		for i := range result.DeletedInstancesBackup {
			backup := &result.DeletedInstancesBackup[i]
			if err := rm.restoreInstance(ctx, backup); err != nil {
				rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore instance %s: %w", backup.Instance.Key.Name, err))
				rm.logf("Failed to restore instance %s: %v", backup.Instance.Key.Name, err)
			} else {
				rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
			}
		}
	}

	if len(rollbackErrors) > 0 {
		return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors)
	}

	rm.logf("Rollback completed successfully")
	return nil
}
// rollbackCreateAction undoes a completed CREATE action by deleting the
// resource it produced. Actions of any other type are ignored.
//
// The action is treated as an instance creation when its target equals the
// InstanceName of one of the plan's instance actions; otherwise it is treated
// as the app creation.
func (rm *EdgeConnectResourceManager) rollbackCreateAction(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
	if action.Type != ActionCreate {
		return nil
	}

	// A target matching a planned instance name means an instance was created.
	for _, candidate := range plan.InstanceActions {
		if candidate.InstanceName == action.Target {
			return rm.rollbackInstance(ctx, action, plan)
		}
	}

	// No instance matched, so the target is the application itself.
	return rm.rollbackApp(ctx, action, plan)
}
// rollbackApp deletes the application that this deployment created. The app
// key and region are taken from the plan's desired app state; action is not
// consulted.
func (rm *EdgeConnectResourceManager) rollbackApp(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
	desired := plan.AppAction.Desired

	var key v2.AppKey
	key.Organization = desired.Organization
	key.Name = desired.Name
	key.Version = desired.Version

	return rm.client.DeleteApp(ctx, key, desired.Region)
}
// rollbackInstance deletes the instance that this deployment created. The
// instance is looked up in the plan by matching action.Target against the
// instance name; an error is returned when no planned instance matches.
func (rm *EdgeConnectResourceManager) rollbackInstance(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
	for _, candidate := range plan.InstanceActions {
		if candidate.InstanceName != action.Target {
			continue
		}

		// Assemble the key from the desired app organization and the
		// cloudlet the instance was planned for.
		var key v2.AppInstanceKey
		key.Organization = plan.AppAction.Desired.Organization
		key.Name = candidate.InstanceName
		key.CloudletKey.Organization = candidate.Target.CloudletOrg
		key.CloudletKey.Name = candidate.Target.CloudletName

		return rm.client.DeleteAppInstance(ctx, key, candidate.Target.Region)
	}

	return fmt.Errorf("instance action not found for rollback: %s", action.Target)
}
// restoreApp recreates an app that was deleted during deployment, using the
// backup taken before the deletion. Only a fixed set of fields is copied from
// the backup into the creation request; every other field of the backed-up
// app is left at its zero value.
func (rm *EdgeConnectResourceManager) restoreApp(ctx context.Context, backup *AppBackup) error {
	saved := backup.App
	rm.logf("Restoring app: %s/%s version %s",
		saved.Key.Organization, saved.Key.Name, saved.Key.Version)

	// Copy only creation-safe fields. Read-only fields such as CreatedAt,
	// UpdatedAt and Fields are deliberately not carried over.
	var app v2.App
	app.Key = saved.Key
	app.Deployment = saved.Deployment
	app.ImageType = saved.ImageType
	app.ImagePath = saved.ImagePath
	app.AllowServerless = saved.AllowServerless
	app.DefaultFlavor = saved.DefaultFlavor
	app.ServerlessConfig = saved.ServerlessConfig
	app.DeploymentManifest = saved.DeploymentManifest
	app.DeploymentGenerator = saved.DeploymentGenerator
	app.RequiredOutboundConnections = saved.RequiredOutboundConnections

	request := &v2.NewAppInput{Region: backup.Region, App: app}
	if err := rm.client.CreateApp(ctx, request); err != nil {
		return fmt.Errorf("failed to restore app: %w", err)
	}

	rm.logf("Successfully restored app: %s", saved.Key.Name)
	return nil
}
// restoreInstance recreates an instance that was deleted during deployment.
//
// The creation request carries only the key, app key and flavor from the
// backup. Creation is tried once and then retried up to five more times with
// a ten second pause before each retry, as long as isRetryableError accepts
// the failure. A non-retryable failure ends the loop immediately, and a
// cancelled context during a pause returns the context's error.
func (rm *EdgeConnectResourceManager) restoreInstance(ctx context.Context, backup *InstanceBackup) error {
	const (
		maxRetries = 5
		retryDelay = 10 * time.Second
	)

	rm.logf("Restoring instance: %s on %s:%s",
		backup.Instance.Key.Name,
		backup.Instance.Key.CloudletKey.Organization,
		backup.Instance.Key.CloudletKey.Name)

	// Copy only creation-safe fields. Read-only fields such as CloudletLoc,
	// State, PowerState and CreatedAt are deliberately not carried over.
	var inst v2.AppInstance
	inst.Key = backup.Instance.Key
	inst.AppKey = backup.Instance.AppKey
	inst.Flavor = backup.Instance.Flavor
	request := &v2.NewAppInstanceInput{Region: backup.Region, AppInst: inst}

	// Retries cover the namespace termination race condition.
	var lastErr error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		if attempt > 0 {
			rm.logf("Retrying instance restore %s (attempt %d/%d)", backup.Instance.Key.Name, attempt, maxRetries)
			// Pause before the retry, but stop waiting if the context ends.
			select {
			case <-time.After(retryDelay):
			case <-ctx.Done():
				return ctx.Err()
			}
		}

		lastErr = rm.client.CreateAppInstance(ctx, request)
		switch {
		case lastErr == nil:
			rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
			return nil
		case !rm.isRetryableError(lastErr):
			rm.logf("Failed to restore instance %s: %v (non-retryable error, giving up)", backup.Instance.Key.Name, lastErr)
			return fmt.Errorf("failed to restore instance: %w", lastErr)
		case attempt < maxRetries:
			rm.logf("Failed to restore instance %s: %v (will retry)", backup.Instance.Key.Name, lastErr)
		}
	}

	return fmt.Errorf("failed to restore instance after %d attempts: %w", maxRetries+1, lastErr)
}
// isRetryableError determines if an error should be retried.
//
// A nil error is not retryable. An error whose message mentions a resource
// "being terminated" (case-insensitive) is always retryable, even when it is
// a 4xx response. Otherwise an APIError with a 4xx status code is not
// retryable, and everything else (5xx responses, network issues, timeouts)
// is.
func (rm *EdgeConnectResourceManager) isRetryableError(err error) bool {
	if err == nil {
		return false
	}

	// Kubernetes namespace termination race: a transient 400 worth retrying.
	// Matching "being terminated" also covers "is being terminated".
	if strings.Contains(strings.ToLower(err.Error()), "being terminated") {
		return true
	}

	// Client errors (4xx) will not succeed on a retry.
	var apiErr *v2.APIError
	if errors.As(err, &apiErr) && apiErr.StatusCode >= 400 && apiErr.StatusCode < 500 {
		return false
	}

	// Server errors (5xx) and all other errors are retried.
	return true
}
// logf writes a formatted message, prefixed with "[ResourceManager] ", to the
// configured logger. It does nothing when no logger is configured.
func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) {
	if rm.logger == nil {
		return
	}
	rm.logger.Printf("[ResourceManager] "+format, v...)
}