// ABOUTME: Resource management for EdgeConnect apply command with deployment execution and rollback
// ABOUTME: Handles actual deployment operations, manifest processing, and error recovery with parallel execution
package v2
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/internal/config"
|
|
v2 "edp.buildth.ing/DevFW-CICD/edge-connect-client/v2/sdk/edgeconnect/v2"
|
|
)
|
|
|
|
// ResourceManagerInterface defines the interface for resource management
|
|
type ResourceManagerInterface interface {
|
|
// ApplyDeployment executes a deployment plan
|
|
ApplyDeployment(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig, manifestContent string) (*ExecutionResult, error)
|
|
|
|
// RollbackDeployment attempts to rollback a failed deployment
|
|
RollbackDeployment(ctx context.Context, result *ExecutionResult) error
|
|
|
|
// ValidatePrerequisites checks if deployment prerequisites are met
|
|
ValidatePrerequisites(ctx context.Context, plan *DeploymentPlan) error
|
|
}
|
|
|
|
// EdgeConnectResourceManager implements resource management for EdgeConnect
|
|
type EdgeConnectResourceManager struct {
|
|
client EdgeConnectClientInterface
|
|
parallelLimit int
|
|
rollbackOnFail bool
|
|
logger Logger
|
|
strategyConfig StrategyConfig
|
|
}
|
|
|
|
// Logger is the minimal printf-style sink used for deployment logging. Any
// type with a matching Printf method satisfies it.
type Logger interface {
	// Printf records a message built from format and the arguments in v.
	Printf(format string, v ...interface{})
}
|
|
|
|
// ResourceManagerOptions configures the resource manager behavior
|
|
type ResourceManagerOptions struct {
|
|
// ParallelLimit controls how many operations run concurrently
|
|
ParallelLimit int
|
|
|
|
// RollbackOnFail automatically rolls back on deployment failure
|
|
RollbackOnFail bool
|
|
|
|
// Logger for deployment operations
|
|
Logger Logger
|
|
|
|
// Timeout for individual operations
|
|
OperationTimeout time.Duration
|
|
|
|
// StrategyConfig for deployment strategies
|
|
StrategyConfig StrategyConfig
|
|
}
|
|
|
|
// DefaultResourceManagerOptions returns sensible defaults
|
|
func DefaultResourceManagerOptions() ResourceManagerOptions {
|
|
return ResourceManagerOptions{
|
|
ParallelLimit: 5, // Conservative parallel limit
|
|
RollbackOnFail: true,
|
|
OperationTimeout: 2 * time.Minute,
|
|
StrategyConfig: DefaultStrategyConfig(),
|
|
}
|
|
}
|
|
|
|
// NewResourceManager creates a new EdgeConnect resource manager
|
|
func NewResourceManager(client EdgeConnectClientInterface, opts ...func(*ResourceManagerOptions)) ResourceManagerInterface {
|
|
options := DefaultResourceManagerOptions()
|
|
for _, opt := range opts {
|
|
opt(&options)
|
|
}
|
|
|
|
return &EdgeConnectResourceManager{
|
|
client: client,
|
|
parallelLimit: options.ParallelLimit,
|
|
rollbackOnFail: options.RollbackOnFail,
|
|
logger: options.Logger,
|
|
strategyConfig: options.StrategyConfig,
|
|
}
|
|
}
|
|
|
|
// WithParallelLimit sets the parallel execution limit
|
|
func WithParallelLimit(limit int) func(*ResourceManagerOptions) {
|
|
return func(opts *ResourceManagerOptions) {
|
|
opts.ParallelLimit = limit
|
|
}
|
|
}
|
|
|
|
// WithRollbackOnFail enables/disables automatic rollback
|
|
func WithRollbackOnFail(rollback bool) func(*ResourceManagerOptions) {
|
|
return func(opts *ResourceManagerOptions) {
|
|
opts.RollbackOnFail = rollback
|
|
}
|
|
}
|
|
|
|
// WithLogger sets a logger for deployment operations
|
|
func WithLogger(logger Logger) func(*ResourceManagerOptions) {
|
|
return func(opts *ResourceManagerOptions) {
|
|
opts.Logger = logger
|
|
}
|
|
}
|
|
|
|
// WithStrategyConfig sets the strategy configuration
|
|
func WithStrategyConfig(config StrategyConfig) func(*ResourceManagerOptions) {
|
|
return func(opts *ResourceManagerOptions) {
|
|
opts.StrategyConfig = config
|
|
}
|
|
}
|
|
|
|
// ApplyDeployment executes a deployment plan using deployment strategies
|
|
func (rm *EdgeConnectResourceManager) ApplyDeployment(ctx context.Context, plan *DeploymentPlan, config *config.EdgeConnectConfig, manifestContent string) (*ExecutionResult, error) {
|
|
rm.logf("Starting deployment: %s", plan.ConfigName)
|
|
|
|
// Step 1: Validate prerequisites
|
|
if err := rm.ValidatePrerequisites(ctx, plan); err != nil {
|
|
result := &ExecutionResult{
|
|
Plan: plan,
|
|
CompletedActions: []ActionResult{},
|
|
FailedActions: []ActionResult{},
|
|
Error: fmt.Errorf("prerequisites validation failed: %w", err),
|
|
Duration: 0,
|
|
}
|
|
return result, err
|
|
}
|
|
|
|
// Step 2: Determine deployment strategy
|
|
strategyName := DeploymentStrategy(config.Spec.GetDeploymentStrategy())
|
|
rm.logf("Using deployment strategy: %s", strategyName)
|
|
|
|
// Step 3: Create strategy executor
|
|
strategyConfig := rm.strategyConfig
|
|
strategyConfig.ParallelOperations = rm.parallelLimit > 1
|
|
|
|
factory := NewStrategyFactory(rm.client, strategyConfig, rm.logger)
|
|
strategy, err := factory.CreateStrategy(strategyName)
|
|
if err != nil {
|
|
result := &ExecutionResult{
|
|
Plan: plan,
|
|
CompletedActions: []ActionResult{},
|
|
FailedActions: []ActionResult{},
|
|
Error: fmt.Errorf("failed to create deployment strategy: %w", err),
|
|
Duration: 0,
|
|
}
|
|
return result, err
|
|
}
|
|
|
|
// Step 4: Validate strategy can handle this deployment
|
|
if err := strategy.Validate(plan); err != nil {
|
|
result := &ExecutionResult{
|
|
Plan: plan,
|
|
CompletedActions: []ActionResult{},
|
|
FailedActions: []ActionResult{},
|
|
Error: fmt.Errorf("strategy validation failed: %w", err),
|
|
Duration: 0,
|
|
}
|
|
return result, err
|
|
}
|
|
|
|
// Step 5: Execute the deployment strategy
|
|
rm.logf("Estimated deployment duration: %v", strategy.EstimateDuration(plan))
|
|
result, err := strategy.Execute(ctx, plan, config, manifestContent)
|
|
|
|
// Step 6: Handle rollback if needed
|
|
if err != nil && rm.rollbackOnFail && result != nil {
|
|
rm.logf("Deployment failed, attempting rollback...")
|
|
if rollbackErr := rm.RollbackDeployment(ctx, result); rollbackErr != nil {
|
|
rm.logf("Rollback failed: %v", rollbackErr)
|
|
} else {
|
|
result.RollbackPerformed = true
|
|
result.RollbackSuccess = true
|
|
}
|
|
}
|
|
|
|
if result != nil && result.Success {
|
|
rm.logf("Deployment completed successfully in %v", result.Duration)
|
|
}
|
|
|
|
return result, err
|
|
}
|
|
|
|
// ValidatePrerequisites checks if deployment prerequisites are met
|
|
func (rm *EdgeConnectResourceManager) ValidatePrerequisites(ctx context.Context, plan *DeploymentPlan) error {
|
|
rm.logf("Validating deployment prerequisites for: %s", plan.ConfigName)
|
|
|
|
// Check if we have any actions to perform
|
|
if plan.IsEmpty() {
|
|
return fmt.Errorf("deployment plan is empty - no actions to perform")
|
|
}
|
|
|
|
// Validate that we have required client capabilities
|
|
if rm.client == nil {
|
|
return fmt.Errorf("EdgeConnect client is not configured")
|
|
}
|
|
|
|
rm.logf("Prerequisites validation passed")
|
|
return nil
|
|
}
|
|
|
|
// RollbackDeployment attempts to rollback a failed deployment
|
|
func (rm *EdgeConnectResourceManager) RollbackDeployment(ctx context.Context, result *ExecutionResult) error {
|
|
rm.logf("Starting rollback for deployment: %s", result.Plan.ConfigName)
|
|
|
|
rollbackErrors := []error{}
|
|
|
|
// Phase 1: Delete resources that were created in this deployment attempt (in reverse order)
|
|
rm.logf("Phase 1: Rolling back created resources")
|
|
for i := len(result.CompletedActions) - 1; i >= 0; i-- {
|
|
action := result.CompletedActions[i]
|
|
|
|
switch action.Type {
|
|
case ActionCreate:
|
|
if err := rm.rollbackCreateAction(ctx, action, result.Plan); err != nil {
|
|
rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to rollback %s: %w", action.Target, err))
|
|
} else {
|
|
rm.logf("Successfully rolled back: %s", action.Target)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Phase 2: Restore resources that were deleted before the failed deployment
|
|
// This is critical for RecreateStrategy which deletes everything before recreating
|
|
if result.DeletedAppBackup != nil || len(result.DeletedInstancesBackup) > 0 {
|
|
rm.logf("Phase 2: Restoring deleted resources")
|
|
|
|
// Restore app first (must exist before instances can be created)
|
|
if result.DeletedAppBackup != nil {
|
|
if err := rm.restoreApp(ctx, result.DeletedAppBackup); err != nil {
|
|
rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore app: %w", err))
|
|
rm.logf("Failed to restore app: %v", err)
|
|
} else {
|
|
rm.logf("Successfully restored app: %s", result.DeletedAppBackup.App.Key.Name)
|
|
}
|
|
}
|
|
|
|
// Restore instances
|
|
for _, backup := range result.DeletedInstancesBackup {
|
|
if err := rm.restoreInstance(ctx, &backup); err != nil {
|
|
rollbackErrors = append(rollbackErrors, fmt.Errorf("failed to restore instance %s: %w", backup.Instance.Key.Name, err))
|
|
rm.logf("Failed to restore instance %s: %v", backup.Instance.Key.Name, err)
|
|
} else {
|
|
rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(rollbackErrors) > 0 {
|
|
return fmt.Errorf("rollback encountered %d errors: %v", len(rollbackErrors), rollbackErrors)
|
|
}
|
|
|
|
rm.logf("Rollback completed successfully")
|
|
return nil
|
|
}
|
|
|
|
// rollbackCreateAction rolls back a CREATE action by deleting the resource
|
|
func (rm *EdgeConnectResourceManager) rollbackCreateAction(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
|
|
if action.Type != ActionCreate {
|
|
return nil
|
|
}
|
|
|
|
// Determine if this is an app or instance rollback based on the target name
|
|
isInstance := false
|
|
for _, instanceAction := range plan.InstanceActions {
|
|
if instanceAction.InstanceName == action.Target {
|
|
isInstance = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if isInstance {
|
|
return rm.rollbackInstance(ctx, action, plan)
|
|
} else {
|
|
return rm.rollbackApp(ctx, action, plan)
|
|
}
|
|
}
|
|
|
|
// rollbackApp deletes an application that was created
|
|
func (rm *EdgeConnectResourceManager) rollbackApp(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
|
|
appKey := v2.AppKey{
|
|
Organization: plan.AppAction.Desired.Organization,
|
|
Name: plan.AppAction.Desired.Name,
|
|
Version: plan.AppAction.Desired.Version,
|
|
}
|
|
|
|
return rm.client.DeleteApp(ctx, appKey, plan.AppAction.Desired.Region)
|
|
}
|
|
|
|
// rollbackInstance deletes an instance that was created
|
|
func (rm *EdgeConnectResourceManager) rollbackInstance(ctx context.Context, action ActionResult, plan *DeploymentPlan) error {
|
|
// Find the instance action to get the details
|
|
for _, instanceAction := range plan.InstanceActions {
|
|
if instanceAction.InstanceName == action.Target {
|
|
instanceKey := v2.AppInstanceKey{
|
|
Organization: plan.AppAction.Desired.Organization,
|
|
Name: instanceAction.InstanceName,
|
|
CloudletKey: v2.CloudletKey{
|
|
Organization: instanceAction.Target.CloudletOrg,
|
|
Name: instanceAction.Target.CloudletName,
|
|
},
|
|
}
|
|
return rm.client.DeleteAppInstance(ctx, instanceKey, instanceAction.Target.Region)
|
|
}
|
|
}
|
|
return fmt.Errorf("instance action not found for rollback: %s", action.Target)
|
|
}
|
|
|
|
// restoreApp recreates an app that was deleted during deployment
|
|
func (rm *EdgeConnectResourceManager) restoreApp(ctx context.Context, backup *AppBackup) error {
|
|
rm.logf("Restoring app: %s/%s version %s",
|
|
backup.App.Key.Organization, backup.App.Key.Name, backup.App.Key.Version)
|
|
|
|
// Build a clean app input with only creation-safe fields
|
|
// We must exclude read-only fields like CreatedAt, UpdatedAt, etc.
|
|
appInput := &v2.NewAppInput{
|
|
Region: backup.Region,
|
|
App: v2.App{
|
|
Key: backup.App.Key,
|
|
Deployment: backup.App.Deployment,
|
|
ImageType: backup.App.ImageType,
|
|
ImagePath: backup.App.ImagePath,
|
|
AllowServerless: backup.App.AllowServerless,
|
|
DefaultFlavor: backup.App.DefaultFlavor,
|
|
ServerlessConfig: backup.App.ServerlessConfig,
|
|
DeploymentManifest: backup.App.DeploymentManifest,
|
|
DeploymentGenerator: backup.App.DeploymentGenerator,
|
|
RequiredOutboundConnections: backup.App.RequiredOutboundConnections,
|
|
// Explicitly omit read-only fields like CreatedAt, UpdatedAt, Fields, etc.
|
|
},
|
|
}
|
|
|
|
if err := rm.client.CreateApp(ctx, appInput); err != nil {
|
|
return fmt.Errorf("failed to restore app: %w", err)
|
|
}
|
|
|
|
rm.logf("Successfully restored app: %s", backup.App.Key.Name)
|
|
return nil
|
|
}
|
|
|
|
// restoreInstance recreates an instance that was deleted during deployment
|
|
func (rm *EdgeConnectResourceManager) restoreInstance(ctx context.Context, backup *InstanceBackup) error {
|
|
rm.logf("Restoring instance: %s on %s:%s",
|
|
backup.Instance.Key.Name,
|
|
backup.Instance.Key.CloudletKey.Organization,
|
|
backup.Instance.Key.CloudletKey.Name)
|
|
|
|
// Build a clean instance input with only creation-safe fields
|
|
// We must exclude read-only fields like CloudletLoc, CreatedAt, etc.
|
|
instanceInput := &v2.NewAppInstanceInput{
|
|
Region: backup.Region,
|
|
AppInst: v2.AppInstance{
|
|
Key: backup.Instance.Key,
|
|
AppKey: backup.Instance.AppKey,
|
|
Flavor: backup.Instance.Flavor,
|
|
// Explicitly omit read-only fields like CloudletLoc, State, PowerState, CreatedAt, etc.
|
|
},
|
|
}
|
|
|
|
// Retry logic to handle namespace termination race conditions
|
|
maxRetries := 5
|
|
retryDelay := 10 * time.Second
|
|
|
|
var lastErr error
|
|
for attempt := 0; attempt <= maxRetries; attempt++ {
|
|
if attempt > 0 {
|
|
rm.logf("Retrying instance restore %s (attempt %d/%d)", backup.Instance.Key.Name, attempt, maxRetries)
|
|
select {
|
|
case <-time.After(retryDelay):
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
}
|
|
}
|
|
|
|
err := rm.client.CreateAppInstance(ctx, instanceInput)
|
|
if err == nil {
|
|
rm.logf("Successfully restored instance: %s", backup.Instance.Key.Name)
|
|
return nil
|
|
}
|
|
|
|
lastErr = err
|
|
|
|
// Check if error is retryable
|
|
if !rm.isRetryableError(err) {
|
|
rm.logf("Failed to restore instance %s: %v (non-retryable error, giving up)", backup.Instance.Key.Name, err)
|
|
return fmt.Errorf("failed to restore instance: %w", err)
|
|
}
|
|
|
|
if attempt < maxRetries {
|
|
rm.logf("Failed to restore instance %s: %v (will retry)", backup.Instance.Key.Name, err)
|
|
}
|
|
}
|
|
|
|
return fmt.Errorf("failed to restore instance after %d attempts: %w", maxRetries+1, lastErr)
|
|
}
|
|
|
|
// isRetryableError determines if an error should be retried
|
|
func (rm *EdgeConnectResourceManager) isRetryableError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
|
|
errStr := strings.ToLower(err.Error())
|
|
|
|
// Special case: Kubernetes namespace termination race condition
|
|
// This is a transient 400 error that should be retried
|
|
if strings.Contains(errStr, "being terminated") || strings.Contains(errStr, "is being terminated") {
|
|
return true
|
|
}
|
|
|
|
// Check if it's an APIError with a status code
|
|
var apiErr *v2.APIError
|
|
if errors.As(err, &apiErr) {
|
|
// Don't retry client errors (4xx)
|
|
if apiErr.StatusCode >= 400 && apiErr.StatusCode < 500 {
|
|
return false
|
|
}
|
|
// Retry server errors (5xx)
|
|
if apiErr.StatusCode >= 500 {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Retry all other errors (network issues, timeouts, etc.)
|
|
return true
|
|
}
|
|
|
|
// logf logs a message if a logger is configured
|
|
func (rm *EdgeConnectResourceManager) logf(format string, v ...interface{}) {
|
|
if rm.logger != nil {
|
|
rm.logger.Printf("[ResourceManager] "+format, v...)
|
|
}
|
|
}
|