garm/runner/providers/lxd/lxd.go
2023-06-11 17:32:30 +02:00

521 lines
15 KiB
Go

// Copyright 2022 Cloudbase Solutions SRL
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
package lxd
import (
"context"
"fmt"
"log"
"sync"
"time"
"github.com/cloudbase/garm/config"
runnerErrors "github.com/cloudbase/garm/errors"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
"github.com/google/go-github/v48/github"
lxd "github.com/lxc/lxd/client"
"github.com/lxc/lxd/shared/api"
"github.com/pkg/errors"
)
// Compile-time assertion that *LXD satisfies the common.Provider interface.
var _ common.Provider = (*LXD)(nil)
// Keys written into the config of every instance created by this provider.
const (
// controllerIDKeyName holds the ID of the controller that created the instance.
// We look for this key in the config of the instances to determine if they are
// created by us or not.
controllerIDKeyName = "user.runner-controller-id"
// poolIDKey holds the ID of the pool the instance was created for. It is set
// from the bootstrap params and used by ListInstances to filter by pool.
poolIDKey = "user.runner-pool-id"
// osTypeKeyName is the key we use in the instance config to indicate the OS
// platform a runner is supposed to have. This value is defined in the pool and
// passed into the provider as bootstrap params.
osTypeKeyName = "user.os-type"
// osArchKeyNAme is the key we use in the instance config to indicate the OS
// architecture a runner is supposed to have. This value is defined in the pool and
// passed into the provider as bootstrap params.
osArchKeyNAme = "user.os-arch"
)
var (
	// lxdToGithubArchMap translates LXD architectures to Github tools architectures.
	// Github's own architecture names are present as keys too and map to themselves.
	// TODO: move this into a separate package. This will most likely be used
	// by any other provider.
	lxdToGithubArchMap = map[string]string{
		"x86_64":  "x64",
		"amd64":   "x64",
		"x64":     "x64",
		"armv7l":  "arm",
		"arm":     "arm",
		"aarch64": "arm64",
		"arm64":   "arm64",
	}

	// configToLXDArchMap translates the architectures used in the garm
	// configuration to the names LXD uses.
	configToLXDArchMap = map[params.OSArch]string{
		params.Amd64: "x86_64",
		params.Arm64: "aarch64",
		params.Arm:   "armv7l",
	}

	// lxdToConfigArch is the reverse of configToLXDArchMap: it translates LXD
	// architecture names back to the garm configuration values.
	lxdToConfigArch = map[string]params.OSArch{
		"x86_64":  params.Amd64,
		"aarch64": params.Arm64,
		"armv7l":  params.Arm,
	}
)
// Defaults describing the LXD project garm works in.
// NOTE(review): neither constant is referenced in this part of the file;
// presumably they are consumed by the project name/creation helpers — confirm.
const (
// DefaultProjectDescription is the description text for the garm project.
DefaultProjectDescription = "This project was created automatically by garm to be used for github ephemeral action runners."
// DefaultProjectName is the name of the garm project.
DefaultProjectName = "garm-project"
)
// NewProvider validates cfg and returns an LXD backed common.Provider that
// acts on behalf of the controller identified by controllerID.
//
// An error is returned when the config fails validation or when its provider
// type is not params.LXDProvider. No LXD client is created here; that happens
// lazily in getCLI.
func NewProvider(ctx context.Context, cfg *config.Provider, controllerID string) (common.Provider, error) {
	err := cfg.Validate()
	if err != nil {
		return nil, errors.Wrap(err, "validating provider config")
	}

	switch cfg.ProviderType {
	case params.LXDProvider:
		// The only provider type this package can serve.
	default:
		return nil, fmt.Errorf("invalid provider type %s, expected %s", cfg.ProviderType, params.LXDProvider)
	}

	images := &image{remotes: cfg.LXD.ImageRemotes}
	return &LXD{
		ctx:          ctx,
		cfg:          cfg,
		controllerID: controllerID,
		imageManager: images,
	}, nil
}
// LXD is the LXD backed implementation of common.Provider. Instances are
// built by NewProvider.
type LXD struct {
// cfg is the provider config for this provider.
cfg *config.Provider
// ctx is the context handed to NewProvider; getCLI passes it on when it
// creates the LXD client.
ctx context.Context
// cli is the LXD client. It is created on first use by getCLI and reused
// afterwards.
cli lxd.InstanceServer
// imageManager downloads images from remotes
imageManager *image
// controllerID is the ID of this controller
controllerID string
// mux is held by getCLI while it checks for, and initializes, cli.
mux sync.Mutex
}
// getCLI returns the LXD client, creating it on the first call. The client is
// only cached once the configured project has been fetched successfully, and
// the cached client is scoped to that project. The whole operation runs under
// l.mux.
func (l *LXD) getCLI() (lxd.InstanceServer, error) {
	l.mux.Lock()
	defer l.mux.Unlock()

	if l.cli == nil {
		client, err := getClientFromConfig(l.ctx, &l.cfg.LXD)
		if err != nil {
			return nil, errors.Wrap(err, "creating LXD client")
		}

		project := projectName(l.cfg.LXD)
		if _, _, err = client.GetProject(project); err != nil {
			return nil, errors.Wrapf(err, "fetching project name: %s", project)
		}

		l.cli = client.UseProject(project)
	}
	return l.cli, nil
}
// getProfiles returns the list of LXD profiles to attach to a new instance:
// "default" (only when IncludeDefaultProfile is set in the config) followed by
// the profile named after flavor. If LXD does not report a profile called
// flavor, an error wrapping runnerErrors.ErrNotFound is returned.
func (l *LXD) getProfiles(flavor string) ([]string, error) {
	cli, err := l.getCLI()
	if err != nil {
		return nil, errors.Wrap(err, "fetching client")
	}

	available, err := cli.GetProfileNames()
	if err != nil {
		return nil, errors.Wrap(err, "fetching profile names")
	}

	found := false
	for _, name := range available {
		if name == flavor {
			found = true
			break
		}
	}
	if !found {
		return nil, errors.Wrapf(runnerErrors.ErrNotFound, "looking for profile %s", flavor)
	}

	selected := []string{}
	if l.cfg.LXD.IncludeDefaultProfile {
		selected = append(selected, "default")
	}
	return append(selected, flavor), nil
}
// getTools picks, from tools, the first runner download that matches osType
// and architecture. The architecture matches either verbatim or after being
// translated through lxdToGithubArchMap. Entries that are nil, or that lack an
// OS or Architecture, are skipped.
//
// An error is returned for any OS type other than params.Linux, and when no
// entry matches.
func (l *LXD) getTools(tools []*github.RunnerApplicationDownload, osType params.OSType, architecture string) (github.RunnerApplicationDownload, error) {
	// Validate image OS. Linux only for now.
	if osType != params.Linux {
		return github.RunnerApplicationDownload{}, fmt.Errorf("this provider does not support OS type: %s", osType)
	}

	wantOS := string(osType)
	// The translation does not depend on the candidate, so look it up once.
	githubArch, translated := lxdToGithubArchMap[architecture]

	// Find tools for OS/Arch.
	for _, candidate := range tools {
		if candidate == nil || candidate.OS == nil || candidate.Architecture == nil {
			continue
		}
		if *candidate.OS != wantOS {
			continue
		}
		toolArch := *candidate.Architecture
		if toolArch == architecture || (translated && toolArch == githubArch) {
			return *candidate, nil
		}
	}
	return github.RunnerApplicationDownload{}, fmt.Errorf("failed to find tools for OS %s and arch %s", osType, architecture)
}
// secureBootEnabled renders the SecureBoot config option as "true" or "false".
// Sadly, the security.secureboot flag is a string encoded boolean.
func (l *LXD) secureBootEnabled() string {
	enabled := "false"
	if l.cfg.LXD.SecureBoot {
		enabled = "true"
	}
	return enabled
}
// getCreateInstanceArgs builds the LXD instance creation request for the
// runner described by bootstrapParams.
//
// It resolves the profiles, architecture, image source and runner tools,
// renders the cloud-config user data (honoring the update/package options in
// specs) and tags the instance config with the OS type, OS arch, controller ID
// and pool ID. For virtual machines the security.secureboot option is set as
// well.
//
// A bad request error is returned when bootstrapParams has no name; every
// other failure is returned wrapped with the step that failed.
func (l *LXD) getCreateInstanceArgs(bootstrapParams params.BootstrapInstance, specs extraSpecs) (api.InstancesPost, error) {
	if bootstrapParams.Name == "" {
		return api.InstancesPost{}, runnerErrors.NewBadRequestError("missing name")
	}
	profiles, err := l.getProfiles(bootstrapParams.Flavor)
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "fetching profiles")
	}
	arch, err := resolveArchitecture(bootstrapParams.OSArch)
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "fetching archictecture")
	}

	// Always obtain the client through getCLI: it is the only place that
	// initializes l.cli and it does so under l.mux. Reading l.cli directly
	// would be an unsynchronized access that also depends on some earlier call
	// having created the client.
	cli, err := l.getCLI()
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "fetching client")
	}

	instanceType := l.cfg.LXD.GetInstanceType()
	instanceSource, err := l.imageManager.getInstanceSource(bootstrapParams.Image, instanceType, arch, cli)
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "getting instance source")
	}
	tools, err := l.getTools(bootstrapParams.Tools, bootstrapParams.OSType, arch)
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "getting tools")
	}

	bootstrapParams.UserDataOptions.DisableUpdatesOnBoot = specs.DisableUpdates
	bootstrapParams.UserDataOptions.ExtraPackages = specs.ExtraPackages
	cloudCfg, err := util.GetCloudConfig(bootstrapParams, tools, bootstrapParams.Name)
	if err != nil {
		return api.InstancesPost{}, errors.Wrap(err, "generating cloud-config")
	}

	configMap := map[string]string{
		"user.user-data":    cloudCfg,
		osTypeKeyName:       string(bootstrapParams.OSType),
		osArchKeyNAme:       string(bootstrapParams.OSArch),
		controllerIDKeyName: l.controllerID,
		poolIDKey:           bootstrapParams.PoolID,
	}
	if instanceType == config.LXDImageVirtualMachine {
		configMap["security.secureboot"] = l.secureBootEnabled()
	}

	args := api.InstancesPost{
		InstancePut: api.InstancePut{
			Architecture: arch,
			Profiles:     profiles,
			Description:  "Github runner provisioned by garm",
			Config:       configMap,
		},
		Source: instanceSource,
		Name:   bootstrapParams.Name,
		Type:   api.InstanceType(instanceType),
	}
	return args, nil
}
// AsParams returns the name, type and description of this provider, taken
// from its config, as a params.Provider.
func (l *LXD) AsParams() params.Provider {
	var out params.Provider
	out.Name = l.cfg.Name
	out.ProviderType = l.cfg.ProviderType
	out.Description = l.cfg.Description
	return out
}
// launchInstance creates the instance described by createArgs and then starts
// it. Both steps are background operations in LXD; each one is waited on
// before moving on, and any failure is returned wrapped with the step that
// failed.
func (l *LXD) launchInstance(createArgs api.InstancesPost) error {
	cli, err := l.getCLI()
	if err != nil {
		return errors.Wrap(err, "fetching client")
	}

	// Step 1: create the instance and wait for the operation to finish.
	createOp, err := cli.CreateInstance(createArgs)
	if err != nil {
		return errors.Wrap(err, "creating instance")
	}
	if err := createOp.Wait(); err != nil {
		return errors.Wrap(err, "waiting for instance creation")
	}

	// Step 2: start the instance and wait for the operation to finish.
	startReq := api.InstanceStatePut{Action: "start", Timeout: -1}
	startOp, err := cli.UpdateInstanceState(createArgs.Name, startReq, "")
	if err != nil {
		return errors.Wrap(err, "starting instance")
	}
	if err := startOp.Wait(); err != nil {
		return errors.Wrap(err, "waiting for instance to start")
	}
	return nil
}
// CreateInstance creates a new compute instance in the provider. The extra
// specs are parsed from bootstrapParams, the creation request is assembled,
// the instance is launched and, finally, the call waits for the instance to
// have an IP before returning its details.
func (l *LXD) CreateInstance(ctx context.Context, bootstrapParams params.BootstrapInstance) (params.Instance, error) {
	var none params.Instance

	specs, err := parseExtraSpecsFromBootstrapParams(bootstrapParams)
	if err != nil {
		return none, errors.Wrap(err, "parsing extra specs")
	}

	createArgs, err := l.getCreateInstanceArgs(bootstrapParams, specs)
	if err != nil {
		return none, errors.Wrap(err, "fetching create args")
	}

	if err = l.launchInstance(createArgs); err != nil {
		return none, errors.Wrap(err, "creating instance")
	}

	created, err := l.waitInstanceHasIP(ctx, createArgs.Name)
	if err != nil {
		return none, errors.Wrap(err, "fetching instance")
	}
	return created, nil
}
// GetInstance will return details about one instance. When LXD reports that
// the instance does not exist, the returned error wraps
// runnerErrors.ErrNotFound.
func (l *LXD) GetInstance(ctx context.Context, instanceName string) (params.Instance, error) {
	cli, err := l.getCLI()
	if err != nil {
		return params.Instance{}, errors.Wrap(err, "fetching client")
	}

	found, _, err := cli.GetInstanceFull(instanceName)
	switch {
	case err == nil:
		return lxdInstanceToAPIInstance(found), nil
	case isNotFoundError(err):
		return params.Instance{}, errors.Wrapf(runnerErrors.ErrNotFound, "fetching instance: %q", err)
	default:
		return params.Instance{}, errors.Wrap(err, "fetching instance")
	}
}
// DeleteInstance will delete the instance in a provider.
//
// The instance is force-stopped first and then removed. An instance that is
// already stopped is not an error, and an instance that LXD reports as not
// found at any step is treated as already deleted (nil is returned). Issuing
// the delete request and waiting for it to complete are each limited to 60
// seconds; ctx is currently not consulted.
func (l *LXD) DeleteInstance(ctx context.Context, instance string) error {
	cli, err := l.getCLI()
	if err != nil {
		return errors.Wrap(err, "fetching client")
	}

	if err := l.setState(instance, "stop", true); err != nil {
		if isNotFoundError(err) {
			log.Printf("received not found error when stopping instance %s", instance)
			return nil
		}
		// I am not proud of this, but the drivers.ErrInstanceIsStopped from LXD pulls in
		// a ton of CGO, linux specific dependencies, that don't make sense having
		// in garm.
		if errors.Cause(err).Error() != errInstanceIsStopped.Error() {
			return errors.Wrap(err, "stopping instance")
		}
	}

	type deleteResult struct {
		op  lxd.Operation
		err error
	}
	// The channel has room for one value so the goroutine below can always
	// deliver its result and exit, even if we stopped listening because the
	// timeout fired first. With an unbuffered channel it would block on the
	// send forever and leak.
	results := make(chan deleteResult, 1)
	go func() {
		op, err := cli.DeleteInstance(instance)
		results <- deleteResult{op: op, err: err}
	}()

	var op lxd.Operation
	select {
	case res := <-results:
		if res.err != nil {
			if isNotFoundError(res.err) {
				log.Printf("received not found error when deleting instance %s", instance)
				return nil
			}
			return errors.Wrap(res.err, "removing instance")
		}
		op = res.op
	case <-time.After(time.Second * 60):
		return errors.Wrapf(runnerErrors.ErrTimeout, "removing instance %s", instance)
	}

	opTimeout, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	if err := op.WaitContext(opTimeout); err != nil {
		if isNotFoundError(err) {
			log.Printf("received not found error when waiting for instance deletion %s", instance)
			return nil
		}
		return errors.Wrap(err, "waiting for instance deletion")
	}
	return nil
}
// listResponse carries the outcome of the instance listing that ListInstances
// runs in a separate goroutine.
type listResponse struct {
// instances is the list returned by LXD.
instances []api.InstanceFull
// err is the error returned by LXD, if any.
err error
}
// ListInstances will list all instances for a provider.
//
// Only instances whose expanded config carries this controller's ID are
// returned. When poolID is not empty, the result is narrowed down further to
// the instances tagged with that pool ID. Fetching the list from LXD is
// limited to 60 seconds, after which an error wrapping
// runnerErrors.ErrTimeout is returned; ctx is currently not consulted.
func (l *LXD) ListInstances(ctx context.Context, poolID string) ([]params.Instance, error) {
	cli, err := l.getCLI()
	if err != nil {
		return []params.Instance{}, errors.Wrap(err, "fetching client")
	}

	result := make(chan listResponse, 1)
	go func() {
		// TODO(gabriel-samfira): if this blocks indefinitely, we will leak a goroutine.
		// Convert the internal provider to an external one. Running the provider as an
		// external process will allow us to not care if a goroutine leaks. Once a timeout
		// is reached, the provider can just exit with an error. Something we can't do with
		// internal providers.
		instances, err := cli.GetInstancesFull(api.InstanceTypeAny)
		result <- listResponse{
			instances: instances,
			err:       err,
		}
	}()

	var instances []api.InstanceFull
	select {
	case res := <-result:
		if res.err != nil {
			return []params.Instance{}, errors.Wrap(res.err, "fetching instances")
		}
		instances = res.instances
	case <-time.After(time.Second * 60):
		return []params.Instance{}, errors.Wrap(runnerErrors.ErrTimeout, "fetching instances from provider")
	}

	ret := []params.Instance{}
	// Iterate by index: the converter is handed a pointer, and pointing it at
	// the slice element (rather than at the range variable) avoids both the
	// per-iteration copy and any aliasing of a shared loop variable.
	for i := range instances {
		instanceCfg := instances[i].ExpandedConfig
		controllerID, ok := instanceCfg[controllerIDKeyName]
		if !ok || controllerID != l.controllerID {
			// Not created by this controller.
			continue
		}
		if poolID != "" && instanceCfg[poolIDKey] != poolID {
			// Pool ID was specified. Filter out instances belonging to other pools.
			continue
		}
		ret = append(ret, lxdInstanceToAPIInstance(&instances[i]))
	}
	return ret, nil
}
// RemoveAllInstances will remove all instances created by this provider. The
// instances are deleted one at a time and the first failure aborts the run.
func (l *LXD) RemoveAllInstances(ctx context.Context) error {
	owned, err := l.ListInstances(ctx, "")
	if err != nil {
		return errors.Wrap(err, "fetching instance list")
	}

	// TODO: remove in parallel
	for idx := range owned {
		name := owned[idx].Name
		if err := l.DeleteInstance(ctx, name); err != nil {
			return errors.Wrapf(err, "removing instance %s", name)
		}
	}
	return nil
}
// setState asks LXD to apply the action named by state (for example "start"
// or "stop") to instance, optionally forcing it, and waits up to 60 seconds
// for the resulting operation to complete.
func (l *LXD) setState(instance, state string, force bool) error {
	cli, err := l.getCLI()
	if err != nil {
		return errors.Wrap(err, "fetching client")
	}

	op, err := cli.UpdateInstanceState(instance, api.InstanceStatePut{
		Action:  state,
		Timeout: -1,
		Force:   force,
	}, "")
	if err != nil {
		return errors.Wrapf(err, "setting state to %s", state)
	}

	waitCtx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	if err := op.WaitContext(waitCtx); err != nil {
		return errors.Wrapf(err, "waiting for instance to transition to state %s", state)
	}
	return nil
}
// Stop shuts down the instance. When force is true the stop is forced.
// The work is delegated to setState with the "stop" action; ctx is currently
// not used, setState applies its own 60 second timeout while waiting.
func (l *LXD) Stop(ctx context.Context, instance string, force bool) error {
return l.setState(instance, "stop", force)
}
// Start boots up an instance.
// The work is delegated to setState with the "start" action and no force;
// ctx is currently not used, setState applies its own 60 second timeout while
// waiting.
func (l *LXD) Start(ctx context.Context, instance string) error {
return l.setState(instance, "start", false)
}