diff --git a/cmd/mapt/cmd/aws/hosts/rhelai.go b/cmd/mapt/cmd/aws/hosts/rhelai.go index 43e848b4d..872bc64f1 100644 --- a/cmd/mapt/cmd/aws/hosts/rhelai.go +++ b/cmd/mapt/cmd/aws/hosts/rhelai.go @@ -1,6 +1,8 @@ package hosts import ( + "fmt" + awsParams "github.com/redhat-developer/mapt/cmd/mapt/cmd/aws/params" "github.com/redhat-developer/mapt/cmd/mapt/cmd/params" maptContext "github.com/redhat-developer/mapt/pkg/manager/context" @@ -14,6 +16,9 @@ import ( const ( cmdRHELAI = "rhel-ai" cmdRHELAIDesc = "manage rhel ai host" + + cmdRHELAIListVersions = "list-versions" + cmdRHELAIListVersionsDesc = "list available RHEL AI versions" ) func GetRHELAICmd() *cobra.Command { @@ -32,7 +37,7 @@ func GetRHELAICmd() *cobra.Command { params.AddCommonFlags(flagSet) c.PersistentFlags().AddFlagSet(flagSet) - c.AddCommand(getRHELAICreate(), getRHELAIDestroy()) + c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions()) return c } @@ -107,3 +112,27 @@ func getRHELAIDestroy() *cobra.Command { c.PersistentFlags().AddFlagSet(flagSet) return c } + +func getRHELAIListVersions() *cobra.Command { + c := &cobra.Command{ + Use: cmdRHELAIListVersions, + Short: cmdRHELAIListVersionsDesc, + RunE: func(cmd *cobra.Command, args []string) error { + if err := viper.BindPFlags(cmd.Flags()); err != nil { + return err + } + versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator)) + if err != nil { + return err + } + for _, v := range versions { + fmt.Println(v) + } + return nil + }, + } + flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError) + flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc) + c.PersistentFlags().AddFlagSet(flagSet) + return c +} diff --git a/cmd/mapt/cmd/azure/hosts/rhelai.go b/cmd/mapt/cmd/azure/hosts/rhelai.go index 7385073ea..29c78b53d 100644 --- a/cmd/mapt/cmd/azure/hosts/rhelai.go +++ b/cmd/mapt/cmd/azure/hosts/rhelai.go @@ -1,6 +1,8 @@ package 
hosts import ( + "fmt" + "github.com/redhat-developer/mapt/cmd/mapt/cmd/params" maptContext "github.com/redhat-developer/mapt/pkg/manager/context" rhelai "github.com/redhat-developer/mapt/pkg/provider/azure/action/rhel-ai" @@ -13,6 +15,9 @@ import ( const ( cmdRHELAI = "rhel-ai" cmdRHELAIDesc = "manage rhel ai host" + + cmdRHELAIListVersions = "list-versions" + cmdRHELAIListVersionsDesc = "list available RHEL AI versions in the Azure Compute Gallery" ) func GetRHELAICmd() *cobra.Command { @@ -31,7 +36,7 @@ func GetRHELAICmd() *cobra.Command { params.AddCommonFlags(flagSet) c.PersistentFlags().AddFlagSet(flagSet) - c.AddCommand(getRHELAICreate(), getRHELAIDestroy()) + c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions()) return c } @@ -104,3 +109,27 @@ func getRHELAIDestroy() *cobra.Command { c.PersistentFlags().AddFlagSet(flagSet) return c } + +func getRHELAIListVersions() *cobra.Command { + c := &cobra.Command{ + Use: cmdRHELAIListVersions, + Short: cmdRHELAIListVersionsDesc, + RunE: func(cmd *cobra.Command, args []string) error { + if err := viper.BindPFlags(cmd.Flags()); err != nil { + return err + } + versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator)) + if err != nil { + return err + } + for _, v := range versions { + fmt.Println(v) + } + return nil + }, + } + flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError) + flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc) + c.PersistentFlags().AddFlagSet(flagSet) + return c +} diff --git a/docs/aws/rhelai.md b/docs/aws/rhelai.md new file mode 100644 index 000000000..26cd60301 --- /dev/null +++ b/docs/aws/rhelai.md @@ -0,0 +1,110 @@ +# Overview + +mapt offers operations to manage RHEL AI environments on AWS. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images, suitable for AI/ML workloads. 
+ +## Operations + +### List Versions + +List available RHEL AI versions for a given accelerator type: + +```bash +mapt aws rhel-ai list-versions -h +list-versions + +Usage: + mapt aws rhel-ai list-versions [flags] + +Flags: + --accelerator string accelerator type. Valid types: cuda and rocm (default "cuda") + -h, --help help for list-versions + +Global Flags: + --backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket + --project-name string project name to identify the instance of the stack +``` + +#### Container + +```bash +podman run -it --rm \ + -e AWS_ACCESS_KEY_ID=XXX \ + -e AWS_SECRET_ACCESS_KEY=XXX \ + -e AWS_DEFAULT_REGION=us-east-1 \ + quay.io/redhat-developer/mapt:0.7.0-dev aws \ + rhel-ai list-versions \ + --accelerator cuda +``` + +### Create + +This will create a RHEL AI instance according to params specified: + +```bash +mapt aws rhel-ai create -h +create + +Usage: + mapt aws rhel-ai create [flags] + +Flags: + --accelerator string accelerator type. Valid types: cuda and rocm (default "cuda") + --conn-details-output string path to export host connection information (host, username and privateKey) + --cpus int32 Number of CPUs for the cloud instance (default 8) + --custom-image string custom AMI name (overrides version and accelerator) + --disk-size int Disk size in GB (default 2000) + --gpus int32 Number of GPUs + --memory int32 Amount of RAM for the cloud instance in GiB (default 64) + --spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction) + --spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest") + --tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default []) + --timeout string set a timeout for the instance (e.g. 
4h) + --version string version for the RHELAI OS (default "3.0.0") + -h, --help help for create + +Global Flags: + --backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket + --project-name string project name to identify the instance of the stack +``` + +#### Outputs + +It will create a RHEL AI instance and will give as result several files located at path defined by `--conn-details-output`: + +* **host**: host for the instance (load balancer DNS if spot) +* **username**: username to connect to the machine +* **id_rsa**: private key to connect to the machine + +Also, it will create a state folder holding the state for the created resources at AWS, the path for this folder is defined within `--backed-url`, the content from that folder is required with the same project name (`--project-name`) in order to destroy the resources. + +#### Container + +When running the container image, the authentication information must be passed as environment variables. The following is a sample snippet showing how to create an instance with default values: + +```bash +podman run -d --name mapt-rhelai \ + -v ${PWD}:/workspace:z \ + -e AWS_ACCESS_KEY_ID=XXX \ + -e AWS_SECRET_ACCESS_KEY=XXX \ + -e AWS_DEFAULT_REGION=us-east-1 \ + quay.io/redhat-developer/mapt:0.7.0-dev aws \ + rhel-ai create \ + --project-name mapt-rhelai \ + --backed-url file:///workspace \ + --conn-details-output /workspace \ + --spot +``` + +### Destroy + +```bash +podman run -d --rm \ + -v ${PWD}:/workspace:z \ + -e AWS_ACCESS_KEY_ID=XXX \ + -e AWS_SECRET_ACCESS_KEY=XXX \ + -e AWS_DEFAULT_REGION=us-east-1 \ + quay.io/redhat-developer/mapt:0.7.0-dev aws \ + rhel-ai destroy \ + --project-name mapt-rhelai \ + --backed-url file:///workspace +``` diff --git a/docs/azure/rhelai.md b/docs/azure/rhelai.md new file mode 100644 index 000000000..2dc6d5b2e --- /dev/null +++ b/docs/azure/rhelai.md @@ -0,0 +1,113 @@ +# Overview + +mapt offers operations to manage RHEL AI 
environments on Azure. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images from the Azure Compute Gallery, suitable for AI/ML workloads. + +## Operations + +### List Versions + +List available RHEL AI versions for a given accelerator type: + +```bash +mapt azure rhel-ai list-versions -h +list-versions + +Usage: + mapt azure rhel-ai list-versions [flags] + +Flags: + --accelerator string accelerator type. Valid types: cuda and rocm (default "cuda") + -h, --help help for list-versions + +Global Flags: + --backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket + --project-name string project name to identify the instance of the stack +``` + +#### Container + +```bash +podman run -it --rm \ + -e ARM_TENANT_ID=${ati_value} \ + -e ARM_SUBSCRIPTION_ID=${asi_value} \ + -e ARM_CLIENT_ID=${aci_value} \ + -e ARM_CLIENT_SECRET=${acs_value} \ + quay.io/redhat-developer/mapt:0.7.0-dev azure \ + rhel-ai list-versions \ + --accelerator cuda +``` + +### Create + +This will create a RHEL AI instance according to params specified: + +```bash +mapt azure rhel-ai create -h +create + +Usage: + mapt azure rhel-ai create [flags] + +Flags: + --accelerator string accelerator type. Valid types: cuda and rocm (default "cuda") + --conn-details-output string path to export host connection information (host, username and privateKey) + --cpus int32 Number of CPUs for the cloud instance (default 8) + --custom-image string custom image name (overrides version and accelerator) + --disk-size int Disk size in GB + --gpus int32 Number of GPUs + --memory int32 Amount of RAM for the cloud instance in GiB (default 64) + --spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction) + --spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. 
Allowed values are: lowest, low, medium, high or highest (default "lowest") + --tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default []) + --timeout string set a timeout for the instance (e.g. 4h) + --version string version for the RHELAI OS (default "3.0.0") + -h, --help help for create + +Global Flags: + --backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket + --project-name string project name to identify the instance of the stack +``` + +#### Outputs + +It will create a RHEL AI instance and will give as result several files located at path defined by `--conn-details-output`: + +* **host**: host for the instance (load balancer DNS if spot) +* **username**: username to connect to the machine +* **id_rsa**: private key to connect to the machine + +Also, it will create a state folder holding the state for the created resources at Azure, the path for this folder is defined within `--backed-url`, the content from that folder is required with the same project name (`--project-name`) in order to destroy the resources. 
+ +#### Container + +When running the container image, the authentication information must be passed as environment variables. The following is a sample snippet showing how to create an instance with default values: + +```bash +podman run -d --name mapt-rhelai \ + -v ${PWD}:/workspace:z \ + -e ARM_TENANT_ID=${ati_value} \ + -e ARM_SUBSCRIPTION_ID=${asi_value} \ + -e ARM_CLIENT_ID=${aci_value} \ + -e ARM_CLIENT_SECRET=${acs_value} \ + quay.io/redhat-developer/mapt:0.7.0-dev azure \ + rhel-ai create \ + --project-name mapt-rhelai \ + --backed-url file:///workspace \ + --conn-details-output /workspace \ + --spot +``` + +### Destroy + +```bash +podman run -d --rm \ + -v ${PWD}:/workspace:z \ + -e ARM_TENANT_ID=${ati_value} \ + -e ARM_SUBSCRIPTION_ID=${asi_value} \ + -e ARM_CLIENT_ID=${aci_value} \ + -e ARM_CLIENT_SECRET=${acs_value} \ + quay.io/redhat-developer/mapt:0.7.0-dev azure \ + rhel-ai destroy \ + --project-name mapt-rhelai \ + --backed-url file:///workspace +``` diff --git a/pkg/provider/aws/action/rhel-ai/rhelai.go b/pkg/provider/aws/action/rhel-ai/rhelai.go index e534e5b6e..b7fae2fb8 100644 --- a/pkg/provider/aws/action/rhel-ai/rhelai.go +++ b/pkg/provider/aws/action/rhel-ai/rhelai.go @@ -3,6 +3,8 @@ package rhelai import ( "context" "fmt" + "sort" + "strings" "github.com/go-playground/validator/v10" "github.com/pulumi/pulumi-aws/sdk/v7/go/aws/ec2" @@ -119,6 +121,57 @@ func Destroy(mCtxArgs *mc.ContextArgs) error { return aws.CleanupState(mCtx) } +const listVersionsRegion = "us-east-1" + +// ListVersions returns available RHEL AI version strings for the given accelerator, +// sorted in ascending order. Versions are derived from AMI names in a reference +// region (us-east-1) matching the pattern "rhel-ai-{accelerator}-aws-{version}*". 
+func ListVersions(ctx context.Context, accelerator string) ([]string, error) { + acc := strings.ToLower(strings.TrimSpace(accelerator)) + switch acc { + case "cuda", "rocm": + default: + return nil, fmt.Errorf("unsupported accelerator %q (expected: cuda or rocm)", accelerator) + } + nameFilter := fmt.Sprintf("rhel-ai-%s-aws-*", acc) + region := listVersionsRegion + images, err := data.ListAMIs(ctx, data.ImageRequest{ + Name: &nameFilter, + Arch: &amiArch, + Owner: &amiOwner, + Region: &region, + }) + if err != nil { + return nil, fmt.Errorf("listing RHEL AI AMIs for accelerator %q: %w", acc, err) + } + prefix := fmt.Sprintf("rhel-ai-%s-aws-", acc) + seen := make(map[string]struct{}) + for _, img := range images { + if img.Name == nil { + continue + } + raw := strings.TrimPrefix(*img.Name, prefix) + // AMI names may have trailing qualifiers (e.g. "-x86_64-..."); take only the version part + if idx := strings.Index(raw, "-x86_64"); idx > 0 { + raw = raw[:idx] + } + if idx := strings.Index(raw, "-arm64"); idx > 0 { + raw = raw[:idx] + } + if len(raw) > 0 { + // Normalize underscores to dashes (e.g. "3.4.0_ea.2" → "3.4.0-ea.2") + version := strings.ReplaceAll(raw, "_", "-") + seen[version] = struct{}{} + } + } + versions := make([]string, 0, len(seen)) + for v := range seen { + versions = append(versions, v) + } + sort.Strings(versions) + return versions, nil +} + func (r *rhelAIRequest) createMachine() error { cs := manager.Stack{ StackName: r.mCtx.StackNameByProject(stackName), diff --git a/pkg/provider/aws/data/ami.go b/pkg/provider/aws/data/ami.go index e9f058c9b..2af576c06 100644 --- a/pkg/provider/aws/data/ami.go +++ b/pkg/provider/aws/data/ami.go @@ -103,6 +103,55 @@ func GetAMI(ctx context.Context, r ImageRequest) (*ImageInfo, error) { nil } +// ListAMIs returns all AMIs matching the request filters. +// Unlike GetAMI which returns only the newest, this returns the full set. 
+func ListAMIs(ctx context.Context, r ImageRequest) ([]ec2Types.Image, error) { + var cfgOpts config.LoadOptionsFunc + if r.Region != nil && len(*r.Region) > 0 { + cfgOpts = config.WithRegion(*r.Region) + } + cfg, err := config.LoadDefaultConfig(ctx, cfgOpts) + if err != nil { + return nil, err + } + client := ec2.NewFromConfig(cfg) + var filterName = "name" + filters := []ec2Types.Filter{ + { + Name: &filterName, + Values: []string{*r.Name}, + }, + } + if r.Arch != nil && len(*r.Arch) > 0 { + filter := "architecture" + filters = append(filters, ec2Types.Filter{ + Name: &filter, + Values: []string{*r.Arch}, + }) + } + input := &ec2.DescribeImagesInput{ + Filters: filters, + } + if r.Owner != nil && len(*r.Owner) > 0 { + input.Owners = []string{*r.Owner} + aId, err := accountId(ctx) + if err != nil { + return nil, err + } + if *aId != *r.Owner { + input.ExecutableUsers = []string{"self"} + } + } + result, err := client.DescribeImages(ctx, input) + if err != nil { + return nil, err + } + if result == nil { + return nil, nil + } + return result.Images, nil +} + // IsAMIOffered checks if an ami based on its Name is offered on a specific region func IsAMIOffered(ctx context.Context, r ImageRequest) (bool, *ImageInfo, error) { ami, err := GetAMI(ctx, r) diff --git a/pkg/provider/azure/action/rhel-ai/rhelai.go b/pkg/provider/azure/action/rhel-ai/rhelai.go index 846f649bd..7924df7a0 100644 --- a/pkg/provider/azure/action/rhel-ai/rhelai.go +++ b/pkg/provider/azure/action/rhel-ai/rhelai.go @@ -3,6 +3,7 @@ package rhelai import ( "context" "fmt" + "sort" "strings" maptContext "github.com/redhat-developer/mapt/pkg/manager/context" @@ -15,10 +16,11 @@ import ( const ( imageOwnerSubscriptionId = "02db6bd4-035c-4074-b699-468f3d914744" + imageOwnerResourceGroup = "aipcc-productization" // $1 accelerator $2 version imageNameRegex = "rhel-ai-%s-azure-%s" - // $1 subscriptionId $2 rgName - imageIdRegex = 
"/subscriptions/%s/resourceGroups/aipcc-productization/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0" + // $1 subscriptionId $2 galleryName $3 imageName + imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0" username = "azureuser" ) @@ -104,3 +106,28 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err func Destroy(mCtxArgs *maptContext.ContextArgs) error { return azureLinux.Destroy(mCtxArgs) } + +// ListVersions returns available RHEL AI version strings for the given accelerator, +// sorted in ascending order. Versions are derived from Azure Compute Gallery names +// in the image owner's subscription (e.g. gallery "rhel_ai_cuda_azure_3.4.0_ea.2" +// yields version "3.4.0-ea.2"). +func ListVersions(ctx context.Context, accelerator string) ([]string, error) { + acc := strings.ToLower(strings.TrimSpace(accelerator)) + switch acc { + case "cuda", "rocm": + default: + return nil, fmt.Errorf("unsupported accelerator %q (expected: cuda or rocm)", accelerator) + } + prefix := fmt.Sprintf("rhel_ai_%s_azure_", strings.ReplaceAll(acc, "-", "_")) + galleries, err := data.ListGalleriesByPrefix(ctx, imageOwnerSubscriptionId, imageOwnerResourceGroup, prefix) + if err != nil { + return nil, fmt.Errorf("listing RHEL AI versions for accelerator %q: %w", accelerator, err) + } + versions := make([]string, 0, len(galleries)) + for _, g := range galleries { + raw := strings.TrimPrefix(g, prefix) + versions = append(versions, strings.ReplaceAll(raw, "_", "-")) + } + sort.Strings(versions) + return versions, nil +} diff --git a/pkg/provider/azure/data/images.go b/pkg/provider/azure/data/images.go index 06cc8fb4c..adaf5ea1c 100644 --- a/pkg/provider/azure/data/images.go +++ b/pkg/provider/azure/data/images.go @@ -120,6 +120,34 @@ func GetSharedImageDiskControllerTypes(ctx context.Context, id *string) ([]strin return nil, nil } +// 
ListGalleriesByPrefix returns the names of galleries in resourceGroup (within +// subscriptionID) whose names start with namePrefix. +func ListGalleriesByPrefix(ctx context.Context, subscriptionID, resourceGroup, namePrefix string) ([]string, error) { + ensureAzureEnvs() + cred, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, err + } + c, err := armcompute.NewClientFactory(subscriptionID, cred, nil) + if err != nil { + return nil, err + } + pager := c.NewGalleriesClient().NewListByResourceGroupPager(resourceGroup, nil) + var names []string + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return nil, err + } + for _, g := range page.Value { + if g.Name != nil && strings.HasPrefix(*g.Name, namePrefix) { + names = append(names, *g.Name) + } + } + } + return names, nil +} + func SkuG2Support(ctx context.Context, location string, publisher string, offer string, sku string) (string, error) { ensureAzureEnvs() cred, err := azidentity.NewDefaultAzureCredential(nil)