Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion cmd/mapt/cmd/aws/hosts/rhelai.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package hosts

import (
"fmt"

awsParams "github.com/redhat-developer/mapt/cmd/mapt/cmd/aws/params"
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
Expand All @@ -14,6 +16,9 @@ import (
// Command names and short descriptions for the AWS rhel-ai command group.
const (
// cmdRHELAI / cmdRHELAIDesc: presumably the name and description of the
// parent command built by GetRHELAICmd — its full body is not visible
// here, TODO confirm.
cmdRHELAI = "rhel-ai"
cmdRHELAIDesc = "manage rhel ai host"

// cmdRHELAIListVersions is the Use string of the list-versions subcommand
// and the name of its flag set; cmdRHELAIListVersionsDesc is its Short help.
cmdRHELAIListVersions = "list-versions"
cmdRHELAIListVersionsDesc = "list available RHEL AI versions"
)

func GetRHELAICmd() *cobra.Command {
Expand All @@ -32,7 +37,7 @@ func GetRHELAICmd() *cobra.Command {
params.AddCommonFlags(flagSet)
c.PersistentFlags().AddFlagSet(flagSet)

c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
return c
}

Expand Down Expand Up @@ -107,3 +112,27 @@ func getRHELAIDestroy() *cobra.Command {
c.PersistentFlags().AddFlagSet(flagSet)
return c
}

// getRHELAIListVersions builds the "list-versions" subcommand.
//
// When run, the command binds its flags into viper, asks rhelai.ListVersions
// for the versions matching the accelerator flag, and prints one version per
// line. It returns an error if flag binding, the lookup, or writing the
// output fails.
func getRHELAIListVersions() *cobra.Command {
	c := &cobra.Command{
		Use:   cmdRHELAIListVersions,
		Short: cmdRHELAIListVersionsDesc,
		RunE: func(cmd *cobra.Command, args []string) error {
			if err := viper.BindPFlags(cmd.Flags()); err != nil {
				return err
			}
			accelerator := viper.GetString(params.RhelAIAccelerator)
			versions, err := rhelai.ListVersions(cmd.Context(), accelerator)
			if err != nil {
				// Add the accelerator so the user can tell which lookup failed.
				return fmt.Errorf("listing RHEL AI versions for accelerator %q: %w", accelerator, err)
			}
			// Write through the command's configured writer (stdout unless
			// overridden via SetOut) so output can be captured or redirected,
			// and report write failures instead of dropping them.
			out := cmd.OutOrStdout()
			for _, v := range versions {
				if _, err := fmt.Fprintln(out, v); err != nil {
					return err
				}
			}
			return nil
		},
	}
	flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
	flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
	c.PersistentFlags().AddFlagSet(flagSet)
	return c
}
31 changes: 30 additions & 1 deletion cmd/mapt/cmd/azure/hosts/rhelai.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package hosts

import (
"fmt"

"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
rhelai "github.com/redhat-developer/mapt/pkg/provider/azure/action/rhel-ai"
Expand All @@ -13,6 +15,9 @@ import (
// Command names and short descriptions for the Azure rhel-ai command group.
const (
// cmdRHELAI / cmdRHELAIDesc: presumably the name and description of the
// parent command built by GetRHELAICmd — its full body is not visible
// here, TODO confirm.
cmdRHELAI = "rhel-ai"
cmdRHELAIDesc = "manage rhel ai host"

// cmdRHELAIListVersions is the Use string of the list-versions subcommand
// and the name of its flag set; cmdRHELAIListVersionsDesc is its Short help.
cmdRHELAIListVersions = "list-versions"
cmdRHELAIListVersionsDesc = "list available RHEL AI versions in the Azure Compute Gallery"
)

func GetRHELAICmd() *cobra.Command {
Expand All @@ -31,7 +36,7 @@ func GetRHELAICmd() *cobra.Command {
params.AddCommonFlags(flagSet)
c.PersistentFlags().AddFlagSet(flagSet)

c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
return c
}

Expand Down Expand Up @@ -104,3 +109,27 @@ func getRHELAIDestroy() *cobra.Command {
c.PersistentFlags().AddFlagSet(flagSet)
return c
}

// getRHELAIListVersions builds the "list-versions" subcommand.
//
// When run, the command binds its flags into viper, asks rhelai.ListVersions
// for the versions matching the accelerator flag, and prints one version per
// line. It returns an error if flag binding, the lookup, or writing the
// output fails.
func getRHELAIListVersions() *cobra.Command {
	c := &cobra.Command{
		Use:   cmdRHELAIListVersions,
		Short: cmdRHELAIListVersionsDesc,
		RunE: func(cmd *cobra.Command, args []string) error {
			if err := viper.BindPFlags(cmd.Flags()); err != nil {
				return err
			}
			accelerator := viper.GetString(params.RhelAIAccelerator)
			versions, err := rhelai.ListVersions(cmd.Context(), accelerator)
			if err != nil {
				// Add the accelerator so the user can tell which lookup failed.
				return fmt.Errorf("listing RHEL AI versions for accelerator %q: %w", accelerator, err)
			}
			// Write through the command's configured writer (stdout unless
			// overridden via SetOut) so output can be captured or redirected,
			// and report write failures instead of dropping them.
			out := cmd.OutOrStdout()
			for _, v := range versions {
				if _, err := fmt.Fprintln(out, v); err != nil {
					return err
				}
			}
			return nil
		},
	}
	flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
	flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
	c.PersistentFlags().AddFlagSet(flagSet)
	return c
}
110 changes: 110 additions & 0 deletions docs/aws/rhelai.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Overview

mapt offers operations to manage RHEL AI environments on AWS. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images, suitable for AI/ML workloads.

## Operations

### List Versions

List available RHEL AI versions for a given accelerator type:

```bash
mapt aws rhel-ai list-versions -h
list-versions

Usage:
mapt aws rhel-ai list-versions [flags]

Flags:
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
-h, --help help for list-versions

Global Flags:
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
--project-name string project name to identify the instance of the stack
```

#### Container

```bash
podman run -it --rm \
-e AWS_ACCESS_KEY_ID=XXX \
-e AWS_SECRET_ACCESS_KEY=XXX \
-e AWS_DEFAULT_REGION=us-east-1 \
quay.io/redhat-developer/mapt:0.7.0-dev aws \
rhel-ai list-versions \
--accelerator cuda
```

### Create

This creates a RHEL AI instance according to the specified parameters:

```bash
mapt aws rhel-ai create -h
create

Usage:
mapt aws rhel-ai create [flags]

Flags:
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
--conn-details-output string path to export host connection information (host, username and privateKey)
--cpus int32 Number of CPUs for the cloud instance (default 8)
--custom-image string custom AMI name (overrides version and accelerator)
--disk-size int Disk size in GB (default 2000)
--gpus int32 Number of GPUs
--memory int32 Amount of RAM for the cloud instance in GiB (default 64)
--spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction)
--spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest")
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
--timeout string set a timeout for the instance (e.g. 4h)
--version string version for the RHELAI OS (default "3.0.0")
-h, --help help for create

Global Flags:
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
--project-name string project name to identify the instance of the stack
```

#### Outputs

It creates a RHEL AI instance and writes the following files to the path defined by `--conn-details-output`:

* **host**: host for the instance (load balancer DNS if spot)
* **username**: username to connect to the machine
* **id_rsa**: private key to connect to the machine

It also creates a state folder holding the state of the resources created on AWS. The path of this folder is defined by `--backed-url`; its content, together with the same project name (`--project-name`), is required to destroy the resources.

#### Container

When running the container image, the authentication information must be passed as environment variables. The following sample snippet creates an instance with default values:

```bash
podman run -d --name mapt-rhelai \
-v ${PWD}:/workspace:z \
-e AWS_ACCESS_KEY_ID=XXX \
-e AWS_SECRET_ACCESS_KEY=XXX \
-e AWS_DEFAULT_REGION=us-east-1 \
quay.io/redhat-developer/mapt:0.7.0-dev aws \
rhel-ai create \
--project-name mapt-rhelai \
--backed-url file:///workspace \
--conn-details-output /workspace \
--spot
```

### Destroy

```bash
podman run -d --rm \
-v ${PWD}:/workspace:z \
-e AWS_ACCESS_KEY_ID=XXX \
-e AWS_SECRET_ACCESS_KEY=XXX \
-e AWS_DEFAULT_REGION=us-east-1 \
quay.io/redhat-developer/mapt:0.7.0-dev aws \
rhel-ai destroy \
--project-name mapt-rhelai \
--backed-url file:///workspace
```
113 changes: 113 additions & 0 deletions docs/azure/rhelai.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Overview

mapt offers operations to manage RHEL AI environments on Azure. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images from the Azure Compute Gallery, suitable for AI/ML workloads.

## Operations

### List Versions

List available RHEL AI versions for a given accelerator type:

```bash
mapt azure rhel-ai list-versions -h
list-versions

Usage:
mapt azure rhel-ai list-versions [flags]

Flags:
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
-h, --help help for list-versions

Global Flags:
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
--project-name string project name to identify the instance of the stack
```

#### Container

```bash
podman run -it --rm \
-e ARM_TENANT_ID=${ati_value} \
-e ARM_SUBSCRIPTION_ID=${asi_value} \
-e ARM_CLIENT_ID=${aci_value} \
-e ARM_CLIENT_SECRET=${acs_value} \
quay.io/redhat-developer/mapt:0.7.0-dev azure \
rhel-ai list-versions \
--accelerator cuda
```

### Create

This creates a RHEL AI instance according to the specified parameters:

```bash
mapt azure rhel-ai create -h
create

Usage:
mapt azure rhel-ai create [flags]

Flags:
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
--conn-details-output string path to export host connection information (host, username and privateKey)
--cpus int32 Number of CPUs for the cloud instance (default 8)
--custom-image string custom image name (overrides version and accelerator)
--disk-size int Disk size in GB
--gpus int32 Number of GPUs
--memory int32 Amount of RAM for the cloud instance in GiB (default 64)
--spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction)
--spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest")
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
--timeout string set a timeout for the instance (e.g. 4h)
--version string version for the RHELAI OS (default "3.0.0")
-h, --help help for create

Global Flags:
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
--project-name string project name to identify the instance of the stack
```

#### Outputs

It creates a RHEL AI instance and writes the following files to the path defined by `--conn-details-output`:

* **host**: host for the instance (load balancer DNS if spot)
* **username**: username to connect to the machine
* **id_rsa**: private key to connect to the machine

It also creates a state folder holding the state of the resources created on Azure. The path of this folder is defined by `--backed-url`; its content, together with the same project name (`--project-name`), is required to destroy the resources.

#### Container

When running the container image, the authentication information must be passed as environment variables. The following sample snippet creates an instance with default values:

```bash
podman run -d --name mapt-rhelai \
-v ${PWD}:/workspace:z \
-e ARM_TENANT_ID=${ati_value} \
-e ARM_SUBSCRIPTION_ID=${asi_value} \
-e ARM_CLIENT_ID=${aci_value} \
-e ARM_CLIENT_SECRET=${acs_value} \
quay.io/redhat-developer/mapt:0.7.0-dev azure \
rhel-ai create \
--project-name mapt-rhelai \
--backed-url file:///workspace \
--conn-details-output /workspace \
--spot
```

### Destroy

```bash
podman run -d --rm \
-v ${PWD}:/workspace:z \
-e ARM_TENANT_ID=${ati_value} \
-e ARM_SUBSCRIPTION_ID=${asi_value} \
-e ARM_CLIENT_ID=${aci_value} \
-e ARM_CLIENT_SECRET=${acs_value} \
quay.io/redhat-developer/mapt:0.7.0-dev azure \
rhel-ai destroy \
--project-name mapt-rhelai \
--backed-url file:///workspace
```
Loading