diff --git a/autoshift/values.hub.yaml b/autoshift/values.hub.yaml index 7a349073..35e1f688 100644 --- a/autoshift/values.hub.yaml +++ b/autoshift/values.hub.yaml @@ -267,6 +267,13 @@ hubClusterSets: # Suffix appended to catalog source names for mirrored catalogs # Example: redhat-operators + -mirror = redhat-operators-mirror mirror-catalog-suffix: 'mirror' + ### nvidia-gpu + nvidia-gpu: 'false' + nvidia-gpu-subscription-name: gpu-operator-certified + nvidia-gpu-channel: v25.10 + nvidia-gpu-source: certified-operators  # gpu-operator-certified is served by the certified catalog; matches the chart default + nvidia-gpu-source-namespace: openshift-marketplace + nvidia-gpu-version: 'gpu-operator-certified.v25.10.1' ### node-feature-discovery node-feature-discovery: 'false' node-feature-discovery-subscription-name: nfd diff --git a/policies/nvidia-gpu/Chart.yaml b/policies/nvidia-gpu/Chart.yaml new file mode 100644 index 00000000..3f1d7683 --- /dev/null +++ b/policies/nvidia-gpu/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: nvidia-gpu +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. 
They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" \ No newline at end of file diff --git a/policies/nvidia-gpu/README.md b/policies/nvidia-gpu/README.md new file mode 100644 index 00000000..57bb2601 --- /dev/null +++ b/policies/nvidia-gpu/README.md @@ -0,0 +1,274 @@ +# nvidia-gpu AutoShift Policy + +## Overview +This policy installs the gpu-operator-certified operator and deploys a default NVIDIA `ClusterPolicy` using AutoShift patterns. + +## Status +✅ **Operator Installation**: Ready to deploy +🔧 **Configuration**: Default `ClusterPolicy` (`gpu-cluster-policy`) included; further tuning is operator-specific (see below) + +## Quick Deploy + +### Test Locally +```bash +# Validate policy renders correctly +helm template policies/nvidia-gpu/ +``` + +### Enable on Clusters +Edit AutoShift values files to add the operator labels: + +```yaml +# In autoshift/values.hub.yaml (or values.sbx.yaml, etc.) +hubClusterSets: + hub: + labels: + nvidia-gpu: 'true' + nvidia-gpu-subscription-name: 'gpu-operator-certified' + nvidia-gpu-channel: 'v25.10' + nvidia-gpu-source: 'certified-operators' + nvidia-gpu-source-namespace: 'openshift-marketplace' + # nvidia-gpu-version: 'gpu-operator-certified.v25.10.1' # Optional: pin to specific CSV version + +managedClusterSets: + managed: + labels: + nvidia-gpu: 'true' + nvidia-gpu-subscription-name: 'gpu-operator-certified' + nvidia-gpu-channel: 'v25.10' + nvidia-gpu-source: 'certified-operators' + nvidia-gpu-source-namespace: 'openshift-marketplace' + # nvidia-gpu-version: 'gpu-operator-certified.v25.10.1' # Optional: pin to specific CSV version + +# For specific clusters (optional override) +clusters: + my-cluster: + labels: + nvidia-gpu: 'true' + nvidia-gpu-channel: 'stable' # Override channel for this cluster (must be a channel published for gpu-operator-certified) +``` + +Labels are automatically propagated to clusters via the cluster-labels policy. 
+ +### Add to AutoShift ApplicationSet +Edit `autoshift/templates/applicationset.yaml` and add: +```yaml +- name: nvidia-gpu + path: policies/nvidia-gpu + helm: + valueFiles: + - values.yaml +``` + +## Configuration + +### Namespace Scope +This operator can be configured as: +- **Cluster-scoped**: Manages resources across all namespaces (when `targetNamespaces` is removed from values.yaml) +- **Namespace-scoped**: Limited to specific target namespaces (default: `targetNamespaces` is set to `nvidia-gpu-operator` in values.yaml) + +To change scope, edit `values.yaml` and adjust or remove the `targetNamespaces` field. + +### Version Control +This policy supports AutoShift's operator version control system: + +- **Automatic Upgrades**: By default, the operator follows automatic upgrade paths within its channel +- **Version Pinning**: Add `nvidia-gpu-version` label to pin to a specific CSV version +- **Manual Control**: Pinned versions require manual updates to upgrade + +To pin to a specific version, add the version label to your cluster or clusterset: +```yaml +nvidia-gpu-version: 'gpu-operator-certified.v25.10.1' +``` + +Find available CSV versions: +```bash +# List available versions for this operator +oc get packagemanifests gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.channels[*].currentCSV}' +``` + +## Next Steps: Configuration + +### 1. Explore Installed CRDs +After operator installation, check what Custom Resources are available: +```bash +# Wait for operator to install +oc get pods -n nvidia-gpu-operator + +# Check available CRDs +oc get crds | grep nvidia.com + +# Explore CRD specifications +oc explain clusterpolicies.nvidia.com +``` + +### 2. Create Configuration Policies +Add operator-specific configuration policies to `templates/` directory. 
+ +#### Common Patterns: +- `policy-nvidia-gpu-config.yaml` - Main configuration +- `policy-nvidia-gpu-<feature>.yaml` - Feature-specific configs + +#### Template Structure: +```yaml +{{- $policyName := "policy-nvidia-gpu-config" }} +{{- $placementName := "placement-policy-nvidia-gpu-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + dependencies: + - name: policy-nvidia-gpu-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-config + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: # Your operator's API version + kind: # Your operator's Custom Resource + metadata: + name: nvidia-gpu-config + namespace: {{ .Values.nvidiaGpu.namespace }} + spec: + # Your operator-specific configuration + # Use dynamic labels when needed: + # setting: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-setting" | default "default-value" {{ "hub}}" }}' + pruneObjectBehavior: None +--- +# Use same placement as operator install or create specific targeting +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - 
{{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/nvidia-gpu' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy +``` + +### 3. Reference Examples +**Study similar complexity policies:** +- **Simple**: `policies/openshift-gitops/` - Basic operator + ArgoCD config +- **Medium**: `policies/advanced-cluster-security/` - Multiple related policies +- **Complex**: `policies/metallb/` - Multiple configuration types (L2, BGP, etc.) +- **Advanced**: `policies/openshift-data-foundation/` - Storage cluster configuration + +### 4. 
AutoShift Labels +Add configuration labels to `values.yaml` and use in templates: + +```yaml +# Add to values.yaml AutoShift Labels Documentation: +# nvidia-gpu-setting: Configuration option (default: 'value') +# nvidia-gpu-feature-enabled: Enable optional feature (default: 'false') +# nvidia-gpu-provider: Provider-specific config (default: 'generic') + +# Use in templates: +setting: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-setting" | default "default-value" {{ "hub}}" }}' +``` + +## Common Patterns + +### CSV Status Checking (Optional) +For operators that need installation verification: +```yaml +- objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-csv-status + spec: + remediationAction: inform + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operators.coreos.com/v1alpha1 + kind: ClusterServiceVersion + metadata: + namespace: {{ .Values.nvidiaGpu.namespace }} + status: + phase: Succeeded +``` + +### ArgoCD Sync Annotations (If Needed) +For policies requiring special sync behavior: +```yaml +annotations: + argocd.argoproj.io/sync-options: Prune=false,SkipDryRunOnMissingResource=true + argocd.argoproj.io/compare-options: IgnoreExtraneous + argocd.argoproj.io/sync-wave: "1" +``` + +## Troubleshooting + +### Policy Not Applied +1. Check cluster labels: `oc get managedcluster --show-labels` +2. Verify placement: `oc get placement -n open-cluster-policies` +3. Check policy status: `oc describe policy policy-nvidia-gpu-operator-install` + +### Operator Installation Issues +1. Check subscription: `oc get subscription -n nvidia-gpu-operator` +2. Check install plan: `oc get installplan -n nvidia-gpu-operator` +3. Verify operator source exists: `oc get catalogsource -n openshift-marketplace` + +### Template Rendering Issues +1. Test locally: `helm template policies/nvidia-gpu/` +2. 
Check hub escaping: Look for `{{ "{{hub" }} ... {{ "hub}}" }}` patterns +3. Validate YAML: `helm lint policies/nvidia-gpu/` + +## Resources +- [Operator Documentation](https://operatorhub.io/operator/gpu-operator-certified) - Find your operator details +- [AutoShift Policy Patterns](../../README-DEVELOPER.md) - Comprehensive policy development guide +- [ACM Policy Documentation](https://access.redhat.com/documentation/en-us/red_hat_advanced_cluster_management_for_kubernetes) - Policy syntax reference in the Governance section +- [Similar Policies](../) - Browse other policies for patterns and examples \ No newline at end of file diff --git a/policies/nvidia-gpu/templates/policy-nvidia-cluster-policy-deploy.yaml b/policies/nvidia-gpu/templates/policy-nvidia-cluster-policy-deploy.yaml new file mode 100644 index 00000000..f47c693e --- /dev/null +++ b/policies/nvidia-gpu/templates/policy-nvidia-cluster-policy-deploy.yaml @@ -0,0 +1,164 @@ +{{- $policyName := "policy-nvidia-cluster-policy-deploy" }} +{{- $placementName := "placement-policy-nvidia-cluster-policy-deploy" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: +{{ if (($.Values.autoshift).dryRun) }} + remediationAction: inform +{{ end }} + disabled: false + dependencies: + - name: policy-nvidia-gpu-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-cluster-policy-deploy + spec: + remediationAction: enforce + severity: high + object-templates: + 
- complianceType: musthave + objectDefinition: + kind: ClusterPolicy + apiVersion: nvidia.com/v1 + metadata: + name: gpu-cluster-policy + spec: + operator: + defaultRuntime: crio + use_ocp_driver_toolkit: true + initContainer: {} + cdi: + enabled: true + sandboxWorkloads: + enabled: true + defaultWorkload: container + driver: + enabled: true + useNvidiaDriverCRD: false + kernelModuleType: auto + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + repoConfig: + configMapName: '' + certConfig: + name: '' + licensingConfig: + nlsEnabled: true + secretName: '' + virtualTopology: + config: '' + kernelModuleConfig: + name: '' + dcgmExporter: + enabled: true + config: + name: '' + serviceMonitor: + enabled: true + dcgm: + enabled: true + daemonsets: + updateStrategy: RollingUpdate + rollingUpdate: + maxUnavailable: '1' + devicePlugin: + enabled: true + config: + name: '' + default: '' + mps: + root: /run/nvidia/mps + gfd: + enabled: true + migManager: + enabled: true + nodeStatusExporter: + enabled: true + mig: + strategy: single + toolkit: + enabled: true + validator: + plugin: + env: [] + vgpuManager: + enabled: false + vgpuDeviceManager: + enabled: true + sandboxDevicePlugin: + enabled: true + vfioManager: + enabled: true + gds: + enabled: false + gdrcopy: + enabled: false +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 
'autoshift.io/nvidia-gpu' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy diff --git a/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml b/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml new file mode 100644 index 00000000..deb5ed5c --- /dev/null +++ b/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml @@ -0,0 +1,104 @@ +{{- $policyName := "policy-nvidia-gpu-operator-install" }} +{{- $placementName := "placement-policy-nvidia-gpu-operator-install" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: +{{ if (($.Values.autoshift).dryRun) }} + remediationAction: inform +{{ end }} + disabled: false + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-operator-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: {{ .Values.nvidiaGpu.namespace }} + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1beta1 + kind: OperatorPolicy + 
metadata: + name: install-nvidia-gpu + spec: + remediationAction: enforce + severity: high + complianceType: musthave + operatorGroup: + name: {{ .Values.nvidiaGpu.operatorGroupName }} + namespace: {{ .Values.nvidiaGpu.namespace }} + {{- if .Values.nvidiaGpu.targetNamespaces }} + targetNamespaces: + {{- range .Values.nvidiaGpu.targetNamespaces }} + - {{ . }} + {{- end }} + {{- end }} + subscription: + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-version" | default "" {{ "hub}}" }}' + namespace: {{ .Values.nvidiaGpu.namespace }} + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-channel" | default "{{ .Values.nvidiaGpu.channel }}" {{ "hub}}" }}' + name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-subscription-name" | default "{{ .Values.nvidiaGpu.name }}" {{ "hub}}" }}' + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-source" | default "{{ .Values.nvidiaGpu.source }}" {{ "hub}}" }}' + sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-source-namespace" | default "{{ .Values.nvidiaGpu.sourceNamespace }}" {{ "hub}}" }}' + # Upgrade approval for subscription will be set based on if version is set not this flag + # If version is set install plan is set to manual and only the version specified will be installed + upgradeApproval: Automatic + versions: + - '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-version" | default "" {{ "hub}}" }}' +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 
'autoshift.io/nvidia-gpu' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy \ No newline at end of file diff --git a/policies/nvidia-gpu/values.yaml b/policies/nvidia-gpu/values.yaml new file mode 100644 index 00000000..cda3cdf1 --- /dev/null +++ b/policies/nvidia-gpu/values.yaml @@ -0,0 +1,43 @@ +# Default values for nvidia-gpu +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +policy_namespace: open-cluster-policies +nvidiaGpu: + name: gpu-operator-certified + namespace: nvidia-gpu-operator + channel: v25.10 + source: certified-operators + sourceNamespace: openshift-marketplace + operatorGroupName: nvidia-gpu-operator-group + targetNamespaces: # Target namespaces for namespace-scoped operators + - nvidia-gpu-operator + +# hubClusterSets: +# hub: +# labels: +# test1: 'test' +# managedClusterSets: +# managed: +# labels: +# test4: 'test' + +### AutoShift Labels Documentation +# The following labels can be set at the cluster or clusterset level to configure this policy: +# +# Enable/Disable: +# nvidia-gpu: 'true' or 'false' - Controls whether nvidia-gpu is managed +# +# Configuration: +# nvidia-gpu-subscription-name: Operator subscription name (default: 'gpu-operator-certified') +# nvidia-gpu-channel: Operator channel (default: 'v25.10') +# nvidia-gpu-version: Specific operator version (CSV) to install (optional) +# nvidia-gpu-source: Operator catalog source (default: 'certified-operators') +# 
nvidia-gpu-source-namespace: Catalog namespace (default: 'openshift-marketplace') +# +# Examples: +# autoshift.io/nvidia-gpu: 'true' +# autoshift.io/nvidia-gpu-subscription-name: 'gpu-operator-certified' +# autoshift.io/nvidia-gpu-channel: 'v25.10' +# autoshift.io/nvidia-gpu-version: 'gpu-operator-certified.v25.10.1' +# autoshift.io/nvidia-gpu-source: 'certified-operators' +# autoshift.io/nvidia-gpu-source-namespace: 'openshift-marketplace' \ No newline at end of file