2# Copyright 2024 The Kubeflow authors.
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
8# https://www.apache.org/licenses/LICENSE-2.0
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
17# Default values for spark-operator.
18# This is a YAML-formatted file.
19# Declare variables to be passed into your templates.
21# -- String to partially override release name.
23# -- String to fully override release name.
25# -- Common labels to add to the resources.
27# Image used by the Spark operator.
31 # -- Image repository.
32 repository: chainguard-private/spark-operator
34 # @default -- If not set, the chart appVersion will be used.
35 tag: latest@sha256:68988784d6770df9e8222c6e700fb94ba25519757a2bc75084272e7cd8fee3a6
36 # -- Image pull policy.
37 pullPolicy: IfNotPresent
38 # -- Image pull secrets for private image registry.
40 # - name: <secret-name>
41# Helm hook configuration.
43 # -- Whether to create a Helm pre-install/pre-upgrade hook Job to update CRDs.
45 # Image used by the Helm hook Job.
49 # -- Image repository.
50 repository: chainguard-private/kubectl
52 # @default -- If not set, the chart appVersion will be used.
53 tag: latest@sha256:7a93e691227757aa829b9dc7826e90fc4234d4616b951bf09dc0c358e822c848
54 # -- Node selector for the Helm hook Job.
56 # -- Affinity for the Helm hook Job.
58 # -- List of node taints to tolerate for the Helm hook Job.
61 # -- Number of replicas of controller.
63 # -- Feature gates to enable or disable specific features.
65 - name: PartialRestart
67 - name: LoadSparkDefaults
69 # -- The number of old revisions to retain to allow rollback.
70 revisionHistoryLimit: 10
72 # -- Specifies whether to enable leader election for controller.
74 # -- Leader election lease duration.
76 # -- Leader election renew deadline.
78 # -- Leader election retry period.
80 # -- Reconcile concurrency, higher values might increase memory usage.
82 # -- Configure the verbosity of logging, can be one of `debug`, `info`, `error`.
84 # -- Configure the encoder of logging, can be one of `console` or `json`.
86 # -- Grace period after a successful spark-submit during which "driver pod not found" errors will be retried. Useful if the driver pod can take some time to be created.
87 driverPodCreationGracePeriod: 10s
88 # -- Specifies the maximum number of Executor pods that can be tracked by the controller per SparkApplication.
89 maxTrackedExecutorPerApp: 1000
90 # -- Timestamp precision for ScheduledSparkApplication run names.
91 # Valid values: nanos (default), micros, millis, seconds, minutes.
92 # Shorter precisions produce shorter names which helps with Kubernetes name length limits.
93 # NOTE: Using lower precisions such as "seconds" or "minutes" increases the risk of name
94 # collisions if multiple runs are created within the same time unit (for example during
95 # reconciliation loops or manual re-triggers). A collision will cause run creation to fail.
96 # Choose a precision compatible with your scheduling frequency: "minutes" is only suitable
97 # for jobs scheduled at most once per minute, "seconds" for jobs scheduled at most once per second.
98 scheduledSparkApplicationTimestampPrecision: nanos
100 # -- Specifies whether to create service for Spark web UI.
103 # -- Specifies whether to create ingress for Spark web UI.
104 # `controller.uiService.enable` must be `true` to enable ingress.
106 # -- Ingress URL format.
107 # Required if `controller.uiIngress.enable` is true.
109 # -- Optionally set the ingressClassName.
111 # -- Optionally set default TLS configuration for the Spark UI's ingress. `ingressTLS` in the SparkApplication spec overrides this.
115 # secretName: "example-secret"
116 # -- Optionally set default ingress annotations for the Spark UI's ingress. `ingressAnnotations` in the SparkApplication spec overrides this.
121 # -- Specifies whether to enable batch scheduler for spark jobs scheduling.
122 # If enabled, users can specify batch scheduler name in spark application.
124 # -- Specifies a list of kube-scheduler names for scheduling Spark pods.
125 kubeSchedulerNames: []
126 # - default-scheduler
127 # -- Default batch scheduler to be used if not specified by the user.
128 # If specified, this value must be either "volcano" or "yunikorn". Specifying any other
129 # value will cause the controller to error on startup.
132 # -- Specifies whether to create a service account for the controller.
134 # -- Optional name for the controller service account.
136 # -- Extra annotations for the controller service account.
138 # -- Auto-mount service account token to the controller pods.
139 automountServiceAccountToken: true
141 # -- Specifies whether to create RBAC resources for the controller.
143 # -- Extra annotations for the controller RBAC resources.
145 # -- Extra labels for controller pods.
150 # -- Extra annotations for controller pods.
155 # -- Volumes for controller pods.
157 # Create a tmp directory to write Spark artifacts to for deployed Spark apps.
161 # -- Node selector for controller pods.
163 # -- Affinity for controller pods.
165 # -- List of node taints to tolerate for controller pods.
167 # -- Priority class for controller pods.
168 priorityClassName: ""
169 # -- Security context for controller pods.
172 # -- Topology spread constraints rely on node labels to identify the topology domain(s) that each Node is in.
173 # Ref: [Pod Topology Spread Constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/).
174 # The labelSelector field in topology spread constraint will be set to the selector labels for controller pods if not specified.
175 topologySpreadConstraints: []
177 # topologyKey: topology.kubernetes.io/zone
178 # whenUnsatisfiable: ScheduleAnyway
180 # topologyKey: kubernetes.io/hostname
181 # whenUnsatisfiable: DoNotSchedule
183 # -- Whether to use a user namespace for controller pods.
184 # Kubernetes version 1.30 for feature beta (1.33 for GA) or higher is required with support from OS and OCI runtime
185 # ref: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/
187 # -- Environment variables for controller containers.
189 # -- Environment variable sources for controller containers.
191 # -- Volume mounts for controller containers.
193 # Mount a tmp directory to write Spark artifacts to for deployed Spark apps.
197 # -- Pod resource requests and limits for controller containers.
198 # Note that each job submission will spawn a JVM within the controller pods using "/usr/local/openjdk-11/bin/java -Xmx128m".
199 # Kubernetes may kill these Java processes at will to enforce resource limits. When that happens, you will see the following error:
200 # 'failed to run spark-submit for SparkApplication [...]: signal: killed' - when this happens, you may want to increase memory limits.
209 # -- Security context for controller containers.
211 readOnlyRootFilesystem: true
213 allowPrivilegeEscalation: false
220 # -- Sidecar containers for controller pods.
222 # Pod disruption budget for controller to avoid service degradation.
224 # -- Specifies whether to create pod disruption budget for controller.
225 # Ref: [Specifying a Disruption Budget for your Application](https://kubernetes.io/docs/tasks/run-application/configure-pdb/)
227 # -- The number of pods that must be available.
228 # Requires `controller.replicas` to be greater than 1.
231 # -- Specifies whether to enable pprof.
233 # -- Specifies pprof port.
235 # -- Specifies pprof service port name.
237 # Workqueue rate limiter configuration forwarded to the controller-runtime Reconciler.
238 workqueueRateLimiter:
239 # -- Specifies the average rate of items processed by the workqueue rate limiter.
241 # -- Specifies the maximum number of items that can be in the workqueue at any given time.
244 # -- Specifies whether to enable max delay for the workqueue rate limiter.
245 # This is useful to avoid losing events when the workqueue is full.
247 # -- Specifies the maximum delay duration for the workqueue rate limiter.
250 # -- Specifies whether to enable webhook.
252 # -- Number of replicas of webhook server.
254 # -- The number of old revisions to retain to allow rollback.
255 revisionHistoryLimit: 10
257 # -- Specifies whether to enable leader election for webhook.
259 # -- Configure the verbosity of logging, can be one of `debug`, `info`, `error`.
261 # -- Configure the encoder of logging, can be one of `console` or `json`.
263 # -- Specifies webhook port.
265 # -- Specifies webhook service port name.
267 # -- Specifies how unrecognized errors are handled.
268 # Available options are `Ignore` or `Fail`.
270 # -- Specifies the timeout seconds of the webhook, the value must be between 1 and 30.
272 resourceQuotaEnforcement:
273 # -- Specifies whether to enable the ResourceQuota enforcement for SparkApplication resources.
276 # -- Specifies whether to create a service account for the webhook.
278 # -- Optional name for the webhook service account.
280 # -- Extra annotations for the webhook service account.
282 # -- Auto-mount service account token to the webhook pods.
283 automountServiceAccountToken: true
285 # -- Specifies whether to create RBAC resources for the webhook.
287 # -- Extra annotations for the webhook RBAC resources.
289 # -- Extra labels for webhook pods.
294 # -- Extra annotations for webhook pods.
299 # -- Sidecar containers for webhook pods.
301 # -- Volumes for webhook pods.
303 # Create a dir for the webhook to generate its certificates in.
304 - name: serving-certs
307 # -- Node selector for webhook pods.
309 # -- Affinity for webhook pods.
311 # -- List of node taints to tolerate for webhook pods.
313 # -- Priority class for webhook pods.
314 priorityClassName: ""
315 # -- Security context for webhook pods.
318 # -- Topology spread constraints rely on node labels to identify the topology domain(s) that each Node is in.
319 # Ref: [Pod Topology Spread Constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/).
320 # The labelSelector field in topology spread constraint will be set to the selector labels for webhook pods if not specified.
321 topologySpreadConstraints: []
323 # topologyKey: topology.kubernetes.io/zone
324 # whenUnsatisfiable: ScheduleAnyway
326 # topologyKey: kubernetes.io/hostname
327 # whenUnsatisfiable: DoNotSchedule
329 # -- Whether to use a user namespace for webhook pods.
330 # Kubernetes version 1.30 for feature beta (1.33 for GA) or higher is required with support from OS and OCI runtime
331 # ref: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/
333 # -- Environment variables for webhook containers.
335 # -- Environment variable sources for webhook containers.
337 # -- Volume mounts for webhook containers.
339 # Mount a dir for the webhook to generate its certificates in.
340 - name: serving-certs
341 mountPath: /etc/k8s-webhook-server/serving-certs
342 subPath: serving-certs
344 # -- Pod resource requests and limits for webhook pods.
353 # -- Security context for webhook containers.
355 readOnlyRootFilesystem: true
357 allowPrivilegeEscalation: false
364 # Pod disruption budget for webhook to avoid service degradation.
366 # -- Specifies whether to create pod disruption budget for webhook.
367 # Ref: [Specifying a Disruption Budget for your Application](https://kubernetes.io/docs/tasks/run-application/configure-pdb/)
369 # -- The number of pods that must be available.
370 # Requires `webhook.replicas` to be greater than 1.
373 # -- List of namespaces where to run spark jobs.
374 # If empty string is included, all namespaces will be allowed.
375 # Namespaces specified here will be watched in addition to those matching jobNamespaceSelector.
376 # Make sure the namespaces already exist.
379 # -- Label selector to filter namespaces to watch.
380 # Supports standard Kubernetes label selector syntax (e.g., 'spark-operator=enabled,env in (prod,staging)').
381 # Namespaces matching this selector will be watched in addition to those in jobNamespaces.
382 # When specified, requires ClusterRole permission to list and watch namespaces.
383 # Leave empty to disable namespace selector functionality.
384 jobNamespaceSelector: ""
386 # -- Specifies whether to create a service account for spark applications.
388 # -- Optional name for the spark service account.
390 # -- Optional annotations for the spark service account.
392 # -- Auto-mount service account token to the spark applications pods.
393 automountServiceAccountToken: true
395 # -- Specifies whether to create RBAC resources for spark applications.
397 # -- Optional annotations for the spark application RBAC resources.
401 # -- Specifies whether to enable prometheus metrics scraping.
405 # -- Metrics port name.
407 # -- Metrics serving endpoint.
409 # -- Metrics prefix, will be added to all exported metrics.
411 # -- Job Start Latency histogram buckets. Specified in seconds.
412 jobStartLatencyBuckets: "30,60,90,120,150,180,210,240,270,300"
413 # -- Labels to be added to the Spark Operator standard metrics, e.g., "label1Key,label2Key".
414 # Defaults to 'app_type' if not set.
416 # Prometheus pod monitor for controller pods
418 # -- Specifies whether to create pod monitor.
419 # Note that prometheus metrics should be enabled as well.
421 # -- Pod monitor labels
423 # -- The label to use to retrieve the job name from
424 jobLabel: spark-operator-podmonitor
425 # -- Prometheus metrics endpoint properties. `metrics.portName` will be used as a port
430 # -- Specifies whether to use [cert-manager](https://cert-manager.io) to generate certificate for webhook.
431 # `webhook.enable` must be set to `true` to enable cert-manager.
433 # -- The reference to the issuer.
434 # @default -- A self-signed issuer will be created and used if not specified.
436 # group: cert-manager.io
437 # kind: ClusterIssuer
439 # -- The duration of the certificate validity (e.g. `2160h`).
440 # See [cert-manager.io/v1.Certificate](https://cert-manager.io/docs/reference/api-docs/#cert-manager.io/v1.Certificate).
441 # @default -- `2160h` (90 days) will be used if not specified.
443 # -- The duration before the certificate expiration to renew the certificate (e.g. `720h`).
444 # See [cert-manager.io/v1.Certificate](https://cert-manager.io/docs/reference/api-docs/#cert-manager.io/v1.Certificate).
445 # @default -- 1/3 of issued certificate’s lifetime.