diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 9d228de9b5908..655545c5d2c9e 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -1693,6 +1693,18 @@ ], "sqlState" : "42702" }, + "EXECUTOR_KUBERNETES_SERVICE_COOL_DOWN_PERIOD_INVALID" : { + "message" : [ + "The executor Kubernetes service cool down period of <period> seconds configured via <key> must not be negative." + ], + "sqlState" : "42000" + }, + "EXECUTOR_KUBERNETES_SERVICE_REQUIRES_BLOCK_MANAGER_PORT" : { + "message" : [ + "Enabling the executor Kubernetes service requires <blockManagerPortConfigKey> to be set to a positive number, for instance <defaultShuffleServicePort>." + ], + "sqlState" : "42000" + }, "EXEC_IMMEDIATE_DUPLICATE_ARGUMENT_ALIASES" : { "message" : [ "The USING clause of this EXECUTE IMMEDIATE command contained multiple arguments with same alias (<aliases>), which is invalid; please update the command to specify unique aliases and then try it again." diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index 5c26dea417acc..60f8b2ea5222b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -466,6 +466,35 @@ private[spark] object Config extends Logging { .toSequence .createWithDefault(Nil) + val KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD_KEY = + "spark.kubernetes.executor.service.coolDownPeriod" + val KUBERNETES_EXECUTOR_SERVICE_ENABLED = + ConfigBuilder("spark.kubernetes.executor.service.enabled") + .doc("If true, a Kubernetes service is created for the executor. " + + "An executor is usually connected to via the pod IP. 
Connecting to a decommissioned " + "executor fails after a 'connection timeout', which is set via NETWORK_TIMEOUT and " + "defaults to 2 minutes. Connecting to the executor via a Kubernetes service instantly " + "fails with 'connection refused' error. " + "For this to work, the executor kubernetes service outlives its executor pod by at least " + KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD_KEY + " seconds. " + "This kubernetes service provides access to the executor's " + "block manager, so BLOCK_MANAGER_PORT has to be given a value greater than zero.") + .version("4.2.0") + .booleanConf + .createWithDefault(false) + + val KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD = + ConfigBuilder(KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD_KEY) + .doc(s"The number of seconds the executor kubernetes service enabled via " + KUBERNETES_EXECUTOR_SERVICE_ENABLED.key + " lives beyond the lifetime of the " + "corresponding executor pod. The service has to live longer than the executor, " + "because connecting to a non-existing kubernetes service fails after a 'connection " + "timeout', which defeats its very purpose. " + s"See ${KUBERNETES_EXECUTOR_SERVICE_ENABLED.key} for more information.") + .version("4.2.0") + .intConf + .createWithDefault(300) + val KUBERNETES_EXECUTOR_DECOMMISSION_LABEL = ConfigBuilder("spark.kubernetes.executor.decommissionLabel") .doc("Label to apply to a pod which is being decommissioned." 
+ diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index d9b3c3df945a3..63c75e35a4edc 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -37,6 +37,9 @@ object Constants { val SPARK_POD_DRIVER_ROLE = "driver" val SPARK_POD_EXECUTOR_ROLE = "executor" val SPARK_EXECUTOR_INACTIVE_LABEL = "spark-exec-inactive" + val SPARK_EXECUTOR_SERVICE_STATE_LABEL = "spark-exec-service-state" + val SPARK_EXECUTOR_SERVICE_ALIVE_STATE = "alive" + val SPARK_EXECUTOR_SERVICE_COOLDOWN_STATE = "cooldown" // Credentials secrets val DRIVER_CREDENTIALS_SECRETS_BASE_DIR = @@ -109,6 +112,11 @@ object Constants { val CONNECT_GRPC_BINDING_PORT = "spark.connect.grpc.binding.port" val EXIT_EXCEPTION_ANNOTATION = "spark.exit-exception" val POD_DELETION_COST = "controller.kubernetes.io/pod-deletion-cost" + val OWNER_REFERENCE_ANNOTATION = "spark.owner-reference" + val OWNER_REFERENCE_ANNOTATION_DRIVER_VALUE = "driver" + val OWNER_REFERENCE_ANNOTATION_EXECUTOR_VALUE = "executor" + val COOLDOWN_PERIOD_ANNOTATION = "spark.cooldown-period" + val COOLDOWN_DEADLINE_ANNOTATION = "spark.cooldown-deadline" // Hadoop Configuration val HADOOP_CONF_VOLUME = "hadoop-properties" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStep.scala new file mode 100644 index 0000000000000..e779a70bc6966 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStep.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + 
* contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.features + +import scala.jdk.CollectionConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder, ServiceBuilder} + +import org.apache.spark.SparkException +import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, SparkPod} +import org.apache.spark.deploy.k8s.Config.KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD +import org.apache.spark.deploy.k8s.Constants.{COOLDOWN_PERIOD_ANNOTATION, OWNER_REFERENCE_ANNOTATION, OWNER_REFERENCE_ANNOTATION_DRIVER_VALUE, OWNER_REFERENCE_ANNOTATION_EXECUTOR_VALUE, SPARK_APP_ID_LABEL, SPARK_EXECUTOR_ID_LABEL, SPARK_EXECUTOR_SERVICE_ALIVE_STATE, SPARK_EXECUTOR_SERVICE_STATE_LABEL} +import org.apache.spark.internal.config.{BLOCK_MANAGER_PORT, SHUFFLE_SERVICE_PORT} + +class ExecutorServiceFeatureStep(conf: KubernetesExecutorConf) extends KubernetesFeatureConfigStep { + private val service_selector_labels = Set(SPARK_APP_ID_LABEL, SPARK_EXECUTOR_ID_LABEL) + private lazy val selector = conf.labels + .filter { case (key, _) => service_selector_labels.contains(key) } + private lazy val labels = selector ++ + Map(SPARK_EXECUTOR_SERVICE_STATE_LABEL -> SPARK_EXECUTOR_SERVICE_ALIVE_STATE) + + private lazy val sparkAppSelector = 
getLabel(SPARK_APP_ID_LABEL) + private lazy val sparkExecId = getLabel(SPARK_EXECUTOR_ID_LABEL) + // name length is 8 + 38 + 6 + 10 = 62 + // which fits in KUBERNETES_DNS_LABEL_NAME_MAX_LENGTH = 63 + private lazy val serviceName = s"svc-$sparkAppSelector-exec-$sparkExecId" + + // The service lives for this number of seconds + private val coolDownPeriod = conf.sparkConf.get(KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD) + SparkException.require(coolDownPeriod >= 0, + "EXECUTOR_KUBERNETES_SERVICE_COOL_DOWN_PERIOD_INVALID", + Map( + "period" -> coolDownPeriod.toString, + "key" -> KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD.key)); + + // The executor kubernetes service requires BLOCK_MANAGER_PORT to be set + private val blockManagerPortName = "spark-block-manager" + private val blockManagerPort = conf.sparkConf.get(BLOCK_MANAGER_PORT) + SparkException.require(blockManagerPort > 0, + "EXECUTOR_KUBERNETES_SERVICE_REQUIRES_BLOCK_MANAGER_PORT", + Map( + "blockManagerPortConfigKey" -> BLOCK_MANAGER_PORT.key, + "defaultShuffleServicePort" -> SHUFFLE_SERVICE_PORT.defaultValue.get.toString)); + + private def getLabel(label: String): String = { + val value = conf.labels.get(label) + value.getOrElse( + throw new SparkException(s"This feature step requires label $label") + ) + } + + override def configurePod(pod: SparkPod): SparkPod = { + SparkPod( + new PodBuilder(pod.pod) + .editSpec() + // otherwise, executor pods get 8 environment variables for each other executor service + // with some thousands executor pods you would see ARG_MAX limit issues in entrypoint.sh + .withEnableServiceLinks(false) + .endSpec() + .build(), + // tell the executor entry point its Kubernetes service name + new ContainerBuilder(pod.container) + .addNewEnv() + .withName("EXECUTOR_SERVICE_NAME") + .withValue(serviceName) + .endEnv() + .build()) + } + + override def getAdditionalKubernetesResources(): Seq[HasMetadata] = { + val owner = if (coolDownPeriod > 0) { + OWNER_REFERENCE_ANNOTATION_DRIVER_VALUE + 
} else { + OWNER_REFERENCE_ANNOTATION_EXECUTOR_VALUE + } + + val annotation = Map( + OWNER_REFERENCE_ANNOTATION -> owner, + COOLDOWN_PERIOD_ANNOTATION -> coolDownPeriod.toString + ) + val service = new ServiceBuilder() + .withNewMetadata() + .withName(serviceName) + .withLabels(labels.asJava) + .withAnnotations(annotation.asJava) + .endMetadata() + .withNewSpec() + .withSelector(selector.asJava) + .addNewPort() + .withName(blockManagerPortName) + .withPort(blockManagerPort) + .withNewTargetPort(blockManagerPort) + .endPort() + .endSpec() + .build() + + Seq(service) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 126d07c0926f4..628ba46ee9097 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -16,16 +16,18 @@ */ package org.apache.spark.scheduler.cluster.k8s -import java.time.Instant +import java.time.{Instant, ZoneOffset} import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable import scala.jdk.CollectionConverters._ +import scala.util.Try import scala.util.control.NonFatal -import io.fabric8.kubernetes.api.model.{HasMetadata, PersistentVolumeClaim, Pod, PodBuilder} +import io.fabric8.kubernetes.api.model.{HasMetadata, PersistentVolumeClaim, Pod, PodBuilder, ServiceBuilder} import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException} +import io.fabric8.kubernetes.client.dsl.base.{PatchContext, PatchType} import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.k8s.Config._ @@ -231,6 +233,74 @@ class 
ExecutorPodsAllocator( if (snapshots.nonEmpty) { val existingExecs = lastSnapshot.executorPods.keySet _deletedExecutorIds = _deletedExecutorIds.intersect(existingExecs) + + // schedule all services of not-alive executors that have a cooldown period for deletion + val aliveExecs = existingExecs ++ newlyCreatedExecutors.keySet.diff(k8sKnownExecIds.toSet) + Utils.tryLogNonFatalError { + val start = clock.getTimeMillis() + kubernetesClient + .services() + .inNamespace(namespace) + .withLabel(SPARK_APP_ID_LABEL, applicationId) + .withLabel(SPARK_EXECUTOR_SERVICE_STATE_LABEL, SPARK_EXECUTOR_SERVICE_ALIVE_STATE) + .withLabelNotIn(SPARK_EXECUTOR_ID_LABEL, aliveExecs.toSeq.sorted.map(_.toString): _*) + .resources().forEach { service => + val svc = service.get() + val cooldownString = + svc.getMetadata.getAnnotations.get(COOLDOWN_PERIOD_ANNOTATION) + if (cooldownString != null && cooldownString.toIntOption.isDefined) { + val cooldown = cooldownString.toInt + val deadline = + Instant.ofEpochMilli(currentTime + cooldown * 1000).atZone(ZoneOffset.UTC) + logInfo(s"Executor got deleted, removal of " + + s"service ${svc.getMetadata.getName} scheduled in ${cooldown}s") + Utils.tryLogNonFatalError { + service.patch( + PatchContext.of(PatchType.STRATEGIC_MERGE), + new ServiceBuilder() + .withNewMetadata() + .addToLabels( + SPARK_EXECUTOR_SERVICE_STATE_LABEL, + SPARK_EXECUTOR_SERVICE_COOLDOWN_STATE + ) + .addToAnnotations(COOLDOWN_DEADLINE_ANNOTATION, deadline.toString) + .endMetadata() + .build() + ) + } + } + } + val end = clock.getTimeMillis() + logInfo(s"Processed all services with alive state label in ${end - start}ms") + } + } + + // delete services that passed their cooldown deadline + Utils.tryLogNonFatalError { + val start = clock.getTimeMillis() + kubernetesClient + .services() + .inNamespace(namespace) + .withLabel(SPARK_APP_ID_LABEL, applicationId) + .withLabel(SPARK_EXECUTOR_SERVICE_STATE_LABEL, SPARK_EXECUTOR_SERVICE_COOLDOWN_STATE) + .resources().forEach { service => + 
val svc = service.get + Option(svc.getMetadata.getAnnotations.get(COOLDOWN_DEADLINE_ANNOTATION)) + .flatMap(s => Try(Instant.parse(s)).toOption) + .filter(_.toEpochMilli <= currentTime) + .foreach { deadline => + logInfo(s"Service deadline $deadline has passed current time $currentTime, " + + s"deleting service ${svc.getMetadata.getName}") + try { + service.delete() + } catch { + case NonFatal(e) => + logWarning(s"Failed to delete service $service", e) + } + } + } + val end = clock.getTimeMillis() + logInfo(s"Processed all services with cooldown state label in ${end - start}ms") } val notDeletedPods = lastSnapshot.executorPods @@ -459,10 +529,23 @@ class ExecutorPodsAllocator( .build() val resources = replacePVCsIfNeeded( podWithAttachedContainer, resolvedExecutorSpec.executorKubernetesResources, reusablePVCs) + val refAnnotation = OWNER_REFERENCE_ANNOTATION + val driverValue = OWNER_REFERENCE_ANNOTATION_DRIVER_VALUE + val executorValue = OWNER_REFERENCE_ANNOTATION_EXECUTOR_VALUE + val getOwnerReference = (r: HasMetadata) => + r.getMetadata.getAnnotations.getOrDefault(refAnnotation, executorValue) + val (driverResources, executorResources) = + resources + .filter(r => Set(driverValue, executorValue).contains(getOwnerReference(r))) + .partition(r => getOwnerReference(r) == driverValue) val createdExecutorPod = kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create() try { - addOwnerReference(createdExecutorPod, resources) + addOwnerReference(createdExecutorPod, executorResources) + if (driverResources.nonEmpty && driverPod.nonEmpty) { + addOwnerReference(driverPod.get, driverResources) + } + kubernetesClient.resourceList(resources: _*).forceConflicts().serverSideApply() resources .filter(_.getKind == "PersistentVolumeClaim") .foreach { resource => @@ -484,6 +567,7 @@ class ExecutorPodsAllocator( .inNamespace(namespace) .resource(createdExecutorPod) .delete() + kubernetesClient.resourceList(resources: _*).delete() throw e } } @@ 
-542,6 +626,16 @@ class ExecutorPodsAllocator( .withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE) .delete() } + + // delete all services with cooldown periods + Utils.tryLogNonFatalError { + kubernetesClient + .services() + .inNamespace(namespace) + .withLabel(SPARK_APP_ID_LABEL, applicationId) + .withLabel(SPARK_EXECUTOR_SERVICE_STATE_LABEL) + .delete() + } } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilder.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilder.scala index 2253c07df116e..9f8c781b1f815 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilder.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilder.scala @@ -65,6 +65,12 @@ private[spark] class KubernetesExecutorBuilder { } } + val optionalFeatures = Seq( + Some(conf.get(Config.KUBERNETES_EXECUTOR_SERVICE_ENABLED)) + .filter(enabled => enabled) + .map(_ => new ExecutorServiceFeatureStep(conf)) + ).flatten + val allFeatures = Seq( new BasicExecutorFeatureStep(conf, secMgr, resourceProfile), new ExecutorKubernetesCredentialsFeatureStep(conf), @@ -72,7 +78,7 @@ private[spark] class KubernetesExecutorBuilder { new EnvSecretsFeatureStep(conf), new MountVolumesFeatureStep(conf), new HadoopConfExecutorFeatureStep(conf), - new LocalDirsFeatureStep(conf)) ++ userFeatures + new LocalDirsFeatureStep(conf)) ++ optionalFeatures ++ userFeatures val features = allFeatures.filterNot(f => conf.get(Config.KUBERNETES_EXECUTOR_POD_EXCLUDED_FEATURE_STEPS).contains(f.getClass.getName)) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStepSuite.scala 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStepSuite.scala new file mode 100644 index 0000000000000..60181df9106fa --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/ExecutorServiceFeatureStepSuite.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.features + +import scala.jdk.CollectionConverters.{CollectionHasAsScala, MapHasAsScala} + +import io.fabric8.kubernetes.api.model.{HasMetadata, Service} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkConf, SparkFunSuite, SparkIllegalArgumentException} +import org.apache.spark.deploy.k8s.{KubernetesTestConf, SparkPod} +import org.apache.spark.deploy.k8s.Config.KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD +import org.apache.spark.internal.config.BLOCK_MANAGER_PORT + + +class ExecutorServiceFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { + + private var baseConf: SparkConf = _ + + before { + baseConf = new SparkConf(false) + baseConf.set(BLOCK_MANAGER_PORT, 1234) + } + + test("no block manager port") { + baseConf.remove(BLOCK_MANAGER_PORT) + intercept[SparkIllegalArgumentException] { + evaluateStep() + } + } + + test("default configuration") { + val (sparkPod, resources) = evaluateStep() + assertSparkPod(sparkPod) + assertResources(resources, expectedOwner = "driver", expectedCooldownPeriod = 300) + } + + test("custom configuration") { + baseConf.set(KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD, 123) + val (sparkPod, resources) = evaluateStep() + assertSparkPod(sparkPod) + assertResources(resources, expectedOwner = "driver", expectedCooldownPeriod = 123) + } + + test("no cooldown period") { + baseConf.set(KUBERNETES_EXECUTOR_SERVICE_COOL_DOWN_PERIOD, 0) + val (sparkPod, resources) = evaluateStep() + assertSparkPod(sparkPod) + assertResources(resources, expectedOwner = "executor", expectedCooldownPeriod = 0) + } + + private def assertSparkPod(sparkPod: SparkPod): Unit = { + assert(sparkPod.pod.getSpec.getEnableServiceLinks === false) + val env = sparkPod.container.getEnv.asScala.map(v => v.getName -> v.getValue).toMap + assert(env.get("EXECUTOR_SERVICE_NAME") === Some("svc-appId-exec-1")) + } + + private def assertResources( + resources: Seq[HasMetadata], + expectedOwner: String, + 
expectedCooldownPeriod: Int): Unit = { + assert(resources.size === 1) + assert(resources.head.getKind === "Service") + assertService(resources.head.asInstanceOf[Service], expectedOwner, expectedCooldownPeriod) + } + + private def assertService( + service: Service, + expectedOwner: String, + expectedCooldownPeriod: Int): Unit = { + assert(service.getKind === "Service") + assert(service.getMetadata.getName === "svc-appId-exec-1") + assert(service.getMetadata.getAnnotations.asScala === Map( + "spark.owner-reference" -> expectedOwner, + "spark.cooldown-period" -> expectedCooldownPeriod.toString + )) + assert(service.getSpec.getSelector.asScala === + Map("spark-exec-id" -> "1", "spark-app-selector" -> "appId")) + assert(service.getSpec.getPorts.size() === 1) + val port = service.getSpec.getPorts.get(0) + assert(port.getName === "spark-block-manager") + assert(port.getPort === 1234) + assert(port.getTargetPort.getIntVal === 1234) + } + + private def evaluateStep(): (SparkPod, Seq[HasMetadata]) = { + val executorConf = KubernetesTestConf.createExecutorConf( + sparkConf = baseConf) + val step = new ExecutorServiceFeatureStep(executorConf) + (step.configurePod(SparkPod.initialPod()), step.getAdditionalKubernetesResources()) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index e994ccbed9a65..c82b527b99a7a 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -16,23 +16,26 @@ */ package org.apache.spark.scheduler.cluster.k8s -import java.time.Instant +import java.time.{Instant, ZoneOffset} import java.time.temporal.ChronoUnit.MILLIS import 
java.util.concurrent.atomic.AtomicInteger +import java.util.stream.Stream import scala.collection.mutable import scala.jdk.CollectionConverters._ import io.fabric8.kubernetes.api.model._ import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException} -import io.fabric8.kubernetes.client.dsl.PodResource -import org.mockito.{Mock, MockitoAnnotations} +import io.fabric8.kubernetes.client.dsl.{Deletable, MixedOperation, NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable, PodResource, ServerSideApplicable, ServiceResource} +import io.fabric8.kubernetes.client.dsl.base.{PatchContext, PatchType} +import org.mockito.{ArgumentCaptor, Mock, Mockito, MockitoAnnotations} import org.mockito.ArgumentMatchers.{any, anyString, eq => meq} -import org.mockito.Mockito.{never, times, verify, when} +import org.mockito.Mockito.{never, spy, times, verify, when} import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer import org.scalatest.BeforeAndAfter import org.scalatest.PrivateMethodTester._ +import org.scalatestplus.mockito.MockitoSugar.mock import org.apache.spark.{SecurityManager, SparkConf, SparkException, SparkFunSuite} import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, KubernetesExecutorSpec} @@ -46,6 +49,8 @@ import org.apache.spark.util.ManualClock class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { + private def doReturn(value: Any) = org.mockito.Mockito.doReturn(value, Seq.empty: _*) + private val driverPodName = "driver" private val driverPod = new PodBuilder() @@ -112,6 +117,11 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { @Mock private var schedulerBackend: KubernetesClusterSchedulerBackend = _ + @Mock + private var resourceList: RESOURCE_LIST = _ + + private var createdResourcesArgumentCaptor: ArgumentCaptor[Array[HasMetadata]] = _ + private var snapshotsStore: DeterministicExecutorPodsSnapshotsStore = _ private var podsAllocatorUnderTest: 
ExecutorPodsAllocator = _ @@ -121,6 +131,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { val appId = "testapp" before { + createdResourcesArgumentCaptor = ArgumentCaptor.forClass(classOf[Array[HasMetadata]]) MockitoAnnotations.openMocks(this).close() when(kubernetesClient.pods()).thenReturn(podOperations) when(podOperations.inNamespace("default")).thenReturn(podsWithNamespace) @@ -144,12 +155,21 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, waitForExecutorPodsClock) when(schedulerBackend.getExecutorIds()).thenReturn(Seq.empty) podsAllocatorUnderTest.start(TEST_SPARK_APP_ID, schedulerBackend) + val apl = mock[NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable[HasMetadata]] + val ssa = mock[ServerSideApplicable[java.util.List[HasMetadata]]] + when(apl.forceConflicts()).thenReturn(ssa) + when(kubernetesClient.resourceList()).thenReturn(apl) + when(kubernetesClient.resourceList(any[HasMetadata]())).thenReturn(apl) when(kubernetesClient.persistentVolumeClaims()).thenReturn(persistentVolumeClaims) when(persistentVolumeClaims.inNamespace("default")).thenReturn(pvcWithNamespace) when(pvcWithNamespace.withLabel(any(), any())).thenReturn(labeledPersistentVolumeClaims) when(pvcWithNamespace.resource(any())).thenReturn(pvcResource) when(labeledPersistentVolumeClaims.list()).thenReturn(persistentVolumeClaimList) when(persistentVolumeClaimList.getItems).thenReturn(Seq.empty[PersistentVolumeClaim].asJava) + when(resourceList.forceConflicts()).thenReturn(resourceList) + doReturn(resourceList) + .when(kubernetesClient) + .resourceList(createdResourcesArgumentCaptor.capture(): _*) } test("SPARK-49447: Prevent small values less than 100 for batch delay") { @@ -775,6 +795,346 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { assert(!podsAllocatorUnderTest.isDeleted("7")) } + test("SPARK-55585: executor feature steps 
can create resources") { + val service = new ServiceBuilder() + .withNewMetadata() + .withName("servicename") + .endMetadata() + .build() + + when(executorBuilder.buildFromFeatures(any(classOf[KubernetesExecutorConf]), meq(secMgr), + // have the feature step define a kubernetes service (resource) + meq(kubernetesClient), any(classOf[ResourceProfile]))) + .thenAnswer((invocation: InvocationOnMock) => { + val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) + KubernetesExecutorSpec( + executorPodWithId(k8sConf.executorId.toInt, k8sConf.resourceProfileId), + Seq(service)) + }) + + val startTime = Instant.now.toEpochMilli + waitForExecutorPodsClock.setTime(startTime) + + // Scale up to one executor + podsAllocatorUnderTest.setTotalExpectedExecutors( + Map(defaultProfile -> 1)) + assert(podsAllocatorUnderTest.invokePrivate(numOutstandingPods).get() == 1) + verify(podsWithNamespace).resource(podWithAttachedContainerForId(1)) + + // service is considered for creation + // resources should have been created + verify(kubernetesClient, times(1)).resourceList(meq(service)) + verify(resourceList, times(1)).serverSideApply() + } + + test("SPARK-55587: executor feature steps resources ownership") { + val executorMetadata = mock[ObjectMeta] + when(executorMetadata.getName).thenReturn("executor-name") + when(executorMetadata.getUid).thenReturn("executor-uid") + + val executorPod = mock[Pod] + when(podResource.create()).thenReturn(executorPod) + when(executorPod.getMetadata).thenReturn(executorMetadata) + when(executorPod.getApiVersion).thenReturn("executor-version") + when(executorPod.getKind).thenReturn("executor-kind") + + val service1 = new ServiceBuilder() + .withNewMetadata() + .withName("service1") + .endMetadata() + .build() + val service2 = new ServiceBuilder() + .withNewMetadata() + .withName("service2") + .withAnnotations( + Map(OWNER_REFERENCE_ANNOTATION -> OWNER_REFERENCE_ANNOTATION_EXECUTOR_VALUE).asJava + ) + .endMetadata() + .build() + val service3 = new 
ServiceBuilder() + .withNewMetadata() + .withName("service3") + .withAnnotations( + Map(OWNER_REFERENCE_ANNOTATION -> OWNER_REFERENCE_ANNOTATION_DRIVER_VALUE).asJava + ) + .endMetadata() + .build() + val service4 = new ServiceBuilder() + .withNewMetadata() + .withName("service4") + .withAnnotations( + Map(OWNER_REFERENCE_ANNOTATION -> "none").asJava + ) + .endMetadata() + .build() + + when(executorBuilder.buildFromFeatures(any(classOf[KubernetesExecutorConf]), meq(secMgr), + // have the feature step define a kubernetes service (resource) + meq(kubernetesClient), any(classOf[ResourceProfile]))) + .thenAnswer((invocation: InvocationOnMock) => { + val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) + KubernetesExecutorSpec( + executorPodWithId(k8sConf.executorId.toInt, k8sConf.resourceProfileId), + Seq(service1, service2, service3, service4)) + }) + + assert(service1.getMetadata.getOwnerReferences.isEmpty) + assert(service2.getMetadata.getOwnerReferences.isEmpty) + assert(service3.getMetadata.getOwnerReferences.isEmpty) + assert(service4.getMetadata.getOwnerReferences.isEmpty) + + val startTime = Instant.now.toEpochMilli + waitForExecutorPodsClock.setTime(startTime) + + // Scale up to one executor + podsAllocatorUnderTest.setTotalExpectedExecutors( + Map(defaultProfile -> 1)) + assert(podsAllocatorUnderTest.invokePrivate(numOutstandingPods).get() == 1) + verify(podsWithNamespace).resource(podWithAttachedContainerForId(1)) + verify(podResource).create() + + // ownership references of services updated + // executor owns service1 (default) + assert(service1.getMetadata.getOwnerReferences.size() === 1) + assert(service1.getMetadata.getOwnerReferences.get(0).getName === "executor-name") + // executor owns service2 (through annotation) + assert(service2.getMetadata.getOwnerReferences.size() === 1) + assert(service2.getMetadata.getOwnerReferences.get(0).getName === "executor-name") + // driver owns service3 (through annotation) + 
assert(service3.getMetadata.getOwnerReferences.size() === 1) + assert(service3.getMetadata.getOwnerReferences.get(0).getName === "driver") + // nothing owns service 4 + assert(service4.getMetadata.getOwnerReferences.isEmpty) + } + + test("SPARK-55587: executor feature steps resources deleted on failure") { + val service = new ServiceBuilder() + .withNewMetadata() + .withName("service") + .withAnnotations( + Map(OWNER_REFERENCE_ANNOTATION -> "none").asJava + ) + .endMetadata() + .build() + + when(executorBuilder.buildFromFeatures(any(classOf[KubernetesExecutorConf]), meq(secMgr), + // have the feature step define a kubernetes service (resource) + meq(kubernetesClient), any(classOf[ResourceProfile]))) + .thenAnswer((invocation: InvocationOnMock) => { + val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) + KubernetesExecutorSpec( + executorPodWithId(k8sConf.executorId.toInt, k8sConf.resourceProfileId), + Seq(service)) + }) + + // force an exception on resourceList.serverSideApply + when(resourceList.serverSideApply()).thenAnswer( + _ => throw new RuntimeException("test exception") + ) + + val startTime = Instant.now.toEpochMilli + waitForExecutorPodsClock.setTime(startTime) + + // Scale up to one executor, this should fail + intercept[RuntimeException] { + podsAllocatorUnderTest.setTotalExpectedExecutors( + Map(defaultProfile -> 1)) + } + verify(podsWithNamespace).resource(podWithAttachedContainerForId(1)) + + // resources should have been deleted on failure + verify(resourceList, times(1)).delete() + } + + test("SPARK-52505: executor feature steps service cooldown period") { + val executorMetadata = mock[ObjectMeta] + when(executorMetadata.getName).thenReturn("executor-name") + when(executorMetadata.getUid).thenReturn("executor-uid") + + val executorPod = mock[Pod] + when(podResource.create()).thenReturn(executorPod) + when(executorPod.getMetadata).thenReturn(executorMetadata) + when(executorPod.getApiVersion).thenReturn("executor-version") + 
when(executorPod.getKind).thenReturn("executor-kind") + + val appId = TEST_SPARK_APP_ID + val appIdLabel = SPARK_APP_ID_LABEL + val execIdLabel = SPARK_EXECUTOR_ID_LABEL + val stateLabel = SPARK_EXECUTOR_SERVICE_STATE_LABEL + val aliveState = SPARK_EXECUTOR_SERVICE_ALIVE_STATE + val cooldownState = SPARK_EXECUTOR_SERVICE_COOLDOWN_STATE + + val service = new ServiceBuilder() + .withNewMetadata() + .withName("service") + .withLabels( + Map( + appIdLabel -> appId, + execIdLabel -> "1", + stateLabel -> aliveState + ).asJava + ) + .withAnnotations( + Map(COOLDOWN_PERIOD_ANNOTATION -> "2").asJava + ) + .endMetadata() + .build() + val serviceMock = spy(service) + + val serviceResource = mock[ServiceResource[Service]] + when(serviceResource.get()).thenReturn(serviceMock) + val serviceList = mock[MixedOperation[Service, ServiceList, ServiceResource[Service]]] + when(serviceList.resources()).thenAnswer(_ => Stream.of(serviceResource)) + val emptyServiceList = mock[MixedOperation[Service, ServiceList, ServiceResource[Service]]] + when(emptyServiceList.resources()).thenAnswer(_ => Stream.empty[ServiceResource[Service]]) + + when(serviceList.inNamespace("default")).thenReturn(serviceList) + when(serviceList.withLabel(appIdLabel, appId)).thenReturn(serviceList) + when(serviceList.withLabel(stateLabel)).thenReturn(serviceList) + when(serviceList.withLabel(stateLabel, aliveState)).thenReturn(emptyServiceList) + when(serviceList.withLabel(stateLabel, cooldownState)).thenReturn(emptyServiceList) + val argumentCaptor = ArgumentCaptor.forClass(classOf[Array[String]]) + when(serviceList.withLabelNotIn(meq(execIdLabel), argumentCaptor.capture(): _*)) + .thenReturn(emptyServiceList) + + when(emptyServiceList.withLabel(stateLabel)).thenReturn(emptyServiceList) + when(emptyServiceList.withLabel(meq(stateLabel), anyString())).thenReturn(emptyServiceList) + when(emptyServiceList.withLabelNotIn(meq(execIdLabel), argumentCaptor.capture(): _*)) + .thenReturn(emptyServiceList) + + 
when(kubernetesClient.services()).thenReturn(serviceList) + + when(executorBuilder.buildFromFeatures(any(classOf[KubernetesExecutorConf]), meq(secMgr), + // have the feature step define a kubernetes service (resource) + meq(kubernetesClient), any(classOf[ResourceProfile]))) + .thenAnswer((invocation: InvocationOnMock) => { + val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) + KubernetesExecutorSpec( + executorPodWithId(k8sConf.executorId.toInt, k8sConf.resourceProfileId), + Seq(service)) + }) + + val startTime = Instant.now.toEpochMilli + waitForExecutorPodsClock.setTime(startTime) + + // Scale up to one executor + podsAllocatorUnderTest.setTotalExpectedExecutors( + Map(defaultProfile -> 1)) + assert(podsAllocatorUnderTest.invokePrivate(numOutstandingPods).get() == 1) + verify(podsWithNamespace).resource(podWithAttachedContainerForId(1)) + verify(podResource).create() + verify(resourceList, times(1)).serverSideApply() + + // service created and alive + when(serviceList.withLabel(stateLabel, aliveState)).thenReturn(serviceList) + when(serviceList.withLabelNotIn(meq(execIdLabel), argumentCaptor.capture(): _*)) + .thenAnswer(a => { + if (a.getArguments.slice(1, Int.MaxValue).contains("1")) { + emptyServiceList + } else { + serviceList + } + }) + + // make pods allocator see an empty snapshot + waitForExecutorPodsClock.setTime(startTime + 10*1000) + snapshotsStore.removeDeletedExecutors() + snapshotsStore.notifySubscribers() + verify(serviceResource, never).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + + // the executor is coming up + waitForExecutorPodsClock.setTime(startTime + 20*1000) + snapshotsStore.updatePod(pendingExecutor(1)) + snapshotsStore.notifySubscribers() + verify(serviceResource, never).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + + // ... 
and running + waitForExecutorPodsClock.setTime(startTime + 30*1000) + snapshotsStore.updatePod(runningExecutor(1)) + snapshotsStore.notifySubscribers() + verify(serviceResource, never).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + + // the executor gets unscheduled + waitForExecutorPodsClock.setTime(startTime + 40*1000) + podsAllocatorUnderTest.setTotalExpectedExecutors( + Map(defaultProfile -> 0)) + snapshotsStore.notifySubscribers() + verify(serviceResource, never).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + + // the executor pod is reported as deleted in the snapshot, which does not trigger anything yet + waitForExecutorPodsClock.setTime(startTime + 50*1000) + snapshotsStore.updatePod(deletedExecutor(1)) + snapshotsStore.notifySubscribers() + verify(serviceResource, never).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + + // the deleted executor pod is removed from the snapshot, which schedules the service for deletion + waitForExecutorPodsClock.setTime(startTime + 60*1000) + snapshotsStore.removeDeletedExecutors() + snapshotsStore.notifySubscribers() + verify(serviceResource, times(1)).get() + + val contextCapture = ArgumentCaptor.forClass(classOf[PatchContext]) + val serviceCapture = ArgumentCaptor.forClass(classOf[Service]) + verify(serviceResource, times(1)).patch(contextCapture.capture(), serviceCapture.capture()) + assert(contextCapture.getValue.getPatchType === PatchType.STRATEGIC_MERGE) + val metadata = serviceCapture.getValue.getMetadata + val expectedCooldownLabels = Map(stateLabel -> cooldownState) + assert(metadata.getLabels.asScala === expectedCooldownLabels) + val expectedDeadline = Instant.ofEpochMilli(waitForExecutorPodsClock.getTimeMillis() + 2000) + .atZone(ZoneOffset.UTC) + val expectedCooldownAnnotation = Map(COOLDOWN_DEADLINE_ANNOTATION -> expectedDeadline.toString) + assert(metadata.getAnnotations.asScala === expectedCooldownAnnotation) + + verify(serviceResource, 
never).delete() + Mockito.clearInvocations(serviceResource) + + // apply patch to service + val labels = metadata.getLabels.asScala ++ expectedCooldownLabels + val annotations = metadata.getAnnotations.asScala ++ expectedCooldownAnnotation + service.getMetadata.setLabels(labels.asJava) + service.getMetadata.setAnnotations(annotations.asJava) + // service in cooldown state + when(serviceList.withLabel(stateLabel, aliveState)).thenReturn(emptyServiceList) + when(serviceList.withLabel(stateLabel, cooldownState)).thenReturn(serviceList) + + // one second passes by, cooldown period is two seconds + waitForExecutorPodsClock.setTime(startTime + 61*1000) + snapshotsStore.notifySubscribers() + verify(serviceResource, times(1)).get() + verify(serviceResource, never).patch() + verify(serviceResource, never).delete() + Mockito.clearInvocations(serviceResource) + + // two seconds passed by, service is being deleted + waitForExecutorPodsClock.setTime(startTime + 62*1000) + snapshotsStore.notifySubscribers() + verify(serviceResource, times(1)).get() + verify(serviceResource, never).patch() + verify(serviceResource, times(1)).delete() + } + + test("SPARK-52505: stopping deletes services with state label") { + val serviceResource = mock[ServiceResource[Service]] + val serviceList = mock[MixedOperation[Service, ServiceList, ServiceResource[Service]]] + when(serviceList.resources()).thenAnswer(_ => Stream.of(serviceResource)) + when(serviceList.inNamespace("default")).thenReturn(serviceList) + when(serviceList.withLabel(SPARK_APP_ID_LABEL, TEST_SPARK_APP_ID)).thenReturn(serviceList) + when(serviceList.withLabel(SPARK_EXECUTOR_SERVICE_STATE_LABEL)).thenReturn(serviceList) + when(kubernetesClient.services()).thenReturn(serviceList) + podsAllocatorUnderTest.stop(TEST_SPARK_APP_ID) + verify[Deletable](serviceList, times(1)).delete() + } + test("SPARK-33262: pod allocator does not stall with pending pods") { when(podsWithNamespace .withLabel(SPARK_APP_ID_LABEL, TEST_SPARK_APP_ID)) diff 
--git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala index 5f0f04da9196b..496790cf39674 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala @@ -16,12 +16,16 @@ */ package org.apache.spark.scheduler.cluster.k8s +import scala.jdk.CollectionConverters.IterableHasAsScala + +import io.fabric8.kubernetes.api.model.Service import io.fabric8.kubernetes.client.KubernetesClient +import org.mockito.Mockito.mock -import org.apache.spark.{SecurityManager, SparkConf} +import org.apache.spark.{SecurityManager, SparkConf, SparkIllegalArgumentException} import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.features.KubernetesExecutorCustomFeatureConfigStep -import org.apache.spark.internal.config.ConfigEntry +import org.apache.spark.internal.config.{BLOCK_MANAGER_PORT, ConfigEntry} import org.apache.spark.resource.ResourceProfile class KubernetesExecutorBuilderSuite extends PodBuilderSuite { @@ -65,6 +69,57 @@ class KubernetesExecutorBuilderSuite extends PodBuilderSuite { val defaultProfile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) new KubernetesExecutorBuilder().buildFromFeatures(conf, secMgr, client, defaultProfile).pod } + + test("SPARK-52505: check executor kubernetes spec with service disabled by default") { + val sparkConf = baseConf + val conf = KubernetesTestConf.createExecutorConf(sparkConf = sparkConf) + val secMgr = new SecurityManager(sparkConf) + val client = mock(classOf[KubernetesClient]) + val profile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + val spec = new 
KubernetesExecutorBuilder().buildFromFeatures(conf, secMgr, client, profile) + + val containerEnvs = spec.pod.container.getEnv.asScala + assert(!containerEnvs.exists(_.getName === "EXECUTOR_SERVICE_NAME")) + + assert(spec.executorKubernetesResources.size === 0) + } + + test("SPARK-52505: check executor kubernetes spec with service enabled") { + val sparkConf = baseConf.clone + .set(Config.KUBERNETES_EXECUTOR_SERVICE_ENABLED, true) + .set(BLOCK_MANAGER_PORT, 1234) + val conf = KubernetesTestConf.createExecutorConf(sparkConf = sparkConf) + val secMgr = new SecurityManager(sparkConf) + val client = mock(classOf[KubernetesClient]) + val profile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + val spec = new KubernetesExecutorBuilder().buildFromFeatures(conf, secMgr, client, profile) + + val containerEnvs = spec.pod.container.getEnv.asScala + assert(containerEnvs.exists(_.getName === "EXECUTOR_SERVICE_NAME")) + val containerEnv = containerEnvs.filter(_.getName === "EXECUTOR_SERVICE_NAME").head + assert(containerEnv.getValue === "svc-appId-exec-1") + + assert(spec.executorKubernetesResources.size === 1) + val resource = spec.executorKubernetesResources.head + assert(resource.getKind === "Service") + val service = resource.asInstanceOf[Service] + assert(service.getMetadata.getName === "svc-appId-exec-1") + assert(service.getSpec.getPorts.size() === 1) + val port = service.getSpec.getPorts.get(0) + assert(port.getName === "spark-block-manager") + assert(port.getPort === 1234) + } + + test("SPARK-52505: check executor kubernetes service requires block manager port") { + val sparkConf = baseConf.clone.set(Config.KUBERNETES_EXECUTOR_SERVICE_ENABLED, true) + val conf = KubernetesTestConf.createExecutorConf(sparkConf = sparkConf) + val secMgr = new SecurityManager(sparkConf) + val client = mock(classOf[KubernetesClient]) + val profile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + assertThrows[SparkIllegalArgumentException] { + new 
KubernetesExecutorBuilder().buildFromFeatures(conf, secMgr, client, profile) + } + } } /** diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index f9561b9aa4ed5..a782f3bcb7d14 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -102,7 +102,8 @@ case "$1" in --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP + ${EXECUTOR_SERVICE_NAME:+--bind-address $SPARK_EXECUTOR_POD_IP} + --hostname ${EXECUTOR_SERVICE_NAME:-$SPARK_EXECUTOR_POD_IP} --resourceProfileId $SPARK_RESOURCE_PROFILE_ID --podName $SPARK_EXECUTOR_POD_NAME )