26 commits
739e5a2
Add parenthesis to _MigrationPrivateStorage
Didayolo Feb 10, 2026
c74cf1f
version bump
Feb 10, 2026
4558abe
Merge pull request #2174 from codalab/versionbump
ObadaS Feb 10, 2026
4e26c42
fix(`documentation/docs/Organizers/Benchmark_Creation/Competition-Bun…
TheoRudkiewicz Feb 11, 2026
dd0223c
fix(`documentation/docs/Organizers/Benchmark_Creation/Competition-Bun…
Didayolo Feb 11, 2026
f910426
Update compute worker to FAILED when needed
Didayolo Oct 11, 2025
1ffe72c
Catch only real errors, improve formatting of Exception
Didayolo Feb 6, 2026
63fa6fe
Make status code integers, add logs
Didayolo Feb 6, 2026
6e19ba2
For scoring program only
Didayolo Feb 6, 2026
fdb5a6b
rebased branch; added some more try...except and error handling; move…
Feb 16, 2026
716f8e6
Merge pull request #2030 from codalab/worker-status
ObadaS Feb 16, 2026
5919fcc
Make detailed results public when competition is public
Didayolo Feb 5, 2026
8d7e041
Flake8 fix
Didayolo Feb 5, 2026
1ac0db7
Update tests
Didayolo Feb 5, 2026
aff8314
Merge pull request #2155 from codalab/public-detailed-results
ObadaS Feb 16, 2026
b398440
changed uuid naming scheme for ingestion and scoring containers to us…
Feb 16, 2026
be6745e
Add restart policy to compute_worker configuration
cuongphamduc Feb 12, 2026
2149b72
Merge pull request #2189 from codalab/restart-compute-worker
ObadaS Feb 17, 2026
ac4ebe8
Option to rewrite endpoint URL to debug local instance
Didayolo Feb 4, 2026
aec5d29
Update the setup documentation
Didayolo Feb 4, 2026
a4c4730
Merge pull request #2150 from codalab/debug-local-endopint-url
Didayolo Feb 17, 2026
9ab95e8
Rename variable following snake_case convention
Didayolo Feb 17, 2026
65d0028
Merge pull request #2194 from codalab/cw_better_naming
Didayolo Feb 17, 2026
e6108a9
Newsletter 2025 (#2171)
Didayolo Feb 17, 2026
5675d33
Merge pull request #2172 from codalab/fix-migration
Didayolo Feb 17, 2026
1d2ab54
Merge pull request #2191 from codalab/userAdmin_djangoInterface
ObadaS Feb 17, 2026
2 changes: 2 additions & 0 deletions .env_sample
@@ -67,6 +67,8 @@ AWS_STORAGE_PRIVATE_BUCKET_NAME=private
# NOTE! port 9000 here should match $MINIO_PORT
AWS_S3_ENDPOINT_URL=http://minio:9000/
AWS_QUERYSTRING_AUTH=False
# Optional URL rewriting in compute worker, format: FROM|TO
#WORKER_BUNDLE_URL_REWRITE=http://localhost:9000|http://minio:9000


# -----------------------------------------------------------------------------
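For reference, here is a minimal, self-contained sketch of how a rule in this `FROM|TO` format is expected to behave. The `apply_rewrite` helper below is only an illustrative stand-in for the `rewrite_bundle_url_if_needed` function added to `compute_worker.py` further down, and the bundle URL is made up:

```python
import os

# Hypothetical rule matching the FROM|TO format documented in .env_sample above
os.environ["WORKER_BUNDLE_URL_REWRITE"] = "http://localhost:9000|http://minio:9000"


def apply_rewrite(url):
    """Stand-in for rewrite_bundle_url_if_needed: swap the URL prefix when the rule matches."""
    rule = os.getenv("WORKER_BUNDLE_URL_REWRITE", "").strip()
    if not rule or "|" not in rule:
        return url
    src, dst = rule.split("|", 1)
    return dst + url[len(src):] if url.startswith(src) else url


# A presigned URL generated for the host is rewritten to the in-network MinIO address
print(apply_rewrite("http://localhost:9000/private/bundle.zip"))
# -> http://minio:9000/private/bundle.zip
```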
113 changes: 85 additions & 28 deletions compute_worker/compute_worker.py
@@ -208,6 +208,28 @@ class ExecutionTimeLimitExceeded(Exception):
pass


# -----------------------------------------------
# Local setups where public S3 URLs are not directly reachable from workers
# -----------------------------------------------
def rewrite_bundle_url_if_needed(url):
"""
Optionally rewrite presigned bundle URLs for worker networking.

Controlled by env: WORKER_BUNDLE_URL_REWRITE=FROM|TO

Example: http://localhost:9000|http://minio:9000
"""
rule = os.getenv("WORKER_BUNDLE_URL_REWRITE", "").strip()
if not rule or "|" not in rule:
return url
src, dst = rule.split("|", 1)
if url.startswith(src):
new_url = dst + url[len(src):]
logger.info(f"Rewriting bundle URL for worker: {url} -> {new_url}")
return new_url
return url


# -----------------------------------------------------------------------------
# The main compute worker entrypoint, this is how a job is run at the highest
# level.
Expand Down Expand Up @@ -323,10 +345,13 @@ class Run:
"""

def __init__(self, run_args):
self.run_related_name = (
f"uPK-{run_args['user_pk']}_sID-{run_args['id']}"
)
# Directories for the run
self.watch = True
self.completed_program_counter = 0
self.root_dir = tempfile.mkdtemp(dir=BASE_DIR)
self.root_dir = tempfile.mkdtemp(prefix=f'{self.run_related_name}__', dir=BASE_DIR)
self.bundle_dir = os.path.join(self.root_dir, "bundles")
self.input_dir = os.path.join(self.root_dir, "input")
self.output_dir = os.path.join(self.root_dir, "output")
@@ -349,8 +374,8 @@ def __init__(self, run_args):
self.stdout, self.stderr, self.ingestion_stdout, self.ingestion_stderr = (
self._get_stdout_stderr_file_names(run_args)
)
self.ingestion_container_name = uuid.uuid4()
self.program_container_name = uuid.uuid4()
self.ingestion_container_name = f"ingestion_{self.run_related_name}"
self.program_container_name = f"scoring_{self.run_related_name}"
self.program_data = run_args.get("program_data")
self.ingestion_program_data = run_args.get("ingestion_program")
self.input_data = run_args.get("input_data")
@@ -449,9 +474,14 @@ async def send_detailed_results(self, file_path):
)
)
except Exception as e:
logger.error("This error might result in a Execution Time Exceeded error" + e)
logger.error(
f"This error might result in a Execution Time Exceeded error: {e}"
)
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
logger.exception(e)
raise SubmissionException(
"Could not connect to instance to update detailed result"
)

def _get_stdout_stderr_file_names(self, run_args):
# run_args should be the run_args argument passed to __init__ from the run_wrapper.
@@ -615,6 +645,7 @@ def _get_bundle(self, url, destination, cache=True):
if download_needed:
try:
# Download the bundle
url = rewrite_bundle_url_if_needed(url)
urlretrieve(url, bundle_file)
except HTTPError:
raise SubmissionException(
@@ -628,10 +659,10 @@
except BadZipFile:
retries += 1
if retries >= max_retries:
raise # Re-raise the last caught BadZipFile exception
raise SubmissionException("Bad or empty zip file")
else:
logger.warning("Failed. Retrying in 60 seconds...")
time.sleep(60) # Wait 60 seconds before retrying
logger.warning("Failed. Retrying in 20 seconds...")
time.sleep(20) # Wait 20 seconds before retrying
# Return the zip file path for other uses, e.g. for creating a MD5 hash to identify it
return bundle_file

@@ -668,8 +699,7 @@ async def _run_container_engine_cmd(self, container, kind):
)
except Exception as e:
logger.error(
"There was an error trying to connect to the websocket on the codabench instance"
+ e
f"There was an error trying to connect to the websocket on the codabench instance: {e}"
)
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
logger.exception(e)
@@ -718,8 +748,7 @@ async def _run_container_engine_cmd(self, container, kind):
logger.error(e)
except Exception as e:
logger.error(
"There was an error while starting the container and getting the logs"
+ e
f"There was an error while starting the container and getting the logs: {e}"
)
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
logger.exception(e)
@@ -748,7 +777,7 @@ async def _run_container_engine_cmd(self, container, kind):
Exception,
) as e:
logger.error(e)
return_Code = {"StatusCode": e}
return_Code = {"StatusCode": 1}

self.logs[kind] = {
"returncode": return_Code["StatusCode"],
@@ -1024,6 +1053,7 @@ def _put_dir(self, url, directory):

def _put_file(self, url, file=None, raw_data=None, content_type="application/zip"):
"""Send the file in the storage."""
url = rewrite_bundle_url_if_needed(url)
if file and raw_data:
raise Exception("Cannot put both a file and raw_data")

@@ -1066,6 +1096,15 @@ def _prep_cache_dir(self, max_size=MAX_CACHE_DIR_SIZE_GB):
logger.info("Cache directory does not need to be pruned!")

def prepare(self):
hostname = utils.nodenames.gethostname()
if self.is_scoring:
self._update_status(
STATUS_RUNNING, extra_information=f"scoring_hostname-{hostname}"
)
else:
self._update_status(
STATUS_RUNNING, extra_information=f"ingestion_hostname-{hostname}"
)
if not self.is_scoring:
# Only during prediction step do we want to announce "preparing"
self._update_status(STATUS_PREPARING)
@@ -1110,15 +1149,6 @@ def prepare(self):
self._get_container_image(self.container_image)

def start(self):
hostname = utils.nodenames.gethostname()
if self.is_scoring:
self._update_status(
STATUS_RUNNING, extra_information=f"scoring_hostname-{hostname}"
)
else:
self._update_status(
STATUS_RUNNING, extra_information=f"ingestion_hostname-{hostname}"
)
program_dir = os.path.join(self.root_dir, "program")
ingestion_program_dir = os.path.join(self.root_dir, "ingestion_program")

@@ -1129,12 +1159,16 @@ def start(self):
self._run_program_directory(ingestion_program_dir, kind="ingestion"),
self.watch_detailed_results(),
loop=loop,
return_exceptions=True,
)

task_results = [] # will store results/exceptions from gather
signal.signal(signal.SIGALRM, alarm_handler)
signal.alarm(self.execution_time_limit)
try:
loop.run_until_complete(gathered_tasks)
# run tasks
# keep what gather returned so we can detect async errors later
task_results = loop.run_until_complete(gathered_tasks) or []
except ExecutionTimeLimitExceeded:
error_message = f"Execution Time Limit exceeded. Limit was {self.execution_time_limit} seconds"
logger.error(error_message)
@@ -1159,7 +1193,7 @@
logger.error(e)
except Exception as e:
logger.error(
"There was a problem killing " + str(containers_to_kill) + e
f"There was a problem killing {containers_to_kill}: {e}"
)
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
logger.exception(e)
@@ -1175,7 +1209,12 @@
elapsed_time = logs["end"] - logs["start"]
else:
elapsed_time = self.execution_time_limit
return_code = logs["returncode"]
# Normalize the return_code
return_code = (
logs["returncode"]
if logs["returncode"] is None or isinstance(logs["returncode"], int)
else 1
)
if return_code is None:
logger.warning("No return code from Process. Killing it")
if kind == "ingestion":
@@ -1189,7 +1228,7 @@
logger.error(e)
except Exception as e:
logger.error(
"There was a problem killing " + str(containers_to_kill) + e
f"There was a problem killing {containers_to_kill}: {e}"
)
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
logger.exception(e)
@@ -1212,6 +1251,21 @@ def start(self):
signal.alarm(0)

if self.is_scoring:
# Check if scoring program failed
program_results, _, _ = task_results
# Gather returns either normal values or exception instances when return_exceptions=True
had_async_exc = isinstance(
program_results, BaseException
) and not isinstance(program_results, asyncio.CancelledError)
program_rc = getattr(self, "program_exit_code", None)
failed_rc = program_rc not in (0, None)
if had_async_exc or failed_rc:
self._update_status(
STATUS_FAILED,
extra_information=f"program_rc={program_rc}, async={task_results}",
)
# Raise so upstream marks failed immediately
raise SubmissionException("Child task failed or non-zero return code")
self._update_status(STATUS_FINISHED)
else:
self._update_status(STATUS_SCORING)
@@ -1273,9 +1327,12 @@ def push_output(self):
"Error, the output directory already contains a metadata file. This file is used "
"to store exitCode and other data, do not write to this file manually."
)

with open(metadata_path, "w") as f:
f.write(yaml.dump(prog_status, default_flow_style=False))
try:
with open(metadata_path, "w") as f:
f.write(yaml.dump(prog_status, default_flow_style=False))
except Exception as e:
logger.error(e)
raise SubmissionException("Metadata file not found")

if not self.is_scoring:
self._put_dir(self.prediction_result, self.output_dir)
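A small pytest-style sketch of how the new URL rewriting could be exercised in isolation. Assumptions: `compute_worker` is importable from the test environment and pytest's `monkeypatch` fixture is available; the test names and file are hypothetical and not part of this PR:

```python
# test_url_rewrite.py -- illustrative only
import compute_worker


def test_rewrite_applies_when_prefix_matches(monkeypatch):
    # With a FROM|TO rule set, a matching prefix is swapped
    monkeypatch.setenv("WORKER_BUNDLE_URL_REWRITE", "http://localhost:9000|http://minio:9000")
    url = "http://localhost:9000/bundles/submission.zip"
    assert compute_worker.rewrite_bundle_url_if_needed(url) == "http://minio:9000/bundles/submission.zip"


def test_rewrite_is_a_noop_without_a_rule(monkeypatch):
    # Without the env var, the URL is returned unchanged
    monkeypatch.delenv("WORKER_BUNDLE_URL_REWRITE", raising=False)
    url = "http://localhost:9000/bundles/submission.zip"
    assert compute_worker.rewrite_bundle_url_if_needed(url) == url
```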
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -226,6 +226,7 @@ services:
- ${HOST_DIRECTORY:-/tmp/codabench}:/codabench
# Actual connection back to docker parent to run things
- /var/run/docker.sock:/var/run/docker.sock
restart: unless-stopped
env_file: .env
environment:
- BROKER_URL=pyamqp://${RABBITMQ_DEFAULT_USER}:${RABBITMQ_DEFAULT_PASS}@${RABBITMQ_HOST}:${RABBITMQ_PORT}//
@@ -41,7 +41,9 @@ by

```ini
AWS_S3_ENDPOINT_URL=http://docker.for.mac.localhost:9000/
WORKER_BUNDLE_URL_REWRITE=http://docker.for.mac.localhost:9000|http://minio:9000
```

!!! note "If needed, some troubleshooting of this step is provided at [the end of this page](#troubleshooting-storage-endpoint-url) or [in this page](How-to-deploy-Codabench-on-your-server.md#frequently-asked-questions-faqs)"

## Start the service
6 changes: 3 additions & 3 deletions documentation/docs/Newsletters_Archive/CodaLab-in-2024.md
@@ -1,9 +1,9 @@
## CodaLab in 2024
## CodaLab in 2024
### A Year of Breakthroughs and New Horizons with Codabench

Welcome to the first edition of CodaLab’s newsletter! This year has been full of novelty, success, and scientific progress. The platform is breaking records of participation and number of organized competitions, and [Codabench](https://codabench.org/), the new version of [CodaLab](https://codalab.lisn.fr/), had a very promising launch. Let’s dive into more details.

![image_2025_01_30T15_23_35_261Z](../_attachments/991c18dc-3baa-4d69-b157-6a5561e6798d_17528513106457524.jpg)
![image_2025_01_30T15_23_35_261Z](_attachments/991c18dc-3baa-4d69-b157-6a5561e6798d_17528513106457524.jpg)


## Unprecedented engagement
@@ -13,7 +13,7 @@ In October, Codabench has registered its **10,000th user**! From about 100 daily

The contributors' community is very active with **143 pull requests** this year. Since the platform is still relatively new, the primary focus has been on bug fixes, security and performance enhancements, and administrative features, accounting for approximately two-thirds of the pull requests. Nevertheless, we are keen on improving the experience for both participants and organizers. We have set up versioning and a release-notes follow-up to give more visibility to the platform's evolution and maturity.

![pie-chart](../_attachments/2b41574f-9597-47c0-b492-cf427eba7eff_17528513107025487.jpg)
![pie-chart](_attachments/2b41574f-9597-47c0-b492-cf427eba7eff_17528513107025487.jpg)


## Introducing Codabench