DataDog · VianneyRuhlmann · Jun 10, 2025 · May 26, 2025 · May 27, 2025 · May 27, 2025
diff --git a/data-pipeline-ffi/src/error.rs b/data-pipeline-ffi/src/error.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use data_pipeline::trace_exporter::error::{
-    AgentErrorKind, BuilderErrorKind, NetworkErrorKind, TraceExporterError,
+    AgentErrorKind, BuilderErrorKind, InternalErrorKind, NetworkErrorKind, TraceExporterError,
 };
 use std::ffi::{c_char, CString};
 use std::fmt::Display;
@@ -32,6 +32,7 @@ pub enum ExporterErrorCode {
     NetworkUnknown,
     Serde,
     TimedOut,
+    Internal,
 }
 
 impl Display for ExporterErrorCode {
@@ -57,6 +58,7 @@ impl Display for ExporterErrorCode {
             Self::NetworkUnknown => write!(f, "Unknown network error"),
             Self::Serde => write!(f, "Serialization/Deserialization error"),
             Self::TimedOut => write!(f, "Operation timed out"),
+            Self::Internal => write!(f, "Internal error"),
         }
     }
 }
@@ -89,6 +91,9 @@ impl From<TraceExporterError> for ExporterError {
                 BuilderErrorKind::InvalidTelemetryConfig => ExporterErrorCode::InvalidArgument,
                 BuilderErrorKind::InvalidConfiguration(_) => ExporterErrorCode::InvalidArgument,
             },
+            TraceExporterError::Internal(e) => match e {
+                InternalErrorKind::InvalidWorkerState(_) => ExporterErrorCode::Internal,
+            },
             TraceExporterError::Deserialization(_) => ExporterErrorCode::Serde,
             TraceExporterError::Io(e) => match e.kind() {
                 IoErrorKind::InvalidData => ExporterErrorCode::InvalidData,

diff --git a/data-pipeline/src/agent_info/fetcher.rs b/data-pipeline/src/agent_info/fetcher.rs
@@ -6,8 +6,7 @@
 use super::{schema::AgentInfo, AgentInfoArc};
 use anyhow::{anyhow, Result};
 use arc_swap::ArcSwapOption;
-use ddcommon::hyper_migration;
-use ddcommon::Endpoint;
+use ddcommon::{hyper_migration, worker::Worker, Endpoint};
 use http_body_util::BodyExt;
 use hyper::{self, body::Buf, header::HeaderName};
 use std::sync::Arc;
@@ -96,12 +95,13 @@ pub async fn fetch_info(info_endpoint: &Endpoint) -> Result<Box<AgentInfo>> {
 /// # Example
 /// ```no_run
 /// # use anyhow::Result;
+/// # use ddcommon::worker::Worker;
 /// # #[tokio::main]
 /// # async fn main() -> Result<()> {
 /// // Define the endpoint
 /// let endpoint = ddcommon::Endpoint::from_url("http://localhost:8126/info".parse().unwrap());
 /// // Create the fetcher
-/// let fetcher = data_pipeline::agent_info::AgentInfoFetcher::new(
+/// let mut fetcher = data_pipeline::agent_info::AgentInfoFetcher::new(
 ///     endpoint,
 ///     std::time::Duration::from_secs(5 * 60),
 /// );
@@ -122,6 +122,7 @@ pub async fn fetch_info(info_endpoint: &Endpoint) -> Result<Box<AgentInfo>> {
 /// # Ok(())
 /// # }
 /// ```
+#[derive(Debug)]
 pub struct AgentInfoFetcher {
     info_endpoint: Endpoint,
     info: AgentInfoArc,
@@ -139,11 +140,20 @@ impl AgentInfoFetcher {
         }
     }
 
+    /// Return an AgentInfoArc storing the info received from the agent.
+    ///
+    /// When the fetcher is running it updates the AgentInfoArc when the agent's info changes.
+    pub fn get_info(&self) -> AgentInfoArc {
+        self.info.clone()
+    }
+}
+
+impl Worker for AgentInfoFetcher {
     /// Start fetching the info endpoint with the given interval.
     ///
     /// # Warning
     /// This method does not return and should be called within a dedicated task.
-    pub async fn run(&self) {
+    async fn run(&mut self) {
         loop {
             let current_info = self.info.load();
             let current_hash = current_info.as_ref().map(|info| info.state_hash.as_str());
@@ -163,13 +173,6 @@ impl AgentInfoFetcher {
             sleep(self.refresh_interval).await;
         }
     }
-
-    /// Return an AgentInfoArc storing the info received by the agent.
-    ///
-    /// When the fetcher is running it updates the AgentInfoArc when the agent's info changes.
-    pub fn get_info(&self) -> AgentInfoArc {
-        self.info.clone()
-    }
 }
 
 #[cfg(test)]
@@ -328,7 +331,7 @@ mod tests {
             })
             .await;
         let endpoint = Endpoint::from_url(server.url("/info").parse().unwrap());
-        let fetcher = AgentInfoFetcher::new(endpoint.clone(), Duration::from_millis(100));
+        let mut fetcher = AgentInfoFetcher::new(endpoint.clone(), Duration::from_millis(100));
         let info = fetcher.get_info();
         assert!(info.load().is_none());
         tokio::spawn(async move {

diff --git a/data-pipeline/src/lib.rs b/data-pipeline/src/lib.rs
@@ -12,6 +12,7 @@
 
 pub mod agent_info;
 mod health_metrics;
+mod pausable_worker;
 #[allow(missing_docs)]
 pub mod span_concentrator;
 #[allow(missing_docs)]

diff --git a/data-pipeline/src/pausable_worker.rs b/data-pipeline/src/pausable_worker.rs
@@ -0,0 +1,172 @@
+// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
+// SPDX-License-Identifier: Apache-2.0
+
+//! Defines a pausable worker to be able to stop background processes before forks
+
+use ddcommon::worker::Worker;
+use std::fmt::Display;
+use tokio::{
+    runtime::Runtime,
+    select,
+    task::{JoinError, JoinHandle},
+};
+use tokio_util::sync::CancellationToken;
+
+/// A pausable worker which can be paused and restarted on forks.
+///
+/// Used to allow a [`ddcommon::worker::Worker`] to be paused while saving its state when dropping
+/// a tokio runtime to be able to restart with the same state on a new runtime. This is used to
+/// stop all threads before a fork to avoid deadlocks in the child process.
+///
+/// # Time-to-pause
+/// This loop should yield regularly to reduce time-to-pause. See [`tokio::task::yield_now`].
+///
+/// # Cancellation safety
+/// The main loop can be interrupted at any yield point (`.await`ed call). The state of the worker
+/// at this point will be saved and used to restart the worker. To be able to safely restart, the
+/// worker must be in a valid state on every call to `.await`.
+/// See [`tokio::select#cancellation-safety`] for more details.
+#[derive(Debug)]
+pub enum PausableWorker<T: Worker + Send + Sync + 'static> {
+    Running {
+        handle: JoinHandle<T>,
+        stop_token: CancellationToken,
+    },
+    Paused {
+        worker: T,
+    },
+    InvalidState,
+}
+
+#[derive(Debug)]
+pub enum PausableWorkerError {
+    InvalidState,
+    TaskAborted,
+}
+
+impl Display for PausableWorkerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            PausableWorkerError::InvalidState => {
+                write!(f, "Worker is in an invalid state and must be recreated.")
+            }
+            PausableWorkerError::TaskAborted => {
+                write!(f, "Worker task has been aborted and state has been lost.")
+            }
+        }
+    }
+}
+
+impl core::error::Error for PausableWorkerError {}
+
+impl<T: Worker + Send + Sync + 'static> PausableWorker<T> {
+    /// Create a new pausable worker from the given worker.
+    pub fn new(worker: T) -> Self {
+        Self::Paused { worker }
+    }
+
+    /// Start the worker on the given runtime.
+    ///
+    /// The worker's main loop will be run on the runtime.
+    ///
+    /// # Errors
+    /// Fails if the worker is in an invalid state.
+    pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> {
+        if let Self::Running { .. } = self {
+            Ok(())
+        } else if let Self::Paused { mut worker } = std::mem::replace(self, Self::InvalidState) {
+            // The worker is temporarily in an invalid state; this block has no fallible step, so
+            // it is always replaced by a valid state below.
+            let stop_token = CancellationToken::new();
+            let cloned_token = stop_token.clone();
+            let handle = rt.spawn(async move {
+                select! {
+                    _ = worker.run() => {worker}
+                    _ = cloned_token.cancelled() => {worker}
+                }
+            });
+
+            *self = PausableWorker::Running { handle, stop_token };
+            Ok(())
+        } else {
+            Err(PausableWorkerError::InvalidState)
+        }
+    }
+
+    /// Pause the worker, saving its state so that it can be restarted.
+    ///
+    /// # Errors
+    /// Fails if the worker is in an invalid state or if joining its task fails, losing the worker.
+    pub async fn pause(&mut self) -> Result<(), PausableWorkerError> {
+        match self {
+            PausableWorker::Running { handle, stop_token } => {
+                stop_token.cancel();
+                if let Ok(worker) = handle.await {
+                    *self = PausableWorker::Paused { worker };
+                    Ok(())
+                } else {
+                    // The task has been aborted and the worker can't be retrieved.
+                    *self = PausableWorker::InvalidState;
+                    Err(PausableWorkerError::TaskAborted)
+                }
+            }
+            PausableWorker::Paused { .. } => Ok(()),
+            PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState),
+        }
+    }
+
+    /// Wait for the run method of the worker to exit.
+    pub async fn join(self) -> Result<(), JoinError> {
+        if let PausableWorker::Running { handle, .. } = self {
+            handle.await?;
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tokio::{runtime::Builder, time::sleep};
+
+    use super::*;
+    use std::{
+        sync::mpsc::{channel, Sender},
+        time::Duration,
+    };
+
+    /// Test worker incrementing the state and sending it with the sender.
+    struct TestWorker {
+        state: u32,
+        sender: Sender<u32>,
+    }
+
+    impl Worker for TestWorker {
+        async fn run(&mut self) {
+            loop {
+                let _ = self.sender.send(self.state);
+                self.state += 1;
+                sleep(Duration::from_millis(100)).await;
+            }
+        }
+    }
+
+    #[test]
+    fn test_restart() {
+        let (sender, receiver) = channel::<u32>();
+        let worker = TestWorker { state: 0, sender };
+        let runtime = Builder::new_multi_thread().enable_time().build().unwrap();
+        let mut pausable_worker = PausableWorker::new(worker);
+
+        pausable_worker.start(&runtime).unwrap();
+
+        assert_eq!(receiver.recv().unwrap(), 0);
+        runtime.block_on(async { pausable_worker.pause().await.unwrap() });
+        // Empty the message queue and get the last message
+        let mut next_message = 1;
+        for message in receiver.try_iter() {
+            next_message = message + 1;
+        }
+        pausable_worker.start(&runtime).unwrap();
+        assert_eq!(receiver.recv().unwrap(), next_message);
+    }
+}
diff --git a/data-pipeline/src/stats_exporter.rs b/data-pipeline/src/stats_exporter.rs
@@ -14,7 +14,7 @@ use std::{
 use crate::{span_concentrator::SpanConcentrator, trace_exporter::TracerMetadata};
 use datadog_trace_protobuf::pb;
 use datadog_trace_utils::send_with_retry::{send_with_retry, RetryStrategy};
-use ddcommon::Endpoint;
+use ddcommon::{worker::Worker, Endpoint};
 use hyper;
 use tokio::select;
 use tokio_util::sync::CancellationToken;
@@ -127,13 +127,15 @@ impl StatsExporter {
                 .flush(time::SystemTime::now(), force_flush),
         )
     }
+}
 
+impl Worker for StatsExporter {
     /// Run loop of the stats exporter
     ///
     /// Once started, the stats exporter will flush and send stats on every `self.flush_interval`.
     /// If the `self.cancellation_token` is cancelled, the exporter will force flush all stats and
     /// return.
-    pub async fn run(&mut self) {
+    async fn run(&mut self) {
         loop {
             select! {
                 _ = self.cancellation_token.cancelled() => {