
Commit dbbfd8e

Merge branch 'main' into zachg/llmobs_annotate_context_fix
2 parents 73af63c + 3632c6c

File tree

11 files changed, +220 −114 lines changed


.github/workflows/system-tests.yml

Lines changed: 3 additions & 3 deletions
@@ -45,7 +45,7 @@ jobs:
           persist-credentials: false
           repository: 'DataDog/system-tests'
           # Automatically managed, use scripts/update-system-tests-version to update
-          ref: '94529f681dcaf74382ed47c3b0c85acdb775b6c9'
+          ref: '30e17ba7009b84998c0ada3b3a17f39c0037faba'

       - name: Download wheels to binaries directory
         uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
@@ -90,7 +90,7 @@ jobs:
           persist-credentials: false
           repository: 'DataDog/system-tests'
           # Automatically managed, use scripts/update-system-tests-version to update
-          ref: '94529f681dcaf74382ed47c3b0c85acdb775b6c9'
+          ref: '30e17ba7009b84998c0ada3b3a17f39c0037faba'

       - name: Build runner
         uses: ./.github/actions/install_runner
@@ -287,7 +287,7 @@ jobs:
           persist-credentials: false
           repository: 'DataDog/system-tests'
           # Automatically managed, use scripts/update-system-tests-version to update
-          ref: '94529f681dcaf74382ed47c3b0c85acdb775b6c9'
+          ref: '30e17ba7009b84998c0ada3b3a17f39c0037faba'
       - name: Download wheels to binaries directory
         uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
         with:

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ variables:
   DD_VPA_TEMPLATE: "vpa-template-cpu-p70-10percent-2x-oom-min-cap"
   # CI_DEBUG_SERVICES: "true"
   # Automatically managed, use scripts/update-system-tests-version to update
-  SYSTEM_TESTS_REF: "94529f681dcaf74382ed47c3b0c85acdb775b6c9"
+  SYSTEM_TESTS_REF: "30e17ba7009b84998c0ada3b3a17f39c0037faba"

 default:
   interruptible: true

ddtrace/internal/_encoding.pyx

Lines changed: 6 additions & 12 deletions
@@ -1257,8 +1257,7 @@ cdef class Packer(object):
                         default_used = True
                         continue
                     else:
-                        o = "Integer value out of range"
-                        continue
+                        raise OverflowError("Integer value out of range")
             elif PyFloat_CheckExact(o):
                 dval = o
                 ret = msgpack_pack_double(&self.pk, dval)
@@ -1274,14 +1273,12 @@ cdef class Packer(object):
                 if self.encoding == NULL:
                     ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT)
                     if ret == -2:
-                        o = f"Unicode string is too large {L}"
-                        continue
+                        raise ValueError("unicode string is too large")
                 else:
                     o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
                     L = len(o)
                     if L > ITEM_LIMIT:
-                        o = f"Unicode string is too large {L}"
-                        continue
+                        raise ValueError("unicode string is too large")
                     ret = msgpack_pack_raw(&self.pk, L)
                     if ret == 0:
                         rawval = o
@@ -1290,8 +1287,7 @@ cdef class Packer(object):
                 d = <dict>o
                 L = len(d)
                 if L > ITEM_LIMIT:
-                    o = f"Dictionnary is too large {L}"
-                    continue
+                    raise ValueError("dict is too large")
                 ret = msgpack_pack_map(&self.pk, L)
                 if ret == 0:
                     for k, v in d.items():
@@ -1304,8 +1300,7 @@ cdef class Packer(object):
             elif PyList_CheckExact(o):
                 L = Py_SIZE(o)
                 if L > ITEM_LIMIT:
-                    o = f"List is too large {L}"
-                    continue
+                    raise ValueError("list is too large")
                 ret = msgpack_pack_array(&self.pk, L)
                 if ret == 0:
                     for v in o:
@@ -1318,8 +1313,7 @@ cdef class Packer(object):
                 else:
                     ret = msgpack_pack_false(&self.pk)
             else:
-                o = f"Can not serialize [{type(o).__name__}] object"
-                continue
+                PyErr_Format(TypeError, b"can not serialize '%.200s' object", Py_TYPE(o).tp_name)
             return ret

     cpdef pack(self, object obj):
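
With this change the internal msgpack Packer fails loudly: a value that is out of range, oversized, or of an unsupported type now raises immediately. The old pattern (`o = "error message"; continue`) re-ran the pack loop on the message string itself, so the original value was silently replaced by the error text with no signal to the caller. A minimal Python sketch of the new error contract; `check_packable` and the `ITEM_LIMIT` value are illustrative assumptions, not the internal Cython API:

# Sketch only: mirrors the exception types the patched Packer now raises.
ITEM_LIMIT = 2**31 - 1  # assumed stand-in for the module's internal limit

def check_packable(o):
    if o is None or isinstance(o, (bool, float, bytes)):
        return  # always packable (bool is tested before int: bool subclasses int)
    if isinstance(o, int):
        if not -(2**63) <= o <= 2**64 - 1:
            raise OverflowError("Integer value out of range")
    elif isinstance(o, str):
        if len(o) > ITEM_LIMIT:
            raise ValueError("unicode string is too large")
    elif isinstance(o, dict):
        if len(o) > ITEM_LIMIT:
            raise ValueError("dict is too large")
    elif isinstance(o, list):
        if len(o) > ITEM_LIMIT:
            raise ValueError("list is too large")
    else:
        raise TypeError(f"can not serialize '{type(o).__name__}' object")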

ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h

Lines changed: 31 additions & 9 deletions
@@ -182,27 +182,26 @@ class TaskInfo
     PyObject* origin = NULL;
     PyObject* loop = NULL;

-    GenInfo::Ptr coro = nullptr;
-
     StringTable::Key name;
+    bool is_on_cpu = false;
+    GenInfo::Ptr coro = nullptr;

     // Information to reconstruct the async stack as best as we can
     TaskInfo::Ptr waiter = nullptr;
-    bool is_on_cpu = false;

     [[nodiscard]] static Result<TaskInfo::Ptr> create(TaskObj*);
     TaskInfo(PyObject* origin, PyObject* loop, GenInfo::Ptr coro, StringTable::Key name, TaskInfo::Ptr waiter)
       : origin(origin)
       , loop(loop)
-      , coro(std::move(coro))
       , name(name)
+      , is_on_cpu(coro && coro->is_running)
+      , coro(std::move(coro))
       , waiter(std::move(waiter))
-      , is_on_cpu(this->coro && this->coro->is_running)
     {
     }

     [[nodiscard]] static Result<TaskInfo::Ptr> current(PyObject*);
-    inline size_t unwind(FrameStack&);
+    inline size_t unwind(FrameStack&, size_t& upper_python_stack_size);
 };

 inline std::unordered_map<PyObject*, PyObject*> task_link_map;
@@ -344,7 +343,7 @@ inline std::vector<std::unique_ptr<StackInfo>> current_tasks;
 // ----------------------------------------------------------------------------

 inline size_t
-TaskInfo::unwind(FrameStack& stack)
+TaskInfo::unwind(FrameStack& stack, size_t& upper_python_stack_size)
 {
     // TODO: Check for running task.
     std::stack<PyObject*> coro_frames;
@@ -355,14 +354,37 @@ TaskInfo::unwind(FrameStack& stack)
         coro_frames.push(py_coro->frame);
     }

-    int count = 0;
+    // Total number of frames added to the Stack
+    size_t count = 0;

     // Unwind the coro frames
     while (!coro_frames.empty()) {
        PyObject* frame = coro_frames.top();
        coro_frames.pop();

-        count += unwind_frame(frame, stack);
+        auto new_frames = unwind_frame(frame, stack);
+
+        // If we failed to unwind the Frame, stop unwinding the coroutine chain; otherwise we could
+        // end up with Stacks with missing Frames between two coroutines Frames.
+        if (new_frames == 0) {
+            break;
+        }
+
+        // If this is the first Frame being unwound (we have not added any Frames to the Stack yet),
+        // use the number of Frames added to the Stack to determine the size of the upper Python stack.
+        if (count == 0) {
+            // The first Frame is the coroutine Frame, so the Python stack size is the number of Frames - 1
+            upper_python_stack_size = new_frames - 1;

+            // Remove the Python Frames from the Stack (they will be added back later)
+            // We cannot push those Frames now because otherwise they would be added once per Task,
+            // we only want to add them once per Leaf Task, and on top of all non-leaf Tasks.
+            for (size_t i = 0; i < upper_python_stack_size; i++) {
+                stack.pop_back();
+            }
+        }
+
+        count += new_frames;
     }

     return count;
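
The rewritten unwind now stops at the first frame that fails to unwind (avoiding stacks with holes between coroutine frames) and, per the added comments, derives the size of the shared synchronous prefix from the first frame it unwinds. To make the bookkeeping concrete, here is a hypothetical asyncio program (all names illustrative) whose stacks have the shape this code manipulates:

# Hypothetical program, for illustration only.
import asyncio

async def inner():
    await asyncio.sleep(1)   # innermost coroutine frame

async def outer():
    await inner()            # coroutine chain: outer -> inner

def main():
    # "Upper" Python stack shared by every task on this thread:
    # main -> asyncio.run -> event-loop internals (e.g. _run_once)
    asyncio.run(outer())

main()

For the on-CPU task, unwinding the outermost running coroutine frame also walks the live thread stack linked above it, so `new_frames` counts the coroutine frame plus `main`, `asyncio.run`, and the loop internals. `upper_python_stack_size = new_frames - 1` is therefore exactly that shared prefix, which is popped here and appended once per leaf task in `ThreadInfo::unwind_tasks` (next file).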

ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h

Lines changed: 47 additions & 20 deletions
@@ -264,31 +264,45 @@ ThreadInfo::unwind_tasks()
         }
     }

+    // Make sure the on CPU task is first
+    for (size_t i = 0; i < leaf_tasks.size(); i++) {
+        if (leaf_tasks[i].get().is_on_cpu) {
+            if (i > 0) {
+                std::swap(leaf_tasks[i], leaf_tasks[0]);
+            }
+            break;
+        }
+    }
+
+    // The size of the "pure Python" stack (before asyncio Frames), computed later by TaskInfo::unwind
+    size_t upper_python_stack_size = 0;
+    // Unused variable, will be used later by TaskInfo::unwind
+    size_t unused;
+
+    bool on_cpu_task_seen = false;
     for (auto& leaf_task : leaf_tasks) {
+        on_cpu_task_seen = on_cpu_task_seen || leaf_task.get().is_on_cpu;
+
         auto stack_info = std::make_unique<StackInfo>(leaf_task.get().name, leaf_task.get().is_on_cpu);
         auto& stack = stack_info->stack;
+
         for (auto current_task = leaf_task;;) {
             auto& task = current_task.get();

-            size_t stack_size = task.unwind(stack);
-
+            // The task_stack_size includes both the coroutines frames and the "upper" Python synchronous frames
+            size_t task_stack_size = task.unwind(stack, task.is_on_cpu ? upper_python_stack_size : unused);
             if (task.is_on_cpu) {
-                // Undo the stack unwinding
-                // TODO[perf]: not super-efficient :(
-                for (size_t i = 0; i < stack_size; i++)
-                    stack.pop_back();
-
-                // Instead we get part of the thread stack
-                FrameStack temp_stack;
-                size_t nframes = (python_stack.size() > stack_size) ? python_stack.size() - stack_size : 0;
-                for (size_t i = 0; i < nframes; i++) {
-                    auto python_frame = python_stack.front();
-                    temp_stack.push_front(python_frame);
-                    python_stack.pop_front();
-                }
-                while (!temp_stack.empty()) {
-                    stack.push_front(temp_stack.front());
-                    temp_stack.pop_front();
+                // Get the "bottom" part of the Python synchronous Stack, that is to say the
+                // synchronous functions and coroutines called by the Task's outermost coroutine
+                // The number of Frames to push is the total number of Frames in the Python stack, from which we
+                // subtract the number of Frames in the "upper Python stack" (asyncio machinery + sync entrypoint)
+                // This gives us [outermost coroutine, ... , innermost coroutine, outermost sync function, ... ,
+                // innermost sync function]
+                size_t frames_to_push =
+                    (python_stack.size() > task_stack_size) ? python_stack.size() - task_stack_size : 0;
+                for (size_t i = 0; i < frames_to_push; i++) {
+                    const auto& python_frame = python_stack[frames_to_push - i - 1];
+                    stack.push_front(python_frame);
                 }
             }

@@ -317,8 +331,21 @@ ThreadInfo::unwind_tasks()
         }

         // Finish off with the remaining thread stack
-        for (auto p = python_stack.begin(); p != python_stack.end(); p++)
-            stack.push_back(*p);
+        // If we have seen an on-CPU Task, then upper_python_stack_size will be set and will include the sync entry
+        // point and the asyncio machinery Frames. Otherwise, we are in `select` (idle) and we should push all the
+        // Frames.
+
+        // There could be a race condition where relevant partial Python Thread Stack ends up being different from the
+        // one we saw in TaskInfo::unwind. This is extremely unlikely, I believe, but failing to account for it would
+        // cause an underflow, so let's be conservative.
+        size_t start_index = 0;
+        if (on_cpu_task_seen && python_stack.size() >= upper_python_stack_size) {
+            start_index = python_stack.size() - upper_python_stack_size;
+        }
+        for (size_t i = start_index; i < python_stack.size(); i++) {
+            const auto& python_frame = python_stack[i];
+            stack.push_back(python_frame);
+        }

         current_tasks.push_back(std::move(stack_info));
     }
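
Swapping the on-CPU task to the front of `leaf_tasks` guarantees that `upper_python_stack_size` is computed on the first pass through the loop, so every leaf task (including idle ones processed afterwards) can append the shared tail without duplicating frames. Continuing the hypothetical program from the previous file, the reconstructed per-leaf-task stacks would come out roughly like this (frame names are assumptions; innermost frame first):

# Assumed illustration of the reconstructed stacks, innermost frame first.
on_cpu_task_stack = [
    "inner",        # coroutine frames from TaskInfo::unwind
    "outer",
    "_run_once",    # shared upper Python stack (asyncio machinery + sync
    "asyncio.run",  # entrypoint), appended once at the end of the loop body
    "main",
]
idle_task_stack = [
    "other_coro",   # the idle task's own coroutine chain
    "_run_once",    # same shared tail appended after its frames
    "asyncio.run",
    "main",
]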

ddtrace/testing/internal/telemetry.py

Lines changed: 4 additions & 2 deletions
@@ -177,7 +177,6 @@ def record_event_payload(
         payload_size: int,
         request_seconds: float,
         events_count: int,
-        serialization_seconds: float,
         error: t.Optional[ErrorType],
     ) -> None:
         tags = {"endpoint": endpoint}
@@ -186,11 +185,14 @@
         self.add_count_metric("endpoint_payload.requests", 1, tags)
         self.add_distribution_metric("endpoint_payload.requests_ms", request_seconds * 1000, tags)
         self.add_distribution_metric("endpoint_payload.events_count", events_count, tags)
-        self.add_distribution_metric("endpoint_payload.events_serialization_ms", serialization_seconds * 1000, tags)

         if error:
             self.record_event_payload_error(endpoint, error)

+    def record_event_payload_serialization_seconds(self, endpoint: str, serialization_seconds: float) -> None:
+        tags = {"endpoint": endpoint}
+        self.add_distribution_metric("endpoint_payload.events_serialization_ms", serialization_seconds * 1000, tags)
+
     def record_event_payload_error(self, endpoint: str, error: ErrorType) -> None:
         # `endpoint_payload.requests_errors` accepts a different set of error types, so we need to convert them here.
         if error == ErrorType.TIMEOUT:
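
The serialization-timing distribution is split out of `record_event_payload` into its own method, so it can be emitted when the payload is serialized rather than only when the request finishes. A sketch of how the two calls might now be sequenced; `telemetry`, `encoder`, `events`, and the endpoint string are assumed names, not calls confirmed by this diff:

# Hypothetical call sites for the split-out metric.
import time

start = time.monotonic()
payload = encoder.encode(events)  # serialization happens here
telemetry.record_event_payload_serialization_seconds(
    "api/v2/citestcycle", time.monotonic() - start
)

# ...later, once the HTTP request has completed:
telemetry.record_event_payload(
    endpoint="api/v2/citestcycle",
    payload_size=len(payload),
    request_seconds=0.042,
    events_count=len(events),
    error=None,
)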
