Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions livekit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,4 @@ anyhow = "1.0.99"
test-log = "0.2.18"
test-case = "3.3"
serial_test = "3.0"
http = "1.1"
93 changes: 93 additions & 0 deletions livekit/src/rtc_engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,14 @@ impl EngineInner {
match try_connect().await {
Ok(res) => return Ok(res),
Err(e) => {
// A validated auth failure (401/403) will not succeed on
// retry with the same token — surface it immediately instead
// of burning the remaining join attempts. Same classification
// as the reconnect loop (see `auth_failure_reason`).
if auth_failure_reason(&e).is_some() {
log::warn!("authentication rejected during connect ({e}); not retrying");
return Err(e);
}
let attempt_i = i + 1;
if i < max_retries {
log::warn!(
Expand Down Expand Up @@ -943,6 +951,16 @@ impl EngineInner {
"server requested disconnect during restart".into(),
));
}
if let Some(reason) = auth_failure_reason(&err) {
log::warn!(
"authentication rejected during restart ({err}); not retrying"
);
self.running_handle.write().can_reconnect = false;
self.close(reason).await;
return Err(EngineError::Connection(
"authentication failed during reconnect".into(),
));
}
log::error!("restarting connection failed: {}", err);
}
}
Expand Down Expand Up @@ -971,6 +989,16 @@ impl EngineInner {
"server requested disconnect during resume".into(),
));
}
if let Some(reason) = auth_failure_reason(&err) {
log::warn!(
"authentication rejected during resume ({err}); not retrying"
);
self.running_handle.write().can_reconnect = false;
self.close(reason).await;
return Err(EngineError::Connection(
"authentication failed during reconnect".into(),
));
}
log::error!("resuming connection failed: {}", err);
let mut running_handle = self.running_handle.write();
running_handle.full_reconnect = true;
Expand Down Expand Up @@ -1153,6 +1181,28 @@ fn leave_disconnect_reason(err: &EngineError) -> Option<DisconnectReason> {
None
}

/// Inspect a reconnect-attempt error for a genuine authentication/authorization
/// failure (HTTP 401/403). Such a failure will not succeed on retry with the
/// same token, so the reconnect loop should bail out immediately rather than
/// burning every attempt (and hammering the server) with credentials it already
/// knows are rejected.
///
/// We key on `SignalError::Client(401|403)`, which is produced by the server's
/// `rtc/validate` probe (see [`super`]'s `SignalInner::validate`) — an
/// authoritative classification. We deliberately do NOT key on the raw
/// `WsError::Http` upgrade status, because that can be a fabricated 401 masking a
/// transient server error (e.g. a 503 from a saturated node), which IS
/// retryable. A resume that hits a raw 401 simply escalates to a full reconnect,
/// whose connect path runs `validate()` and surfaces the authoritative status.
fn auth_failure_reason(err: &EngineError) -> Option<DisconnectReason> {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion(non-blocking): Alternatively could be defined as a method on the enum:

impl EngineError {
    fn auth_failure_reason(&self) -> Option<DisconnectReason> {
        // ...
    }
}

if let EngineError::Signal(SignalError::Client(status, _)) = err {
if matches!(status.as_u16(), 401 | 403) {
return Some(DisconnectReason::JoinFailure);
}
}
None
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -1200,4 +1250,47 @@ mod tests {
);
}
}

#[test]
fn auth_failure_reason_flags_validated_401_and_403() {
// The server's rtc/validate probe surfaces auth failures as Client(4xx).
for status in [401u16, 403] {
let err = EngineError::Signal(SignalError::Client(
http::StatusCode::from_u16(status).unwrap(),
"invalid token".into(),
));
assert_eq!(
auth_failure_reason(&err),
Some(DisconnectReason::JoinFailure),
"Client({status}) must be treated as a non-retryable auth failure"
);
}
}

fn auth_failure_reason_ignores_other_client_and_server_errors() {
let not_auth = [
// Other client errors are not auth failures.
EngineError::Signal(SignalError::Client(http::StatusCode::NOT_FOUND, "".into())),
EngineError::Signal(SignalError::Client(
http::StatusCode::TOO_MANY_REQUESTS,
"".into(),
)),
// Server errors (e.g. a saturated node) are retryable.
EngineError::Signal(SignalError::Server(
http::StatusCode::SERVICE_UNAVAILABLE,
"".into(),
)),
// Generic connectivity/internal errors are retryable.
EngineError::Connection("network".into()),
EngineError::Internal("bug".into()),
EngineError::Signal(SignalError::SendError),
EngineError::Signal(SignalError::Timeout("waiting".into())),
];
for err in &not_auth {
assert!(
auth_failure_reason(err).is_none(),
"{err:?} must NOT be treated as an auth failure"
);
}
}
}
16 changes: 8 additions & 8 deletions livekit/src/rtc_engine/reconnect_strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,10 @@ mod tests {
// Monotonic non-decreasing and never above the cap.
let mut prev = Duration::ZERO;
for attempt in 1..=RECONNECT_ATTEMPTS {
let nominal = nominal(attempt);
assert!(nominal >= prev, "backoff must not decrease (attempt {attempt})");
assert!(nominal <= RECONNECT_MAX_DELAY, "backoff must not exceed the cap");
prev = nominal;
let nominal_duration = nominal(attempt);
assert!(nominal_duration >= prev, "backoff must not decrease (attempt {attempt})");
assert!(nominal_duration <= RECONNECT_MAX_DELAY, "backoff must not exceed the cap");
prev = nominal_duration;
}

// Late attempts are pinned to the cap, and large attempt indices don't
Expand All @@ -86,12 +86,12 @@ mod tests {
fn backoff_delay_stays_within_nominal_jitter_window() {
// Full jitter: every sample must land within [0, nominal(attempt)].
for attempt in 1..=RECONNECT_ATTEMPTS {
let nominal = nominal(attempt);
let nominal_duration = nominal(attempt);
for _ in 0..1000 {
let delay = delay(attempt);
let delay_duration = delay(attempt);
assert!(
delay <= nominal,
"jittered delay {delay:?} exceeded nominal {nominal:?} (attempt {attempt})"
delay_duration <= nominal_duration,
"jittered delay {delay_duration:?} exceeded nominal {nominal_duration:?} (attempt {attempt})"
);
}
}
Expand Down