From 355f63b2c498057f3a9cb61a14f480c2ef7f822e Mon Sep 17 00:00:00 2001 From: Ritav Das Date: Sat, 13 Jun 2026 18:54:57 +0530 Subject: [PATCH] Fix Voice-Activated mode not scrolling with low-gain mics The Voice-Activated (silence-paused) listening mode only advances the teleprompter while SpeechRecognizer.isSpeaking is true. That flag used a single hard-coded absolute threshold (average level > 0.08) over recent audio samples. On common microphones (built-in MacBook mic, AirPods at a normal distance) speech rarely averages that high, so isSpeaking stayed false and the text never scrolled even while the user was clearly speaking. Classic and Word Tracking modes don't depend on isSpeaking, which is why only this mode was affected. Replace the fixed threshold with adaptive voice-activity detection: - Continuously track a noise floor that rises slowly and falls fast, so steady ambient noise is absorbed (no false 'always speaking') while the floor stays low for quiet mics. - Trigger speech on a gain-relative threshold (max(0.025, floor * 1.8)), so detection works across very different mic gains. - Add a 0.25s hangover so natural pauses between words don't stall scrolling. - Reset VAD state on each start(with:). 
Fixes #49 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Textream/Textream/SpeechRecognizer.swift | 55 +++++++++++++++++++----- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index d1151c1..eab3d14 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -80,13 +80,10 @@ class SpeechRecognizer { var shouldDismiss: Bool = false var shouldAdvancePage: Bool = false - /// True when recent audio levels indicate the user is actively speaking - var isSpeaking: Bool { - let recent = audioLevels.suffix(10) - guard !recent.isEmpty else { return false } - let avg = recent.reduce(0, +) / CGFloat(recent.count) - return avg > 0.08 - } + /// True when recent audio energy indicates the user is actively speaking. + /// Updated from the audio tap using an adaptive noise floor so detection + /// works across microphones with different gain levels. + var isSpeaking: Bool = false private var speechRecognizer: SFSpeechRecognizer? private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? @@ -107,6 +104,11 @@ class SpeechRecognizer { /// We require 2-of-3 recent results to agree before committing a forward jump. private var recentMatchPositions: [Int] = [] + // Adaptive voice-activity detection (drives Voice-Activated / silence-paused scrolling) + private var noiseFloor: CGFloat = 0.02 + private var lastSpeechAt: Date = .distantPast + private let speechHangover: TimeInterval = 0.25 + /// Update the source text while preserving the current recognized char count. /// Used by Director Mode to live-edit unread text without resetting read progress. 
func updateText(_ text: String, preservingCharCount: Int) { @@ -143,6 +145,9 @@ class SpeechRecognizer { matchStartOffset = 0 retryCount = 0 recentMatchPositions = [] + noiseFloor = 0.02 + lastSpeechAt = .distantPast + isSpeaking = false error = nil sessionGeneration += 1 @@ -371,10 +376,12 @@ class SpeechRecognizer { let level = CGFloat(min(rms * 5, 1.0)) DispatchQueue.main.async { - self?.audioLevels.append(level) - if (self?.audioLevels.count ?? 0) > 30 { - self?.audioLevels.removeFirst() + guard let self else { return } + self.audioLevels.append(level) + if self.audioLevels.count > 30 { + self.audioLevels.removeFirst() } + self.updateVoiceActivity(level: level) } } @@ -456,6 +463,34 @@ class SpeechRecognizer { } } + // MARK: - Voice-activity detection + + /// Adaptive voice-activity detection driving Voice-Activated scrolling. + /// Updates `noiseFloor` on every level sample (slow rise, fast fall) and marks + /// speech when `level` exceeds `max(0.025, noiseFloor * 1.8)`. `isSpeaking` is + /// true while the last such sample is less than `speechHangover` seconds old, so + /// pauses between words don't stall scrolling; it is recomputed only in this call. + private func updateVoiceActivity(level: CGFloat) { + // Continuously adapt the noise floor: close 1% of the gap per sample when + // rising so brief speech peaks don't pull it up, but 30% when falling so it + // settles quickly into quiet passages. Steady ambient noise is thus absorbed + // (preventing false "always speaking") while the floor stays low for quiet mics. + if level > noiseFloor { + noiseFloor += (level - noiseFloor) * 0.01 + } else { + noiseFloor += (level - noiseFloor) * 0.30 + } + + // Gain-relative threshold: speech sits well above the noise floor on any + // mic. The absolute minimum stops near-silent rooms (floor ~0) from + // triggering on tiny fluctuations. 
+ let threshold = max(0.025, noiseFloor * 1.8) + if level > threshold { + lastSpeechAt = Date() + } + isSpeaking = Date().timeIntervalSince(lastSpeechAt) < speechHangover + } + // MARK: - Thread-safe buffer appending private func appendBufferToRequest(_ buffer: AVAudioPCMBuffer) {