From bfca951c82112c5c155a8c02340686ff74e29b86 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 8 Jan 2026 21:24:50 -0800 Subject: [PATCH 01/64] custom cirbuf impl for streammanager --- pkg/sessionmanager/cirbuf.go | 166 +++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 pkg/sessionmanager/cirbuf.go diff --git a/pkg/sessionmanager/cirbuf.go b/pkg/sessionmanager/cirbuf.go new file mode 100644 index 0000000000..f1d07aaa18 --- /dev/null +++ b/pkg/sessionmanager/cirbuf.go @@ -0,0 +1,166 @@ +// Copyright 2025, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package sessionmanager + +import ( + "context" + "fmt" + "sync" +) + +type CirBuf struct { + lock sync.Mutex + waiterChan chan chan struct{} + buf []byte + readPos int + writePos int + count int + totalSize int64 + syncMode bool +} + +func MakeCirBuf(maxSize int, initSyncMode bool) *CirBuf { + cb := &CirBuf{ + buf: make([]byte, maxSize), + syncMode: initSyncMode, + waiterChan: make(chan chan struct{}, 1), + } + return cb +} + +func (cb *CirBuf) Write(data []byte) (int, error) { + return cb.WriteCtx(context.Background(), data) +} + +// WriteCtx writes data to the circular buffer with context support for cancellation. +// In sync mode, blocks when buffer is full until space is available or context is cancelled. +// Returns partial byte count and context error if cancelled mid-write. +// NOTE: Only one concurrent blocked write is allowed. Multiple blocked writes will panic. 
+func (cb *CirBuf) WriteCtx(ctx context.Context, data []byte) (int, error) { + if len(data) == 0 { + return 0, nil + } + + bytesWritten := 0 + for bytesWritten < len(data) { + if err := ctx.Err(); err != nil { + return bytesWritten, err + } + + n, spaceAvailable := cb.writeAvailable(data[bytesWritten:]) + bytesWritten += n + + if spaceAvailable != nil { + select { + case <-spaceAvailable: + continue + case <-ctx.Done(): + tryReadCh(cb.waiterChan) + return bytesWritten, ctx.Err() + } + } + } + + return bytesWritten, nil +} + +func (cb *CirBuf) writeAvailable(data []byte) (int, chan struct{}) { + cb.lock.Lock() + defer cb.lock.Unlock() + + size := len(cb.buf) + written := 0 + + for i := 0; i < len(data); i++ { + if cb.syncMode && cb.count >= size { + spaceAvailable := make(chan struct{}) + if !tryWriteCh(cb.waiterChan, spaceAvailable) { + panic("CirBuf: multiple concurrent blocked writes not allowed") + } + return written, spaceAvailable + } + + cb.buf[cb.writePos] = data[i] + cb.writePos = (cb.writePos + 1) % size + if cb.count < size { + cb.count++ + } else { + cb.readPos = (cb.readPos + 1) % size + } + cb.totalSize++ + written++ + } + + return written, nil +} + +func (cb *CirBuf) PeekData(data []byte) int { + cb.lock.Lock() + defer cb.lock.Unlock() + + if cb.count == 0 { + return 0 + } + + size := len(cb.buf) + read := 0 + pos := cb.readPos + + for i := 0; i < len(data) && i < cb.count; i++ { + data[i] = cb.buf[pos] + pos = (pos + 1) % size + read++ + } + + return read +} + +func (cb *CirBuf) Consume(numBytes int) error { + cb.lock.Lock() + defer cb.lock.Unlock() + + if numBytes > cb.count { + return fmt.Errorf("cannot consume %d bytes, only %d available", numBytes, cb.count) + } + + size := len(cb.buf) + cb.readPos = (cb.readPos + numBytes) % size + cb.count -= numBytes + + if waiterCh, ok := tryReadCh(cb.waiterChan); ok { + close(*waiterCh) + } + + return nil +} + +func (cb *CirBuf) HeadPos() int64 { + cb.lock.Lock() + defer cb.lock.Unlock() + return 
cb.totalSize - int64(cb.count) +} + +func (cb *CirBuf) TotalSize() int64 { + cb.lock.Lock() + defer cb.lock.Unlock() + return cb.totalSize +} + +func tryWriteCh[T any](ch chan<- T, val T) bool { + select { + case ch <- val: + return true + default: + return false + } +} + +func tryReadCh[T any](ch <-chan T) (*T, bool) { + select { + case rtn := <-ch: + return &rtn, true + default: + return nil, false + } +} From 741a931e88399c1c8d740a2d51690930292a200c Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 8 Jan 2026 22:42:06 -0800 Subject: [PATCH 02/64] add a SetEffectiveWindow to cirbuf --- pkg/sessionmanager/cirbuf.go | 41 +++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/pkg/sessionmanager/cirbuf.go b/pkg/sessionmanager/cirbuf.go index f1d07aaa18..855113d5c7 100644 --- a/pkg/sessionmanager/cirbuf.go +++ b/pkg/sessionmanager/cirbuf.go @@ -18,6 +18,7 @@ type CirBuf struct { count int totalSize int64 syncMode bool + windowSize int } func MakeCirBuf(maxSize int, initSyncMode bool) *CirBuf { @@ -25,10 +26,36 @@ func MakeCirBuf(maxSize int, initSyncMode bool) *CirBuf { buf: make([]byte, maxSize), syncMode: initSyncMode, waiterChan: make(chan chan struct{}, 1), + windowSize: maxSize, } return cb } +// SetEffectiveWindow changes the sync mode and effective window size for flow control. +// The windowSize is capped at the buffer size. +// When window shrinks: data is preserved, sync mode blocks writes, async mode maintains data size. +// When window increases: blocked writers are woken up if space becomes available. 
+func (cb *CirBuf) SetEffectiveWindow(syncMode bool, windowSize int) { + cb.lock.Lock() + defer cb.lock.Unlock() + + maxSize := len(cb.buf) + if windowSize > maxSize { + windowSize = maxSize + } + + oldWindowSize := cb.windowSize + cb.windowSize = windowSize + cb.syncMode = syncMode + + if windowSize > oldWindowSize { + cb.tryWakeWriter() + } +} + +// Write will never block if syncMode is false +// If syncMode is true, write will block until enough data is consumed to allow the write to finish +// to cancel a write in progress use WriteCtx func (cb *CirBuf) Write(data []byte) (int, error) { return cb.WriteCtx(context.Background(), data) } @@ -73,7 +100,7 @@ func (cb *CirBuf) writeAvailable(data []byte) (int, chan struct{}) { written := 0 for i := 0; i < len(data); i++ { - if cb.syncMode && cb.count >= size { + if cb.syncMode && cb.count >= cb.windowSize { spaceAvailable := make(chan struct{}) if !tryWriteCh(cb.waiterChan, spaceAvailable) { panic("CirBuf: multiple concurrent blocked writes not allowed") @@ -83,7 +110,7 @@ func (cb *CirBuf) writeAvailable(data []byte) (int, chan struct{}) { cb.buf[cb.writePos] = data[i] cb.writePos = (cb.writePos + 1) % size - if cb.count < size { + if cb.count < cb.windowSize { cb.count++ } else { cb.readPos = (cb.readPos + 1) % size @@ -128,9 +155,7 @@ func (cb *CirBuf) Consume(numBytes int) error { cb.readPos = (cb.readPos + numBytes) % size cb.count -= numBytes - if waiterCh, ok := tryReadCh(cb.waiterChan); ok { - close(*waiterCh) - } + cb.tryWakeWriter() return nil } @@ -164,3 +189,9 @@ func tryReadCh[T any](ch <-chan T) (*T, bool) { return nil, false } } + +func (cb *CirBuf) tryWakeWriter() { + if waiterCh, ok := tryReadCh(cb.waiterChan); ok { + close(*waiterCh) + } +} From 61dcc8da6d0690a22b2daa707865ea2816d453de Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 8 Jan 2026 23:07:16 -0800 Subject: [PATCH 03/64] fixes for cirbuf --- pkg/sessionmanager/cirbuf.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff 
--git a/pkg/sessionmanager/cirbuf.go b/pkg/sessionmanager/cirbuf.go index 855113d5c7..1a3287f22e 100644 --- a/pkg/sessionmanager/cirbuf.go +++ b/pkg/sessionmanager/cirbuf.go @@ -44,11 +44,14 @@ func (cb *CirBuf) SetEffectiveWindow(syncMode bool, windowSize int) { windowSize = maxSize } + oldSyncMode := cb.syncMode oldWindowSize := cb.windowSize cb.windowSize = windowSize cb.syncMode = syncMode - if windowSize > oldWindowSize { + // Only sync mode blocks writers, so only wake if we were in sync mode. + // Wake when window grows (more space available) or switching to async (no longer blocking). + if oldSyncMode && (windowSize > oldWindowSize || !syncMode) { cb.tryWakeWriter() } } @@ -166,6 +169,12 @@ func (cb *CirBuf) HeadPos() int64 { return cb.totalSize - int64(cb.count) } +func (cb *CirBuf) Size() int { + cb.lock.Lock() + defer cb.lock.Unlock() + return cb.count +} + func (cb *CirBuf) TotalSize() int64 { cb.lock.Lock() defer cb.lock.Unlock() From 3d7566cf21e4e6fb6f362721d3996d339fc0af2c Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 9 Jan 2026 15:40:02 -0800 Subject: [PATCH 04/64] checkpoint, streammanager --- frontend/types/gotypes.d.ts | 4 +- pkg/sessionmanager/cirbuf.go | 11 +- pkg/sessionmanager/sessionmanager.go | 202 +++++++++++++ pkg/sessionmanager/streammanager.go | 357 +++++++++++++++++++++++ pkg/sessionmanager/streammanager_test.go | 357 +++++++++++++++++++++++ pkg/streamclient/stream_test.go | 24 +- pkg/streamclient/streambroker.go | 32 +- pkg/streamclient/streamreader.go | 4 +- pkg/streamclient/streamwriter.go | 4 +- pkg/wshrpc/wshrpctypes.go | 6 +- 10 files changed, 960 insertions(+), 41 deletions(-) create mode 100644 pkg/sessionmanager/sessionmanager.go create mode 100644 pkg/sessionmanager/streammanager.go create mode 100644 pkg/sessionmanager/streammanager_test.go diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 4658bc1af2..5ecb186d62 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -463,7 
+463,7 @@ declare global { // wshrpc.CommandStreamAckData type CommandStreamAckData = { - id: number; + id: string; seq: number; rwnd: number; fin?: boolean; @@ -474,7 +474,7 @@ declare global { // wshrpc.CommandStreamData type CommandStreamData = { - id: number; + id: string; seq: number; data64?: string; eof?: boolean; diff --git a/pkg/sessionmanager/cirbuf.go b/pkg/sessionmanager/cirbuf.go index 1a3287f22e..cee66b2597 100644 --- a/pkg/sessionmanager/cirbuf.go +++ b/pkg/sessionmanager/cirbuf.go @@ -126,18 +126,23 @@ func (cb *CirBuf) writeAvailable(data []byte) (int, chan struct{}) { } func (cb *CirBuf) PeekData(data []byte) int { + return cb.PeekDataAt(0, data) +} + +func (cb *CirBuf) PeekDataAt(offset int, data []byte) int { cb.lock.Lock() defer cb.lock.Unlock() - if cb.count == 0 { + if cb.count == 0 || offset >= cb.count { return 0 } size := len(cb.buf) + pos := (cb.readPos + offset) % size + maxRead := cb.count - offset read := 0 - pos := cb.readPos - for i := 0; i < len(data) && i < cb.count; i++ { + for i := 0; i < len(data) && i < maxRead; i++ { data[i] = cb.buf[pos] pos = (pos + 1) % size read++ diff --git a/pkg/sessionmanager/sessionmanager.go b/pkg/sessionmanager/sessionmanager.go new file mode 100644 index 0000000000..f643d2a116 --- /dev/null +++ b/pkg/sessionmanager/sessionmanager.go @@ -0,0 +1,202 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package sessionmanager + +import ( + "encoding/base64" + "fmt" + "log" + "os" + "os/exec" + "os/signal" + "strings" + "sync" + "syscall" + "time" + + "github.com/creack/pty" + "github.com/wavetermdev/waveterm/pkg/waveobj" + "github.com/wavetermdev/waveterm/pkg/wshrpc" +) + +const ShutdownDelayTime = 100 * time.Millisecond + +type CmdDef struct { + Cmd string + Args []string + Env map[string]string + TermSize waveobj.TermSize +} + +type SessionManager struct { + sessionId string + lock sync.Mutex + cmd *exec.Cmd + cmdPty pty.Pty + cleanedUp bool + exitCode int + exitSignal string + exitErr error +} + +func MakeSessionManager(sessionId string, cmdDef CmdDef) (*SessionManager, error) { + sm := &SessionManager{ + sessionId: sessionId, + } + if cmdDef.TermSize.Rows == 0 || cmdDef.TermSize.Cols == 0 { + cmdDef.TermSize.Rows = 25 + cmdDef.TermSize.Cols = 80 + } + if cmdDef.TermSize.Rows <= 0 || cmdDef.TermSize.Cols <= 0 { + return nil, fmt.Errorf("invalid term size: %v", cmdDef.TermSize) + } + ecmd := exec.Command(cmdDef.Cmd, cmdDef.Args...) 
+ if len(cmdDef.Env) > 0 { + ecmd.Env = os.Environ() + for key, val := range cmdDef.Env { + ecmd.Env = append(ecmd.Env, fmt.Sprintf("%s=%s", key, val)) + } + } + cmdPty, err := pty.StartWithSize(ecmd, &pty.Winsize{Rows: uint16(cmdDef.TermSize.Rows), Cols: uint16(cmdDef.TermSize.Cols)}) + if err != nil { + return nil, fmt.Errorf("failed to start command: %w", err) + } + sm.cmd = ecmd + sm.cmdPty = cmdPty + go sm.readPtyOutput(cmdPty) + go sm.waitForProcess() + sm.setupSignalHandlers() + return sm, nil +} + +func (sm *SessionManager) waitForProcess() { + if sm.cmd == nil || sm.cmd.Process == nil { + return + } + err := sm.cmd.Wait() + sm.lock.Lock() + defer sm.lock.Unlock() + + sm.exitErr = err + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { + if status.Signaled() { + sm.exitSignal = status.Signal().String() + sm.exitCode = -1 + } else { + sm.exitCode = status.ExitStatus() + } + } + } + } else { + sm.exitCode = 0 + } + log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", sm.exitCode, sm.exitSignal, sm.exitErr) +} + +func (sm *SessionManager) GetCmd() (*exec.Cmd, pty.Pty) { + sm.lock.Lock() + defer sm.lock.Unlock() + return sm.cmd, sm.cmdPty +} + +func (sm *SessionManager) HandleInput(data wshrpc.CommandBlockInputData) error { + sm.lock.Lock() + defer sm.lock.Unlock() + + if sm.cmd == nil || sm.cmdPty == nil { + return fmt.Errorf("no active process") + } + + if len(data.InputData64) > 0 { + inputBuf := make([]byte, base64.StdEncoding.DecodedLen(len(data.InputData64))) + nw, err := base64.StdEncoding.Decode(inputBuf, []byte(data.InputData64)) + if err != nil { + return fmt.Errorf("error decoding input data: %w", err) + } + _, err = sm.cmdPty.Write(inputBuf[:nw]) + if err != nil { + return fmt.Errorf("error writing to pty: %w", err) + } + } + + if data.SigName != "" { + sig := normalizeSignal(data.SigName) + if sig != nil && sm.cmd.Process != nil { + err := sm.cmd.Process.Signal(sig) 
+ if err != nil { + return fmt.Errorf("error sending signal: %w", err) + } + } + } + + if data.TermSize != nil { + err := pty.Setsize(sm.cmdPty, &pty.Winsize{ + Rows: uint16(data.TermSize.Rows), + Cols: uint16(data.TermSize.Cols), + }) + if err != nil { + return fmt.Errorf("error setting terminal size: %w", err) + } + } + + return nil +} + +func normalizeSignal(sigName string) os.Signal { + sigName = strings.ToUpper(sigName) + sigName = strings.TrimPrefix(sigName, "SIG") + + switch sigName { + case "HUP": + return syscall.SIGHUP + case "INT": + return syscall.SIGINT + case "QUIT": + return syscall.SIGQUIT + case "KILL": + return syscall.SIGKILL + case "TERM": + return syscall.SIGTERM + case "USR1": + return syscall.SIGUSR1 + case "USR2": + return syscall.SIGUSR2 + case "STOP": + return syscall.SIGSTOP + case "CONT": + return syscall.SIGCONT + default: + return nil + } +} + +func (sm *SessionManager) setupSignalHandlers() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) + + go func() { + sig := <-sigChan + log.Printf("received signal: %v\n", sig) + + cmd, _ := sm.GetCmd() + if cmd != nil && cmd.Process != nil { + log.Printf("forwarding signal %v to child process\n", sig) + cmd.Process.Signal(sig) + time.Sleep(ShutdownDelayTime) + } + + sm.Cleanup() + os.Exit(0) + }() +} + +func (sm *SessionManager) readPtyOutput(cmdPty pty.Pty) { + // TODO: implement readPtyOutput +} + +func (sm *SessionManager) Cleanup() { + // TODO: implement Cleanup +} diff --git a/pkg/sessionmanager/streammanager.go b/pkg/sessionmanager/streammanager.go new file mode 100644 index 0000000000..f93122cfc3 --- /dev/null +++ b/pkg/sessionmanager/streammanager.go @@ -0,0 +1,357 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package sessionmanager + +import ( + "encoding/base64" + "fmt" + "io" + "sync" + + "github.com/wavetermdev/waveterm/pkg/wshrpc" +) + +const ( + CwndSize = 64 * 1024 // 64 KB window for connected mode + CirBufSize = 2 * 1024 * 1024 // 2 MB max buffer size + DisconnReadSz = 4 * 1024 // 4 KB read chunks when disconnected + MaxPacketSize = 4 * 1024 // 4 KB max data per packet +) + +type DataSender interface { + SendData(dataPk wshrpc.CommandStreamData) +} + +type streamTerminalEvent struct { + isEof bool + err string +} + +// StreamManager handles PTY output buffering with ACK-based flow control +type StreamManager struct { + lock sync.Mutex + + streamId string + buf *CirBuf + + terminalEvent *streamTerminalEvent + terminalEventSent bool + + reader io.Reader + readerWg sync.WaitGroup + + dataSender DataSender + + cwndSize int + rwndSize int + connected bool + drained bool + + sentNotAcked int64 + drainCond *sync.Cond + closed bool +} + +func MakeStreamManager(streamId string, dataSender DataSender) *StreamManager { + return MakeStreamManagerWithSizes(streamId, dataSender, CwndSize, CirBufSize) +} + +func MakeStreamManagerWithSizes(streamId string, dataSender DataSender, cwndSize, cirbufSize int) *StreamManager { + if dataSender == nil { + panic("dataSender cannot be nil") + } + sm := &StreamManager{ + streamId: streamId, + buf: MakeCirBuf(cirbufSize, true), + dataSender: dataSender, + cwndSize: cwndSize, + rwndSize: cwndSize, + sentNotAcked: 0, + } + sm.drainCond = sync.NewCond(&sm.lock) + go sm.senderLoop() + return sm +} + +// AttachReader starts reading from the given reader +func (sm *StreamManager) AttachReader(r io.Reader) error { + sm.lock.Lock() + defer sm.lock.Unlock() + + if sm.reader != nil { + return fmt.Errorf("reader already attached") + } + + sm.reader = r + + sm.readerWg.Add(1) + go sm.readLoop() + + return nil +} + +// ClientConnected transitions to CONNECTED mode +func (sm *StreamManager) 
ClientConnected(rwndSize int) error { + sm.lock.Lock() + defer sm.lock.Unlock() + + if sm.connected { + return nil + } + + sm.connected = true + sm.drained = false + sm.rwndSize = rwndSize + effectiveWindow := sm.cwndSize + if sm.rwndSize < effectiveWindow { + effectiveWindow = sm.rwndSize + } + sm.buf.SetEffectiveWindow(true, effectiveWindow) + sm.drainCond.Signal() + + return nil +} + +// ClientDisconnected transitions to DISCONNECTED mode +func (sm *StreamManager) ClientDisconnected() { + sm.lock.Lock() + defer sm.lock.Unlock() + + if !sm.connected { + return + } + + sm.connected = false + sm.drainCond.Signal() + sm.sentNotAcked = 0 + sm.buf.SetEffectiveWindow(false, CirBufSize) +} + +// RecvAck processes an ACK from the client +func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) error { + sm.lock.Lock() + defer sm.lock.Unlock() + + if !sm.connected { + return nil + } + + seq := ackPk.Seq + headPos := sm.buf.HeadPos() + if seq < headPos { + return fmt.Errorf("ACK seq %d is before buffer start %d", seq, headPos) + } + + ackedBytes := seq - headPos + available := sm.buf.Size() + + maxAckable := int64(available) + sm.sentNotAcked + if ackedBytes > maxAckable { + return fmt.Errorf("ACK seq %d exceeds total sent (headPos=%d, available=%d, sentNotAcked=%d)", + seq, headPos, available, sm.sentNotAcked) + } + + if ackedBytes > 0 { + consumeFromBuf := int(ackedBytes) + if consumeFromBuf > available { + consumeFromBuf = available + } + if err := sm.buf.Consume(consumeFromBuf); err != nil { + return err + } + sm.sentNotAcked -= ackedBytes + if sm.sentNotAcked < 0 { + sm.sentNotAcked = 0 + } + } + + prevRwnd := sm.rwndSize + sm.rwndSize = int(ackPk.RWnd) + effectiveWindow := sm.cwndSize + if sm.rwndSize < effectiveWindow { + effectiveWindow = sm.rwndSize + } + sm.buf.SetEffectiveWindow(true, effectiveWindow) + + if sm.rwndSize > prevRwnd || ackedBytes > 0 { + sm.drainCond.Signal() + } + + if sm.terminalEvent != nil && !sm.terminalEventSent && sm.buf.Size() 
== 0 && sm.sentNotAcked == 0 { + sm.sendTerminalEvent() + } + + return nil +} + +// Close shuts down the sender loop and waits for the reader to finish +func (sm *StreamManager) Close() { + sm.lock.Lock() + sm.closed = true + sm.drainCond.Signal() + sm.lock.Unlock() + + sm.readerWg.Wait() +} + +// readLoop is the main read goroutine +func (sm *StreamManager) readLoop() { + defer sm.readerWg.Done() + + for { + sm.lock.Lock() + if sm.terminalEvent != nil { + sm.lock.Unlock() + return + } + + isConnected := sm.connected && sm.drained + sm.lock.Unlock() + + var readBuf []byte + if isConnected { + readBuf = make([]byte, 32*1024) + } else { + readBuf = make([]byte, DisconnReadSz) + } + + n, err := sm.reader.Read(readBuf) + + if n > 0 { + sm.handleReadData(readBuf[:n], isConnected) + } + + if err != nil { + if err == io.EOF { + sm.handleEOF() + } else { + sm.handleError(err) + } + return + } + } +} + +func (sm *StreamManager) handleReadData(data []byte, isConnected bool) { + sm.buf.Write(data) + if isConnected { + sm.sendBufferData() + } +} + +func (sm *StreamManager) handleEOF() { + sm.lock.Lock() + defer sm.lock.Unlock() + + sm.terminalEvent = &streamTerminalEvent{isEof: true} + + if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { + sm.sendTerminalEvent() + } +} + +func (sm *StreamManager) handleError(err error) { + sm.lock.Lock() + defer sm.lock.Unlock() + + sm.terminalEvent = &streamTerminalEvent{err: err.Error()} + + if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { + sm.sendTerminalEvent() + } +} + +func (sm *StreamManager) senderLoop() { + for { + sm.lock.Lock() + + if sm.closed { + sm.lock.Unlock() + return + } + + if !sm.connected { + sm.drainCond.Wait() + sm.lock.Unlock() + continue + } + + available := sm.buf.Size() + if available == 0 { + sm.drained = true + if sm.terminalEvent != nil && !sm.terminalEventSent && sm.sentNotAcked == 0 { + sm.sendTerminalEvent() + } + sm.drainCond.Wait() + sm.lock.Unlock() + 
continue + } + + effectiveRwnd := sm.rwndSize + if sm.cwndSize < effectiveRwnd { + effectiveRwnd = sm.cwndSize + } + availableToSend := int64(effectiveRwnd) - sm.sentNotAcked + + if availableToSend <= 0 { + sm.drainCond.Wait() + sm.lock.Unlock() + continue + } + + peekSize := int(availableToSend) + if peekSize > MaxPacketSize { + peekSize = MaxPacketSize + } + if peekSize > available { + peekSize = available + } + + data := make([]byte, peekSize) + n := sm.buf.PeekDataAt(int(sm.sentNotAcked), data) + if n == 0 { + sm.lock.Unlock() + continue + } + data = data[:n] + + seq := sm.buf.HeadPos() + sm.sentNotAcked + sm.sentNotAcked += int64(n) + sm.lock.Unlock() + + pkt := wshrpc.CommandStreamData{ + Id: sm.streamId, + Seq: seq, + Data64: base64.StdEncoding.EncodeToString(data), + } + sm.dataSender.SendData(pkt) + } +} + +func (sm *StreamManager) sendBufferData() { + sm.lock.Lock() + sm.drainCond.Signal() + sm.lock.Unlock() +} + +func (sm *StreamManager) sendTerminalEvent() { + if sm.terminalEventSent { + return + } + + seq := sm.buf.HeadPos() + pkt := wshrpc.CommandStreamData{ + Id: sm.streamId, + Seq: seq, + } + + if sm.terminalEvent.isEof { + pkt.Eof = true + } else { + pkt.Error = sm.terminalEvent.err + } + + sm.terminalEventSent = true + sm.dataSender.SendData(pkt) +} diff --git a/pkg/sessionmanager/streammanager_test.go b/pkg/sessionmanager/streammanager_test.go new file mode 100644 index 0000000000..acfd74d7db --- /dev/null +++ b/pkg/sessionmanager/streammanager_test.go @@ -0,0 +1,357 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package sessionmanager + +import ( + "encoding/base64" + "io" + "strings" + "sync" + "testing" + "time" + + "github.com/wavetermdev/waveterm/pkg/wshrpc" +) + +type testWriter struct { + mu sync.Mutex + packets []wshrpc.CommandStreamData +} + +func (tw *testWriter) SendData(pkt wshrpc.CommandStreamData) { + tw.mu.Lock() + defer tw.mu.Unlock() + tw.packets = append(tw.packets, pkt) +} + +func (tw *testWriter) GetPackets() []wshrpc.CommandStreamData { + tw.mu.Lock() + defer tw.mu.Unlock() + result := make([]wshrpc.CommandStreamData, len(tw.packets)) + copy(result, tw.packets) + return result +} + +func (tw *testWriter) Clear() { + tw.mu.Lock() + defer tw.mu.Unlock() + tw.packets = nil +} + +func decodeData(data64 string) string { + decoded, _ := base64.StdEncoding.DecodeString(data64) + return string(decoded) +} + +func TestBasicDisconnectedMode(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := strings.NewReader("hello world") + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + time.Sleep(50 * time.Millisecond) + + packets := tw.GetPackets() + if len(packets) > 0 { + t.Errorf("Expected no packets in DISCONNECTED mode without client, got %d", len(packets)) + } + + sm.Close() +} + +func TestConnectedModeBasicFlow(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := strings.NewReader("hello") + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + err = sm.ClientConnected(CwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + packets := tw.GetPackets() + if len(packets) == 0 { + t.Fatal("Expected packets after ClientConnected") + } + + // Verify we got the data + allData := "" + for _, pkt := range packets { + if pkt.Data64 != "" { + allData += decodeData(pkt.Data64) + } + } + + if allData != "hello" 
{ + t.Errorf("Expected 'hello', got '%s'", allData) + } + + // Send ACK + err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 5, RWnd: CwndSize}) + if err != nil { + t.Errorf("RecvAck failed: %v", err) + } + + time.Sleep(50 * time.Millisecond) + + // Check for EOF packet + packets = tw.GetPackets() + hasEof := false + for _, pkt := range packets { + if pkt.Eof { + hasEof = true + } + } + + if !hasEof { + t.Error("Expected EOF packet after ACKing all data") + } + + sm.Close() +} + +func TestDisconnectedToConnectedTransition(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := strings.NewReader("test data") + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + err = sm.ClientConnected(CwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + packets := tw.GetPackets() + if len(packets) == 0 { + t.Fatal("Expected cirbuf drain after connect") + } + + allData := "" + for _, pkt := range packets { + if pkt.Data64 != "" { + allData += decodeData(pkt.Data64) + } + } + + if allData != "test data" { + t.Errorf("Expected 'test data', got '%s'", allData) + } + + sm.Close() +} + +func TestConnectedToDisconnectedTransition(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := &slowReader{data: []byte("slow data"), delay: 50 * time.Millisecond} + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + err = sm.ClientConnected(CwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(150 * time.Millisecond) + + sm.ClientDisconnected() + + time.Sleep(100 * time.Millisecond) + + sm.Close() +} + +func TestFlowControl(t *testing.T) { + cwndSize := 1024 + tw := &testWriter{} + sm := MakeStreamManagerWithSizes("1", tw, cwndSize, 8*1024) + + largeData := strings.Repeat("x", cwndSize+500) + 
reader := strings.NewReader(largeData) + + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + err = sm.ClientConnected(cwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + packets := tw.GetPackets() + totalData := 0 + for _, pkt := range packets { + if pkt.Data64 != "" { + decoded, _ := base64.StdEncoding.DecodeString(pkt.Data64) + totalData += len(decoded) + } + } + + if totalData > cwndSize { + t.Errorf("Sent %d bytes without ACK, exceeds cwnd size %d", totalData, cwndSize) + } + + err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: int64(totalData), RWnd: int64(cwndSize)}) + if err != nil { + t.Errorf("RecvAck failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + sm.Close() +} + +func TestSequenceNumbering(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := strings.NewReader("abcdefghij") + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + err = sm.ClientConnected(CwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + packets := tw.GetPackets() + if len(packets) == 0 { + t.Fatal("Expected packets") + } + + expectedSeq := int64(0) + for _, pkt := range packets { + if pkt.Data64 == "" { + continue + } + + if pkt.Seq != expectedSeq { + t.Errorf("Expected seq %d, got %d", expectedSeq, pkt.Seq) + } + + decoded, _ := base64.StdEncoding.DecodeString(pkt.Data64) + expectedSeq += int64(len(decoded)) + } + + sm.Close() +} + +func TestTerminalEventOrdering(t *testing.T) { + tw := &testWriter{} + sm := MakeStreamManager("1", tw) + + reader := strings.NewReader("data") + err := sm.AttachReader(reader) + if err != nil { + t.Fatalf("AttachReader failed: %v", err) + } + + err = sm.ClientConnected(CwndSize) + if err != nil { + t.Fatalf("ClientConnected failed: %v", err) + } + + time.Sleep(100 * 
time.Millisecond) + + packets := tw.GetPackets() + if len(packets) == 0 { + t.Fatal("Expected data packets") + } + + hasData := false + hasEof := false + eofSeq := int64(-1) + + for _, pkt := range packets { + if pkt.Data64 != "" { + hasData = true + } + if pkt.Eof { + hasEof = true + eofSeq = pkt.Seq + } + } + + if !hasData { + t.Error("Expected data packet") + } + + if hasEof { + t.Error("Should not have EOF before ACK") + } + + err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 4, RWnd: CwndSize}) + if err != nil { + t.Errorf("RecvAck failed: %v", err) + } + + time.Sleep(50 * time.Millisecond) + + packets = tw.GetPackets() + hasEof = false + for _, pkt := range packets { + if pkt.Eof { + hasEof = true + eofSeq = pkt.Seq + } + } + + if !hasEof { + t.Error("Expected EOF after ACKing all data") + } + + if eofSeq != 4 { + t.Errorf("Expected EOF at seq 4, got %d", eofSeq) + } + + sm.Close() +} + +type slowReader struct { + data []byte + pos int + delay time.Duration +} + +func (sr *slowReader) Read(p []byte) (n int, err error) { + if sr.pos >= len(sr.data) { + return 0, io.EOF + } + + time.Sleep(sr.delay) + + n = copy(p, sr.data[sr.pos:]) + sr.pos += n + + return n, nil +} diff --git a/pkg/streamclient/stream_test.go b/pkg/streamclient/stream_test.go index be1d2a1149..f5c43f937c 100644 --- a/pkg/streamclient/stream_test.go +++ b/pkg/streamclient/stream_test.go @@ -32,8 +32,8 @@ func (ft *fakeTransport) SendAck(ackPk wshrpc.CommandStreamAckData) { func TestBasicReadWrite(t *testing.T) { transport := newFakeTransport() - reader := NewReader(1, 1024, transport) - writer := NewWriter(1, 1024, transport) + reader := NewReader("1", 1024, transport) + writer := NewWriter("1", 1024, transport) go func() { for dataPk := range transport.dataChan { @@ -72,8 +72,8 @@ func TestBasicReadWrite(t *testing.T) { func TestEOF(t *testing.T) { transport := newFakeTransport() - reader := NewReader(1, 1024, transport) - writer := NewWriter(1, 1024, transport) + reader := 
NewReader("1", 1024, transport) + writer := NewWriter("1", 1024, transport) go func() { for dataPk := range transport.dataChan { @@ -110,8 +110,8 @@ func TestFlowControl(t *testing.T) { smallWindow := int64(10) transport := newFakeTransport() - reader := NewReader(1, smallWindow, transport) - writer := NewWriter(1, smallWindow, transport) + reader := NewReader("1", smallWindow, transport) + writer := NewWriter("1", smallWindow, transport) go func() { for dataPk := range transport.dataChan { @@ -163,8 +163,8 @@ func TestFlowControl(t *testing.T) { func TestError(t *testing.T) { transport := newFakeTransport() - reader := NewReader(1, 1024, transport) - writer := NewWriter(1, 1024, transport) + reader := NewReader("1", 1024, transport) + writer := NewWriter("1", 1024, transport) go func() { for dataPk := range transport.dataChan { @@ -194,8 +194,8 @@ func TestError(t *testing.T) { func TestCancel(t *testing.T) { transport := newFakeTransport() - reader := NewReader(1, 1024, transport) - writer := NewWriter(1, 1024, transport) + reader := NewReader("1", 1024, transport) + writer := NewWriter("1", 1024, transport) go func() { for dataPk := range transport.dataChan { @@ -227,8 +227,8 @@ func TestCancel(t *testing.T) { func TestMultipleWrites(t *testing.T) { transport := newFakeTransport() - reader := NewReader(1, 1024, transport) - writer := NewWriter(1, 1024, transport) + reader := NewReader("1", 1024, transport) + writer := NewWriter("1", 1024, transport) go func() { for dataPk := range transport.dataChan { diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index 4d35c9d367..abe7ba2ee7 100644 --- a/pkg/streamclient/streambroker.go +++ b/pkg/streamclient/streambroker.go @@ -5,6 +5,7 @@ import ( "sync" "time" + "github.com/google/uuid" "github.com/wavetermdev/waveterm/pkg/utilds" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" @@ -41,12 +42,11 @@ func AdaptWshRpc(rpc *wshutil.WshRpc) 
StreamRpcInterface { type Broker struct { lock sync.Mutex rpcClient StreamRpcInterface - streamIdCounter int64 - readers map[int64]*Reader - writers map[int64]*Writer - readerRoutes map[int64]string - writerRoutes map[int64]string - readerErrorSentTime map[int64]time.Time + readers map[string]*Reader + writers map[string]*Writer + readerRoutes map[string]string + writerRoutes map[string]string + readerErrorSentTime map[string]time.Time sendQueue *utilds.WorkQueue[workItem] recvQueue *utilds.WorkQueue[workItem] } @@ -54,12 +54,11 @@ type Broker struct { func NewBroker(rpcClient StreamRpcInterface) *Broker { b := &Broker{ rpcClient: rpcClient, - streamIdCounter: 0, - readers: make(map[int64]*Reader), - writers: make(map[int64]*Writer), - readerRoutes: make(map[int64]string), - writerRoutes: make(map[int64]string), - readerErrorSentTime: make(map[int64]time.Time), + readers: make(map[string]*Reader), + writers: make(map[string]*Writer), + readerRoutes: make(map[string]string), + writerRoutes: make(map[string]string), + readerErrorSentTime: make(map[string]time.Time), } b.sendQueue = utilds.NewWorkQueue(b.processSendWork) b.recvQueue = utilds.NewWorkQueue(b.processRecvWork) @@ -70,8 +69,7 @@ func (b *Broker) CreateStreamReader(readerRoute string, writerRoute string, rwnd b.lock.Lock() defer b.lock.Unlock() - b.streamIdCounter++ - streamId := b.streamIdCounter + streamId := uuid.New().String() reader := NewReader(streamId, rwnd, b) b.readers[streamId] = reader @@ -93,7 +91,7 @@ func (b *Broker) AttachStreamWriter(meta *wshrpc.StreamMeta) (*Writer, error) { defer b.lock.Unlock() if _, exists := b.writers[meta.Id]; exists { - return nil, fmt.Errorf("writer already registered for stream id %d", meta.Id) + return nil, fmt.Errorf("writer already registered for stream id %s", meta.Id) } writer := NewWriter(meta.Id, meta.RWnd, b) @@ -220,7 +218,7 @@ func (b *Broker) Close() { b.recvQueue.Wait() } -func (b *Broker) cleanupReader(streamId int64) { +func (b *Broker) 
cleanupReader(streamId string) { b.lock.Lock() defer b.lock.Unlock() @@ -229,7 +227,7 @@ func (b *Broker) cleanupReader(streamId int64) { delete(b.readerErrorSentTime, streamId) } -func (b *Broker) cleanupWriter(streamId int64) { +func (b *Broker) cleanupWriter(streamId string) { b.lock.Lock() defer b.lock.Unlock() diff --git a/pkg/streamclient/streamreader.go b/pkg/streamclient/streamreader.go index e1fb7bc10a..28e5f3fcf3 100644 --- a/pkg/streamclient/streamreader.go +++ b/pkg/streamclient/streamreader.go @@ -16,7 +16,7 @@ type AckSender interface { type Reader struct { lock sync.Mutex cond *sync.Cond - id int64 + id string ackSender AckSender readWindow int64 nextSeq int64 @@ -27,7 +27,7 @@ type Reader struct { lastRwndSent int64 } -func NewReader(id int64, readWindow int64, ackSender AckSender) *Reader { +func NewReader(id string, readWindow int64, ackSender AckSender) *Reader { r := &Reader{ id: id, readWindow: readWindow, diff --git a/pkg/streamclient/streamwriter.go b/pkg/streamclient/streamwriter.go index aef0a3df4e..862f0c9cfb 100644 --- a/pkg/streamclient/streamwriter.go +++ b/pkg/streamclient/streamwriter.go @@ -16,7 +16,7 @@ type DataSender interface { type Writer struct { lock sync.Mutex cond *sync.Cond - id int64 + id string dataSender DataSender readWindow int64 nextSeq int64 @@ -31,7 +31,7 @@ type Writer struct { closed bool } -func NewWriter(id int64, readWindow int64, dataSender DataSender) *Writer { +func NewWriter(id string, readWindow int64, dataSender DataSender) *Writer { w := &Writer{ id: id, readWindow: readWindow, diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index f36bd8fa8d..9fa9bafbe0 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -633,7 +633,7 @@ type CommandElectronDecryptRtnData struct { } type CommandStreamData struct { - Id int64 `json:"id"` // streamid + Id string `json:"id"` // streamid Seq int64 `json:"seq"` // start offset (bytes) Data64 string `json:"data64,omitempty"` Eof bool 
`json:"eof,omitempty"` // can be set with data or without @@ -641,7 +641,7 @@ type CommandStreamData struct { } type CommandStreamAckData struct { - Id int64 `json:"id"` // streamid + Id string `json:"id"` // streamid Seq int64 `json:"seq"` // next expected byte RWnd int64 `json:"rwnd"` // receive window size Fin bool `json:"fin,omitempty"` // observed end-of-stream (eof or error) @@ -651,7 +651,7 @@ type CommandStreamAckData struct { } type StreamMeta struct { - Id int64 `json:"id"` // streamid + Id string `json:"id"` // streamid RWnd int64 `json:"rwnd"` // initial receive window size ReaderRouteId string `json:"readerrouteid"` WriterRouteId string `json:"writerrouteid"` From de588ebc4ce68eaaad9f8add432a0213acd46ff3 Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 9 Jan 2026 16:00:55 -0800 Subject: [PATCH 05/64] integrate streammanager with broker --- pkg/sessionmanager/streammanager.go | 13 +++++------ pkg/sessionmanager/streammanager_test.go | 15 +++---------- pkg/streamclient/streambroker.go | 22 ++++++++++++++----- pkg/streamclient/streambroker_test.go | 28 ++++++++++++------------ 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/pkg/sessionmanager/streammanager.go b/pkg/sessionmanager/streammanager.go index f93122cfc3..6baf2e8f32 100644 --- a/pkg/sessionmanager/streammanager.go +++ b/pkg/sessionmanager/streammanager.go @@ -129,18 +129,18 @@ func (sm *StreamManager) ClientDisconnected() { } // RecvAck processes an ACK from the client -func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) error { +func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { sm.lock.Lock() defer sm.lock.Unlock() if !sm.connected { - return nil + return } seq := ackPk.Seq headPos := sm.buf.HeadPos() if seq < headPos { - return fmt.Errorf("ACK seq %d is before buffer start %d", seq, headPos) + return } ackedBytes := seq - headPos @@ -148,8 +148,7 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) error { maxAckable := 
int64(available) + sm.sentNotAcked if ackedBytes > maxAckable { - return fmt.Errorf("ACK seq %d exceeds total sent (headPos=%d, available=%d, sentNotAcked=%d)", - seq, headPos, available, sm.sentNotAcked) + return } if ackedBytes > 0 { @@ -158,7 +157,7 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) error { consumeFromBuf = available } if err := sm.buf.Consume(consumeFromBuf); err != nil { - return err + return } sm.sentNotAcked -= ackedBytes if sm.sentNotAcked < 0 { @@ -181,8 +180,6 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) error { if sm.terminalEvent != nil && !sm.terminalEventSent && sm.buf.Size() == 0 && sm.sentNotAcked == 0 { sm.sendTerminalEvent() } - - return nil } // Close shuts down the sender loop and waits for the reader to finish diff --git a/pkg/sessionmanager/streammanager_test.go b/pkg/sessionmanager/streammanager_test.go index acfd74d7db..98e901f00e 100644 --- a/pkg/sessionmanager/streammanager_test.go +++ b/pkg/sessionmanager/streammanager_test.go @@ -99,10 +99,7 @@ func TestConnectedModeBasicFlow(t *testing.T) { } // Send ACK - err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 5, RWnd: CwndSize}) - if err != nil { - t.Errorf("RecvAck failed: %v", err) - } + sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 5, RWnd: CwndSize}) time.Sleep(50 * time.Millisecond) @@ -217,10 +214,7 @@ func TestFlowControl(t *testing.T) { t.Errorf("Sent %d bytes without ACK, exceeds cwnd size %d", totalData, cwndSize) } - err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: int64(totalData), RWnd: int64(cwndSize)}) - if err != nil { - t.Errorf("RecvAck failed: %v", err) - } + sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: int64(totalData), RWnd: int64(cwndSize)}) time.Sleep(100 * time.Millisecond) @@ -310,10 +304,7 @@ func TestTerminalEventOrdering(t *testing.T) { t.Error("Should not have EOF before ACK") } - err = sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 4, RWnd: CwndSize}) - if err 
!= nil { - t.Errorf("RecvAck failed: %v", err) - } + sm.RecvAck(wshrpc.CommandStreamAckData{Id: "1", Seq: 4, RWnd: CwndSize}) time.Sleep(50 * time.Millisecond) diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index abe7ba2ee7..a6a6a8fec7 100644 --- a/pkg/streamclient/streambroker.go +++ b/pkg/streamclient/streambroker.go @@ -18,6 +18,10 @@ type workItem struct { dataPk wshrpc.CommandStreamData } +type StreamWriter interface { + RecvAck(ackPk wshrpc.CommandStreamAckData) +} + type StreamRpcInterface interface { StreamDataAckCommand(data wshrpc.CommandStreamAckData, opts *wshrpc.RpcOpts) error StreamDataCommand(data wshrpc.CommandStreamData, opts *wshrpc.RpcOpts) error @@ -43,7 +47,7 @@ type Broker struct { lock sync.Mutex rpcClient StreamRpcInterface readers map[string]*Reader - writers map[string]*Writer + writers map[string]StreamWriter readerRoutes map[string]string writerRoutes map[string]string readerErrorSentTime map[string]time.Time @@ -55,7 +59,7 @@ func NewBroker(rpcClient StreamRpcInterface) *Broker { b := &Broker{ rpcClient: rpcClient, readers: make(map[string]*Reader), - writers: make(map[string]*Writer), + writers: make(map[string]StreamWriter), readerRoutes: make(map[string]string), writerRoutes: make(map[string]string), readerErrorSentTime: make(map[string]time.Time), @@ -86,19 +90,27 @@ func (b *Broker) CreateStreamReader(readerRoute string, writerRoute string, rwnd return reader, meta } -func (b *Broker) AttachStreamWriter(meta *wshrpc.StreamMeta) (*Writer, error) { +func (b *Broker) AttachStreamWriter(meta *wshrpc.StreamMeta, writer StreamWriter) error { b.lock.Lock() defer b.lock.Unlock() if _, exists := b.writers[meta.Id]; exists { - return nil, fmt.Errorf("writer already registered for stream id %s", meta.Id) + return fmt.Errorf("writer already registered for stream id %s", meta.Id) } - writer := NewWriter(meta.Id, meta.RWnd, b) b.writers[meta.Id] = writer b.readerRoutes[meta.Id] = meta.ReaderRouteId 
b.writerRoutes[meta.Id] = meta.WriterRouteId + return nil +} + +func (b *Broker) CreateStreamWriter(meta *wshrpc.StreamMeta) (*Writer, error) { + writer := NewWriter(meta.Id, meta.RWnd, b) + err := b.AttachStreamWriter(meta, writer) + if err != nil { + return nil, err + } return writer, nil } diff --git a/pkg/streamclient/streambroker_test.go b/pkg/streamclient/streambroker_test.go index 42871caf80..146816ce79 100644 --- a/pkg/streamclient/streambroker_test.go +++ b/pkg/streamclient/streambroker_test.go @@ -68,9 +68,9 @@ func TestBrokerBasicReadWrite(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } testData := []byte("Hello, World!") @@ -105,9 +105,9 @@ func TestBrokerEOF(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } testData := []byte("Test data") @@ -134,9 +134,9 @@ func TestBrokerFlowControl(t *testing.T) { smallWindow := int64(10) reader, meta := broker1.CreateStreamReader("reader1", "writer1", smallWindow) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } largeData := make([]byte, 100) @@ -180,9 +180,9 @@ func TestBrokerError(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := 
broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } testErr := io.ErrUnexpectedEOF @@ -202,9 +202,9 @@ func TestBrokerCancel(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } reader.Close() @@ -226,9 +226,9 @@ func TestBrokerMultipleWrites(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } messages := []string{"First", "Second", "Third"} @@ -261,9 +261,9 @@ func TestBrokerCleanup(t *testing.T) { broker1, broker2 := setupBrokerPair() reader, meta := broker1.CreateStreamReader("reader1", "writer1", 1024) - writer, err := broker2.AttachStreamWriter(meta) + writer, err := broker2.CreateStreamWriter(meta) if err != nil { - t.Fatalf("AttachStreamWriter failed: %v", err) + t.Fatalf("CreateStreamWriter failed: %v", err) } testData := []byte("cleanup test") From 5386109a3009d52b98d3f90c137afb08e49a1ce5 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 12 Jan 2026 10:10:59 -0800 Subject: [PATCH 06/64] rename session => job --- db/migrations-wstore/000011_job.down.sql | 1 + db/migrations-wstore/000011_job.up.sql | 5 ++ pkg/{sessionmanager => jobmanager}/cirbuf.go | 2 +- .../jobmanager.go} | 84 +++++++++---------- .../streammanager.go | 2 +- .../streammanager_test.go | 2 +- pkg/waveobj/wtype.go | 30 +++++++ pkg/wshutil/wshrouter.go | 4 + 8 files changed, 85 insertions(+), 45 deletions(-) create mode 100644 
db/migrations-wstore/000011_job.down.sql create mode 100644 db/migrations-wstore/000011_job.up.sql rename pkg/{sessionmanager => jobmanager}/cirbuf.go (99%) rename pkg/{sessionmanager/sessionmanager.go => jobmanager/jobmanager.go} (70%) rename pkg/{sessionmanager => jobmanager}/streammanager.go (99%) rename pkg/{sessionmanager => jobmanager}/streammanager_test.go (99%) diff --git a/db/migrations-wstore/000011_job.down.sql b/db/migrations-wstore/000011_job.down.sql new file mode 100644 index 0000000000..34620c17aa --- /dev/null +++ b/db/migrations-wstore/000011_job.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS db_job; diff --git a/db/migrations-wstore/000011_job.up.sql b/db/migrations-wstore/000011_job.up.sql new file mode 100644 index 0000000000..3b032507bb --- /dev/null +++ b/db/migrations-wstore/000011_job.up.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS db_job ( + oid varchar(36) PRIMARY KEY, + version int NOT NULL, + data json NOT NULL +); diff --git a/pkg/sessionmanager/cirbuf.go b/pkg/jobmanager/cirbuf.go similarity index 99% rename from pkg/sessionmanager/cirbuf.go rename to pkg/jobmanager/cirbuf.go index cee66b2597..8d14bfef78 100644 --- a/pkg/sessionmanager/cirbuf.go +++ b/pkg/jobmanager/cirbuf.go @@ -1,7 +1,7 @@ // Copyright 2025, Command Line Inc. // SPDX-License-Identifier: Apache-2.0 -package sessionmanager +package jobmanager import ( "context" diff --git a/pkg/sessionmanager/sessionmanager.go b/pkg/jobmanager/jobmanager.go similarity index 70% rename from pkg/sessionmanager/sessionmanager.go rename to pkg/jobmanager/jobmanager.go index f643d2a116..8108029824 100644 --- a/pkg/sessionmanager/sessionmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -1,7 +1,7 @@ // Copyright 2025, Command Line Inc. 
// SPDX-License-Identifier: Apache-2.0 -package sessionmanager +package jobmanager import ( "encoding/base64" @@ -29,8 +29,8 @@ type CmdDef struct { TermSize waveobj.TermSize } -type SessionManager struct { - sessionId string +type JobManager struct { + jobId string lock sync.Mutex cmd *exec.Cmd cmdPty pty.Pty @@ -40,9 +40,9 @@ type SessionManager struct { exitErr error } -func MakeSessionManager(sessionId string, cmdDef CmdDef) (*SessionManager, error) { - sm := &SessionManager{ - sessionId: sessionId, +func MakeJobManager(jobId string, cmdDef CmdDef) (*JobManager, error) { + jm := &JobManager{ + jobId: jobId, } if cmdDef.TermSize.Rows == 0 || cmdDef.TermSize.Cols == 0 { cmdDef.TermSize.Rows = 25 @@ -62,51 +62,51 @@ func MakeSessionManager(sessionId string, cmdDef CmdDef) (*SessionManager, error if err != nil { return nil, fmt.Errorf("failed to start command: %w", err) } - sm.cmd = ecmd - sm.cmdPty = cmdPty - go sm.readPtyOutput(cmdPty) - go sm.waitForProcess() - sm.setupSignalHandlers() - return sm, nil + jm.cmd = ecmd + jm.cmdPty = cmdPty + go jm.readPtyOutput(cmdPty) + go jm.waitForProcess() + jm.setupSignalHandlers() + return jm, nil } -func (sm *SessionManager) waitForProcess() { - if sm.cmd == nil || sm.cmd.Process == nil { +func (jm *JobManager) waitForProcess() { + if jm.cmd == nil || jm.cmd.Process == nil { return } - err := sm.cmd.Wait() - sm.lock.Lock() - defer sm.lock.Unlock() - - sm.exitErr = err + err := jm.cmd.Wait() + jm.lock.Lock() + defer jm.lock.Unlock() + + jm.exitErr = err if err != nil { if exitErr, ok := err.(*exec.ExitError); ok { if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { if status.Signaled() { - sm.exitSignal = status.Signal().String() - sm.exitCode = -1 + jm.exitSignal = status.Signal().String() + jm.exitCode = -1 } else { - sm.exitCode = status.ExitStatus() + jm.exitCode = status.ExitStatus() } } } } else { - sm.exitCode = 0 + jm.exitCode = 0 } - log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", sm.exitCode, 
sm.exitSignal, sm.exitErr) + log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, jm.exitErr) } -func (sm *SessionManager) GetCmd() (*exec.Cmd, pty.Pty) { - sm.lock.Lock() - defer sm.lock.Unlock() - return sm.cmd, sm.cmdPty +func (jm *JobManager) GetCmd() (*exec.Cmd, pty.Pty) { + jm.lock.Lock() + defer jm.lock.Unlock() + return jm.cmd, jm.cmdPty } -func (sm *SessionManager) HandleInput(data wshrpc.CommandBlockInputData) error { - sm.lock.Lock() - defer sm.lock.Unlock() +func (jm *JobManager) HandleInput(data wshrpc.CommandBlockInputData) error { + jm.lock.Lock() + defer jm.lock.Unlock() - if sm.cmd == nil || sm.cmdPty == nil { + if jm.cmd == nil || jm.cmdPty == nil { return fmt.Errorf("no active process") } @@ -116,7 +116,7 @@ func (sm *SessionManager) HandleInput(data wshrpc.CommandBlockInputData) error { if err != nil { return fmt.Errorf("error decoding input data: %w", err) } - _, err = sm.cmdPty.Write(inputBuf[:nw]) + _, err = jm.cmdPty.Write(inputBuf[:nw]) if err != nil { return fmt.Errorf("error writing to pty: %w", err) } @@ -124,8 +124,8 @@ func (sm *SessionManager) HandleInput(data wshrpc.CommandBlockInputData) error { if data.SigName != "" { sig := normalizeSignal(data.SigName) - if sig != nil && sm.cmd.Process != nil { - err := sm.cmd.Process.Signal(sig) + if sig != nil && jm.cmd.Process != nil { + err := jm.cmd.Process.Signal(sig) if err != nil { return fmt.Errorf("error sending signal: %w", err) } @@ -133,7 +133,7 @@ func (sm *SessionManager) HandleInput(data wshrpc.CommandBlockInputData) error { } if data.TermSize != nil { - err := pty.Setsize(sm.cmdPty, &pty.Winsize{ + err := pty.Setsize(jm.cmdPty, &pty.Winsize{ Rows: uint16(data.TermSize.Rows), Cols: uint16(data.TermSize.Cols), }) @@ -155,7 +155,7 @@ func normalizeSignal(sigName string) os.Signal { case "INT": return syscall.SIGINT case "QUIT": - return syscall.SIGQUIT + return syscall.SIGQUIT case "KILL": return syscall.SIGKILL case "TERM": @@ -173,7 +173,7 @@ 
func normalizeSignal(sigName string) os.Signal { } } -func (sm *SessionManager) setupSignalHandlers() { +func (jm *JobManager) setupSignalHandlers() { sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) @@ -181,22 +181,22 @@ func (sm *SessionManager) setupSignalHandlers() { sig := <-sigChan log.Printf("received signal: %v\n", sig) - cmd, _ := sm.GetCmd() + cmd, _ := jm.GetCmd() if cmd != nil && cmd.Process != nil { log.Printf("forwarding signal %v to child process\n", sig) cmd.Process.Signal(sig) time.Sleep(ShutdownDelayTime) } - sm.Cleanup() + jm.Cleanup() os.Exit(0) }() } -func (sm *SessionManager) readPtyOutput(cmdPty pty.Pty) { +func (jm *JobManager) readPtyOutput(cmdPty pty.Pty) { // TODO: implement readPtyOutput } -func (sm *SessionManager) Cleanup() { +func (jm *JobManager) Cleanup() { // TODO: implement Cleanup } diff --git a/pkg/sessionmanager/streammanager.go b/pkg/jobmanager/streammanager.go similarity index 99% rename from pkg/sessionmanager/streammanager.go rename to pkg/jobmanager/streammanager.go index 6baf2e8f32..a48269bc43 100644 --- a/pkg/sessionmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -1,7 +1,7 @@ // Copyright 2025, Command Line Inc. // SPDX-License-Identifier: Apache-2.0 -package sessionmanager +package jobmanager import ( "encoding/base64" diff --git a/pkg/sessionmanager/streammanager_test.go b/pkg/jobmanager/streammanager_test.go similarity index 99% rename from pkg/sessionmanager/streammanager_test.go rename to pkg/jobmanager/streammanager_test.go index 98e901f00e..5f551d77d0 100644 --- a/pkg/sessionmanager/streammanager_test.go +++ b/pkg/jobmanager/streammanager_test.go @@ -1,7 +1,7 @@ // Copyright 2025, Command Line Inc. 
// SPDX-License-Identifier: Apache-2.0 -package sessionmanager +package jobmanager import ( "encoding/base64" diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 2f7e7e0a1f..a51297a531 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -29,6 +29,7 @@ const ( OType_LayoutState = "layout" OType_Block = "block" OType_MainServer = "mainserver" + OType_Job = "job" OType_Temp = "temp" OType_Builder = "builder" // not persisted to DB ) @@ -41,6 +42,7 @@ var ValidOTypes = map[string]bool{ OType_LayoutState: true, OType_Block: true, OType_MainServer: true, + OType_Job: true, OType_Temp: true, OType_Builder: true, } @@ -306,6 +308,33 @@ func (*MainServer) GetOType() string { return OType_MainServer } +type Job struct { + OID string `json:"oid"` + Version int `json:"version"` + Connection string `json:"connection"` + JobKind string `json:"jobkind"` // shell, task + Pgid int `json:"pgid"` // process group id + AttachedBlockId string `json:"ownerblockid"` + HupOnConnect bool `json:"huponconnect"` + JobAccessToken string `json:"jobaccesstoken"` // wave -> job manager + JobAuthToken string `json:"jobauthtoken"` // job manger -> wave + Cmd string `json:"cmd"` + CmdArgs []string `json:"cmdargs,omitempty"` + CmdEnv map[string]string `json:"cmdenv,omitempty"` + TermSize TermSize `json:"termsize,omitempty"` + StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) + Status string `json:"status"` // running, done + ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) + ExitCode int `json:"exitcode,omitempty"` + ExitSignal string `json:"exitsignal,omitempty"` + Error string `json:"error,omitempty"` + Meta MetaMapType `json:"meta"` +} + +func (*Job) GetOType() string { + return OType_Job +} + func AllWaveObjTypes() []reflect.Type { return []reflect.Type{ reflect.TypeOf(&Client{}), @@ -315,6 +344,7 @@ func AllWaveObjTypes() []reflect.Type { reflect.TypeOf(&Block{}), reflect.TypeOf(&LayoutState{}), reflect.TypeOf(&MainServer{}), + 
reflect.TypeOf(&Job{}), } } diff --git a/pkg/wshutil/wshrouter.go b/pkg/wshutil/wshrouter.go index cbc2f47ab3..94e59e270b 100644 --- a/pkg/wshutil/wshrouter.go +++ b/pkg/wshutil/wshrouter.go @@ -118,6 +118,10 @@ func MakeBuilderRouteId(builderId string) string { return "builder:" + builderId } +func MakeJobRouteId(jobId string) string { + return "job:" + jobId +} + var DefaultRouter *WshRouter func NewWshRouter() *WshRouter { From fe3644f6fae3095933e32469e02a5d20d849d586 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 12 Jan 2026 11:13:31 -0800 Subject: [PATCH 07/64] working on job manager cmd/interface --- cmd/wsh/cmd/wshcmd-jobmanager.go | 62 ++++++++ frontend/app/store/wshclientapi.ts | 5 + frontend/types/gotypes.d.ts | 26 +++ pkg/jobmanager/jobcmd.go | 202 +++++++++++++++++++++++ pkg/jobmanager/jobmanager.go | 246 +++++++++++------------------ pkg/wavejwt/wavejwt.go | 12 +- pkg/wshrpc/wshclient/wshclient.go | 6 + pkg/wshrpc/wshrpctypes.go | 7 + 8 files changed, 405 insertions(+), 161 deletions(-) create mode 100644 cmd/wsh/cmd/wshcmd-jobmanager.go create mode 100644 pkg/jobmanager/jobcmd.go diff --git a/cmd/wsh/cmd/wshcmd-jobmanager.go b/cmd/wsh/cmd/wshcmd-jobmanager.go new file mode 100644 index 0000000000..2f7936fc99 --- /dev/null +++ b/cmd/wsh/cmd/wshcmd-jobmanager.go @@ -0,0 +1,62 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package cmd + +import ( + "encoding/base64" + "fmt" + "os" + + "github.com/google/uuid" + "github.com/spf13/cobra" + "github.com/wavetermdev/waveterm/pkg/jobmanager" +) + +var jobManagerCmd = &cobra.Command{ + Use: "jobmanager", + Hidden: true, + Short: "job manager for wave terminal", + Args: cobra.NoArgs, + RunE: jobManagerRun, +} + +var jobManagerJobId string +var jobManagerClientId string + +func init() { + jobManagerCmd.Flags().StringVar(&jobManagerJobId, "jobid", "", "job ID (UUID, required)") + jobManagerCmd.Flags().StringVar(&jobManagerClientId, "clientid", "", "client ID (UUID, required)") + jobManagerCmd.MarkFlagRequired("jobid") + jobManagerCmd.MarkFlagRequired("clientid") + rootCmd.AddCommand(jobManagerCmd) +} + +func jobManagerRun(cmd *cobra.Command, args []string) error { + _, err := uuid.Parse(jobManagerJobId) + if err != nil { + return fmt.Errorf("invalid jobid: must be a valid UUID") + } + + _, err = uuid.Parse(jobManagerClientId) + if err != nil { + return fmt.Errorf("invalid clientid: must be a valid UUID") + } + + publicKeyB64 := os.Getenv("WAVETERM_PUBLICKEY") + if publicKeyB64 == "" { + return fmt.Errorf("WAVETERM_PUBLICKEY environment variable is not set") + } + + publicKeyBytes, err := base64.StdEncoding.DecodeString(publicKeyB64) + if err != nil { + return fmt.Errorf("failed to decode WAVETERM_PUBLICKEY: %v", err) + } + + err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes) + if err != nil { + return fmt.Errorf("error setting up job manager: %v", err) + } + + select {} +} diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index fffcf3e899..3cdcc2749c 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -22,6 +22,11 @@ class RpcApiType { return client.wshRpcCall("authenticate", data, opts); } + // command "authenticatetojobmanager" [call] + AuthenticateToJobManagerCommand(client: WshClient, 
data: CommandAuthenticateToJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("authenticatetojobmanager", data, opts); + } + // command "authenticatetoken" [call] AuthenticateTokenCommand(client: WshClient, data: CommandAuthenticateTokenData, opts?: RpcOpts): Promise { return client.wshRpcCall("authenticatetoken", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 5ecb186d62..0e1d03722d 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -201,6 +201,11 @@ declare global { rpccontext?: RpcContext; }; + // wshrpc.CommandAuthenticateToJobData + type CommandAuthenticateToJobData = { + jobaccesstoken: string; + }; + // wshrpc.CommandAuthenticateTokenData type CommandAuthenticateTokenData = { token: string; @@ -793,6 +798,27 @@ declare global { configerrors: ConfigError[]; }; + // waveobj.Job + type Job = WaveObj & { + connection: string; + jobkind: string; + pgid: number; + ownerblockid: string; + huponconnect: boolean; + jobaccesstoken: string; + jobauthtoken: string; + cmd: string; + cmdargs?: string[]; + cmdenv?: {[key: string]: string}; + termsize?: TermSize; + startts?: number; + status: string; + exitts?: number; + exitcode?: number; + exitsignal?: string; + error?: string; + }; + // waveobj.LayoutActionData type LayoutActionData = { actiontype: string; diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go new file mode 100644 index 0000000000..bc8fdb4258 --- /dev/null +++ b/pkg/jobmanager/jobcmd.go @@ -0,0 +1,202 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package jobmanager + +import ( + "encoding/base64" + "fmt" + "log" + "os" + "os/exec" + "os/signal" + "strings" + "sync" + "syscall" + "time" + + "github.com/creack/pty" + "github.com/wavetermdev/waveterm/pkg/waveobj" + "github.com/wavetermdev/waveterm/pkg/wshrpc" +) + +const ShutdownDelayTime = 100 * time.Millisecond + +type CmdDef struct { + Cmd string + Args []string + Env map[string]string + TermSize waveobj.TermSize +} + +type JobCmd struct { + jobId string + lock sync.Mutex + cmd *exec.Cmd + cmdPty pty.Pty + cleanedUp bool + exitCode int + exitSignal string + exitErr error +} + +func MakeJobCmd(jobId string, cmdDef CmdDef) (*JobCmd, error) { + jm := &JobCmd{ + jobId: jobId, + } + if cmdDef.TermSize.Rows == 0 || cmdDef.TermSize.Cols == 0 { + cmdDef.TermSize.Rows = 25 + cmdDef.TermSize.Cols = 80 + } + if cmdDef.TermSize.Rows <= 0 || cmdDef.TermSize.Cols <= 0 { + return nil, fmt.Errorf("invalid term size: %v", cmdDef.TermSize) + } + ecmd := exec.Command(cmdDef.Cmd, cmdDef.Args...) 
+ if len(cmdDef.Env) > 0 { + ecmd.Env = os.Environ() + for key, val := range cmdDef.Env { + ecmd.Env = append(ecmd.Env, fmt.Sprintf("%s=%s", key, val)) + } + } + cmdPty, err := pty.StartWithSize(ecmd, &pty.Winsize{Rows: uint16(cmdDef.TermSize.Rows), Cols: uint16(cmdDef.TermSize.Cols)}) + if err != nil { + return nil, fmt.Errorf("failed to start command: %w", err) + } + jm.cmd = ecmd + jm.cmdPty = cmdPty + go jm.readPtyOutput(cmdPty) + go jm.waitForProcess() + jm.setupSignalHandlers() + return jm, nil +} + +func (jm *JobCmd) waitForProcess() { + if jm.cmd == nil || jm.cmd.Process == nil { + return + } + err := jm.cmd.Wait() + jm.lock.Lock() + defer jm.lock.Unlock() + + jm.exitErr = err + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { + if status.Signaled() { + jm.exitSignal = status.Signal().String() + jm.exitCode = -1 + } else { + jm.exitCode = status.ExitStatus() + } + } + } + } else { + jm.exitCode = 0 + } + log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, jm.exitErr) +} + +func (jm *JobCmd) GetCmd() (*exec.Cmd, pty.Pty) { + jm.lock.Lock() + defer jm.lock.Unlock() + return jm.cmd, jm.cmdPty +} + +func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { + jm.lock.Lock() + defer jm.lock.Unlock() + + if jm.cmd == nil || jm.cmdPty == nil { + return fmt.Errorf("no active process") + } + + if len(data.InputData64) > 0 { + inputBuf := make([]byte, base64.StdEncoding.DecodedLen(len(data.InputData64))) + nw, err := base64.StdEncoding.Decode(inputBuf, []byte(data.InputData64)) + if err != nil { + return fmt.Errorf("error decoding input data: %w", err) + } + _, err = jm.cmdPty.Write(inputBuf[:nw]) + if err != nil { + return fmt.Errorf("error writing to pty: %w", err) + } + } + + if data.SigName != "" { + sig := normalizeSignal(data.SigName) + if sig != nil && jm.cmd.Process != nil { + err := jm.cmd.Process.Signal(sig) + if err != nil { + 
return fmt.Errorf("error sending signal: %w", err) + } + } + } + + if data.TermSize != nil { + err := pty.Setsize(jm.cmdPty, &pty.Winsize{ + Rows: uint16(data.TermSize.Rows), + Cols: uint16(data.TermSize.Cols), + }) + if err != nil { + return fmt.Errorf("error setting terminal size: %w", err) + } + } + + return nil +} + +func normalizeSignal(sigName string) os.Signal { + sigName = strings.ToUpper(sigName) + sigName = strings.TrimPrefix(sigName, "SIG") + + switch sigName { + case "HUP": + return syscall.SIGHUP + case "INT": + return syscall.SIGINT + case "QUIT": + return syscall.SIGQUIT + case "KILL": + return syscall.SIGKILL + case "TERM": + return syscall.SIGTERM + case "USR1": + return syscall.SIGUSR1 + case "USR2": + return syscall.SIGUSR2 + case "STOP": + return syscall.SIGSTOP + case "CONT": + return syscall.SIGCONT + default: + return nil + } +} + +func (jm *JobCmd) setupSignalHandlers() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) + + go func() { + sig := <-sigChan + log.Printf("received signal: %v\n", sig) + + cmd, _ := jm.GetCmd() + if cmd != nil && cmd.Process != nil { + log.Printf("forwarding signal %v to child process\n", sig) + cmd.Process.Signal(sig) + time.Sleep(ShutdownDelayTime) + } + + jm.Cleanup() + os.Exit(0) + }() +} + +func (jm *JobCmd) readPtyOutput(cmdPty pty.Pty) { + // TODO: implement readPtyOutput +} + +func (jm *JobCmd) Cleanup() { + // TODO: implement Cleanup +} diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 8108029824..e577aa257d 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -1,202 +1,136 @@ -// Copyright 2025, Command Line Inc. +// Copyright 2026, Command Line Inc. 
// SPDX-License-Identifier: Apache-2.0 package jobmanager import ( - "encoding/base64" + "context" "fmt" "log" + "net" "os" - "os/exec" - "os/signal" - "strings" - "sync" - "syscall" - "time" - - "github.com/creack/pty" - "github.com/wavetermdev/waveterm/pkg/waveobj" + "path/filepath" + + "github.com/wavetermdev/waveterm/pkg/baseds" + "github.com/wavetermdev/waveterm/pkg/panichandler" + "github.com/wavetermdev/waveterm/pkg/wavebase" + "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshutil" ) -const ShutdownDelayTime = 100 * time.Millisecond +var WshCmdJobManager JobManager -type CmdDef struct { - Cmd string - Args []string - Env map[string]string - TermSize waveobj.TermSize +type JobManager struct { + ClientId string + JobId string + Cmd *JobCmd + JwtPublicKey []byte } -type JobManager struct { - jobId string - lock sync.Mutex - cmd *exec.Cmd - cmdPty pty.Pty - cleanedUp bool - exitCode int - exitSignal string - exitErr error +type JobServerImpl struct { + Authenticated bool } -func MakeJobManager(jobId string, cmdDef CmdDef) (*JobManager, error) { - jm := &JobManager{ - jobId: jobId, - } - if cmdDef.TermSize.Rows == 0 || cmdDef.TermSize.Cols == 0 { - cmdDef.TermSize.Rows = 25 - cmdDef.TermSize.Cols = 80 - } - if cmdDef.TermSize.Rows <= 0 || cmdDef.TermSize.Cols <= 0 { - return nil, fmt.Errorf("invalid term size: %v", cmdDef.TermSize) +func (JobServerImpl) WshServerImpl() {} + +func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) { + claims, err := wavejwt.ValidateAndExtract(data.JobAccessToken) + if err != nil { + log.Printf("AuthenticateToJobManager: failed to validate token: %v\n", err) + return } - ecmd := exec.Command(cmdDef.Cmd, cmdDef.Args...) 
- if len(cmdDef.Env) > 0 { - ecmd.Env = os.Environ() - for key, val := range cmdDef.Env { - ecmd.Env = append(ecmd.Env, fmt.Sprintf("%s=%s", key, val)) - } + if !claims.MainServer { + log.Printf("AuthenticateToJobManager: MainServer claim not set\n") + return } - cmdPty, err := pty.StartWithSize(ecmd, &pty.Winsize{Rows: uint16(cmdDef.TermSize.Rows), Cols: uint16(cmdDef.TermSize.Cols)}) - if err != nil { - return nil, fmt.Errorf("failed to start command: %w", err) + if claims.JobId != WshCmdJobManager.JobId { + log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", WshCmdJobManager.JobId, claims.JobId) + return } - jm.cmd = ecmd - jm.cmdPty = cmdPty - go jm.readPtyOutput(cmdPty) - go jm.waitForProcess() - jm.setupSignalHandlers() - return jm, nil + impl.Authenticated = true + log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) } -func (jm *JobManager) waitForProcess() { - if jm.cmd == nil || jm.cmd.Process == nil { - return +func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error { + WshCmdJobManager.ClientId = clientId + WshCmdJobManager.JobId = jobId + WshCmdJobManager.JwtPublicKey = publicKeyBytes + err := wavejwt.SetPublicKey(publicKeyBytes) + if err != nil { + return fmt.Errorf("failed to set public key: %w", err) } - err := jm.cmd.Wait() - jm.lock.Lock() - defer jm.lock.Unlock() - - jm.exitErr = err + err = MakeJobDomainSocket(clientId, jobId) if err != nil { - if exitErr, ok := err.(*exec.ExitError); ok { - if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { - if status.Signaled() { - jm.exitSignal = status.Signal().String() - jm.exitCode = -1 - } else { - jm.exitCode = status.ExitStatus() - } - } - } - } else { - jm.exitCode = 0 + return err } - log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, jm.exitErr) + return nil } -func (jm *JobManager) GetCmd() (*exec.Cmd, pty.Pty) { - jm.lock.Lock() - defer jm.lock.Unlock() - 
return jm.cmd, jm.cmdPty -} +func MakeJobDomainSocket(clientId string, jobId string) error { + homeDir := wavebase.GetHomeDir() + socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) + err := os.MkdirAll(socketDir, 0700) + if err != nil { + return fmt.Errorf("failed to create socket directory: %w", err) + } -func (jm *JobManager) HandleInput(data wshrpc.CommandBlockInputData) error { - jm.lock.Lock() - defer jm.lock.Unlock() + socketPath := filepath.Join(socketDir, fmt.Sprintf("%s.sock", jobId)) - if jm.cmd == nil || jm.cmdPty == nil { - return fmt.Errorf("no active process") - } + os.Remove(socketPath) - if len(data.InputData64) > 0 { - inputBuf := make([]byte, base64.StdEncoding.DecodedLen(len(data.InputData64))) - nw, err := base64.StdEncoding.Decode(inputBuf, []byte(data.InputData64)) - if err != nil { - return fmt.Errorf("error decoding input data: %w", err) - } - _, err = jm.cmdPty.Write(inputBuf[:nw]) - if err != nil { - return fmt.Errorf("error writing to pty: %w", err) - } + listener, err := net.Listen("unix", socketPath) + if err != nil { + return fmt.Errorf("failed to listen on domain socket: %w", err) } - if data.SigName != "" { - sig := normalizeSignal(data.SigName) - if sig != nil && jm.cmd.Process != nil { - err := jm.cmd.Process.Signal(sig) + go func() { + defer func() { + panichandler.PanicHandler("MakeJobDomainSocket:accept", recover()) + listener.Close() + os.Remove(socketPath) + }() + for { + conn, err := listener.Accept() if err != nil { - return fmt.Errorf("error sending signal: %w", err) + log.Printf("error accepting connection: %v\n", err) + return } + go handleJobDomainSocketClient(conn) } - } - - if data.TermSize != nil { - err := pty.Setsize(jm.cmdPty, &pty.Winsize{ - Rows: uint16(data.TermSize.Rows), - Cols: uint16(data.TermSize.Cols), - }) - if err != nil { - return fmt.Errorf("error setting terminal size: %w", err) - } - } + }() return nil } -func normalizeSignal(sigName string) os.Signal { - sigName = 
strings.ToUpper(sigName) - sigName = strings.TrimPrefix(sigName, "SIG") - - switch sigName { - case "HUP": - return syscall.SIGHUP - case "INT": - return syscall.SIGINT - case "QUIT": - return syscall.SIGQUIT - case "KILL": - return syscall.SIGKILL - case "TERM": - return syscall.SIGTERM - case "USR1": - return syscall.SIGUSR1 - case "USR2": - return syscall.SIGUSR2 - case "STOP": - return syscall.SIGSTOP - case "CONT": - return syscall.SIGCONT - default: - return nil - } -} +func handleJobDomainSocketClient(conn net.Conn) { + inputCh := make(chan baseds.RpcInputChType, wshutil.DefaultInputChSize) + outputCh := make(chan []byte, wshutil.DefaultOutputChSize) -func (jm *JobManager) setupSignalHandlers() { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) + serverImpl := &JobServerImpl{} + rpcCtx := wshrpc.RpcContext{} + wshRpc := wshutil.MakeWshRpcWithChannels(inputCh, outputCh, rpcCtx, serverImpl, "job-domain") go func() { - sig := <-sigChan - log.Printf("received signal: %v\n", sig) - - cmd, _ := jm.GetCmd() - if cmd != nil && cmd.Process != nil { - log.Printf("forwarding signal %v to child process\n", sig) - cmd.Process.Signal(sig) - time.Sleep(ShutdownDelayTime) + defer func() { + panichandler.PanicHandler("handleJobDomainSocketClient:AdaptOutputChToStream", recover()) + }() + writeErr := wshutil.AdaptOutputChToStream(outputCh, conn) + if writeErr != nil { + log.Printf("error writing to domain socket: %v\n", writeErr) } - - jm.Cleanup() - os.Exit(0) }() -} -func (jm *JobManager) readPtyOutput(cmdPty pty.Pty) { - // TODO: implement readPtyOutput -} + go func() { + defer func() { + panichandler.PanicHandler("handleJobDomainSocketClient:AdaptStreamToMsgCh", recover()) + }() + defer func() { + conn.Close() + close(inputCh) + }() + wshutil.AdaptStreamToMsgCh(conn, inputCh) + }() -func (jm *JobManager) Cleanup() { - // TODO: implement Cleanup + _ = wshRpc } diff --git a/pkg/wavejwt/wavejwt.go 
b/pkg/wavejwt/wavejwt.go index 45a621a9a3..9e91003c58 100644 --- a/pkg/wavejwt/wavejwt.go +++ b/pkg/wavejwt/wavejwt.go @@ -26,11 +26,13 @@ var ( type WaveJwtClaims struct { jwt.RegisteredClaims - Sock string `json:"sock,omitempty"` - RouteId string `json:"routeid,omitempty"` - BlockId string `json:"blockid,omitempty"` - Conn string `json:"conn,omitempty"` - Router bool `json:"router,omitempty"` + MainServer bool `json:"mainserver,omitempty"` + Sock string `json:"sock,omitempty"` + RouteId string `json:"routeid,omitempty"` + BlockId string `json:"blockid,omitempty"` + JobId string `json:"jobid,omitempty"` + Conn string `json:"conn,omitempty"` + Router bool `json:"router,omitempty"` } type KeyPair struct { diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index a8f6c46e0d..ec938b5feb 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -35,6 +35,12 @@ func AuthenticateCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) ( return resp, err } +// command "authenticatetojobmanager", wshserver.AuthenticateToJobManagerCommand +func AuthenticateToJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandAuthenticateToJobData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "authenticatetojobmanager", data, opts) + return err +} + // command "authenticatetoken", wshserver.AuthenticateTokenCommand func AuthenticateTokenCommand(w *wshutil.WshRpc, data wshrpc.CommandAuthenticateTokenData, opts *wshrpc.RpcOpts) (wshrpc.CommandAuthenticateRtnData, error) { resp, err := sendRpcRequestCallHelper[wshrpc.CommandAuthenticateRtnData](w, "authenticatetoken", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 9fa9bafbe0..c373c3dfef 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -154,6 +154,9 @@ type WshRpcInterface interface { // streams StreamDataCommand(ctx context.Context, data CommandStreamData) error 
StreamDataAckCommand(ctx context.Context, data CommandStreamAckData) error + + // jobs + AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) } // for frontend @@ -656,3 +659,7 @@ type StreamMeta struct { ReaderRouteId string `json:"readerrouteid"` WriterRouteId string `json:"writerrouteid"` } + +type CommandAuthenticateToJobData struct { + JobAccessToken string `json:"jobaccesstoken"` +} From 7e6b9b8201647565f9021dc009902341cd682e9e Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 12 Jan 2026 13:37:42 -0800 Subject: [PATCH 08/64] jobmanager checkpoint --- frontend/app/store/wshclientapi.ts | 25 +++++ frontend/types/gotypes.d.ts | 30 ++++++ pkg/jobmanager/jobcmd.go | 29 ----- pkg/jobmanager/jobmanager.go | 53 ++++++++++ pkg/jobmanager/jobmanager_unix.go | 48 +++++++++ pkg/jobmanager/jobmanager_windows.go | 19 ++++ pkg/wshrpc/wshclient/wshclient.go | 30 ++++++ pkg/wshrpc/wshrpctypes.go | 30 ++++++ pkg/wshrpc/wshrpctypes_const.go | 19 ++-- pkg/wshutil/wshrouter_controlimpl.go | 153 +++++++++++++++++++-------- 10 files changed, 355 insertions(+), 81 deletions(-) create mode 100644 pkg/jobmanager/jobmanager_unix.go create mode 100644 pkg/jobmanager/jobmanager_windows.go diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 3cdcc2749c..b871bb7065 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -22,6 +22,16 @@ class RpcApiType { return client.wshRpcCall("authenticate", data, opts); } + // command "authenticatejobmanager" [call] + AuthenticateJobManagerCommand(client: WshClient, data: CommandAuthenticateJobManagerData, opts?: RpcOpts): Promise { + return client.wshRpcCall("authenticatejobmanager", data, opts); + } + + // command "authenticatejobmanagerverify" [call] + AuthenticateJobManagerVerifyCommand(client: WshClient, data: CommandAuthenticateJobManagerData, opts?: RpcOpts): Promise { + return client.wshRpcCall("authenticatejobmanagerverify", data, opts); 
+ } + // command "authenticatetojobmanager" [call] AuthenticateToJobManagerCommand(client: WshClient, data: CommandAuthenticateToJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("authenticatetojobmanager", data, opts); @@ -382,6 +392,16 @@ class RpcApiType { return client.wshRpcCall("getwaveairatelimit", null, opts); } + // command "jobconnect" [call] + JobConnectCommand(client: WshClient, data: CommandJobConnectData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobconnect", data, opts); + } + + // command "jobterminate" [call] + JobTerminateCommand(client: WshClient, data: CommandJobTerminateData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobterminate", data, opts); + } + // command "listallappfiles" [call] ListAllAppFilesCommand(client: WshClient, data: CommandListAllAppFilesData, opts?: RpcOpts): Promise { return client.wshRpcCall("listallappfiles", data, opts); @@ -577,6 +597,11 @@ class RpcApiType { return client.wshRpcCall("startbuilder", data, opts); } + // command "startjob" [call] + StartJobCommand(client: WshClient, data: CommandStartJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("startjob", data, opts); + } + // command "stopbuilder" [call] StopBuilderCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("stopbuilder", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 0e1d03722d..9258d1324a 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -194,6 +194,12 @@ declare global { data: {[key: string]: any}; }; + // wshrpc.CommandAuthenticateJobManagerData + type CommandAuthenticateJobManagerData = { + jobid: string; + jobauthtoken: string; + }; + // wshrpc.CommandAuthenticateRtnData type CommandAuthenticateRtnData = { env?: {[key: string]: string}; @@ -348,6 +354,16 @@ declare global { chatid: string; }; + // wshrpc.CommandJobConnectData + type CommandJobConnectData = { + streamid: string; + seq: number; 
+ }; + + // wshrpc.CommandJobTerminateData + type CommandJobTerminateData = { + }; + // wshrpc.CommandListAllAppFilesData type CommandListAllAppFilesData = { appid: string; @@ -466,6 +482,20 @@ declare global { builderid: string; }; + // wshrpc.CommandStartJobData + type CommandStartJobData = { + cmd: string; + args: string[]; + env: {[key: string]: string}; + termsize: TermSize; + jobauthtoken: string; + }; + + // wshrpc.CommandStartJobRtnData + type CommandStartJobRtnData = { + pgid: number; + }; + // wshrpc.CommandStreamAckData type CommandStreamAckData = { id: string; diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index bc8fdb4258..ebba94634c 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -10,7 +10,6 @@ import ( "os" "os/exec" "os/signal" - "strings" "sync" "syscall" "time" @@ -145,34 +144,6 @@ func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { return nil } -func normalizeSignal(sigName string) os.Signal { - sigName = strings.ToUpper(sigName) - sigName = strings.TrimPrefix(sigName, "SIG") - - switch sigName { - case "HUP": - return syscall.SIGHUP - case "INT": - return syscall.SIGINT - case "QUIT": - return syscall.SIGQUIT - case "KILL": - return syscall.SIGKILL - case "TERM": - return syscall.SIGTERM - case "USR1": - return syscall.SIGUSR1 - case "USR2": - return syscall.SIGUSR2 - case "STOP": - return syscall.SIGSTOP - case "CONT": - return syscall.SIGCONT - default: - return nil - } -} - func (jm *JobCmd) setupSignalHandlers() { sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index e577aa257d..35ed9a15e4 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -26,6 +26,7 @@ type JobManager struct { JobId string Cmd *JobCmd JwtPublicKey []byte + JobAuthToken string } type JobServerImpl struct { @@ -52,6 +53,58 @@ func (impl *JobServerImpl) 
AuthenticateToJobManagerCommand(ctx context.Context, log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) } +func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + if !impl.Authenticated { + return nil, fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd != nil { + return nil, fmt.Errorf("job already started") + } + WshCmdJobManager.JobAuthToken = data.JobAuthToken + cmdDef := CmdDef{ + Cmd: data.Cmd, + Args: data.Args, + Env: data.Env, + TermSize: data.TermSize, + } + jobCmd, err := MakeJobCmd(WshCmdJobManager.JobId, cmdDef) + if err != nil { + return nil, fmt.Errorf("failed to start job: %w", err) + } + WshCmdJobManager.Cmd = jobCmd + cmd, _ := jobCmd.GetCmd() + if cmd == nil || cmd.Process == nil { + return nil, fmt.Errorf("cmd or process is nil") + } + pgid, err := getProcessGroupId(cmd.Process.Pid) + if err != nil { + return nil, fmt.Errorf("failed to get process group id: %w", err) + } + return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil +} + +func (impl *JobServerImpl) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) error { + if !impl.Authenticated { + return fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd == nil { + return fmt.Errorf("job not started") + } + log.Printf("JobConnect: streamid=%s seq=%d\n", data.StreamId, data.Seq) + return nil +} + +func (impl *JobServerImpl) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { + if !impl.Authenticated { + return fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd == nil { + return fmt.Errorf("job not started") + } + log.Printf("JobTerminate called\n") + return nil +} + func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error { WshCmdJobManager.ClientId = clientId WshCmdJobManager.JobId = jobId diff --git a/pkg/jobmanager/jobmanager_unix.go 
b/pkg/jobmanager/jobmanager_unix.go new file mode 100644 index 0000000000..a70dec72e1 --- /dev/null +++ b/pkg/jobmanager/jobmanager_unix.go @@ -0,0 +1,48 @@ +// Copyright 2026, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +//go:build unix + +package jobmanager + +import ( + "os" + "strings" + "syscall" +) + +func getProcessGroupId(pid int) (int, error) { + pgid, err := syscall.Getpgid(pid) + if err != nil { + return 0, err + } + return pgid, nil +} + +func normalizeSignal(sigName string) os.Signal { + sigName = strings.ToUpper(sigName) + sigName = strings.TrimPrefix(sigName, "SIG") + + switch sigName { + case "HUP": + return syscall.SIGHUP + case "INT": + return syscall.SIGINT + case "QUIT": + return syscall.SIGQUIT + case "KILL": + return syscall.SIGKILL + case "TERM": + return syscall.SIGTERM + case "USR1": + return syscall.SIGUSR1 + case "USR2": + return syscall.SIGUSR2 + case "STOP": + return syscall.SIGSTOP + case "CONT": + return syscall.SIGCONT + default: + return nil + } +} diff --git a/pkg/jobmanager/jobmanager_windows.go b/pkg/jobmanager/jobmanager_windows.go new file mode 100644 index 0000000000..7ce8d358fa --- /dev/null +++ b/pkg/jobmanager/jobmanager_windows.go @@ -0,0 +1,19 @@ +// Copyright 2026, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +//go:build windows + +package jobmanager + +import ( + "fmt" + "os" +) + +func getProcessGroupId(pid int) (int, error) { + return 0, fmt.Errorf("process group id not supported on windows") +} + +func normalizeSignal(sigName string) os.Signal { + return nil +} diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index ec938b5feb..c1a91aca07 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -35,6 +35,18 @@ func AuthenticateCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) ( return resp, err } +// command "authenticatejobmanager", wshserver.AuthenticateJobManagerCommand +func AuthenticateJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandAuthenticateJobManagerData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "authenticatejobmanager", data, opts) + return err +} + +// command "authenticatejobmanagerverify", wshserver.AuthenticateJobManagerVerifyCommand +func AuthenticateJobManagerVerifyCommand(w *wshutil.WshRpc, data wshrpc.CommandAuthenticateJobManagerData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "authenticatejobmanagerverify", data, opts) + return err +} + // command "authenticatetojobmanager", wshserver.AuthenticateToJobManagerCommand func AuthenticateToJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandAuthenticateToJobData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "authenticatetojobmanager", data, opts) @@ -464,6 +476,18 @@ func GetWaveAIRateLimitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) (*uctype return resp, err } +// command "jobconnect", wshserver.JobConnectCommand +func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobconnect", data, opts) + return err +} + +// command "jobterminate", wshserver.JobTerminateCommand +func 
JobTerminateCommand(w *wshutil.WshRpc, data wshrpc.CommandJobTerminateData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobterminate", data, opts) + return err +} + // command "listallappfiles", wshserver.ListAllAppFilesCommand func ListAllAppFilesCommand(w *wshutil.WshRpc, data wshrpc.CommandListAllAppFilesData, opts *wshrpc.RpcOpts) (*wshrpc.CommandListAllAppFilesRtnData, error) { resp, err := sendRpcRequestCallHelper[*wshrpc.CommandListAllAppFilesRtnData](w, "listallappfiles", data, opts) @@ -694,6 +718,12 @@ func StartBuilderCommand(w *wshutil.WshRpc, data wshrpc.CommandStartBuilderData, return err } +// command "startjob", wshserver.StartJobCommand +func StartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandStartJobData, opts *wshrpc.RpcOpts) (*wshrpc.CommandStartJobRtnData, error) { + resp, err := sendRpcRequestCallHelper[*wshrpc.CommandStartJobRtnData](w, "startjob", data, opts) + return resp, err +} + // command "stopbuilder", wshserver.StopBuilderCommand func StopBuilderCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "stopbuilder", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index c373c3dfef..b14a3208c2 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -26,6 +26,8 @@ type WshRpcInterface interface { AuthenticateCommand(ctx context.Context, data string) (CommandAuthenticateRtnData, error) AuthenticateTokenCommand(ctx context.Context, data CommandAuthenticateTokenData) (CommandAuthenticateRtnData, error) AuthenticateTokenVerifyCommand(ctx context.Context, data CommandAuthenticateTokenData) (CommandAuthenticateRtnData, error) // (special) validates token without binding, root router only + AuthenticateJobManagerCommand(ctx context.Context, data CommandAuthenticateJobManagerData) error + AuthenticateJobManagerVerifyCommand(ctx context.Context, data CommandAuthenticateJobManagerData) error // (special) 
validates job auth token without binding, root router only DisposeCommand(ctx context.Context, data CommandDisposeData) error RouteAnnounceCommand(ctx context.Context) error // (special) announces a new route to the main router RouteUnannounceCommand(ctx context.Context) error // (special) unannounces a route to the main router @@ -157,6 +159,9 @@ type WshRpcInterface interface { // jobs AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) + StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) + JobConnectCommand(ctx context.Context, data CommandJobConnectData) error + JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error } // for frontend @@ -663,3 +668,28 @@ type StreamMeta struct { type CommandAuthenticateToJobData struct { JobAccessToken string `json:"jobaccesstoken"` } + +type CommandAuthenticateJobManagerData struct { + JobId string `json:"jobid"` + JobAuthToken string `json:"jobauthtoken"` +} + +type CommandStartJobData struct { + Cmd string `json:"cmd"` + Args []string `json:"args"` + Env map[string]string `json:"env"` + TermSize waveobj.TermSize `json:"termsize"` + JobAuthToken string `json:"jobauthtoken"` +} + +type CommandStartJobRtnData struct { + Pgid int `json:"pgid"` +} + +type CommandJobConnectData struct { + StreamId string `json:"streamid"` + Seq int64 `json:"seq"` +} + +type CommandJobTerminateData struct { +} diff --git a/pkg/wshrpc/wshrpctypes_const.go b/pkg/wshrpc/wshrpctypes_const.go index 5133b40346..a01d103e8f 100644 --- a/pkg/wshrpc/wshrpctypes_const.go +++ b/pkg/wshrpc/wshrpctypes_const.go @@ -35,13 +35,14 @@ const ( // we only need consts for special commands handled in the router or // in the RPC code / WPS code directly. 
other commands go through the clients const ( - Command_Authenticate = "authenticate" // $control - Command_AuthenticateToken = "authenticatetoken" // $control - Command_AuthenticateTokenVerify = "authenticatetokenverify" // $control:root (internal, for token validation only) - Command_RouteAnnounce = "routeannounce" // $control (for routing) - Command_RouteUnannounce = "routeunannounce" // $control (for routing) - Command_Ping = "ping" // $control - Command_ControllerInput = "controllerinput" - Command_EventRecv = "eventrecv" - Command_Message = "message" + Command_Authenticate = "authenticate" // $control + Command_AuthenticateToken = "authenticatetoken" // $control + Command_AuthenticateTokenVerify = "authenticatetokenverify" // $control:root (internal, for token validation only) + Command_AuthenticateJobManagerVerify = "authenticatejobmanagerverify" // $control:root (internal, for job auth token validation only) + Command_RouteAnnounce = "routeannounce" // $control (for routing) + Command_RouteUnannounce = "routeunannounce" // $control (for routing) + Command_Ping = "ping" // $control + Command_ControllerInput = "controllerinput" + Command_EventRecv = "eventrecv" + Command_Message = "message" ) diff --git a/pkg/wshutil/wshrouter_controlimpl.go b/pkg/wshutil/wshrouter_controlimpl.go index f6f557eabc..0cc29ca2f9 100644 --- a/pkg/wshutil/wshrouter_controlimpl.go +++ b/pkg/wshutil/wshrouter_controlimpl.go @@ -11,7 +11,9 @@ import ( "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/util/shellutil" "github.com/wavetermdev/waveterm/pkg/util/utilfn" + "github.com/wavetermdev/waveterm/pkg/waveobj" "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wstore" ) type WshRouterControlImpl struct { @@ -102,6 +104,46 @@ func (impl *WshRouterControlImpl) AuthenticateCommand(ctx context.Context, data return rtnData, nil } +func extractTokenData(token string) (wshrpc.CommandAuthenticateRtnData, error) { + entry := 
shellutil.GetAndRemoveTokenSwapEntry(token) + if entry == nil { + return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no token entry found") + } + _, err := validateRpcContextFromAuth(entry.RpcContext) + if err != nil { + return wshrpc.CommandAuthenticateRtnData{}, err + } + if entry.RpcContext.IsRouter { + return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("cannot auth router via token") + } + if entry.RpcContext.RouteId == "" { + return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no routeid") + } + return wshrpc.CommandAuthenticateRtnData{ + Env: entry.Env, + InitScriptText: entry.ScriptText, + RpcContext: entry.RpcContext, + }, nil +} + +func (impl *WshRouterControlImpl) AuthenticateTokenVerifyCommand(ctx context.Context, data wshrpc.CommandAuthenticateTokenData) (wshrpc.CommandAuthenticateRtnData, error) { + if !impl.Router.IsRootRouter() { + return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("authenticatetokenverify can only be called on root router") + } + if data.Token == "" { + return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no token in authenticatetoken message") + } + + rtnData, err := extractTokenData(data.Token) + if err != nil { + log.Printf("wshrouter authenticate-token-verify error: %v", err) + return wshrpc.CommandAuthenticateRtnData{}, err + } + + log.Printf("wshrouter authenticate-token-verify success routeid=%q", rtnData.RpcContext.RouteId) + return rtnData, nil +} + func (impl *WshRouterControlImpl) AuthenticateTokenCommand(ctx context.Context, data wshrpc.CommandAuthenticateTokenData) (wshrpc.CommandAuthenticateRtnData, error) { handler := GetRpcResponseHandlerFromContext(ctx) if handler == nil { @@ -117,29 +159,14 @@ func (impl *WshRouterControlImpl) AuthenticateTokenCommand(ctx context.Context, } var rtnData wshrpc.CommandAuthenticateRtnData - var rpcContext *wshrpc.RpcContext + var err error + if impl.Router.IsRootRouter() { - entry := shellutil.GetAndRemoveTokenSwapEntry(data.Token) - if entry == nil { - 
log.Printf("wshrouter authenticate-token error linkid=%d: no token entry found", linkId) - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no token entry found") - } - _, err := validateRpcContextFromAuth(entry.RpcContext) + rtnData, err = extractTokenData(data.Token) if err != nil { + log.Printf("wshrouter authenticate-token error linkid=%d: %v", linkId, err) return wshrpc.CommandAuthenticateRtnData{}, err } - if entry.RpcContext.IsRouter { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("cannot auth router via token") - } - if entry.RpcContext.RouteId == "" { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no routeid") - } - rpcContext = entry.RpcContext - rtnData = wshrpc.CommandAuthenticateRtnData{ - Env: entry.Env, - InitScriptText: entry.ScriptText, - RpcContext: rpcContext, - } } else { wshRpc := GetWshRpcFromContext(ctx) if wshRpc == nil { @@ -154,51 +181,91 @@ func (impl *WshRouterControlImpl) AuthenticateTokenCommand(ctx context.Context, if err != nil { return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("failed to unmarshal response: %w", err) } - rpcContext = rtnData.RpcContext } - if rpcContext == nil { + if rtnData.RpcContext == nil { return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no rpccontext in token response") } - log.Printf("wshrouter authenticate-token success linkid=%d routeid=%q", linkId, rpcContext.RouteId) + log.Printf("wshrouter authenticate-token success linkid=%d routeid=%q", linkId, rtnData.RpcContext.RouteId) impl.Router.trustLink(linkId, LinkKind_Leaf) - impl.Router.bindRoute(linkId, rpcContext.RouteId, true) + impl.Router.bindRoute(linkId, rtnData.RpcContext.RouteId, true) return rtnData, nil } -func (impl *WshRouterControlImpl) AuthenticateTokenVerifyCommand(ctx context.Context, data wshrpc.CommandAuthenticateTokenData) (wshrpc.CommandAuthenticateRtnData, error) { +func (impl *WshRouterControlImpl) AuthenticateJobManagerVerifyCommand(ctx context.Context, data 
wshrpc.CommandAuthenticateJobManagerData) error { if !impl.Router.IsRootRouter() { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("authenticatetokenverify can only be called on root router") + return fmt.Errorf("authenticatejobmanagerverify can only be called on root router") } - if data.Token == "" { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no token in authenticatetoken message") + if data.JobId == "" { + return fmt.Errorf("no jobid in authenticatejobmanager message") } - entry := shellutil.GetAndRemoveTokenSwapEntry(data.Token) - if entry == nil { - log.Printf("wshrouter authenticate-token-verify error: no token entry found") - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no token entry found") + if data.JobAuthToken == "" { + return fmt.Errorf("no jobauthtoken in authenticatejobmanager message") } - _, err := validateRpcContextFromAuth(entry.RpcContext) + + job, err := wstore.DBMustGet[*waveobj.Job](ctx, data.JobId) if err != nil { - return wshrpc.CommandAuthenticateRtnData{}, err + log.Printf("wshrouter authenticate-jobmanager-verify error jobid=%q: failed to get job: %v", data.JobId, err) + return fmt.Errorf("failed to get job: %w", err) } - if entry.RpcContext.IsRouter { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("cannot auth router via token") + + if job.JobAuthToken != data.JobAuthToken { + log.Printf("wshrouter authenticate-jobmanager-verify error jobid=%q: invalid jobauthtoken", data.JobId) + return fmt.Errorf("invalid jobauthtoken") } - if entry.RpcContext.RouteId == "" { - return wshrpc.CommandAuthenticateRtnData{}, fmt.Errorf("no routeid") + + log.Printf("wshrouter authenticate-jobmanager-verify success jobid=%q", data.JobId) + return nil +} + +func (impl *WshRouterControlImpl) AuthenticateJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateJobManagerData) error { + handler := GetRpcResponseHandlerFromContext(ctx) + if handler == nil { + return fmt.Errorf("no response handler in 
context") + } + linkId := handler.GetIngressLinkId() + if linkId == baseds.NoLinkId { + return fmt.Errorf("no ingress link found") } - rtnData := wshrpc.CommandAuthenticateRtnData{ - Env: entry.Env, - InitScriptText: entry.ScriptText, - RpcContext: entry.RpcContext, + if data.JobId == "" { + return fmt.Errorf("no jobid in authenticatejobmanager message") + } + if data.JobAuthToken == "" { + return fmt.Errorf("no jobauthtoken in authenticatejobmanager message") } - log.Printf("wshrouter authenticate-token-verify success routeid=%q", entry.RpcContext.RouteId) - return rtnData, nil + if impl.Router.IsRootRouter() { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, data.JobId) + if err != nil { + log.Printf("wshrouter authenticate-jobmanager error linkid=%d jobid=%q: failed to get job: %v", linkId, data.JobId, err) + return fmt.Errorf("failed to get job: %w", err) + } + + if job.JobAuthToken != data.JobAuthToken { + log.Printf("wshrouter authenticate-jobmanager error linkid=%d jobid=%q: invalid jobauthtoken", linkId, data.JobId) + return fmt.Errorf("invalid jobauthtoken") + } + } else { + wshRpc := GetWshRpcFromContext(ctx) + if wshRpc == nil { + return fmt.Errorf("no wshrpc in context") + } + _, err := wshRpc.SendRpcRequest(wshrpc.Command_AuthenticateJobManagerVerify, data, &wshrpc.RpcOpts{Route: ControlRootRoute}) + if err != nil { + log.Printf("wshrouter authenticate-jobmanager error linkid=%d jobid=%q: failed to verify job auth token: %v", linkId, data.JobId, err) + return fmt.Errorf("failed to verify job auth token: %w", err) + } + } + + routeId := MakeJobRouteId(data.JobId) + log.Printf("wshrouter authenticate-jobmanager success linkid=%d jobid=%q routeid=%q", linkId, data.JobId, routeId) + impl.Router.trustLink(linkId, LinkKind_Leaf) + impl.Router.bindRoute(linkId, routeId, true) + + return nil } func validateRpcContextFromAuth(newCtx *wshrpc.RpcContext) (string, error) { From 30c87566ac7444ac944ac1cfb615da2334e9466d Mon Sep 17 00:00:00 2001 From: sawka Date: 
Mon, 12 Jan 2026 13:58:33 -0800 Subject: [PATCH 09/64] checkpoint --- pkg/jobmanager/jobmanager.go | 39 ++++++++++++++++++++++++++++++++---- pkg/waveobj/wtype.go | 2 +- pkg/wshrpc/wshrpctypes.go | 2 +- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 35ed9a15e4..045cb1c57d 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -16,6 +16,7 @@ import ( "github.com/wavetermdev/waveterm/pkg/wavebase" "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" ) @@ -31,26 +32,42 @@ type JobManager struct { type JobServerImpl struct { Authenticated bool + WshRpc *wshutil.WshRpc } func (JobServerImpl) WshServerImpl() {} -func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) { +func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) error { claims, err := wavejwt.ValidateAndExtract(data.JobAccessToken) if err != nil { log.Printf("AuthenticateToJobManager: failed to validate token: %v\n", err) - return + return fmt.Errorf("failed to validate token: %w", err) } if !claims.MainServer { log.Printf("AuthenticateToJobManager: MainServer claim not set\n") - return + return fmt.Errorf("MainServer claim not set") } if claims.JobId != WshCmdJobManager.JobId { log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", WshCmdJobManager.JobId, claims.JobId) - return + return fmt.Errorf("JobId mismatch") } impl.Authenticated = true log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) + + if WshCmdJobManager.JobAuthToken != "" { + authData := wshrpc.CommandAuthenticateJobManagerData{ + JobId: WshCmdJobManager.JobId, + JobAuthToken: WshCmdJobManager.JobAuthToken, 
+ } + err = wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + if err != nil { + log.Printf("AuthenticateToJobManager: failed to authenticate back to server: %v\n", err) + impl.Authenticated = false + return fmt.Errorf("failed to authenticate back to server: %w", err) + } + log.Printf("AuthenticateToJobManager: successfully authenticated back to server\n") + } + return nil } func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { @@ -61,6 +78,19 @@ func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.Comm return nil, fmt.Errorf("job already started") } WshCmdJobManager.JobAuthToken = data.JobAuthToken + + authData := wshrpc.CommandAuthenticateJobManagerData{ + JobId: WshCmdJobManager.JobId, + JobAuthToken: WshCmdJobManager.JobAuthToken, + } + err := wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + if err != nil { + log.Printf("StartJob: failed to authenticate to server: %v\n", err) + WshCmdJobManager.JobAuthToken = "" + return nil, fmt.Errorf("failed to authenticate to server: %w", err) + } + log.Printf("StartJob: successfully authenticated to server\n") + cmdDef := CmdDef{ Cmd: data.Cmd, Args: data.Args, @@ -163,6 +193,7 @@ func handleJobDomainSocketClient(conn net.Conn) { serverImpl := &JobServerImpl{} rpcCtx := wshrpc.RpcContext{} wshRpc := wshutil.MakeWshRpcWithChannels(inputCh, outputCh, rpcCtx, serverImpl, "job-domain") + serverImpl.WshRpc = wshRpc go func() { defer func() { diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index a51297a531..7ffbccb1d4 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -323,7 +323,7 @@ type Job struct { CmdEnv map[string]string `json:"cmdenv,omitempty"` TermSize TermSize `json:"termsize,omitempty"` StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) - Status string 
`json:"status"` // running, done + Status string `json:"status"` // init, running, done ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) ExitCode int `json:"exitcode,omitempty"` ExitSignal string `json:"exitsignal,omitempty"` diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index b14a3208c2..ec244ef136 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -158,7 +158,7 @@ type WshRpcInterface interface { StreamDataAckCommand(ctx context.Context, data CommandStreamAckData) error // jobs - AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) + AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) error StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) JobConnectCommand(ctx context.Context, data CommandJobConnectData) error JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error From c5743eed47e9016adccdf5c5162190540ea5ec79 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 12 Jan 2026 15:27:38 -0800 Subject: [PATCH 10/64] checkpoint --- pkg/jobmanager/jobcmd.go | 14 +++++ pkg/jobmanager/jobmanager.go | 25 ++++++--- pkg/jobmanager/streammanager.go | 80 +++++++++++++++++++++------- pkg/jobmanager/streammanager_test.go | 26 ++++----- 4 files changed, 106 insertions(+), 39 deletions(-) diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index ebba94634c..82691d95ef 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -34,6 +34,7 @@ type JobCmd struct { cmd *exec.Cmd cmdPty pty.Pty cleanedUp bool + ptyClosed bool exitCode int exitSignal string exitErr error @@ -168,6 +169,19 @@ func (jm *JobCmd) readPtyOutput(cmdPty pty.Pty) { // TODO: implement readPtyOutput } +func (jm *JobCmd) Terminate() { + jm.lock.Lock() + defer jm.lock.Unlock() + if jm.ptyClosed { + return + } + if jm.cmdPty != nil { + jm.cmdPty.Close() + jm.ptyClosed = true + log.Printf("pty 
closed for job %s\n", jm.jobId) + } +} + func (jm *JobCmd) Cleanup() { // TODO: implement Cleanup } diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 045cb1c57d..d455767d9c 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -10,6 +10,7 @@ import ( "net" "os" "path/filepath" + "sync" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -23,16 +24,19 @@ import ( var WshCmdJobManager JobManager type JobManager struct { - ClientId string - JobId string - Cmd *JobCmd - JwtPublicKey []byte - JobAuthToken string + ClientId string + JobId string + Cmd *JobCmd + JwtPublicKey []byte + JobAuthToken string + lock sync.Mutex + attachedClient *JobServerImpl } type JobServerImpl struct { Authenticated bool WshRpc *wshutil.WshRpc + Conn net.Conn } func (JobServerImpl) WshServerImpl() {} @@ -67,6 +71,14 @@ func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, } log.Printf("AuthenticateToJobManager: successfully authenticated back to server\n") } + + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + if WshCmdJobManager.attachedClient != nil { + log.Printf("AuthenticateToJobManager: kicking out existing client\n") + WshCmdJobManager.attachedClient.Conn.Close() + } + WshCmdJobManager.attachedClient = impl return nil } @@ -132,6 +144,7 @@ func (impl *JobServerImpl) JobTerminateCommand(ctx context.Context, data wshrpc. 
return fmt.Errorf("job not started") } log.Printf("JobTerminate called\n") + WshCmdJobManager.Cmd.Terminate() return nil } @@ -190,7 +203,7 @@ func handleJobDomainSocketClient(conn net.Conn) { inputCh := make(chan baseds.RpcInputChType, wshutil.DefaultInputChSize) outputCh := make(chan []byte, wshutil.DefaultOutputChSize) - serverImpl := &JobServerImpl{} + serverImpl := &JobServerImpl{Conn: conn} rpcCtx := wshrpc.RpcContext{} wshRpc := wshutil.MakeWshRpcWithChannels(inputCh, outputCh, rpcCtx, serverImpl, "job-domain") serverImpl.WshRpc = wshRpc diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index a48269bc43..7f020942f6 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -35,8 +35,9 @@ type StreamManager struct { streamId string buf *CirBuf - terminalEvent *streamTerminalEvent - terminalEventSent bool + terminalEvent *streamTerminalEvent + terminalEventSent bool + terminalEventAcked bool reader io.Reader readerWg sync.WaitGroup @@ -53,18 +54,13 @@ type StreamManager struct { closed bool } -func MakeStreamManager(streamId string, dataSender DataSender) *StreamManager { - return MakeStreamManagerWithSizes(streamId, dataSender, CwndSize, CirBufSize) +func MakeStreamManager() *StreamManager { + return MakeStreamManagerWithSizes(CwndSize, CirBufSize) } -func MakeStreamManagerWithSizes(streamId string, dataSender DataSender, cwndSize, cirbufSize int) *StreamManager { - if dataSender == nil { - panic("dataSender cannot be nil") - } +func MakeStreamManagerWithSizes(cwndSize, cirbufSize int) *StreamManager { sm := &StreamManager{ - streamId: streamId, buf: MakeCirBuf(cirbufSize, true), - dataSender: dataSender, cwndSize: cwndSize, rwndSize: cwndSize, sentNotAcked: 0, @@ -92,17 +88,42 @@ func (sm *StreamManager) AttachReader(r io.Reader) error { } // ClientConnected transitions to CONNECTED mode -func (sm *StreamManager) ClientConnected(rwndSize int) error { +func (sm *StreamManager) ClientConnected(streamId 
string, dataSender DataSender, rwndSize int, clientSeq int64) (int64, error) { sm.lock.Lock() defer sm.lock.Unlock() if sm.connected { - return nil + return 0, fmt.Errorf("client already connected") + } + + if dataSender == nil { + return 0, fmt.Errorf("dataSender cannot be nil") + } + + headPos := sm.buf.HeadPos() + if clientSeq > headPos { + bytesToConsume := int(clientSeq - headPos) + available := sm.buf.Size() + if bytesToConsume > available { + return 0, fmt.Errorf("client seq %d is beyond our stream end (head=%d, size=%d)", clientSeq, headPos, available) + } + if bytesToConsume > 0 { + if err := sm.buf.Consume(bytesToConsume); err != nil { + return 0, fmt.Errorf("failed to consume buffer: %w", err) + } + headPos = sm.buf.HeadPos() + } } + sm.streamId = streamId + sm.dataSender = dataSender sm.connected = true sm.drained = false sm.rwndSize = rwndSize + sm.sentNotAcked = 0 + if !sm.terminalEventAcked { + sm.terminalEventSent = false + } effectiveWindow := sm.cwndSize if sm.rwndSize < effectiveWindow { effectiveWindow = sm.rwndSize @@ -110,7 +131,12 @@ func (sm *StreamManager) ClientConnected(rwndSize int) error { sm.buf.SetEffectiveWindow(true, effectiveWindow) sm.drainCond.Signal() - return nil + startSeq := headPos + if clientSeq > startSeq { + startSeq = clientSeq + } + + return startSeq, nil } // ClientDisconnected transitions to DISCONNECTED mode @@ -123,6 +149,7 @@ func (sm *StreamManager) ClientDisconnected() { } sm.connected = false + sm.dataSender = nil sm.drainCond.Signal() sm.sentNotAcked = 0 sm.buf.SetEffectiveWindow(false, CirBufSize) @@ -137,6 +164,10 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { return } + if ackPk.Fin { + sm.terminalEventAcked = true + } + seq := ackPk.Seq headPos := sm.buf.HeadPos() if seq < headPos { @@ -178,7 +209,7 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { } if sm.terminalEvent != nil && !sm.terminalEventSent && sm.buf.Size() == 0 && sm.sentNotAcked == 0 { - 
sm.sendTerminalEvent() + sm.sendTerminalEvent_withlock() } } @@ -244,7 +275,7 @@ func (sm *StreamManager) handleEOF() { sm.terminalEvent = &streamTerminalEvent{isEof: true} if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { - sm.sendTerminalEvent() + sm.sendTerminalEvent_withlock() } } @@ -255,7 +286,7 @@ func (sm *StreamManager) handleError(err error) { sm.terminalEvent = &streamTerminalEvent{err: err.Error()} if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { - sm.sendTerminalEvent() + sm.sendTerminalEvent_withlock() } } @@ -278,7 +309,7 @@ func (sm *StreamManager) senderLoop() { if available == 0 { sm.drained = true if sm.terminalEvent != nil && !sm.terminalEventSent && sm.sentNotAcked == 0 { - sm.sendTerminalEvent() + sm.sendTerminalEvent_withlock() } sm.drainCond.Wait() sm.lock.Unlock() @@ -315,28 +346,37 @@ func (sm *StreamManager) senderLoop() { seq := sm.buf.HeadPos() + sm.sentNotAcked sm.sentNotAcked += int64(n) + dataSender := sm.dataSender sm.lock.Unlock() + if dataSender == nil { + continue + } + pkt := wshrpc.CommandStreamData{ Id: sm.streamId, Seq: seq, Data64: base64.StdEncoding.EncodeToString(data), } - sm.dataSender.SendData(pkt) + dataSender.SendData(pkt) } } func (sm *StreamManager) sendBufferData() { sm.lock.Lock() + defer sm.lock.Unlock() sm.drainCond.Signal() - sm.lock.Unlock() } -func (sm *StreamManager) sendTerminalEvent() { +func (sm *StreamManager) sendTerminalEvent_withlock() { if sm.terminalEventSent { return } + if sm.dataSender == nil { + return + } + seq := sm.buf.HeadPos() pkt := wshrpc.CommandStreamData{ Id: sm.streamId, diff --git a/pkg/jobmanager/streammanager_test.go b/pkg/jobmanager/streammanager_test.go index 5f551d77d0..9a0e3c895e 100644 --- a/pkg/jobmanager/streammanager_test.go +++ b/pkg/jobmanager/streammanager_test.go @@ -46,7 +46,7 @@ func decodeData(data64 string) string { func TestBasicDisconnectedMode(t *testing.T) { tw := &testWriter{} - sm := 
MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := strings.NewReader("hello world") err := sm.AttachReader(reader) @@ -66,7 +66,7 @@ func TestBasicDisconnectedMode(t *testing.T) { func TestConnectedModeBasicFlow(t *testing.T) { tw := &testWriter{} - sm := MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := strings.NewReader("hello") err := sm.AttachReader(reader) @@ -74,7 +74,7 @@ func TestConnectedModeBasicFlow(t *testing.T) { t.Fatalf("AttachReader failed: %v", err) } - err = sm.ClientConnected(CwndSize) + _, err = sm.ClientConnected("1", tw, CwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } @@ -121,7 +121,7 @@ func TestConnectedModeBasicFlow(t *testing.T) { func TestDisconnectedToConnectedTransition(t *testing.T) { tw := &testWriter{} - sm := MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := strings.NewReader("test data") err := sm.AttachReader(reader) @@ -131,7 +131,7 @@ func TestDisconnectedToConnectedTransition(t *testing.T) { time.Sleep(100 * time.Millisecond) - err = sm.ClientConnected(CwndSize) + _, err = sm.ClientConnected("1", tw, CwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } @@ -159,7 +159,7 @@ func TestDisconnectedToConnectedTransition(t *testing.T) { func TestConnectedToDisconnectedTransition(t *testing.T) { tw := &testWriter{} - sm := MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := &slowReader{data: []byte("slow data"), delay: 50 * time.Millisecond} err := sm.AttachReader(reader) @@ -167,7 +167,7 @@ func TestConnectedToDisconnectedTransition(t *testing.T) { t.Fatalf("AttachReader failed: %v", err) } - err = sm.ClientConnected(CwndSize) + _, err = sm.ClientConnected("1", tw, CwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } @@ -184,7 +184,7 @@ func TestConnectedToDisconnectedTransition(t *testing.T) { func TestFlowControl(t *testing.T) { cwndSize := 1024 tw := &testWriter{} - sm := MakeStreamManagerWithSizes("1", 
tw, cwndSize, 8*1024) + sm := MakeStreamManagerWithSizes(cwndSize, 8*1024) largeData := strings.Repeat("x", cwndSize+500) reader := strings.NewReader(largeData) @@ -194,7 +194,7 @@ func TestFlowControl(t *testing.T) { t.Fatalf("AttachReader failed: %v", err) } - err = sm.ClientConnected(cwndSize) + _, err = sm.ClientConnected("1", tw, cwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } @@ -223,7 +223,7 @@ func TestFlowControl(t *testing.T) { func TestSequenceNumbering(t *testing.T) { tw := &testWriter{} - sm := MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := strings.NewReader("abcdefghij") err := sm.AttachReader(reader) @@ -231,7 +231,7 @@ func TestSequenceNumbering(t *testing.T) { t.Fatalf("AttachReader failed: %v", err) } - err = sm.ClientConnected(CwndSize) + _, err = sm.ClientConnected("1", tw, CwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } @@ -262,7 +262,7 @@ func TestSequenceNumbering(t *testing.T) { func TestTerminalEventOrdering(t *testing.T) { tw := &testWriter{} - sm := MakeStreamManager("1", tw) + sm := MakeStreamManager() reader := strings.NewReader("data") err := sm.AttachReader(reader) @@ -270,7 +270,7 @@ func TestTerminalEventOrdering(t *testing.T) { t.Fatalf("AttachReader failed: %v", err) } - err = sm.ClientConnected(CwndSize) + _, err = sm.ClientConnected("1", tw, CwndSize, 0) if err != nil { t.Fatalf("ClientConnected failed: %v", err) } From e09d90075b04939ee01dc40422e3cbb88b3a297d Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 10:44:53 -0800 Subject: [PATCH 11/64] job manager start/connect sync fixed, checkpoint --- frontend/app/store/wshclientapi.ts | 2 +- frontend/types/gotypes.d.ts | 16 +- pkg/jobmanager/jobmanager.go | 236 +++++++++++++++++++++++------ pkg/jobmanager/streammanager.go | 3 +- pkg/wshrpc/wshclient/wshclient.go | 6 +- pkg/wshrpc/wshrpctypes.go | 11 +- 6 files changed, 217 insertions(+), 57 deletions(-) diff --git a/frontend/app/store/wshclientapi.ts 
b/frontend/app/store/wshclientapi.ts index b871bb7065..7bc7e48f1d 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -393,7 +393,7 @@ class RpcApiType { } // command "jobconnect" [call] - JobConnectCommand(client: WshClient, data: CommandJobConnectData, opts?: RpcOpts): Promise { + JobConnectCommand(client: WshClient, data: CommandJobConnectData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobconnect", data, opts); } diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 9258d1324a..a0eea7bba5 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -356,7 +356,12 @@ declare global { // wshrpc.CommandJobConnectData type CommandJobConnectData = { - streamid: string; + streammeta: StreamMeta; + seq: number; + }; + + // wshrpc.CommandJobConnectRtnData + type CommandJobConnectRtnData = { seq: number; }; @@ -489,6 +494,7 @@ declare global { env: {[key: string]: string}; termsize: TermSize; jobauthtoken: string; + streammeta?: StreamMeta; }; // wshrpc.CommandStartJobRtnData @@ -1242,6 +1248,14 @@ declare global { display: StickerDisplayOptsType; }; + // wshrpc.StreamMeta + type StreamMeta = { + id: string; + rwnd: number; + readerrouteid: string; + writerrouteid: string; + }; + // wps.SubscriptionRequest type SubscriptionRequest = { event: string; diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index d455767d9c..ec309bd2d5 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -11,6 +11,7 @@ import ( "os" "path/filepath" "sync" + "sync/atomic" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -24,24 +25,78 @@ import ( var WshCmdJobManager JobManager type JobManager struct { - ClientId string - JobId string - Cmd *JobCmd - JwtPublicKey []byte - JobAuthToken string - lock sync.Mutex - attachedClient *JobServerImpl + ClientId string + JobId string + Cmd *JobCmd + JwtPublicKey []byte + 
JobAuthToken string + StreamManager *StreamManager + lock sync.Mutex + attachedClient *JobServerImpl + connectedStreamClient *JobServerImpl } type JobServerImpl struct { - Authenticated bool - WshRpc *wshutil.WshRpc - Conn net.Conn + PeerAuthenticated atomic.Bool + SelfAuthenticated atomic.Bool + WshRpc *wshutil.WshRpc + Conn net.Conn + inputCh chan baseds.RpcInputChType + closeOnce sync.Once } -func (JobServerImpl) WshServerImpl() {} +func (*JobServerImpl) WshServerImpl() {} + +func (impl *JobServerImpl) Close() { + impl.closeOnce.Do(func() { + impl.Conn.Close() + close(impl.inputCh) + }) +} + +type routedDataSender struct { + wshRpc *wshutil.WshRpc + route string +} + +func (rds *routedDataSender) SendData(dataPk wshrpc.CommandStreamData) { + err := wshclient.StreamDataCommand(rds.wshRpc, dataPk, &wshrpc.RpcOpts{NoResponse: true, Route: rds.route}) + if err != nil { + log.Printf("SendData: error sending stream data: %v\n", err) + } +} + +func (jm *JobManager) GetJobAuthInfo() (string, string) { + jm.lock.Lock() + defer jm.lock.Unlock() + return jm.JobId, jm.JobAuthToken +} + +func (jm *JobManager) IsJobStarted() bool { + jm.lock.Lock() + defer jm.lock.Unlock() + return jm.Cmd != nil +} + +func (impl *JobServerImpl) authenticateSelfToServer(jobAuthToken string) error { + jobId, _ := WshCmdJobManager.GetJobAuthInfo() + authData := wshrpc.CommandAuthenticateJobManagerData{ + JobId: jobId, + JobAuthToken: jobAuthToken, + } + err := wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + if err != nil { + log.Printf("authenticateSelfToServer: failed to authenticate to server: %v\n", err) + return fmt.Errorf("failed to authenticate to server: %w", err) + } + impl.SelfAuthenticated.Store(true) + log.Printf("authenticateSelfToServer: successfully authenticated to server\n") + return nil +} func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) error { 
+ jobId, jobAuthToken := WshCmdJobManager.GetJobAuthInfo() + claims, err := wavejwt.ValidateAndExtract(data.JobAccessToken) if err != nil { log.Printf("AuthenticateToJobManager: failed to validate token: %v\n", err) @@ -51,57 +106,92 @@ func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, log.Printf("AuthenticateToJobManager: MainServer claim not set\n") return fmt.Errorf("MainServer claim not set") } - if claims.JobId != WshCmdJobManager.JobId { - log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", WshCmdJobManager.JobId, claims.JobId) + if claims.JobId != jobId { + log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", jobId, claims.JobId) return fmt.Errorf("JobId mismatch") } - impl.Authenticated = true + impl.PeerAuthenticated.Store(true) log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) - if WshCmdJobManager.JobAuthToken != "" { - authData := wshrpc.CommandAuthenticateJobManagerData{ - JobId: WshCmdJobManager.JobId, - JobAuthToken: WshCmdJobManager.JobAuthToken, - } - err = wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + if jobAuthToken != "" { + err = impl.authenticateSelfToServer(jobAuthToken) if err != nil { - log.Printf("AuthenticateToJobManager: failed to authenticate back to server: %v\n", err) - impl.Authenticated = false - return fmt.Errorf("failed to authenticate back to server: %w", err) + impl.PeerAuthenticated.Store(false) + return err } - log.Printf("AuthenticateToJobManager: successfully authenticated back to server\n") } WshCmdJobManager.lock.Lock() defer WshCmdJobManager.lock.Unlock() + if WshCmdJobManager.attachedClient != nil { log.Printf("AuthenticateToJobManager: kicking out existing client\n") - WshCmdJobManager.attachedClient.Conn.Close() + WshCmdJobManager.attachedClient.Close() } WshCmdJobManager.attachedClient = impl return nil } +func (jm *JobManager) 
connectToStreamHelper_withlock(jobServerImpl *JobServerImpl, streamMeta wshrpc.StreamMeta, seq int64) (int64, error) { + rwndSize := int(streamMeta.RWnd) + if rwndSize < 0 { + return 0, fmt.Errorf("invalid rwnd size: %d", rwndSize) + } + + if jm.connectedStreamClient != nil { + log.Printf("connectToStreamHelper: disconnecting existing client\n") + jm.StreamManager.ClientDisconnected() + jm.connectedStreamClient = nil + } + dataSender := &routedDataSender{ + wshRpc: jobServerImpl.WshRpc, + route: streamMeta.ReaderRouteId, + } + serverSeq, err := jm.StreamManager.ClientConnected( + streamMeta.Id, + dataSender, + rwndSize, + seq, + ) + if err != nil { + return 0, fmt.Errorf("failed to connect client: %w", err) + } + jm.connectedStreamClient = jobServerImpl + return serverSeq, nil +} + +func (jm *JobManager) disconnectFromStreamHelper(jobServerImpl *JobServerImpl) { + jm.lock.Lock() + defer jm.lock.Unlock() + if jm.connectedStreamClient == nil || jm.connectedStreamClient != jobServerImpl { + return + } + jm.StreamManager.ClientDisconnected() + jm.connectedStreamClient = nil +} + func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { - if !impl.Authenticated { + if !impl.PeerAuthenticated.Load() { return nil, fmt.Errorf("not authenticated") } - if WshCmdJobManager.Cmd != nil { + if WshCmdJobManager.IsJobStarted() { return nil, fmt.Errorf("job already started") } - WshCmdJobManager.JobAuthToken = data.JobAuthToken - authData := wshrpc.CommandAuthenticateJobManagerData{ - JobId: WshCmdJobManager.JobId, - JobAuthToken: WshCmdJobManager.JobAuthToken, - } - err := wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + err := impl.authenticateSelfToServer(data.JobAuthToken) if err != nil { - log.Printf("StartJob: failed to authenticate to server: %v\n", err) - WshCmdJobManager.JobAuthToken = "" - return nil, fmt.Errorf("failed to 
authenticate to server: %w", err) + return nil, err } - log.Printf("StartJob: successfully authenticated to server\n") + + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if WshCmdJobManager.Cmd != nil { + // we must re-check this with the lock for proper sync + return nil, fmt.Errorf("job already started") + } + + WshCmdJobManager.JobAuthToken = data.JobAuthToken cmdDef := CmdDef{ Cmd: data.Cmd, @@ -114,6 +204,23 @@ func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.Comm return nil, fmt.Errorf("failed to start job: %w", err) } WshCmdJobManager.Cmd = jobCmd + + if data.StreamMeta != nil { + serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(impl, *data.StreamMeta, 0) + if err != nil { + return nil, fmt.Errorf("failed to connect stream: %w", err) + } + log.Printf("StartJob: connected stream streamid=%s serverSeq=%d\n", data.StreamMeta.Id, serverSeq) + } + + _, cmdPty := jobCmd.GetCmd() + if cmdPty != nil { + err = WshCmdJobManager.StreamManager.AttachReader(cmdPty) + if err != nil { + return nil, fmt.Errorf("failed to attach reader to stream manager: %w", err) + } + } + cmd, _ := jobCmd.GetCmd() if cmd == nil || cmd.Process == nil { return nil, fmt.Errorf("cmd or process is nil") @@ -125,19 +232,49 @@ func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.Comm return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil } -func (impl *JobServerImpl) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) error { - if !impl.Authenticated { - return fmt.Errorf("not authenticated") +func (impl *JobServerImpl) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) (*wshrpc.CommandJobConnectRtnData, error) { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !impl.PeerAuthenticated.Load() { + return nil, fmt.Errorf("peer not authenticated") + } + if !impl.SelfAuthenticated.Load() { + return nil, fmt.Errorf("not authenticated to 
server") } if WshCmdJobManager.Cmd == nil { - return fmt.Errorf("job not started") + return nil, fmt.Errorf("job not started") } - log.Printf("JobConnect: streamid=%s seq=%d\n", data.StreamId, data.Seq) + + serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(impl, data.StreamMeta, data.Seq) + if err != nil { + return nil, err + } + + log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d\n", data.StreamMeta.Id, data.Seq, serverSeq) + return &wshrpc.CommandJobConnectRtnData{Seq: serverSeq}, nil +} + +func (impl *JobServerImpl) StreamDataAckCommand(ctx context.Context, data wshrpc.CommandStreamAckData) error { + // bad acks do NOT get error packets created (to avoid infinite loops). + // they should be silently ignored + if !impl.PeerAuthenticated.Load() { + return nil + } + if !impl.SelfAuthenticated.Load() { + return nil + } + // this is safe without locking because streamids are unique, and StreamManager will ignore an ack + // when not connected or when the streamid does not match + WshCmdJobManager.StreamManager.RecvAck(data) return nil } func (impl *JobServerImpl) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { - if !impl.Authenticated { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !impl.PeerAuthenticated.Load() { return fmt.Errorf("not authenticated") } if WshCmdJobManager.Cmd == nil { @@ -152,6 +289,7 @@ func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error WshCmdJobManager.ClientId = clientId WshCmdJobManager.JobId = jobId WshCmdJobManager.JwtPublicKey = publicKeyBytes + WshCmdJobManager.StreamManager = MakeStreamManager() err := wavejwt.SetPublicKey(publicKeyBytes) if err != nil { return fmt.Errorf("failed to set public key: %w", err) @@ -203,15 +341,20 @@ func handleJobDomainSocketClient(conn net.Conn) { inputCh := make(chan baseds.RpcInputChType, wshutil.DefaultInputChSize) outputCh := make(chan []byte, wshutil.DefaultOutputChSize) - 
serverImpl := &JobServerImpl{Conn: conn} + serverImpl := &JobServerImpl{ + Conn: conn, + inputCh: inputCh, + } rpcCtx := wshrpc.RpcContext{} wshRpc := wshutil.MakeWshRpcWithChannels(inputCh, outputCh, rpcCtx, serverImpl, "job-domain") serverImpl.WshRpc = wshRpc + defer WshCmdJobManager.disconnectFromStreamHelper(serverImpl) go func() { defer func() { panichandler.PanicHandler("handleJobDomainSocketClient:AdaptOutputChToStream", recover()) }() + defer serverImpl.Close() writeErr := wshutil.AdaptOutputChToStream(outputCh, conn) if writeErr != nil { log.Printf("error writing to domain socket: %v\n", writeErr) @@ -222,10 +365,7 @@ func handleJobDomainSocketClient(conn net.Conn) { defer func() { panichandler.PanicHandler("handleJobDomainSocketClient:AdaptStreamToMsgCh", recover()) }() - defer func() { - conn.Close() - close(inputCh) - }() + defer serverImpl.Close() wshutil.AdaptStreamToMsgCh(conn, inputCh) }() diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index 7f020942f6..0edfc802bb 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -156,11 +156,12 @@ func (sm *StreamManager) ClientDisconnected() { } // RecvAck processes an ACK from the client +// must be connected, and streamid must match func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { sm.lock.Lock() defer sm.lock.Unlock() - if !sm.connected { + if !sm.connected || ackPk.Id != sm.streamId { return } diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index c1a91aca07..4918a99211 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -477,9 +477,9 @@ func GetWaveAIRateLimitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) (*uctype } // command "jobconnect", wshserver.JobConnectCommand -func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobconnect", data, opts) - 
return err +func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opts *wshrpc.RpcOpts) (*wshrpc.CommandJobConnectRtnData, error) { + resp, err := sendRpcRequestCallHelper[*wshrpc.CommandJobConnectRtnData](w, "jobconnect", data, opts) + return resp, err } // command "jobterminate", wshserver.JobTerminateCommand diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index ec244ef136..d6c7919a2a 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -160,7 +160,7 @@ type WshRpcInterface interface { // jobs AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) error StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) - JobConnectCommand(ctx context.Context, data CommandJobConnectData) error + JobConnectCommand(ctx context.Context, data CommandJobConnectData) (*CommandJobConnectRtnData, error) JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error } @@ -680,6 +680,7 @@ type CommandStartJobData struct { Env map[string]string `json:"env"` TermSize waveobj.TermSize `json:"termsize"` JobAuthToken string `json:"jobauthtoken"` + StreamMeta *StreamMeta `json:"streammeta,omitempty"` } type CommandStartJobRtnData struct { @@ -687,8 +688,12 @@ type CommandStartJobRtnData struct { } type CommandJobConnectData struct { - StreamId string `json:"streamid"` - Seq int64 `json:"seq"` + StreamMeta StreamMeta `json:"streammeta"` + Seq int64 `json:"seq"` +} + +type CommandJobConnectRtnData struct { + Seq int64 `json:"seq"` } type CommandJobTerminateData struct { From bada6d9cae62350be853776fe8c0b73b3b782731 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 13:42:26 -0800 Subject: [PATCH 12/64] split up jobmanager file --- pkg/jobmanager/jobmanager.go | 242 +++---------------------------- pkg/jobmanager/mainserverconn.go | 213 +++++++++++++++++++++++++++ pkg/waveobj/wtype.go | 3 +- 3 files changed, 234 insertions(+), 224 
deletions(-) create mode 100644 pkg/jobmanager/mainserverconn.go diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index ec309bd2d5..51ea19ce9e 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -4,21 +4,18 @@ package jobmanager import ( - "context" "fmt" "log" "net" "os" "path/filepath" "sync" - "sync/atomic" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" "github.com/wavetermdev/waveterm/pkg/wavebase" "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/wshrpc" - "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" ) @@ -32,38 +29,24 @@ type JobManager struct { JobAuthToken string StreamManager *StreamManager lock sync.Mutex - attachedClient *JobServerImpl - connectedStreamClient *JobServerImpl + attachedClient *MainServerConn + connectedStreamClient *MainServerConn } -type JobServerImpl struct { - PeerAuthenticated atomic.Bool - SelfAuthenticated atomic.Bool - WshRpc *wshutil.WshRpc - Conn net.Conn - inputCh chan baseds.RpcInputChType - closeOnce sync.Once -} - -func (*JobServerImpl) WshServerImpl() {} - -func (impl *JobServerImpl) Close() { - impl.closeOnce.Do(func() { - impl.Conn.Close() - close(impl.inputCh) - }) -} - -type routedDataSender struct { - wshRpc *wshutil.WshRpc - route string -} - -func (rds *routedDataSender) SendData(dataPk wshrpc.CommandStreamData) { - err := wshclient.StreamDataCommand(rds.wshRpc, dataPk, &wshrpc.RpcOpts{NoResponse: true, Route: rds.route}) +func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error { + WshCmdJobManager.ClientId = clientId + WshCmdJobManager.JobId = jobId + WshCmdJobManager.JwtPublicKey = publicKeyBytes + WshCmdJobManager.StreamManager = MakeStreamManager() + err := wavejwt.SetPublicKey(publicKeyBytes) if err != nil { - log.Printf("SendData: error sending stream data: %v\n", err) + return fmt.Errorf("failed to 
set public key: %w", err) } + err = MakeJobDomainSocket(clientId, jobId) + if err != nil { + return err + } + return nil } func (jm *JobManager) GetJobAuthInfo() (string, string) { @@ -78,61 +61,7 @@ func (jm *JobManager) IsJobStarted() bool { return jm.Cmd != nil } -func (impl *JobServerImpl) authenticateSelfToServer(jobAuthToken string) error { - jobId, _ := WshCmdJobManager.GetJobAuthInfo() - authData := wshrpc.CommandAuthenticateJobManagerData{ - JobId: jobId, - JobAuthToken: jobAuthToken, - } - err := wshclient.AuthenticateJobManagerCommand(impl.WshRpc, authData, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) - if err != nil { - log.Printf("authenticateSelfToServer: failed to authenticate to server: %v\n", err) - return fmt.Errorf("failed to authenticate to server: %w", err) - } - impl.SelfAuthenticated.Store(true) - log.Printf("authenticateSelfToServer: successfully authenticated to server\n") - return nil -} - -func (impl *JobServerImpl) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) error { - jobId, jobAuthToken := WshCmdJobManager.GetJobAuthInfo() - - claims, err := wavejwt.ValidateAndExtract(data.JobAccessToken) - if err != nil { - log.Printf("AuthenticateToJobManager: failed to validate token: %v\n", err) - return fmt.Errorf("failed to validate token: %w", err) - } - if !claims.MainServer { - log.Printf("AuthenticateToJobManager: MainServer claim not set\n") - return fmt.Errorf("MainServer claim not set") - } - if claims.JobId != jobId { - log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", jobId, claims.JobId) - return fmt.Errorf("JobId mismatch") - } - impl.PeerAuthenticated.Store(true) - log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) - - if jobAuthToken != "" { - err = impl.authenticateSelfToServer(jobAuthToken) - if err != nil { - impl.PeerAuthenticated.Store(false) - return err - } - } - - WshCmdJobManager.lock.Lock() - defer 
WshCmdJobManager.lock.Unlock() - - if WshCmdJobManager.attachedClient != nil { - log.Printf("AuthenticateToJobManager: kicking out existing client\n") - WshCmdJobManager.attachedClient.Close() - } - WshCmdJobManager.attachedClient = impl - return nil -} - -func (jm *JobManager) connectToStreamHelper_withlock(jobServerImpl *JobServerImpl, streamMeta wshrpc.StreamMeta, seq int64) (int64, error) { +func (jm *JobManager) connectToStreamHelper_withlock(mainServerConn *MainServerConn, streamMeta wshrpc.StreamMeta, seq int64) (int64, error) { rwndSize := int(streamMeta.RWnd) if rwndSize < 0 { return 0, fmt.Errorf("invalid rwnd size: %d", rwndSize) @@ -144,7 +73,7 @@ func (jm *JobManager) connectToStreamHelper_withlock(jobServerImpl *JobServerImp jm.connectedStreamClient = nil } dataSender := &routedDataSender{ - wshRpc: jobServerImpl.WshRpc, + wshRpc: mainServerConn.WshRpc, route: streamMeta.ReaderRouteId, } serverSeq, err := jm.StreamManager.ClientConnected( @@ -156,151 +85,20 @@ func (jm *JobManager) connectToStreamHelper_withlock(jobServerImpl *JobServerImp if err != nil { return 0, fmt.Errorf("failed to connect client: %w", err) } - jm.connectedStreamClient = jobServerImpl + jm.connectedStreamClient = mainServerConn return serverSeq, nil } -func (jm *JobManager) disconnectFromStreamHelper(jobServerImpl *JobServerImpl) { +func (jm *JobManager) disconnectFromStreamHelper(mainServerConn *MainServerConn) { jm.lock.Lock() defer jm.lock.Unlock() - if jm.connectedStreamClient == nil || jm.connectedStreamClient != jobServerImpl { + if jm.connectedStreamClient == nil || jm.connectedStreamClient != mainServerConn { return } jm.StreamManager.ClientDisconnected() jm.connectedStreamClient = nil } -func (impl *JobServerImpl) StartJobCommand(ctx context.Context, data wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { - if !impl.PeerAuthenticated.Load() { - return nil, fmt.Errorf("not authenticated") - } - if WshCmdJobManager.IsJobStarted() { - return nil, 
fmt.Errorf("job already started") - } - - err := impl.authenticateSelfToServer(data.JobAuthToken) - if err != nil { - return nil, err - } - - WshCmdJobManager.lock.Lock() - defer WshCmdJobManager.lock.Unlock() - - if WshCmdJobManager.Cmd != nil { - // we must re-check this with the lock for proper sync - return nil, fmt.Errorf("job already started") - } - - WshCmdJobManager.JobAuthToken = data.JobAuthToken - - cmdDef := CmdDef{ - Cmd: data.Cmd, - Args: data.Args, - Env: data.Env, - TermSize: data.TermSize, - } - jobCmd, err := MakeJobCmd(WshCmdJobManager.JobId, cmdDef) - if err != nil { - return nil, fmt.Errorf("failed to start job: %w", err) - } - WshCmdJobManager.Cmd = jobCmd - - if data.StreamMeta != nil { - serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(impl, *data.StreamMeta, 0) - if err != nil { - return nil, fmt.Errorf("failed to connect stream: %w", err) - } - log.Printf("StartJob: connected stream streamid=%s serverSeq=%d\n", data.StreamMeta.Id, serverSeq) - } - - _, cmdPty := jobCmd.GetCmd() - if cmdPty != nil { - err = WshCmdJobManager.StreamManager.AttachReader(cmdPty) - if err != nil { - return nil, fmt.Errorf("failed to attach reader to stream manager: %w", err) - } - } - - cmd, _ := jobCmd.GetCmd() - if cmd == nil || cmd.Process == nil { - return nil, fmt.Errorf("cmd or process is nil") - } - pgid, err := getProcessGroupId(cmd.Process.Pid) - if err != nil { - return nil, fmt.Errorf("failed to get process group id: %w", err) - } - return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil -} - -func (impl *JobServerImpl) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) (*wshrpc.CommandJobConnectRtnData, error) { - WshCmdJobManager.lock.Lock() - defer WshCmdJobManager.lock.Unlock() - - if !impl.PeerAuthenticated.Load() { - return nil, fmt.Errorf("peer not authenticated") - } - if !impl.SelfAuthenticated.Load() { - return nil, fmt.Errorf("not authenticated to server") - } - if WshCmdJobManager.Cmd == nil { - 
return nil, fmt.Errorf("job not started") - } - - serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(impl, data.StreamMeta, data.Seq) - if err != nil { - return nil, err - } - - log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d\n", data.StreamMeta.Id, data.Seq, serverSeq) - return &wshrpc.CommandJobConnectRtnData{Seq: serverSeq}, nil -} - -func (impl *JobServerImpl) StreamDataAckCommand(ctx context.Context, data wshrpc.CommandStreamAckData) error { - // bad acks do NOT get error packets created (to avoid infinite loops). - // they should be silently ignored - if !impl.PeerAuthenticated.Load() { - return nil - } - if !impl.SelfAuthenticated.Load() { - return nil - } - // this is safe without locking because streamids are unique, and StreamManager will ignore an ack - // when not connected or when the streamid does not match - WshCmdJobManager.StreamManager.RecvAck(data) - return nil -} - -func (impl *JobServerImpl) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { - WshCmdJobManager.lock.Lock() - defer WshCmdJobManager.lock.Unlock() - - if !impl.PeerAuthenticated.Load() { - return fmt.Errorf("not authenticated") - } - if WshCmdJobManager.Cmd == nil { - return fmt.Errorf("job not started") - } - log.Printf("JobTerminate called\n") - WshCmdJobManager.Cmd.Terminate() - return nil -} - -func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error { - WshCmdJobManager.ClientId = clientId - WshCmdJobManager.JobId = jobId - WshCmdJobManager.JwtPublicKey = publicKeyBytes - WshCmdJobManager.StreamManager = MakeStreamManager() - err := wavejwt.SetPublicKey(publicKeyBytes) - if err != nil { - return fmt.Errorf("failed to set public key: %w", err) - } - err = MakeJobDomainSocket(clientId, jobId) - if err != nil { - return err - } - return nil -} - func MakeJobDomainSocket(clientId string, jobId string) error { homeDir := wavebase.GetHomeDir() socketDir := filepath.Join(homeDir, ".waveterm", 
"jobs", clientId) @@ -341,7 +139,7 @@ func handleJobDomainSocketClient(conn net.Conn) { inputCh := make(chan baseds.RpcInputChType, wshutil.DefaultInputChSize) outputCh := make(chan []byte, wshutil.DefaultOutputChSize) - serverImpl := &JobServerImpl{ + serverImpl := &MainServerConn{ Conn: conn, inputCh: inputCh, } diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go new file mode 100644 index 0000000000..ee659aa564 --- /dev/null +++ b/pkg/jobmanager/mainserverconn.go @@ -0,0 +1,213 @@ +// Copyright 2026, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package jobmanager + +import ( + "context" + "fmt" + "log" + "net" + "sync" + "sync/atomic" + + "github.com/wavetermdev/waveterm/pkg/baseds" + "github.com/wavetermdev/waveterm/pkg/wavejwt" + "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" + "github.com/wavetermdev/waveterm/pkg/wshutil" +) + +type MainServerConn struct { + PeerAuthenticated atomic.Bool + SelfAuthenticated atomic.Bool + WshRpc *wshutil.WshRpc + Conn net.Conn + inputCh chan baseds.RpcInputChType + closeOnce sync.Once +} + +func (*MainServerConn) WshServerImpl() {} + +func (msc *MainServerConn) Close() { + msc.closeOnce.Do(func() { + msc.Conn.Close() + close(msc.inputCh) + }) +} + +type routedDataSender struct { + wshRpc *wshutil.WshRpc + route string +} + +func (rds *routedDataSender) SendData(dataPk wshrpc.CommandStreamData) { + err := wshclient.StreamDataCommand(rds.wshRpc, dataPk, &wshrpc.RpcOpts{NoResponse: true, Route: rds.route}) + if err != nil { + log.Printf("SendData: error sending stream data: %v\n", err) + } +} + +func (msc *MainServerConn) authenticateSelfToServer(jobAuthToken string) error { + jobId, _ := WshCmdJobManager.GetJobAuthInfo() + authData := wshrpc.CommandAuthenticateJobManagerData{ + JobId: jobId, + JobAuthToken: jobAuthToken, + } + err := wshclient.AuthenticateJobManagerCommand(msc.WshRpc, authData, &wshrpc.RpcOpts{Route: 
wshutil.ControlRoute}) + if err != nil { + log.Printf("authenticateSelfToServer: failed to authenticate to server: %v\n", err) + return fmt.Errorf("failed to authenticate to server: %w", err) + } + msc.SelfAuthenticated.Store(true) + log.Printf("authenticateSelfToServer: successfully authenticated to server\n") + return nil +} + +func (msc *MainServerConn) AuthenticateToJobManagerCommand(ctx context.Context, data wshrpc.CommandAuthenticateToJobData) error { + jobId, jobAuthToken := WshCmdJobManager.GetJobAuthInfo() + + claims, err := wavejwt.ValidateAndExtract(data.JobAccessToken) + if err != nil { + log.Printf("AuthenticateToJobManager: failed to validate token: %v\n", err) + return fmt.Errorf("failed to validate token: %w", err) + } + if !claims.MainServer { + log.Printf("AuthenticateToJobManager: MainServer claim not set\n") + return fmt.Errorf("MainServer claim not set") + } + if claims.JobId != jobId { + log.Printf("AuthenticateToJobManager: JobId mismatch: expected %s, got %s\n", jobId, claims.JobId) + return fmt.Errorf("JobId mismatch") + } + msc.PeerAuthenticated.Store(true) + log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) + + if jobAuthToken != "" { + err = msc.authenticateSelfToServer(jobAuthToken) + if err != nil { + msc.PeerAuthenticated.Store(false) + return err + } + } + + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if WshCmdJobManager.attachedClient != nil { + log.Printf("AuthenticateToJobManager: kicking out existing client\n") + WshCmdJobManager.attachedClient.Close() + } + WshCmdJobManager.attachedClient = msc + return nil +} + +func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + if !msc.PeerAuthenticated.Load() { + return nil, fmt.Errorf("not authenticated") + } + if WshCmdJobManager.IsJobStarted() { + return nil, fmt.Errorf("job already started") + } + + err := 
msc.authenticateSelfToServer(data.JobAuthToken) + if err != nil { + return nil, err + } + + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if WshCmdJobManager.Cmd != nil { + return nil, fmt.Errorf("job already started") + } + + WshCmdJobManager.JobAuthToken = data.JobAuthToken + + cmdDef := CmdDef{ + Cmd: data.Cmd, + Args: data.Args, + Env: data.Env, + TermSize: data.TermSize, + } + jobCmd, err := MakeJobCmd(WshCmdJobManager.JobId, cmdDef) + if err != nil { + return nil, fmt.Errorf("failed to start job: %w", err) + } + WshCmdJobManager.Cmd = jobCmd + + if data.StreamMeta != nil { + serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, *data.StreamMeta, 0) + if err != nil { + return nil, fmt.Errorf("failed to connect stream: %w", err) + } + log.Printf("StartJob: connected stream streamid=%s serverSeq=%d\n", data.StreamMeta.Id, serverSeq) + } + + _, cmdPty := jobCmd.GetCmd() + if cmdPty != nil { + err = WshCmdJobManager.StreamManager.AttachReader(cmdPty) + if err != nil { + return nil, fmt.Errorf("failed to attach reader to stream manager: %w", err) + } + } + + cmd, _ := jobCmd.GetCmd() + if cmd == nil || cmd.Process == nil { + return nil, fmt.Errorf("cmd or process is nil") + } + pgid, err := getProcessGroupId(cmd.Process.Pid) + if err != nil { + return nil, fmt.Errorf("failed to get process group id: %w", err) + } + return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil +} + +func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) (*wshrpc.CommandJobConnectRtnData, error) { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !msc.PeerAuthenticated.Load() { + return nil, fmt.Errorf("peer not authenticated") + } + if !msc.SelfAuthenticated.Load() { + return nil, fmt.Errorf("not authenticated to server") + } + if WshCmdJobManager.Cmd == nil { + return nil, fmt.Errorf("job not started") + } + + serverSeq, err := 
WshCmdJobManager.connectToStreamHelper_withlock(msc, data.StreamMeta, data.Seq) + if err != nil { + return nil, err + } + + log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d\n", data.StreamMeta.Id, data.Seq, serverSeq) + return &wshrpc.CommandJobConnectRtnData{Seq: serverSeq}, nil +} + +func (msc *MainServerConn) StreamDataAckCommand(ctx context.Context, data wshrpc.CommandStreamAckData) error { + if !msc.PeerAuthenticated.Load() { + return nil + } + if !msc.SelfAuthenticated.Load() { + return nil + } + WshCmdJobManager.StreamManager.RecvAck(data) + return nil +} + +func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !msc.PeerAuthenticated.Load() { + return fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd == nil { + return fmt.Errorf("job not started") + } + log.Printf("JobTerminate called\n") + WshCmdJobManager.Cmd.Terminate() + return nil +} diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 7ffbccb1d4..037c295e8e 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -316,8 +316,7 @@ type Job struct { Pgid int `json:"pgid"` // process group id AttachedBlockId string `json:"ownerblockid"` HupOnConnect bool `json:"huponconnect"` - JobAccessToken string `json:"jobaccesstoken"` // wave -> job manager - JobAuthToken string `json:"jobauthtoken"` // job manger -> wave + JobAuthToken string `json:"jobauthtoken"` // job manger -> wave Cmd string `json:"cmd"` CmdArgs []string `json:"cmdargs,omitempty"` CmdEnv map[string]string `json:"cmdenv,omitempty"` From 93bc1dffa9f80edb7461e4b164df0e5cc78766e5 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 16:08:51 -0800 Subject: [PATCH 13/64] set up auth token when starting jobmanager not on jobstart --- cmd/wsh/cmd/wshcmd-jobmanager.go | 53 +++++++++++++++++++++++++++++++- frontend/types/gotypes.d.ts | 2 -- pkg/jobmanager/jobmanager.go | 5 
++- pkg/jobmanager/mainserverconn.go | 17 +++------- pkg/wshrpc/wshrpctypes.go | 11 +++---- 5 files changed, 65 insertions(+), 23 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobmanager.go b/cmd/wsh/cmd/wshcmd-jobmanager.go index 2f7936fc99..ac3959ba83 100644 --- a/cmd/wsh/cmd/wshcmd-jobmanager.go +++ b/cmd/wsh/cmd/wshcmd-jobmanager.go @@ -4,9 +4,13 @@ package cmd import ( + "bufio" + "context" "encoding/base64" "fmt" "os" + "strings" + "time" "github.com/google/uuid" "github.com/spf13/cobra" @@ -53,10 +57,57 @@ func jobManagerRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("failed to decode WAVETERM_PUBLICKEY: %v", err) } - err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + jobAuthToken, err := readJobAuthToken(ctx) + if err != nil { + return fmt.Errorf("failed to read job auth token: %v", err) + } + + err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes, jobAuthToken) if err != nil { return fmt.Errorf("error setting up job manager: %v", err) } select {} } + +func readJobAuthToken(ctx context.Context) (string, error) { + resultCh := make(chan string, 1) + errorCh := make(chan error, 1) + + go func() { + reader := bufio.NewReader(os.Stdin) + line, err := reader.ReadString('\n') + if err != nil { + errorCh <- fmt.Errorf("error reading from stdin: %v", err) + return + } + + line = strings.TrimSpace(line) + prefix := jobmanager.JobAccessTokenLabel + ":" + if !strings.HasPrefix(line, prefix) { + errorCh <- fmt.Errorf("invalid token format: expected '%s'", prefix) + return + } + + token := strings.TrimPrefix(line, prefix) + token = strings.TrimSpace(token) + if token == "" { + errorCh <- fmt.Errorf("empty job auth token") + return + } + + resultCh <- token + }() + + select { + case token := <-resultCh: + return token, nil + case err := <-errorCh: + return "", err + case <-ctx.Done(): + return 
"", ctx.Err() + } +} diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index a0eea7bba5..f9e29aa56a 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -493,7 +493,6 @@ declare global { args: string[]; env: {[key: string]: string}; termsize: TermSize; - jobauthtoken: string; streammeta?: StreamMeta; }; @@ -841,7 +840,6 @@ declare global { pgid: number; ownerblockid: string; huponconnect: boolean; - jobaccesstoken: string; jobauthtoken: string; cmd: string; cmdargs?: string[]; diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 51ea19ce9e..55047a80e0 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -19,6 +19,8 @@ import ( "github.com/wavetermdev/waveterm/pkg/wshutil" ) +const JobAccessTokenLabel = "Wave-JobAccessToken" + var WshCmdJobManager JobManager type JobManager struct { @@ -33,10 +35,11 @@ type JobManager struct { connectedStreamClient *MainServerConn } -func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte) error { +func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAuthToken string) error { WshCmdJobManager.ClientId = clientId WshCmdJobManager.JobId = jobId WshCmdJobManager.JwtPublicKey = publicKeyBytes + WshCmdJobManager.JobAuthToken = jobAuthToken WshCmdJobManager.StreamManager = MakeStreamManager() err := wavejwt.SetPublicKey(publicKeyBytes) if err != nil { diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index ee659aa564..8b4ac25d92 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -83,12 +83,10 @@ func (msc *MainServerConn) AuthenticateToJobManagerCommand(ctx context.Context, msc.PeerAuthenticated.Store(true) log.Printf("AuthenticateToJobManager: authentication successful for JobId=%s\n", claims.JobId) - if jobAuthToken != "" { - err = msc.authenticateSelfToServer(jobAuthToken) - if err != nil { - msc.PeerAuthenticated.Store(false) - 
return err - } + err = msc.authenticateSelfToServer(jobAuthToken) + if err != nil { + msc.PeerAuthenticated.Store(false) + return err } WshCmdJobManager.lock.Lock() @@ -110,11 +108,6 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm return nil, fmt.Errorf("job already started") } - err := msc.authenticateSelfToServer(data.JobAuthToken) - if err != nil { - return nil, err - } - WshCmdJobManager.lock.Lock() defer WshCmdJobManager.lock.Unlock() @@ -122,8 +115,6 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm return nil, fmt.Errorf("job already started") } - WshCmdJobManager.JobAuthToken = data.JobAuthToken - cmdDef := CmdDef{ Cmd: data.Cmd, Args: data.Args, diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index d6c7919a2a..665caf1d3a 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -675,12 +675,11 @@ type CommandAuthenticateJobManagerData struct { } type CommandStartJobData struct { - Cmd string `json:"cmd"` - Args []string `json:"args"` - Env map[string]string `json:"env"` - TermSize waveobj.TermSize `json:"termsize"` - JobAuthToken string `json:"jobauthtoken"` - StreamMeta *StreamMeta `json:"streammeta,omitempty"` + Cmd string `json:"cmd"` + Args []string `json:"args"` + Env map[string]string `json:"env"` + TermSize waveobj.TermSize `json:"termsize"` + StreamMeta *StreamMeta `json:"streammeta,omitempty"` } type CommandStartJobRtnData struct { From 19e79e8548dd9e4d196fcd9e355f90aa04beb99d Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 16:50:37 -0800 Subject: [PATCH 14/64] daemonization, better signal handling --- pkg/jobmanager/jobcmd.go | 73 +++++++++++---------------- pkg/jobmanager/jobmanager.go | 97 +++++++++++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 44 deletions(-) diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index 82691d95ef..e25701be09 100644 --- a/pkg/jobmanager/jobcmd.go +++ 
b/pkg/jobmanager/jobcmd.go @@ -9,18 +9,14 @@ import ( "log" "os" "os/exec" - "os/signal" "sync" "syscall" - "time" "github.com/creack/pty" "github.com/wavetermdev/waveterm/pkg/waveobj" "github.com/wavetermdev/waveterm/pkg/wshrpc" ) -const ShutdownDelayTime = 100 * time.Millisecond - type CmdDef struct { Cmd string Args []string @@ -29,15 +25,16 @@ type CmdDef struct { } type JobCmd struct { - jobId string - lock sync.Mutex - cmd *exec.Cmd - cmdPty pty.Pty - cleanedUp bool - ptyClosed bool - exitCode int - exitSignal string - exitErr error + jobId string + lock sync.Mutex + cmd *exec.Cmd + cmdPty pty.Pty + cleanedUp bool + ptyClosed bool + processExited bool + exitCode int + exitSignal string + exitErr error } func MakeJobCmd(jobId string, cmdDef CmdDef) (*JobCmd, error) { @@ -64,9 +61,7 @@ func MakeJobCmd(jobId string, cmdDef CmdDef) (*JobCmd, error) { } jm.cmd = ecmd jm.cmdPty = cmdPty - go jm.readPtyOutput(cmdPty) go jm.waitForProcess() - jm.setupSignalHandlers() return jm, nil } @@ -78,6 +73,7 @@ func (jm *JobCmd) waitForProcess() { jm.lock.Lock() defer jm.lock.Unlock() + jm.processExited = true jm.exitErr = err if err != nil { if exitErr, ok := err.(*exec.ExitError); ok { @@ -102,6 +98,25 @@ func (jm *JobCmd) GetCmd() (*exec.Cmd, pty.Pty) { return jm.cmd, jm.cmdPty } +func (jm *JobCmd) GetPGID() (int, error) { + jm.lock.Lock() + defer jm.lock.Unlock() + if jm.cmd == nil || jm.cmd.Process == nil { + return 0, fmt.Errorf("no active process") + } + if jm.processExited { + return 0, fmt.Errorf("process already exited") + } + pgid, err := syscall.Getpgid(jm.cmd.Process.Pid) + if err != nil { + return 0, fmt.Errorf("failed to get pgid: %w", err) + } + if pgid <= 0 { + return 0, fmt.Errorf("invalid pgid returned: %d", pgid) + } + return pgid, nil +} + func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { jm.lock.Lock() defer jm.lock.Unlock() @@ -145,30 +160,6 @@ func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { return nil } 
-func (jm *JobCmd) setupSignalHandlers() { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) - - go func() { - sig := <-sigChan - log.Printf("received signal: %v\n", sig) - - cmd, _ := jm.GetCmd() - if cmd != nil && cmd.Process != nil { - log.Printf("forwarding signal %v to child process\n", sig) - cmd.Process.Signal(sig) - time.Sleep(ShutdownDelayTime) - } - - jm.Cleanup() - os.Exit(0) - }() -} - -func (jm *JobCmd) readPtyOutput(cmdPty pty.Pty) { - // TODO: implement readPtyOutput -} - func (jm *JobCmd) Terminate() { jm.lock.Lock() defer jm.lock.Unlock() @@ -181,7 +172,3 @@ func (jm *JobCmd) Terminate() { log.Printf("pty closed for job %s\n", jm.jobId) } } - -func (jm *JobCmd) Cleanup() { - // TODO: implement Cleanup -} diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 55047a80e0..1dbad5e91a 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -8,8 +8,12 @@ import ( "log" "net" "os" + "os/signal" "path/filepath" + "runtime" "sync" + "syscall" + "time" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -20,6 +24,7 @@ import ( ) const JobAccessTokenLabel = "Wave-JobAccessToken" +const JobManagerStartLabel = "Wave-JobManagerStart" var WshCmdJobManager JobManager @@ -36,6 +41,9 @@ type JobManager struct { } func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAuthToken string) error { + if runtime.GOOS != "linux" && runtime.GOOS != "darwin" { + return fmt.Errorf("job manager only supported on unix systems, not %s", runtime.GOOS) + } WshCmdJobManager.ClientId = clientId WshCmdJobManager.JobId = jobId WshCmdJobManager.JwtPublicKey = publicKeyBytes @@ -49,9 +57,90 @@ func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAu if err != nil { return err } + fmt.Fprintf(os.Stdout, JobManagerStartLabel+"\n") + + err = daemonize(clientId, jobId) + if err != nil { + 
return fmt.Errorf("failed to daemonize: %w", err) + } + return nil } +func (jm *JobManager) GetCmd() *JobCmd { + jm.lock.Lock() + defer jm.lock.Unlock() + return jm.Cmd +} + +func daemonize(clientId string, jobId string) error { + devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("failed to open /dev/null: %w", err) + } + err = syscall.Dup2(int(devNull.Fd()), int(os.Stdin.Fd())) + if err != nil { + return fmt.Errorf("failed to dup2 stdin: %w", err) + } + devNull.Close() // dupped so we can close this one + + logPath := getJobFilePath(clientId, jobId, "log") + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) + if err != nil { + return fmt.Errorf("failed to open log file: %w", err) + } + err = syscall.Dup2(int(logFile.Fd()), int(os.Stdout.Fd())) + if err != nil { + return fmt.Errorf("failed to dup2 stdout: %w", err) + } + err = syscall.Dup2(int(logFile.Fd()), int(os.Stderr.Fd())) + if err != nil { + return fmt.Errorf("failed to dup2 stderr: %w", err) + } + logFile.Close() // dupped, so we can close this one + + log.SetOutput(os.Stdout) + log.Printf("job manager daemonized, logging to %s\n", logPath) + + setupJobManagerSignalHandlers() + return nil +} + +func setupJobManagerSignalHandlers() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) + + go func() { + for sig := range sigChan { + log.Printf("job manager received signal: %v\n", sig) + + cmd := WshCmdJobManager.GetCmd() + if cmd != nil { + pgid, err := cmd.GetPGID() + if err == nil { + if s, ok := sig.(syscall.Signal); ok { + log.Printf("forwarding signal %v to process group %d\n", sig, pgid) + _ = syscall.Kill(-pgid, s) + } else { + log.Printf("signal is not a syscall.Signal: %T\n", sig) + } + } else { + log.Printf("failed to get pgid: %v\n", err) + } + } + + if sig == syscall.SIGTERM { + if cmd != nil { + log.Printf("received SIGTERM, will exit\n") + time.Sleep(500 * 
time.Millisecond) + } + log.Printf("terminating job manager\n") + os.Exit(0) + } + } + }() +} + func (jm *JobManager) GetJobAuthInfo() (string, string) { jm.lock.Lock() defer jm.lock.Unlock() @@ -102,6 +191,12 @@ func (jm *JobManager) disconnectFromStreamHelper(mainServerConn *MainServerConn) jm.connectedStreamClient = nil } +func getJobFilePath(clientId string, jobId string, extension string) string { + homeDir := wavebase.GetHomeDir() + socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) + return filepath.Join(socketDir, fmt.Sprintf("%s.%s", jobId, extension)) +} + func MakeJobDomainSocket(clientId string, jobId string) error { homeDir := wavebase.GetHomeDir() socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) @@ -110,7 +205,7 @@ func MakeJobDomainSocket(clientId string, jobId string) error { return fmt.Errorf("failed to create socket directory: %w", err) } - socketPath := filepath.Join(socketDir, fmt.Sprintf("%s.sock", jobId)) + socketPath := getJobFilePath(clientId, jobId, "sock") os.Remove(socketPath) From cbf39ffb9c6a2cc48fe3342e1446ac8aa7def781 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 20:32:23 -0800 Subject: [PATCH 15/64] update router to allow routing messages by linkid. 
also allow responses from untrusted links --- pkg/wshutil/wshrouter.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pkg/wshutil/wshrouter.go b/pkg/wshutil/wshrouter.go index 94e59e270b..19fa40d037 100644 --- a/pkg/wshutil/wshrouter.go +++ b/pkg/wshutil/wshrouter.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "log" + "strconv" "strings" "sync" "time" @@ -34,6 +35,7 @@ const ( RoutePrefix_Tab = "tab:" RoutePrefix_FeBlock = "feblock:" RoutePrefix_Builder = "builder:" + RoutePrefix_Link = "link:" ) // this works like a network switch @@ -122,6 +124,10 @@ func MakeJobRouteId(jobId string) string { return "job:" + jobId } +func MakeLinkRouteId(linkId baseds.LinkId) string { + return fmt.Sprintf("%s%d", RoutePrefix_Link, linkId) +} + var DefaultRouter *WshRouter func NewWshRouter() *WshRouter { @@ -249,6 +255,13 @@ func (router *WshRouter) getRouteInfo(rpcId string) *rpcRoutingInfo { // returns true if message was sent, false if failed func (router *WshRouter) sendRoutedMessage(msgBytes []byte, routeId string, commandName string, ingressLinkId baseds.LinkId) bool { + if strings.HasPrefix(routeId, RoutePrefix_Link) { + linkIdStr := strings.TrimPrefix(routeId, RoutePrefix_Link) + linkIdInt, err := strconv.ParseInt(linkIdStr, 10, 32) + if err == nil { + return router.sendMessageToLink(msgBytes, baseds.LinkId(linkIdInt), ingressLinkId) + } + } lm := router.getLinkForRoute(routeId) if lm != nil { lm.client.SendRpcMessage(msgBytes, ingressLinkId, "route") @@ -452,8 +465,10 @@ func (router *WshRouter) runLinkClientRecvLoop(linkId baseds.LinkId, client Abst } else { // non-request messages (responses) if !lm.trusted { - // drop responses from untrusted links - continue + // allow responses to RPCs we initiated + if rpcMsg.ResId == "" || router.getRouteInfo(rpcMsg.ResId) == nil { + continue + } } } router.inputCh <- baseds.RpcInputChType{MsgBytes: msgBytes, IngressLinkId: linkId} From 82ca0245123f1af5a28fc8f3a8cea33a8b40a2ce Mon Sep 17 
00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 21:10:13 -0800 Subject: [PATCH 16/64] new mode to run router over domain socket --- cmd/wsh/cmd/wshcmd-connserver.go | 121 ++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/cmd/wsh/cmd/wshcmd-connserver.go b/cmd/wsh/cmd/wshcmd-connserver.go index 3fb6a10fcc..1bfd825e54 100644 --- a/cmd/wsh/cmd/wshcmd-connserver.go +++ b/cmd/wsh/cmd/wshcmd-connserver.go @@ -38,11 +38,13 @@ var serverCmd = &cobra.Command{ } var connServerRouter bool +var connServerRouterDomainSocket bool var connServerConnName string var connServerDev bool func init() { - serverCmd.Flags().BoolVar(&connServerRouter, "router", false, "run in local router mode") + serverCmd.Flags().BoolVar(&connServerRouter, "router", false, "run in local router mode (stdio upstream)") + serverCmd.Flags().BoolVar(&connServerRouterDomainSocket, "router-domainsocket", false, "run in local router mode (domain socket upstream)") serverCmd.Flags().StringVar(&connServerConnName, "conn", "", "connection name") serverCmd.Flags().BoolVar(&connServerDev, "dev", false, "enable dev mode with file logging and PID in logs") rootCmd.AddCommand(serverCmd) @@ -209,6 +211,109 @@ func serverRunRouter() error { select {} } +func serverRunRouterDomainSocket(jwtToken string) error { + log.Printf("starting connserver router (domain socket upstream)") + + // extract socket name from JWT token (unverified - we're on the client side) + sockName, err := wshutil.ExtractUnverifiedSocketName(jwtToken) + if err != nil { + return fmt.Errorf("error extracting socket name from JWT: %v", err) + } + + // connect to the forwarded domain socket + sockName = wavebase.ExpandHomeDirSafe(sockName) + conn, err := net.Dial("unix", sockName) + if err != nil { + return fmt.Errorf("error connecting to domain socket %s: %v", sockName, err) + } + + // create router + router := wshutil.NewWshRouter() + + // create proxy for the domain socket connection + upstreamProxy := 
wshutil.MakeRpcProxy("connserver-upstream") + + // goroutine to write to the domain socket + go func() { + defer func() { + panichandler.PanicHandler("serverRunRouterDomainSocket:WriteLoop", recover()) + }() + writeErr := wshutil.AdaptOutputChToStream(upstreamProxy.ToRemoteCh, conn) + if writeErr != nil { + log.Printf("error writing to upstream domain socket: %v\n", writeErr) + } + }() + + // goroutine to read from the domain socket + go func() { + defer func() { + panichandler.PanicHandler("serverRunRouterDomainSocket:ReadLoop", recover()) + }() + defer func() { + log.Printf("upstream domain socket closed, shutting down") + wshutil.DoShutdown("", 0, true) + }() + wshutil.AdaptStreamToMsgCh(conn, upstreamProxy.FromRemoteCh) + }() + + // register the domain socket connection as upstream + router.RegisterUpstream(upstreamProxy) + + // setup the connserver rpc client (leaf) + client, err := setupConnServerRpcClientWithRouter(router) + if err != nil { + return fmt.Errorf("error setting up connserver rpc client: %v", err) + } + wshfs.RpcClient = client + + // authenticate with the upstream router using the JWT + _, err = wshclient.AuthenticateCommand(client, jwtToken, &wshrpc.RpcOpts{Route: wshutil.ControlRoute}) + if err != nil { + return fmt.Errorf("error authenticating with upstream: %v", err) + } + log.Printf("authenticated with upstream router") + + // fetch and set JWT public key + log.Printf("trying to get JWT public key") + jwtPublicKeyB64, err := wshclient.GetJwtPublicKeyCommand(client, nil) + if err != nil { + return fmt.Errorf("error getting jwt public key: %v", err) + } + jwtPublicKeyBytes, err := base64.StdEncoding.DecodeString(jwtPublicKeyB64) + if err != nil { + return fmt.Errorf("error decoding jwt public key: %v", err) + } + err = wavejwt.SetPublicKey(jwtPublicKeyBytes) + if err != nil { + return fmt.Errorf("error setting jwt public key: %v", err) + } + log.Printf("got JWT public key") + + // set up the local domain socket listener for local wsh 
commands + unixListener, err := MakeRemoteUnixListener() + if err != nil { + return fmt.Errorf("cannot create unix listener: %v", err) + } + log.Printf("unix listener started") + go func() { + defer func() { + panichandler.PanicHandler("serverRunRouterDomainSocket:runListener", recover()) + }() + runListener(unixListener, router) + }() + + // run the sysinfo loop + go func() { + defer func() { + panichandler.PanicHandler("serverRunRouterDomainSocket:RunSysInfoLoop", recover()) + }() + wshremote.RunSysInfoLoop(client, connServerConnName) + }() + + log.Printf("running server (router-domainsocket mode), successfully started") + select {} +} + func serverRunNormal(jwtToken string) error { err := setupRpcClient(&wshremote.ServerImpl{LogWriter: os.Stdout}, jwtToken) if err != nil { @@ -283,6 +388,20 @@ func serverRun(cmd *cobra.Command, args []string) error { } return err } + if connServerRouterDomainSocket { + jwtToken, err := askForJwtToken() + if err != nil { + if logFile != nil { + fmt.Fprintf(logFile, "askForJwtToken error: %v\n", err) + } + return err + } + err = serverRunRouterDomainSocket(jwtToken) + if err != nil && logFile != nil { + fmt.Fprintf(logFile, "serverRunRouterDomainSocket error: %v\n", err) + } + return err + } jwtToken, err := askForJwtToken() if err != nil { if logFile != nil { From b4dc3219803e4d8312f78151d9431e5937100907 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 21:19:31 -0800 Subject: [PATCH 17/64] updates to conncontroller to support new router over domainsocket mode --- pkg/remote/conncontroller/conncontroller.go | 34 +++++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 34d2033b43..7a3ff6e546 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -85,7 +85,7 @@ type SSHConn struct { var ConnServerCmdTemplate = strings.TrimSpace( 
strings.Join([]string{ "%s version 2> /dev/null || (echo -n \"not-installed \"; uname -sm; exit 0);", - "exec %s connserver --conn %s %s", + "exec %s connserver --conn %s %s %s", }, "\n")) func IsLocalConnName(connName string) bool { @@ -285,8 +285,9 @@ func (conn *SSHConn) GetConfigShellPath() string { // returns (needsInstall, clientVersion, osArchStr, error) // if wsh is not installed, the clientVersion will be "not-installed", and it will also return an osArchStr // if clientVersion is set, then no osArchStr will be returned -func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool) (bool, string, string, error) { - conn.Infof(ctx, "running StartConnServer...\n") +// if useRouterMode is true, will start connserver with --router-domainsocket flag +func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool, useRouterMode bool) (bool, string, string, error) { + conn.Infof(ctx, "running StartConnServer (routerMode=%v)...\n", useRouterMode) allowed := WithLockRtn(conn, func() bool { return conn.Status == Status_Connecting }) @@ -296,10 +297,19 @@ func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool) (boo client := conn.GetClient() wshPath := conn.getWshPath() sockName := conn.GetDomainSocketName() - rpcCtx := wshrpc.RpcContext{ - RouteId: wshutil.MakeConnectionRouteId(conn.GetName()), - SockName: sockName, - Conn: conn.GetName(), + var rpcCtx wshrpc.RpcContext + if useRouterMode { + rpcCtx = wshrpc.RpcContext{ + IsRouter: true, + SockName: sockName, + Conn: conn.GetName(), + } + } else { + rpcCtx = wshrpc.RpcContext{ + RouteId: wshutil.MakeConnectionRouteId(conn.GetName()), + SockName: sockName, + Conn: conn.GetName(), + } } jwtToken, err := wshutil.MakeClientJWTToken(rpcCtx) if err != nil { @@ -321,7 +331,11 @@ func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool) (boo if wavebase.IsDevMode() { devFlag = "--dev" } - cmdStr := fmt.Sprintf(ConnServerCmdTemplate, wshPath, wshPath, 
shellutil.HardQuote(conn.GetName()), devFlag) + routerFlag := "" + if useRouterMode { + routerFlag = "--router-domainsocket" + } + cmdStr := fmt.Sprintf(ConnServerCmdTemplate, wshPath, wshPath, shellutil.HardQuote(conn.GetName()), devFlag, routerFlag) log.Printf("starting conn controller: %q\n", cmdStr) shWrappedCmdStr := fmt.Sprintf("sh -c %s", shellutil.HardQuote(cmdStr)) blocklogger.Debugf(ctx, "[conndebug] wrapped command:\n%s\n", shWrappedCmdStr) @@ -702,7 +716,7 @@ func (conn *SSHConn) tryEnableWsh(ctx context.Context, clientDisplayName string) err = fmt.Errorf("error opening domain socket listener: %w", err) return WshCheckResult{NoWshReason: "error opening domain socket", NoWshCode: NoWshCode_DomainSocketError, WshError: err} } - needsInstall, clientVersion, osArchStr, err := conn.StartConnServer(ctx, false) + needsInstall, clientVersion, osArchStr, err := conn.StartConnServer(ctx, false, false) if err != nil { conn.Infof(ctx, "ERROR starting conn server: %v\n", err) err = fmt.Errorf("error starting conn server: %w", err) @@ -716,7 +730,7 @@ func (conn *SSHConn) tryEnableWsh(ctx context.Context, clientDisplayName string) err = fmt.Errorf("error installing wsh: %w", err) return WshCheckResult{NoWshReason: "error installing wsh/connserver", NoWshCode: NoWshCode_InstallError, WshError: err} } - needsInstall, clientVersion, _, err = conn.StartConnServer(ctx, true) + needsInstall, clientVersion, _, err = conn.StartConnServer(ctx, true, false) if err != nil { conn.Infof(ctx, "ERROR starting conn server (after install): %v\n", err) err = fmt.Errorf("error starting conn server (after install): %w", err) From cfebfa0753799b68f6afd3dbeb517da90224773e Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 13 Jan 2026 21:37:19 -0800 Subject: [PATCH 18/64] begin work on remotejobstart command --- frontend/app/store/wshclientapi.ts | 5 ++ frontend/types/gotypes.d.ts | 13 +++++ pkg/jobmanager/jobmanager.go | 6 +-- pkg/wshrpc/wshclient/wshclient.go | 6 +++ 
pkg/wshrpc/wshremote/wshremote.go | 77 ++++++++++++++++++++++++++++++ pkg/wshrpc/wshrpctypes.go | 13 +++++ 6 files changed, 117 insertions(+), 3 deletions(-) diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 7bc7e48f1d..e48c39d8ff 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -507,6 +507,11 @@ class RpcApiType { return client.wshRpcCall("remotemkdir", data, opts); } + // command "remotestartjob" [call] + RemoteStartJobCommand(client: WshClient, data: CommandRemoteStartJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("remotestartjob", data, opts); + } + // command "remotestreamcpudata" [responsestream] RemoteStreamCpuDataCommand(client: WshClient, opts?: RpcOpts): AsyncGenerator { return client.wshRpcStream("remotestreamcpudata", null, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index f9e29aa56a..881541804f 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -434,6 +434,19 @@ declare global { fileinfo?: FileInfo[]; }; + // wshrpc.CommandRemoteStartJobData + type CommandRemoteStartJobData = { + cmd: string; + args: string[]; + env: {[key: string]: string}; + termsize: TermSize; + streammeta?: StreamMeta; + jobauthtoken: string; + jobid: string; + mainserverjwttoken: string; + clientid: string; + }; + // wshrpc.CommandRemoteStreamFileData type CommandRemoteStreamFileData = { path: string; diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 1dbad5e91a..7805061e4b 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -84,7 +84,7 @@ func daemonize(clientId string, jobId string) error { } devNull.Close() // dupped so we can close this one - logPath := getJobFilePath(clientId, jobId, "log") + logPath := GetJobFilePath(clientId, jobId, "log") logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) if err != nil { return fmt.Errorf("failed 
to open log file: %w", err) @@ -191,7 +191,7 @@ func (jm *JobManager) disconnectFromStreamHelper(mainServerConn *MainServerConn) jm.connectedStreamClient = nil } -func getJobFilePath(clientId string, jobId string, extension string) string { +func GetJobFilePath(clientId string, jobId string, extension string) string { homeDir := wavebase.GetHomeDir() socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) return filepath.Join(socketDir, fmt.Sprintf("%s.%s", jobId, extension)) @@ -205,7 +205,7 @@ func MakeJobDomainSocket(clientId string, jobId string) error { return fmt.Errorf("failed to create socket directory: %w", err) } - socketPath := getJobFilePath(clientId, jobId, "sock") + socketPath := GetJobFilePath(clientId, jobId, "sock") os.Remove(socketPath) diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 4918a99211..7160120d0f 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -613,6 +613,12 @@ func RemoteMkdirCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) er return err } +// command "remotestartjob", wshserver.RemoteStartJobCommand +func RemoteStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteStartJobData, opts *wshrpc.RpcOpts) (*wshrpc.CommandStartJobRtnData, error) { + resp, err := sendRpcRequestCallHelper[*wshrpc.CommandStartJobRtnData](w, "remotestartjob", data, opts) + return resp, err +} + // command "remotestreamcpudata", wshserver.RemoteStreamCpuDataCommand func RemoteStreamCpuDataCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) chan wshrpc.RespOrErrorUnion[wshrpc.TimeSeriesData] { return sendRpcRequestResponseStreamHelper[wshrpc.TimeSeriesData](w, "remotestreamcpudata", nil, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index ec90e367c7..a0ded80f82 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -5,6 +5,7 @@ package wshremote import ( "archive/tar" + "bufio" 
"context" "encoding/base64" "errors" @@ -12,7 +13,9 @@ import ( "io" "io/fs" "log" + "net" "os" + "os/exec" "path/filepath" "strings" "time" @@ -861,3 +864,77 @@ func (*ServerImpl) DisposeSuggestionsCommand(ctx context.Context, widgetId strin suggestion.DisposeSuggestions(ctx, widgetId) return nil } + +func (*ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + wshPath, err := wavebase.ExpandHomeDir("~/.waveterm/bin/wsh") + if err != nil { + return nil, fmt.Errorf("cannot expand wsh path: %w", err) + } + + cmd := exec.Command(wshPath, "jobmanager", "--jobid", data.JobId, "--clientid", data.ClientId) + stdin, err := cmd.StdinPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stdin pipe: %w", err) + } + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stdout pipe: %w", err) + } + + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("cannot start job manager: %w", err) + } + + jobAuthTokenLine := fmt.Sprintf("Wave-JobAccessToken:%s\n", data.JobAuthToken) + if _, err := stdin.Write([]byte(jobAuthTokenLine)); err != nil { + cmd.Process.Kill() + return nil, fmt.Errorf("cannot write job auth token: %w", err) + } + stdin.Close() + + startCh := make(chan error, 1) + go func() { + scanner := bufio.NewScanner(stdout) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "Wave-JobManagerStart") { + startCh <- nil + return + } + } + if err := scanner.Err(); err != nil { + startCh <- fmt.Errorf("error reading stdout: %w", err) + } else { + startCh <- fmt.Errorf("job manager exited without start signal") + } + }() + + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + select { + case err := <-startCh: + if err != nil { + cmd.Process.Kill() + return nil, err + } + case <-timeoutCtx.Done(): + cmd.Process.Kill() + return nil, fmt.Errorf("timeout waiting for job manager to start") + 
} + + go func() { + cmd.Wait() + }() + + socketPath := filepath.Join(wavebase.GetHomeDir(), ".waveterm", "jobs", data.ClientId, fmt.Sprintf("%s.sock", data.JobId)) + conn, err := net.Dial("unix", socketPath) + if err != nil { + return nil, fmt.Errorf("cannot connect to job manager socket: %w", err) + } + + log.Printf("RemoteStartJobCommand: connected to job manager socket, need to implement auth\n") + conn.Close() + + return nil, fmt.Errorf("not implemented") +} diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 665caf1d3a..bfb1ade001 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -102,6 +102,7 @@ type WshRpcInterface interface { RemoteStreamCpuDataCommand(ctx context.Context) chan RespOrErrorUnion[TimeSeriesData] RemoteGetInfoCommand(ctx context.Context) (RemoteInfo, error) RemoteInstallRcFilesCommand(ctx context.Context) error + RemoteStartJobCommand(ctx context.Context, data CommandRemoteStartJobData) (*CommandStartJobRtnData, error) // emain WebSelectorCommand(ctx context.Context, data CommandWebSelectorData) ([]string, error) @@ -682,6 +683,18 @@ type CommandStartJobData struct { StreamMeta *StreamMeta `json:"streammeta,omitempty"` } +type CommandRemoteStartJobData struct { + Cmd string `json:"cmd"` + Args []string `json:"args"` + Env map[string]string `json:"env"` + TermSize waveobj.TermSize `json:"termsize"` + StreamMeta *StreamMeta `json:"streammeta,omitempty"` + JobAuthToken string `json:"jobauthtoken"` + JobId string `json:"jobid"` + MainServerJwtToken string `json:"mainserverjwttoken"` + ClientId string `json:"clientid"` +} + type CommandStartJobRtnData struct { Pgid int `json:"pgid"` } From 34f9afde98348a75977a2b1af354691c394bd7b1 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 14 Jan 2026 10:08:33 -0800 Subject: [PATCH 19/64] checkpoint -- finish RemoteStartJobCommand (untested) --- cmd/server/main-server.go | 2 +- cmd/wsh/cmd/wshcmd-connserver.go | 10 ++++- pkg/wshrpc/wshremote/wshremote.go | 62 
+++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/cmd/server/main-server.go b/cmd/server/main-server.go index a59661f0ba..c09713b952 100644 --- a/cmd/server/main-server.go +++ b/cmd/server/main-server.go @@ -391,7 +391,7 @@ func createMainWshClient() { wshfs.RpcClient = rpc wshutil.DefaultRouter.RegisterTrustedLeaf(rpc, wshutil.DefaultRoute) wps.Broker.SetClient(wshutil.DefaultRouter) - localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, &wshremote.ServerImpl{}, "conn:local") + localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, &wshremote.ServerImpl{Router: wshutil.DefaultRouter, RpcClient: wshclient.GetBareRpcClient()}, "conn:local") go wshremote.RunSysInfoLoop(localConnWsh, wshrpc.LocalConnName) wshutil.DefaultRouter.RegisterTrustedLeaf(localConnWsh, wshutil.MakeConnectionRouteId(wshrpc.LocalConnName)) } diff --git a/cmd/wsh/cmd/wshcmd-connserver.go b/cmd/wsh/cmd/wshcmd-connserver.go index 1bfd825e54..dd672438c9 100644 --- a/cmd/wsh/cmd/wshcmd-connserver.go +++ b/cmd/wsh/cmd/wshcmd-connserver.go @@ -41,6 +41,7 @@ var connServerRouter bool var connServerRouterDomainSocket bool var connServerConnName string var connServerDev bool +var ConnServerWshRouter *wshutil.WshRouter func init() { serverCmd.Flags().BoolVar(&connServerRouter, "router", false, "run in local router mode (stdio upstream)") @@ -125,7 +126,12 @@ func setupConnServerRpcClientWithRouter(router *wshutil.WshRouter) (*wshutil.Wsh RouteId: routeId, Conn: connServerConnName, } - connServerClient := wshutil.MakeWshRpc(rpcCtx, &wshremote.ServerImpl{LogWriter: os.Stdout}, routeId) + + bareRouteId := wshutil.MakeRandomProcRouteId() + bareClient := wshutil.MakeWshRpc(wshrpc.RpcContext{}, &wshclient.WshServer{}, bareRouteId) + router.RegisterTrustedLeaf(bareClient, bareRouteId) + + connServerClient := wshutil.MakeWshRpc(rpcCtx, &wshremote.ServerImpl{LogWriter: os.Stdout, Router: router, RpcClient: 
bareClient}, routeId) router.RegisterTrustedLeaf(connServerClient, routeId) return connServerClient, nil } @@ -133,6 +139,7 @@ func setupConnServerRpcClientWithRouter(router *wshutil.WshRouter) (*wshutil.Wsh func serverRunRouter() error { log.Printf("starting connserver router") router := wshutil.NewWshRouter() + ConnServerWshRouter = router termProxy := wshutil.MakeRpcProxy("connserver-term") rawCh := make(chan []byte, wshutil.DefaultOutputChSize) go func() { @@ -229,6 +236,7 @@ func serverRunRouterDomainSocket(jwtToken string) error { // create router router := wshutil.NewWshRouter() + ConnServerWshRouter = router // create proxy for the domain socket connection upstreamProxy := wshutil.MakeRpcProxy("connserver-upstream") diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index a0ded80f82..cfd7ddb816 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -36,6 +36,8 @@ import ( type ServerImpl struct { LogWriter io.Writer + Router *wshutil.WshRouter + RpcClient *wshutil.WshRpc } func (*ServerImpl) WshServerImpl() {} @@ -865,7 +867,10 @@ func (*ServerImpl) DisposeSuggestionsCommand(ctx context.Context, widgetId strin return nil } -func (*ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { +func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + if impl.Router == nil { + return nil, fmt.Errorf("cannot start remote job: no router available") + } wshPath, err := wavebase.ExpandHomeDir("~/.waveterm/bin/wsh") if err != nil { return nil, fmt.Errorf("cannot expand wsh path: %w", err) @@ -933,8 +938,57 @@ func (*ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.Comman return nil, fmt.Errorf("cannot connect to job manager socket: %w", err) } - log.Printf("RemoteStartJobCommand: connected to job manager socket, need 
to implement auth\n") - conn.Close() + proxy := wshutil.MakeRpcProxy("jobmanager") + go func() { + writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) + if writeErr != nil { + log.Printf("RemoteStartJobCommand: error writing to job manager socket: %v\n", writeErr) + } + }() + go func() { + defer func() { + conn.Close() + close(proxy.FromRemoteCh) + }() + wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + }() + + linkId := impl.Router.RegisterUntrustedLink(proxy) + + routeId := wshutil.MakeLinkRouteId(linkId) + authData := wshrpc.CommandAuthenticateToJobData{ + JobAccessToken: data.JobAuthToken, + } + err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) + if err != nil { + conn.Close() + impl.Router.UnregisterLink(linkId) + return nil, fmt.Errorf("authentication to job manager failed: %w", err) + } + + jobRouteId := wshutil.MakeJobRouteId(data.JobId) + waitCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + err = impl.Router.WaitForRegister(waitCtx, jobRouteId) + if err != nil { + conn.Close() + impl.Router.UnregisterLink(linkId) + return nil, fmt.Errorf("timeout waiting for job route to register: %w", err) + } + + startJobData := wshrpc.CommandStartJobData{ + Cmd: data.Cmd, + Args: data.Args, + Env: data.Env, + TermSize: data.TermSize, + StreamMeta: data.StreamMeta, + } + rtnData, err := wshclient.StartJobCommand(impl.RpcClient, startJobData, &wshrpc.RpcOpts{Route: jobRouteId}) + if err != nil { + conn.Close() + impl.Router.UnregisterLink(linkId) + return nil, fmt.Errorf("failed to start job: %w", err) + } - return nil, fmt.Errorf("not implemented") + return rtnData, nil } From 28c9ed4271066e0f20e6f3d540578d2c67a809ee Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 14 Jan 2026 14:48:10 -0800 Subject: [PATCH 20/64] hook up streambroker to wshrpc --- pkg/streamclient/streambroker.go | 18 -------------- pkg/wshrpc/wshrpctypes_const.go | 2 ++ pkg/wshutil/wshrpc.go | 41 
++++++++++++++++++++++++++++++++ pkg/wshutil/wshstreamadapter.go | 24 +++++++++++++++++++ 4 files changed, 67 insertions(+), 18 deletions(-) create mode 100644 pkg/wshutil/wshstreamadapter.go diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index a6a6a8fec7..a9b41ec393 100644 --- a/pkg/streamclient/streambroker.go +++ b/pkg/streamclient/streambroker.go @@ -8,8 +8,6 @@ import ( "github.com/google/uuid" "github.com/wavetermdev/waveterm/pkg/utilds" "github.com/wavetermdev/waveterm/pkg/wshrpc" - "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" - "github.com/wavetermdev/waveterm/pkg/wshutil" ) type workItem struct { @@ -27,22 +25,6 @@ type StreamRpcInterface interface { StreamDataCommand(data wshrpc.CommandStreamData, opts *wshrpc.RpcOpts) error } -type wshRpcAdapter struct { - rpc *wshutil.WshRpc -} - -func (a *wshRpcAdapter) StreamDataAckCommand(data wshrpc.CommandStreamAckData, opts *wshrpc.RpcOpts) error { - return wshclient.StreamDataAckCommand(a.rpc, data, opts) -} - -func (a *wshRpcAdapter) StreamDataCommand(data wshrpc.CommandStreamData, opts *wshrpc.RpcOpts) error { - return wshclient.StreamDataCommand(a.rpc, data, opts) -} - -func AdaptWshRpc(rpc *wshutil.WshRpc) StreamRpcInterface { - return &wshRpcAdapter{rpc: rpc} -} - type Broker struct { lock sync.Mutex rpcClient StreamRpcInterface diff --git a/pkg/wshrpc/wshrpctypes_const.go b/pkg/wshrpc/wshrpctypes_const.go index a01d103e8f..51a25f147c 100644 --- a/pkg/wshrpc/wshrpctypes_const.go +++ b/pkg/wshrpc/wshrpctypes_const.go @@ -45,4 +45,6 @@ const ( Command_ControllerInput = "controllerinput" Command_EventRecv = "eventrecv" Command_Message = "message" + Command_StreamData = "streamdata" + Command_StreamDataAck = "streamdataack" ) diff --git a/pkg/wshutil/wshrpc.go b/pkg/wshutil/wshrpc.go index 7d94777193..c70b6000b6 100644 --- a/pkg/wshutil/wshrpc.go +++ b/pkg/wshutil/wshrpc.go @@ -18,6 +18,7 @@ import ( "github.com/google/uuid" "github.com/wavetermdev/waveterm/pkg/baseds" 
"github.com/wavetermdev/waveterm/pkg/panichandler" + "github.com/wavetermdev/waveterm/pkg/streamclient" "github.com/wavetermdev/waveterm/pkg/util/ds" "github.com/wavetermdev/waveterm/pkg/util/utilfn" "github.com/wavetermdev/waveterm/pkg/wps" @@ -56,6 +57,7 @@ type WshRpc struct { ServerImpl ServerImpl EventListener *EventListener ResponseHandlerMap map[string]*RpcResponseHandler // reqId => handler + StreamBroker *streamclient.Broker Debug bool DebugName string ServerDone bool @@ -226,6 +228,7 @@ func MakeWshRpcWithChannels(inputCh chan baseds.RpcInputChType, outputCh chan [] ResponseHandlerMap: make(map[string]*RpcResponseHandler), } rtn.RpcContext.Store(&rpcCtx) + rtn.StreamBroker = streamclient.NewBroker(AdaptWshRpc(rtn)) go rtn.runServer() return rtn } @@ -286,11 +289,49 @@ func (w *WshRpc) handleEventRecv(req *RpcMessage) { w.EventListener.RecvEvent(&waveEvent) } +func (w *WshRpc) handleStreamData(req *RpcMessage) { + if w.StreamBroker == nil { + return + } + if req.Data == nil { + return + } + var dataPk wshrpc.CommandStreamData + err := utilfn.ReUnmarshal(&dataPk, req.Data) + if err != nil { + return + } + w.StreamBroker.RecvData(dataPk) +} + +func (w *WshRpc) handleStreamAck(req *RpcMessage) { + if w.StreamBroker == nil { + return + } + if req.Data == nil { + return + } + var ackPk wshrpc.CommandStreamAckData + err := utilfn.ReUnmarshal(&ackPk, req.Data) + if err != nil { + return + } + w.StreamBroker.RecvAck(ackPk) +} + func (w *WshRpc) handleRequestInternal(req *RpcMessage, ingressLinkId baseds.LinkId, pprofCtx context.Context) { if req.Command == wshrpc.Command_EventRecv { w.handleEventRecv(req) return } + if req.Command == wshrpc.Command_StreamData { + w.handleStreamData(req) + return + } + if req.Command == wshrpc.Command_StreamDataAck { + w.handleStreamAck(req) + return + } var respHandler *RpcResponseHandler timeoutMs := req.Timeout diff --git a/pkg/wshutil/wshstreamadapter.go b/pkg/wshutil/wshstreamadapter.go new file mode 100644 index 
0000000000..b83d1c727c --- /dev/null +++ b/pkg/wshutil/wshstreamadapter.go @@ -0,0 +1,24 @@ +// Copyright 2025, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package wshutil + +import ( + "github.com/wavetermdev/waveterm/pkg/wshrpc" +) + +type WshRpcStreamClientAdapter struct { + rpc *WshRpc +} + +func (a *WshRpcStreamClientAdapter) StreamDataAckCommand(data wshrpc.CommandStreamAckData, opts *wshrpc.RpcOpts) error { + return a.rpc.SendCommand("streamdataack", data, opts) +} + +func (a *WshRpcStreamClientAdapter) StreamDataCommand(data wshrpc.CommandStreamData, opts *wshrpc.RpcOpts) error { + return a.rpc.SendCommand("streamdata", data, opts) +} + +func AdaptWshRpc(rpc *WshRpc) *WshRpcStreamClientAdapter { + return &WshRpcStreamClientAdapter{rpc: rpc} +} From bb9f51da565c4b1f07f34a4acd83b57fe8dc9432 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 14 Jan 2026 15:43:55 -0800 Subject: [PATCH 21/64] update termination flow for jobmanager --- frontend/app/store/wshclientapi.ts | 10 ++ frontend/types/gotypes.d.ts | 11 ++ pkg/jobcontroller/jobcontroller.go | 205 +++++++++++++++++++++++++++++ pkg/jobmanager/jobcmd.go | 26 ++++ pkg/jobmanager/jobmanager.go | 22 ++++ pkg/jobmanager/mainserverconn.go | 27 +++- pkg/wshrpc/wshclient/wshclient.go | 12 ++ pkg/wshrpc/wshrpctypes.go | 14 +- 8 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 pkg/jobcontroller/jobcontroller.go diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index e48c39d8ff..a0603f0011 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -397,6 +397,16 @@ class RpcApiType { return client.wshRpcCall("jobconnect", data, opts); } + // command "jobexited" [call] + JobExitedCommand(client: WshClient, data: CommandJobExitedData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobexited", data, opts); + } + + // command "jobmanagerexit" [call] + JobManagerExitCommand(client: WshClient, opts?: RpcOpts): Promise 
{ + return client.wshRpcCall("jobmanagerexit", null, opts); + } + // command "jobterminate" [call] JobTerminateCommand(client: WshClient, data: CommandJobTerminateData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobterminate", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 881541804f..925718bb7a 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -363,6 +363,17 @@ declare global { // wshrpc.CommandJobConnectRtnData type CommandJobConnectRtnData = { seq: number; + hasexited?: boolean; + exitcode?: number; + exitsignal?: string; + exiterr?: string; + }; + + // wshrpc.CommandJobExitedData + type CommandJobExitedData = { + exitcode: number; + exitsignal?: string; + exiterr?: string; }; // wshrpc.CommandJobTerminateData diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go new file mode 100644 index 0000000000..c01d56332a --- /dev/null +++ b/pkg/jobcontroller/jobcontroller.go @@ -0,0 +1,205 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package jobcontroller + +import ( + "context" + "fmt" + "io" + "log" + "time" + + "github.com/google/uuid" + "github.com/wavetermdev/waveterm/pkg/filestore" + "github.com/wavetermdev/waveterm/pkg/panichandler" + "github.com/wavetermdev/waveterm/pkg/remote/conncontroller" + "github.com/wavetermdev/waveterm/pkg/streamclient" + "github.com/wavetermdev/waveterm/pkg/util/utilfn" + "github.com/wavetermdev/waveterm/pkg/waveobj" + "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" + "github.com/wavetermdev/waveterm/pkg/wshutil" + "github.com/wavetermdev/waveterm/pkg/wstore" +) + +const ( + JobStatus_Init = "init" + JobStatus_Running = "running" + JobStatus_Done = "done" + JobStatus_Error = "error" +) + +const DefaultStreamRwnd = 64 * 1024 + +type StartJobParams struct { + ConnName string + Cmd string + Args []string + Env map[string]string + TermSize *waveobj.TermSize +} + +func StartJob(ctx context.Context, params StartJobParams) (string, error) { + if params.ConnName == "" { + return "", fmt.Errorf("connection name is required") + } + if params.Cmd == "" { + return "", fmt.Errorf("command is required") + } + if params.TermSize == nil { + params.TermSize = &waveobj.TermSize{Rows: 24, Cols: 80} + } + + err := conncontroller.EnsureConnection(ctx, params.ConnName) + if err != nil { + return "", fmt.Errorf("failed to ensure connection: %w", err) + } + + jobId := uuid.New().String() + jobAuthToken, err := utilfn.RandomHexString(32) + if err != nil { + return "", fmt.Errorf("failed to generate job auth token: %w", err) + } + + rpcCtx := wshrpc.RpcContext{ + RouteId: wshutil.MakeJobRouteId(jobId), + } + jobAccessToken, err := wshutil.MakeClientJWTToken(rpcCtx) + if err != nil { + return "", fmt.Errorf("failed to generate job access token: %w", err) + } + + job := &waveobj.Job{ + OID: jobId, + Connection: params.ConnName, + Cmd: params.Cmd, + CmdArgs: params.Args, + CmdEnv: params.Env, + 
TermSize: *params.TermSize, + JobAuthToken: jobAuthToken, + Status: JobStatus_Init, + StartTs: time.Now().UnixMilli(), + Meta: make(waveobj.MetaMapType), + } + + err = wstore.DBInsert(ctx, job) + if err != nil { + return "", fmt.Errorf("failed to create job in database: %w", err) + } + + connRpc := wshclient.GetBareRpcClient() + if connRpc == nil { + return "", fmt.Errorf("main rpc client not available") + } + + broker := connRpc.StreamBroker + if broker == nil { + return "", fmt.Errorf("stream broker not available") + } + + readerRouteId := wshutil.MakeJobRouteId(jobId) + writerRouteId := wshutil.MakeConnectionRouteId(params.ConnName) + reader, streamMeta := broker.CreateStreamReader(readerRouteId, writerRouteId, DefaultStreamRwnd) + + fileOpts := wshrpc.FileOpts{ + MaxSize: 10 * 1024 * 1024, + Circular: true, + } + err = filestore.WFS.MakeFile(ctx, jobId, "term", wshrpc.FileMeta{}, fileOpts) + if err != nil { + return "", fmt.Errorf("failed to create WaveFS file: %w", err) + } + + clientId, err := wstore.DBGetSingleton[*waveobj.Client](ctx) + if err != nil || clientId == nil { + return "", fmt.Errorf("failed to get client: %w", err) + } + + startJobData := wshrpc.CommandRemoteStartJobData{ + Cmd: params.Cmd, + Args: params.Args, + Env: params.Env, + TermSize: *params.TermSize, + StreamMeta: streamMeta, + JobAuthToken: jobAuthToken, + JobId: jobId, + MainServerJwtToken: jobAccessToken, + ClientId: clientId.OID, + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeConnectionRouteId(params.ConnName), + Timeout: 30000, + } + + rtnData, err := wshclient.RemoteStartJobCommand(connRpc, startJobData, rpcOpts) + if err != nil { + wstore.DBUpdate(ctx, &waveobj.Job{ + OID: jobId, + Status: JobStatus_Error, + Error: fmt.Sprintf("failed to start job: %v", err), + }) + return "", fmt.Errorf("failed to start remote job: %w", err) + } + + job.Pgid = rtnData.Pgid + job.Status = JobStatus_Running + err = wstore.DBUpdate(ctx, job) + if err != nil { + log.Printf("warning: failed 
to update job status to running: %v", err) + } + + go func() { + defer func() { + panichandler.PanicHandler("jobcontroller:runOutputLoop", recover()) + }() + runOutputLoop(context.Background(), jobId, reader) + }() + + return jobId, nil +} + +func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reader) { + defer func() { + log.Printf("[job:%s] output loop finished", jobId) + }() + + buf := make([]byte, 4096) + for { + n, err := reader.Read(buf) + if n > 0 { + appendErr := filestore.WFS.AppendData(ctx, jobId, "term", buf[:n]) + if appendErr != nil { + log.Printf("[job:%s] error appending data to WaveFS: %v", jobId, appendErr) + } + } + + if err == io.EOF { + log.Printf("[job:%s] stream ended (EOF)", jobId) + updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ + OID: jobId, + Status: JobStatus_Done, + ExitTs: time.Now().UnixMilli(), + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job status to done: %v", jobId, updateErr) + } + break + } + + if err != nil { + log.Printf("[job:%s] stream error: %v", jobId, err) + updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ + OID: jobId, + Status: JobStatus_Error, + Error: fmt.Sprintf("stream error: %v", err), + ExitTs: time.Now().UnixMilli(), + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job status to error: %v", jobId, updateErr) + } + break + } + } +} diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index e25701be09..39cd42fe33 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -90,6 +90,16 @@ func (jm *JobCmd) waitForProcess() { jm.exitCode = 0 } log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, jm.exitErr) + + exitData := &wshrpc.CommandJobExitedData{ + ExitCode: jm.exitCode, + ExitSignal: jm.exitSignal, + } + if jm.exitErr != nil { + exitData.ExitErr = jm.exitErr.Error() + } + + go WshCmdJobManager.sendJobExited(exitData) } func (jm *JobCmd) GetCmd() (*exec.Cmd, pty.Pty) { @@ -117,6 +127,22 
@@ func (jm *JobCmd) GetPGID() (int, error) { return pgid, nil } +func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobExitedData) { + jm.lock.Lock() + defer jm.lock.Unlock() + if !jm.processExited { + return false, nil + } + exitData := &wshrpc.CommandJobExitedData{ + ExitCode: jm.exitCode, + ExitSignal: jm.exitSignal, + } + if jm.exitErr != nil { + exitData.ExitErr = jm.exitErr.Error() + } + return true, exitData +} + func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { jm.lock.Lock() defer jm.lock.Unlock() diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 7805061e4b..e9aaef8e63 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -20,6 +20,7 @@ import ( "github.com/wavetermdev/waveterm/pkg/wavebase" "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" ) @@ -73,6 +74,27 @@ func (jm *JobManager) GetCmd() *JobCmd { return jm.Cmd } +func (jm *JobManager) sendJobExited(exitData *wshrpc.CommandJobExitedData) { + jm.lock.Lock() + attachedClient := jm.attachedClient + jm.lock.Unlock() + + if attachedClient == nil { + log.Printf("sendJobExited: no attached client, exit notification not sent\n") + return + } + if attachedClient.WshRpc == nil { + log.Printf("sendJobExited: no wsh rpc connection, exit notification not sent\n") + return + } + + log.Printf("sendJobExited: sending exit notification to main server exitcode=%d signal=%s\n", exitData.ExitCode, exitData.ExitSignal) + err := wshclient.JobExitedCommand(attachedClient.WshRpc, *exitData, nil) + if err != nil { + log.Printf("sendJobExited: error sending exit notification: %v\n", err) + } +} + func daemonize(clientId string, jobId string) error { devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) if err != nil { diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 
8b4ac25d92..ee78e685c8 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -8,8 +8,10 @@ import ( "fmt" "log" "net" + "os" "sync" "sync/atomic" + "time" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/wavejwt" @@ -173,8 +175,17 @@ func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.Co return nil, err } - log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d\n", data.StreamMeta.Id, data.Seq, serverSeq) - return &wshrpc.CommandJobConnectRtnData{Seq: serverSeq}, nil + rtnData := &wshrpc.CommandJobConnectRtnData{Seq: serverSeq} + hasExited, exitData := WshCmdJobManager.Cmd.GetExitInfo() + if hasExited && exitData != nil { + rtnData.HasExited = true + rtnData.ExitCode = exitData.ExitCode + rtnData.ExitSignal = exitData.ExitSignal + rtnData.ExitErr = exitData.ExitErr + } + + log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d hasExited=%v\n", data.StreamMeta.Id, data.Seq, serverSeq, hasExited) + return rtnData, nil } func (msc *MainServerConn) StreamDataAckCommand(ctx context.Context, data wshrpc.CommandStreamAckData) error { @@ -202,3 +213,15 @@ func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc. 
WshCmdJobManager.Cmd.Terminate() return nil } + +func (msc *MainServerConn) JobManagerExitCommand(ctx context.Context) error { + if !msc.PeerAuthenticated.Load() { + return fmt.Errorf("not authenticated") + } + log.Printf("JobManagerExit called, terminating job manager\n") + go func() { + time.Sleep(500 * time.Millisecond) + os.Exit(0) + }() + return nil +} diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 7160120d0f..ba4a3d1ea2 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -482,6 +482,18 @@ func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opt return resp, err } +// command "jobexited", wshserver.JobExitedCommand +func JobExitedCommand(w *wshutil.WshRpc, data wshrpc.CommandJobExitedData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobexited", data, opts) + return err +} + +// command "jobmanagerexit", wshserver.JobManagerExitCommand +func JobManagerExitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobmanagerexit", nil, opts) + return err +} + // command "jobterminate", wshserver.JobTerminateCommand func JobTerminateCommand(w *wshutil.WshRpc, data wshrpc.CommandJobTerminateData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobterminate", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index bfb1ade001..e00ac7b141 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -163,6 +163,8 @@ type WshRpcInterface interface { StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) JobConnectCommand(ctx context.Context, data CommandJobConnectData) (*CommandJobConnectRtnData, error) JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error + JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job 
manager => main server + JobManagerExitCommand(ctx context.Context) error } // for frontend @@ -705,8 +707,18 @@ type CommandJobConnectData struct { } type CommandJobConnectRtnData struct { - Seq int64 `json:"seq"` + Seq int64 `json:"seq"` + HasExited bool `json:"hasexited,omitempty"` + ExitCode int `json:"exitcode,omitempty"` + ExitSignal string `json:"exitsignal,omitempty"` + ExitErr string `json:"exiterr,omitempty"` } type CommandJobTerminateData struct { } + +type CommandJobExitedData struct { + ExitCode int `json:"exitcode"` + ExitSignal string `json:"exitsignal,omitempty"` + ExitErr string `json:"exiterr,omitempty"` +} From 1db56677abf8455bf4081742e160ac520bdf5821 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 14 Jan 2026 17:00:05 -0800 Subject: [PATCH 22/64] better exit info, exitts, store exit data separate from stream eof/error --- pkg/jobcontroller/jobcontroller.go | 44 ++++++++++++++++++++++++------ pkg/jobmanager/jobcmd.go | 15 ++++------ pkg/jobmanager/jobmanager.go | 13 ++++++++- pkg/waveobj/wtype.go | 2 ++ pkg/wshrpc/wshrpctypes.go | 2 ++ pkg/wshrpc/wshserver/wshserver.go | 5 ++++ 6 files changed, 62 insertions(+), 19 deletions(-) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index c01d56332a..532806e26e 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -178,12 +178,11 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if err == io.EOF { log.Printf("[job:%s] stream ended (EOF)", jobId) updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ - OID: jobId, - Status: JobStatus_Done, - ExitTs: time.Now().UnixMilli(), + OID: jobId, + StreamDone: true, }) if updateErr != nil { - log.Printf("[job:%s] error updating job status to done: %v", jobId, updateErr) + log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) } break } @@ -191,15 +190,42 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if 
err != nil { log.Printf("[job:%s] stream error: %v", jobId, err) updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ - OID: jobId, - Status: JobStatus_Error, - Error: fmt.Sprintf("stream error: %v", err), - ExitTs: time.Now().UnixMilli(), + OID: jobId, + StreamError: err.Error(), }) if updateErr != nil { - log.Printf("[job:%s] error updating job status to error: %v", jobId, updateErr) + log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr) } break } } } + +func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobExitedData) error { + var status string + if data.ExitErr != "" { + status = JobStatus_Error + } else { + status = JobStatus_Done + } + + updateData := &waveobj.Job{ + OID: jobId, + Status: status, + ExitCode: data.ExitCode, + ExitSignal: data.ExitSignal, + ExitTs: data.ExitTs, + } + + if data.ExitErr != "" { + updateData.Error = data.ExitErr + } + + err := wstore.DBUpdate(ctx, updateData) + if err != nil { + return fmt.Errorf("failed to update job exit status: %w", err) + } + + log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, status) + return nil +} diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index 39cd42fe33..93cdd32239 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -11,6 +11,7 @@ import ( "os/exec" "sync" "syscall" + "time" "github.com/creack/pty" "github.com/wavetermdev/waveterm/pkg/waveobj" @@ -35,6 +36,7 @@ type JobCmd struct { exitCode int exitSignal string exitErr error + exitTs int64 } func MakeJobCmd(jobId string, cmdDef CmdDef) (*JobCmd, error) { @@ -74,6 +76,7 @@ func (jm *JobCmd) waitForProcess() { defer jm.lock.Unlock() jm.processExited = true + jm.exitTs = time.Now().UnixMilli() jm.exitErr = err if err != nil { if exitErr, ok := err.(*exec.ExitError); ok { @@ -91,15 +94,7 @@ func (jm *JobCmd) waitForProcess() { } log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, 
jm.exitErr) - exitData := &wshrpc.CommandJobExitedData{ - ExitCode: jm.exitCode, - ExitSignal: jm.exitSignal, - } - if jm.exitErr != nil { - exitData.ExitErr = jm.exitErr.Error() - } - - go WshCmdJobManager.sendJobExited(exitData) + go WshCmdJobManager.sendJobExited() } func (jm *JobCmd) GetCmd() (*exec.Cmd, pty.Pty) { @@ -134,8 +129,10 @@ func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobExitedData) { return false, nil } exitData := &wshrpc.CommandJobExitedData{ + JobId: WshCmdJobManager.JobId, ExitCode: jm.exitCode, ExitSignal: jm.exitSignal, + ExitTs: jm.exitTs, } if jm.exitErr != nil { exitData.ExitErr = jm.exitErr.Error() diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index e9aaef8e63..a16281b2aa 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -74,9 +74,10 @@ func (jm *JobManager) GetCmd() *JobCmd { return jm.Cmd } -func (jm *JobManager) sendJobExited(exitData *wshrpc.CommandJobExitedData) { +func (jm *JobManager) sendJobExited() { jm.lock.Lock() attachedClient := jm.attachedClient + cmd := jm.Cmd jm.lock.Unlock() if attachedClient == nil { @@ -87,6 +88,16 @@ func (jm *JobManager) sendJobExited(exitData *wshrpc.CommandJobExitedData) { log.Printf("sendJobExited: no wsh rpc connection, exit notification not sent\n") return } + if cmd == nil { + log.Printf("sendJobExited: no cmd, exit notification not sent\n") + return + } + + exited, exitData := cmd.GetExitInfo() + if !exited || exitData == nil { + log.Printf("sendJobExited: process not exited yet\n") + return + } log.Printf("sendJobExited: sending exit notification to main server exitcode=%d signal=%s\n", exitData.ExitCode, exitData.ExitSignal) err := wshclient.JobExitedCommand(attachedClient.WshRpc, *exitData, nil) diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 037c295e8e..9816ed0367 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -327,6 +327,8 @@ type Job struct { ExitCode int `json:"exitcode,omitempty"` 
ExitSignal string `json:"exitsignal,omitempty"` Error string `json:"error,omitempty"` + StreamDone bool `json:"streamdone,omitempty"` + StreamError string `json:"streamerror,omitempty"` Meta MetaMapType `json:"meta"` } diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index e00ac7b141..73e787f61a 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -718,7 +718,9 @@ type CommandJobTerminateData struct { } type CommandJobExitedData struct { + JobId string `json:"jobid"` ExitCode int `json:"exitcode"` ExitSignal string `json:"exitsignal,omitempty"` ExitErr string `json:"exiterr,omitempty"` + ExitTs int64 `json:"exitts,omitempty"` } diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 9e447dd5f3..c5a53cd4f0 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -29,6 +29,7 @@ import ( "github.com/wavetermdev/waveterm/pkg/filebackup" "github.com/wavetermdev/waveterm/pkg/filestore" "github.com/wavetermdev/waveterm/pkg/genconn" + "github.com/wavetermdev/waveterm/pkg/jobcontroller" "github.com/wavetermdev/waveterm/pkg/panichandler" "github.com/wavetermdev/waveterm/pkg/remote" "github.com/wavetermdev/waveterm/pkg/remote/awsconn" @@ -1430,3 +1431,7 @@ func (ws *WshServer) GetSecretsLinuxStorageBackendCommand(ctx context.Context) ( } return backend, nil } + +func (ws *WshServer) JobExitedCommand(ctx context.Context, data wshrpc.CommandJobExitedData) error { + return jobcontroller.HandleJobExited(ctx, data.JobId, data) +} From caf7fdcfcf8031dfaf200269f63662f1c8d76b3e Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 14 Jan 2026 18:21:52 -0800 Subject: [PATCH 23/64] add job termination --- pkg/jobcontroller/jobcontroller.go | 41 ++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 532806e26e..47cfcffa7b 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ 
b/pkg/jobcontroller/jobcontroller.go @@ -184,6 +184,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if updateErr != nil { log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) } + tryTerminateJobManager(ctx, jobId) break } @@ -191,11 +192,13 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade log.Printf("[job:%s] stream error: %v", jobId, err) updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ OID: jobId, + StreamDone: true, StreamError: err.Error(), }) if updateErr != nil { log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr) } + tryTerminateJobManager(ctx, jobId) break } } @@ -227,5 +230,43 @@ func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobEx } log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, status) + tryTerminateJobManager(ctx, jobId) return nil } + +func tryTerminateJobManager(ctx context.Context, jobId string) { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + log.Printf("[job:%s] error getting job for termination check: %v", jobId, err) + return + } + + jobExited := job.Status == JobStatus_Done || job.Status == JobStatus_Error + + if !jobExited || !job.StreamDone { + log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, jobExited, job.StreamDone) + return + } + + log.Printf("[job:%s] both job exited and stream finished, terminating job manager", jobId) + + connRpc := wshclient.GetBareRpcClient() + if connRpc == nil { + log.Printf("[job:%s] error terminating job manager: rpc client not available", jobId) + return + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, + NoResponse: true, + } + + err = wshclient.JobManagerExitCommand(connRpc, rpcOpts) + if err != nil { + log.Printf("[job:%s] error sending job manager exit command: %v", jobId, err) + return + } + + 
log.Printf("[job:%s] job manager exit command sent successfully", jobId) +} From da8b38d6a9c322d1f2756c8e074e75437f7d8a7e Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 10:22:44 -0800 Subject: [PATCH 24/64] add installid to client --- pkg/waveobj/wtype.go | 1 + pkg/wcore/wcore.go | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 9816ed0367..c871229b4f 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -136,6 +136,7 @@ type Client struct { TosAgreed int64 `json:"tosagreed,omitempty"` // unix milli HasOldHistory bool `json:"hasoldhistory,omitempty"` TempOID string `json:"tempoid,omitempty"` + InstallId string `json:"installid,omitempty"` } func (*Client) GetOType() string { diff --git a/pkg/wcore/wcore.go b/pkg/wcore/wcore.go index d8603f5caf..0c3fb905eb 100644 --- a/pkg/wcore/wcore.go +++ b/pkg/wcore/wcore.go @@ -50,6 +50,14 @@ func EnsureInitialData() (bool, error) { return firstLaunch, fmt.Errorf("error updating client: %w", err) } } + if client.InstallId == "" { + log.Println("client.InstallId is empty") + client.InstallId = uuid.NewString() + err = wstore.DBUpdate(ctx, client) + if err != nil { + return firstLaunch, fmt.Errorf("error updating client: %w", err) + } + } log.Printf("clientid: %s\n", client.OID) if len(client.WindowIds) == 1 { log.Println("client has one window") From 79eca5c2f0e4faba564a3df6ad84527bc399c401 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 11:10:03 -0800 Subject: [PATCH 25/64] fix platform specific funcs --- pkg/jobmanager/jobcmd.go | 2 +- pkg/jobmanager/jobmanager.go | 71 -------------------------- pkg/jobmanager/jobmanager_unix.go | 74 ++++++++++++++++++++++++++++ pkg/jobmanager/jobmanager_windows.go | 7 +++ 4 files changed, 82 insertions(+), 72 deletions(-) diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index 93cdd32239..c9fd4732e0 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -112,7 +112,7 
@@ func (jm *JobCmd) GetPGID() (int, error) { if jm.processExited { return 0, fmt.Errorf("process already exited") } - pgid, err := syscall.Getpgid(jm.cmd.Process.Pid) + pgid, err := getProcessGroupId(jm.cmd.Process.Pid) if err != nil { return 0, fmt.Errorf("failed to get pgid: %w", err) } diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index a16281b2aa..d73ac03cf1 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -8,12 +8,9 @@ import ( "log" "net" "os" - "os/signal" "path/filepath" "runtime" "sync" - "syscall" - "time" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -106,74 +103,6 @@ func (jm *JobManager) sendJobExited() { } } -func daemonize(clientId string, jobId string) error { - devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) - if err != nil { - return fmt.Errorf("failed to open /dev/null: %w", err) - } - err = syscall.Dup2(int(devNull.Fd()), int(os.Stdin.Fd())) - if err != nil { - return fmt.Errorf("failed to dup2 stdin: %w", err) - } - devNull.Close() // dupped so we can close this one - - logPath := GetJobFilePath(clientId, jobId, "log") - logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) - if err != nil { - return fmt.Errorf("failed to open log file: %w", err) - } - err = syscall.Dup2(int(logFile.Fd()), int(os.Stdout.Fd())) - if err != nil { - return fmt.Errorf("failed to dup2 stdout: %w", err) - } - err = syscall.Dup2(int(logFile.Fd()), int(os.Stderr.Fd())) - if err != nil { - return fmt.Errorf("failed to dup2 stderr: %w", err) - } - logFile.Close() // dupped, so we can close this one - - log.SetOutput(os.Stdout) - log.Printf("job manager daemonized, logging to %s\n", logPath) - - setupJobManagerSignalHandlers() - return nil -} - -func setupJobManagerSignalHandlers() { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) - - go func() { - for sig := range 
sigChan { - log.Printf("job manager received signal: %v\n", sig) - - cmd := WshCmdJobManager.GetCmd() - if cmd != nil { - pgid, err := cmd.GetPGID() - if err == nil { - if s, ok := sig.(syscall.Signal); ok { - log.Printf("forwarding signal %v to process group %d\n", sig, pgid) - _ = syscall.Kill(-pgid, s) - } else { - log.Printf("signal is not a syscall.Signal: %T\n", sig) - } - } else { - log.Printf("failed to get pgid: %v\n", err) - } - } - - if sig == syscall.SIGTERM { - if cmd != nil { - log.Printf("received SIGTERM, will exit\n") - time.Sleep(500 * time.Millisecond) - } - log.Printf("terminating job manager\n") - os.Exit(0) - } - } - }() -} - func (jm *JobManager) GetJobAuthInfo() (string, string) { jm.lock.Lock() defer jm.lock.Unlock() diff --git a/pkg/jobmanager/jobmanager_unix.go b/pkg/jobmanager/jobmanager_unix.go index a70dec72e1..8db1965e3d 100644 --- a/pkg/jobmanager/jobmanager_unix.go +++ b/pkg/jobmanager/jobmanager_unix.go @@ -6,9 +6,15 @@ package jobmanager import ( + "fmt" + "log" "os" + "os/signal" "strings" "syscall" + "time" + + "golang.org/x/sys/unix" ) func getProcessGroupId(pid int) (int, error) { @@ -46,3 +52,71 @@ func normalizeSignal(sigName string) os.Signal { return nil } } + +func daemonize(clientId string, jobId string) error { + devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("failed to open /dev/null: %w", err) + } + err = unix.Dup2(int(devNull.Fd()), int(os.Stdin.Fd())) + if err != nil { + return fmt.Errorf("failed to dup2 stdin: %w", err) + } + devNull.Close() + + logPath := GetJobFilePath(clientId, jobId, "log") + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) + if err != nil { + return fmt.Errorf("failed to open log file: %w", err) + } + err = unix.Dup2(int(logFile.Fd()), int(os.Stdout.Fd())) + if err != nil { + return fmt.Errorf("failed to dup2 stdout: %w", err) + } + err = unix.Dup2(int(logFile.Fd()), int(os.Stderr.Fd())) + if err != nil { + return 
fmt.Errorf("failed to dup2 stderr: %w", err) + } + logFile.Close() + + log.SetOutput(os.Stdout) + log.Printf("job manager daemonized, logging to %s\n", logPath) + + setupJobManagerSignalHandlers() + return nil +} + +func setupJobManagerSignalHandlers() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) + + go func() { + for sig := range sigChan { + log.Printf("job manager received signal: %v\n", sig) + + cmd := WshCmdJobManager.GetCmd() + if cmd != nil { + pgid, err := cmd.GetPGID() + if err == nil { + if s, ok := sig.(syscall.Signal); ok { + log.Printf("forwarding signal %v to process group %d\n", sig, pgid) + _ = syscall.Kill(-pgid, s) + } else { + log.Printf("signal is not a syscall.Signal: %T\n", sig) + } + } else { + log.Printf("failed to get pgid: %v\n", err) + } + } + + if sig == syscall.SIGTERM { + if cmd != nil { + log.Printf("received SIGTERM, will exit\n") + time.Sleep(500 * time.Millisecond) + } + log.Printf("terminating job manager\n") + os.Exit(0) + } + } + }() +} diff --git a/pkg/jobmanager/jobmanager_windows.go b/pkg/jobmanager/jobmanager_windows.go index 7ce8d358fa..1806c9f4fd 100644 --- a/pkg/jobmanager/jobmanager_windows.go +++ b/pkg/jobmanager/jobmanager_windows.go @@ -17,3 +17,10 @@ func getProcessGroupId(pid int) (int, error) { func normalizeSignal(sigName string) os.Signal { return nil } + +func daemonize(clientId string, jobId string) error { + return fmt.Errorf("daemonize not supported on windows") +} + +func setupJobManagerSignalHandlers() { +} From 6763b5b0e6aabb25d191556ea81d02339b8a2651 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 12:04:31 -0800 Subject: [PATCH 26/64] working through bugs, checkpoint --- cmd/server/main-server.go | 2 +- cmd/wsh/cmd/wshcmd-jobdebug.go | 190 +++++++++++++++++++++++++++++ frontend/app/store/wshclientapi.ts | 15 +++ frontend/types/gotypes.d.ts | 15 +++ pkg/jobcontroller/jobcontroller.go | 6 + pkg/jobmanager/jobmanager.go | 5 +- 
pkg/wshrpc/wshclient/wshclient.go | 18 +++ pkg/wshrpc/wshremote/wshremote.go | 51 +++++++- pkg/wshrpc/wshrpctypes.go | 12 ++ pkg/wshrpc/wshserver/wshserver.go | 19 +++ 10 files changed, 329 insertions(+), 4 deletions(-) create mode 100644 cmd/wsh/cmd/wshcmd-jobdebug.go diff --git a/cmd/server/main-server.go b/cmd/server/main-server.go index c09713b952..061af3c245 100644 --- a/cmd/server/main-server.go +++ b/cmd/server/main-server.go @@ -391,7 +391,7 @@ func createMainWshClient() { wshfs.RpcClient = rpc wshutil.DefaultRouter.RegisterTrustedLeaf(rpc, wshutil.DefaultRoute) wps.Broker.SetClient(wshutil.DefaultRouter) - localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, &wshremote.ServerImpl{Router: wshutil.DefaultRouter, RpcClient: wshclient.GetBareRpcClient()}, "conn:local") + localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, &wshremote.ServerImpl{Router: wshutil.DefaultRouter, RpcClient: wshclient.GetBareRpcClient(), IsLocal: true}, "conn:local") go wshremote.RunSysInfoLoop(localConnWsh, wshrpc.LocalConnName) wshutil.DefaultRouter.RegisterTrustedLeaf(localConnWsh, wshutil.MakeConnectionRouteId(wshrpc.LocalConnName)) } diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go new file mode 100644 index 0000000000..474a3ad985 --- /dev/null +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -0,0 +1,190 @@ +// Copyright 2025, Command Line Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package cmd + +import ( + "encoding/base64" + "encoding/json" + "fmt" + + "github.com/spf13/cobra" + "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" + "github.com/wavetermdev/waveterm/pkg/wshutil" +) + +var jobDebugCmd = &cobra.Command{ + Use: "jobdebug", + Short: "debugging commands for the job system", + Hidden: true, + PersistentPreRunE: preRunSetupRpcClient, +} + +var jobDebugListCmd = &cobra.Command{ + Use: "list", + Short: "list all jobs with debug information", + RunE: jobDebugListRun, +} + +var jobDebugDeleteCmd = &cobra.Command{ + Use: "delete", + Short: "delete a job entry by jobid", + RunE: jobDebugDeleteRun, +} + +var jobDebugTerminateCmd = &cobra.Command{ + Use: "terminate", + Short: "terminate a job manager process", + RunE: jobDebugTerminateRun, +} + +var jobDebugGetOutputCmd = &cobra.Command{ + Use: "getoutput", + Short: "get the terminal output for a job", + RunE: jobDebugGetOutputRun, +} + +var jobDebugStartCmd = &cobra.Command{ + Use: "start", + Short: "start a new job", + RunE: jobDebugStartRun, +} + +var jobIdFlag string +var jobDebugJsonFlag bool +var jobConnFlag string + +func init() { + rootCmd.AddCommand(jobDebugCmd) + jobDebugCmd.AddCommand(jobDebugListCmd) + jobDebugCmd.AddCommand(jobDebugDeleteCmd) + jobDebugCmd.AddCommand(jobDebugTerminateCmd) + jobDebugCmd.AddCommand(jobDebugGetOutputCmd) + jobDebugCmd.AddCommand(jobDebugStartCmd) + + jobDebugListCmd.Flags().BoolVar(&jobDebugJsonFlag, "json", false, "output as JSON") + + jobDebugDeleteCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to delete (required)") + jobDebugDeleteCmd.MarkFlagRequired("jobid") + + jobDebugTerminateCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to terminate (required)") + jobDebugTerminateCmd.MarkFlagRequired("jobid") + + jobDebugGetOutputCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to get output for (required)") + 
jobDebugGetOutputCmd.MarkFlagRequired("jobid") + + jobDebugStartCmd.Flags().StringVar(&jobConnFlag, "conn", "", "connection name (required)") + jobDebugStartCmd.MarkFlagRequired("conn") +} + +func jobDebugListRun(cmd *cobra.Command, args []string) error { + rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("getting job debug list: %w", err) + } + + if jobDebugJsonFlag { + jsonData, err := json.MarshalIndent(rtnData, "", " ") + if err != nil { + return fmt.Errorf("marshaling json: %w", err) + } + fmt.Printf("%s\n", string(jsonData)) + return nil + } + + fmt.Printf("%-36s %-20s %-30s %-10s %-10s %-8s %s\n", "OID", "Connection", "Cmd", "Status", "Stream", "ExitCode", "Error") + for _, job := range rtnData { + streamStatus := "-" + if job.StreamDone { + if job.StreamError == "" { + streamStatus = "EOF" + } else { + streamStatus = fmt.Sprintf("%q", job.StreamError) + } + } + + exitCode := "-" + if job.Status != "running" && job.Status != "init" { + exitCode = fmt.Sprintf("%d", job.ExitCode) + } + + errorStr := "" + if job.Error != "" { + errorStr = fmt.Sprintf("%q", job.Error) + } + + fmt.Printf("%-36s %-20s %-30s %-10s %-10s %-8s %s\n", + job.OID, job.Connection, job.Cmd, job.Status, streamStatus, exitCode, errorStr) + } + return nil +} + +func jobDebugDeleteRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobDebugDeleteCommand(RpcClient, jobIdFlag, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("deleting job: %w", err) + } + + fmt.Printf("Job %s deleted successfully\n", jobIdFlag) + return nil +} + +func jobDebugTerminateRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobManagerExitCommand(RpcClient, &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobIdFlag), + Timeout: 5000, + NoResponse: true, + }) + if err != nil { + return fmt.Errorf("terminating job manager: %w", err) + } + + fmt.Printf("Job manager for %s terminated 
successfully\n", jobIdFlag) + return nil +} + +func jobDebugGetOutputRun(cmd *cobra.Command, args []string) error { + fileData, err := wshclient.FileReadCommand(RpcClient, wshrpc.FileData{ + Info: &wshrpc.FileInfo{ + Path: fmt.Sprintf("wavefile://%s/term", jobIdFlag), + }, + }, &wshrpc.RpcOpts{Timeout: 10000}) + if err != nil { + return fmt.Errorf("reading job output: %w", err) + } + + if fileData.Data64 != "" { + decoded, err := base64.StdEncoding.DecodeString(fileData.Data64) + if err != nil { + return fmt.Errorf("decoding output data: %w", err) + } + fmt.Printf("%s", string(decoded)) + } + return nil +} + +func jobDebugStartRun(cmd *cobra.Command, args []string) error { + if len(args) == 0 { + return fmt.Errorf("no command specified after --") + } + + cmdToRun := args[0] + cmdArgs := args[1:] + + data := wshrpc.CommandJobControllerStartJobData{ + ConnName: jobConnFlag, + Cmd: cmdToRun, + Args: cmdArgs, + Env: make(map[string]string), + TermSize: nil, + } + + jobId, err := wshclient.JobControllerStartJobCommand(RpcClient, data, &wshrpc.RpcOpts{Timeout: 10000}) + if err != nil { + return fmt.Errorf("starting job: %w", err) + } + + fmt.Printf("Job started successfully with ID: %s\n", jobId) + return nil +} diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index a0603f0011..a86c12929f 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -397,6 +397,21 @@ class RpcApiType { return client.wshRpcCall("jobconnect", data, opts); } + // command "jobcontrollerstartjob" [call] + JobControllerStartJobCommand(client: WshClient, data: CommandJobControllerStartJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerstartjob", data, opts); + } + + // command "jobdebugdelete" [call] + JobDebugDeleteCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobdebugdelete", data, opts); + } + + // command "jobdebuglist" [call] + 
JobDebugListCommand(client: WshClient, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobdebuglist", null, opts); + } + // command "jobexited" [call] JobExitedCommand(client: WshClient, data: CommandJobExitedData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobexited", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 925718bb7a..f9d52d11fb 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -179,6 +179,7 @@ declare global { tosagreed?: number; hasoldhistory?: boolean; tempoid?: string; + installid?: string; }; // workspaceservice.CloseTabRtnType @@ -369,11 +370,22 @@ declare global { exiterr?: string; }; + // wshrpc.CommandJobControllerStartJobData + type CommandJobControllerStartJobData = { + connname: string; + cmd: string; + args: string[]; + env: {[key: string]: string}; + termsize?: TermSize; + }; + // wshrpc.CommandJobExitedData type CommandJobExitedData = { + jobid: string; exitcode: number; exitsignal?: string; exiterr?: string; + exitts?: number; }; // wshrpc.CommandJobTerminateData @@ -456,6 +468,7 @@ declare global { jobid: string; mainserverjwttoken: string; clientid: string; + publickeybase64: string; }; // wshrpc.CommandRemoteStreamFileData @@ -875,6 +888,8 @@ declare global { exitcode?: number; exitsignal?: string; error?: string; + streamdone?: boolean; + streamerror?: string; }; // waveobj.LayoutActionData diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 47cfcffa7b..dee6d65779 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -5,6 +5,7 @@ package jobcontroller import ( "context" + "encoding/base64" "fmt" "io" "log" @@ -16,6 +17,7 @@ import ( "github.com/wavetermdev/waveterm/pkg/remote/conncontroller" "github.com/wavetermdev/waveterm/pkg/streamclient" "github.com/wavetermdev/waveterm/pkg/util/utilfn" + "github.com/wavetermdev/waveterm/pkg/wavejwt" 
"github.com/wavetermdev/waveterm/pkg/waveobj" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" @@ -116,6 +118,9 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return "", fmt.Errorf("failed to get client: %w", err) } + publicKey := wavejwt.GetPublicKey() + publicKeyBase64 := base64.StdEncoding.EncodeToString(publicKey) + startJobData := wshrpc.CommandRemoteStartJobData{ Cmd: params.Cmd, Args: params.Args, @@ -126,6 +131,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { JobId: jobId, MainServerJwtToken: jobAccessToken, ClientId: clientId.OID, + PublicKeyBase64: publicKeyBase64, } rpcOpts := &wshrpc.RpcOpts{ diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index d73ac03cf1..04f594785a 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -11,6 +11,7 @@ import ( "path/filepath" "runtime" "sync" + "time" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -56,8 +57,10 @@ func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAu return err } fmt.Fprintf(os.Stdout, JobManagerStartLabel+"\n") + os.Stdout.Sync() + time.Sleep(200 * time.Millisecond) - err = daemonize(clientId, jobId) + // err = daemonize(clientId, jobId) if err != nil { return fmt.Errorf("failed to daemonize: %w", err) } diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index ba4a3d1ea2..ff06949c7f 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -482,6 +482,24 @@ func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opt return resp, err } +// command "jobcontrollerstartjob", wshserver.JobControllerStartJobCommand +func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerStartJobData, opts *wshrpc.RpcOpts) (string, error) { + resp, err := 
sendRpcRequestCallHelper[string](w, "jobcontrollerstartjob", data, opts) + return resp, err +} + +// command "jobdebugdelete", wshserver.JobDebugDeleteCommand +func JobDebugDeleteCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobdebugdelete", data, opts) + return err +} + +// command "jobdebuglist", wshserver.JobDebugListCommand +func JobDebugListCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]*waveobj.Job, error) { + resp, err := sendRpcRequestCallHelper[[]*waveobj.Job](w, "jobdebuglist", nil, opts) + return resp, err +} + // command "jobexited", wshserver.JobExitedCommand func JobExitedCommand(w *wshutil.WshRpc, data wshrpc.CommandJobExitedData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobexited", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index cfd7ddb816..3036c1c2de 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -38,6 +38,7 @@ type ServerImpl struct { LogWriter io.Writer Router *wshutil.WshRouter RpcClient *wshutil.WshRpc + IsLocal bool } func (*ServerImpl) WshServerImpl() {} @@ -867,16 +868,33 @@ func (*ServerImpl) DisposeSuggestionsCommand(ctx context.Context, widgetId strin return nil } +func (impl *ServerImpl) getWshPath() (string, error) { + if impl.IsLocal { + return filepath.Join(wavebase.GetWaveDataDir(), "bin", "wsh"), nil + } + wshPath, err := wavebase.ExpandHomeDir("~/.waveterm/bin/wsh") + if err != nil { + return "", fmt.Errorf("cannot expand wsh path: %w", err) + } + return wshPath, nil +} + func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + log.Printf("RemoteStartJobCommand: starting, jobid=%s, clientid=%s\n", data.JobId, data.ClientId) if impl.Router == nil { return nil, fmt.Errorf("cannot start remote job: no router available") } - wshPath, err := 
wavebase.ExpandHomeDir("~/.waveterm/bin/wsh") + + wshPath, err := impl.getWshPath() if err != nil { - return nil, fmt.Errorf("cannot expand wsh path: %w", err) + return nil, err } + log.Printf("RemoteStartJobCommand: wshPath=%s\n", wshPath) cmd := exec.Command(wshPath, "jobmanager", "--jobid", data.JobId, "--clientid", data.ClientId) + if data.PublicKeyBase64 != "" { + cmd.Env = append(os.Environ(), "WAVETERM_PUBLICKEY="+data.PublicKeyBase64) + } stdin, err := cmd.StdinPipe() if err != nil { return nil, fmt.Errorf("cannot create stdin pipe: %w", err) @@ -885,10 +903,16 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C if err != nil { return nil, fmt.Errorf("cannot create stdout pipe: %w", err) } + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stderr pipe: %w", err) + } + log.Printf("RemoteStartJobCommand: created pipes\n") if err := cmd.Start(); err != nil { return nil, fmt.Errorf("cannot start job manager: %w", err) } + log.Printf("RemoteStartJobCommand: job manager process started\n") jobAuthTokenLine := fmt.Sprintf("Wave-JobAccessToken:%s\n", data.JobAuthToken) if _, err := stdin.Write([]byte(jobAuthTokenLine)); err != nil { @@ -896,12 +920,27 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C return nil, fmt.Errorf("cannot write job auth token: %w", err) } stdin.Close() + log.Printf("RemoteStartJobCommand: wrote auth token to stdin\n") + + go func() { + scanner := bufio.NewScanner(stderr) + for scanner.Scan() { + line := scanner.Text() + log.Printf("RemoteStartJobCommand: stderr: %s\n", line) + } + if err := scanner.Err(); err != nil { + log.Printf("RemoteStartJobCommand: error reading stderr: %v\n", err) + } else { + log.Printf("RemoteStartJobCommand: stderr EOF\n") + } + }() startCh := make(chan error, 1) go func() { scanner := bufio.NewScanner(stdout) for scanner.Scan() { line := scanner.Text() + log.Printf("RemoteStartJobCommand: stdout line: 
%s\n", line) if strings.Contains(line, "Wave-JobManagerStart") { startCh <- nil return @@ -910,6 +949,7 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C if err := scanner.Err(); err != nil { startCh <- fmt.Errorf("error reading stdout: %w", err) } else { + log.Printf("RemoteStartJobCommand: stdout EOF\n") startCh <- fmt.Errorf("job manager exited without start signal") } }() @@ -917,14 +957,18 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() + log.Printf("RemoteStartJobCommand: waiting for start signal\n") select { case err := <-startCh: if err != nil { cmd.Process.Kill() + log.Printf("RemoteStartJobCommand: error from start signal: %v\n", err) return nil, err } + log.Printf("RemoteStartJobCommand: received start signal\n") case <-timeoutCtx.Done(): cmd.Process.Kill() + log.Printf("RemoteStartJobCommand: timeout waiting for start signal\n") return nil, fmt.Errorf("timeout waiting for job manager to start") } @@ -933,10 +977,13 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C }() socketPath := filepath.Join(wavebase.GetHomeDir(), ".waveterm", "jobs", data.ClientId, fmt.Sprintf("%s.sock", data.JobId)) + log.Printf("RemoteStartJobCommand: connecting to socket: %s\n", socketPath) conn, err := net.Dial("unix", socketPath) if err != nil { + log.Printf("RemoteStartJobCommand: error connecting to socket: %v\n", err) return nil, fmt.Errorf("cannot connect to job manager socket: %w", err) } + log.Printf("RemoteStartJobCommand: connected to socket\n") proxy := wshutil.MakeRpcProxy("jobmanager") go func() { diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 73e787f61a..7a477d6e99 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -165,6 +165,9 @@ type WshRpcInterface interface { JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error 
JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job manager => main server JobManagerExitCommand(ctx context.Context) error + JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) + JobDebugDeleteCommand(ctx context.Context, jobId string) error + JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) } // for frontend @@ -695,6 +698,7 @@ type CommandRemoteStartJobData struct { JobId string `json:"jobid"` MainServerJwtToken string `json:"mainserverjwttoken"` ClientId string `json:"clientid"` + PublicKeyBase64 string `json:"publickeybase64"` } type CommandStartJobRtnData struct { @@ -724,3 +728,11 @@ type CommandJobExitedData struct { ExitErr string `json:"exiterr,omitempty"` ExitTs int64 `json:"exitts,omitempty"` } + +type CommandJobControllerStartJobData struct { + ConnName string `json:"connname"` + Cmd string `json:"cmd"` + Args []string `json:"args"` + Env map[string]string `json:"env"` + TermSize *waveobj.TermSize `json:"termsize,omitempty"` +} diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index c5a53cd4f0..17fd6bf23e 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1435,3 +1435,22 @@ func (ws *WshServer) GetSecretsLinuxStorageBackendCommand(ctx context.Context) ( func (ws *WshServer) JobExitedCommand(ctx context.Context, data wshrpc.CommandJobExitedData) error { return jobcontroller.HandleJobExited(ctx, data.JobId, data) } + +func (ws *WshServer) JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) { + return wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job) +} + +func (ws *WshServer) JobDebugDeleteCommand(ctx context.Context, jobId string) error { + return wstore.DBDelete(ctx, waveobj.OType_Job, jobId) +} + +func (ws *WshServer) JobControllerStartJobCommand(ctx context.Context, data wshrpc.CommandJobControllerStartJobData) (string, error) { + params 
:= jobcontroller.StartJobParams{ + ConnName: data.ConnName, + Cmd: data.Cmd, + Args: data.Args, + Env: data.Env, + TermSize: data.TermSize, + } + return jobcontroller.StartJob(ctx, params) +} From 70273ed6acf5f31f02c8c09a3838f057a3fdfd63 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 14:15:19 -0800 Subject: [PATCH 27/64] job manager runs now, working more on control --- cmd/wsh/cmd/wshcmd-jobdebug.go | 48 ++++++++++++++++--------- frontend/app/store/wshclientapi.ts | 10 ++++++ pkg/jobcontroller/jobcontroller.go | 58 ++++++++++++++++++++++++++++-- pkg/jobmanager/jobmanager.go | 16 +++++---- pkg/wshrpc/wshclient/wshclient.go | 12 +++++++ pkg/wshrpc/wshremote/wshremote.go | 5 +-- pkg/wshrpc/wshrpctypes.go | 2 ++ pkg/wshrpc/wshserver/wshserver.go | 8 +++++ 8 files changed, 132 insertions(+), 27 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 474a3ad985..bdc80f5bd7 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -11,7 +11,6 @@ import ( "github.com/spf13/cobra" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" - "github.com/wavetermdev/waveterm/pkg/wshutil" ) var jobDebugCmd = &cobra.Command{ @@ -33,10 +32,16 @@ var jobDebugDeleteCmd = &cobra.Command{ RunE: jobDebugDeleteRun, } -var jobDebugTerminateCmd = &cobra.Command{ - Use: "terminate", - Short: "terminate a job manager process", - RunE: jobDebugTerminateRun, +var jobDebugTerminateCmdCmd = &cobra.Command{ + Use: "terminate-cmd", + Short: "terminate a command process", + RunE: jobDebugTerminateCmdRun, +} + +var jobDebugExitCmd = &cobra.Command{ + Use: "exit", + Short: "exit a job manager", + RunE: jobDebugExitRun, } var jobDebugGetOutputCmd = &cobra.Command{ @@ -54,12 +59,14 @@ var jobDebugStartCmd = &cobra.Command{ var jobIdFlag string var jobDebugJsonFlag bool var jobConnFlag string +var exitJobIdFlag string func init() { rootCmd.AddCommand(jobDebugCmd) 
jobDebugCmd.AddCommand(jobDebugListCmd) jobDebugCmd.AddCommand(jobDebugDeleteCmd) - jobDebugCmd.AddCommand(jobDebugTerminateCmd) + jobDebugCmd.AddCommand(jobDebugTerminateCmdCmd) + jobDebugCmd.AddCommand(jobDebugExitCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) jobDebugCmd.AddCommand(jobDebugStartCmd) @@ -68,8 +75,11 @@ func init() { jobDebugDeleteCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to delete (required)") jobDebugDeleteCmd.MarkFlagRequired("jobid") - jobDebugTerminateCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to terminate (required)") - jobDebugTerminateCmd.MarkFlagRequired("jobid") + jobDebugTerminateCmdCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to terminate (required)") + jobDebugTerminateCmdCmd.MarkFlagRequired("jobid") + + jobDebugExitCmd.Flags().StringVar(&exitJobIdFlag, "jobid", "", "job id to exit (required)") + jobDebugExitCmd.MarkFlagRequired("jobid") jobDebugGetOutputCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to get output for (required)") jobDebugGetOutputCmd.MarkFlagRequired("jobid") @@ -130,17 +140,23 @@ func jobDebugDeleteRun(cmd *cobra.Command, args []string) error { return nil } -func jobDebugTerminateRun(cmd *cobra.Command, args []string) error { - err := wshclient.JobManagerExitCommand(RpcClient, &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobIdFlag), - Timeout: 5000, - NoResponse: true, - }) +func jobDebugTerminateCmdRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerTerminateJobCommand(RpcClient, jobIdFlag, nil) + if err != nil { + return fmt.Errorf("terminating command: %w", err) + } + + fmt.Printf("Command for %s terminated successfully\n", jobIdFlag) + return nil +} + +func jobDebugExitRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerExitJobCommand(RpcClient, exitJobIdFlag, nil) if err != nil { - return fmt.Errorf("terminating job manager: %w", err) + return fmt.Errorf("exiting job manager: %w", err) } - fmt.Printf("Job 
manager for %s terminated successfully\n", jobIdFlag) + fmt.Printf("Job manager for %s exited successfully\n", exitJobIdFlag) return nil } diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index a86c12929f..db1e6e182e 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -397,11 +397,21 @@ class RpcApiType { return client.wshRpcCall("jobconnect", data, opts); } + // command "jobcontrollerexitjob" [call] + JobControllerExitJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerexitjob", data, opts); + } + // command "jobcontrollerstartjob" [call] JobControllerStartJobCommand(client: WshClient, data: CommandJobControllerStartJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerstartjob", data, opts); } + // command "jobcontrollerterminatejob" [call] + JobControllerTerminateJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerterminatejob", data, opts); + } + // command "jobdebugdelete" [call] JobDebugDeleteCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobdebugdelete", data, opts); diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index dee6d65779..498a4abc39 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -64,10 +64,11 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return "", fmt.Errorf("failed to generate job auth token: %w", err) } - rpcCtx := wshrpc.RpcContext{ - RouteId: wshutil.MakeJobRouteId(jobId), + jobAccessClaims := &wavejwt.WaveJwtClaims{ + MainServer: true, + JobId: jobId, } - jobAccessToken, err := wshutil.MakeClientJWTToken(rpcCtx) + jobAccessToken, err := wavejwt.Sign(jobAccessClaims) if err != nil { return "", fmt.Errorf("failed to generate job access token: %w", err) } @@ -276,3 +277,54 
@@ func tryTerminateJobManager(ctx context.Context, jobId string) { log.Printf("[job:%s] job manager exit command sent successfully", jobId) } + +func TerminateJob(ctx context.Context, jobId string) error { + _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + connRpc := wshclient.GetBareRpcClient() + if connRpc == nil { + return fmt.Errorf("main rpc client not available") + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, + } + + err = wshclient.JobTerminateCommand(connRpc, wshrpc.CommandJobTerminateData{}, rpcOpts) + if err != nil { + return fmt.Errorf("failed to send terminate command: %w", err) + } + + log.Printf("[job:%s] job terminate command sent successfully", jobId) + return nil +} + +func ExitJobManager(ctx context.Context, jobId string) error { + _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + connRpc := wshclient.GetBareRpcClient() + if connRpc == nil { + return fmt.Errorf("main rpc client not available") + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, + NoResponse: true, + } + + err = wshclient.JobManagerExitCommand(connRpc, rpcOpts) + if err != nil { + return fmt.Errorf("failed to send exit command: %w", err) + } + + log.Printf("[job:%s] job manager exit command sent successfully", jobId) + return nil +} diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index 04f594785a..cd23548969 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -60,7 +60,7 @@ func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAu os.Stdout.Sync() time.Sleep(200 * time.Millisecond) - // err = daemonize(clientId, jobId) + err = daemonize(clientId, jobId) if err != nil { return fmt.Errorf("failed to daemonize: %w", err) } @@ -156,21 +156,25 @@ func (jm *JobManager) 
disconnectFromStreamHelper(mainServerConn *MainServerConn) jm.connectedStreamClient = nil } +func GetJobSocketPath(jobId string) string { + socketDir := filepath.Join("/tmp", fmt.Sprintf("waveterm-%d", os.Getuid())) + return filepath.Join(socketDir, fmt.Sprintf("%s.sock", jobId)) +} + func GetJobFilePath(clientId string, jobId string, extension string) string { homeDir := wavebase.GetHomeDir() - socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) - return filepath.Join(socketDir, fmt.Sprintf("%s.%s", jobId, extension)) + jobDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) + return filepath.Join(jobDir, fmt.Sprintf("%s.%s", jobId, extension)) } func MakeJobDomainSocket(clientId string, jobId string) error { - homeDir := wavebase.GetHomeDir() - socketDir := filepath.Join(homeDir, ".waveterm", "jobs", clientId) + socketDir := filepath.Join("/tmp", fmt.Sprintf("waveterm-%d", os.Getuid())) err := os.MkdirAll(socketDir, 0700) if err != nil { return fmt.Errorf("failed to create socket directory: %w", err) } - socketPath := GetJobFilePath(clientId, jobId, "sock") + socketPath := GetJobSocketPath(jobId) os.Remove(socketPath) diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index ff06949c7f..ae10c5cf47 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -482,12 +482,24 @@ func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opt return resp, err } +// command "jobcontrollerexitjob", wshserver.JobControllerExitJobCommand +func JobControllerExitJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerexitjob", data, opts) + return err +} + // command "jobcontrollerstartjob", wshserver.JobControllerStartJobCommand func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerStartJobData, opts *wshrpc.RpcOpts) (string, error) { resp, err := 
sendRpcRequestCallHelper[string](w, "jobcontrollerstartjob", data, opts) return resp, err } +// command "jobcontrollerterminatejob", wshserver.JobControllerTerminateJobCommand +func JobControllerTerminateJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerterminatejob", data, opts) + return err +} + // command "jobdebugdelete", wshserver.JobDebugDeleteCommand func JobDebugDeleteCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobdebugdelete", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index 3036c1c2de..906a270e8b 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -20,6 +20,7 @@ import ( "strings" "time" + "github.com/wavetermdev/waveterm/pkg/jobmanager" "github.com/wavetermdev/waveterm/pkg/remote/connparse" "github.com/wavetermdev/waveterm/pkg/remote/fileshare/fstype" "github.com/wavetermdev/waveterm/pkg/remote/fileshare/wshfs" @@ -976,7 +977,7 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C cmd.Wait() }() - socketPath := filepath.Join(wavebase.GetHomeDir(), ".waveterm", "jobs", data.ClientId, fmt.Sprintf("%s.sock", data.JobId)) + socketPath := jobmanager.GetJobSocketPath(data.JobId) log.Printf("RemoteStartJobCommand: connecting to socket: %s\n", socketPath) conn, err := net.Dial("unix", socketPath) if err != nil { @@ -1004,7 +1005,7 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C routeId := wshutil.MakeLinkRouteId(linkId) authData := wshrpc.CommandAuthenticateToJobData{ - JobAccessToken: data.JobAuthToken, + JobAccessToken: data.MainServerJwtToken, } err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) if err != nil { diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 
7a477d6e99..529b90a773 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -168,6 +168,8 @@ type WshRpcInterface interface { JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) JobDebugDeleteCommand(ctx context.Context, jobId string) error JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) + JobControllerTerminateJobCommand(ctx context.Context, jobId string) error + JobControllerExitJobCommand(ctx context.Context, jobId string) error } // for frontend diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 17fd6bf23e..82dfff6317 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1454,3 +1454,11 @@ func (ws *WshServer) JobControllerStartJobCommand(ctx context.Context, data wshr } return jobcontroller.StartJob(ctx, params) } + +func (ws *WshServer) JobControllerTerminateJobCommand(ctx context.Context, jobId string) error { + return jobcontroller.TerminateJob(ctx, jobId) +} + +func (ws *WshServer) JobControllerExitJobCommand(ctx context.Context, jobId string) error { + return jobcontroller.ExitJobManager(ctx, jobId) +} From 8fca82f3327a38b3144de69e0f14fd189dcf79e6 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 14:23:42 -0800 Subject: [PATCH 28/64] route:up added as an event from wshrouter --- frontend/app/view/vdom/vdom-model.tsx | 2 +- pkg/wps/wpstypes.go | 3 ++- pkg/wshutil/wshrouter.go | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/frontend/app/view/vdom/vdom-model.tsx b/frontend/app/view/vdom/vdom-model.tsx index fbe556daba..40877894f2 100644 --- a/frontend/app/view/vdom/vdom-model.tsx +++ b/frontend/app/view/vdom/vdom-model.tsx @@ -162,7 +162,7 @@ export class VDomModel { this.queueUpdate(true); } this.routeGoneUnsub = waveEventSubscribe({ - eventType: "route:gone", + eventType: "route:down", scope: curBackendRoute, handler: (event: WaveEvent) => { 
this.disposed = true; diff --git a/pkg/wps/wpstypes.go b/pkg/wps/wpstypes.go index 4f16295e64..4ec65070b5 100644 --- a/pkg/wps/wpstypes.go +++ b/pkg/wps/wpstypes.go @@ -16,7 +16,8 @@ const ( Event_BlockFile = "blockfile" Event_Config = "config" Event_UserInput = "userinput" - Event_RouteGone = "route:gone" + Event_RouteDown = "route:down" + Event_RouteUp = "route:up" Event_WorkspaceUpdate = "workspace:update" Event_WaveAIRateLimit = "waveai:ratelimit" Event_WaveAppAppGoUpdated = "waveapp:appgoupdated" diff --git a/pkg/wshutil/wshrouter.go b/pkg/wshutil/wshrouter.go index 19fa40d037..310554f18d 100644 --- a/pkg/wshutil/wshrouter.go +++ b/pkg/wshutil/wshrouter.go @@ -695,6 +695,9 @@ func (router *WshRouter) bindRoute(linkId baseds.LinkId, routeId string, isSourc if !strings.HasPrefix(routeId, ControlPrefix) { router.announceUpstream(routeId) } + if router.IsRootRouter() { + router.publishRouteToBroker(routeId) + } return nil } @@ -711,12 +714,19 @@ func (router *WshRouter) getUpstreamClient() AbstractRpcClient { return lm.client } +func (router *WshRouter) publishRouteToBroker(routeId string) { + defer func() { + panichandler.PanicHandler("WshRouter:publishRouteToBroker", recover()) + }() + wps.Broker.Publish(wps.WaveEvent{Event: wps.Event_RouteUp, Scopes: []string{routeId}}) +} + func (router *WshRouter) unsubscribeFromBroker(routeId string) { defer func() { - panichandler.PanicHandler("WshRouter:unregisterRoute:routegone", recover()) + panichandler.PanicHandler("WshRouter:unregisterRoute:routedown", recover()) }() wps.Broker.UnsubscribeAll(routeId) - wps.Broker.Publish(wps.WaveEvent{Event: wps.Event_RouteGone, Scopes: []string{routeId}}) + wps.Broker.Publish(wps.WaveEvent{Event: wps.Event_RouteDown, Scopes: []string{routeId}}) } func sendControlUnauthenticatedErrorResponse(cmdMsg RpcMessage, linkMeta linkMeta) { From 02534e5d36f0031a50cee92fd54da3f5eee6866e Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 14:35:22 -0800 Subject: [PATCH 29/64] fix bare 
client rpc --- pkg/wshrpc/wshclient/barerpcclient.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/wshrpc/wshclient/barerpcclient.go b/pkg/wshrpc/wshclient/barerpcclient.go index 62d1f27ea7..00d404cff1 100644 --- a/pkg/wshrpc/wshclient/barerpcclient.go +++ b/pkg/wshrpc/wshclient/barerpcclient.go @@ -4,8 +4,10 @@ package wshclient import ( + "fmt" "sync" + "github.com/google/uuid" "github.com/wavetermdev/waveterm/pkg/wps" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshutil" @@ -17,20 +19,14 @@ func (*WshServer) WshServerImpl() {} var WshServerImpl = WshServer{} -const ( - DefaultOutputChSize = 32 - DefaultInputChSize = 32 -) - var waveSrvClient_Singleton *wshutil.WshRpc var waveSrvClient_Once = &sync.Once{} -const BareClientRoute = "bare" - func GetBareRpcClient() *wshutil.WshRpc { waveSrvClient_Once.Do(func() { waveSrvClient_Singleton = wshutil.MakeWshRpc(wshrpc.RpcContext{}, &WshServerImpl, "bare-client") - wshutil.DefaultRouter.RegisterTrustedLeaf(waveSrvClient_Singleton, BareClientRoute) + bareClientRoute := fmt.Sprintf("bare:%s", uuid.New().String()) + wshutil.DefaultRouter.RegisterTrustedLeaf(waveSrvClient_Singleton, bareClientRoute) wps.Broker.SetClient(wshutil.DefaultRouter) }) return waveSrvClient_Singleton From c4c2168c1fde5c5ab7e86476e76e928883cfdddd Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 17:16:08 -0800 Subject: [PATCH 30/64] debugging, lots of new logging, but finally capturing output from jobs in the main server --- cmd/server/main-server.go | 2 + cmd/wsh/cmd/wshcmd-jobmanager.go | 7 +- pkg/jobcontroller/jobcontroller.go | 126 +++++++--- pkg/jobmanager/jobmanager.go | 8 +- pkg/jobmanager/jobmanager_unix.go | 3 +- pkg/jobmanager/mainserverconn.go | 17 ++ pkg/jobmanager/streammanager.go | 263 +++++++++----------- pkg/remote/conncontroller/conncontroller.go | 15 ++ pkg/wshrpc/wshclient/barerpcclient.go | 10 +- pkg/wshrpc/wshremote/wshremote.go | 29 ++- 10 files 
changed, 298 insertions(+), 182 deletions(-) diff --git a/cmd/server/main-server.go b/cmd/server/main-server.go index 061af3c245..410e1fd63b 100644 --- a/cmd/server/main-server.go +++ b/cmd/server/main-server.go @@ -20,6 +20,7 @@ import ( "github.com/wavetermdev/waveterm/pkg/blocklogger" "github.com/wavetermdev/waveterm/pkg/filebackup" "github.com/wavetermdev/waveterm/pkg/filestore" + "github.com/wavetermdev/waveterm/pkg/jobcontroller" "github.com/wavetermdev/waveterm/pkg/panichandler" "github.com/wavetermdev/waveterm/pkg/remote/conncontroller" "github.com/wavetermdev/waveterm/pkg/remote/fileshare/wshfs" @@ -572,6 +573,7 @@ func main() { go backupCleanupLoop() go startupActivityUpdate(firstLaunch) // must be after startConfigWatcher() blocklogger.InitBlockLogger() + jobcontroller.InitJobController() go func() { defer func() { panichandler.PanicHandler("GetSystemSummary", recover()) diff --git a/cmd/wsh/cmd/wshcmd-jobmanager.go b/cmd/wsh/cmd/wshcmd-jobmanager.go index ac3959ba83..8ada93dc4d 100644 --- a/cmd/wsh/cmd/wshcmd-jobmanager.go +++ b/cmd/wsh/cmd/wshcmd-jobmanager.go @@ -65,7 +65,12 @@ func jobManagerRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("failed to read job auth token: %v", err) } - err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes, jobAuthToken) + readyFile := os.NewFile(3, "ready-pipe") + if readyFile == nil { + return fmt.Errorf("ready pipe (fd 3) not available") + } + + err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes, jobAuthToken, readyFile) if err != nil { return fmt.Errorf("error setting up job manager: %v", err) } diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 498a4abc39..d535e7e13e 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -9,6 +9,8 @@ import ( "fmt" "io" "log" + "strings" + "sync" "time" "github.com/google/uuid" @@ -19,6 +21,7 @@ import ( 
"github.com/wavetermdev/waveterm/pkg/util/utilfn" "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/waveobj" + "github.com/wavetermdev/waveterm/pkg/wps" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" @@ -32,8 +35,59 @@ const ( JobStatus_Error = "error" ) +const ( + JobConnStatus_Disconnected = "disconnected" + JobConnStatus_Connecting = "connecting" + JobConnStatus_Connected = "connected" +) + const DefaultStreamRwnd = 64 * 1024 +var ( + jobConnStates = make(map[string]string) + jobConnStatesLock sync.Mutex +) + +func InitJobController() { + rpcClient := wshclient.GetBareRpcClient() + rpcClient.EventListener.On(wps.Event_RouteUp, handleRouteUpEvent) + rpcClient.EventListener.On(wps.Event_RouteDown, handleRouteDownEvent) +} + +func handleRouteUpEvent(event *wps.WaveEvent) { + handleRouteEvent(event, JobConnStatus_Connected) +} + +func handleRouteDownEvent(event *wps.WaveEvent) { + handleRouteEvent(event, JobConnStatus_Disconnected) +} + +func handleRouteEvent(event *wps.WaveEvent, newStatus string) { + for _, scope := range event.Scopes { + if strings.HasPrefix(scope, "job:") { + jobId := strings.TrimPrefix(scope, "job:") + SetJobConnStatus(jobId, newStatus) + log.Printf("[job:%s] connection status changed to %s", jobId, newStatus) + } + } +} + +func GetJobConnStatus(jobId string) string { + jobConnStatesLock.Lock() + defer jobConnStatesLock.Unlock() + status, exists := jobConnStates[jobId] + if !exists { + return JobConnStatus_Disconnected + } + return status +} + +func SetJobConnStatus(jobId string, status string) { + jobConnStatesLock.Lock() + defer jobConnStatesLock.Unlock() + jobConnStates[jobId] = status +} + type StartJobParams struct { ConnName string Cmd string @@ -53,9 +107,12 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { params.TermSize = &waveobj.TermSize{Rows: 24, Cols: 80} } - err := 
conncontroller.EnsureConnection(ctx, params.ConnName) + isConnected, err := conncontroller.IsConnected(params.ConnName) if err != nil { - return "", fmt.Errorf("failed to ensure connection: %w", err) + return "", fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return "", fmt.Errorf("connection %q is not connected", params.ConnName) } jobId := uuid.New().String() @@ -91,18 +148,14 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return "", fmt.Errorf("failed to create job in database: %w", err) } - connRpc := wshclient.GetBareRpcClient() - if connRpc == nil { + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { return "", fmt.Errorf("main rpc client not available") } - broker := connRpc.StreamBroker - if broker == nil { - return "", fmt.Errorf("stream broker not available") - } - - readerRouteId := wshutil.MakeJobRouteId(jobId) - writerRouteId := wshutil.MakeConnectionRouteId(params.ConnName) + broker := bareRpc.StreamBroker + readerRouteId := wshclient.GetBareRpcClientRouteId() + writerRouteId := wshutil.MakeJobRouteId(jobId) reader, streamMeta := broker.CreateStreamReader(readerRouteId, writerRouteId, DefaultStreamRwnd) fileOpts := wshrpc.FileOpts{ @@ -140,8 +193,10 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { Timeout: 30000, } - rtnData, err := wshclient.RemoteStartJobCommand(connRpc, startJobData, rpcOpts) + log.Printf("[job:%s] sending RemoteStartJobCommand to connection %s", jobId, params.ConnName) + rtnData, err := wshclient.RemoteStartJobCommand(bareRpc, startJobData, rpcOpts) if err != nil { + log.Printf("[job:%s] RemoteStartJobCommand failed: %v", jobId, err) wstore.DBUpdate(ctx, &waveobj.Job{ OID: jobId, Status: JobStatus_Error, @@ -150,11 +205,14 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return "", fmt.Errorf("failed to start remote job: %w", err) } + log.Printf("[job:%s] RemoteStartJobCommand succeeded, 
pgid=%d", jobId, rtnData.Pgid) job.Pgid = rtnData.Pgid job.Status = JobStatus_Running err = wstore.DBUpdate(ctx, job) if err != nil { - log.Printf("warning: failed to update job status to running: %v", err) + log.Printf("[job:%s] warning: failed to update job status to running: %v", jobId, err) + } else { + log.Printf("[job:%s] job status updated to running", jobId) } go func() { @@ -172,13 +230,17 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade log.Printf("[job:%s] output loop finished", jobId) }() + log.Printf("[job:%s] output loop started", jobId) buf := make([]byte, 4096) for { n, err := reader.Read(buf) if n > 0 { + log.Printf("[job:%s] received %d bytes of data", jobId, n) appendErr := filestore.WFS.AppendData(ctx, jobId, "term", buf[:n]) if appendErr != nil { log.Printf("[job:%s] error appending data to WaveFS: %v", jobId, appendErr) + } else { + log.Printf("[job:%s] successfully appended %d bytes to WaveFS", jobId, n) } } @@ -257,19 +319,18 @@ func tryTerminateJobManager(ctx context.Context, jobId string) { log.Printf("[job:%s] both job exited and stream finished, terminating job manager", jobId) - connRpc := wshclient.GetBareRpcClient() - if connRpc == nil { + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { log.Printf("[job:%s] error terminating job manager: rpc client not available", jobId) return } rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobId), - Timeout: 5000, - NoResponse: true, + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, } - err = wshclient.JobManagerExitCommand(connRpc, rpcOpts) + err = wshclient.JobManagerExitCommand(bareRpc, rpcOpts) if err != nil { log.Printf("[job:%s] error sending job manager exit command: %v", jobId, err) return @@ -284,8 +345,13 @@ func TerminateJob(ctx context.Context, jobId string) error { return fmt.Errorf("failed to get job: %w", err) } - connRpc := wshclient.GetBareRpcClient() - if connRpc == nil { + jobConnStatus := 
GetJobConnStatus(jobId) + if jobConnStatus != JobConnStatus_Connected { + return fmt.Errorf("job connection is not connected (status: %s)", jobConnStatus) + } + + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { return fmt.Errorf("main rpc client not available") } @@ -294,7 +360,7 @@ func TerminateJob(ctx context.Context, jobId string) error { Timeout: 5000, } - err = wshclient.JobTerminateCommand(connRpc, wshrpc.CommandJobTerminateData{}, rpcOpts) + err = wshclient.JobTerminateCommand(bareRpc, wshrpc.CommandJobTerminateData{}, rpcOpts) if err != nil { return fmt.Errorf("failed to send terminate command: %w", err) } @@ -309,18 +375,22 @@ func ExitJobManager(ctx context.Context, jobId string) error { return fmt.Errorf("failed to get job: %w", err) } - connRpc := wshclient.GetBareRpcClient() - if connRpc == nil { + jobConnStatus := GetJobConnStatus(jobId) + if jobConnStatus != JobConnStatus_Connected { + return fmt.Errorf("job connection is not connected (status: %s)", jobConnStatus) + } + + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { return fmt.Errorf("main rpc client not available") } rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobId), - Timeout: 5000, - NoResponse: true, + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, } - err = wshclient.JobManagerExitCommand(connRpc, rpcOpts) + err = wshclient.JobManagerExitCommand(bareRpc, rpcOpts) if err != nil { return fmt.Errorf("failed to send exit command: %w", err) } diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index cd23548969..fb1516cec8 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -11,7 +11,6 @@ import ( "path/filepath" "runtime" "sync" - "time" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/panichandler" @@ -39,7 +38,7 @@ type JobManager struct { connectedStreamClient *MainServerConn } -func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, 
jobAuthToken string) error { +func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAuthToken string, readyFile *os.File) error { if runtime.GOOS != "linux" && runtime.GOOS != "darwin" { return fmt.Errorf("job manager only supported on unix systems, not %s", runtime.GOOS) } @@ -56,9 +55,8 @@ func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAu if err != nil { return err } - fmt.Fprintf(os.Stdout, JobManagerStartLabel+"\n") - os.Stdout.Sync() - time.Sleep(200 * time.Millisecond) + fmt.Fprintf(readyFile, JobManagerStartLabel+"\n") + readyFile.Close() err = daemonize(clientId, jobId) if err != nil { diff --git a/pkg/jobmanager/jobmanager_unix.go b/pkg/jobmanager/jobmanager_unix.go index 8db1965e3d..8b8fa1fcb1 100644 --- a/pkg/jobmanager/jobmanager_unix.go +++ b/pkg/jobmanager/jobmanager_unix.go @@ -77,9 +77,8 @@ func daemonize(clientId string, jobId string) error { if err != nil { return fmt.Errorf("failed to dup2 stderr: %w", err) } - logFile.Close() - log.SetOutput(os.Stdout) + log.SetOutput(logFile) log.Printf("job manager daemonized, logging to %s\n", logPath) setupJobManagerSignalHandlers() diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index ee78e685c8..7989759031 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -44,6 +44,8 @@ type routedDataSender struct { } func (rds *routedDataSender) SendData(dataPk wshrpc.CommandStreamData) { + log.Printf("SendData: sending seq=%d, len=%d, eof=%t, error=%s, route=%s", + dataPk.Seq, len(dataPk.Data64), dataPk.Eof, dataPk.Error, rds.route) err := wshclient.StreamDataCommand(rds.wshRpc, dataPk, &wshrpc.RpcOpts{NoResponse: true, Route: rds.route}) if err != nil { log.Printf("SendData: error sending stream data: %v\n", err) @@ -103,10 +105,13 @@ func (msc *MainServerConn) AuthenticateToJobManagerCommand(ctx context.Context, } func (msc *MainServerConn) StartJobCommand(ctx context.Context, data 
wshrpc.CommandStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + log.Printf("StartJobCommand: received command=%s args=%v", data.Cmd, data.Args) if !msc.PeerAuthenticated.Load() { + log.Printf("StartJobCommand: not authenticated") return nil, fmt.Errorf("not authenticated") } if WshCmdJobManager.IsJobStarted() { + log.Printf("StartJobCommand: job already started") return nil, fmt.Errorf("job already started") } @@ -114,6 +119,7 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm defer WshCmdJobManager.lock.Unlock() if WshCmdJobManager.Cmd != nil { + log.Printf("StartJobCommand: job already started (double check)") return nil, fmt.Errorf("job already started") } @@ -123,11 +129,14 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm Env: data.Env, TermSize: data.TermSize, } + log.Printf("StartJobCommand: creating job cmd for jobid=%s", WshCmdJobManager.JobId) jobCmd, err := MakeJobCmd(WshCmdJobManager.JobId, cmdDef) if err != nil { + log.Printf("StartJobCommand: failed to make job cmd: %v", err) return nil, fmt.Errorf("failed to start job: %w", err) } WshCmdJobManager.Cmd = jobCmd + log.Printf("StartJobCommand: job cmd created successfully") if data.StreamMeta != nil { serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, *data.StreamMeta, 0) @@ -139,20 +148,28 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm _, cmdPty := jobCmd.GetCmd() if cmdPty != nil { + log.Printf("StartJobCommand: attaching pty reader to stream manager") err = WshCmdJobManager.StreamManager.AttachReader(cmdPty) if err != nil { + log.Printf("StartJobCommand: failed to attach reader: %v", err) return nil, fmt.Errorf("failed to attach reader to stream manager: %w", err) } + log.Printf("StartJobCommand: pty reader attached successfully") + } else { + log.Printf("StartJobCommand: no pty to attach") } cmd, _ := jobCmd.GetCmd() if cmd == nil || cmd.Process == nil { + 
log.Printf("StartJobCommand: cmd or process is nil") return nil, fmt.Errorf("cmd or process is nil") } pgid, err := getProcessGroupId(cmd.Process.Pid) if err != nil { + log.Printf("StartJobCommand: failed to get pgid: %v", err) return nil, fmt.Errorf("failed to get process group id: %w", err) } + log.Printf("StartJobCommand: job started successfully pid=%d pgid=%d", cmd.Process.Pid, pgid) return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil } diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index 0edfc802bb..79ea6084db 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "fmt" "io" + "log" "sync" "github.com/wavetermdev/waveterm/pkg/wshrpc" @@ -30,28 +31,31 @@ type streamTerminalEvent struct { // StreamManager handles PTY output buffering with ACK-based flow control type StreamManager struct { - lock sync.Mutex + lock sync.Mutex + drainCond *sync.Cond streamId string - buf *CirBuf - terminalEvent *streamTerminalEvent - terminalEventSent bool - terminalEventAcked bool + // this is the data read from the attached reader + buf *CirBuf + terminalEvent *streamTerminalEvent + eofPos int64 // fixed position when EOF/error occurs (-1 if not yet) - reader io.Reader - readerWg sync.WaitGroup + reader io.Reader + cwndSize int + rwndSize int + // invariant: if connected is true, dataSender is non-nil + connected bool dataSender DataSender - cwndSize int - rwndSize int - connected bool - drained bool + // unacked state (reset on disconnect) + sentNotAcked int64 + terminalEventSent bool - sentNotAcked int64 - drainCond *sync.Cond - closed bool + // terminal state - once true, stream is complete + terminalEventAcked bool + closed bool } func MakeStreamManager() *StreamManager { @@ -60,10 +64,10 @@ func MakeStreamManager() *StreamManager { func MakeStreamManagerWithSizes(cwndSize, cirbufSize int) *StreamManager { sm := &StreamManager{ - buf: MakeCirBuf(cirbufSize, true), - 
cwndSize: cwndSize, - rwndSize: cwndSize, - sentNotAcked: 0, + buf: MakeCirBuf(cirbufSize, true), + eofPos: -1, + cwndSize: cwndSize, + rwndSize: cwndSize, } sm.drainCond = sync.NewCond(&sm.lock) go sm.senderLoop() @@ -80,8 +84,6 @@ func (sm *StreamManager) AttachReader(r io.Reader) error { } sm.reader = r - - sm.readerWg.Add(1) go sm.readLoop() return nil @@ -92,6 +94,10 @@ func (sm *StreamManager) ClientConnected(streamId string, dataSender DataSender, sm.lock.Lock() defer sm.lock.Unlock() + if sm.closed || sm.terminalEventAcked { + return 0, fmt.Errorf("stream is closed") + } + if sm.connected { return 0, fmt.Errorf("client already connected") } @@ -118,12 +124,8 @@ func (sm *StreamManager) ClientConnected(streamId string, dataSender DataSender, sm.streamId = streamId sm.dataSender = dataSender sm.connected = true - sm.drained = false sm.rwndSize = rwndSize sm.sentNotAcked = 0 - if !sm.terminalEventAcked { - sm.terminalEventSent = false - } effectiveWindow := sm.cwndSize if sm.rwndSize < effectiveWindow { effectiveWindow = sm.rwndSize @@ -150,9 +152,12 @@ func (sm *StreamManager) ClientDisconnected() { sm.connected = false sm.dataSender = nil - sm.drainCond.Signal() sm.sentNotAcked = 0 + if !sm.terminalEventAcked { + sm.terminalEventSent = false + } sm.buf.SetEffectiveWindow(false, CirBufSize) + sm.drainCond.Signal() } // RecvAck processes an ACK from the client @@ -167,6 +172,8 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { if ackPk.Fin { sm.terminalEventAcked = true + sm.drainCond.Signal() + return } seq := ackPk.Seq @@ -176,25 +183,15 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { } ackedBytes := seq - headPos - available := sm.buf.Size() - - maxAckable := int64(available) + sm.sentNotAcked - if ackedBytes > maxAckable { + if ackedBytes > sm.sentNotAcked { return } if ackedBytes > 0 { - consumeFromBuf := int(ackedBytes) - if consumeFromBuf > available { - consumeFromBuf = available - } - if err := 
sm.buf.Consume(consumeFromBuf); err != nil { + if err := sm.buf.Consume(int(ackedBytes)); err != nil { return } sm.sentNotAcked -= ackedBytes - if sm.sentNotAcked < 0 { - sm.sentNotAcked = 0 - } } prevRwnd := sm.rwndSize @@ -208,47 +205,34 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { if sm.rwndSize > prevRwnd || ackedBytes > 0 { sm.drainCond.Signal() } - - if sm.terminalEvent != nil && !sm.terminalEventSent && sm.buf.Size() == 0 && sm.sentNotAcked == 0 { - sm.sendTerminalEvent_withlock() - } } -// Close shuts down the sender loop and waits for the reader to finish +// Close shuts down the sender loop. The reader loop will exit on its next iteration +// or when the underlying reader is closed. func (sm *StreamManager) Close() { sm.lock.Lock() + defer sm.lock.Unlock() sm.closed = true sm.drainCond.Signal() - sm.lock.Unlock() - - sm.readerWg.Wait() } // readLoop is the main read goroutine func (sm *StreamManager) readLoop() { - defer sm.readerWg.Done() - + readBuf := make([]byte, MaxPacketSize) for { sm.lock.Lock() - if sm.terminalEvent != nil { - sm.lock.Unlock() - return - } - - isConnected := sm.connected && sm.drained + closed := sm.closed sm.lock.Unlock() - var readBuf []byte - if isConnected { - readBuf = make([]byte, 32*1024) - } else { - readBuf = make([]byte, DisconnReadSz) + if closed { + return } n, err := sm.reader.Read(readBuf) + log.Printf("readLoop: read %d bytes from PTY, err=%v", n, err) if n > 0 { - sm.handleReadData(readBuf[:n], isConnected) + sm.handleReadData(readBuf[:n]) } if err != nil { @@ -262,10 +246,14 @@ func (sm *StreamManager) readLoop() { } } -func (sm *StreamManager) handleReadData(data []byte, isConnected bool) { +func (sm *StreamManager) handleReadData(data []byte) { + log.Printf("handleReadData: writing %d bytes to buffer", len(data)) sm.buf.Write(data) - if isConnected { - sm.sendBufferData() + sm.lock.Lock() + defer sm.lock.Unlock() + log.Printf("handleReadData: buffer size=%d, connected=%t, 
signaling=%t", sm.buf.Size(), sm.connected, sm.connected) + if sm.connected { + sm.drainCond.Signal() } } @@ -273,115 +261,110 @@ func (sm *StreamManager) handleEOF() { sm.lock.Lock() defer sm.lock.Unlock() + log.Printf("handleEOF: PTY reached EOF, totalSize=%d", sm.buf.TotalSize()) + sm.eofPos = sm.buf.TotalSize() sm.terminalEvent = &streamTerminalEvent{isEof: true} - - if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { - sm.sendTerminalEvent_withlock() - } + sm.drainCond.Signal() } func (sm *StreamManager) handleError(err error) { sm.lock.Lock() defer sm.lock.Unlock() + log.Printf("handleError: PTY error=%v, totalSize=%d", err, sm.buf.TotalSize()) + sm.eofPos = sm.buf.TotalSize() sm.terminalEvent = &streamTerminalEvent{err: err.Error()} - - if sm.buf.Size() == 0 && sm.sentNotAcked == 0 && sm.connected && sm.drained { - sm.sendTerminalEvent_withlock() - } + sm.drainCond.Signal() } func (sm *StreamManager) senderLoop() { for { - sm.lock.Lock() - - if sm.closed { - sm.lock.Unlock() + done, pkt, sender := sm.prepareNextPacket() + if done { return } - - if !sm.connected { - sm.drainCond.Wait() - sm.lock.Unlock() + if pkt == nil { continue } + sender.SendData(*pkt) + } +} - available := sm.buf.Size() - if available == 0 { - sm.drained = true - if sm.terminalEvent != nil && !sm.terminalEventSent && sm.sentNotAcked == 0 { - sm.sendTerminalEvent_withlock() - } - sm.drainCond.Wait() - sm.lock.Unlock() - continue - } +func (sm *StreamManager) prepareNextPacket() (done bool, pkt *wshrpc.CommandStreamData, sender DataSender) { + sm.lock.Lock() + defer sm.lock.Unlock() - effectiveRwnd := sm.rwndSize - if sm.cwndSize < effectiveRwnd { - effectiveRwnd = sm.cwndSize - } - availableToSend := int64(effectiveRwnd) - sm.sentNotAcked + available := sm.buf.Size() + log.Printf("prepareNextPacket: connected=%t, available=%d, closed=%t, terminalEventAcked=%t, terminalEvent=%v", + sm.connected, available, sm.closed, sm.terminalEventAcked, sm.terminalEvent != nil) 
- if availableToSend <= 0 { - sm.drainCond.Wait() - sm.lock.Unlock() - continue - } + if sm.closed || sm.terminalEventAcked { + return true, nil, nil + } - peekSize := int(availableToSend) - if peekSize > MaxPacketSize { - peekSize = MaxPacketSize - } - if peekSize > available { - peekSize = available - } + if !sm.connected { + log.Printf("prepareNextPacket: waiting for connection") + sm.drainCond.Wait() + return false, nil, nil + } - data := make([]byte, peekSize) - n := sm.buf.PeekDataAt(int(sm.sentNotAcked), data) - if n == 0 { - sm.lock.Unlock() - continue + if available == 0 { + if sm.terminalEvent != nil && !sm.terminalEventSent { + log.Printf("prepareNextPacket: preparing terminal packet") + return false, sm.prepareTerminalPacket(), sm.dataSender } - data = data[:n] - - seq := sm.buf.HeadPos() + sm.sentNotAcked - sm.sentNotAcked += int64(n) - dataSender := sm.dataSender - sm.lock.Unlock() + log.Printf("prepareNextPacket: no data available, waiting") + sm.drainCond.Wait() + return false, nil, nil + } - if dataSender == nil { - continue - } + effectiveRwnd := sm.rwndSize + if sm.cwndSize < effectiveRwnd { + effectiveRwnd = sm.cwndSize + } + availableToSend := int64(effectiveRwnd) - sm.sentNotAcked - pkt := wshrpc.CommandStreamData{ - Id: sm.streamId, - Seq: seq, - Data64: base64.StdEncoding.EncodeToString(data), - } - dataSender.SendData(pkt) + if availableToSend <= 0 { + sm.drainCond.Wait() + return false, nil, nil } -} -func (sm *StreamManager) sendBufferData() { - sm.lock.Lock() - defer sm.lock.Unlock() - sm.drainCond.Signal() -} + peekSize := int(availableToSend) + if peekSize > MaxPacketSize { + peekSize = MaxPacketSize + } + if peekSize > available { + peekSize = available + } -func (sm *StreamManager) sendTerminalEvent_withlock() { - if sm.terminalEventSent { - return + data := make([]byte, peekSize) + n := sm.buf.PeekDataAt(int(sm.sentNotAcked), data) + if n == 0 { + log.Printf("prepareNextPacket: PeekDataAt returned 0 bytes, waiting for ACK") + 
sm.drainCond.Wait() + return false, nil, nil } + data = data[:n] - if sm.dataSender == nil { - return + seq := sm.buf.HeadPos() + sm.sentNotAcked + sm.sentNotAcked += int64(n) + + log.Printf("prepareNextPacket: sending packet seq=%d, len=%d bytes", seq, n) + return false, &wshrpc.CommandStreamData{ + Id: sm.streamId, + Seq: seq, + Data64: base64.StdEncoding.EncodeToString(data), + }, sm.dataSender +} + +func (sm *StreamManager) prepareTerminalPacket() *wshrpc.CommandStreamData { + if sm.terminalEventSent || sm.terminalEvent == nil { + return nil } - seq := sm.buf.HeadPos() - pkt := wshrpc.CommandStreamData{ + pkt := &wshrpc.CommandStreamData{ Id: sm.streamId, - Seq: seq, + Seq: sm.eofPos, } if sm.terminalEvent.isEof { @@ -391,5 +374,5 @@ func (sm *StreamManager) sendTerminalEvent_withlock() { } sm.terminalEventSent = true - sm.dataSender.SendData(pkt) + return pkt } diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 7a3ff6e546..0cc5ecaeb7 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -873,6 +873,21 @@ func GetConn(opts *remote.SSHOpts) *SSHConn { return conn } +func IsConnected(connName string) (bool, error) { + if IsLocalConnName(connName) { + return true, nil + } + connOpts, err := remote.ParseOpts(connName) + if err != nil { + return false, fmt.Errorf("error parsing connection name: %w", err) + } + conn := GetConn(connOpts) + if conn == nil { + return false, nil + } + return conn.GetStatus() == Status_Connected, nil +} + // Convenience function for ensuring a connection is established func EnsureConnection(ctx context.Context, connName string) error { if IsLocalConnName(connName) { diff --git a/pkg/wshrpc/wshclient/barerpcclient.go b/pkg/wshrpc/wshclient/barerpcclient.go index 00d404cff1..4a4d17dd24 100644 --- a/pkg/wshrpc/wshclient/barerpcclient.go +++ b/pkg/wshrpc/wshclient/barerpcclient.go @@ -21,13 +21,19 @@ var WshServerImpl = 
WshServer{} var waveSrvClient_Singleton *wshutil.WshRpc var waveSrvClient_Once = &sync.Once{} +var waveSrvClient_RouteId string func GetBareRpcClient() *wshutil.WshRpc { waveSrvClient_Once.Do(func() { waveSrvClient_Singleton = wshutil.MakeWshRpc(wshrpc.RpcContext{}, &WshServerImpl, "bare-client") - bareClientRoute := fmt.Sprintf("bare:%s", uuid.New().String()) - wshutil.DefaultRouter.RegisterTrustedLeaf(waveSrvClient_Singleton, bareClientRoute) + waveSrvClient_RouteId = fmt.Sprintf("bare:%s", uuid.New().String()) + wshutil.DefaultRouter.RegisterTrustedLeaf(waveSrvClient_Singleton, waveSrvClient_RouteId) wps.Broker.SetClient(wshutil.DefaultRouter) }) return waveSrvClient_Singleton } + +func GetBareRpcClientRouteId() string { + GetBareRpcClient() + return waveSrvClient_RouteId +} diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index 906a270e8b..0b44edaeb5 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -892,10 +892,18 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C } log.Printf("RemoteStartJobCommand: wshPath=%s\n", wshPath) + readyPipeRead, readyPipeWrite, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("cannot create ready pipe: %w", err) + } + defer readyPipeRead.Close() + defer readyPipeWrite.Close() + cmd := exec.Command(wshPath, "jobmanager", "--jobid", data.JobId, "--clientid", data.ClientId) if data.PublicKeyBase64 != "" { cmd.Env = append(os.Environ(), "WAVETERM_PUBLICKEY="+data.PublicKeyBase64) } + cmd.ExtraFiles = []*os.File{readyPipeWrite} stdin, err := cmd.StdinPipe() if err != nil { return nil, fmt.Errorf("cannot create stdin pipe: %w", err) @@ -936,21 +944,34 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C } }() - startCh := make(chan error, 1) go func() { scanner := bufio.NewScanner(stdout) for scanner.Scan() { line := scanner.Text() - log.Printf("RemoteStartJobCommand: stdout line: %s\n", 
line) + log.Printf("RemoteStartJobCommand: stdout: %s\n", line) + } + if err := scanner.Err(); err != nil { + log.Printf("RemoteStartJobCommand: error reading stdout: %v\n", err) + } else { + log.Printf("RemoteStartJobCommand: stdout EOF\n") + } + }() + + startCh := make(chan error, 1) + go func() { + scanner := bufio.NewScanner(readyPipeRead) + for scanner.Scan() { + line := scanner.Text() + log.Printf("RemoteStartJobCommand: ready pipe line: %s\n", line) if strings.Contains(line, "Wave-JobManagerStart") { startCh <- nil return } } if err := scanner.Err(); err != nil { - startCh <- fmt.Errorf("error reading stdout: %w", err) + startCh <- fmt.Errorf("error reading ready pipe: %w", err) } else { - log.Printf("RemoteStartJobCommand: stdout EOF\n") + log.Printf("RemoteStartJobCommand: ready pipe EOF\n") startCh <- fmt.Errorf("job manager exited without start signal") } }() From 09c4e6982d7650f330882cc03d0baf0bd14e96e9 Mon Sep 17 00:00:00 2001 From: sawka Date: Thu, 15 Jan 2026 19:05:29 -0800 Subject: [PATCH 31/64] mostly working start/getoutput with data streaming --- cmd/wsh/cmd/wshcmd-jobdebug.go | 32 +++++++++++++++ pkg/jobcontroller/jobcontroller.go | 63 +++++++++++++----------------- pkg/jobmanager/jobmanager.go | 5 +++ pkg/jobmanager/mainserverconn.go | 20 +++++----- pkg/jobmanager/streammanager.go | 7 ++++ pkg/streamclient/streambroker.go | 8 ++++ pkg/wstore/wstore_dbops.go | 11 ++++++ 7 files changed, 100 insertions(+), 46 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index bdc80f5bd7..38d779b483 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -38,6 +38,12 @@ var jobDebugTerminateCmdCmd = &cobra.Command{ RunE: jobDebugTerminateCmdRun, } +var jobDebugDeleteAllCmd = &cobra.Command{ + Use: "deleteall", + Short: "delete all jobs", + RunE: jobDebugDeleteAllRun, +} + var jobDebugExitCmd = &cobra.Command{ Use: "exit", Short: "exit a job manager", @@ -65,6 +71,7 @@ func init() { 
rootCmd.AddCommand(jobDebugCmd) jobDebugCmd.AddCommand(jobDebugListCmd) jobDebugCmd.AddCommand(jobDebugDeleteCmd) + jobDebugCmd.AddCommand(jobDebugDeleteAllCmd) jobDebugCmd.AddCommand(jobDebugTerminateCmdCmd) jobDebugCmd.AddCommand(jobDebugExitCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) @@ -140,6 +147,31 @@ func jobDebugDeleteRun(cmd *cobra.Command, args []string) error { return nil } +func jobDebugDeleteAllRun(cmd *cobra.Command, args []string) error { + rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("getting job debug list: %w", err) + } + + if len(rtnData) == 0 { + fmt.Printf("No jobs to delete\n") + return nil + } + + deletedCount := 0 + for _, job := range rtnData { + err := wshclient.JobDebugDeleteCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + fmt.Printf("Error deleting job %s: %v\n", job.OID, err) + } else { + deletedCount++ + } + } + + fmt.Printf("Deleted %d of %d job(s)\n", deletedCount, len(rtnData)) + return nil +} + func jobDebugTerminateCmdRun(cmd *cobra.Command, args []string) error { err := wshclient.JobControllerTerminateJobCommand(RpcClient, jobIdFlag, nil) if err != nil { diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index d535e7e13e..05b0793bd6 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -197,18 +197,19 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { rtnData, err := wshclient.RemoteStartJobCommand(bareRpc, startJobData, rpcOpts) if err != nil { log.Printf("[job:%s] RemoteStartJobCommand failed: %v", jobId, err) - wstore.DBUpdate(ctx, &waveobj.Job{ - OID: jobId, - Status: JobStatus_Error, - Error: fmt.Sprintf("failed to start job: %v", err), + errMsg := fmt.Sprintf("failed to start job: %v", err) + wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.Status = JobStatus_Error + job.Error = errMsg }) return 
"", fmt.Errorf("failed to start remote job: %w", err) } log.Printf("[job:%s] RemoteStartJobCommand succeeded, pgid=%d", jobId, rtnData.Pgid) - job.Pgid = rtnData.Pgid - job.Status = JobStatus_Running - err = wstore.DBUpdate(ctx, job) + err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.Pgid = rtnData.Pgid + job.Status = JobStatus_Running + }) if err != nil { log.Printf("[job:%s] warning: failed to update job status to running: %v", jobId, err) } else { @@ -246,9 +247,8 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if err == io.EOF { log.Printf("[job:%s] stream ended (EOF)", jobId) - updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ - OID: jobId, - StreamDone: true, + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.StreamDone = true }) if updateErr != nil { log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) @@ -259,10 +259,10 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if err != nil { log.Printf("[job:%s] stream error: %v", jobId, err) - updateErr := wstore.DBUpdate(ctx, &waveobj.Job{ - OID: jobId, - StreamDone: true, - StreamError: err.Error(), + streamErr := err.Error() + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.StreamDone = true + job.StreamError = streamErr }) if updateErr != nil { log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr) @@ -274,31 +274,24 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade } func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobExitedData) error { - var status string - if data.ExitErr != "" { - status = JobStatus_Error - } else { - status = JobStatus_Done - } - - updateData := &waveobj.Job{ - OID: jobId, - Status: status, - ExitCode: data.ExitCode, - ExitSignal: data.ExitSignal, - ExitTs: data.ExitTs, - } - - if data.ExitErr != "" { - updateData.Error = data.ExitErr - } - - err 
:= wstore.DBUpdate(ctx, updateData) + var finalStatus string + err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + if data.ExitErr != "" { + job.Status = JobStatus_Error + job.Error = data.ExitErr + } else { + job.Status = JobStatus_Done + } + job.ExitCode = data.ExitCode + job.ExitSignal = data.ExitSignal + job.ExitTs = data.ExitTs + finalStatus = job.Status + }) if err != nil { return fmt.Errorf("failed to update job exit status: %w", err) } - log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, status) + log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, finalStatus) tryTerminateJobManager(ctx, jobId) return nil } diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index fb1516cec8..b4efedb2d7 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -124,7 +124,12 @@ func (jm *JobManager) connectToStreamHelper_withlock(mainServerConn *MainServerC if jm.connectedStreamClient != nil { log.Printf("connectToStreamHelper: disconnecting existing client\n") + oldStreamId := jm.StreamManager.GetStreamId() jm.StreamManager.ClientDisconnected() + if oldStreamId != "" { + mainServerConn.WshRpc.StreamBroker.DetachStreamWriter(oldStreamId) + log.Printf("connectToStreamHelper: detached old stream id=%s\n", oldStreamId) + } jm.connectedStreamClient = nil } dataSender := &routedDataSender{ diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 7989759031..49b60ebf03 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -143,6 +143,10 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm if err != nil { return nil, fmt.Errorf("failed to connect stream: %w", err) } + err = msc.WshRpc.StreamBroker.AttachStreamWriter(data.StreamMeta, WshCmdJobManager.StreamManager) + if err != nil { + return nil, fmt.Errorf("failed to attach stream 
writer: %w", err) + } log.Printf("StartJob: connected stream streamid=%s serverSeq=%d\n", data.StreamMeta.Id, serverSeq) } @@ -192,6 +196,11 @@ func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.Co return nil, err } + err = msc.WshRpc.StreamBroker.AttachStreamWriter(&data.StreamMeta, WshCmdJobManager.StreamManager) + if err != nil { + return nil, fmt.Errorf("failed to attach stream writer: %w", err) + } + rtnData := &wshrpc.CommandJobConnectRtnData{Seq: serverSeq} hasExited, exitData := WshCmdJobManager.Cmd.GetExitInfo() if hasExited && exitData != nil { @@ -205,17 +214,6 @@ func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.Co return rtnData, nil } -func (msc *MainServerConn) StreamDataAckCommand(ctx context.Context, data wshrpc.CommandStreamAckData) error { - if !msc.PeerAuthenticated.Load() { - return nil - } - if !msc.SelfAuthenticated.Load() { - return nil - } - WshCmdJobManager.StreamManager.RecvAck(data) - return nil -} - func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { WshCmdJobManager.lock.Lock() defer WshCmdJobManager.lock.Unlock() diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index 79ea6084db..5c7342f666 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -141,6 +141,13 @@ func (sm *StreamManager) ClientConnected(streamId string, dataSender DataSender, return startSeq, nil } +// GetStreamId returns the current stream ID (safe to call with lock held by caller) +func (sm *StreamManager) GetStreamId() string { + sm.lock.Lock() + defer sm.lock.Unlock() + return sm.streamId +} + // ClientDisconnected transitions to DISCONNECTED mode func (sm *StreamManager) ClientDisconnected() { sm.lock.Lock() diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index a9b41ec393..65f9e6cbfa 100644 --- a/pkg/streamclient/streambroker.go +++ 
b/pkg/streamclient/streambroker.go @@ -87,6 +87,14 @@ func (b *Broker) AttachStreamWriter(meta *wshrpc.StreamMeta, writer StreamWriter return nil } +func (b *Broker) DetachStreamWriter(streamId string) { + b.lock.Lock() + defer b.lock.Unlock() + + delete(b.writers, streamId) + delete(b.writerRoutes, streamId) +} + func (b *Broker) CreateStreamWriter(meta *wshrpc.StreamMeta) (*Writer, error) { writer := NewWriter(meta.Id, meta.RWnd, b) err := b.AttachStreamWriter(meta, writer) diff --git a/pkg/wstore/wstore_dbops.go b/pkg/wstore/wstore_dbops.go index 6b64b3e474..7b16dbcf73 100644 --- a/pkg/wstore/wstore_dbops.go +++ b/pkg/wstore/wstore_dbops.go @@ -317,6 +317,17 @@ func DBUpdate(ctx context.Context, val waveobj.WaveObj) error { }) } +func DBUpdateFn[T waveobj.WaveObj](ctx context.Context, id string, updateFn func(T)) error { + return WithTx(ctx, func(tx *TxWrap) error { + val, err := DBMustGet[T](tx.Context(), id) + if err != nil { + return err + } + updateFn(val) + return DBUpdate(tx.Context(), val) + }) +} + func DBInsert(ctx context.Context, val waveobj.WaveObj) error { oid := waveobj.GetOID(val) if oid == "" { From 51aa95ef2ecc5ad940a86d5ba648d0587e882ad5 Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 16 Jan 2026 11:27:23 -0800 Subject: [PATCH 32/64] reorg job fields, split error, better jobdebug list output --- cmd/wsh/cmd/wshcmd-jobdebug.go | 70 +++++++++++++++++++++++++++--- frontend/app/store/wshclientapi.ts | 5 +++ frontend/types/gotypes.d.ts | 12 ++--- pkg/jobcontroller/jobcontroller.go | 67 ++++++++++++++++++---------- pkg/waveobj/wtype.go | 45 ++++++++++++------- pkg/wshrpc/wshclient/wshclient.go | 6 +++ pkg/wshrpc/wshrpctypes.go | 7 +++ pkg/wshrpc/wshserver/wshserver.go | 6 ++- 8 files changed, 167 insertions(+), 51 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 38d779b483..1a068fcf2f 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -44,6 +44,12 @@ var 
jobDebugDeleteAllCmd = &cobra.Command{ RunE: jobDebugDeleteAllRun, } +var jobDebugPruneCmd = &cobra.Command{ + Use: "prune", + Short: "remove jobs where the job manager is no longer running", + RunE: jobDebugPruneRun, +} + var jobDebugExitCmd = &cobra.Command{ Use: "exit", Short: "exit a job manager", @@ -72,6 +78,7 @@ func init() { jobDebugCmd.AddCommand(jobDebugListCmd) jobDebugCmd.AddCommand(jobDebugDeleteCmd) jobDebugCmd.AddCommand(jobDebugDeleteAllCmd) + jobDebugCmd.AddCommand(jobDebugPruneCmd) jobDebugCmd.AddCommand(jobDebugTerminateCmdCmd) jobDebugCmd.AddCommand(jobDebugExitCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) @@ -101,6 +108,16 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("getting job debug list: %w", err) } + connectedJobIds, err := wshclient.JobControllerConnectedJobsCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("getting connected job ids: %w", err) + } + + connectedMap := make(map[string]bool) + for _, jobId := range connectedJobIds { + connectedMap[jobId] = true + } + if jobDebugJsonFlag { jsonData, err := json.MarshalIndent(rtnData, "", " ") if err != nil { @@ -110,8 +127,18 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return nil } - fmt.Printf("%-36s %-20s %-30s %-10s %-10s %-8s %s\n", "OID", "Connection", "Cmd", "Status", "Stream", "ExitCode", "Error") + fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", "OID", "Connection", "Connected", "Manager", "Cmd", "Status", "Stream", "ExitCode", "Error") for _, job := range rtnData { + connectedStatus := "no" + if connectedMap[job.OID] { + connectedStatus = "yes" + } + + managerStatus := "no" + if job.JobManagerRunning { + managerStatus = "yes" + } + streamStatus := "-" if job.StreamDone { if job.StreamError == "" { @@ -127,12 +154,14 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { } errorStr := "" - if job.Error != "" { - errorStr = fmt.Sprintf("%q", job.Error) 
+ if job.StartupError != "" { + errorStr = fmt.Sprintf("%q", job.StartupError) + } else if job.ExitError != "" { + errorStr = fmt.Sprintf("%q", job.ExitError) } - fmt.Printf("%-36s %-20s %-30s %-10s %-10s %-8s %s\n", - job.OID, job.Connection, job.Cmd, job.Status, streamStatus, exitCode, errorStr) + fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", + job.OID, job.Connection, connectedStatus, managerStatus, job.Cmd, job.Status, streamStatus, exitCode, errorStr) } return nil } @@ -172,6 +201,37 @@ func jobDebugDeleteAllRun(cmd *cobra.Command, args []string) error { return nil } +func jobDebugPruneRun(cmd *cobra.Command, args []string) error { + rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("getting job debug list: %w", err) + } + + if len(rtnData) == 0 { + fmt.Printf("No jobs to prune\n") + return nil + } + + deletedCount := 0 + for _, job := range rtnData { + if !job.JobManagerRunning { + err := wshclient.JobDebugDeleteCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + fmt.Printf("Error deleting job %s: %v\n", job.OID, err) + } else { + deletedCount++ + } + } + } + + if deletedCount == 0 { + fmt.Printf("No jobs with stopped job managers to prune\n") + } else { + fmt.Printf("Pruned %d job(s) with stopped job managers\n", deletedCount) + } + return nil +} + func jobDebugTerminateCmdRun(cmd *cobra.Command, args []string) error { err := wshclient.JobControllerTerminateJobCommand(RpcClient, jobIdFlag, nil) if err != nil { diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index db1e6e182e..1594abe565 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -397,6 +397,11 @@ class RpcApiType { return client.wshRpcCall("jobconnect", data, opts); } + // command "jobcontrollerconnectedjobs" [call] + JobControllerConnectedJobsCommand(client: WshClient, opts?: RpcOpts): Promise { + 
return client.wshRpcCall("jobcontrollerconnectedjobs", null, opts); + } + // command "jobcontrollerexitjob" [call] JobControllerExitJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerexitjob", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index f9d52d11fb..10220f151c 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -874,20 +874,22 @@ declare global { type Job = WaveObj & { connection: string; jobkind: string; - pgid: number; - ownerblockid: string; - huponconnect: boolean; - jobauthtoken: string; cmd: string; cmdargs?: string[]; cmdenv?: {[key: string]: string}; + jobauthtoken: string; + ownerblockid: string; + pgid: number; termsize?: TermSize; startts?: number; status: string; + startuperror?: string; exitts?: number; exitcode?: number; exitsignal?: string; - error?: string; + exiterror?: string; + huponconnect: boolean; + jobmanagerrunning?: boolean; streamdone?: boolean; streamerror?: string; }; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 05b0793bd6..6694c283cd 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -85,7 +85,23 @@ func GetJobConnStatus(jobId string) string { func SetJobConnStatus(jobId string, status string) { jobConnStatesLock.Lock() defer jobConnStatesLock.Unlock() - jobConnStates[jobId] = status + if status == JobConnStatus_Disconnected { + delete(jobConnStates, jobId) + } else { + jobConnStates[jobId] = status + } +} + +func GetConnectedJobIds() []string { + jobConnStatesLock.Lock() + defer jobConnStatesLock.Unlock() + var connectedJobIds []string + for jobId, status := range jobConnStates { + if status == JobConnStatus_Connected { + connectedJobIds = append(connectedJobIds, jobId) + } + } + return connectedJobIds } type StartJobParams struct { @@ -200,7 +216,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, 
error) { errMsg := fmt.Sprintf("failed to start job: %v", err) wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.Status = JobStatus_Error - job.Error = errMsg + job.StartupError = errMsg }) return "", fmt.Errorf("failed to start remote job: %w", err) } @@ -209,6 +225,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.Pgid = rtnData.Pgid job.Status = JobStatus_Running + job.JobManagerRunning = true }) if err != nil { log.Printf("[job:%s] warning: failed to update job status to running: %v", jobId, err) @@ -253,7 +270,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if updateErr != nil { log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) } - tryTerminateJobManager(ctx, jobId) + tryExitJobManager(ctx, jobId) break } @@ -267,7 +284,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if updateErr != nil { log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr) } - tryTerminateJobManager(ctx, jobId) + tryExitJobManager(ctx, jobId) break } } @@ -278,7 +295,7 @@ func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobEx err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { if data.ExitErr != "" { job.Status = JobStatus_Error - job.Error = data.ExitErr + job.ExitError = data.ExitErr } else { job.Status = JobStatus_Done } @@ -292,11 +309,11 @@ func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobEx } log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, finalStatus) - tryTerminateJobManager(ctx, jobId) + tryExitJobManager(ctx, jobId) return nil } -func tryTerminateJobManager(ctx context.Context, jobId string) { +func tryExitJobManager(ctx context.Context, jobId string) { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { 
log.Printf("[job:%s] error getting job for termination check: %v", jobId, err) @@ -310,26 +327,12 @@ func tryTerminateJobManager(ctx context.Context, jobId string) { return } - log.Printf("[job:%s] both job exited and stream finished, terminating job manager", jobId) + log.Printf("[job:%s] both job exited and stream finished, exiting job manager", jobId) - bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - log.Printf("[job:%s] error terminating job manager: rpc client not available", jobId) - return - } - - rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobId), - Timeout: 5000, - } - - err = wshclient.JobManagerExitCommand(bareRpc, rpcOpts) + err = ExitJobManager(ctx, jobId) if err != nil { - log.Printf("[job:%s] error sending job manager exit command: %v", jobId, err) - return + log.Printf("[job:%s] error exiting job manager: %v", jobId, err) } - - log.Printf("[job:%s] job manager exit command sent successfully", jobId) } func TerminateJob(ctx context.Context, jobId string) error { @@ -388,6 +391,22 @@ func ExitJobManager(ctx context.Context, jobId string) error { return fmt.Errorf("failed to send exit command: %w", err) } + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.JobManagerRunning = false + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job manager running status: %v", jobId, updateErr) + } + log.Printf("[job:%s] job manager exit command sent successfully", jobId) return nil } + +func DeleteJob(ctx context.Context, jobId string) error { + SetJobConnStatus(jobId, JobConnStatus_Disconnected) + err := filestore.WFS.DeleteZone(ctx, jobId) + if err != nil { + log.Printf("[job:%s] warning: error deleting WaveFS zone: %v", jobId, err) + } + return wstore.DBDelete(ctx, waveobj.OType_Job, jobId) +} diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index c871229b4f..496c846620 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -310,27 +310,40 @@ func (*MainServer) GetOType() 
string { } type Job struct { - OID string `json:"oid"` - Version int `json:"version"` + OID string `json:"oid"` + Version int `json:"version"` + + // job metadata Connection string `json:"connection"` JobKind string `json:"jobkind"` // shell, task - Pgid int `json:"pgid"` // process group id - AttachedBlockId string `json:"ownerblockid"` - HupOnConnect bool `json:"huponconnect"` - JobAuthToken string `json:"jobauthtoken"` // job manger -> wave Cmd string `json:"cmd"` CmdArgs []string `json:"cmdargs,omitempty"` CmdEnv map[string]string `json:"cmdenv,omitempty"` - TermSize TermSize `json:"termsize,omitempty"` - StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) - Status string `json:"status"` // init, running, done - ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) - ExitCode int `json:"exitcode,omitempty"` - ExitSignal string `json:"exitsignal,omitempty"` - Error string `json:"error,omitempty"` - StreamDone bool `json:"streamdone,omitempty"` - StreamError string `json:"streamerror,omitempty"` - Meta MetaMapType `json:"meta"` + JobAuthToken string `json:"jobauthtoken"` // job manager -> wave + AttachedBlockId string `json:"ownerblockid"` + + // cmd/process runtime info + Pgid int `json:"pgid"` // process group id + TermSize TermSize `json:"termsize,omitempty"` + StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) + Status string `json:"status"` // init, running, done + StartupError string `json:"startuperror,omitempty"` + ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) + ExitCode int `json:"exitcode,omitempty"` + ExitSignal string `json:"exitsignal,omitempty"` + ExitError string `json:"exiterror,omitempty"` + + // reconnect option (e.g. 
orphaned, so we need to kill on connect) + HupOnConnect bool `json:"huponconnect"` + + // job manager state + JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` + + // output info + StreamDone bool `json:"streamdone,omitempty"` + StreamError string `json:"streamerror,omitempty"` + + Meta MetaMapType `json:"meta"` } func (*Job) GetOType() string { diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index ae10c5cf47..5881e62e93 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -482,6 +482,12 @@ func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opt return resp, err } +// command "jobcontrollerconnectedjobs", wshserver.JobControllerConnectedJobsCommand +func JobControllerConnectedJobsCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]string, error) { + resp, err := sendRpcRequestCallHelper[[]string](w, "jobcontrollerconnectedjobs", nil, opts) + return resp, err +} + // command "jobcontrollerexitjob", wshserver.JobControllerExitJobCommand func JobControllerExitJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerexitjob", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 529b90a773..43bcb8a069 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -22,6 +22,12 @@ type RespOrErrorUnion[T any] struct { Error error } +// Instructions for adding a new RPC call +// * methods must end with Command +// * methods must take context as their first parameter +// * methods may take up to one parameter, and may return either just an error, or one return value plus an error +// * after modifying WshRpcInterface, run `task generate` to regenerate bindings + type WshRpcInterface interface { AuthenticateCommand(ctx context.Context, data string) (CommandAuthenticateRtnData, error) AuthenticateTokenCommand(ctx context.Context, data 
CommandAuthenticateTokenData) (CommandAuthenticateRtnData, error) @@ -170,6 +176,7 @@ type WshRpcInterface interface { JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) JobControllerTerminateJobCommand(ctx context.Context, jobId string) error JobControllerExitJobCommand(ctx context.Context, jobId string) error + JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) } // for frontend diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 82dfff6317..4a951e2bf6 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1441,7 +1441,7 @@ func (ws *WshServer) JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, e } func (ws *WshServer) JobDebugDeleteCommand(ctx context.Context, jobId string) error { - return wstore.DBDelete(ctx, waveobj.OType_Job, jobId) + return jobcontroller.DeleteJob(ctx, jobId) } func (ws *WshServer) JobControllerStartJobCommand(ctx context.Context, data wshrpc.CommandJobControllerStartJobData) (string, error) { @@ -1462,3 +1462,7 @@ func (ws *WshServer) JobControllerTerminateJobCommand(ctx context.Context, jobId func (ws *WshServer) JobControllerExitJobCommand(ctx context.Context, jobId string) error { return jobcontroller.ExitJobManager(ctx, jobId) } + +func (ws *WshServer) JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) { + return jobcontroller.GetConnectedJobIds(), nil +} From bce074796a93ac1fb60abe771480744db86512a6 Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 16 Jan 2026 16:54:41 -0800 Subject: [PATCH 33/64] checkpoint on reconnect --- frontend/app/store/wshclientapi.ts | 15 ++-- frontend/types/gotypes.d.ts | 16 ++-- pkg/jobmanager/jobmanager.go | 1 + pkg/jobmanager/mainserverconn.go | 42 ++++++++-- pkg/jobmanager/streammanager.go | 20 +++++ pkg/wshrpc/wshclient/wshclient.go | 18 +++-- pkg/wshrpc/wshremote/wshremote.go | 121 ++++++++++++++++++----------- 
pkg/wshrpc/wshrpctypes.go | 15 +++- 8 files changed, 175 insertions(+), 73 deletions(-) diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 1594abe565..20007e08fc 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -392,11 +392,6 @@ class RpcApiType { return client.wshRpcCall("getwaveairatelimit", null, opts); } - // command "jobconnect" [call] - JobConnectCommand(client: WshClient, data: CommandJobConnectData, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobconnect", data, opts); - } - // command "jobcontrollerconnectedjobs" [call] JobControllerConnectedJobsCommand(client: WshClient, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerconnectedjobs", null, opts); @@ -437,6 +432,16 @@ class RpcApiType { return client.wshRpcCall("jobmanagerexit", null, opts); } + // command "jobprepareconnect" [call] + JobPrepareConnectCommand(client: WshClient, data: CommandJobPrepareConnectData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobprepareconnect", data, opts); + } + + // command "jobstartstream" [call] + JobStartStreamCommand(client: WshClient, data: CommandJobStartStreamData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobstartstream", data, opts); + } + // command "jobterminate" [call] JobTerminateCommand(client: WshClient, data: CommandJobTerminateData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobterminate", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 10220f151c..a9d8e6f10b 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -355,12 +355,6 @@ declare global { chatid: string; }; - // wshrpc.CommandJobConnectData - type CommandJobConnectData = { - streammeta: StreamMeta; - seq: number; - }; - // wshrpc.CommandJobConnectRtnData type CommandJobConnectRtnData = { seq: number; @@ -388,6 +382,16 @@ declare global { exitts?: number; }; + // 
wshrpc.CommandJobPrepareConnectData + type CommandJobPrepareConnectData = { + streammeta: StreamMeta; + seq: number; + }; + + // wshrpc.CommandJobStartStreamData + type CommandJobStartStreamData = { + }; + // wshrpc.CommandJobTerminateData type CommandJobTerminateData = { }; diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index b4efedb2d7..bcdb18570d 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -36,6 +36,7 @@ type JobManager struct { lock sync.Mutex attachedClient *MainServerConn connectedStreamClient *MainServerConn + pendingStreamMeta *wshrpc.StreamMeta } func SetupJobManager(clientId string, jobId string, publicKeyBytes []byte, jobAuthToken string, readyFile *os.File) error { diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 49b60ebf03..ecf73a2d4c 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -177,7 +177,7 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil } -func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.CommandJobConnectData) (*wshrpc.CommandJobConnectRtnData, error) { +func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data wshrpc.CommandJobPrepareConnectData) (*wshrpc.CommandJobConnectRtnData, error) { WshCmdJobManager.lock.Lock() defer WshCmdJobManager.lock.Unlock() @@ -191,15 +191,14 @@ func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.Co return nil, fmt.Errorf("job not started") } - serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, data.StreamMeta, data.Seq) + corkedStreamMeta := data.StreamMeta + corkedStreamMeta.RWnd = 0 + serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, corkedStreamMeta, data.Seq) if err != nil { return nil, err } - err = msc.WshRpc.StreamBroker.AttachStreamWriter(&data.StreamMeta, 
WshCmdJobManager.StreamManager) - if err != nil { - return nil, fmt.Errorf("failed to attach stream writer: %w", err) - } + WshCmdJobManager.pendingStreamMeta = &data.StreamMeta rtnData := &wshrpc.CommandJobConnectRtnData{Seq: serverSeq} hasExited, exitData := WshCmdJobManager.Cmd.GetExitInfo() @@ -210,10 +209,39 @@ func (msc *MainServerConn) JobConnectCommand(ctx context.Context, data wshrpc.Co rtnData.ExitErr = exitData.ExitErr } - log.Printf("JobConnect: streamid=%s clientSeq=%d serverSeq=%d hasExited=%v\n", data.StreamMeta.Id, data.Seq, serverSeq, hasExited) + log.Printf("JobPrepareConnect: streamid=%s clientSeq=%d serverSeq=%d hasExited=%v (rwnd=0 cork mode)\n", data.StreamMeta.Id, data.Seq, serverSeq, hasExited) return rtnData, nil } +func (msc *MainServerConn) JobStartStreamCommand(ctx context.Context, data wshrpc.CommandJobStartStreamData) error { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !msc.PeerAuthenticated.Load() { + return fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd == nil { + return fmt.Errorf("job not started") + } + if WshCmdJobManager.pendingStreamMeta == nil { + return fmt.Errorf("no pending stream (call JobPrepareConnect first)") + } + + err := msc.WshRpc.StreamBroker.AttachStreamWriter(WshCmdJobManager.pendingStreamMeta, WshCmdJobManager.StreamManager) + if err != nil { + return fmt.Errorf("failed to attach stream writer: %w", err) + } + + err = WshCmdJobManager.StreamManager.SetRwndSize(int(WshCmdJobManager.pendingStreamMeta.RWnd)) + if err != nil { + return fmt.Errorf("failed to set rwnd size: %w", err) + } + + log.Printf("JobStartStream: streamid=%s rwnd=%d streaming started\n", WshCmdJobManager.pendingStreamMeta.Id, WshCmdJobManager.pendingStreamMeta.RWnd) + WshCmdJobManager.pendingStreamMeta = nil + return nil +} + func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { WshCmdJobManager.lock.Lock() defer 
WshCmdJobManager.lock.Unlock() diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index 5c7342f666..e36129ba69 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -214,6 +214,26 @@ func (sm *StreamManager) RecvAck(ackPk wshrpc.CommandStreamAckData) { } } +// SetRwndSize dynamically updates the receive window size +func (sm *StreamManager) SetRwndSize(rwndSize int) error { + sm.lock.Lock() + defer sm.lock.Unlock() + if rwndSize < 0 { + return fmt.Errorf("rwndSize cannot be negative") + } + if !sm.connected { + return fmt.Errorf("not connected") + } + sm.rwndSize = rwndSize + effectiveWindow := sm.cwndSize + if sm.rwndSize < effectiveWindow { + effectiveWindow = sm.rwndSize + } + sm.buf.SetEffectiveWindow(true, effectiveWindow) + sm.drainCond.Signal() + return nil +} + // Close shuts down the sender loop. The reader loop will exit on its next iteration // or when the underlying reader is closed. func (sm *StreamManager) Close() { diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 5881e62e93..67bc8e75ed 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -476,12 +476,6 @@ func GetWaveAIRateLimitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) (*uctype return resp, err } -// command "jobconnect", wshserver.JobConnectCommand -func JobConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobConnectData, opts *wshrpc.RpcOpts) (*wshrpc.CommandJobConnectRtnData, error) { - resp, err := sendRpcRequestCallHelper[*wshrpc.CommandJobConnectRtnData](w, "jobconnect", data, opts) - return resp, err -} - // command "jobcontrollerconnectedjobs", wshserver.JobControllerConnectedJobsCommand func JobControllerConnectedJobsCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]string, error) { resp, err := sendRpcRequestCallHelper[[]string](w, "jobcontrollerconnectedjobs", nil, opts) @@ -530,6 +524,18 @@ func JobManagerExitCommand(w *wshutil.WshRpc, opts 
*wshrpc.RpcOpts) error { return err } +// command "jobprepareconnect", wshserver.JobPrepareConnectCommand +func JobPrepareConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobPrepareConnectData, opts *wshrpc.RpcOpts) (*wshrpc.CommandJobConnectRtnData, error) { + resp, err := sendRpcRequestCallHelper[*wshrpc.CommandJobConnectRtnData](w, "jobprepareconnect", data, opts) + return resp, err +} + +// command "jobstartstream", wshserver.JobStartStreamCommand +func JobStartStreamCommand(w *wshutil.WshRpc, data wshrpc.CommandJobStartStreamData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobstartstream", data, opts) + return err +} + // command "jobterminate", wshserver.JobTerminateCommand func JobTerminateCommand(w *wshutil.WshRpc, data wshrpc.CommandJobTerminateData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobterminate", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index 0b44edaeb5..2626f8c17f 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -880,12 +880,67 @@ func (impl *ServerImpl) getWshPath() (string, error) { return wshPath, nil } +// returns jobRouteId, cleanupFunc, error +func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, mainServerJwtToken string) (string, func(), error) { + socketPath := jobmanager.GetJobSocketPath(jobId) + log.Printf("connectToJobManager: connecting to socket: %s\n", socketPath) + conn, err := net.Dial("unix", socketPath) + if err != nil { + log.Printf("connectToJobManager: error connecting to socket: %v\n", err) + return "", nil, fmt.Errorf("cannot connect to job manager socket: %w", err) + } + log.Printf("connectToJobManager: connected to socket\n") + + proxy := wshutil.MakeRpcProxy("jobmanager") + go func() { + writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) + if writeErr != nil { + log.Printf("connectToJobManager: error writing to job 
manager socket: %v\n", writeErr) + } + }() + go func() { + defer func() { + conn.Close() + close(proxy.FromRemoteCh) + }() + wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + }() + + linkId := impl.Router.RegisterUntrustedLink(proxy) + cleanup := func() { + conn.Close() + impl.Router.UnregisterLink(linkId) + } + + routeId := wshutil.MakeLinkRouteId(linkId) + authData := wshrpc.CommandAuthenticateToJobData{ + JobAccessToken: mainServerJwtToken, + } + err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("authentication to job manager failed: %w", err) + } + + jobRouteId := wshutil.MakeJobRouteId(jobId) + waitCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + err = impl.Router.WaitForRegister(waitCtx, jobRouteId) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("timeout waiting for job route to register: %w", err) + } + + log.Printf("connectToJobManager: successfully connected and authenticated\n") + return jobRouteId, cleanup, nil +} + func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { log.Printf("RemoteStartJobCommand: starting, jobid=%s, clientid=%s\n", data.JobId, data.ClientId) if impl.Router == nil { return nil, fmt.Errorf("cannot start remote job: no router available") } - + wshPath, err := impl.getWshPath() if err != nil { return nil, err @@ -998,51 +1053,9 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C cmd.Wait() }() - socketPath := jobmanager.GetJobSocketPath(data.JobId) - log.Printf("RemoteStartJobCommand: connecting to socket: %s\n", socketPath) - conn, err := net.Dial("unix", socketPath) - if err != nil { - log.Printf("RemoteStartJobCommand: error connecting to socket: %v\n", err) - return nil, fmt.Errorf("cannot connect to job manager socket: %w", err) - } - 
log.Printf("RemoteStartJobCommand: connected to socket\n") - - proxy := wshutil.MakeRpcProxy("jobmanager") - go func() { - writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) - if writeErr != nil { - log.Printf("RemoteStartJobCommand: error writing to job manager socket: %v\n", writeErr) - } - }() - go func() { - defer func() { - conn.Close() - close(proxy.FromRemoteCh) - }() - wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) - }() - - linkId := impl.Router.RegisterUntrustedLink(proxy) - - routeId := wshutil.MakeLinkRouteId(linkId) - authData := wshrpc.CommandAuthenticateToJobData{ - JobAccessToken: data.MainServerJwtToken, - } - err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) + jobRouteId, cleanup, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) if err != nil { - conn.Close() - impl.Router.UnregisterLink(linkId) - return nil, fmt.Errorf("authentication to job manager failed: %w", err) - } - - jobRouteId := wshutil.MakeJobRouteId(data.JobId) - waitCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) - defer cancel() - err = impl.Router.WaitForRegister(waitCtx, jobRouteId) - if err != nil { - conn.Close() - impl.Router.UnregisterLink(linkId) - return nil, fmt.Errorf("timeout waiting for job route to register: %w", err) + return nil, err } startJobData := wshrpc.CommandStartJobData{ @@ -1054,10 +1067,24 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C } rtnData, err := wshclient.StartJobCommand(impl.RpcClient, startJobData, &wshrpc.RpcOpts{Route: jobRouteId}) if err != nil { - conn.Close() - impl.Router.UnregisterLink(linkId) + cleanup() return nil, fmt.Errorf("failed to start job: %w", err) } return rtnData, nil } + +func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteReconnectToJobManagerData) error { + log.Printf("RemoteReconnectToJobManagerCommand: reconnecting, 
jobid=%s\n", data.JobId) + if impl.Router == nil { + return fmt.Errorf("cannot reconnect to job manager: no router available") + } + + _, _, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) + if err != nil { + return err + } + + log.Printf("RemoteReconnectToJobManagerCommand: successfully reconnected to job manager\n") + return nil +} diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 43bcb8a069..e66c78bfa7 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -109,6 +109,7 @@ type WshRpcInterface interface { RemoteGetInfoCommand(ctx context.Context) (RemoteInfo, error) RemoteInstallRcFilesCommand(ctx context.Context) error RemoteStartJobCommand(ctx context.Context, data CommandRemoteStartJobData) (*CommandStartJobRtnData, error) + RemoteReconnectToJobManagerCommand(ctx context.Context, data CommandRemoteReconnectToJobManagerData) error // emain WebSelectorCommand(ctx context.Context, data CommandWebSelectorData) ([]string, error) @@ -167,7 +168,8 @@ type WshRpcInterface interface { // jobs AuthenticateToJobManagerCommand(ctx context.Context, data CommandAuthenticateToJobData) error StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) - JobConnectCommand(ctx context.Context, data CommandJobConnectData) (*CommandJobConnectRtnData, error) + JobPrepareConnectCommand(ctx context.Context, data CommandJobPrepareConnectData) (*CommandJobConnectRtnData, error) + JobStartStreamCommand(ctx context.Context, data CommandJobStartStreamData) error JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job manager => main server JobManagerExitCommand(ctx context.Context) error @@ -710,15 +712,24 @@ type CommandRemoteStartJobData struct { PublicKeyBase64 string `json:"publickeybase64"` } +type CommandRemoteReconnectToJobManagerData struct { + JobId string 
`json:"jobid"` + JobAuthToken string `json:"jobauthtoken"` + MainServerJwtToken string `json:"mainserverjwttoken"` +} + type CommandStartJobRtnData struct { Pgid int `json:"pgid"` } -type CommandJobConnectData struct { +type CommandJobPrepareConnectData struct { StreamMeta StreamMeta `json:"streammeta"` Seq int64 `json:"seq"` } +type CommandJobStartStreamData struct { +} + type CommandJobConnectRtnData struct { Seq int64 `json:"seq"` HasExited bool `json:"hasexited,omitempty"` From c627a4d2bbaa9f1e628f99f8f7dee52abe4dd87e Mon Sep 17 00:00:00 2001 From: sawka Date: Sun, 18 Jan 2026 16:45:12 -0800 Subject: [PATCH 34/64] checkpoint, spilt up wshremote --- frontend/app/store/wshclientapi.ts | 5 + frontend/types/gotypes.d.ts | 15 +- pkg/jobcontroller/jobcontroller.go | 6 +- pkg/jobmanager/mainserverconn.go | 24 +- pkg/waveobj/wtype.go | 6 +- pkg/wshrpc/wshclient/wshclient.go | 6 + pkg/wshrpc/wshremote/wshremote.go | 793 +----------------------- pkg/wshrpc/wshremote/wshremote_file.go | 810 +++++++++++++++++++++++++ pkg/wshrpc/wshrpctypes.go | 4 +- 9 files changed, 869 insertions(+), 800 deletions(-) create mode 100644 pkg/wshrpc/wshremote/wshremote_file.go diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 20007e08fc..cacc89514b 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -552,6 +552,11 @@ class RpcApiType { return client.wshRpcCall("remotemkdir", data, opts); } + // command "remotereconnecttojobmanager" [call] + RemoteReconnectToJobManagerCommand(client: WshClient, data: CommandRemoteReconnectToJobManagerData, opts?: RpcOpts): Promise { + return client.wshRpcCall("remotereconnecttojobmanager", data, opts); + } + // command "remotestartjob" [call] RemoteStartJobCommand(client: WshClient, data: CommandRemoteStartJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("remotestartjob", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 
a9d8e6f10b..ce12df7aa0 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -461,6 +461,13 @@ declare global { fileinfo?: FileInfo[]; }; + // wshrpc.CommandRemoteReconnectToJobManagerData + type CommandRemoteReconnectToJobManagerData = { + jobid: string; + jobauthtoken: string; + mainserverjwttoken: string; + }; + // wshrpc.CommandRemoteStartJobData type CommandRemoteStartJobData = { cmd: string; @@ -539,7 +546,9 @@ declare global { // wshrpc.CommandStartJobRtnData type CommandStartJobRtnData = { - pgid: number; + cmdpgid: number; + jobmanagerpid: number; + jobmanagerstartts: number; }; // wshrpc.CommandStreamAckData @@ -883,7 +892,7 @@ declare global { cmdenv?: {[key: string]: string}; jobauthtoken: string; ownerblockid: string; - pgid: number; + cmdpgid: number; termsize?: TermSize; startts?: number; status: string; @@ -894,6 +903,8 @@ declare global { exiterror?: string; huponconnect: boolean; jobmanagerrunning?: boolean; + jobmanagerpid?: number; + jobmanagerstartts?: number; streamdone?: boolean; streamerror?: string; }; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 6694c283cd..8dd98c04c6 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -221,9 +221,11 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return "", fmt.Errorf("failed to start remote job: %w", err) } - log.Printf("[job:%s] RemoteStartJobCommand succeeded, pgid=%d", jobId, rtnData.Pgid) + log.Printf("[job:%s] RemoteStartJobCommand succeeded, cmdpgid=%d jobmanagerpid=%d jobmanagerstartts=%d", jobId, rtnData.CmdPgid, rtnData.JobManagerPid, rtnData.JobManagerStartTs) err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.Pgid = rtnData.Pgid + job.CmdPgid = rtnData.CmdPgid + job.JobManagerPid = rtnData.JobManagerPid + job.JobManagerStartTs = rtnData.JobManagerStartTs job.Status = JobStatus_Running job.JobManagerRunning = true }) diff --git 
a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index ecf73a2d4c..4f683b32e6 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -13,6 +13,7 @@ import ( "sync/atomic" "time" + "github.com/shirou/gopsutil/v4/process" "github.com/wavetermdev/waveterm/pkg/baseds" "github.com/wavetermdev/waveterm/pkg/wavejwt" "github.com/wavetermdev/waveterm/pkg/wshrpc" @@ -168,13 +169,30 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm log.Printf("StartJobCommand: cmd or process is nil") return nil, fmt.Errorf("cmd or process is nil") } - pgid, err := getProcessGroupId(cmd.Process.Pid) + cmdPgid, err := getProcessGroupId(cmd.Process.Pid) if err != nil { log.Printf("StartJobCommand: failed to get pgid: %v", err) return nil, fmt.Errorf("failed to get process group id: %w", err) } - log.Printf("StartJobCommand: job started successfully pid=%d pgid=%d", cmd.Process.Pid, pgid) - return &wshrpc.CommandStartJobRtnData{Pgid: pgid}, nil + + jobManagerPid := os.Getpid() + proc, err := process.NewProcess(int32(jobManagerPid)) + if err != nil { + log.Printf("StartJobCommand: failed to get job manager process: %v", err) + return nil, fmt.Errorf("failed to get job manager process: %w", err) + } + jobManagerStartTs, err := proc.CreateTime() + if err != nil { + log.Printf("StartJobCommand: failed to get job manager start time: %v", err) + return nil, fmt.Errorf("failed to get job manager start time: %w", err) + } + + log.Printf("StartJobCommand: job started successfully cmdPid=%d cmdPgid=%d jobManagerPid=%d jobManagerStartTs=%d", cmd.Process.Pid, cmdPgid, jobManagerPid, jobManagerStartTs) + return &wshrpc.CommandStartJobRtnData{ + CmdPgid: cmdPgid, + JobManagerPid: jobManagerPid, + JobManagerStartTs: jobManagerStartTs, + }, nil } func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data wshrpc.CommandJobPrepareConnectData) (*wshrpc.CommandJobConnectRtnData, error) { diff --git 
a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 496c846620..14dc8b56c9 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -323,7 +323,7 @@ type Job struct { AttachedBlockId string `json:"ownerblockid"` // cmd/process runtime info - Pgid int `json:"pgid"` // process group id + CmdPgid int `json:"cmdpgid"` // command process group id TermSize TermSize `json:"termsize,omitempty"` StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) Status string `json:"status"` // init, running, done @@ -337,7 +337,9 @@ type Job struct { HupOnConnect bool `json:"huponconnect"` // job manager state - JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` + JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` + JobManagerPid int `json:"jobmanagerpid,omitempty"` + JobManagerStartTs int64 `json:"jobmanagerstartts,omitempty"` // output info StreamDone bool `json:"streamdone,omitempty"` diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 67bc8e75ed..accdc8cd80 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -667,6 +667,12 @@ func RemoteMkdirCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) er return err } +// command "remotereconnecttojobmanager", wshserver.RemoteReconnectToJobManagerCommand +func RemoteReconnectToJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteReconnectToJobManagerData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "remotereconnecttojobmanager", data, opts) + return err +} + // command "remotestartjob", wshserver.RemoteStartJobCommand func RemoteStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteStartJobData, opts *wshrpc.RpcOpts) (*wshrpc.CommandStartJobRtnData, error) { resp, err := sendRpcRequestCallHelper[*wshrpc.CommandStartJobRtnData](w, "remotestartjob", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index 2626f8c17f..7da3ea5d35 
100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -4,14 +4,10 @@ package wshremote import ( - "archive/tar" "bufio" "context" - "encoding/base64" - "errors" "fmt" "io" - "io/fs" "log" "net" "os" @@ -21,20 +17,16 @@ import ( "time" "github.com/wavetermdev/waveterm/pkg/jobmanager" - "github.com/wavetermdev/waveterm/pkg/remote/connparse" - "github.com/wavetermdev/waveterm/pkg/remote/fileshare/fstype" - "github.com/wavetermdev/waveterm/pkg/remote/fileshare/wshfs" "github.com/wavetermdev/waveterm/pkg/suggestion" - "github.com/wavetermdev/waveterm/pkg/util/fileutil" - "github.com/wavetermdev/waveterm/pkg/util/iochan/iochantypes" - "github.com/wavetermdev/waveterm/pkg/util/tarcopy" - "github.com/wavetermdev/waveterm/pkg/util/utilfn" "github.com/wavetermdev/waveterm/pkg/wavebase" "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" ) +// this is the connserver interface. 
+// it runs on remote servers, and one instance also runs on localhost + type ServerImpl struct { LogWriter io.Writer Router *wshutil.WshRouter @@ -73,785 +65,6 @@ func (impl *ServerImpl) StreamTestCommand(ctx context.Context) chan wshrpc.RespO return ch } -type ByteRangeType struct { - All bool - Start int64 - End int64 -} - -func parseByteRange(rangeStr string) (ByteRangeType, error) { - if rangeStr == "" { - return ByteRangeType{All: true}, nil - } - var start, end int64 - _, err := fmt.Sscanf(rangeStr, "%d-%d", &start, &end) - if err != nil { - return ByteRangeType{}, errors.New("invalid byte range") - } - if start < 0 || end < 0 || start > end { - return ByteRangeType{}, errors.New("invalid byte range") - } - return ByteRangeType{Start: start, End: end}, nil -} - -func (impl *ServerImpl) remoteStreamFileDir(ctx context.Context, path string, byteRange ByteRangeType, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { - innerFilesEntries, err := os.ReadDir(path) - if err != nil { - return fmt.Errorf("cannot open dir %q: %w", path, err) - } - if byteRange.All { - if len(innerFilesEntries) > wshrpc.MaxDirSize { - innerFilesEntries = innerFilesEntries[:wshrpc.MaxDirSize] - } - } else { - if byteRange.Start < int64(len(innerFilesEntries)) { - realEnd := byteRange.End - if realEnd > int64(len(innerFilesEntries)) { - realEnd = int64(len(innerFilesEntries)) - } - innerFilesEntries = innerFilesEntries[byteRange.Start:realEnd] - } else { - innerFilesEntries = []os.DirEntry{} - } - } - var fileInfoArr []*wshrpc.FileInfo - for _, innerFileEntry := range innerFilesEntries { - if ctx.Err() != nil { - return ctx.Err() - } - innerFileInfoInt, err := innerFileEntry.Info() - if err != nil { - continue - } - innerFileInfo := statToFileInfo(filepath.Join(path, innerFileInfoInt.Name()), innerFileInfoInt, false) - fileInfoArr = append(fileInfoArr, innerFileInfo) - if len(fileInfoArr) >= wshrpc.DirChunkSize { - dataCallback(fileInfoArr, nil, 
byteRange) - fileInfoArr = nil - } - } - if len(fileInfoArr) > 0 { - dataCallback(fileInfoArr, nil, byteRange) - } - return nil -} - -func (impl *ServerImpl) remoteStreamFileRegular(ctx context.Context, path string, byteRange ByteRangeType, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { - fd, err := os.Open(path) - if err != nil { - return fmt.Errorf("cannot open file %q: %w", path, err) - } - defer utilfn.GracefulClose(fd, "remoteStreamFileRegular", path) - var filePos int64 - if !byteRange.All && byteRange.Start > 0 { - _, err := fd.Seek(byteRange.Start, io.SeekStart) - if err != nil { - return fmt.Errorf("seeking file %q: %w", path, err) - } - filePos = byteRange.Start - } - buf := make([]byte, wshrpc.FileChunkSize) - for { - if ctx.Err() != nil { - return ctx.Err() - } - n, err := fd.Read(buf) - if n > 0 { - if !byteRange.All && filePos+int64(n) > byteRange.End { - n = int(byteRange.End - filePos) - } - filePos += int64(n) - dataCallback(nil, buf[:n], byteRange) - } - if !byteRange.All && filePos >= byteRange.End { - break - } - if errors.Is(err, io.EOF) { - break - } - if err != nil { - return fmt.Errorf("reading file %q: %w", path, err) - } - } - return nil -} - -func (impl *ServerImpl) remoteStreamFileInternal(ctx context.Context, data wshrpc.CommandRemoteStreamFileData, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { - byteRange, err := parseByteRange(data.ByteRange) - if err != nil { - return err - } - path, err := wavebase.ExpandHomeDir(data.Path) - if err != nil { - return err - } - finfo, err := impl.fileInfoInternal(path, true) - if err != nil { - return fmt.Errorf("cannot stat file %q: %w", path, err) - } - dataCallback([]*wshrpc.FileInfo{finfo}, nil, byteRange) - if finfo.NotFound { - return nil - } - if finfo.IsDir { - return impl.remoteStreamFileDir(ctx, path, byteRange, dataCallback) - } else { - return impl.remoteStreamFileRegular(ctx, path, byteRange, 
dataCallback) - } -} - -func (impl *ServerImpl) RemoteStreamFileCommand(ctx context.Context, data wshrpc.CommandRemoteStreamFileData) chan wshrpc.RespOrErrorUnion[wshrpc.FileData] { - ch := make(chan wshrpc.RespOrErrorUnion[wshrpc.FileData], 16) - go func() { - defer close(ch) - firstPk := true - err := impl.remoteStreamFileInternal(ctx, data, func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType) { - resp := wshrpc.FileData{} - fileInfoLen := len(fileInfo) - if fileInfoLen > 1 || !firstPk { - resp.Entries = fileInfo - } else if fileInfoLen == 1 { - resp.Info = fileInfo[0] - } - if firstPk { - firstPk = false - } - if len(data) > 0 { - resp.Data64 = base64.StdEncoding.EncodeToString(data) - resp.At = &wshrpc.FileDataAt{Offset: byteRange.Start, Size: len(data)} - } - ch <- wshrpc.RespOrErrorUnion[wshrpc.FileData]{Response: resp} - }) - if err != nil { - ch <- wshutil.RespErr[wshrpc.FileData](err) - } - }() - return ch -} - -func (impl *ServerImpl) RemoteTarStreamCommand(ctx context.Context, data wshrpc.CommandRemoteStreamTarData) <-chan wshrpc.RespOrErrorUnion[iochantypes.Packet] { - path := data.Path - opts := data.Opts - if opts == nil { - opts = &wshrpc.FileCopyOpts{} - } - log.Printf("RemoteTarStreamCommand: path=%s\n", path) - srcHasSlash := strings.HasSuffix(path, "/") - path, err := wavebase.ExpandHomeDir(path) - if err != nil { - return wshutil.SendErrCh[iochantypes.Packet](fmt.Errorf("cannot expand path %q: %w", path, err)) - } - cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) - finfo, err := os.Stat(cleanedPath) - if err != nil { - return wshutil.SendErrCh[iochantypes.Packet](fmt.Errorf("cannot stat file %q: %w", path, err)) - } - - var pathPrefix string - singleFile := !finfo.IsDir() - if !singleFile && srcHasSlash { - pathPrefix = cleanedPath - } else { - pathPrefix = filepath.Dir(cleanedPath) - } - - timeout := fstype.DefaultTimeout - if opts.Timeout > 0 { - timeout = time.Duration(opts.Timeout) * time.Millisecond - } - 
readerCtx, cancel := context.WithTimeout(ctx, timeout) - rtn, writeHeader, fileWriter, tarClose := tarcopy.TarCopySrc(readerCtx, pathPrefix) - - go func() { - defer func() { - tarClose() - cancel() - }() - walkFunc := func(path string, info fs.FileInfo, err error) error { - if readerCtx.Err() != nil { - return readerCtx.Err() - } - if err != nil { - return err - } - if err = writeHeader(info, path, singleFile); err != nil { - return err - } - // if not a dir, write file content - if !info.IsDir() { - data, err := os.Open(path) - if err != nil { - return err - } - defer utilfn.GracefulClose(data, "RemoteTarStreamCommand", path) - if _, err := io.Copy(fileWriter, data); err != nil { - return err - } - } - return nil - } - log.Printf("RemoteTarStreamCommand: starting\n") - err = nil - if singleFile { - err = walkFunc(cleanedPath, finfo, nil) - } else { - err = filepath.Walk(cleanedPath, walkFunc) - } - if err != nil { - rtn <- wshutil.RespErr[iochantypes.Packet](err) - } - log.Printf("RemoteTarStreamCommand: done\n") - }() - log.Printf("RemoteTarStreamCommand: returning channel\n") - return rtn -} - -func (impl *ServerImpl) RemoteFileCopyCommand(ctx context.Context, data wshrpc.CommandFileCopyData) (bool, error) { - log.Printf("RemoteFileCopyCommand: src=%s, dest=%s\n", data.SrcUri, data.DestUri) - opts := data.Opts - if opts == nil { - opts = &wshrpc.FileCopyOpts{} - } - destUri := data.DestUri - srcUri := data.SrcUri - merge := opts.Merge - overwrite := opts.Overwrite - if overwrite && merge { - return false, fmt.Errorf("cannot specify both overwrite and merge") - } - - destConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, destUri) - if err != nil { - return false, fmt.Errorf("cannot parse destination URI %q: %w", destUri, err) - } - destPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(destConn.Path)) - destinfo, err := os.Stat(destPathCleaned) - if err != nil { - if !errors.Is(err, fs.ErrNotExist) { - return false, fmt.Errorf("cannot stat 
destination %q: %w", destPathCleaned, err) - } - } - - destExists := destinfo != nil - destIsDir := destExists && destinfo.IsDir() - destHasSlash := strings.HasSuffix(destUri, "/") - - if destExists && !destIsDir { - if !overwrite { - return false, fmt.Errorf(fstype.OverwriteRequiredError, destPathCleaned) - } else { - err := os.Remove(destPathCleaned) - if err != nil { - return false, fmt.Errorf("cannot remove file %q: %w", destPathCleaned, err) - } - } - } - srcConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, srcUri) - if err != nil { - return false, fmt.Errorf("cannot parse source URI %q: %w", srcUri, err) - } - - copyFileFunc := func(path string, finfo fs.FileInfo, srcFile io.Reader) (int64, error) { - nextinfo, err := os.Stat(path) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return 0, fmt.Errorf("cannot stat file %q: %w", path, err) - } - - if nextinfo != nil { - if nextinfo.IsDir() { - if !finfo.IsDir() { - // try to create file in directory - path = filepath.Join(path, filepath.Base(finfo.Name())) - newdestinfo, err := os.Stat(path) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return 0, fmt.Errorf("cannot stat file %q: %w", path, err) - } - if newdestinfo != nil && !overwrite { - return 0, fmt.Errorf(fstype.OverwriteRequiredError, path) - } - } else if overwrite { - err := os.RemoveAll(path) - if err != nil { - return 0, fmt.Errorf("cannot remove directory %q: %w", path, err) - } - } else if !merge { - return 0, fmt.Errorf(fstype.MergeRequiredError, path) - } - } else { - if !overwrite { - return 0, fmt.Errorf(fstype.OverwriteRequiredError, path) - } else if finfo.IsDir() { - err := os.RemoveAll(path) - if err != nil { - return 0, fmt.Errorf("cannot remove directory %q: %w", path, err) - } - } - } - } - - if finfo.IsDir() { - err := os.MkdirAll(path, finfo.Mode()) - if err != nil { - return 0, fmt.Errorf("cannot create directory %q: %w", path, err) - } - return 0, nil - } else { - err := os.MkdirAll(filepath.Dir(path), 0755) - 
if err != nil { - return 0, fmt.Errorf("cannot create parent directory %q: %w", filepath.Dir(path), err) - } - } - - file, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, finfo.Mode()) - if err != nil { - return 0, fmt.Errorf("cannot create new file %q: %w", path, err) - } - defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", path) - _, err = io.Copy(file, srcFile) - if err != nil { - return 0, fmt.Errorf("cannot write file %q: %w", path, err) - } - - return finfo.Size(), nil - } - - srcIsDir := false - if srcConn.Host == destConn.Host { - srcPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(srcConn.Path)) - - srcFileStat, err := os.Stat(srcPathCleaned) - if err != nil { - return false, fmt.Errorf("cannot stat file %q: %w", srcPathCleaned, err) - } - - if srcFileStat.IsDir() { - srcIsDir = true - var srcPathPrefix string - if destIsDir { - srcPathPrefix = filepath.Dir(srcPathCleaned) - } else { - srcPathPrefix = srcPathCleaned - } - err = filepath.Walk(srcPathCleaned, func(path string, info fs.FileInfo, err error) error { - if err != nil { - return err - } - srcFilePath := path - destFilePath := filepath.Join(destPathCleaned, strings.TrimPrefix(path, srcPathPrefix)) - var file *os.File - if !info.IsDir() { - file, err = os.Open(srcFilePath) - if err != nil { - return fmt.Errorf("cannot open file %q: %w", srcFilePath, err) - } - defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", srcFilePath) - } - _, err = copyFileFunc(destFilePath, info, file) - return err - }) - if err != nil { - return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) - } - } else { - file, err := os.Open(srcPathCleaned) - if err != nil { - return false, fmt.Errorf("cannot open file %q: %w", srcPathCleaned, err) - } - defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", srcPathCleaned) - var destFilePath string - if destHasSlash { - destFilePath = filepath.Join(destPathCleaned, filepath.Base(srcPathCleaned)) - } else { - destFilePath = 
destPathCleaned - } - _, err = copyFileFunc(destFilePath, srcFileStat, file) - if err != nil { - return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) - } - } - } else { - timeout := fstype.DefaultTimeout - if opts.Timeout > 0 { - timeout = time.Duration(opts.Timeout) * time.Millisecond - } - readCtx, cancel := context.WithCancelCause(ctx) - readCtx, timeoutCancel := context.WithTimeoutCause(readCtx, timeout, fmt.Errorf("timeout copying file %q to %q", srcUri, destUri)) - defer timeoutCancel() - copyStart := time.Now() - ioch := wshclient.FileStreamTarCommand(wshfs.RpcClient, wshrpc.CommandRemoteStreamTarData{Path: srcUri, Opts: opts}, &wshrpc.RpcOpts{Timeout: opts.Timeout}) - numFiles := 0 - numSkipped := 0 - totalBytes := int64(0) - - err := tarcopy.TarCopyDest(readCtx, cancel, ioch, func(next *tar.Header, reader *tar.Reader, singleFile bool) error { - numFiles++ - nextpath := filepath.Join(destPathCleaned, next.Name) - srcIsDir = !singleFile - if singleFile && !destHasSlash { - // custom flag to indicate that the source is a single file, not a directory the contents of a directory - nextpath = destPathCleaned - } - finfo := next.FileInfo() - n, err := copyFileFunc(nextpath, finfo, reader) - if err != nil { - return fmt.Errorf("cannot copy file %q: %w", next.Name, err) - } - totalBytes += n - return nil - }) - if err != nil { - return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) - } - totalTime := time.Since(copyStart).Seconds() - totalMegaBytes := float64(totalBytes) / 1024 / 1024 - rate := float64(0) - if totalTime > 0 { - rate = totalMegaBytes / totalTime - } - log.Printf("RemoteFileCopyCommand: done; %d files copied in %.3fs, total of %.4f MB, %.2f MB/s, %d files skipped\n", numFiles, totalTime, totalMegaBytes, rate, numSkipped) - } - return srcIsDir, nil -} - -func (impl *ServerImpl) RemoteListEntriesCommand(ctx context.Context, data wshrpc.CommandRemoteListEntriesData) chan 
wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData] { - ch := make(chan wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData], 16) - go func() { - defer close(ch) - path, err := wavebase.ExpandHomeDir(data.Path) - if err != nil { - ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](err) - return - } - innerFilesEntries := []os.DirEntry{} - seen := 0 - if data.Opts.Limit == 0 { - data.Opts.Limit = wshrpc.MaxDirSize - } - if data.Opts.All { - fs.WalkDir(os.DirFS(path), ".", func(path string, d fs.DirEntry, err error) error { - defer func() { - seen++ - }() - if seen < data.Opts.Offset { - return nil - } - if seen >= data.Opts.Offset+data.Opts.Limit { - return io.EOF - } - if err != nil { - return err - } - if d.IsDir() { - return nil - } - innerFilesEntries = append(innerFilesEntries, d) - return nil - }) - } else { - innerFilesEntries, err = os.ReadDir(path) - if err != nil { - ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](fmt.Errorf("cannot open dir %q: %w", path, err)) - return - } - } - var fileInfoArr []*wshrpc.FileInfo - for _, innerFileEntry := range innerFilesEntries { - if ctx.Err() != nil { - ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](ctx.Err()) - return - } - innerFileInfoInt, err := innerFileEntry.Info() - if err != nil { - log.Printf("cannot stat file %q: %v\n", innerFileEntry.Name(), err) - continue - } - innerFileInfo := statToFileInfo(filepath.Join(path, innerFileInfoInt.Name()), innerFileInfoInt, false) - fileInfoArr = append(fileInfoArr, innerFileInfo) - if len(fileInfoArr) >= wshrpc.DirChunkSize { - resp := wshrpc.CommandRemoteListEntriesRtnData{FileInfo: fileInfoArr} - ch <- wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData]{Response: resp} - fileInfoArr = nil - } - } - if len(fileInfoArr) > 0 { - resp := wshrpc.CommandRemoteListEntriesRtnData{FileInfo: fileInfoArr} - ch <- wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData]{Response: resp} - } - }() - 
return ch -} - -func statToFileInfo(fullPath string, finfo fs.FileInfo, extended bool) *wshrpc.FileInfo { - mimeType := fileutil.DetectMimeType(fullPath, finfo, extended) - rtn := &wshrpc.FileInfo{ - Path: wavebase.ReplaceHomeDir(fullPath), - Dir: computeDirPart(fullPath), - Name: finfo.Name(), - Size: finfo.Size(), - Mode: finfo.Mode(), - ModeStr: finfo.Mode().String(), - ModTime: finfo.ModTime().UnixMilli(), - IsDir: finfo.IsDir(), - MimeType: mimeType, - SupportsMkdir: true, - } - if finfo.IsDir() { - rtn.Size = -1 - } - return rtn -} - -// fileInfo might be null -func checkIsReadOnly(path string, fileInfo fs.FileInfo, exists bool) bool { - if !exists || fileInfo.Mode().IsDir() { - dirName := filepath.Dir(path) - randHexStr, err := utilfn.RandomHexString(12) - if err != nil { - // we're not sure, just return false - return false - } - tmpFileName := filepath.Join(dirName, "wsh-tmp-"+randHexStr) - fd, err := os.Create(tmpFileName) - if err != nil { - return true - } - utilfn.GracefulClose(fd, "checkIsReadOnly", tmpFileName) - os.Remove(tmpFileName) - return false - } - // try to open for writing, if this fails then it is read-only - file, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - return true - } - utilfn.GracefulClose(file, "checkIsReadOnly", path) - return false -} - -func computeDirPart(path string) string { - path = filepath.Clean(wavebase.ExpandHomeDirSafe(path)) - path = filepath.ToSlash(path) - if path == "/" { - return "/" - } - return filepath.Dir(path) -} - -func (*ServerImpl) fileInfoInternal(path string, extended bool) (*wshrpc.FileInfo, error) { - cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) - finfo, err := os.Stat(cleanedPath) - if os.IsNotExist(err) { - return &wshrpc.FileInfo{ - Path: wavebase.ReplaceHomeDir(path), - Dir: computeDirPart(path), - NotFound: true, - ReadOnly: checkIsReadOnly(cleanedPath, finfo, false), - SupportsMkdir: true, - }, nil - } - if err != nil { - return nil, 
fmt.Errorf("cannot stat file %q: %w", path, err) - } - rtn := statToFileInfo(cleanedPath, finfo, extended) - if extended { - rtn.ReadOnly = checkIsReadOnly(cleanedPath, finfo, true) - } - return rtn, nil -} - -func resolvePaths(paths []string) string { - if len(paths) == 0 { - return wavebase.ExpandHomeDirSafe("~") - } - rtnPath := wavebase.ExpandHomeDirSafe(paths[0]) - for _, path := range paths[1:] { - path = wavebase.ExpandHomeDirSafe(path) - if filepath.IsAbs(path) { - rtnPath = path - continue - } - rtnPath = filepath.Join(rtnPath, path) - } - return rtnPath -} - -func (impl *ServerImpl) RemoteFileJoinCommand(ctx context.Context, paths []string) (*wshrpc.FileInfo, error) { - rtnPath := resolvePaths(paths) - return impl.fileInfoInternal(rtnPath, true) -} - -func (impl *ServerImpl) RemoteFileInfoCommand(ctx context.Context, path string) (*wshrpc.FileInfo, error) { - return impl.fileInfoInternal(path, true) -} - -func (impl *ServerImpl) RemoteFileTouchCommand(ctx context.Context, path string) error { - cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) - if _, err := os.Stat(cleanedPath); err == nil { - return fmt.Errorf("file %q already exists", path) - } - if err := os.MkdirAll(filepath.Dir(cleanedPath), 0755); err != nil { - return fmt.Errorf("cannot create directory %q: %w", filepath.Dir(cleanedPath), err) - } - if err := os.WriteFile(cleanedPath, []byte{}, 0644); err != nil { - return fmt.Errorf("cannot create file %q: %w", cleanedPath, err) - } - return nil -} - -func (impl *ServerImpl) RemoteFileMoveCommand(ctx context.Context, data wshrpc.CommandFileCopyData) error { - opts := data.Opts - destUri := data.DestUri - srcUri := data.SrcUri - overwrite := opts != nil && opts.Overwrite - recursive := opts != nil && opts.Recursive - - destConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, destUri) - if err != nil { - return fmt.Errorf("cannot parse destination URI %q: %w", srcUri, err) - } - destPathCleaned := 
filepath.Clean(wavebase.ExpandHomeDirSafe(destConn.Path)) - destinfo, err := os.Stat(destPathCleaned) - if err == nil { - if !destinfo.IsDir() { - if !overwrite { - return fmt.Errorf("destination %q already exists, use overwrite option", destUri) - } else { - err := os.Remove(destPathCleaned) - if err != nil { - return fmt.Errorf("cannot remove file %q: %w", destUri, err) - } - } - } - } else if !errors.Is(err, fs.ErrNotExist) { - return fmt.Errorf("cannot stat destination %q: %w", destUri, err) - } - srcConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, srcUri) - if err != nil { - return fmt.Errorf("cannot parse source URI %q: %w", srcUri, err) - } - if srcConn.Host == destConn.Host { - srcPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(srcConn.Path)) - finfo, err := os.Stat(srcPathCleaned) - if err != nil { - return fmt.Errorf("cannot stat file %q: %w", srcPathCleaned, err) - } - if finfo.IsDir() && !recursive { - return fmt.Errorf(fstype.RecursiveRequiredError) - } - err = os.Rename(srcPathCleaned, destPathCleaned) - if err != nil { - return fmt.Errorf("cannot move file %q to %q: %w", srcPathCleaned, destPathCleaned, err) - } - } else { - return fmt.Errorf("cannot move file %q to %q: different hosts", srcUri, destUri) - } - return nil -} - -func (impl *ServerImpl) RemoteMkdirCommand(ctx context.Context, path string) error { - cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) - if stat, err := os.Stat(cleanedPath); err == nil { - if stat.IsDir() { - return fmt.Errorf("directory %q already exists", path) - } else { - return fmt.Errorf("cannot create directory %q, file exists at path", path) - } - } - if err := os.MkdirAll(cleanedPath, 0755); err != nil { - return fmt.Errorf("cannot create directory %q: %w", cleanedPath, err) - } - return nil -} -func (*ServerImpl) RemoteWriteFileCommand(ctx context.Context, data wshrpc.FileData) error { - var truncate, append bool - var atOffset int64 - if data.Info != nil && data.Info.Opts != nil { - 
truncate = data.Info.Opts.Truncate - append = data.Info.Opts.Append - } - if data.At != nil { - atOffset = data.At.Offset - } - if truncate && atOffset > 0 { - return fmt.Errorf("cannot specify non-zero offset with truncate option") - } - if append && atOffset > 0 { - return fmt.Errorf("cannot specify non-zero offset with append option") - } - path, err := wavebase.ExpandHomeDir(data.Info.Path) - if err != nil { - return err - } - createMode := os.FileMode(0644) - if data.Info != nil && data.Info.Mode > 0 { - createMode = data.Info.Mode - } - dataSize := base64.StdEncoding.DecodedLen(len(data.Data64)) - dataBytes := make([]byte, dataSize) - n, err := base64.StdEncoding.Decode(dataBytes, []byte(data.Data64)) - if err != nil { - return fmt.Errorf("cannot decode base64 data: %w", err) - } - finfo, err := os.Stat(path) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return fmt.Errorf("cannot stat file %q: %w", path, err) - } - fileSize := int64(0) - if finfo != nil { - fileSize = finfo.Size() - } - if atOffset > fileSize { - return fmt.Errorf("cannot write at offset %d, file size is %d", atOffset, fileSize) - } - openFlags := os.O_CREATE | os.O_WRONLY - if truncate { - openFlags |= os.O_TRUNC - } - if append { - openFlags |= os.O_APPEND - } - - file, err := os.OpenFile(path, openFlags, createMode) - if err != nil { - return fmt.Errorf("cannot open file %q: %w", path, err) - } - defer utilfn.GracefulClose(file, "RemoteWriteFileCommand", path) - if atOffset > 0 && !append { - n, err = file.WriteAt(dataBytes[:n], atOffset) - } else { - n, err = file.Write(dataBytes[:n]) - } - if err != nil { - return fmt.Errorf("cannot write to file %q: %w", path, err) - } - return nil -} - -func (*ServerImpl) RemoteFileDeleteCommand(ctx context.Context, data wshrpc.CommandDeleteFileData) error { - expandedPath, err := wavebase.ExpandHomeDir(data.Path) - if err != nil { - return fmt.Errorf("cannot delete file %q: %w", data.Path, err) - } - cleanedPath := 
filepath.Clean(expandedPath) - - err = os.Remove(cleanedPath) - if err != nil { - finfo, _ := os.Stat(cleanedPath) - if finfo != nil && finfo.IsDir() { - if !data.Recursive { - return fmt.Errorf(fstype.RecursiveRequiredError) - } - err = os.RemoveAll(cleanedPath) - if err != nil { - return fmt.Errorf("cannot delete directory %q: %w", data.Path, err) - } - } else { - return fmt.Errorf("cannot delete file %q: %w", data.Path, err) - } - } - return nil -} - func (*ServerImpl) RemoteGetInfoCommand(ctx context.Context) (wshrpc.RemoteInfo, error) { return wshutil.GetInfo(), nil } diff --git a/pkg/wshrpc/wshremote/wshremote_file.go b/pkg/wshrpc/wshremote/wshremote_file.go new file mode 100644 index 0000000000..61f15578b5 --- /dev/null +++ b/pkg/wshrpc/wshremote/wshremote_file.go @@ -0,0 +1,810 @@ +// Copyright 2025, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package wshremote + +import ( + "archive/tar" + "context" + "encoding/base64" + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "strings" + "time" + + "github.com/wavetermdev/waveterm/pkg/remote/connparse" + "github.com/wavetermdev/waveterm/pkg/remote/fileshare/fstype" + "github.com/wavetermdev/waveterm/pkg/remote/fileshare/wshfs" + "github.com/wavetermdev/waveterm/pkg/util/fileutil" + "github.com/wavetermdev/waveterm/pkg/util/iochan/iochantypes" + "github.com/wavetermdev/waveterm/pkg/util/tarcopy" + "github.com/wavetermdev/waveterm/pkg/util/utilfn" + "github.com/wavetermdev/waveterm/pkg/wavebase" + "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" + "github.com/wavetermdev/waveterm/pkg/wshutil" +) + +type ByteRangeType struct { + All bool + Start int64 + End int64 +} + +func parseByteRange(rangeStr string) (ByteRangeType, error) { + if rangeStr == "" { + return ByteRangeType{All: true}, nil + } + var start, end int64 + _, err := fmt.Sscanf(rangeStr, "%d-%d", &start, &end) + if err != nil { + return ByteRangeType{}, 
errors.New("invalid byte range") + } + if start < 0 || end < 0 || start > end { + return ByteRangeType{}, errors.New("invalid byte range") + } + return ByteRangeType{Start: start, End: end}, nil +} + +func (impl *ServerImpl) remoteStreamFileDir(ctx context.Context, path string, byteRange ByteRangeType, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { + innerFilesEntries, err := os.ReadDir(path) + if err != nil { + return fmt.Errorf("cannot open dir %q: %w", path, err) + } + if byteRange.All { + if len(innerFilesEntries) > wshrpc.MaxDirSize { + innerFilesEntries = innerFilesEntries[:wshrpc.MaxDirSize] + } + } else { + if byteRange.Start < int64(len(innerFilesEntries)) { + realEnd := byteRange.End + if realEnd > int64(len(innerFilesEntries)) { + realEnd = int64(len(innerFilesEntries)) + } + innerFilesEntries = innerFilesEntries[byteRange.Start:realEnd] + } else { + innerFilesEntries = []os.DirEntry{} + } + } + var fileInfoArr []*wshrpc.FileInfo + for _, innerFileEntry := range innerFilesEntries { + if ctx.Err() != nil { + return ctx.Err() + } + innerFileInfoInt, err := innerFileEntry.Info() + if err != nil { + continue + } + innerFileInfo := statToFileInfo(filepath.Join(path, innerFileInfoInt.Name()), innerFileInfoInt, false) + fileInfoArr = append(fileInfoArr, innerFileInfo) + if len(fileInfoArr) >= wshrpc.DirChunkSize { + dataCallback(fileInfoArr, nil, byteRange) + fileInfoArr = nil + } + } + if len(fileInfoArr) > 0 { + dataCallback(fileInfoArr, nil, byteRange) + } + return nil +} + +func (impl *ServerImpl) remoteStreamFileRegular(ctx context.Context, path string, byteRange ByteRangeType, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { + fd, err := os.Open(path) + if err != nil { + return fmt.Errorf("cannot open file %q: %w", path, err) + } + defer utilfn.GracefulClose(fd, "remoteStreamFileRegular", path) + var filePos int64 + if !byteRange.All && byteRange.Start > 0 { + _, 
err := fd.Seek(byteRange.Start, io.SeekStart) + if err != nil { + return fmt.Errorf("seeking file %q: %w", path, err) + } + filePos = byteRange.Start + } + buf := make([]byte, wshrpc.FileChunkSize) + for { + if ctx.Err() != nil { + return ctx.Err() + } + n, err := fd.Read(buf) + if n > 0 { + if !byteRange.All && filePos+int64(n) > byteRange.End { + n = int(byteRange.End - filePos) + } + filePos += int64(n) + dataCallback(nil, buf[:n], byteRange) + } + if !byteRange.All && filePos >= byteRange.End { + break + } + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return fmt.Errorf("reading file %q: %w", path, err) + } + } + return nil +} + +func (impl *ServerImpl) remoteStreamFileInternal(ctx context.Context, data wshrpc.CommandRemoteStreamFileData, dataCallback func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType)) error { + byteRange, err := parseByteRange(data.ByteRange) + if err != nil { + return err + } + path, err := wavebase.ExpandHomeDir(data.Path) + if err != nil { + return err + } + finfo, err := impl.fileInfoInternal(path, true) + if err != nil { + return fmt.Errorf("cannot stat file %q: %w", path, err) + } + dataCallback([]*wshrpc.FileInfo{finfo}, nil, byteRange) + if finfo.NotFound { + return nil + } + if finfo.IsDir { + return impl.remoteStreamFileDir(ctx, path, byteRange, dataCallback) + } else { + return impl.remoteStreamFileRegular(ctx, path, byteRange, dataCallback) + } +} + +func (impl *ServerImpl) RemoteStreamFileCommand(ctx context.Context, data wshrpc.CommandRemoteStreamFileData) chan wshrpc.RespOrErrorUnion[wshrpc.FileData] { + ch := make(chan wshrpc.RespOrErrorUnion[wshrpc.FileData], 16) + go func() { + defer close(ch) + firstPk := true + err := impl.remoteStreamFileInternal(ctx, data, func(fileInfo []*wshrpc.FileInfo, data []byte, byteRange ByteRangeType) { + resp := wshrpc.FileData{} + fileInfoLen := len(fileInfo) + if fileInfoLen > 1 || !firstPk { + resp.Entries = fileInfo + } else if fileInfoLen == 1 { + 
resp.Info = fileInfo[0] + } + if firstPk { + firstPk = false + } + if len(data) > 0 { + resp.Data64 = base64.StdEncoding.EncodeToString(data) + resp.At = &wshrpc.FileDataAt{Offset: byteRange.Start, Size: len(data)} + } + ch <- wshrpc.RespOrErrorUnion[wshrpc.FileData]{Response: resp} + }) + if err != nil { + ch <- wshutil.RespErr[wshrpc.FileData](err) + } + }() + return ch +} + +func (impl *ServerImpl) RemoteTarStreamCommand(ctx context.Context, data wshrpc.CommandRemoteStreamTarData) <-chan wshrpc.RespOrErrorUnion[iochantypes.Packet] { + path := data.Path + opts := data.Opts + if opts == nil { + opts = &wshrpc.FileCopyOpts{} + } + log.Printf("RemoteTarStreamCommand: path=%s\n", path) + srcHasSlash := strings.HasSuffix(path, "/") + path, err := wavebase.ExpandHomeDir(path) + if err != nil { + return wshutil.SendErrCh[iochantypes.Packet](fmt.Errorf("cannot expand path %q: %w", path, err)) + } + cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) + finfo, err := os.Stat(cleanedPath) + if err != nil { + return wshutil.SendErrCh[iochantypes.Packet](fmt.Errorf("cannot stat file %q: %w", path, err)) + } + + var pathPrefix string + singleFile := !finfo.IsDir() + if !singleFile && srcHasSlash { + pathPrefix = cleanedPath + } else { + pathPrefix = filepath.Dir(cleanedPath) + } + + timeout := fstype.DefaultTimeout + if opts.Timeout > 0 { + timeout = time.Duration(opts.Timeout) * time.Millisecond + } + readerCtx, cancel := context.WithTimeout(ctx, timeout) + rtn, writeHeader, fileWriter, tarClose := tarcopy.TarCopySrc(readerCtx, pathPrefix) + + go func() { + defer func() { + tarClose() + cancel() + }() + walkFunc := func(path string, info fs.FileInfo, err error) error { + if readerCtx.Err() != nil { + return readerCtx.Err() + } + if err != nil { + return err + } + if err = writeHeader(info, path, singleFile); err != nil { + return err + } + // if not a dir, write file content + if !info.IsDir() { + data, err := os.Open(path) + if err != nil { + return err + } + 
defer utilfn.GracefulClose(data, "RemoteTarStreamCommand", path) + if _, err := io.Copy(fileWriter, data); err != nil { + return err + } + } + return nil + } + log.Printf("RemoteTarStreamCommand: starting\n") + err = nil + if singleFile { + err = walkFunc(cleanedPath, finfo, nil) + } else { + err = filepath.Walk(cleanedPath, walkFunc) + } + if err != nil { + rtn <- wshutil.RespErr[iochantypes.Packet](err) + } + log.Printf("RemoteTarStreamCommand: done\n") + }() + log.Printf("RemoteTarStreamCommand: returning channel\n") + return rtn +} + +func (impl *ServerImpl) RemoteFileCopyCommand(ctx context.Context, data wshrpc.CommandFileCopyData) (bool, error) { + log.Printf("RemoteFileCopyCommand: src=%s, dest=%s\n", data.SrcUri, data.DestUri) + opts := data.Opts + if opts == nil { + opts = &wshrpc.FileCopyOpts{} + } + destUri := data.DestUri + srcUri := data.SrcUri + merge := opts.Merge + overwrite := opts.Overwrite + if overwrite && merge { + return false, fmt.Errorf("cannot specify both overwrite and merge") + } + + destConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, destUri) + if err != nil { + return false, fmt.Errorf("cannot parse destination URI %q: %w", destUri, err) + } + destPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(destConn.Path)) + destinfo, err := os.Stat(destPathCleaned) + if err != nil { + if !errors.Is(err, fs.ErrNotExist) { + return false, fmt.Errorf("cannot stat destination %q: %w", destPathCleaned, err) + } + } + + destExists := destinfo != nil + destIsDir := destExists && destinfo.IsDir() + destHasSlash := strings.HasSuffix(destUri, "/") + + if destExists && !destIsDir { + if !overwrite { + return false, fmt.Errorf(fstype.OverwriteRequiredError, destPathCleaned) + } else { + err := os.Remove(destPathCleaned) + if err != nil { + return false, fmt.Errorf("cannot remove file %q: %w", destPathCleaned, err) + } + } + } + srcConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, srcUri) + if err != nil { + return false, 
fmt.Errorf("cannot parse source URI %q: %w", srcUri, err) + } + + copyFileFunc := func(path string, finfo fs.FileInfo, srcFile io.Reader) (int64, error) { + nextinfo, err := os.Stat(path) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return 0, fmt.Errorf("cannot stat file %q: %w", path, err) + } + + if nextinfo != nil { + if nextinfo.IsDir() { + if !finfo.IsDir() { + // try to create file in directory + path = filepath.Join(path, filepath.Base(finfo.Name())) + newdestinfo, err := os.Stat(path) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return 0, fmt.Errorf("cannot stat file %q: %w", path, err) + } + if newdestinfo != nil && !overwrite { + return 0, fmt.Errorf(fstype.OverwriteRequiredError, path) + } + } else if overwrite { + err := os.RemoveAll(path) + if err != nil { + return 0, fmt.Errorf("cannot remove directory %q: %w", path, err) + } + } else if !merge { + return 0, fmt.Errorf(fstype.MergeRequiredError, path) + } + } else { + if !overwrite { + return 0, fmt.Errorf(fstype.OverwriteRequiredError, path) + } else if finfo.IsDir() { + err := os.RemoveAll(path) + if err != nil { + return 0, fmt.Errorf("cannot remove directory %q: %w", path, err) + } + } + } + } + + if finfo.IsDir() { + err := os.MkdirAll(path, finfo.Mode()) + if err != nil { + return 0, fmt.Errorf("cannot create directory %q: %w", path, err) + } + return 0, nil + } else { + err := os.MkdirAll(filepath.Dir(path), 0755) + if err != nil { + return 0, fmt.Errorf("cannot create parent directory %q: %w", filepath.Dir(path), err) + } + } + + file, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, finfo.Mode()) + if err != nil { + return 0, fmt.Errorf("cannot create new file %q: %w", path, err) + } + defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", path) + _, err = io.Copy(file, srcFile) + if err != nil { + return 0, fmt.Errorf("cannot write file %q: %w", path, err) + } + + return finfo.Size(), nil + } + + srcIsDir := false + if srcConn.Host == destConn.Host { + 
srcPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(srcConn.Path)) + + srcFileStat, err := os.Stat(srcPathCleaned) + if err != nil { + return false, fmt.Errorf("cannot stat file %q: %w", srcPathCleaned, err) + } + + if srcFileStat.IsDir() { + srcIsDir = true + var srcPathPrefix string + if destIsDir { + srcPathPrefix = filepath.Dir(srcPathCleaned) + } else { + srcPathPrefix = srcPathCleaned + } + err = filepath.Walk(srcPathCleaned, func(path string, info fs.FileInfo, err error) error { + if err != nil { + return err + } + srcFilePath := path + destFilePath := filepath.Join(destPathCleaned, strings.TrimPrefix(path, srcPathPrefix)) + var file *os.File + if !info.IsDir() { + file, err = os.Open(srcFilePath) + if err != nil { + return fmt.Errorf("cannot open file %q: %w", srcFilePath, err) + } + defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", srcFilePath) + } + _, err = copyFileFunc(destFilePath, info, file) + return err + }) + if err != nil { + return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) + } + } else { + file, err := os.Open(srcPathCleaned) + if err != nil { + return false, fmt.Errorf("cannot open file %q: %w", srcPathCleaned, err) + } + defer utilfn.GracefulClose(file, "RemoteFileCopyCommand", srcPathCleaned) + var destFilePath string + if destHasSlash { + destFilePath = filepath.Join(destPathCleaned, filepath.Base(srcPathCleaned)) + } else { + destFilePath = destPathCleaned + } + _, err = copyFileFunc(destFilePath, srcFileStat, file) + if err != nil { + return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) + } + } + } else { + timeout := fstype.DefaultTimeout + if opts.Timeout > 0 { + timeout = time.Duration(opts.Timeout) * time.Millisecond + } + readCtx, cancel := context.WithCancelCause(ctx) + readCtx, timeoutCancel := context.WithTimeoutCause(readCtx, timeout, fmt.Errorf("timeout copying file %q to %q", srcUri, destUri)) + defer timeoutCancel() + copyStart := time.Now() + ioch := 
wshclient.FileStreamTarCommand(wshfs.RpcClient, wshrpc.CommandRemoteStreamTarData{Path: srcUri, Opts: opts}, &wshrpc.RpcOpts{Timeout: opts.Timeout}) + numFiles := 0 + numSkipped := 0 + totalBytes := int64(0) + + err := tarcopy.TarCopyDest(readCtx, cancel, ioch, func(next *tar.Header, reader *tar.Reader, singleFile bool) error { + numFiles++ + nextpath := filepath.Join(destPathCleaned, next.Name) + srcIsDir = !singleFile + if singleFile && !destHasSlash { + // custom flag to indicate that the source is a single file, not a directory or the contents of a directory + nextpath = destPathCleaned + } + finfo := next.FileInfo() + n, err := copyFileFunc(nextpath, finfo, reader) + if err != nil { + return fmt.Errorf("cannot copy file %q: %w", next.Name, err) + } + totalBytes += n + return nil + }) + if err != nil { + return false, fmt.Errorf("cannot copy %q to %q: %w", srcUri, destUri, err) + } + totalTime := time.Since(copyStart).Seconds() + totalMegaBytes := float64(totalBytes) / 1024 / 1024 + rate := float64(0) + if totalTime > 0 { + rate = totalMegaBytes / totalTime + } + log.Printf("RemoteFileCopyCommand: done; %d files copied in %.3fs, total of %.4f MB, %.2f MB/s, %d files skipped\n", numFiles, totalTime, totalMegaBytes, rate, numSkipped) + } + return srcIsDir, nil +} + +func (impl *ServerImpl) RemoteListEntriesCommand(ctx context.Context, data wshrpc.CommandRemoteListEntriesData) chan wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData] { + ch := make(chan wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData], 16) + go func() { + defer close(ch) + path, err := wavebase.ExpandHomeDir(data.Path) + if err != nil { + ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](err) + return + } + innerFilesEntries := []os.DirEntry{} + seen := 0 + if data.Opts.Limit == 0 { + data.Opts.Limit = wshrpc.MaxDirSize + } + if data.Opts.All { + fs.WalkDir(os.DirFS(path), ".", func(path string, d fs.DirEntry, err error) error { + defer func() { + seen++ + }() + 
if seen < data.Opts.Offset { + return nil + } + if seen >= data.Opts.Offset+data.Opts.Limit { + return io.EOF + } + if err != nil { + return err + } + if d.IsDir() { + return nil + } + innerFilesEntries = append(innerFilesEntries, d) + return nil + }) + } else { + innerFilesEntries, err = os.ReadDir(path) + if err != nil { + ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](fmt.Errorf("cannot open dir %q: %w", path, err)) + return + } + } + var fileInfoArr []*wshrpc.FileInfo + for _, innerFileEntry := range innerFilesEntries { + if ctx.Err() != nil { + ch <- wshutil.RespErr[wshrpc.CommandRemoteListEntriesRtnData](ctx.Err()) + return + } + innerFileInfoInt, err := innerFileEntry.Info() + if err != nil { + log.Printf("cannot stat file %q: %v\n", innerFileEntry.Name(), err) + continue + } + innerFileInfo := statToFileInfo(filepath.Join(path, innerFileInfoInt.Name()), innerFileInfoInt, false) + fileInfoArr = append(fileInfoArr, innerFileInfo) + if len(fileInfoArr) >= wshrpc.DirChunkSize { + resp := wshrpc.CommandRemoteListEntriesRtnData{FileInfo: fileInfoArr} + ch <- wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData]{Response: resp} + fileInfoArr = nil + } + } + if len(fileInfoArr) > 0 { + resp := wshrpc.CommandRemoteListEntriesRtnData{FileInfo: fileInfoArr} + ch <- wshrpc.RespOrErrorUnion[wshrpc.CommandRemoteListEntriesRtnData]{Response: resp} + } + }() + return ch +} + +func statToFileInfo(fullPath string, finfo fs.FileInfo, extended bool) *wshrpc.FileInfo { + mimeType := fileutil.DetectMimeType(fullPath, finfo, extended) + rtn := &wshrpc.FileInfo{ + Path: wavebase.ReplaceHomeDir(fullPath), + Dir: computeDirPart(fullPath), + Name: finfo.Name(), + Size: finfo.Size(), + Mode: finfo.Mode(), + ModeStr: finfo.Mode().String(), + ModTime: finfo.ModTime().UnixMilli(), + IsDir: finfo.IsDir(), + MimeType: mimeType, + SupportsMkdir: true, + } + if finfo.IsDir() { + rtn.Size = -1 + } + return rtn +} + +// fileInfo might be null +func checkIsReadOnly(path 
string, fileInfo fs.FileInfo, exists bool) bool { + if !exists || fileInfo.Mode().IsDir() { + dirName := filepath.Dir(path) + randHexStr, err := utilfn.RandomHexString(12) + if err != nil { + // we're not sure, just return false + return false + } + tmpFileName := filepath.Join(dirName, "wsh-tmp-"+randHexStr) + fd, err := os.Create(tmpFileName) + if err != nil { + return true + } + utilfn.GracefulClose(fd, "checkIsReadOnly", tmpFileName) + os.Remove(tmpFileName) + return false + } + // try to open for writing, if this fails then it is read-only + file, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return true + } + utilfn.GracefulClose(file, "checkIsReadOnly", path) + return false +} + +func computeDirPart(path string) string { + path = filepath.Clean(wavebase.ExpandHomeDirSafe(path)) + path = filepath.ToSlash(path) + if path == "/" { + return "/" + } + return filepath.Dir(path) +} + +func (*ServerImpl) fileInfoInternal(path string, extended bool) (*wshrpc.FileInfo, error) { + cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) + finfo, err := os.Stat(cleanedPath) + if os.IsNotExist(err) { + return &wshrpc.FileInfo{ + Path: wavebase.ReplaceHomeDir(path), + Dir: computeDirPart(path), + NotFound: true, + ReadOnly: checkIsReadOnly(cleanedPath, finfo, false), + SupportsMkdir: true, + }, nil + } + if err != nil { + return nil, fmt.Errorf("cannot stat file %q: %w", path, err) + } + rtn := statToFileInfo(cleanedPath, finfo, extended) + if extended { + rtn.ReadOnly = checkIsReadOnly(cleanedPath, finfo, true) + } + return rtn, nil +} + +func resolvePaths(paths []string) string { + if len(paths) == 0 { + return wavebase.ExpandHomeDirSafe("~") + } + rtnPath := wavebase.ExpandHomeDirSafe(paths[0]) + for _, path := range paths[1:] { + path = wavebase.ExpandHomeDirSafe(path) + if filepath.IsAbs(path) { + rtnPath = path + continue + } + rtnPath = filepath.Join(rtnPath, path) + } + return rtnPath +} + +func (impl *ServerImpl) 
RemoteFileJoinCommand(ctx context.Context, paths []string) (*wshrpc.FileInfo, error) { + rtnPath := resolvePaths(paths) + return impl.fileInfoInternal(rtnPath, true) +} + +func (impl *ServerImpl) RemoteFileInfoCommand(ctx context.Context, path string) (*wshrpc.FileInfo, error) { + return impl.fileInfoInternal(path, true) +} + +func (impl *ServerImpl) RemoteFileTouchCommand(ctx context.Context, path string) error { + cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) + if _, err := os.Stat(cleanedPath); err == nil { + return fmt.Errorf("file %q already exists", path) + } + if err := os.MkdirAll(filepath.Dir(cleanedPath), 0755); err != nil { + return fmt.Errorf("cannot create directory %q: %w", filepath.Dir(cleanedPath), err) + } + if err := os.WriteFile(cleanedPath, []byte{}, 0644); err != nil { + return fmt.Errorf("cannot create file %q: %w", cleanedPath, err) + } + return nil +} + +func (impl *ServerImpl) RemoteFileMoveCommand(ctx context.Context, data wshrpc.CommandFileCopyData) error { + opts := data.Opts + destUri := data.DestUri + srcUri := data.SrcUri + overwrite := opts != nil && opts.Overwrite + recursive := opts != nil && opts.Recursive + + destConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, destUri) + if err != nil { + return fmt.Errorf("cannot parse destination URI %q: %w", srcUri, err) + } + destPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(destConn.Path)) + destinfo, err := os.Stat(destPathCleaned) + if err == nil { + if !destinfo.IsDir() { + if !overwrite { + return fmt.Errorf("destination %q already exists, use overwrite option", destUri) + } else { + err := os.Remove(destPathCleaned) + if err != nil { + return fmt.Errorf("cannot remove file %q: %w", destUri, err) + } + } + } + } else if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("cannot stat destination %q: %w", destUri, err) + } + srcConn, err := connparse.ParseURIAndReplaceCurrentHost(ctx, srcUri) + if err != nil { + return fmt.Errorf("cannot parse 
source URI %q: %w", srcUri, err) + } + if srcConn.Host == destConn.Host { + srcPathCleaned := filepath.Clean(wavebase.ExpandHomeDirSafe(srcConn.Path)) + finfo, err := os.Stat(srcPathCleaned) + if err != nil { + return fmt.Errorf("cannot stat file %q: %w", srcPathCleaned, err) + } + if finfo.IsDir() && !recursive { + return fmt.Errorf(fstype.RecursiveRequiredError) + } + err = os.Rename(srcPathCleaned, destPathCleaned) + if err != nil { + return fmt.Errorf("cannot move file %q to %q: %w", srcPathCleaned, destPathCleaned, err) + } + } else { + return fmt.Errorf("cannot move file %q to %q: different hosts", srcUri, destUri) + } + return nil +} + +func (impl *ServerImpl) RemoteMkdirCommand(ctx context.Context, path string) error { + cleanedPath := filepath.Clean(wavebase.ExpandHomeDirSafe(path)) + if stat, err := os.Stat(cleanedPath); err == nil { + if stat.IsDir() { + return fmt.Errorf("directory %q already exists", path) + } else { + return fmt.Errorf("cannot create directory %q, file exists at path", path) + } + } + if err := os.MkdirAll(cleanedPath, 0755); err != nil { + return fmt.Errorf("cannot create directory %q: %w", cleanedPath, err) + } + return nil +} +func (*ServerImpl) RemoteWriteFileCommand(ctx context.Context, data wshrpc.FileData) error { + var truncate, append bool + var atOffset int64 + if data.Info != nil && data.Info.Opts != nil { + truncate = data.Info.Opts.Truncate + append = data.Info.Opts.Append + } + if data.At != nil { + atOffset = data.At.Offset + } + if truncate && atOffset > 0 { + return fmt.Errorf("cannot specify non-zero offset with truncate option") + } + if append && atOffset > 0 { + return fmt.Errorf("cannot specify non-zero offset with append option") + } + path, err := wavebase.ExpandHomeDir(data.Info.Path) + if err != nil { + return err + } + createMode := os.FileMode(0644) + if data.Info != nil && data.Info.Mode > 0 { + createMode = data.Info.Mode + } + dataSize := base64.StdEncoding.DecodedLen(len(data.Data64)) + dataBytes := 
make([]byte, dataSize) + n, err := base64.StdEncoding.Decode(dataBytes, []byte(data.Data64)) + if err != nil { + return fmt.Errorf("cannot decode base64 data: %w", err) + } + finfo, err := os.Stat(path) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("cannot stat file %q: %w", path, err) + } + fileSize := int64(0) + if finfo != nil { + fileSize = finfo.Size() + } + if atOffset > fileSize { + return fmt.Errorf("cannot write at offset %d, file size is %d", atOffset, fileSize) + } + openFlags := os.O_CREATE | os.O_WRONLY + if truncate { + openFlags |= os.O_TRUNC + } + if append { + openFlags |= os.O_APPEND + } + + file, err := os.OpenFile(path, openFlags, createMode) + if err != nil { + return fmt.Errorf("cannot open file %q: %w", path, err) + } + defer utilfn.GracefulClose(file, "RemoteWriteFileCommand", path) + if atOffset > 0 && !append { + n, err = file.WriteAt(dataBytes[:n], atOffset) + } else { + n, err = file.Write(dataBytes[:n]) + } + if err != nil { + return fmt.Errorf("cannot write to file %q: %w", path, err) + } + return nil +} + +func (*ServerImpl) RemoteFileDeleteCommand(ctx context.Context, data wshrpc.CommandDeleteFileData) error { + expandedPath, err := wavebase.ExpandHomeDir(data.Path) + if err != nil { + return fmt.Errorf("cannot delete file %q: %w", data.Path, err) + } + cleanedPath := filepath.Clean(expandedPath) + + err = os.Remove(cleanedPath) + if err != nil { + finfo, _ := os.Stat(cleanedPath) + if finfo != nil && finfo.IsDir() { + if !data.Recursive { + return fmt.Errorf(fstype.RecursiveRequiredError) + } + err = os.RemoveAll(cleanedPath) + if err != nil { + return fmt.Errorf("cannot delete directory %q: %w", data.Path, err) + } + } else { + return fmt.Errorf("cannot delete file %q: %w", data.Path, err) + } + } + return nil +} diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index e66c78bfa7..4973f0a454 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -719,7 +719,9 @@ type 
CommandRemoteReconnectToJobManagerData struct { } type CommandStartJobRtnData struct { - Pgid int `json:"pgid"` + CmdPgid int `json:"cmdpgid"` + JobManagerPid int `json:"jobmanagerpid"` + JobManagerStartTs int64 `json:"jobmanagerstartts"` } type CommandJobPrepareConnectData struct { From 991769a1c1f63e48646ba30d4e5e1b79069517ae Mon Sep 17 00:00:00 2001 From: sawka Date: Sun, 18 Jan 2026 17:58:09 -0800 Subject: [PATCH 35/64] working on jobcontroller/remote reconnect/disconnect job manager commands --- cmd/server/main-server.go | 2 +- cmd/wsh/cmd/wshcmd-connserver.go | 4 +- cmd/wsh/cmd/wshcmd-jobdebug.go | 42 ++++ frontend/app/store/wshclientapi.ts | 15 ++ frontend/types/gotypes.d.ts | 7 + pkg/jobcontroller/jobcontroller.go | 85 +++++++ pkg/wshrpc/wshclient/wshclient.go | 18 ++ pkg/wshrpc/wshremote/wshremote.go | 245 ++------------------ pkg/wshrpc/wshremote/wshremote_file.go | 2 +- pkg/wshrpc/wshremote/wshremote_job.go | 305 +++++++++++++++++++++++++ pkg/wshrpc/wshrpctypes.go | 9 + pkg/wshrpc/wshserver/wshserver.go | 8 + 12 files changed, 516 insertions(+), 226 deletions(-) create mode 100644 pkg/wshrpc/wshremote/wshremote_job.go diff --git a/cmd/server/main-server.go b/cmd/server/main-server.go index 410e1fd63b..5eb247c75c 100644 --- a/cmd/server/main-server.go +++ b/cmd/server/main-server.go @@ -392,7 +392,7 @@ func createMainWshClient() { wshfs.RpcClient = rpc wshutil.DefaultRouter.RegisterTrustedLeaf(rpc, wshutil.DefaultRoute) wps.Broker.SetClient(wshutil.DefaultRouter) - localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, &wshremote.ServerImpl{Router: wshutil.DefaultRouter, RpcClient: wshclient.GetBareRpcClient(), IsLocal: true}, "conn:local") + localConnWsh := wshutil.MakeWshRpc(wshrpc.RpcContext{Conn: wshrpc.LocalConnName}, wshremote.MakeRemoteRpcServerImpl(nil, wshutil.DefaultRouter, wshclient.GetBareRpcClient(), true), "conn:local") go wshremote.RunSysInfoLoop(localConnWsh, wshrpc.LocalConnName) 
wshutil.DefaultRouter.RegisterTrustedLeaf(localConnWsh, wshutil.MakeConnectionRouteId(wshrpc.LocalConnName)) } diff --git a/cmd/wsh/cmd/wshcmd-connserver.go b/cmd/wsh/cmd/wshcmd-connserver.go index dd672438c9..6ec0d5e4d7 100644 --- a/cmd/wsh/cmd/wshcmd-connserver.go +++ b/cmd/wsh/cmd/wshcmd-connserver.go @@ -131,7 +131,7 @@ func setupConnServerRpcClientWithRouter(router *wshutil.WshRouter) (*wshutil.Wsh bareClient := wshutil.MakeWshRpc(wshrpc.RpcContext{}, &wshclient.WshServer{}, bareRouteId) router.RegisterTrustedLeaf(bareClient, bareRouteId) - connServerClient := wshutil.MakeWshRpc(rpcCtx, &wshremote.ServerImpl{LogWriter: os.Stdout, Router: router, RpcClient: bareClient}, routeId) + connServerClient := wshutil.MakeWshRpc(rpcCtx, wshremote.MakeRemoteRpcServerImpl(os.Stdout, router, bareClient, false), routeId) router.RegisterTrustedLeaf(connServerClient, routeId) return connServerClient, nil } @@ -323,7 +323,7 @@ func serverRunRouterDomainSocket(jwtToken string) error { } func serverRunNormal(jwtToken string) error { - err := setupRpcClient(&wshremote.ServerImpl{LogWriter: os.Stdout}, jwtToken) + err := setupRpcClient(wshremote.MakeRemoteRpcServerImpl(os.Stdout, nil, nil, false), jwtToken) if err != nil { return err } diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 1a068fcf2f..b7c702911a 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -56,6 +56,18 @@ var jobDebugExitCmd = &cobra.Command{ RunE: jobDebugExitRun, } +var jobDebugDisconnectCmd = &cobra.Command{ + Use: "disconnect", + Short: "disconnect from a job manager", + RunE: jobDebugDisconnectRun, +} + +var jobDebugReconnectCmd = &cobra.Command{ + Use: "reconnect", + Short: "reconnect to a job manager", + RunE: jobDebugReconnectRun, +} + var jobDebugGetOutputCmd = &cobra.Command{ Use: "getoutput", Short: "get the terminal output for a job", @@ -72,6 +84,8 @@ var jobIdFlag string var jobDebugJsonFlag bool var jobConnFlag string var 
exitJobIdFlag string +var disconnectJobIdFlag string +var reconnectJobIdFlag string func init() { rootCmd.AddCommand(jobDebugCmd) @@ -81,6 +95,8 @@ func init() { jobDebugCmd.AddCommand(jobDebugPruneCmd) jobDebugCmd.AddCommand(jobDebugTerminateCmdCmd) jobDebugCmd.AddCommand(jobDebugExitCmd) + jobDebugCmd.AddCommand(jobDebugDisconnectCmd) + jobDebugCmd.AddCommand(jobDebugReconnectCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) jobDebugCmd.AddCommand(jobDebugStartCmd) @@ -95,6 +111,12 @@ func init() { jobDebugExitCmd.Flags().StringVar(&exitJobIdFlag, "jobid", "", "job id to exit (required)") jobDebugExitCmd.MarkFlagRequired("jobid") + jobDebugDisconnectCmd.Flags().StringVar(&disconnectJobIdFlag, "jobid", "", "job id to disconnect (required)") + jobDebugDisconnectCmd.MarkFlagRequired("jobid") + + jobDebugReconnectCmd.Flags().StringVar(&reconnectJobIdFlag, "jobid", "", "job id to reconnect (required)") + jobDebugReconnectCmd.MarkFlagRequired("jobid") + jobDebugGetOutputCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to get output for (required)") jobDebugGetOutputCmd.MarkFlagRequired("jobid") @@ -252,6 +274,26 @@ func jobDebugExitRun(cmd *cobra.Command, args []string) error { return nil } +func jobDebugDisconnectRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerDisconnectJobCommand(RpcClient, disconnectJobIdFlag, nil) + if err != nil { + return fmt.Errorf("disconnecting from job manager: %w", err) + } + + fmt.Printf("Disconnected from job manager for %s successfully\n", disconnectJobIdFlag) + return nil +} + +func jobDebugReconnectRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerReconnectJobCommand(RpcClient, reconnectJobIdFlag, nil) + if err != nil { + return fmt.Errorf("reconnecting to job manager: %w", err) + } + + fmt.Printf("Reconnected to job manager for %s successfully\n", reconnectJobIdFlag) + return nil +} + func jobDebugGetOutputRun(cmd *cobra.Command, args []string) error { fileData, err := 
wshclient.FileReadCommand(RpcClient, wshrpc.FileData{ Info: &wshrpc.FileInfo{ diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index cacc89514b..ac0cede785 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -397,11 +397,21 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerconnectedjobs", null, opts); } + // command "jobcontrollerdisconnectjob" [call] + JobControllerDisconnectJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerdisconnectjob", data, opts); + } + // command "jobcontrollerexitjob" [call] JobControllerExitJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerexitjob", data, opts); } + // command "jobcontrollerreconnectjob" [call] + JobControllerReconnectJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerreconnectjob", data, opts); + } + // command "jobcontrollerstartjob" [call] JobControllerStartJobCommand(client: WshClient, data: CommandJobControllerStartJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerstartjob", data, opts); @@ -502,6 +512,11 @@ class RpcApiType { return client.wshRpcCall("recordtevent", data, opts); } + // command "remotedisconnectfromjobmanager" [call] + RemoteDisconnectFromJobManagerCommand(client: WshClient, data: CommandRemoteDisconnectFromJobManagerData, opts?: RpcOpts): Promise { + return client.wshRpcCall("remotedisconnectfromjobmanager", data, opts); + } + // command "remotefilecopy" [call] RemoteFileCopyCommand(client: WshClient, data: CommandFileCopyData, opts?: RpcOpts): Promise { return client.wshRpcCall("remotefilecopy", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index ce12df7aa0..6493d22750 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -450,6 +450,11 @@ declare 
global { modts?: number; }; + // wshrpc.CommandRemoteDisconnectFromJobManagerData + type CommandRemoteDisconnectFromJobManagerData = { + jobid: string; + }; + // wshrpc.CommandRemoteListEntriesData type CommandRemoteListEntriesData = { path: string; @@ -466,6 +471,8 @@ declare global { jobid: string; jobauthtoken: string; mainserverjwttoken: string; + jobmanagerpid: number; + jobmanagerstartts: number; }; // wshrpc.CommandRemoteStartJobData diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 8dd98c04c6..d3e6f7a8a7 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -404,6 +404,91 @@ func ExitJobManager(ctx context.Context, jobId string) error { return nil } +func DisconnectJob(ctx context.Context, jobId string) error { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { + return fmt.Errorf("main rpc client not available") + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeConnectionRouteId(job.Connection), + Timeout: 5000, + } + + disconnectData := wshrpc.CommandRemoteDisconnectFromJobManagerData{ + JobId: jobId, + } + + err = wshclient.RemoteDisconnectFromJobManagerCommand(bareRpc, disconnectData, rpcOpts) + if err != nil { + return fmt.Errorf("failed to send disconnect command: %w", err) + } + + log.Printf("[job:%s] job disconnect command sent successfully", jobId) + return nil +} + +func ReconnectJob(ctx context.Context, jobId string) error { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + if job.Connection == "" { + return fmt.Errorf("job has no connection") + } + + isConnected, err := conncontroller.IsConnected(job.Connection) + if err != nil { + return fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return 
fmt.Errorf("connection %q is not connected", job.Connection) + } + + jobAccessClaims := &wavejwt.WaveJwtClaims{ + MainServer: true, + JobId: jobId, + } + jobAccessToken, err := wavejwt.Sign(jobAccessClaims) + if err != nil { + return fmt.Errorf("failed to generate job access token: %w", err) + } + + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { + return fmt.Errorf("main rpc client not available") + } + + reconnectData := wshrpc.CommandRemoteReconnectToJobManagerData{ + JobId: jobId, + JobAuthToken: job.JobAuthToken, + MainServerJwtToken: jobAccessToken, + JobManagerPid: job.JobManagerPid, + JobManagerStartTs: job.JobManagerStartTs, + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeConnectionRouteId(job.Connection), + Timeout: 30000, + } + + log.Printf("[job:%s] sending RemoteReconnectToJobManagerCommand to connection %s", jobId, job.Connection) + err = wshclient.RemoteReconnectToJobManagerCommand(bareRpc, reconnectData, rpcOpts) + if err != nil { + log.Printf("[job:%s] RemoteReconnectToJobManagerCommand failed: %v", jobId, err) + return fmt.Errorf("failed to reconnect to job manager: %w", err) + } + + log.Printf("[job:%s] RemoteReconnectToJobManagerCommand succeeded", jobId) + return nil +} + func DeleteJob(ctx context.Context, jobId string) error { SetJobConnStatus(jobId, JobConnStatus_Disconnected) err := filestore.WFS.DeleteZone(ctx, jobId) diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index accdc8cd80..2fd95e12a7 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -482,12 +482,24 @@ func JobControllerConnectedJobsCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) return resp, err } +// command "jobcontrollerdisconnectjob", wshserver.JobControllerDisconnectJobCommand +func JobControllerDisconnectJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerdisconnectjob", data, opts) + return err +} 
+ // command "jobcontrollerexitjob", wshserver.JobControllerExitJobCommand func JobControllerExitJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerexitjob", data, opts) return err } +// command "jobcontrollerreconnectjob", wshserver.JobControllerReconnectJobCommand +func JobControllerReconnectJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerreconnectjob", data, opts) + return err +} + // command "jobcontrollerstartjob", wshserver.JobControllerStartJobCommand func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerStartJobData, opts *wshrpc.RpcOpts) (string, error) { resp, err := sendRpcRequestCallHelper[string](w, "jobcontrollerstartjob", data, opts) @@ -608,6 +620,12 @@ func RecordTEventCommand(w *wshutil.WshRpc, data telemetrydata.TEvent, opts *wsh return err } +// command "remotedisconnectfromjobmanager", wshserver.RemoteDisconnectFromJobManagerCommand +func RemoteDisconnectFromJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteDisconnectFromJobManagerData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "remotedisconnectfromjobmanager", data, opts) + return err +} + // command "remotefilecopy", wshserver.RemoteFileCopyCommand func RemoteFileCopyCommand(w *wshutil.WshRpc, data wshrpc.CommandFileCopyData, opts *wshrpc.RpcOpts) (bool, error) { resp, err := sendRpcRequestCallHelper[bool](w, "remotefilecopy", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index 7da3ea5d35..f0cfbb145c 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -4,34 +4,44 @@ package wshremote import ( - "bufio" "context" "fmt" "io" "log" "net" - "os" - "os/exec" "path/filepath" - "strings" - "time" + "sync" - "github.com/wavetermdev/waveterm/pkg/jobmanager" 
"github.com/wavetermdev/waveterm/pkg/suggestion" "github.com/wavetermdev/waveterm/pkg/wavebase" "github.com/wavetermdev/waveterm/pkg/wshrpc" - "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" "github.com/wavetermdev/waveterm/pkg/wshutil" ) -// this is the connserver interface. -// it runs on remote servers, and one instance also runs on localhost +type JobManagerConnection struct { + JobId string + Conn net.Conn + WshRpc *wshutil.WshRpc + CleanupFn func() +} type ServerImpl struct { - LogWriter io.Writer - Router *wshutil.WshRouter - RpcClient *wshutil.WshRpc - IsLocal bool + LogWriter io.Writer + Router *wshutil.WshRouter + RpcClient *wshutil.WshRpc + IsLocal bool + JobManagerMap map[string]*JobManagerConnection + Lock sync.Mutex +} + +func MakeRemoteRpcServerImpl(logWriter io.Writer, router *wshutil.WshRouter, rpcClient *wshutil.WshRpc, isLocal bool) *ServerImpl { + return &ServerImpl{ + LogWriter: logWriter, + Router: router, + RpcClient: rpcClient, + IsLocal: isLocal, + JobManagerMap: make(map[string]*JobManagerConnection), + } } func (*ServerImpl) WshServerImpl() {} @@ -92,212 +102,3 @@ func (impl *ServerImpl) getWshPath() (string, error) { } return wshPath, nil } - -// returns jobRouteId, cleanupFunc, error -func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, mainServerJwtToken string) (string, func(), error) { - socketPath := jobmanager.GetJobSocketPath(jobId) - log.Printf("connectToJobManager: connecting to socket: %s\n", socketPath) - conn, err := net.Dial("unix", socketPath) - if err != nil { - log.Printf("connectToJobManager: error connecting to socket: %v\n", err) - return "", nil, fmt.Errorf("cannot connect to job manager socket: %w", err) - } - log.Printf("connectToJobManager: connected to socket\n") - - proxy := wshutil.MakeRpcProxy("jobmanager") - go func() { - writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) - if writeErr != nil { - log.Printf("connectToJobManager: error writing to job manager 
socket: %v\n", writeErr) - } - }() - go func() { - defer func() { - conn.Close() - close(proxy.FromRemoteCh) - }() - wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) - }() - - linkId := impl.Router.RegisterUntrustedLink(proxy) - cleanup := func() { - conn.Close() - impl.Router.UnregisterLink(linkId) - } - - routeId := wshutil.MakeLinkRouteId(linkId) - authData := wshrpc.CommandAuthenticateToJobData{ - JobAccessToken: mainServerJwtToken, - } - err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) - if err != nil { - cleanup() - return "", nil, fmt.Errorf("authentication to job manager failed: %w", err) - } - - jobRouteId := wshutil.MakeJobRouteId(jobId) - waitCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) - defer cancel() - err = impl.Router.WaitForRegister(waitCtx, jobRouteId) - if err != nil { - cleanup() - return "", nil, fmt.Errorf("timeout waiting for job route to register: %w", err) - } - - log.Printf("connectToJobManager: successfully connected and authenticated\n") - return jobRouteId, cleanup, nil -} - -func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { - log.Printf("RemoteStartJobCommand: starting, jobid=%s, clientid=%s\n", data.JobId, data.ClientId) - if impl.Router == nil { - return nil, fmt.Errorf("cannot start remote job: no router available") - } - - wshPath, err := impl.getWshPath() - if err != nil { - return nil, err - } - log.Printf("RemoteStartJobCommand: wshPath=%s\n", wshPath) - - readyPipeRead, readyPipeWrite, err := os.Pipe() - if err != nil { - return nil, fmt.Errorf("cannot create ready pipe: %w", err) - } - defer readyPipeRead.Close() - defer readyPipeWrite.Close() - - cmd := exec.Command(wshPath, "jobmanager", "--jobid", data.JobId, "--clientid", data.ClientId) - if data.PublicKeyBase64 != "" { - cmd.Env = append(os.Environ(), "WAVETERM_PUBLICKEY="+data.PublicKeyBase64) - } 
- cmd.ExtraFiles = []*os.File{readyPipeWrite} - stdin, err := cmd.StdinPipe() - if err != nil { - return nil, fmt.Errorf("cannot create stdin pipe: %w", err) - } - stdout, err := cmd.StdoutPipe() - if err != nil { - return nil, fmt.Errorf("cannot create stdout pipe: %w", err) - } - stderr, err := cmd.StderrPipe() - if err != nil { - return nil, fmt.Errorf("cannot create stderr pipe: %w", err) - } - log.Printf("RemoteStartJobCommand: created pipes\n") - - if err := cmd.Start(); err != nil { - return nil, fmt.Errorf("cannot start job manager: %w", err) - } - log.Printf("RemoteStartJobCommand: job manager process started\n") - - jobAuthTokenLine := fmt.Sprintf("Wave-JobAccessToken:%s\n", data.JobAuthToken) - if _, err := stdin.Write([]byte(jobAuthTokenLine)); err != nil { - cmd.Process.Kill() - return nil, fmt.Errorf("cannot write job auth token: %w", err) - } - stdin.Close() - log.Printf("RemoteStartJobCommand: wrote auth token to stdin\n") - - go func() { - scanner := bufio.NewScanner(stderr) - for scanner.Scan() { - line := scanner.Text() - log.Printf("RemoteStartJobCommand: stderr: %s\n", line) - } - if err := scanner.Err(); err != nil { - log.Printf("RemoteStartJobCommand: error reading stderr: %v\n", err) - } else { - log.Printf("RemoteStartJobCommand: stderr EOF\n") - } - }() - - go func() { - scanner := bufio.NewScanner(stdout) - for scanner.Scan() { - line := scanner.Text() - log.Printf("RemoteStartJobCommand: stdout: %s\n", line) - } - if err := scanner.Err(); err != nil { - log.Printf("RemoteStartJobCommand: error reading stdout: %v\n", err) - } else { - log.Printf("RemoteStartJobCommand: stdout EOF\n") - } - }() - - startCh := make(chan error, 1) - go func() { - scanner := bufio.NewScanner(readyPipeRead) - for scanner.Scan() { - line := scanner.Text() - log.Printf("RemoteStartJobCommand: ready pipe line: %s\n", line) - if strings.Contains(line, "Wave-JobManagerStart") { - startCh <- nil - return - } - } - if err := scanner.Err(); err != nil { - startCh <- 
fmt.Errorf("error reading ready pipe: %w", err) - } else { - log.Printf("RemoteStartJobCommand: ready pipe EOF\n") - startCh <- fmt.Errorf("job manager exited without start signal") - } - }() - - timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) - defer cancel() - - log.Printf("RemoteStartJobCommand: waiting for start signal\n") - select { - case err := <-startCh: - if err != nil { - cmd.Process.Kill() - log.Printf("RemoteStartJobCommand: error from start signal: %v\n", err) - return nil, err - } - log.Printf("RemoteStartJobCommand: received start signal\n") - case <-timeoutCtx.Done(): - cmd.Process.Kill() - log.Printf("RemoteStartJobCommand: timeout waiting for start signal\n") - return nil, fmt.Errorf("timeout waiting for job manager to start") - } - - go func() { - cmd.Wait() - }() - - jobRouteId, cleanup, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) - if err != nil { - return nil, err - } - - startJobData := wshrpc.CommandStartJobData{ - Cmd: data.Cmd, - Args: data.Args, - Env: data.Env, - TermSize: data.TermSize, - StreamMeta: data.StreamMeta, - } - rtnData, err := wshclient.StartJobCommand(impl.RpcClient, startJobData, &wshrpc.RpcOpts{Route: jobRouteId}) - if err != nil { - cleanup() - return nil, fmt.Errorf("failed to start job: %w", err) - } - - return rtnData, nil -} - -func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteReconnectToJobManagerData) error { - log.Printf("RemoteReconnectToJobManagerCommand: reconnecting, jobid=%s\n", data.JobId) - if impl.Router == nil { - return fmt.Errorf("cannot reconnect to job manager: no router available") - } - - _, _, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) - if err != nil { - return err - } - - log.Printf("RemoteReconnectToJobManagerCommand: successfully reconnected to job manager\n") - return nil -} diff --git a/pkg/wshrpc/wshremote/wshremote_file.go b/pkg/wshrpc/wshremote/wshremote_file.go 
index 61f15578b5..c83ae60cfa 100644 --- a/pkg/wshrpc/wshremote/wshremote_file.go +++ b/pkg/wshrpc/wshremote/wshremote_file.go @@ -1,4 +1,4 @@ -// Copyright 2025, Command Line Inc. +// Copyright 2026, Command Line Inc. // SPDX-License-Identifier: Apache-2.0 package wshremote diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go new file mode 100644 index 0000000000..c15bc010fd --- /dev/null +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -0,0 +1,305 @@ +// Copyright 2025, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package wshremote + +import ( + "bufio" + "context" + "fmt" + "log" + "net" + "os" + "os/exec" + "strings" + "time" + + "github.com/shirou/gopsutil/v4/process" + "github.com/wavetermdev/waveterm/pkg/jobmanager" + "github.com/wavetermdev/waveterm/pkg/wshrpc" + "github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient" + "github.com/wavetermdev/waveterm/pkg/wshutil" +) + +func isProcessRunning(pid int, pidStartTs int64) bool { + if pid <= 0 { + return false + } + proc, err := process.NewProcess(int32(pid)) + if err != nil { + return false + } + createTime, err := proc.CreateTime() + if err != nil { + return false + } + return createTime == pidStartTs +} + +// returns jobRouteId, cleanupFunc, error +func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, mainServerJwtToken string) (string, func(), error) { + socketPath := jobmanager.GetJobSocketPath(jobId) + log.Printf("connectToJobManager: connecting to socket: %s\n", socketPath) + conn, err := net.Dial("unix", socketPath) + if err != nil { + log.Printf("connectToJobManager: error connecting to socket: %v\n", err) + return "", nil, fmt.Errorf("cannot connect to job manager socket: %w", err) + } + log.Printf("connectToJobManager: connected to socket\n") + + proxy := wshutil.MakeRpcProxy("jobmanager") + go func() { + writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) + if writeErr != nil { + log.Printf("connectToJobManager: 
error writing to job manager socket: %v\n", writeErr) + } + }() + go func() { + defer func() { + conn.Close() + close(proxy.FromRemoteCh) + impl.removeJobManagerConnection(jobId) + }() + wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + }() + + linkId := impl.Router.RegisterUntrustedLink(proxy) + cleanup := func() { + conn.Close() + impl.Router.UnregisterLink(linkId) + impl.removeJobManagerConnection(jobId) + } + + routeId := wshutil.MakeLinkRouteId(linkId) + authData := wshrpc.CommandAuthenticateToJobData{ + JobAccessToken: mainServerJwtToken, + } + err = wshclient.AuthenticateToJobManagerCommand(impl.RpcClient, authData, &wshrpc.RpcOpts{Route: routeId}) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("authentication to job manager failed: %w", err) + } + + jobRouteId := wshutil.MakeJobRouteId(jobId) + waitCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + err = impl.Router.WaitForRegister(waitCtx, jobRouteId) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("timeout waiting for job route to register: %w", err) + } + + jobConn := &JobManagerConnection{ + JobId: jobId, + Conn: conn, + CleanupFn: cleanup, + } + impl.addJobManagerConnection(jobConn) + + log.Printf("connectToJobManager: successfully connected and authenticated\n") + return jobRouteId, cleanup, nil +} + +func (impl *ServerImpl) addJobManagerConnection(conn *JobManagerConnection) { + impl.Lock.Lock() + defer impl.Lock.Unlock() + impl.JobManagerMap[conn.JobId] = conn + log.Printf("addJobManagerConnection: added job manager connection for jobid=%s\n", conn.JobId) +} + +func (impl *ServerImpl) removeJobManagerConnection(jobId string) { + impl.Lock.Lock() + defer impl.Lock.Unlock() + if _, exists := impl.JobManagerMap[jobId]; exists { + delete(impl.JobManagerMap, jobId) + log.Printf("removeJobManagerConnection: removed job manager connection for jobid=%s\n", jobId) + } +} + +func (impl *ServerImpl) getJobManagerConnection(jobId string) 
*JobManagerConnection { + impl.Lock.Lock() + defer impl.Lock.Unlock() + return impl.JobManagerMap[jobId] +} + +func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.CommandRemoteStartJobData) (*wshrpc.CommandStartJobRtnData, error) { + log.Printf("RemoteStartJobCommand: starting, jobid=%s, clientid=%s\n", data.JobId, data.ClientId) + if impl.Router == nil { + return nil, fmt.Errorf("cannot start remote job: no router available") + } + + wshPath, err := impl.getWshPath() + if err != nil { + return nil, err + } + log.Printf("RemoteStartJobCommand: wshPath=%s\n", wshPath) + + readyPipeRead, readyPipeWrite, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("cannot create ready pipe: %w", err) + } + defer readyPipeRead.Close() + defer readyPipeWrite.Close() + + cmd := exec.Command(wshPath, "jobmanager", "--jobid", data.JobId, "--clientid", data.ClientId) + if data.PublicKeyBase64 != "" { + cmd.Env = append(os.Environ(), "WAVETERM_PUBLICKEY="+data.PublicKeyBase64) + } + cmd.ExtraFiles = []*os.File{readyPipeWrite} + stdin, err := cmd.StdinPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stdin pipe: %w", err) + } + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stdout pipe: %w", err) + } + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, fmt.Errorf("cannot create stderr pipe: %w", err) + } + log.Printf("RemoteStartJobCommand: created pipes\n") + + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("cannot start job manager: %w", err) + } + log.Printf("RemoteStartJobCommand: job manager process started\n") + + jobAuthTokenLine := fmt.Sprintf("Wave-JobAccessToken:%s\n", data.JobAuthToken) + if _, err := stdin.Write([]byte(jobAuthTokenLine)); err != nil { + cmd.Process.Kill() + return nil, fmt.Errorf("cannot write job auth token: %w", err) + } + stdin.Close() + log.Printf("RemoteStartJobCommand: wrote auth token to stdin\n") + + go func() { + scanner := 
bufio.NewScanner(stderr) + for scanner.Scan() { + line := scanner.Text() + log.Printf("RemoteStartJobCommand: stderr: %s\n", line) + } + if err := scanner.Err(); err != nil { + log.Printf("RemoteStartJobCommand: error reading stderr: %v\n", err) + } else { + log.Printf("RemoteStartJobCommand: stderr EOF\n") + } + }() + + go func() { + scanner := bufio.NewScanner(stdout) + for scanner.Scan() { + line := scanner.Text() + log.Printf("RemoteStartJobCommand: stdout: %s\n", line) + } + if err := scanner.Err(); err != nil { + log.Printf("RemoteStartJobCommand: error reading stdout: %v\n", err) + } else { + log.Printf("RemoteStartJobCommand: stdout EOF\n") + } + }() + + startCh := make(chan error, 1) + go func() { + scanner := bufio.NewScanner(readyPipeRead) + for scanner.Scan() { + line := scanner.Text() + log.Printf("RemoteStartJobCommand: ready pipe line: %s\n", line) + if strings.Contains(line, "Wave-JobManagerStart") { + startCh <- nil + return + } + } + if err := scanner.Err(); err != nil { + startCh <- fmt.Errorf("error reading ready pipe: %w", err) + } else { + log.Printf("RemoteStartJobCommand: ready pipe EOF\n") + startCh <- fmt.Errorf("job manager exited without start signal") + } + }() + + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + log.Printf("RemoteStartJobCommand: waiting for start signal\n") + select { + case err := <-startCh: + if err != nil { + cmd.Process.Kill() + log.Printf("RemoteStartJobCommand: error from start signal: %v\n", err) + return nil, err + } + log.Printf("RemoteStartJobCommand: received start signal\n") + case <-timeoutCtx.Done(): + cmd.Process.Kill() + log.Printf("RemoteStartJobCommand: timeout waiting for start signal\n") + return nil, fmt.Errorf("timeout waiting for job manager to start") + } + + go func() { + cmd.Wait() + }() + + jobRouteId, cleanup, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) + if err != nil { + return nil, err + } + + startJobData := 
wshrpc.CommandStartJobData{ + Cmd: data.Cmd, + Args: data.Args, + Env: data.Env, + TermSize: data.TermSize, + StreamMeta: data.StreamMeta, + } + rtnData, err := wshclient.StartJobCommand(impl.RpcClient, startJobData, &wshrpc.RpcOpts{Route: jobRouteId}) + if err != nil { + cleanup() + return nil, fmt.Errorf("failed to start job: %w", err) + } + + return rtnData, nil +} + +func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteReconnectToJobManagerData) error { + log.Printf("RemoteReconnectToJobManagerCommand: reconnecting, jobid=%s\n", data.JobId) + if impl.Router == nil { + return fmt.Errorf("cannot reconnect to job manager: no router available") + } + + if !isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) { + return fmt.Errorf("job manager process (pid=%d) is not running", data.JobManagerPid) + } + + existingConn := impl.getJobManagerConnection(data.JobId) + if existingConn != nil { + log.Printf("RemoteReconnectToJobManagerCommand: closing existing connection for jobid=%s\n", data.JobId) + if existingConn.CleanupFn != nil { + existingConn.CleanupFn() + } + } + + _, _, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) + if err != nil { + return err + } + + log.Printf("RemoteReconnectToJobManagerCommand: successfully reconnected to job manager\n") + return nil +} + +func (impl *ServerImpl) RemoteDisconnectFromJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteDisconnectFromJobManagerData) error { + log.Printf("RemoteDisconnectFromJobManagerCommand: disconnecting, jobid=%s\n", data.JobId) + conn := impl.getJobManagerConnection(data.JobId) + if conn == nil { + log.Printf("RemoteDisconnectFromJobManagerCommand: no connection found for jobid=%s\n", data.JobId) + return nil + } + + if conn.CleanupFn != nil { + conn.CleanupFn() + log.Printf("RemoteDisconnectFromJobManagerCommand: cleanup completed for jobid=%s\n", data.JobId) + } + + return nil +} diff --git 
a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 4973f0a454..e759812aa4 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -110,6 +110,7 @@ type WshRpcInterface interface { RemoteInstallRcFilesCommand(ctx context.Context) error RemoteStartJobCommand(ctx context.Context, data CommandRemoteStartJobData) (*CommandStartJobRtnData, error) RemoteReconnectToJobManagerCommand(ctx context.Context, data CommandRemoteReconnectToJobManagerData) error + RemoteDisconnectFromJobManagerCommand(ctx context.Context, data CommandRemoteDisconnectFromJobManagerData) error // emain WebSelectorCommand(ctx context.Context, data CommandWebSelectorData) ([]string, error) @@ -178,6 +179,8 @@ type WshRpcInterface interface { JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) JobControllerTerminateJobCommand(ctx context.Context, jobId string) error JobControllerExitJobCommand(ctx context.Context, jobId string) error + JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error + JobControllerReconnectJobCommand(ctx context.Context, jobId string) error JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) } @@ -716,6 +719,12 @@ type CommandRemoteReconnectToJobManagerData struct { JobId string `json:"jobid"` JobAuthToken string `json:"jobauthtoken"` MainServerJwtToken string `json:"mainserverjwttoken"` + JobManagerPid int `json:"jobmanagerpid"` + JobManagerStartTs int64 `json:"jobmanagerstartts"` +} + +type CommandRemoteDisconnectFromJobManagerData struct { + JobId string `json:"jobid"` } type CommandStartJobRtnData struct { diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 4a951e2bf6..fd178309dd 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1463,6 +1463,14 @@ func (ws *WshServer) JobControllerExitJobCommand(ctx context.Context, jobId stri return jobcontroller.ExitJobManager(ctx, jobId) } 
+func (ws *WshServer) JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error { + return jobcontroller.DisconnectJob(ctx, jobId) +} + +func (ws *WshServer) JobControllerReconnectJobCommand(ctx context.Context, jobId string) error { + return jobcontroller.ReconnectJob(ctx, jobId) +} + func (ws *WshServer) JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) { return jobcontroller.GetConnectedJobIds(), nil } From f04cf0dfa706e5ea1804900ceedad70ccfab945d Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 10:18:51 -0800 Subject: [PATCH 36/64] return bool saying if jobmanager is no longer running (from reconnect) --- frontend/app/store/wshclientapi.ts | 2 +- frontend/types/gotypes.d.ts | 7 +++++++ pkg/jobcontroller/jobcontroller.go | 16 +++++++++++++++- pkg/wshrpc/wshclient/wshclient.go | 6 +++--- pkg/wshrpc/wshremote/wshremote_job.go | 22 +++++++++++++++++----- pkg/wshrpc/wshrpctypes.go | 8 +++++++- 6 files changed, 50 insertions(+), 11 deletions(-) diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index ac0cede785..2d394e40e1 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -568,7 +568,7 @@ class RpcApiType { } // command "remotereconnecttojobmanager" [call] - RemoteReconnectToJobManagerCommand(client: WshClient, data: CommandRemoteReconnectToJobManagerData, opts?: RpcOpts): Promise { + RemoteReconnectToJobManagerCommand(client: WshClient, data: CommandRemoteReconnectToJobManagerData, opts?: RpcOpts): Promise { return client.wshRpcCall("remotereconnecttojobmanager", data, opts); } diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 6493d22750..bfe1e569c7 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -475,6 +475,13 @@ declare global { jobmanagerstartts: number; }; + // wshrpc.CommandRemoteReconnectToJobManagerRtnData + type CommandRemoteReconnectToJobManagerRtnData = { + success: boolean; 
+ jobmanagerexited: boolean; + error?: string; + }; + // wshrpc.CommandRemoteStartJobData type CommandRemoteStartJobData = { cmd: string; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index d3e6f7a8a7..4b3868745a 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -479,12 +479,26 @@ func ReconnectJob(ctx context.Context, jobId string) error { } log.Printf("[job:%s] sending RemoteReconnectToJobManagerCommand to connection %s", jobId, job.Connection) - err = wshclient.RemoteReconnectToJobManagerCommand(bareRpc, reconnectData, rpcOpts) + rtnData, err := wshclient.RemoteReconnectToJobManagerCommand(bareRpc, reconnectData, rpcOpts) if err != nil { log.Printf("[job:%s] RemoteReconnectToJobManagerCommand failed: %v", jobId, err) return fmt.Errorf("failed to reconnect to job manager: %w", err) } + if !rtnData.Success { + log.Printf("[job:%s] RemoteReconnectToJobManagerCommand returned error: %s", jobId, rtnData.Error) + if rtnData.JobManagerExited { + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.JobManagerRunning = false + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job manager running status: %v", jobId, updateErr) + } + return fmt.Errorf("job manager has exited: %s", rtnData.Error) + } + return fmt.Errorf("failed to reconnect to job manager: %s", rtnData.Error) + } + log.Printf("[job:%s] RemoteReconnectToJobManagerCommand succeeded", jobId) return nil } diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 2fd95e12a7..b318cf0f65 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -686,9 +686,9 @@ func RemoteMkdirCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) er } // command "remotereconnecttojobmanager", wshserver.RemoteReconnectToJobManagerCommand -func RemoteReconnectToJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteReconnectToJobManagerData, 
opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "remotereconnecttojobmanager", data, opts) - return err +func RemoteReconnectToJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteReconnectToJobManagerData, opts *wshrpc.RpcOpts) (*wshrpc.CommandRemoteReconnectToJobManagerRtnData, error) { + resp, err := sendRpcRequestCallHelper[*wshrpc.CommandRemoteReconnectToJobManagerRtnData](w, "remotereconnecttojobmanager", data, opts) + return resp, err } // command "remotestartjob", wshserver.RemoteStartJobCommand diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index c15bc010fd..0daf4c8ac2 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -261,14 +261,21 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C return rtnData, nil } -func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteReconnectToJobManagerData) error { +func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteReconnectToJobManagerData) (*wshrpc.CommandRemoteReconnectToJobManagerRtnData, error) { log.Printf("RemoteReconnectToJobManagerCommand: reconnecting, jobid=%s\n", data.JobId) if impl.Router == nil { - return fmt.Errorf("cannot reconnect to job manager: no router available") + return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ + Success: false, + Error: "cannot reconnect to job manager: no router available", + }, nil } if !isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) { - return fmt.Errorf("job manager process (pid=%d) is not running", data.JobManagerPid) + return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ + Success: false, + JobManagerExited: true, + Error: fmt.Sprintf("job manager process (pid=%d) is not running", data.JobManagerPid), + }, nil } existingConn := impl.getJobManagerConnection(data.JobId) @@ -281,11 +288,16 @@ 
func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, _, _, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) if err != nil { - return err + return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ + Success: false, + Error: err.Error(), + }, nil } log.Printf("RemoteReconnectToJobManagerCommand: successfully reconnected to job manager\n") - return nil + return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ + Success: true, + }, nil } func (impl *ServerImpl) RemoteDisconnectFromJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteDisconnectFromJobManagerData) error { diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index e759812aa4..ad243b049d 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -109,7 +109,7 @@ type WshRpcInterface interface { RemoteGetInfoCommand(ctx context.Context) (RemoteInfo, error) RemoteInstallRcFilesCommand(ctx context.Context) error RemoteStartJobCommand(ctx context.Context, data CommandRemoteStartJobData) (*CommandStartJobRtnData, error) - RemoteReconnectToJobManagerCommand(ctx context.Context, data CommandRemoteReconnectToJobManagerData) error + RemoteReconnectToJobManagerCommand(ctx context.Context, data CommandRemoteReconnectToJobManagerData) (*CommandRemoteReconnectToJobManagerRtnData, error) RemoteDisconnectFromJobManagerCommand(ctx context.Context, data CommandRemoteDisconnectFromJobManagerData) error // emain @@ -723,6 +723,12 @@ type CommandRemoteReconnectToJobManagerData struct { JobManagerStartTs int64 `json:"jobmanagerstartts"` } +type CommandRemoteReconnectToJobManagerRtnData struct { + Success bool `json:"success"` + JobManagerExited bool `json:"jobmanagerexited"` + Error string `json:"error,omitempty"` +} + type CommandRemoteDisconnectFromJobManagerData struct { JobId string `json:"jobid"` } From 5c5dfac4a64bb89307ccda55c3a07487a5c2c5d2 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 10:42:59 -0800 Subject: 
[PATCH 37/64] implement terminate and a new SIGHUP handler --- frontend/app/store/wshclientapi.ts | 5 +++ frontend/types/gotypes.d.ts | 9 +++++- pkg/jobmanager/jobcmd.go | 2 +- pkg/jobmanager/jobmanager_unix.go | 13 ++++++++ pkg/jobmanager/mainserverconn.go | 2 +- pkg/waveobj/wtype.go | 2 +- pkg/wshrpc/wshclient/wshclient.go | 6 ++++ pkg/wshrpc/wshremote/wshremote_job.go | 46 +++++++++++++++++++++++---- pkg/wshrpc/wshrpctypes.go | 7 ++++ 9 files changed, 81 insertions(+), 11 deletions(-) diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 2d394e40e1..73fb191f2d 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -592,6 +592,11 @@ class RpcApiType { return client.wshRpcStream("remotetarstream", data, opts); } + // command "remoteterminatejobmanager" [call] + RemoteTerminateJobManagerCommand(client: WshClient, data: CommandRemoteTerminateJobManagerData, opts?: RpcOpts): Promise { + return client.wshRpcCall("remoteterminatejobmanager", data, opts); + } + // command "remotewritefile" [call] RemoteWriteFileCommand(client: WshClient, data: FileData, opts?: RpcOpts): Promise { return client.wshRpcCall("remotewritefile", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index bfe1e569c7..53fdafd090 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -508,6 +508,13 @@ declare global { opts?: FileCopyOpts; }; + // wshrpc.CommandRemoteTerminateJobManagerData + type CommandRemoteTerminateJobManagerData = { + jobid: string; + jobmanagerpid: number; + jobmanagerstartts: number; + }; + // wshrpc.CommandRenameAppFileData type CommandRenameAppFileData = { appid: string; @@ -915,7 +922,7 @@ declare global { exitcode?: number; exitsignal?: string; exiterror?: string; - huponconnect: boolean; + terminateonreconnect?: boolean; jobmanagerrunning?: boolean; jobmanagerpid?: number; jobmanagerstartts?: number; diff --git a/pkg/jobmanager/jobcmd.go 
b/pkg/jobmanager/jobcmd.go index c9fd4732e0..07841e1bf9 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -183,7 +183,7 @@ func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { return nil } -func (jm *JobCmd) Terminate() { +func (jm *JobCmd) TerminateByClosingPtyMaster() { jm.lock.Lock() defer jm.lock.Unlock() if jm.ptyClosed { diff --git a/pkg/jobmanager/jobmanager_unix.go b/pkg/jobmanager/jobmanager_unix.go index 8b8fa1fcb1..3d16a97205 100644 --- a/pkg/jobmanager/jobmanager_unix.go +++ b/pkg/jobmanager/jobmanager_unix.go @@ -85,6 +85,14 @@ func daemonize(clientId string, jobId string) error { return nil } +func handleSIGHUP() { + cmd := WshCmdJobManager.GetCmd() + if cmd != nil { + log.Printf("handling SIGHUP, closing pty master\n") + cmd.TerminateByClosingPtyMaster() + } +} + func setupJobManagerSignalHandlers() { sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) @@ -93,6 +101,11 @@ func setupJobManagerSignalHandlers() { for sig := range sigChan { log.Printf("job manager received signal: %v\n", sig) + if sig == syscall.SIGHUP { + handleSIGHUP() + continue + } + cmd := WshCmdJobManager.GetCmd() if cmd != nil { pgid, err := cmd.GetPGID() diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 4f683b32e6..c51f7a701a 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -271,7 +271,7 @@ func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc. return fmt.Errorf("job not started") } log.Printf("JobTerminate called\n") - WshCmdJobManager.Cmd.Terminate() + WshCmdJobManager.Cmd.TerminateByClosingPtyMaster() return nil } diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 14dc8b56c9..29d3c54c45 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -334,7 +334,7 @@ type Job struct { ExitError string `json:"exiterror,omitempty"` // reconnect option (e.g. 
orphaned, so we need to kill on connect) - HupOnConnect bool `json:"huponconnect"` + TerminateOnReconnect bool `json:"terminateonreconnect,omitempty"` // job manager state JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index b318cf0f65..58421ea908 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -712,6 +712,12 @@ func RemoteTarStreamCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteStreamTa return sendRpcRequestResponseStreamHelper[iochantypes.Packet](w, "remotetarstream", data, opts) } +// command "remoteterminatejobmanager", wshserver.RemoteTerminateJobManagerCommand +func RemoteTerminateJobManagerCommand(w *wshutil.WshRpc, data wshrpc.CommandRemoteTerminateJobManagerData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "remoteterminatejobmanager", data, opts) + return err +} + // command "remotewritefile", wshserver.RemoteWriteFileCommand func RemoteWriteFileCommand(w *wshutil.WshRpc, data wshrpc.FileData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "remotewritefile", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index 0daf4c8ac2..49f76c1beb 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -12,6 +12,7 @@ import ( "os" "os/exec" "strings" + "syscall" "time" "github.com/shirou/gopsutil/v4/process" @@ -21,19 +22,22 @@ import ( "github.com/wavetermdev/waveterm/pkg/wshutil" ) -func isProcessRunning(pid int, pidStartTs int64) bool { +func isProcessRunning(pid int, pidStartTs int64) (*process.Process, error) { if pid <= 0 { - return false + return nil, nil } proc, err := process.NewProcess(int32(pid)) if err != nil { - return false + return nil, nil } createTime, err := proc.CreateTime() if err != nil { - return false + return nil, err + } + if createTime != pidStartTs { + 
return nil, nil } - return createTime == pidStartTs + return proc, nil } // returns jobRouteId, cleanupFunc, error @@ -270,7 +274,14 @@ func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, }, nil } - if !isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) { + proc, err := isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) + if err != nil { + return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ + Success: false, + Error: fmt.Sprintf("error checking job manager process: %v", err), + }, nil + } + if proc == nil { return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ Success: false, JobManagerExited: true, @@ -286,7 +297,7 @@ func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, } } - _, _, err := impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) + _, _, err = impl.connectToJobManager(ctx, data.JobId, data.MainServerJwtToken) if err != nil { return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ Success: false, @@ -315,3 +326,24 @@ func (impl *ServerImpl) RemoteDisconnectFromJobManagerCommand(ctx context.Contex return nil } + +func (impl *ServerImpl) RemoteTerminateJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteTerminateJobManagerData) error { + log.Printf("RemoteTerminateJobManagerCommand: terminating job manager, jobid=%s, pid=%d\n", data.JobId, data.JobManagerPid) + + proc, err := isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) + if err != nil { + return fmt.Errorf("error checking job manager process: %w", err) + } + if proc == nil { + log.Printf("RemoteTerminateJobManagerCommand: job manager process not running, jobid=%s\n", data.JobId) + return nil + } + + err = proc.SendSignal(syscall.SIGHUP) + if err != nil { + return fmt.Errorf("failed to send SIGHUP to job manager: %w", err) + } + + log.Printf("RemoteTerminateJobManagerCommand: sent SIGHUP to job manager process, jobid=%s, pid=%d\n", data.JobId, data.JobManagerPid) + return nil +} 
diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index ad243b049d..28082882a1 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -111,6 +111,7 @@ type WshRpcInterface interface { RemoteStartJobCommand(ctx context.Context, data CommandRemoteStartJobData) (*CommandStartJobRtnData, error) RemoteReconnectToJobManagerCommand(ctx context.Context, data CommandRemoteReconnectToJobManagerData) (*CommandRemoteReconnectToJobManagerRtnData, error) RemoteDisconnectFromJobManagerCommand(ctx context.Context, data CommandRemoteDisconnectFromJobManagerData) error + RemoteTerminateJobManagerCommand(ctx context.Context, data CommandRemoteTerminateJobManagerData) error // emain WebSelectorCommand(ctx context.Context, data CommandWebSelectorData) ([]string, error) @@ -733,6 +734,12 @@ type CommandRemoteDisconnectFromJobManagerData struct { JobId string `json:"jobid"` } +type CommandRemoteTerminateJobManagerData struct { + JobId string `json:"jobid"` + JobManagerPid int `json:"jobmanagerpid"` + JobManagerStartTs int64 `json:"jobmanagerstartts"` +} + type CommandStartJobRtnData struct { CmdPgid int `json:"cmdpgid"` JobManagerPid int `json:"jobmanagerpid"` From e99d03d8825902f13b792c1da65c4f2ef3eca883 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 10:58:35 -0800 Subject: [PATCH 38/64] reconnect jobs for conn --- cmd/wsh/cmd/wshcmd-jobdebug.go | 21 ++++++++++ frontend/app/store/wshclientapi.ts | 5 +++ pkg/jobcontroller/jobcontroller.go | 64 ++++++++++++++++++++++++++++++ pkg/wshrpc/wshclient/wshclient.go | 6 +++ pkg/wshrpc/wshrpctypes.go | 1 + pkg/wshrpc/wshserver/wshserver.go | 4 ++ 6 files changed, 101 insertions(+) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index b7c702911a..50d28f9406 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -68,6 +68,12 @@ var jobDebugReconnectCmd = &cobra.Command{ RunE: jobDebugReconnectRun, } +var jobDebugReconnectConnCmd = 
&cobra.Command{ + Use: "reconnectconn", + Short: "reconnect all jobs for a connection", + RunE: jobDebugReconnectConnRun, +} + var jobDebugGetOutputCmd = &cobra.Command{ Use: "getoutput", Short: "get the terminal output for a job", @@ -86,6 +92,7 @@ var jobConnFlag string var exitJobIdFlag string var disconnectJobIdFlag string var reconnectJobIdFlag string +var reconnectConnNameFlag string func init() { rootCmd.AddCommand(jobDebugCmd) @@ -97,6 +104,7 @@ func init() { jobDebugCmd.AddCommand(jobDebugExitCmd) jobDebugCmd.AddCommand(jobDebugDisconnectCmd) jobDebugCmd.AddCommand(jobDebugReconnectCmd) + jobDebugCmd.AddCommand(jobDebugReconnectConnCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) jobDebugCmd.AddCommand(jobDebugStartCmd) @@ -117,6 +125,9 @@ func init() { jobDebugReconnectCmd.Flags().StringVar(&reconnectJobIdFlag, "jobid", "", "job id to reconnect (required)") jobDebugReconnectCmd.MarkFlagRequired("jobid") + jobDebugReconnectConnCmd.Flags().StringVar(&reconnectConnNameFlag, "conn", "", "connection name (required)") + jobDebugReconnectConnCmd.MarkFlagRequired("conn") + jobDebugGetOutputCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to get output for (required)") jobDebugGetOutputCmd.MarkFlagRequired("jobid") @@ -294,6 +305,16 @@ func jobDebugReconnectRun(cmd *cobra.Command, args []string) error { return nil } +func jobDebugReconnectConnRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerReconnectJobsForConnCommand(RpcClient, reconnectConnNameFlag, nil) + if err != nil { + return fmt.Errorf("reconnecting jobs for connection: %w", err) + } + + fmt.Printf("Reconnected all jobs for connection %s successfully\n", reconnectConnNameFlag) + return nil +} + func jobDebugGetOutputRun(cmd *cobra.Command, args []string) error { fileData, err := wshclient.FileReadCommand(RpcClient, wshrpc.FileData{ Info: &wshrpc.FileInfo{ diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 73fb191f2d..a8153d6204 
100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -412,6 +412,11 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerreconnectjob", data, opts); } + // command "jobcontrollerreconnectjobsforconn" [call] + JobControllerReconnectJobsForConnCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerreconnectjobsforconn", data, opts); + } + // command "jobcontrollerstartjob" [call] JobControllerStartJobCommand(client: WshClient, data: CommandJobControllerStartJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerstartjob", data, opts); diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 4b3868745a..590727ca42 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -503,6 +503,70 @@ func ReconnectJob(ctx context.Context, jobId string) error { return nil } +func ReconnectJobsForConn(ctx context.Context, connName string) error { + isConnected, err := conncontroller.IsConnected(connName) + if err != nil { + return fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return fmt.Errorf("connection %q is not connected", connName) + } + + allJobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job) + if err != nil { + return fmt.Errorf("failed to get jobs: %w", err) + } + + var jobsToReconnect []*waveobj.Job + for _, job := range allJobs { + if job.Connection == connName && job.JobManagerRunning { + jobsToReconnect = append(jobsToReconnect, job) + } + } + + log.Printf("[conn:%s] found %d jobs to reconnect", connName, len(jobsToReconnect)) + + for _, job := range jobsToReconnect { + if job.TerminateOnReconnect { + log.Printf("[job:%s] terminating job manager on reconnect", job.OID) + + bareRpc := wshclient.GetBareRpcClient() + if bareRpc == nil { + log.Printf("[job:%s] warning: main rpc client not available for termination", job.OID) + 
continue + } + + terminateData := wshrpc.CommandRemoteTerminateJobManagerData{ + JobId: job.OID, + JobManagerPid: job.JobManagerPid, + JobManagerStartTs: job.JobManagerStartTs, + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeConnectionRouteId(connName), + Timeout: 5000, + } + + err = wshclient.RemoteTerminateJobManagerCommand(bareRpc, terminateData, rpcOpts) + if err != nil { + log.Printf("[job:%s] error terminating job manager: %v", job.OID, err) + } else { + log.Printf("[job:%s] job manager terminate command sent successfully", job.OID) + } + } else { + log.Printf("[job:%s] reconnecting to job manager", job.OID) + err = ReconnectJob(ctx, job.OID) + if err != nil { + log.Printf("[job:%s] error reconnecting: %v", job.OID, err) + } else { + log.Printf("[job:%s] reconnected successfully", job.OID) + } + } + } + + return nil +} + func DeleteJob(ctx context.Context, jobId string) error { SetJobConnStatus(jobId, JobConnStatus_Disconnected) err := filestore.WFS.DeleteZone(ctx, jobId) diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 58421ea908..271de67d29 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -500,6 +500,12 @@ func JobControllerReconnectJobCommand(w *wshutil.WshRpc, data string, opts *wshr return err } +// command "jobcontrollerreconnectjobsforconn", wshserver.JobControllerReconnectJobsForConnCommand +func JobControllerReconnectJobsForConnCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerreconnectjobsforconn", data, opts) + return err +} + // command "jobcontrollerstartjob", wshserver.JobControllerStartJobCommand func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerStartJobData, opts *wshrpc.RpcOpts) (string, error) { resp, err := sendRpcRequestCallHelper[string](w, "jobcontrollerstartjob", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go 
b/pkg/wshrpc/wshrpctypes.go index 28082882a1..30bc0e6044 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -182,6 +182,7 @@ type WshRpcInterface interface { JobControllerExitJobCommand(ctx context.Context, jobId string) error JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error JobControllerReconnectJobCommand(ctx context.Context, jobId string) error + JobControllerReconnectJobsForConnCommand(ctx context.Context, connName string) error JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) } diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index fd178309dd..61bc0f15e7 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1471,6 +1471,10 @@ func (ws *WshServer) JobControllerReconnectJobCommand(ctx context.Context, jobId return jobcontroller.ReconnectJob(ctx, jobId) } +func (ws *WshServer) JobControllerReconnectJobsForConnCommand(ctx context.Context, connName string) error { + return jobcontroller.ReconnectJobsForConn(ctx, connName) +} + func (ws *WshServer) JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) { return jobcontroller.GetConnectedJobIds(), nil } From 0ec269b1258ddeaf49d09b361b7e8df69feac1dc Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 11:32:12 -0800 Subject: [PATCH 39/64] remove barerpc nil checks --- pkg/jobcontroller/jobcontroller.go | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 590727ca42..dd842a8aa1 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -165,10 +165,6 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { } bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - return "", fmt.Errorf("main rpc client not available") - } - broker := bareRpc.StreamBroker readerRouteId := 
wshclient.GetBareRpcClientRouteId() writerRouteId := wshutil.MakeJobRouteId(jobId) @@ -349,10 +345,6 @@ func TerminateJob(ctx context.Context, jobId string) error { } bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - return fmt.Errorf("main rpc client not available") - } - rpcOpts := &wshrpc.RpcOpts{ Route: wshutil.MakeJobRouteId(jobId), Timeout: 5000, @@ -379,10 +371,6 @@ func ExitJobManager(ctx context.Context, jobId string) error { } bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - return fmt.Errorf("main rpc client not available") - } - rpcOpts := &wshrpc.RpcOpts{ Route: wshutil.MakeJobRouteId(jobId), Timeout: 5000, @@ -411,10 +399,6 @@ func DisconnectJob(ctx context.Context, jobId string) error { } bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - return fmt.Errorf("main rpc client not available") - } - rpcOpts := &wshrpc.RpcOpts{ Route: wshutil.MakeConnectionRouteId(job.Connection), Timeout: 5000, @@ -461,10 +445,6 @@ func ReconnectJob(ctx context.Context, jobId string) error { } bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - return fmt.Errorf("main rpc client not available") - } - reconnectData := wshrpc.CommandRemoteReconnectToJobManagerData{ JobId: jobId, JobAuthToken: job.JobAuthToken, @@ -531,11 +511,6 @@ func ReconnectJobsForConn(ctx context.Context, connName string) error { log.Printf("[job:%s] terminating job manager on reconnect", job.OID) bareRpc := wshclient.GetBareRpcClient() - if bareRpc == nil { - log.Printf("[job:%s] warning: main rpc client not available for termination", job.OID) - continue - } - terminateData := wshrpc.CommandRemoteTerminateJobManagerData{ JobId: job.OID, JobManagerPid: job.JobManagerPid, From 02e76a05b7713be9e47443ab0981f58b16ed68f3 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 11:46:59 -0800 Subject: [PATCH 40/64] update termination, make more consistent --- pkg/jobcontroller/jobcontroller.go | 97 +++++++++++++++--------------- 1 file changed, 50 
insertions(+), 47 deletions(-) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index dd842a8aa1..e3700b2689 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -334,29 +334,20 @@ func tryExitJobManager(ctx context.Context, jobId string) { } func TerminateJob(ctx context.Context, jobId string) error { - _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { return fmt.Errorf("failed to get job: %w", err) } - jobConnStatus := GetJobConnStatus(jobId) - if jobConnStatus != JobConnStatus_Connected { - return fmt.Errorf("job connection is not connected (status: %s)", jobConnStatus) - } - - bareRpc := wshclient.GetBareRpcClient() - rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobId), - Timeout: 5000, - } - - err = wshclient.JobTerminateCommand(bareRpc, wshrpc.CommandJobTerminateData{}, rpcOpts) + isConnected, err := conncontroller.IsConnected(job.Connection) if err != nil { - return fmt.Errorf("failed to send terminate command: %w", err) + return fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return fmt.Errorf("connection %q is not connected", job.Connection) } - log.Printf("[job:%s] job terminate command sent successfully", jobId) - return nil + return remoteTerminateJobManager(ctx, job) } func ExitJobManager(ctx context.Context, jobId string) error { @@ -417,6 +408,39 @@ func DisconnectJob(ctx context.Context, jobId string) error { return nil } +func remoteTerminateJobManager(ctx context.Context, job *waveobj.Job) error { + log.Printf("[job:%s] terminating job manager", job.OID) + + bareRpc := wshclient.GetBareRpcClient() + terminateData := wshrpc.CommandRemoteTerminateJobManagerData{ + JobId: job.OID, + JobManagerPid: job.JobManagerPid, + JobManagerStartTs: job.JobManagerStartTs, + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeConnectionRouteId(job.Connection), + 
Timeout: 5000, + } + + err := wshclient.RemoteTerminateJobManagerCommand(bareRpc, terminateData, rpcOpts) + if err != nil { + log.Printf("[job:%s] error terminating job manager: %v", job.OID, err) + return fmt.Errorf("failed to terminate job manager: %w", err) + } + + updateErr := wstore.DBUpdateFn(ctx, job.OID, func(job *waveobj.Job) { + job.JobManagerRunning = false + job.TerminateOnReconnect = false + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job manager running status: %v", job.OID, updateErr) + } + + log.Printf("[job:%s] job manager terminated successfully", job.OID) + return nil +} + func ReconnectJob(ctx context.Context, jobId string) error { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { @@ -435,6 +459,12 @@ func ReconnectJob(ctx context.Context, jobId string) error { return fmt.Errorf("connection %q is not connected", job.Connection) } + if job.TerminateOnReconnect { + return remoteTerminateJobManager(ctx, job) + } + + bareRpc := wshclient.GetBareRpcClient() + jobAccessClaims := &wavejwt.WaveJwtClaims{ MainServer: true, JobId: jobId, @@ -444,7 +474,6 @@ func ReconnectJob(ctx context.Context, jobId string) error { return fmt.Errorf("failed to generate job access token: %w", err) } - bareRpc := wshclient.GetBareRpcClient() reconnectData := wshrpc.CommandRemoteReconnectToJobManagerData{ JobId: jobId, JobAuthToken: job.JobAuthToken, @@ -455,7 +484,7 @@ func ReconnectJob(ctx context.Context, jobId string) error { rpcOpts := &wshrpc.RpcOpts{ Route: wshutil.MakeConnectionRouteId(job.Connection), - Timeout: 30000, + Timeout: 5000, } log.Printf("[job:%s] sending RemoteReconnectToJobManagerCommand to connection %s", jobId, job.Connection) @@ -507,35 +536,9 @@ func ReconnectJobsForConn(ctx context.Context, connName string) error { log.Printf("[conn:%s] found %d jobs to reconnect", connName, len(jobsToReconnect)) for _, job := range jobsToReconnect { - if job.TerminateOnReconnect { - log.Printf("[job:%s] terminating job 
manager on reconnect", job.OID) - - bareRpc := wshclient.GetBareRpcClient() - terminateData := wshrpc.CommandRemoteTerminateJobManagerData{ - JobId: job.OID, - JobManagerPid: job.JobManagerPid, - JobManagerStartTs: job.JobManagerStartTs, - } - - rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeConnectionRouteId(connName), - Timeout: 5000, - } - - err = wshclient.RemoteTerminateJobManagerCommand(bareRpc, terminateData, rpcOpts) - if err != nil { - log.Printf("[job:%s] error terminating job manager: %v", job.OID, err) - } else { - log.Printf("[job:%s] job manager terminate command sent successfully", job.OID) - } - } else { - log.Printf("[job:%s] reconnecting to job manager", job.OID) - err = ReconnectJob(ctx, job.OID) - if err != nil { - log.Printf("[job:%s] error reconnecting: %v", job.OID, err) - } else { - log.Printf("[job:%s] reconnected successfully", job.OID) - } + err = ReconnectJob(ctx, job.OID) + if err != nil { + log.Printf("[job:%s] error reconnecting: %v", job.OID, err) } } From b04894cd18a7efc2fff2e6e84f0b20f9189a01eb Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 13:08:05 -0800 Subject: [PATCH 41/64] working on the ability to restart a job stream --- pkg/jobcontroller/jobcontroller.go | 145 ++++++++++++++++++++++++++--- pkg/streamclient/streambroker.go | 6 +- pkg/streamclient/streamreader.go | 6 +- 3 files changed, 141 insertions(+), 16 deletions(-) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index e3700b2689..a7120717a8 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -29,10 +29,11 @@ import ( ) const ( - JobStatus_Init = "init" - JobStatus_Running = "running" - JobStatus_Done = "done" - JobStatus_Error = "error" + JobStatus_Init = "init" + JobStatus_Running = "running" + JobStatus_Done = "done" // natural exit (managed by job manager, command completed) + JobStatus_Error = "error" // failed to start or unmanaged failure + JobStatus_Terminated = 
"terminated" // explicitly killed via terminate command ) const ( @@ -42,12 +43,28 @@ const ( ) const DefaultStreamRwnd = 64 * 1024 +const MetaKey_TotalGap = "totalgap" +const JobOutputFileName = "term" var ( jobConnStates = make(map[string]string) jobConnStatesLock sync.Mutex ) +func getMetaInt64(meta wshrpc.FileMeta, key string) int64 { + val, ok := meta[key] + if !ok { + return 0 + } + if intVal, ok := val.(int64); ok { + return intVal + } + if floatVal, ok := val.(float64); ok { + return int64(floatVal) + } + return 0 +} + func InitJobController() { rpcClient := wshclient.GetBareRpcClient() rpcClient.EventListener.On(wps.Event_RouteUp, handleRouteUpEvent) @@ -174,7 +191,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { MaxSize: 10 * 1024 * 1024, Circular: true, } - err = filestore.WFS.MakeFile(ctx, jobId, "term", wshrpc.FileMeta{}, fileOpts) + err = filestore.WFS.MakeFile(ctx, jobId, JobOutputFileName, wshrpc.FileMeta{}, fileOpts) if err != nil { return "", fmt.Errorf("failed to create WaveFS file: %w", err) } @@ -252,7 +269,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade n, err := reader.Read(buf) if n > 0 { log.Printf("[job:%s] received %d bytes of data", jobId, n) - appendErr := filestore.WFS.AppendData(ctx, jobId, "term", buf[:n]) + appendErr := filestore.WFS.AppendData(ctx, jobId, JobOutputFileName, buf[:n]) if appendErr != nil { log.Printf("[job:%s] error appending data to WaveFS: %v", jobId, appendErr) } else { @@ -291,12 +308,8 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobExitedData) error { var finalStatus string err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - if data.ExitErr != "" { - job.Status = JobStatus_Error - job.ExitError = data.ExitErr - } else { - job.Status = JobStatus_Done - } + job.Status = JobStatus_Done + job.ExitError = data.ExitErr 
job.ExitCode = data.ExitCode job.ExitSignal = data.ExitSignal job.ExitTs = data.ExitTs @@ -318,7 +331,7 @@ func tryExitJobManager(ctx context.Context, jobId string) { return } - jobExited := job.Status == JobStatus_Done || job.Status == JobStatus_Error + jobExited := job.Status == JobStatus_Done || job.Status == JobStatus_Error || job.Status == JobStatus_Terminated if !jobExited || !job.StreamDone { log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, jobExited, job.StreamDone) @@ -430,11 +443,12 @@ func remoteTerminateJobManager(ctx context.Context, job *waveobj.Job) error { } updateErr := wstore.DBUpdateFn(ctx, job.OID, func(job *waveobj.Job) { + job.Status = JobStatus_Terminated job.JobManagerRunning = false job.TerminateOnReconnect = false }) if updateErr != nil { - log.Printf("[job:%s] error updating job manager running status: %v", job.OID, updateErr) + log.Printf("[job:%s] error updating job status after termination: %v", job.OID, updateErr) } log.Printf("[job:%s] job manager terminated successfully", job.OID) @@ -545,6 +559,109 @@ func ReconnectJobsForConn(ctx context.Context, connName string) error { return nil } +func RestartStreaming(ctx context.Context, jobId string) error { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + isConnected, err := conncontroller.IsConnected(job.Connection) + if err != nil { + return fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return fmt.Errorf("connection %q is not connected", job.Connection) + } + + jobConnStatus := GetJobConnStatus(jobId) + if jobConnStatus != JobConnStatus_Connected { + return fmt.Errorf("job manager is not connected (status: %s)", jobConnStatus) + } + + var currentSeq int64 = 0 + var totalGap int64 = 0 + waveFile, err := filestore.WFS.Stat(ctx, jobId, JobOutputFileName) + if err == nil { + currentSeq = waveFile.Size + totalGap = getMetaInt64(waveFile.Meta, 
MetaKey_TotalGap) + currentSeq += totalGap + } + + bareRpc := wshclient.GetBareRpcClient() + broker := bareRpc.StreamBroker + readerRouteId := wshclient.GetBareRpcClientRouteId() + writerRouteId := wshutil.MakeJobRouteId(jobId) + + reader, streamMeta := broker.CreateStreamReaderWithSeq(readerRouteId, writerRouteId, DefaultStreamRwnd, currentSeq) + + prepareData := wshrpc.CommandJobPrepareConnectData{ + StreamMeta: *streamMeta, + Seq: currentSeq, + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, + } + + log.Printf("[job:%s] sending JobPrepareConnectCommand with seq=%d (fileSize=%d, totalGap=%d)", jobId, currentSeq, waveFile.Size, totalGap) + rtnData, err := wshclient.JobPrepareConnectCommand(bareRpc, prepareData, rpcOpts) + if err != nil { + reader.Close() + return fmt.Errorf("failed to prepare connect: %w", err) + } + + if rtnData.HasExited { + reader.Close() + log.Printf("[job:%s] job has already exited: code=%d signal=%q err=%q", jobId, rtnData.ExitCode, rtnData.ExitSignal, rtnData.ExitErr) + + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.Status = JobStatus_Done + job.ExitCode = rtnData.ExitCode + job.ExitSignal = rtnData.ExitSignal + job.ExitError = rtnData.ExitErr + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job exit status: %v", jobId, updateErr) + } + return nil + } + + if rtnData.Seq > currentSeq { + gap := rtnData.Seq - currentSeq + totalGap += gap + log.Printf("[job:%s] detected gap: our seq=%d, server seq=%d, gap=%d, new totalGap=%d", jobId, currentSeq, rtnData.Seq, gap, totalGap) + + metaErr := filestore.WFS.WriteMeta(ctx, jobId, JobOutputFileName, wshrpc.FileMeta{ + MetaKey_TotalGap: totalGap, + }, true) + if metaErr != nil { + log.Printf("[job:%s] error updating totalgap metadata: %v", jobId, metaErr) + } + + reader.Close() + reader, streamMeta = broker.CreateStreamReaderWithSeq(readerRouteId, writerRouteId, DefaultStreamRwnd, rtnData.Seq) + } + + 
log.Printf("[job:%s] sending JobStartStreamCommand", jobId) + startStreamData := wshrpc.CommandJobStartStreamData{} + err = wshclient.JobStartStreamCommand(bareRpc, startStreamData, rpcOpts) + if err != nil { + reader.Close() + return fmt.Errorf("failed to start stream: %w", err) + } + + go func() { + defer func() { + panichandler.PanicHandler("jobcontroller:RestartStreaming:runOutputLoop", recover()) + }() + runOutputLoop(context.Background(), jobId, reader) + }() + + log.Printf("[job:%s] streaming restarted successfully", jobId) + return nil +} + func DeleteJob(ctx context.Context, jobId string) error { SetJobConnStatus(jobId, JobConnStatus_Disconnected) err := filestore.WFS.DeleteZone(ctx, jobId) diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index 65f9e6cbfa..c0082450e0 100644 --- a/pkg/streamclient/streambroker.go +++ b/pkg/streamclient/streambroker.go @@ -52,12 +52,16 @@ func NewBroker(rpcClient StreamRpcInterface) *Broker { } func (b *Broker) CreateStreamReader(readerRoute string, writerRoute string, rwnd int64) (*Reader, *wshrpc.StreamMeta) { + return b.CreateStreamReaderWithSeq(readerRoute, writerRoute, rwnd, 0) +} + +func (b *Broker) CreateStreamReaderWithSeq(readerRoute string, writerRoute string, rwnd int64, startSeq int64) (*Reader, *wshrpc.StreamMeta) { b.lock.Lock() defer b.lock.Unlock() streamId := uuid.New().String() - reader := NewReader(streamId, rwnd, b) + reader := NewReaderWithSeq(streamId, rwnd, startSeq, b) b.readers[streamId] = reader b.readerRoutes[streamId] = readerRoute b.writerRoutes[streamId] = writerRoute diff --git a/pkg/streamclient/streamreader.go b/pkg/streamclient/streamreader.go index 28e5f3fcf3..e124011bfc 100644 --- a/pkg/streamclient/streamreader.go +++ b/pkg/streamclient/streamreader.go @@ -28,11 +28,15 @@ type Reader struct { } func NewReader(id string, readWindow int64, ackSender AckSender) *Reader { + return NewReaderWithSeq(id, readWindow, 0, ackSender) +} + +func NewReaderWithSeq(id 
string, readWindow int64, startSeq int64, ackSender AckSender) *Reader { r := &Reader{ id: id, readWindow: readWindow, ackSender: ackSender, - nextSeq: 0, + nextSeq: startSeq, lastRwndSent: readWindow, } r.cond = sync.NewCond(&r.lock) From 55697788c1a36c854d99cdc9b31e7d4e61be9873 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 13:21:30 -0800 Subject: [PATCH 42/64] fix restartstream to not recreate the stream (use the same streamid) --- pkg/jobcontroller/jobcontroller.go | 3 +-- pkg/streamclient/streamreader.go | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index a7120717a8..3839379321 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -639,8 +639,7 @@ func RestartStreaming(ctx context.Context, jobId string) error { log.Printf("[job:%s] error updating totalgap metadata: %v", jobId, metaErr) } - reader.Close() - reader, streamMeta = broker.CreateStreamReaderWithSeq(readerRouteId, writerRouteId, DefaultStreamRwnd, rtnData.Seq) + reader.UpdateNextSeq(rtnData.Seq) } log.Printf("[job:%s] sending JobStartStreamCommand", jobId) diff --git a/pkg/streamclient/streamreader.go b/pkg/streamclient/streamreader.go index e124011bfc..8c6e87b1ee 100644 --- a/pkg/streamclient/streamreader.go +++ b/pkg/streamclient/streamreader.go @@ -150,6 +150,12 @@ func (r *Reader) Read(p []byte) (int, error) { return n, nil } +func (r *Reader) UpdateNextSeq(newSeq int64) { + r.lock.Lock() + defer r.lock.Unlock() + r.nextSeq = newSeq +} + func (r *Reader) Close() error { r.lock.Lock() defer r.lock.Unlock() From 5421c035e85bd307f37528054106b281b79586cc Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 14:31:42 -0800 Subject: [PATCH 43/64] working on reconnect --- frontend/types/gotypes.d.ts | 2 ++ pkg/jobcontroller/jobcontroller.go | 43 +++++++++++++++++++++++++++--- pkg/jobmanager/mainserverconn.go | 3 ++- pkg/jobmanager/streammanager.go | 
14 ++++++++++ pkg/wshrpc/wshrpctypes.go | 12 +++++---- 5 files changed, 64 insertions(+), 10 deletions(-) diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 53fdafd090..857a6a0ed5 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -358,6 +358,8 @@ declare global { // wshrpc.CommandJobConnectRtnData type CommandJobConnectRtnData = { seq: number; + streamdone?: boolean; + streamerror?: string; hasexited?: boolean; exitcode?: number; exitsignal?: string; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 3839379321..c4100aca11 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -522,8 +522,18 @@ func ReconnectJob(ctx context.Context, jobId string) error { return fmt.Errorf("failed to reconnect to job manager: %s", rtnData.Error) } - log.Printf("[job:%s] RemoteReconnectToJobManagerCommand succeeded", jobId) - return nil + log.Printf("[job:%s] RemoteReconnectToJobManagerCommand succeeded, waiting for route", jobId) + + routeId := wshutil.MakeJobRouteId(jobId) + waitCtx, cancelFn := context.WithTimeout(ctx, 2*time.Second) + defer cancelFn() + err = wshutil.DefaultRouter.WaitForRegister(waitCtx, routeId) + if err != nil { + return fmt.Errorf("route did not establish after successful reconnection: %w", err) + } + + log.Printf("[job:%s] route established, restarting streaming", jobId) + return RestartStreaming(ctx, jobId) } func ReconnectJobsForConn(ctx context.Context, connName string) error { @@ -612,9 +622,7 @@ func RestartStreaming(ctx context.Context, jobId string) error { } if rtnData.HasExited { - reader.Close() log.Printf("[job:%s] job has already exited: code=%d signal=%q err=%q", jobId, rtnData.ExitCode, rtnData.ExitSignal, rtnData.ExitErr) - updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.Status = JobStatus_Done job.ExitCode = rtnData.ExitCode @@ -624,6 +632,33 @@ func RestartStreaming(ctx context.Context, 
jobId string) error { if updateErr != nil { log.Printf("[job:%s] error updating job exit status: %v", jobId, updateErr) } + } + + if rtnData.StreamDone { + log.Printf("[job:%s] stream is already done: error=%q", jobId, rtnData.StreamError) + updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + if !job.StreamDone { + job.StreamDone = true + if rtnData.StreamError != "" { + job.StreamError = rtnData.StreamError + } + } + }) + if updateErr != nil { + log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) + } + } + + if rtnData.StreamDone && rtnData.HasExited { + reader.Close() + log.Printf("[job:%s] both stream done and job exited, calling tryExitJobManager", jobId) + tryExitJobManager(ctx, jobId) + return nil + } + + if rtnData.StreamDone { + reader.Close() + log.Printf("[job:%s] stream already done, no need to restart streaming", jobId) return nil } diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index c51f7a701a..f8178d4077 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -219,6 +219,7 @@ func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data ws WshCmdJobManager.pendingStreamMeta = &data.StreamMeta rtnData := &wshrpc.CommandJobConnectRtnData{Seq: serverSeq} + rtnData.StreamDone, rtnData.StreamError = WshCmdJobManager.StreamManager.GetStreamDoneInfo() hasExited, exitData := WshCmdJobManager.Cmd.GetExitInfo() if hasExited && exitData != nil { rtnData.HasExited = true @@ -227,7 +228,7 @@ func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data ws rtnData.ExitErr = exitData.ExitErr } - log.Printf("JobPrepareConnect: streamid=%s clientSeq=%d serverSeq=%d hasExited=%v (rwnd=0 cork mode)\n", data.StreamMeta.Id, data.Seq, serverSeq, hasExited) + log.Printf("JobPrepareConnect: streamid=%s clientSeq=%d serverSeq=%d streamDone=%v streamError=%q hasExited=%v (rwnd=0 cork mode)\n", data.StreamMeta.Id, data.Seq, serverSeq, 
rtnData.StreamDone, rtnData.StreamError, hasExited) return rtnData, nil } diff --git a/pkg/jobmanager/streammanager.go b/pkg/jobmanager/streammanager.go index e36129ba69..43861449b7 100644 --- a/pkg/jobmanager/streammanager.go +++ b/pkg/jobmanager/streammanager.go @@ -148,6 +148,20 @@ func (sm *StreamManager) GetStreamId() string { return sm.streamId } +// GetStreamDoneInfo returns whether the stream is done and the error if there was one. +// The error is only meaningful if done=true, as the error is delivered as part of the stream otherwise. +func (sm *StreamManager) GetStreamDoneInfo() (done bool, streamError string) { + sm.lock.Lock() + defer sm.lock.Unlock() + if !sm.terminalEventAcked { + return false, "" + } + if sm.terminalEvent != nil && !sm.terminalEvent.isEof { + return true, sm.terminalEvent.err + } + return true, "" +} + // ClientDisconnected transitions to DISCONNECTED mode func (sm *StreamManager) ClientDisconnected() { sm.lock.Lock() diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 30bc0e6044..5d779d0cf0 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -756,11 +756,13 @@ type CommandJobStartStreamData struct { } type CommandJobConnectRtnData struct { - Seq int64 `json:"seq"` - HasExited bool `json:"hasexited,omitempty"` - ExitCode int `json:"exitcode,omitempty"` - ExitSignal string `json:"exitsignal,omitempty"` - ExitErr string `json:"exiterr,omitempty"` + Seq int64 `json:"seq"` + StreamDone bool `json:"streamdone,omitempty"` + StreamError string `json:"streamerror,omitempty"` + HasExited bool `json:"hasexited,omitempty"` + ExitCode int `json:"exitcode,omitempty"` + ExitSignal string `json:"exitsignal,omitempty"` + ExitErr string `json:"exiterr,omitempty"` } type CommandJobTerminateData struct { From 1975d67be594741b19d868e6fae94783f63c947a Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 19 Jan 2026 17:21:49 -0800 Subject: [PATCH 44/64] fix many bugs, get reconnect logic working/tested --- 
cmd/wsh/cmd/wshcmd-jobdebug.go | 3 +++ pkg/jobcontroller/jobcontroller.go | 34 +++++++++++++++++---------- pkg/jobmanager/jobmanager_unix.go | 11 +++++++++ pkg/jobmanager/mainserverconn.go | 29 +++++++++++++++-------- pkg/wshrpc/wshremote/wshremote_job.go | 4 +++- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 50d28f9406..069ab55e4c 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "log" "github.com/spf13/cobra" "github.com/wavetermdev/waveterm/pkg/wshrpc" @@ -146,6 +147,8 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("getting connected job ids: %w", err) } + log.Printf("connected jobids: %v\n", connectedJobIds) + connectedMap := make(map[string]bool) for _, jobId := range connectedJobIds { connectedMap[jobId] = true diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index c4100aca11..119ef53904 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -69,6 +69,14 @@ func InitJobController() { rpcClient := wshclient.GetBareRpcClient() rpcClient.EventListener.On(wps.Event_RouteUp, handleRouteUpEvent) rpcClient.EventListener.On(wps.Event_RouteDown, handleRouteDownEvent) + wshclient.EventSubCommand(rpcClient, wps.SubscriptionRequest{ + Event: wps.Event_RouteUp, + AllScopes: true, + }, nil) + wshclient.EventSubCommand(rpcClient, wps.SubscriptionRequest{ + Event: wps.Event_RouteDown, + AllScopes: true, + }, nil) } func handleRouteUpEvent(event *wps.WaveEvent) { @@ -533,7 +541,7 @@ func ReconnectJob(ctx context.Context, jobId string) error { } log.Printf("[job:%s] route established, restarting streaming", jobId) - return RestartStreaming(ctx, jobId) + return RestartStreaming(ctx, jobId, true) } func ReconnectJobsForConn(ctx context.Context, connName string) error { @@ -569,23 
+577,25 @@ func ReconnectJobsForConn(ctx context.Context, connName string) error { return nil } -func RestartStreaming(ctx context.Context, jobId string) error { +func RestartStreaming(ctx context.Context, jobId string, knownConnected bool) error { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { return fmt.Errorf("failed to get job: %w", err) } - isConnected, err := conncontroller.IsConnected(job.Connection) - if err != nil { - return fmt.Errorf("error checking connection status: %w", err) - } - if !isConnected { - return fmt.Errorf("connection %q is not connected", job.Connection) - } + if !knownConnected { + isConnected, err := conncontroller.IsConnected(job.Connection) + if err != nil { + return fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return fmt.Errorf("connection %q is not connected", job.Connection) + } - jobConnStatus := GetJobConnStatus(jobId) - if jobConnStatus != JobConnStatus_Connected { - return fmt.Errorf("job manager is not connected (status: %s)", jobConnStatus) + jobConnStatus := GetJobConnStatus(jobId) + if jobConnStatus != JobConnStatus_Connected { + return fmt.Errorf("job manager is not connected (status: %s)", jobConnStatus) + } } var currentSeq int64 = 0 diff --git a/pkg/jobmanager/jobmanager_unix.go b/pkg/jobmanager/jobmanager_unix.go index 3d16a97205..f8a895a2af 100644 --- a/pkg/jobmanager/jobmanager_unix.go +++ b/pkg/jobmanager/jobmanager_unix.go @@ -54,6 +54,11 @@ func normalizeSignal(sigName string) os.Signal { } func daemonize(clientId string, jobId string) error { + _, err := unix.Setsid() + if err != nil { + return fmt.Errorf("failed to setsid: %w", err) + } + devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) if err != nil { return fmt.Errorf("failed to open /dev/null: %w", err) @@ -91,6 +96,12 @@ func handleSIGHUP() { log.Printf("handling SIGHUP, closing pty master\n") cmd.TerminateByClosingPtyMaster() } + go func() { + log.Printf("received SIGHUP, will exit") + 
time.Sleep(500 * time.Millisecond) + log.Printf("terminating job manager\n") + os.Exit(0) + }() } func setupJobManagerSignalHandlers() { diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index f8178d4077..378c84011d 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -209,17 +209,26 @@ func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data ws return nil, fmt.Errorf("job not started") } - corkedStreamMeta := data.StreamMeta - corkedStreamMeta.RWnd = 0 - serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, corkedStreamMeta, data.Seq) - if err != nil { - return nil, err + rtnData := &wshrpc.CommandJobConnectRtnData{} + streamDone, streamError := WshCmdJobManager.StreamManager.GetStreamDoneInfo() + + if streamDone { + log.Printf("JobPrepareConnect: stream already done, skipping connection streamError=%q\n", streamError) + rtnData.Seq = data.Seq + rtnData.StreamDone = true + rtnData.StreamError = streamError + } else { + corkedStreamMeta := data.StreamMeta + corkedStreamMeta.RWnd = 0 + serverSeq, err := WshCmdJobManager.connectToStreamHelper_withlock(msc, corkedStreamMeta, data.Seq) + if err != nil { + return nil, err + } + WshCmdJobManager.pendingStreamMeta = &data.StreamMeta + rtnData.Seq = serverSeq + rtnData.StreamDone = false } - WshCmdJobManager.pendingStreamMeta = &data.StreamMeta - - rtnData := &wshrpc.CommandJobConnectRtnData{Seq: serverSeq} - rtnData.StreamDone, rtnData.StreamError = WshCmdJobManager.StreamManager.GetStreamDoneInfo() hasExited, exitData := WshCmdJobManager.Cmd.GetExitInfo() if hasExited && exitData != nil { rtnData.HasExited = true @@ -228,7 +237,7 @@ func (msc *MainServerConn) JobPrepareConnectCommand(ctx context.Context, data ws rtnData.ExitErr = exitData.ExitErr } - log.Printf("JobPrepareConnect: streamid=%s clientSeq=%d serverSeq=%d streamDone=%v streamError=%q hasExited=%v (rwnd=0 cork mode)\n", data.StreamMeta.Id, data.Seq, 
serverSeq, rtnData.StreamDone, rtnData.StreamError, hasExited) + log.Printf("JobPrepareConnect: streamid=%s clientSeq=%d serverSeq=%d streamDone=%v streamError=%q hasExited=%v\n", data.StreamMeta.Id, data.Seq, rtnData.Seq, rtnData.StreamDone, rtnData.StreamError, hasExited) return rtnData, nil } diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index 49f76c1beb..934b793c6d 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -52,6 +52,8 @@ func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, m log.Printf("connectToJobManager: connected to socket\n") proxy := wshutil.MakeRpcProxy("jobmanager") + linkId := impl.Router.RegisterUntrustedLink(proxy) + go func() { writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) if writeErr != nil { @@ -61,13 +63,13 @@ func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, m go func() { defer func() { conn.Close() + impl.Router.UnregisterLink(linkId) close(proxy.FromRemoteCh) impl.removeJobManagerConnection(jobId) }() wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) }() - linkId := impl.Router.RegisterUntrustedLink(proxy) cleanup := func() { conn.Close() impl.Router.UnregisterLink(linkId) From 8a89c24badcc4e9cd105d595f0507e66c4522a94 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 10:18:02 -0800 Subject: [PATCH 45/64] get job input through block working. 
remove old webcmd code --- cmd/wsh/cmd/wshcmd-jobdebug.go | 50 ++++++++++++++ frontend/app/store/wshclientapi.ts | 15 +++++ frontend/app/view/term/term.tsx | 1 + frontend/app/view/term/termwrap.ts | 24 +++---- frontend/types/gotypes.d.ts | 31 ++++----- pkg/jobcontroller/jobcontroller.go | 101 ++++++++++++++++++++++++++++- pkg/jobmanager/jobcmd.go | 2 +- pkg/jobmanager/mainserverconn.go | 14 ++++ pkg/waveobj/wtype.go | 1 + pkg/wcore/block.go | 12 ++++ pkg/web/webcmd/webcmd.go | 42 +----------- pkg/web/ws.go | 35 ---------- pkg/wshrpc/wshclient/wshclient.go | 18 +++++ pkg/wshrpc/wshrpctypes.go | 21 +++++- pkg/wshrpc/wshserver/wshserver.go | 23 +++++++ 15 files changed, 283 insertions(+), 107 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 069ab55e4c..f07617634b 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -87,6 +87,18 @@ var jobDebugStartCmd = &cobra.Command{ RunE: jobDebugStartRun, } +var jobDebugAttachJobCmd = &cobra.Command{ + Use: "attachjob", + Short: "attach a job to a block", + RunE: jobDebugAttachJobRun, +} + +var jobDebugDetachJobCmd = &cobra.Command{ + Use: "detachjob", + Short: "detach a job from its block", + RunE: jobDebugDetachJobRun, +} + var jobIdFlag string var jobDebugJsonFlag bool var jobConnFlag string @@ -94,6 +106,9 @@ var exitJobIdFlag string var disconnectJobIdFlag string var reconnectJobIdFlag string var reconnectConnNameFlag string +var attachJobIdFlag string +var attachBlockIdFlag string +var detachJobIdFlag string func init() { rootCmd.AddCommand(jobDebugCmd) @@ -108,6 +123,8 @@ func init() { jobDebugCmd.AddCommand(jobDebugReconnectConnCmd) jobDebugCmd.AddCommand(jobDebugGetOutputCmd) jobDebugCmd.AddCommand(jobDebugStartCmd) + jobDebugCmd.AddCommand(jobDebugAttachJobCmd) + jobDebugCmd.AddCommand(jobDebugDetachJobCmd) jobDebugListCmd.Flags().BoolVar(&jobDebugJsonFlag, "json", false, "output as JSON") @@ -134,6 +151,14 @@ func init() { 
jobDebugStartCmd.Flags().StringVar(&jobConnFlag, "conn", "", "connection name (required)") jobDebugStartCmd.MarkFlagRequired("conn") + + jobDebugAttachJobCmd.Flags().StringVar(&attachJobIdFlag, "jobid", "", "job id to attach (required)") + jobDebugAttachJobCmd.MarkFlagRequired("jobid") + jobDebugAttachJobCmd.Flags().StringVar(&attachBlockIdFlag, "blockid", "", "block id to attach to (required)") + jobDebugAttachJobCmd.MarkFlagRequired("blockid") + + jobDebugDetachJobCmd.Flags().StringVar(&detachJobIdFlag, "jobid", "", "job id to detach (required)") + jobDebugDetachJobCmd.MarkFlagRequired("jobid") } func jobDebugListRun(cmd *cobra.Command, args []string) error { @@ -362,3 +387,28 @@ func jobDebugStartRun(cmd *cobra.Command, args []string) error { fmt.Printf("Job started successfully with ID: %s\n", jobId) return nil } + +func jobDebugAttachJobRun(cmd *cobra.Command, args []string) error { + data := wshrpc.CommandJobControllerAttachJobData{ + JobId: attachJobIdFlag, + BlockId: attachBlockIdFlag, + } + + err := wshclient.JobControllerAttachJobCommand(RpcClient, data, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("attaching job: %w", err) + } + + fmt.Printf("Job %s attached to block %s successfully\n", attachJobIdFlag, attachBlockIdFlag) + return nil +} + +func jobDebugDetachJobRun(cmd *cobra.Command, args []string) error { + err := wshclient.JobControllerDetachJobCommand(RpcClient, detachJobIdFlag, &wshrpc.RpcOpts{Timeout: 5000}) + if err != nil { + return fmt.Errorf("detaching job: %w", err) + } + + fmt.Printf("Job %s detached successfully\n", detachJobIdFlag) + return nil +} diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index a8153d6204..eb0c12df49 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -392,11 +392,21 @@ class RpcApiType { return client.wshRpcCall("getwaveairatelimit", null, opts); } + // command "jobcontrollerattachjob" [call] + 
JobControllerAttachJobCommand(client: WshClient, data: CommandJobControllerAttachJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerattachjob", data, opts); + } + // command "jobcontrollerconnectedjobs" [call] JobControllerConnectedJobsCommand(client: WshClient, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerconnectedjobs", null, opts); } + // command "jobcontrollerdetachjob" [call] + JobControllerDetachJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerdetachjob", data, opts); + } + // command "jobcontrollerdisconnectjob" [call] JobControllerDisconnectJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerdisconnectjob", data, opts); @@ -442,6 +452,11 @@ class RpcApiType { return client.wshRpcCall("jobexited", data, opts); } + // command "jobinput" [call] + JobInputCommand(client: WshClient, data: CommandJobInputData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobinput", data, opts); + } + // command "jobmanagerexit" [call] JobManagerExitCommand(client: WshClient, opts?: RpcOpts): Promise { return client.wshRpcCall("jobmanagerexit", null, opts); diff --git a/frontend/app/view/term/term.tsx b/frontend/app/view/term/term.tsx index 10fd0fb112..d1ca981c97 100644 --- a/frontend/app/view/term/term.tsx +++ b/frontend/app/view/term/term.tsx @@ -298,6 +298,7 @@ const TerminalView = ({ blockId, model }: ViewComponentProps) => useWebGl: !termSettings?.["term:disablewebgl"], sendDataHandler: model.sendDataToController.bind(model), nodeModel: model.nodeModel, + jobId: blockData?.jobid, } ); (window as any).term = termWrap; diff --git a/frontend/app/view/term/termwrap.ts b/frontend/app/view/term/termwrap.ts index 6393b2165a..60743db584 100644 --- a/frontend/app/view/term/termwrap.ts +++ b/frontend/app/view/term/termwrap.ts @@ -3,7 +3,6 @@ import type { BlockNodeModel } from "@/app/block/blocktypes"; import { 
getFileSubject } from "@/app/store/wps"; -import { sendWSCommand } from "@/app/store/ws"; import { RpcApi } from "@/app/store/wshclientapi"; import { TabRpcClient } from "@/app/store/wshrpcutil"; import { WOS, fetchWaveFile, getApi, getSettingsKeyAtom, globalStore, openLink, recordTEvent } from "@/store/global"; @@ -50,6 +49,7 @@ type TermWrapOptions = { useWebGl?: boolean; sendDataHandler?: (data: string) => void; nodeModel?: BlockNodeModel; + jobId?: string; }; // for xterm OSC handlers, we return true always because we "own" the OSC number. @@ -375,6 +375,7 @@ function handleOsc16162Command(data: string, blockId: string, loaded: boolean, t export class TermWrap { tabId: string; blockId: string; + jobId: string; ptyOffset: number; dataBytesProcessed: number; terminal: Terminal; @@ -422,6 +423,7 @@ export class TermWrap { this.loaded = false; this.tabId = tabId; this.blockId = blockId; + this.jobId = waveOptions.jobId; this.sendDataHandler = waveOptions.sendDataHandler; this.nodeModel = waveOptions.nodeModel; this.ptyOffset = 0; @@ -495,6 +497,10 @@ export class TermWrap { }); } + getZoneId(): string { + return this.jobId ?? this.blockId; + } + resetCompositionState() { this.isComposing = false; this.composingData = ""; @@ -566,7 +572,7 @@ export class TermWrap { }); } - this.mainFileSubject = getFileSubject(this.blockId, TermFileName); + this.mainFileSubject = getFileSubject(this.getZoneId(), TermFileName); this.mainFileSubject.subscribe(this.handleNewFileSubjectData.bind(this)); try { @@ -699,8 +705,9 @@ export class TermWrap { } async loadInitialTerminalData(): Promise { - let startTs = Date.now(); - const { data: cacheData, fileInfo: cacheFile } = await fetchWaveFile(this.blockId, TermCacheFileName); + const startTs = Date.now(); + const zoneId = this.getZoneId(); + const { data: cacheData, fileInfo: cacheFile } = await fetchWaveFile(zoneId, TermCacheFileName); let ptyOffset = 0; if (cacheFile != null) { ptyOffset = cacheFile.meta["ptyoffset"] ?? 
0; @@ -722,7 +729,7 @@ export class TermWrap { } } } - const { data: mainData, fileInfo: mainFile } = await fetchWaveFile(this.blockId, TermFileName, ptyOffset); + const { data: mainData, fileInfo: mainFile } = await fetchWaveFile(zoneId, TermFileName, ptyOffset); console.log( `terminal loaded cachefile:${cacheData?.byteLength ?? 0} main:${mainData?.byteLength ?? 0} bytes, ${Date.now() - startTs}ms` ); @@ -751,12 +758,7 @@ export class TermWrap { this.fitAddon.fit(); if (oldRows !== this.terminal.rows || oldCols !== this.terminal.cols) { const termSize: TermSize = { rows: this.terminal.rows, cols: this.terminal.cols }; - const wsCommand: SetBlockTermSizeWSCommand = { - wscommand: "setblocktermsize", - blockid: this.blockId, - termsize: termSize, - }; - sendWSCommand(wsCommand); + RpcApi.ControllerInputCommand(TabRpcClient, { blockid: this.blockId, termsize: termSize }); } dlog("resize", `${this.terminal.rows}x${this.terminal.cols}`, `${oldRows}x${oldCols}`, this.hasResized); if (!this.hasResized) { diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 857a6a0ed5..b028a329c5 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -112,6 +112,7 @@ declare global { runtimeopts?: RuntimeOpts; stickers?: StickerType[]; subblockids?: string[]; + jobid?: string; }; // blockcontroller.BlockControllerRuntimeStatus @@ -139,13 +140,6 @@ declare global { files: FileInfo[]; }; - // webcmd.BlockInputWSCommand - type BlockInputWSCommand = { - wscommand: "blockinput"; - blockid: string; - inputdata64: string; - }; - // wshrpc.BlocksListEntry type BlocksListEntry = { windowid: string; @@ -366,6 +360,12 @@ declare global { exiterr?: string; }; + // wshrpc.CommandJobControllerAttachJobData + type CommandJobControllerAttachJobData = { + jobid: string; + blockid: string; + }; + // wshrpc.CommandJobControllerStartJobData type CommandJobControllerStartJobData = { connname: string; @@ -384,6 +384,14 @@ declare global { exitts?: number; }; + // 
wshrpc.CommandJobInputData + type CommandJobInputData = { + jobid: string; + inputdata64?: string; + signame?: string; + termsize?: TermSize; + }; + // wshrpc.CommandJobPrepareConnectData type CommandJobPrepareConnectData = { streammeta: StreamMeta; @@ -1201,13 +1209,6 @@ declare global { optional: boolean; }; - // webcmd.SetBlockTermSizeWSCommand - type SetBlockTermSizeWSCommand = { - wscommand: "setblocktermsize"; - blockid: string; - termsize: TermSize; - }; - // wconfig.SettingsType type SettingsType = { "app:*"?: boolean; @@ -1787,7 +1788,7 @@ declare global { type WSCommandType = { wscommand: string; - } & ( SetBlockTermSizeWSCommand | BlockInputWSCommand | WSRpcCommand ); + } & ( WSRpcCommand ); // eventbus.WSEventType type WSEventType = { diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 119ef53904..117bcb66a1 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -266,6 +266,26 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { return jobId, nil } +func handleAppendJobFile(ctx context.Context, jobId string, fileName string, data []byte) error { + err := filestore.WFS.AppendData(ctx, jobId, fileName, data) + if err != nil { + return fmt.Errorf("error appending to job file: %w", err) + } + wps.Broker.Publish(wps.WaveEvent{ + Event: wps.Event_BlockFile, + Scopes: []string{ + waveobj.MakeORef(waveobj.OType_Job, jobId).String(), + }, + Data: &wps.WSFileEventData{ + ZoneId: jobId, + FileName: fileName, + FileOp: wps.FileOp_Append, + Data64: base64.StdEncoding.EncodeToString(data), + }, + }) + return nil +} + func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reader) { defer func() { log.Printf("[job:%s] output loop finished", jobId) @@ -277,7 +297,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade n, err := reader.Read(buf) if n > 0 { log.Printf("[job:%s] received %d bytes of data", jobId, n) - 
appendErr := filestore.WFS.AppendData(ctx, jobId, JobOutputFileName, buf[:n]) + appendErr := handleAppendJobFile(ctx, jobId, JobOutputFileName, buf[:n]) if appendErr != nil { log.Printf("[job:%s] error appending data to WaveFS: %v", jobId, appendErr) } else { @@ -714,3 +734,82 @@ func DeleteJob(ctx context.Context, jobId string) error { } return wstore.DBDelete(ctx, waveobj.OType_Job, jobId) } + +func AttachJobToBlock(ctx context.Context, jobId string, blockId string) error { + return wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { + err := wstore.DBUpdateFn(tx.Context(), blockId, func(block *waveobj.Block) { + block.JobId = jobId + }) + if err != nil { + return fmt.Errorf("failed to update block: %w", err) + } + + err = wstore.DBUpdateFn(tx.Context(), jobId, func(job *waveobj.Job) { + job.AttachedBlockId = blockId + }) + if err != nil { + return fmt.Errorf("failed to update job: %w", err) + } + + log.Printf("[job:%s] attached to block:%s", jobId, blockId) + return nil + }) +} + +func DetachJobFromBlock(ctx context.Context, jobId string, updateBlock bool) error { + return wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { + job, err := wstore.DBMustGet[*waveobj.Job](tx.Context(), jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + blockId := job.AttachedBlockId + if blockId == "" { + return nil + } + + if updateBlock { + block, err := wstore.DBGet[*waveobj.Block](tx.Context(), blockId) + if err == nil && block != nil { + err = wstore.DBUpdateFn(tx.Context(), blockId, func(block *waveobj.Block) { + block.JobId = "" + }) + if err != nil { + log.Printf("[job:%s] warning: failed to clear JobId from block:%s: %v", jobId, blockId, err) + } + } + } + + err = wstore.DBUpdateFn(tx.Context(), jobId, func(job *waveobj.Job) { + job.AttachedBlockId = "" + }) + if err != nil { + return fmt.Errorf("failed to update job: %w", err) + } + + log.Printf("[job:%s] detached from block:%s", jobId, blockId) + return nil + }) +} + +func SendInput(ctx 
context.Context, data wshrpc.CommandJobInputData) error { + jobId := data.JobId + _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return fmt.Errorf("failed to get job: %w", err) + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeJobRouteId(jobId), + Timeout: 5000, + NoResponse: true, + } + + bareRpc := wshclient.GetBareRpcClient() + err = wshclient.JobInputCommand(bareRpc, data, rpcOpts) + if err != nil { + return fmt.Errorf("failed to send input to job: %w", err) + } + + return nil +} diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index 07841e1bf9..c4a5f97778 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -140,7 +140,7 @@ func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobExitedData) { return true, exitData } -func (jm *JobCmd) HandleInput(data wshrpc.CommandBlockInputData) error { +func (jm *JobCmd) HandleInput(data wshrpc.CommandJobInputData) error { jm.lock.Lock() defer jm.lock.Unlock() diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 378c84011d..3af89fca20 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -285,6 +285,20 @@ func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc. 
return nil } +func (msc *MainServerConn) JobInputCommand(ctx context.Context, data wshrpc.CommandJobInputData) error { + WshCmdJobManager.lock.Lock() + defer WshCmdJobManager.lock.Unlock() + + if !msc.PeerAuthenticated.Load() { + return fmt.Errorf("not authenticated") + } + if WshCmdJobManager.Cmd == nil { + return fmt.Errorf("job not started") + } + + return WshCmdJobManager.Cmd.HandleInput(data) +} + func (msc *MainServerConn) JobManagerExitCommand(ctx context.Context) error { if !msc.PeerAuthenticated.Load() { return fmt.Errorf("not authenticated") diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 29d3c54c45..39e06a7383 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -291,6 +291,7 @@ type Block struct { Stickers []*StickerType `json:"stickers,omitempty"` Meta MetaMapType `json:"meta"` SubBlockIds []string `json:"subblockids,omitempty"` + JobId string `json:"jobid,omitempty"` // if set, the block will render this jobid's pty output } func (*Block) GetOType() string { diff --git a/pkg/wcore/block.go b/pkg/wcore/block.go index 3c6e2e197b..32456951bd 100644 --- a/pkg/wcore/block.go +++ b/pkg/wcore/block.go @@ -12,6 +12,7 @@ import ( "github.com/google/uuid" "github.com/wavetermdev/waveterm/pkg/blockcontroller" "github.com/wavetermdev/waveterm/pkg/filestore" + "github.com/wavetermdev/waveterm/pkg/jobcontroller" "github.com/wavetermdev/waveterm/pkg/panichandler" "github.com/wavetermdev/waveterm/pkg/telemetry" "github.com/wavetermdev/waveterm/pkg/telemetry/telemetrydata" @@ -167,6 +168,17 @@ func DeleteBlock(ctx context.Context, blockId string, recursive bool) error { } } } + if block.JobId != "" { + go func() { + defer func() { + panichandler.PanicHandler("DetachJobFromBlock", recover()) + }() + err := jobcontroller.DetachJobFromBlock(ctx, block.JobId, false) + if err != nil { + log.Printf("error detaching job from block %s: %v", blockId, err) + } + }() + } parentBlockCount, err := deleteBlockObj(ctx, blockId) if err != nil { return 
fmt.Errorf("error deleting block: %w", err) diff --git a/pkg/web/webcmd/webcmd.go b/pkg/web/webcmd/webcmd.go index bf732de0c6..b86934ce7a 100644 --- a/pkg/web/webcmd/webcmd.go +++ b/pkg/web/webcmd/webcmd.go @@ -9,14 +9,11 @@ import ( "github.com/wavetermdev/waveterm/pkg/tsgen/tsgenmeta" "github.com/wavetermdev/waveterm/pkg/util/utilfn" - "github.com/wavetermdev/waveterm/pkg/waveobj" "github.com/wavetermdev/waveterm/pkg/wshutil" ) const ( - WSCommand_SetBlockTermSize = "setblocktermsize" - WSCommand_BlockInput = "blockinput" - WSCommand_Rpc = "rpc" + WSCommand_Rpc = "rpc" ) type WSCommandType interface { @@ -28,8 +25,6 @@ func WSCommandTypeUnionMeta() tsgenmeta.TypeUnionMeta { BaseType: reflect.TypeOf((*WSCommandType)(nil)).Elem(), TypeFieldName: "wscommand", Types: []reflect.Type{ - reflect.TypeOf(SetBlockTermSizeWSCommand{}), - reflect.TypeOf(BlockInputWSCommand{}), reflect.TypeOf(WSRpcCommand{}), }, } @@ -44,46 +39,12 @@ func (cmd *WSRpcCommand) GetWSCommand() string { return cmd.WSCommand } -type SetBlockTermSizeWSCommand struct { - WSCommand string `json:"wscommand" tstype:"\"setblocktermsize\""` - BlockId string `json:"blockid"` - TermSize waveobj.TermSize `json:"termsize"` -} - -func (cmd *SetBlockTermSizeWSCommand) GetWSCommand() string { - return cmd.WSCommand -} - -type BlockInputWSCommand struct { - WSCommand string `json:"wscommand" tstype:"\"blockinput\""` - BlockId string `json:"blockid"` - InputData64 string `json:"inputdata64"` -} - -func (cmd *BlockInputWSCommand) GetWSCommand() string { - return cmd.WSCommand -} - func ParseWSCommandMap(cmdMap map[string]any) (WSCommandType, error) { cmdType, ok := cmdMap["wscommand"].(string) if !ok { return nil, fmt.Errorf("no wscommand field in command map") } switch cmdType { - case WSCommand_SetBlockTermSize: - var cmd SetBlockTermSizeWSCommand - err := utilfn.DoMapStructure(&cmd, cmdMap) - if err != nil { - return nil, fmt.Errorf("error decoding SetBlockTermSizeWSCommand: %w", err) - } - return &cmd, nil - 
case WSCommand_BlockInput: - var cmd BlockInputWSCommand - err := utilfn.DoMapStructure(&cmd, cmdMap) - if err != nil { - return nil, fmt.Errorf("error decoding BlockInputWSCommand: %w", err) - } - return &cmd, nil case WSCommand_Rpc: var cmd WSRpcCommand err := utilfn.DoMapStructure(&cmd, cmdMap) @@ -94,5 +55,4 @@ func ParseWSCommandMap(cmdMap map[string]any) (WSCommandType, error) { default: return nil, fmt.Errorf("unknown wscommand type %q", cmdType) } - } diff --git a/pkg/web/ws.go b/pkg/web/ws.go index 0e6f0b0f9b..719753ba3c 100644 --- a/pkg/web/ws.go +++ b/pkg/web/ws.go @@ -20,7 +20,6 @@ import ( "github.com/wavetermdev/waveterm/pkg/eventbus" "github.com/wavetermdev/waveterm/pkg/panichandler" "github.com/wavetermdev/waveterm/pkg/web/webcmd" - "github.com/wavetermdev/waveterm/pkg/wshrpc" "github.com/wavetermdev/waveterm/pkg/wshutil" ) @@ -110,40 +109,6 @@ func processWSCommand(jmsg map[string]any, outputCh chan any, rpcInputCh chan ba } cmdType = wsCommand.GetWSCommand() switch cmd := wsCommand.(type) { - case *webcmd.SetBlockTermSizeWSCommand: - data := wshrpc.CommandBlockInputData{ - BlockId: cmd.BlockId, - TermSize: &cmd.TermSize, - } - rpcMsg := wshutil.RpcMessage{ - Command: wshrpc.Command_ControllerInput, - Data: data, - } - msgBytes, err := json.Marshal(rpcMsg) - if err != nil { - // this really should never fail since we just unmarshalled this value - log.Printf("[websocket] error marshalling rpc message: %v\n", err) - return - } - rpcInputCh <- baseds.RpcInputChType{MsgBytes: msgBytes} - - case *webcmd.BlockInputWSCommand: - data := wshrpc.CommandBlockInputData{ - BlockId: cmd.BlockId, - InputData64: cmd.InputData64, - } - rpcMsg := wshutil.RpcMessage{ - Command: wshrpc.Command_ControllerInput, - Data: data, - } - msgBytes, err := json.Marshal(rpcMsg) - if err != nil { - // this really should never fail since we just unmarshalled this value - log.Printf("[websocket] error marshalling rpc message: %v\n", err) - return - } - rpcInputCh <- 
baseds.RpcInputChType{MsgBytes: msgBytes} - case *webcmd.WSRpcCommand: rpcMsg := cmd.Message if rpcMsg == nil { diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 271de67d29..937d054ab3 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -476,12 +476,24 @@ func GetWaveAIRateLimitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) (*uctype return resp, err } +// command "jobcontrollerattachjob", wshserver.JobControllerAttachJobCommand +func JobControllerAttachJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerAttachJobData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerattachjob", data, opts) + return err +} + // command "jobcontrollerconnectedjobs", wshserver.JobControllerConnectedJobsCommand func JobControllerConnectedJobsCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]string, error) { resp, err := sendRpcRequestCallHelper[[]string](w, "jobcontrollerconnectedjobs", nil, opts) return resp, err } +// command "jobcontrollerdetachjob", wshserver.JobControllerDetachJobCommand +func JobControllerDetachJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerdetachjob", data, opts) + return err +} + // command "jobcontrollerdisconnectjob", wshserver.JobControllerDisconnectJobCommand func JobControllerDisconnectJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerdisconnectjob", data, opts) @@ -536,6 +548,12 @@ func JobExitedCommand(w *wshutil.WshRpc, data wshrpc.CommandJobExitedData, opts return err } +// command "jobinput", wshserver.JobInputCommand +func JobInputCommand(w *wshutil.WshRpc, data wshrpc.CommandJobInputData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobinput", data, opts) + return err +} + // command "jobmanagerexit", 
wshserver.JobManagerExitCommand func JobManagerExitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobmanagerexit", nil, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 5d779d0cf0..92b37b1ac8 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -173,6 +173,7 @@ type WshRpcInterface interface { JobPrepareConnectCommand(ctx context.Context, data CommandJobPrepareConnectData) (*CommandJobConnectRtnData, error) JobStartStreamCommand(ctx context.Context, data CommandJobStartStreamData) error JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error + JobInputCommand(ctx context.Context, data CommandJobInputData) error JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job manager => main server JobManagerExitCommand(ctx context.Context) error JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) @@ -184,6 +185,8 @@ type WshRpcInterface interface { JobControllerReconnectJobCommand(ctx context.Context, jobId string) error JobControllerReconnectJobsForConnCommand(ctx context.Context, connName string) error JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) + JobControllerAttachJobCommand(ctx context.Context, data CommandJobControllerAttachJobData) error + JobControllerDetachJobCommand(ctx context.Context, jobId string) error } // for frontend @@ -280,6 +283,13 @@ type CommandBlockInputData struct { TermSize *waveobj.TermSize `json:"termsize,omitempty"` } +type CommandJobInputData struct { + JobId string `json:"jobid"` + InputData64 string `json:"inputdata64,omitempty"` + SigName string `json:"signame,omitempty"` + TermSize *waveobj.TermSize `json:"termsize,omitempty"` +} + type CommandWaitForRouteData struct { RouteId string `json:"routeid"` WaitMs int `json:"waitms"` @@ -736,9 +746,9 @@ type CommandRemoteDisconnectFromJobManagerData struct { } type 
CommandRemoteTerminateJobManagerData struct { - JobId string `json:"jobid"` - JobManagerPid int `json:"jobmanagerpid"` - JobManagerStartTs int64 `json:"jobmanagerstartts"` + JobId string `json:"jobid"` + JobManagerPid int `json:"jobmanagerpid"` + JobManagerStartTs int64 `json:"jobmanagerstartts"` } type CommandStartJobRtnData struct { @@ -783,3 +793,8 @@ type CommandJobControllerStartJobData struct { Env map[string]string `json:"env"` TermSize *waveobj.TermSize `json:"termsize,omitempty"` } + +type CommandJobControllerAttachJobData struct { + JobId string `json:"jobid"` + BlockId string `json:"blockid"` +} diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 61bc0f15e7..9f7e4054f3 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -295,6 +295,21 @@ func (ws *WshServer) ControllerResyncCommand(ctx context.Context, data wshrpc.Co } func (ws *WshServer) ControllerInputCommand(ctx context.Context, data wshrpc.CommandBlockInputData) error { + block, err := wstore.DBMustGet[*waveobj.Block](ctx, data.BlockId) + if err != nil { + return fmt.Errorf("error getting block: %w", err) + } + + if block.JobId != "" { + jobInputData := wshrpc.CommandJobInputData{ + JobId: block.JobId, + InputData64: data.InputData64, + SigName: data.SigName, + TermSize: data.TermSize, + } + return jobcontroller.SendInput(ctx, jobInputData) + } + inputUnion := &blockcontroller.BlockInputUnion{ SigName: data.SigName, TermSize: data.TermSize, @@ -1478,3 +1493,11 @@ func (ws *WshServer) JobControllerReconnectJobsForConnCommand(ctx context.Contex func (ws *WshServer) JobControllerConnectedJobsCommand(ctx context.Context) ([]string, error) { return jobcontroller.GetConnectedJobIds(), nil } + +func (ws *WshServer) JobControllerAttachJobCommand(ctx context.Context, data wshrpc.CommandJobControllerAttachJobData) error { + return jobcontroller.AttachJobToBlock(ctx, data.JobId, data.BlockId) +} + +func (ws *WshServer) 
JobControllerDetachJobCommand(ctx context.Context, jobId string) error { + return jobcontroller.DetachJobFromBlock(ctx, jobId, true) +} From dad67c6f548bc2256292d53cc70bb77b8023e8b0 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 10:46:58 -0800 Subject: [PATCH 46/64] stream reader now handles out of order packerts --- pkg/streamclient/stream_test.go | 256 +++++++++++++++++++++++++++++++ pkg/streamclient/streamreader.go | 58 +++++-- 2 files changed, 303 insertions(+), 11 deletions(-) diff --git a/pkg/streamclient/stream_test.go b/pkg/streamclient/stream_test.go index f5c43f937c..67fb3bc057 100644 --- a/pkg/streamclient/stream_test.go +++ b/pkg/streamclient/stream_test.go @@ -2,6 +2,7 @@ package streamclient import ( "bytes" + "encoding/base64" "io" "testing" "time" @@ -265,3 +266,258 @@ func TestMultipleWrites(t *testing.T) { t.Fatalf("Expected %q, got %q", expected, string(buf)) } } + +func TestOutOfOrderPackets(t *testing.T) { + transport := newFakeTransport() + reader := NewReader("test-ooo", 1024, transport) + + packet0 := wshrpc.CommandStreamData{ + Id: "test-ooo", + Seq: 0, + Data64: base64.StdEncoding.EncodeToString([]byte("AAAAA")), + } + packet5 := wshrpc.CommandStreamData{ + Id: "test-ooo", + Seq: 5, + Data64: base64.StdEncoding.EncodeToString([]byte("BBBBB")), + } + packet10 := wshrpc.CommandStreamData{ + Id: "test-ooo", + Seq: 10, + Data64: base64.StdEncoding.EncodeToString([]byte("CCCCC")), + } + packet15 := wshrpc.CommandStreamData{ + Id: "test-ooo", + Seq: 15, + Data64: base64.StdEncoding.EncodeToString([]byte("DDDDD")), + } + + // Send packets out of order: 0, 10, 15, 5 + reader.RecvData(packet0) + reader.RecvData(packet10) // OOO - should be buffered + reader.RecvData(packet15) // OOO - should be buffered + reader.RecvData(packet5) // fills the gap - should trigger processing + + // Read all data + buf := make([]byte, 1024) + totalRead := 0 + expectedLen := 20 // 4 packets * 5 bytes each + + readDone := make(chan struct{}) + go func() { 
+ for totalRead < expectedLen { + n, err := reader.Read(buf[totalRead:]) + if err != nil { + t.Errorf("Read failed: %v", err) + return + } + totalRead += n + } + close(readDone) + }() + + select { + case <-readDone: + // Success + case <-time.After(2 * time.Second): + t.Fatalf("Read didn't complete in time. Read %d bytes, expected %d", totalRead, expectedLen) + } + + if totalRead != expectedLen { + t.Fatalf("Expected to read %d bytes, got %d", expectedLen, totalRead) + } +} + +func TestOutOfOrderWithDuplicates(t *testing.T) { + transport := newFakeTransport() + reader := NewReader("test-dup", 1024, transport) + + packet0 := wshrpc.CommandStreamData{ + Id: "test-dup", + Seq: 0, + Data64: base64.StdEncoding.EncodeToString([]byte("aaaaa")), + } + packet10 := wshrpc.CommandStreamData{ + Id: "test-dup", + Seq: 10, + Data64: base64.StdEncoding.EncodeToString([]byte("ccccc")), + } + packet5First := wshrpc.CommandStreamData{ + Id: "test-dup", + Seq: 5, + Data64: base64.StdEncoding.EncodeToString([]byte("xxxxx")), + } + packet5Second := wshrpc.CommandStreamData{ + Id: "test-dup", + Seq: 5, + Data64: base64.StdEncoding.EncodeToString([]byte("bbbbb")), + } + + reader.RecvData(packet0) + reader.RecvData(packet10) // OOO - buffered + reader.RecvData(packet5First) // OOO - buffered + reader.RecvData(packet5First) // Duplicate - should be ignored + reader.RecvData(packet5Second) // Duplicate with different data - should be ignored + + // Read all data - should get all 3 packets in order + buf := make([]byte, 20) + n, err := reader.Read(buf) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + + // Should get all 15 bytes (3 packets * 5 bytes) + if n != 15 { + t.Fatalf("Expected to read 15 bytes, got %d", n) + } + + // Should be "aaaaaxxxxxccccc" (first packet received for each seq wins) + expected := "aaaaaxxxxxccccc" + if string(buf[:n]) != expected { + t.Fatalf("Expected %q, got %q", expected, string(buf[:n])) + } +} + +func TestOutOfOrderWithGaps(t *testing.T) { + 
transport := newFakeTransport() + reader := NewReader("test-gaps", 1024, transport) + + packet0 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 0, + Data64: base64.StdEncoding.EncodeToString([]byte("aaaaa")), + } + packet20 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 20, + Data64: base64.StdEncoding.EncodeToString([]byte("eeeee")), + } + packet40 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 40, + Data64: base64.StdEncoding.EncodeToString([]byte("iiiii")), + } + packet5 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 5, + Data64: base64.StdEncoding.EncodeToString([]byte("bbbbb")), + } + + reader.RecvData(packet0) + reader.RecvData(packet40) // Way ahead - should be buffered + reader.RecvData(packet20) // Still ahead - should be buffered + + // Read first packet + buf := make([]byte, 10) + n, err := reader.Read(buf) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + if n != 5 || string(buf[:n]) != "aaaaa" { + t.Fatalf("Expected 'aaaaa', got %q", string(buf[:n])) + } + + // Send packet to partially fill gap + reader.RecvData(packet5) + + // Should be able to read it now + n, err = reader.Read(buf) + if err != nil { + t.Fatalf("Second read failed: %v", err) + } + if n != 5 || string(buf[:n]) != "bbbbb" { + t.Fatalf("Expected 'bbbbb', got %q", string(buf[:n])) + } + + packet10 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 10, + Data64: base64.StdEncoding.EncodeToString([]byte("ccccc")), + } + packet15 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 15, + Data64: base64.StdEncoding.EncodeToString([]byte("ddddd")), + } + packet25 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 25, + Data64: base64.StdEncoding.EncodeToString([]byte("fffff")), + } + packet30 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 30, + Data64: base64.StdEncoding.EncodeToString([]byte("ggggg")), + } + packet35 := wshrpc.CommandStreamData{ + Id: "test-gaps", + Seq: 35, + Data64: 
base64.StdEncoding.EncodeToString([]byte("hhhhh")), + } + + reader.RecvData(packet10) + reader.RecvData(packet15) + reader.RecvData(packet25) + reader.RecvData(packet30) + reader.RecvData(packet35) + + // Read all remaining data at once + allData := make([]byte, 100) + totalRead := 0 + for totalRead < 35 { + n, err = reader.Read(allData[totalRead:]) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + totalRead += n + } + + expected := "cccccdddddeeeeefffffggggghhhhhiiiii" + if string(allData[:totalRead]) != expected { + t.Fatalf("Expected %q, got %q", expected, string(allData[:totalRead])) + } +} + +func TestOutOfOrderWithEOF(t *testing.T) { + transport := newFakeTransport() + reader := NewReader("test-eof", 1024, transport) + + packet0 := wshrpc.CommandStreamData{ + Id: "test-eof", + Seq: 0, + Data64: base64.StdEncoding.EncodeToString([]byte("first")), + } + packet11 := wshrpc.CommandStreamData{ + Id: "test-eof", + Seq: 11, + Data64: base64.StdEncoding.EncodeToString([]byte("third")), + Eof: true, + } + packet5 := wshrpc.CommandStreamData{ + Id: "test-eof", + Seq: 5, + Data64: base64.StdEncoding.EncodeToString([]byte("second")), + } + + reader.RecvData(packet0) + reader.RecvData(packet11) // OOO with EOF + reader.RecvData(packet5) // Fill the gap + + // Read all data + buf := make([]byte, 20) + n, err := reader.Read(buf) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + + expected := "firstsecondthird" + if string(buf[:n]) != expected { + t.Fatalf("Expected %q, got %q", expected, string(buf[:n])) + } + + // Should get EOF now + _, err = reader.Read(buf) + if err != io.EOF { + t.Fatalf("Expected EOF, got %v", err) + } +} diff --git a/pkg/streamclient/streamreader.go b/pkg/streamclient/streamreader.go index 8c6e87b1ee..541d5c866d 100644 --- a/pkg/streamclient/streamreader.go +++ b/pkg/streamclient/streamreader.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "fmt" "io" + "sort" "sync" "github.com/wavetermdev/waveterm/pkg/wshrpc" @@ -25,6 +26,7 @@ type 
Reader struct { err error closed bool lastRwndSent int64 + oooPackets []wshrpc.CommandStreamData // out-of-order packets awaiting delivery } func NewReader(id string, readWindow int64, ackSender AckSender) *Reader { @@ -47,7 +49,7 @@ func (r *Reader) RecvData(dataPk wshrpc.CommandStreamData) { r.lock.Lock() defer r.lock.Unlock() - if r.closed { + if r.closed || r.eof || r.err != nil { return } @@ -63,18 +65,25 @@ func (r *Reader) RecvData(dataPk wshrpc.CommandStreamData) { return } - if dataPk.Seq != r.nextSeq { - r.err = fmt.Errorf("stream sequence mismatch: expected %d, got %d", r.nextSeq, dataPk.Seq) - r.cond.Broadcast() - r.sendAckLocked(false, true, "sequence mismatch error") + if dataPk.Seq < r.nextSeq { + return + } + if dataPk.Seq > r.nextSeq { + r.addOOOPacketLocked(dataPk) return } + r.recvDataOrderedLocked(dataPk) + r.processOOOPacketsLocked() + r.cond.Broadcast() + r.sendAckLocked(r.eof, false, "") +} + +func (r *Reader) recvDataOrderedLocked(dataPk wshrpc.CommandStreamData) { if dataPk.Data64 != "" { data, err := base64.StdEncoding.DecodeString(dataPk.Data64) if err != nil { r.err = err - r.cond.Broadcast() r.sendAckLocked(false, true, "base64 decode error") return } @@ -84,13 +93,40 @@ func (r *Reader) RecvData(dataPk wshrpc.CommandStreamData) { if dataPk.Eof { r.eof = true - r.cond.Broadcast() - r.sendAckLocked(true, false, "") - return } +} - r.cond.Broadcast() - r.sendAckLocked(false, false, "") +func (r *Reader) addOOOPacketLocked(dataPk wshrpc.CommandStreamData) { + for _, pkt := range r.oooPackets { + if pkt.Seq == dataPk.Seq { + // this handles duplicates + return + } + } + r.oooPackets = append(r.oooPackets, dataPk) +} + +func (r *Reader) processOOOPacketsLocked() { + if len(r.oooPackets) == 0 { + return + } + sort.Slice(r.oooPackets, func(i, j int) bool { + return r.oooPackets[i].Seq < r.oooPackets[j].Seq + }) + consumed := 0 + for _, pkt := range r.oooPackets { + if r.eof || r.err != nil { + // we're done, so we can clear any pending ooo 
packets + r.oooPackets = nil + return + } + if pkt.Seq != r.nextSeq { + break + } + r.recvDataOrderedLocked(pkt) + consumed++ + } + r.oooPackets = r.oooPackets[consumed:] } func (r *Reader) sendAckLocked(fin bool, cancel bool, errStr string) { From a22788ca76f9f8222abb7fcb3b5cb9b849bf9b7d Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 10:47:44 -0800 Subject: [PATCH 47/64] remove test coverage UX --- .vscode/settings.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 85f4c06cd6..e0209de61b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -61,5 +61,8 @@ }, "directoryFilters": ["-tsunami/frontend/scaffold", "-dist", "-make"] }, - "tailwindCSS.lint.suggestCanonicalClasses": "ignore" + "tailwindCSS.lint.suggestCanonicalClasses": "ignore", + "go.coverageDecorator": { + "type": "gutter" + } } From 609db36023f15dc18ab231f3488b041bdea6355a Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 10:59:58 -0800 Subject: [PATCH 48/64] remove extra go routine for streaming as the broker was designed to be non-blocking --- pkg/streamclient/streambroker.go | 3 +++ pkg/wshutil/wshrpc.go | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pkg/streamclient/streambroker.go b/pkg/streamclient/streambroker.go index c0082450e0..9f3ec173d9 100644 --- a/pkg/streamclient/streambroker.go +++ b/pkg/streamclient/streambroker.go @@ -116,6 +116,9 @@ func (b *Broker) SendData(dataPk wshrpc.CommandStreamData) { b.sendQueue.Enqueue(workItem{workType: "senddata", dataPk: dataPk}) } +// RecvData and RecvAck are designed to be non-blocking and must remain so to prevent deadlock. +// They only enqueue work items to be processed asynchronously by the work queue's goroutine. +// These methods are called from the main RPC runServer loop, so blocking here would stall all RPC processing. 
func (b *Broker) RecvData(dataPk wshrpc.CommandStreamData) { b.recvQueue.Enqueue(workItem{workType: "recvdata", dataPk: dataPk}) } diff --git a/pkg/wshutil/wshrpc.go b/pkg/wshutil/wshrpc.go index c70b6000b6..eb2903c1f7 100644 --- a/pkg/wshutil/wshrpc.go +++ b/pkg/wshutil/wshrpc.go @@ -324,14 +324,6 @@ func (w *WshRpc) handleRequestInternal(req *RpcMessage, ingressLinkId baseds.Lin w.handleEventRecv(req) return } - if req.Command == wshrpc.Command_StreamData { - w.handleStreamData(req) - return - } - if req.Command == wshrpc.Command_StreamDataAck { - w.handleStreamAck(req) - return - } var respHandler *RpcResponseHandler timeoutMs := req.Timeout @@ -422,6 +414,17 @@ outer: continue } if msg.IsRpcRequest() { + // Handle stream commands synchronously since the broker is designed to be non-blocking. + // RecvData/RecvAck just enqueue to work queues, so there's no risk of blocking the main loop. + if msg.Command == wshrpc.Command_StreamData { + w.handleStreamData(&msg) + continue + } + if msg.Command == wshrpc.Command_StreamDataAck { + w.handleStreamAck(&msg) + continue + } + ingressLinkId := inputVal.IngressLinkId go func() { defer func() { From 861dcece6e54e2125e89129a232dc18803191b78 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 14:55:34 -0800 Subject: [PATCH 49/64] working through bugs, tightening semantics --- cmd/wsh/cmd/wshcmd-jobdebug.go | 24 +----- frontend/app/store/wshclientapi.ts | 15 ---- frontend/types/gotypes.d.ts | 13 ++- pkg/jobcontroller/jobcontroller.go | 127 +++++++++++++---------------- pkg/jobmanager/mainserverconn.go | 48 ++++------- pkg/waveobj/wtype.go | 21 +++-- pkg/wshrpc/wshclient/wshclient.go | 18 ---- pkg/wshrpc/wshrpctypes.go | 9 +- pkg/wshrpc/wshserver/wshserver.go | 4 - 9 files changed, 93 insertions(+), 186 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index f07617634b..6093d3f1fc 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -33,12 +33,6 @@ 
var jobDebugDeleteCmd = &cobra.Command{ RunE: jobDebugDeleteRun, } -var jobDebugTerminateCmdCmd = &cobra.Command{ - Use: "terminate-cmd", - Short: "terminate a command process", - RunE: jobDebugTerminateCmdRun, -} - var jobDebugDeleteAllCmd = &cobra.Command{ Use: "deleteall", Short: "delete all jobs", @@ -116,7 +110,6 @@ func init() { jobDebugCmd.AddCommand(jobDebugDeleteCmd) jobDebugCmd.AddCommand(jobDebugDeleteAllCmd) jobDebugCmd.AddCommand(jobDebugPruneCmd) - jobDebugCmd.AddCommand(jobDebugTerminateCmdCmd) jobDebugCmd.AddCommand(jobDebugExitCmd) jobDebugCmd.AddCommand(jobDebugDisconnectCmd) jobDebugCmd.AddCommand(jobDebugReconnectCmd) @@ -131,9 +124,6 @@ func init() { jobDebugDeleteCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to delete (required)") jobDebugDeleteCmd.MarkFlagRequired("jobid") - jobDebugTerminateCmdCmd.Flags().StringVar(&jobIdFlag, "jobid", "", "job id to terminate (required)") - jobDebugTerminateCmdCmd.MarkFlagRequired("jobid") - jobDebugExitCmd.Flags().StringVar(&exitJobIdFlag, "jobid", "", "job id to exit (required)") jobDebugExitCmd.MarkFlagRequired("jobid") @@ -210,7 +200,7 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { } exitCode := "-" - if job.Status != "running" && job.Status != "init" { + if job.JobManagerStatus != "running" && job.JobManagerStatus != "init" { exitCode = fmt.Sprintf("%d", job.ExitCode) } @@ -222,7 +212,7 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { } fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", - job.OID, job.Connection, connectedStatus, managerStatus, job.Cmd, job.Status, streamStatus, exitCode, errorStr) + job.OID, job.Connection, connectedStatus, managerStatus, job.Cmd, job.JobManagerStatus, streamStatus, exitCode, errorStr) } return nil } @@ -293,16 +283,6 @@ func jobDebugPruneRun(cmd *cobra.Command, args []string) error { return nil } -func jobDebugTerminateCmdRun(cmd *cobra.Command, args []string) error { - err := 
wshclient.JobControllerTerminateJobCommand(RpcClient, jobIdFlag, nil) - if err != nil { - return fmt.Errorf("terminating command: %w", err) - } - - fmt.Printf("Command for %s terminated successfully\n", jobIdFlag) - return nil -} - func jobDebugExitRun(cmd *cobra.Command, args []string) error { err := wshclient.JobControllerExitJobCommand(RpcClient, exitJobIdFlag, nil) if err != nil { diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index eb0c12df49..0317c8bf47 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -432,11 +432,6 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerstartjob", data, opts); } - // command "jobcontrollerterminatejob" [call] - JobControllerTerminateJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobcontrollerterminatejob", data, opts); - } - // command "jobdebugdelete" [call] JobDebugDeleteCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobdebugdelete", data, opts); @@ -457,11 +452,6 @@ class RpcApiType { return client.wshRpcCall("jobinput", data, opts); } - // command "jobmanagerexit" [call] - JobManagerExitCommand(client: WshClient, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobmanagerexit", null, opts); - } - // command "jobprepareconnect" [call] JobPrepareConnectCommand(client: WshClient, data: CommandJobPrepareConnectData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobprepareconnect", data, opts); @@ -472,11 +462,6 @@ class RpcApiType { return client.wshRpcCall("jobstartstream", data, opts); } - // command "jobterminate" [call] - JobTerminateCommand(client: WshClient, data: CommandJobTerminateData, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobterminate", data, opts); - } - // command "listallappfiles" [call] ListAllAppFilesCommand(client: WshClient, data: CommandListAllAppFilesData, opts?: RpcOpts): Promise { return 
client.wshRpcCall("listallappfiles", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index b028a329c5..c04587dd8d 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -402,10 +402,6 @@ declare global { type CommandJobStartStreamData = { }; - // wshrpc.CommandJobTerminateData - type CommandJobTerminateData = { - }; - // wshrpc.CommandListAllAppFilesData type CommandListAllAppFilesData = { appid: string; @@ -577,7 +573,8 @@ declare global { // wshrpc.CommandStartJobRtnData type CommandStartJobRtnData = { - cmdpgid: number; + cmdpid: number; + cmdstartts: number; jobmanagerpid: number; jobmanagerstartts: number; }; @@ -923,10 +920,10 @@ declare global { cmdenv?: {[key: string]: string}; jobauthtoken: string; ownerblockid: string; - cmdpgid: number; - termsize?: TermSize; - startts?: number; status: string; + cmdpid?: number; + cmdstartts?: number; + termsize?: TermSize; startuperror?: string; exitts?: number; exitcode?: number; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 117bcb66a1..da212b49ef 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -129,6 +129,28 @@ func GetConnectedJobIds() []string { return connectedJobIds } +func ensureJobConnected(ctx context.Context, jobId string) (*waveobj.Job, error) { + job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + if err != nil { + return nil, fmt.Errorf("failed to get job: %w", err) + } + + isConnected, err := conncontroller.IsConnected(job.Connection) + if err != nil { + return nil, fmt.Errorf("error checking connection status: %w", err) + } + if !isConnected { + return nil, fmt.Errorf("connection %q is not connected", job.Connection) + } + + jobConnStatus := GetJobConnStatus(jobId) + if jobConnStatus != JobConnStatus_Connected { + return nil, fmt.Errorf("job is not connected (status: %s)", jobConnStatus) + } + + return job, nil +} + type StartJobParams struct { ConnName 
string Cmd string @@ -172,16 +194,15 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { } job := &waveobj.Job{ - OID: jobId, - Connection: params.ConnName, - Cmd: params.Cmd, - CmdArgs: params.Args, - CmdEnv: params.Env, - TermSize: *params.TermSize, - JobAuthToken: jobAuthToken, - Status: JobStatus_Init, - StartTs: time.Now().UnixMilli(), - Meta: make(waveobj.MetaMapType), + OID: jobId, + Connection: params.ConnName, + Cmd: params.Cmd, + CmdArgs: params.Args, + CmdEnv: params.Env, + TermSize: *params.TermSize, + JobAuthToken: jobAuthToken, + JobManagerStatus: JobStatus_Init, + Meta: make(waveobj.MetaMapType), } err = wstore.DBInsert(ctx, job) @@ -236,18 +257,19 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { log.Printf("[job:%s] RemoteStartJobCommand failed: %v", jobId, err) errMsg := fmt.Sprintf("failed to start job: %v", err) wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.Status = JobStatus_Error + job.JobManagerStatus = JobStatus_Error job.StartupError = errMsg }) return "", fmt.Errorf("failed to start remote job: %w", err) } - log.Printf("[job:%s] RemoteStartJobCommand succeeded, cmdpgid=%d jobmanagerpid=%d jobmanagerstartts=%d", jobId, rtnData.CmdPgid, rtnData.JobManagerPid, rtnData.JobManagerStartTs) + log.Printf("[job:%s] RemoteStartJobCommand succeeded, cmdpid=%d cmdstartts=%d jobmanagerpid=%d jobmanagerstartts=%d", jobId, rtnData.CmdPid, rtnData.CmdStartTs, rtnData.JobManagerPid, rtnData.JobManagerStartTs) err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.CmdPgid = rtnData.CmdPgid + job.CmdPid = rtnData.CmdPid + job.CmdStartTs = rtnData.CmdStartTs job.JobManagerPid = rtnData.JobManagerPid job.JobManagerStartTs = rtnData.JobManagerStartTs - job.Status = JobStatus_Running + job.JobManagerStatus = JobStatus_Running job.JobManagerRunning = true }) if err != nil { @@ -336,12 +358,12 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade func 
HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobExitedData) error { var finalStatus string err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.Status = JobStatus_Done + job.JobManagerStatus = JobStatus_Done job.ExitError = data.ExitErr job.ExitCode = data.ExitCode job.ExitSignal = data.ExitSignal job.ExitTs = data.ExitTs - finalStatus = job.Status + finalStatus = job.JobManagerStatus }) if err != nil { return fmt.Errorf("failed to update job exit status: %w", err) @@ -359,7 +381,7 @@ func tryExitJobManager(ctx context.Context, jobId string) { return } - jobExited := job.Status == JobStatus_Done || job.Status == JobStatus_Error || job.Status == JobStatus_Terminated + jobExited := job.JobManagerStatus == JobStatus_Done || job.JobManagerStatus == JobStatus_Error || job.JobManagerStatus == JobStatus_Terminated if !jobExited || !job.StreamDone { log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, jobExited, job.StreamDone) @@ -374,56 +396,15 @@ func tryExitJobManager(ctx context.Context, jobId string) { } } -func TerminateJob(ctx context.Context, jobId string) error { +func ExitJobManager(ctx context.Context, jobId string) error { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { return fmt.Errorf("failed to get job: %w", err) } - isConnected, err := conncontroller.IsConnected(job.Connection) - if err != nil { - return fmt.Errorf("error checking connection status: %w", err) - } - if !isConnected { - return fmt.Errorf("connection %q is not connected", job.Connection) - } - return remoteTerminateJobManager(ctx, job) } -func ExitJobManager(ctx context.Context, jobId string) error { - _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) - if err != nil { - return fmt.Errorf("failed to get job: %w", err) - } - - jobConnStatus := GetJobConnStatus(jobId) - if jobConnStatus != JobConnStatus_Connected { - return fmt.Errorf("job connection is not connected (status: %s)", jobConnStatus) - } 
- - bareRpc := wshclient.GetBareRpcClient() - rpcOpts := &wshrpc.RpcOpts{ - Route: wshutil.MakeJobRouteId(jobId), - Timeout: 5000, - } - - err = wshclient.JobManagerExitCommand(bareRpc, rpcOpts) - if err != nil { - return fmt.Errorf("failed to send exit command: %w", err) - } - - updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.JobManagerRunning = false - }) - if updateErr != nil { - log.Printf("[job:%s] error updating job manager running status: %v", jobId, updateErr) - } - - log.Printf("[job:%s] job manager exit command sent successfully", jobId) - return nil -} - func DisconnectJob(ctx context.Context, jobId string) error { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { @@ -471,9 +452,13 @@ func remoteTerminateJobManager(ctx context.Context, job *waveobj.Job) error { } updateErr := wstore.DBUpdateFn(ctx, job.OID, func(job *waveobj.Job) { - job.Status = JobStatus_Terminated + job.JobManagerStatus = JobStatus_Terminated job.JobManagerRunning = false job.TerminateOnReconnect = false + if !job.StreamDone { + job.StreamDone = true + job.StreamError = "job manager terminated" + } }) if updateErr != nil { log.Printf("[job:%s] error updating job status after termination: %v", job.OID, updateErr) @@ -488,11 +473,6 @@ func ReconnectJob(ctx context.Context, jobId string) error { if err != nil { return fmt.Errorf("failed to get job: %w", err) } - - if job.Connection == "" { - return fmt.Errorf("job has no connection") - } - isConnected, err := conncontroller.IsConnected(job.Connection) if err != nil { return fmt.Errorf("error checking connection status: %w", err) @@ -654,7 +634,7 @@ func RestartStreaming(ctx context.Context, jobId string, knownConnected bool) er if rtnData.HasExited { log.Printf("[job:%s] job has already exited: code=%d signal=%q err=%q", jobId, rtnData.ExitCode, rtnData.ExitSignal, rtnData.ExitErr) updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.Status = JobStatus_Done + 
job.JobManagerStatus = JobStatus_Done job.ExitCode = rtnData.ExitCode job.ExitSignal = rtnData.ExitSignal job.ExitError = rtnData.ExitErr @@ -794,15 +774,15 @@ func DetachJobFromBlock(ctx context.Context, jobId string, updateBlock bool) err func SendInput(ctx context.Context, data wshrpc.CommandJobInputData) error { jobId := data.JobId - _, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) + _, err := ensureJobConnected(ctx, jobId) if err != nil { - return fmt.Errorf("failed to get job: %w", err) + return err } rpcOpts := &wshrpc.RpcOpts{ Route: wshutil.MakeJobRouteId(jobId), Timeout: 5000, - NoResponse: true, + NoResponse: false, } bareRpc := wshclient.GetBareRpcClient() @@ -811,5 +791,14 @@ func SendInput(ctx context.Context, data wshrpc.CommandJobInputData) error { return fmt.Errorf("failed to send input to job: %w", err) } + if data.TermSize != nil { + err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { + job.TermSize = *data.TermSize + }) + if err != nil { + log.Printf("[job:%s] warning: failed to update termsize in DB: %v", jobId, err) + } + } + return nil } diff --git a/pkg/jobmanager/mainserverconn.go b/pkg/jobmanager/mainserverconn.go index 3af89fca20..8f10eed20c 100644 --- a/pkg/jobmanager/mainserverconn.go +++ b/pkg/jobmanager/mainserverconn.go @@ -11,7 +11,6 @@ import ( "os" "sync" "sync/atomic" - "time" "github.com/shirou/gopsutil/v4/process" "github.com/wavetermdev/waveterm/pkg/baseds" @@ -169,27 +168,34 @@ func (msc *MainServerConn) StartJobCommand(ctx context.Context, data wshrpc.Comm log.Printf("StartJobCommand: cmd or process is nil") return nil, fmt.Errorf("cmd or process is nil") } - cmdPgid, err := getProcessGroupId(cmd.Process.Pid) + cmdPid := cmd.Process.Pid + cmdProc, err := process.NewProcess(int32(cmdPid)) if err != nil { - log.Printf("StartJobCommand: failed to get pgid: %v", err) - return nil, fmt.Errorf("failed to get process group id: %w", err) + log.Printf("StartJobCommand: failed to get cmd process: %v", err) + return nil, 
fmt.Errorf("failed to get cmd process: %w", err) + } + cmdStartTs, err := cmdProc.CreateTime() + if err != nil { + log.Printf("StartJobCommand: failed to get cmd start time: %v", err) + return nil, fmt.Errorf("failed to get cmd start time: %w", err) } jobManagerPid := os.Getpid() - proc, err := process.NewProcess(int32(jobManagerPid)) + jobManagerProc, err := process.NewProcess(int32(jobManagerPid)) if err != nil { log.Printf("StartJobCommand: failed to get job manager process: %v", err) return nil, fmt.Errorf("failed to get job manager process: %w", err) } - jobManagerStartTs, err := proc.CreateTime() + jobManagerStartTs, err := jobManagerProc.CreateTime() if err != nil { log.Printf("StartJobCommand: failed to get job manager start time: %v", err) return nil, fmt.Errorf("failed to get job manager start time: %w", err) } - log.Printf("StartJobCommand: job started successfully cmdPid=%d cmdPgid=%d jobManagerPid=%d jobManagerStartTs=%d", cmd.Process.Pid, cmdPgid, jobManagerPid, jobManagerStartTs) + log.Printf("StartJobCommand: job started successfully cmdPid=%d cmdStartTs=%d jobManagerPid=%d jobManagerStartTs=%d", cmdPid, cmdStartTs, jobManagerPid, jobManagerStartTs) return &wshrpc.CommandStartJobRtnData{ - CmdPgid: cmdPgid, + CmdPid: cmdPid, + CmdStartTs: cmdStartTs, JobManagerPid: jobManagerPid, JobManagerStartTs: jobManagerStartTs, }, nil @@ -270,21 +276,6 @@ func (msc *MainServerConn) JobStartStreamCommand(ctx context.Context, data wshrp return nil } -func (msc *MainServerConn) JobTerminateCommand(ctx context.Context, data wshrpc.CommandJobTerminateData) error { - WshCmdJobManager.lock.Lock() - defer WshCmdJobManager.lock.Unlock() - - if !msc.PeerAuthenticated.Load() { - return fmt.Errorf("not authenticated") - } - if WshCmdJobManager.Cmd == nil { - return fmt.Errorf("job not started") - } - log.Printf("JobTerminate called\n") - WshCmdJobManager.Cmd.TerminateByClosingPtyMaster() - return nil -} - func (msc *MainServerConn) JobInputCommand(ctx context.Context, 
data wshrpc.CommandJobInputData) error { WshCmdJobManager.lock.Lock() defer WshCmdJobManager.lock.Unlock() @@ -299,14 +290,3 @@ func (msc *MainServerConn) JobInputCommand(ctx context.Context, data wshrpc.Comm return WshCmdJobManager.Cmd.HandleInput(data) } -func (msc *MainServerConn) JobManagerExitCommand(ctx context.Context) error { - if !msc.PeerAuthenticated.Load() { - return fmt.Errorf("not authenticated") - } - log.Printf("JobManagerExit called, terminating job manager\n") - go func() { - time.Sleep(500 * time.Millisecond) - os.Exit(0) - }() - return nil -} diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 39e06a7383..57deae5a70 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -323,16 +323,19 @@ type Job struct { JobAuthToken string `json:"jobauthtoken"` // job manger -> wave AttachedBlockId string `json:"ownerblockid"` + JobManagerStatus string `json:"status"` // init, running, done, error, terminated + // cmd/process runtime info - CmdPgid int `json:"cmdpgid"` // command process group id - TermSize TermSize `json:"termsize,omitempty"` - StartTs int64 `json:"startts,omitempty"` // timestamp (milliseconds) - Status string `json:"status"` // init, running, done - StartupError string `json:"startuperror,omitempty"` - ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) - ExitCode int `json:"exitcode,omitempty"` - ExitSignal string `json:"exitsignal,omitempty"` - ExitError string `json:"exiterror,omitempty"` + CmdPid int `json:"cmdpid,omitempty"` // command process id + CmdStartTs int64 `json:"cmdstartts,omitempty"` // command process start time (milliseconds from epoch) + + TermSize TermSize `json:"termsize,omitempty"` + + StartupError string `json:"startuperror,omitempty"` + ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) + ExitCode int `json:"exitcode,omitempty"` + ExitSignal string `json:"exitsignal,omitempty"` + ExitError string `json:"exiterror,omitempty"` // reconnect option (e.g. 
orphaned, so we need to kill on connect) TerminateOnReconnect bool `json:"terminateonreconnect,omitempty"` diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 937d054ab3..bebf0f06c4 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -524,12 +524,6 @@ func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobContr return resp, err } -// command "jobcontrollerterminatejob", wshserver.JobControllerTerminateJobCommand -func JobControllerTerminateJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerterminatejob", data, opts) - return err -} - // command "jobdebugdelete", wshserver.JobDebugDeleteCommand func JobDebugDeleteCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobdebugdelete", data, opts) @@ -554,12 +548,6 @@ func JobInputCommand(w *wshutil.WshRpc, data wshrpc.CommandJobInputData, opts *w return err } -// command "jobmanagerexit", wshserver.JobManagerExitCommand -func JobManagerExitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobmanagerexit", nil, opts) - return err -} - // command "jobprepareconnect", wshserver.JobPrepareConnectCommand func JobPrepareConnectCommand(w *wshutil.WshRpc, data wshrpc.CommandJobPrepareConnectData, opts *wshrpc.RpcOpts) (*wshrpc.CommandJobConnectRtnData, error) { resp, err := sendRpcRequestCallHelper[*wshrpc.CommandJobConnectRtnData](w, "jobprepareconnect", data, opts) @@ -572,12 +560,6 @@ func JobStartStreamCommand(w *wshutil.WshRpc, data wshrpc.CommandJobStartStreamD return err } -// command "jobterminate", wshserver.JobTerminateCommand -func JobTerminateCommand(w *wshutil.WshRpc, data wshrpc.CommandJobTerminateData, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobterminate", data, opts) - return err -} - // 
command "listallappfiles", wshserver.ListAllAppFilesCommand func ListAllAppFilesCommand(w *wshutil.WshRpc, data wshrpc.CommandListAllAppFilesData, opts *wshrpc.RpcOpts) (*wshrpc.CommandListAllAppFilesRtnData, error) { resp, err := sendRpcRequestCallHelper[*wshrpc.CommandListAllAppFilesRtnData](w, "listallappfiles", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 92b37b1ac8..3e2c85f5e8 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -172,14 +172,11 @@ type WshRpcInterface interface { StartJobCommand(ctx context.Context, data CommandStartJobData) (*CommandStartJobRtnData, error) JobPrepareConnectCommand(ctx context.Context, data CommandJobPrepareConnectData) (*CommandJobConnectRtnData, error) JobStartStreamCommand(ctx context.Context, data CommandJobStartStreamData) error - JobTerminateCommand(ctx context.Context, data CommandJobTerminateData) error JobInputCommand(ctx context.Context, data CommandJobInputData) error JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job manager => main server - JobManagerExitCommand(ctx context.Context) error JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) JobDebugDeleteCommand(ctx context.Context, jobId string) error JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) - JobControllerTerminateJobCommand(ctx context.Context, jobId string) error JobControllerExitJobCommand(ctx context.Context, jobId string) error JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error JobControllerReconnectJobCommand(ctx context.Context, jobId string) error @@ -752,7 +749,8 @@ type CommandRemoteTerminateJobManagerData struct { } type CommandStartJobRtnData struct { - CmdPgid int `json:"cmdpgid"` + CmdPid int `json:"cmdpid"` + CmdStartTs int64 `json:"cmdstartts"` JobManagerPid int `json:"jobmanagerpid"` JobManagerStartTs int64 `json:"jobmanagerstartts"` } @@ 
-775,9 +773,6 @@ type CommandJobConnectRtnData struct { ExitErr string `json:"exiterr,omitempty"` } -type CommandJobTerminateData struct { -} - type CommandJobExitedData struct { JobId string `json:"jobid"` ExitCode int `json:"exitcode"` diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 9f7e4054f3..27d45434c4 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1470,10 +1470,6 @@ func (ws *WshServer) JobControllerStartJobCommand(ctx context.Context, data wshr return jobcontroller.StartJob(ctx, params) } -func (ws *WshServer) JobControllerTerminateJobCommand(ctx context.Context, jobId string) error { - return jobcontroller.TerminateJob(ctx, jobId) -} - func (ws *WshServer) JobControllerExitJobCommand(ctx context.Context, jobId string) error { return jobcontroller.ExitJobManager(ctx, jobId) } From 2968757ef320bb652b9b3480f38908a780dd073c Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 16:04:44 -0800 Subject: [PATCH 50/64] more updates, tightening things up --- cmd/wsh/cmd/wshcmd-jobdebug.go | 10 ++--- frontend/types/gotypes.d.ts | 18 ++++---- pkg/jobcontroller/jobcontroller.go | 20 ++++----- pkg/jobmanager/jobcmd.go | 3 ++ pkg/jobmanager/jobmanager_unix.go | 60 +++------------------------ pkg/jobmanager/jobmanager_windows.go | 3 ++ pkg/waveobj/wtype.go | 31 +++++++------- pkg/wshrpc/wshremote/wshremote_job.go | 10 ++--- 8 files changed, 53 insertions(+), 102 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 6093d3f1fc..86b8c7b82b 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -201,14 +201,14 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { exitCode := "-" if job.JobManagerStatus != "running" && job.JobManagerStatus != "init" { - exitCode = fmt.Sprintf("%d", job.ExitCode) + exitCode = fmt.Sprintf("%d", job.CmdExitCode) } errorStr := "" - if job.StartupError != "" { - errorStr = 
fmt.Sprintf("%q", job.StartupError) - } else if job.ExitError != "" { - errorStr = fmt.Sprintf("%q", job.ExitError) + if job.JobManagerStartupError != "" { + errorStr = fmt.Sprintf("%q", job.JobManagerStartupError) + } else if job.CmdExitError != "" { + errorStr = fmt.Sprintf("%q", job.CmdExitError) } fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index c04587dd8d..b2dd3e47a6 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -920,19 +920,19 @@ declare global { cmdenv?: {[key: string]: string}; jobauthtoken: string; ownerblockid: string; - status: string; - cmdpid?: number; - cmdstartts?: number; - termsize?: TermSize; - startuperror?: string; - exitts?: number; - exitcode?: number; - exitsignal?: string; - exiterror?: string; terminateonreconnect?: boolean; + jobmanagerstatus: string; + jobmanagerstartuperror?: string; jobmanagerrunning?: boolean; jobmanagerpid?: number; jobmanagerstartts?: number; + cmdpid?: number; + cmdstartts?: number; + cmdtermsize: TermSize; + cmdexitts?: number; + cmdexitcode?: number; + cmdexitsignal?: string; + cmdexiterror?: string; streamdone?: boolean; streamerror?: string; }; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index da212b49ef..9afaa2b07b 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -199,7 +199,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { Cmd: params.Cmd, CmdArgs: params.Args, CmdEnv: params.Env, - TermSize: *params.TermSize, + CmdTermSize: *params.TermSize, JobAuthToken: jobAuthToken, JobManagerStatus: JobStatus_Init, Meta: make(waveobj.MetaMapType), @@ -258,7 +258,7 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { errMsg := fmt.Sprintf("failed to start job: %v", err) wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.JobManagerStatus = JobStatus_Error 
- job.StartupError = errMsg + job.JobManagerStartupError = errMsg }) return "", fmt.Errorf("failed to start remote job: %w", err) } @@ -359,10 +359,10 @@ func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobEx var finalStatus string err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.JobManagerStatus = JobStatus_Done - job.ExitError = data.ExitErr - job.ExitCode = data.ExitCode - job.ExitSignal = data.ExitSignal - job.ExitTs = data.ExitTs + job.CmdExitError = data.ExitErr + job.CmdExitCode = data.ExitCode + job.CmdExitSignal = data.ExitSignal + job.CmdExitTs = data.ExitTs finalStatus = job.JobManagerStatus }) if err != nil { @@ -635,9 +635,9 @@ func RestartStreaming(ctx context.Context, jobId string, knownConnected bool) er log.Printf("[job:%s] job has already exited: code=%d signal=%q err=%q", jobId, rtnData.ExitCode, rtnData.ExitSignal, rtnData.ExitErr) updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.JobManagerStatus = JobStatus_Done - job.ExitCode = rtnData.ExitCode - job.ExitSignal = rtnData.ExitSignal - job.ExitError = rtnData.ExitErr + job.CmdExitCode = rtnData.ExitCode + job.CmdExitSignal = rtnData.ExitSignal + job.CmdExitError = rtnData.ExitErr }) if updateErr != nil { log.Printf("[job:%s] error updating job exit status: %v", jobId, updateErr) @@ -793,7 +793,7 @@ func SendInput(ctx context.Context, data wshrpc.CommandJobInputData) error { if data.TermSize != nil { err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.TermSize = *data.TermSize + job.CmdTermSize = *data.TermSize }) if err != nil { log.Printf("[job:%s] warning: failed to update termsize in DB: %v", jobId, err) diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index c4a5f97778..8e2754d723 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -30,6 +30,7 @@ type JobCmd struct { lock sync.Mutex cmd *exec.Cmd cmdPty pty.Pty + ptsName string cleanedUp bool ptyClosed bool processExited bool 
@@ -61,8 +62,10 @@ func MakeJobCmd(jobId string, cmdDef CmdDef) (*JobCmd, error) { if err != nil { return nil, fmt.Errorf("failed to start command: %w", err) } + setCloseOnExec(int(cmdPty.Fd())) jm.cmd = ecmd jm.cmdPty = cmdPty + jm.ptsName = jm.cmdPty.Name() go jm.waitForProcess() return jm, nil } diff --git a/pkg/jobmanager/jobmanager_unix.go b/pkg/jobmanager/jobmanager_unix.go index f8a895a2af..bddbea3987 100644 --- a/pkg/jobmanager/jobmanager_unix.go +++ b/pkg/jobmanager/jobmanager_unix.go @@ -12,7 +12,6 @@ import ( "os/signal" "strings" "syscall" - "time" "golang.org/x/sys/unix" ) @@ -59,7 +58,7 @@ func daemonize(clientId string, jobId string) error { return fmt.Errorf("failed to setsid: %w", err) } - devNull, err := os.OpenFile("/dev/null", os.O_RDONLY, 0) + devNull, err := os.OpenFile("/dev/null", os.O_RDWR, 0) if err != nil { return fmt.Errorf("failed to open /dev/null: %w", err) } @@ -86,60 +85,11 @@ func daemonize(clientId string, jobId string) error { log.SetOutput(logFile) log.Printf("job manager daemonized, logging to %s\n", logPath) - setupJobManagerSignalHandlers() - return nil -} + signal.Ignore(syscall.SIGHUP) -func handleSIGHUP() { - cmd := WshCmdJobManager.GetCmd() - if cmd != nil { - log.Printf("handling SIGHUP, closing pty master\n") - cmd.TerminateByClosingPtyMaster() - } - go func() { - log.Printf("received SIGHUP, will exit") - time.Sleep(500 * time.Millisecond) - log.Printf("terminating job manager\n") - os.Exit(0) - }() + return nil } -func setupJobManagerSignalHandlers() { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) - - go func() { - for sig := range sigChan { - log.Printf("job manager received signal: %v\n", sig) - - if sig == syscall.SIGHUP { - handleSIGHUP() - continue - } - - cmd := WshCmdJobManager.GetCmd() - if cmd != nil { - pgid, err := cmd.GetPGID() - if err == nil { - if s, ok := sig.(syscall.Signal); ok { - log.Printf("forwarding signal %v to process group 
%d\n", sig, pgid) - _ = syscall.Kill(-pgid, s) - } else { - log.Printf("signal is not a syscall.Signal: %T\n", sig) - } - } else { - log.Printf("failed to get pgid: %v\n", err) - } - } - - if sig == syscall.SIGTERM { - if cmd != nil { - log.Printf("received SIGTERM, will exit\n") - time.Sleep(500 * time.Millisecond) - } - log.Printf("terminating job manager\n") - os.Exit(0) - } - } - }() +func setCloseOnExec(fd int) { + unix.CloseOnExec(fd) } diff --git a/pkg/jobmanager/jobmanager_windows.go b/pkg/jobmanager/jobmanager_windows.go index 1806c9f4fd..356bfcb66e 100644 --- a/pkg/jobmanager/jobmanager_windows.go +++ b/pkg/jobmanager/jobmanager_windows.go @@ -24,3 +24,6 @@ func daemonize(clientId string, jobId string) error { func setupJobManagerSignalHandlers() { } + +func setCloseOnExec(fd int) { +} diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 57deae5a70..80f773ec67 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -323,27 +323,24 @@ type Job struct { JobAuthToken string `json:"jobauthtoken"` // job manger -> wave AttachedBlockId string `json:"ownerblockid"` - JobManagerStatus string `json:"status"` // init, running, done, error, terminated - - // cmd/process runtime info - CmdPid int `json:"cmdpid,omitempty"` // command process id - CmdStartTs int64 `json:"cmdstartts,omitempty"` // command process start time (milliseconds from epoch) - - TermSize TermSize `json:"termsize,omitempty"` - - StartupError string `json:"startuperror,omitempty"` - ExitTs int64 `json:"exitts,omitempty"` // timestamp (milliseconds) - ExitCode int `json:"exitcode,omitempty"` - ExitSignal string `json:"exitsignal,omitempty"` - ExitError string `json:"exiterror,omitempty"` - // reconnect option (e.g. 
orphaned, so we need to kill on connect) TerminateOnReconnect bool `json:"terminateonreconnect,omitempty"` // job manager state - JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` - JobManagerPid int `json:"jobmanagerpid,omitempty"` - JobManagerStartTs int64 `json:"jobmanagerstartts,omitempty"` + JobManagerStatus string `json:"jobmanagerstatus"` // init, running, done, error, terminated + JobManagerStartupError string `json:"jobmanagerstartuperror,omitempty"` + JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` + JobManagerPid int `json:"jobmanagerpid,omitempty"` + JobManagerStartTs int64 `json:"jobmanagerstartts,omitempty"` // exact process start time (milliseconds) + + // cmd/process runtime info + CmdPid int `json:"cmdpid,omitempty"` // command process id + CmdStartTs int64 `json:"cmdstartts,omitempty"` // exact command process start time (milliseconds from epoch) + CmdTermSize TermSize `json:"cmdtermsize"` + CmdExitTs int64 `json:"cmdexitts,omitempty"` // timestamp (milliseconds) + CmdExitCode int `json:"cmdexitcode,omitempty"` + CmdExitSignal string `json:"cmdexitsignal,omitempty"` + CmdExitError string `json:"cmdexiterror,omitempty"` // output info StreamDone bool `json:"streamdone,omitempty"` diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index 934b793c6d..efbd681b11 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -331,7 +331,6 @@ func (impl *ServerImpl) RemoteDisconnectFromJobManagerCommand(ctx context.Contex func (impl *ServerImpl) RemoteTerminateJobManagerCommand(ctx context.Context, data wshrpc.CommandRemoteTerminateJobManagerData) error { log.Printf("RemoteTerminateJobManagerCommand: terminating job manager, jobid=%s, pid=%d\n", data.JobId, data.JobManagerPid) - proc, err := isProcessRunning(data.JobManagerPid, data.JobManagerStartTs) if err != nil { return fmt.Errorf("error checking job manager process: %w", err) @@ -340,12 +339,11 @@ func 
(impl *ServerImpl) RemoteTerminateJobManagerCommand(ctx context.Context, da log.Printf("RemoteTerminateJobManagerCommand: job manager process not running, jobid=%s\n", data.JobId) return nil } - - err = proc.SendSignal(syscall.SIGHUP) + err = proc.SendSignal(syscall.SIGTERM) if err != nil { - return fmt.Errorf("failed to send SIGHUP to job manager: %w", err) + log.Printf("failed to send SIGTERM to job manager: %v", err) + } else { + log.Printf("RemoteTerminateJobManagerCommand: sent SIGTERM to job manager process, jobid=%s, pid=%d\n", data.JobId, data.JobManagerPid) } - - log.Printf("RemoteTerminateJobManagerCommand: sent SIGHUP to job manager process, jobid=%s, pid=%d\n", data.JobId, data.JobManagerPid) return nil } From f483addce2c3e63ab9babc0eb4156dc461a76e83 Mon Sep 17 00:00:00 2001 From: sawka Date: Tue, 20 Jan 2026 17:11:31 -0800 Subject: [PATCH 51/64] clean up more semantics --- cmd/wsh/cmd/wshcmd-jobdebug.go | 42 +++++++-------- frontend/app/store/wshclientapi.ts | 30 +++++------ frontend/types/gotypes.d.ts | 22 ++++---- pkg/jobcontroller/jobcontroller.go | 74 ++++++++++++++++----------- pkg/jobmanager/jobcmd.go | 21 +++++--- pkg/jobmanager/jobmanager.go | 8 ++- pkg/waveobj/wtype.go | 10 ++-- pkg/wshrpc/wshclient/wshclient.go | 36 ++++++------- pkg/wshrpc/wshremote/wshremote_job.go | 6 +-- pkg/wshrpc/wshrpctypes.go | 20 ++++---- pkg/wshrpc/wshserver/wshserver.go | 10 ++-- 11 files changed, 149 insertions(+), 130 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 86b8c7b82b..d9b6275364 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -152,7 +152,7 @@ func init() { } func jobDebugListRun(cmd *cobra.Command, args []string) error { - rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + rtnData, err := wshclient.JobControllerListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { return fmt.Errorf("getting job debug list: %w", err) 
} @@ -178,18 +178,13 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return nil } - fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", "OID", "Connection", "Connected", "Manager", "Cmd", "Status", "Stream", "ExitCode", "Error") + fmt.Printf("%-36s %-20s %-9s %-10s %-30s %-8s %-10s\n", "OID", "Connection", "Connected", "Manager", "Cmd", "ExitCode", "Stream") for _, job := range rtnData { connectedStatus := "no" if connectedMap[job.OID] { connectedStatus = "yes" } - managerStatus := "no" - if job.JobManagerRunning { - managerStatus = "yes" - } - streamStatus := "-" if job.StreamDone { if job.StreamError == "" { @@ -200,25 +195,24 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { } exitCode := "-" - if job.JobManagerStatus != "running" && job.JobManagerStatus != "init" { - exitCode = fmt.Sprintf("%d", job.CmdExitCode) - } - - errorStr := "" - if job.JobManagerStartupError != "" { - errorStr = fmt.Sprintf("%q", job.JobManagerStartupError) - } else if job.CmdExitError != "" { - errorStr = fmt.Sprintf("%q", job.CmdExitError) + if job.CmdExitTs > 0 { + if job.CmdExitCode != nil { + exitCode = fmt.Sprintf("%d", *job.CmdExitCode) + } else if job.CmdExitSignal != "" { + exitCode = job.CmdExitSignal + } else { + exitCode = "?" 
+ } } - fmt.Printf("%-36s %-20s %-9s %-7s %-30s %-10s %-10s %-8s %s\n", - job.OID, job.Connection, connectedStatus, managerStatus, job.Cmd, job.JobManagerStatus, streamStatus, exitCode, errorStr) + fmt.Printf("%-36s %-20s %-9s %-10s %-30s %-8s %-10s\n", + job.OID, job.Connection, connectedStatus, job.JobManagerStatus, job.Cmd, exitCode, streamStatus) } return nil } func jobDebugDeleteRun(cmd *cobra.Command, args []string) error { - err := wshclient.JobDebugDeleteCommand(RpcClient, jobIdFlag, &wshrpc.RpcOpts{Timeout: 5000}) + err := wshclient.JobControllerDeleteJobCommand(RpcClient, jobIdFlag, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { return fmt.Errorf("deleting job: %w", err) } @@ -228,7 +222,7 @@ func jobDebugDeleteRun(cmd *cobra.Command, args []string) error { } func jobDebugDeleteAllRun(cmd *cobra.Command, args []string) error { - rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + rtnData, err := wshclient.JobControllerListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { return fmt.Errorf("getting job debug list: %w", err) } @@ -240,7 +234,7 @@ func jobDebugDeleteAllRun(cmd *cobra.Command, args []string) error { deletedCount := 0 for _, job := range rtnData { - err := wshclient.JobDebugDeleteCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) + err := wshclient.JobControllerDeleteJobCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { fmt.Printf("Error deleting job %s: %v\n", job.OID, err) } else { @@ -253,7 +247,7 @@ func jobDebugDeleteAllRun(cmd *cobra.Command, args []string) error { } func jobDebugPruneRun(cmd *cobra.Command, args []string) error { - rtnData, err := wshclient.JobDebugListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) + rtnData, err := wshclient.JobControllerListCommand(RpcClient, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { return fmt.Errorf("getting job debug list: %w", err) } @@ -265,8 +259,8 @@ func jobDebugPruneRun(cmd *cobra.Command, 
args []string) error { deletedCount := 0 for _, job := range rtnData { - if !job.JobManagerRunning { - err := wshclient.JobDebugDeleteCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) + if job.JobManagerStatus != "running" { + err := wshclient.JobControllerDeleteJobCommand(RpcClient, job.OID, &wshrpc.RpcOpts{Timeout: 5000}) if err != nil { fmt.Printf("Error deleting job %s: %v\n", job.OID, err) } else { diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 0317c8bf47..87409f1866 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -392,6 +392,11 @@ class RpcApiType { return client.wshRpcCall("getwaveairatelimit", null, opts); } + // command "jobcmdexited" [call] + JobCmdExitedCommand(client: WshClient, data: CommandJobCmdExitedData, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcmdexited", data, opts); + } + // command "jobcontrollerattachjob" [call] JobControllerAttachJobCommand(client: WshClient, data: CommandJobControllerAttachJobData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerattachjob", data, opts); @@ -402,6 +407,11 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerconnectedjobs", null, opts); } + // command "jobcontrollerdeletejob" [call] + JobControllerDeleteJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerdeletejob", data, opts); + } + // command "jobcontrollerdetachjob" [call] JobControllerDetachJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerdetachjob", data, opts); @@ -417,6 +427,11 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerexitjob", data, opts); } + // command "jobcontrollerlist" [call] + JobControllerListCommand(client: WshClient, opts?: RpcOpts): Promise { + return client.wshRpcCall("jobcontrollerlist", null, opts); + } + // command "jobcontrollerreconnectjob" [call] 
JobControllerReconnectJobCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("jobcontrollerreconnectjob", data, opts); @@ -432,21 +447,6 @@ class RpcApiType { return client.wshRpcCall("jobcontrollerstartjob", data, opts); } - // command "jobdebugdelete" [call] - JobDebugDeleteCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobdebugdelete", data, opts); - } - - // command "jobdebuglist" [call] - JobDebugListCommand(client: WshClient, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobdebuglist", null, opts); - } - - // command "jobexited" [call] - JobExitedCommand(client: WshClient, data: CommandJobExitedData, opts?: RpcOpts): Promise { - return client.wshRpcCall("jobexited", data, opts); - } - // command "jobinput" [call] JobInputCommand(client: WshClient, data: CommandJobInputData, opts?: RpcOpts): Promise { return client.wshRpcCall("jobinput", data, opts); diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index b2dd3e47a6..eca2305bf8 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -349,6 +349,15 @@ declare global { chatid: string; }; + // wshrpc.CommandJobCmdExitedData + type CommandJobCmdExitedData = { + jobid: string; + exitcode: number; + exitsignal?: string; + exiterr?: string; + exitts?: number; + }; + // wshrpc.CommandJobConnectRtnData type CommandJobConnectRtnData = { seq: number; @@ -375,15 +384,6 @@ declare global { termsize?: TermSize; }; - // wshrpc.CommandJobExitedData - type CommandJobExitedData = { - jobid: string; - exitcode: number; - exitsignal?: string; - exiterr?: string; - exitts?: number; - }; - // wshrpc.CommandJobInputData type CommandJobInputData = { jobid: string; @@ -484,7 +484,7 @@ declare global { // wshrpc.CommandRemoteReconnectToJobManagerRtnData type CommandRemoteReconnectToJobManagerRtnData = { success: boolean; - jobmanagerexited: boolean; + jobmanagergone: boolean; error?: string; }; 
@@ -922,8 +922,8 @@ declare global { ownerblockid: string; terminateonreconnect?: boolean; jobmanagerstatus: string; + jobmanagerdonereason?: string; jobmanagerstartuperror?: string; - jobmanagerrunning?: boolean; jobmanagerpid?: number; jobmanagerstartts?: number; cmdpid?: number; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index 9afaa2b07b..f984a6a12b 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -29,11 +29,15 @@ import ( ) const ( - JobStatus_Init = "init" - JobStatus_Running = "running" - JobStatus_Done = "done" // natural exit (managed by job manager, command completed) - JobStatus_Error = "error" // failed to start or unmanaged failure - JobStatus_Terminated = "terminated" // explicitly killed via terminate command + JobStatus_Init = "init" + JobStatus_Running = "running" + JobStatus_Done = "done" +) + +const ( + JobDoneReason_StartupError = "startuperror" + JobDoneReason_Gone = "gone" + JobDoneReason_Terminated = "terminated" ) const ( @@ -46,6 +50,10 @@ const DefaultStreamRwnd = 64 * 1024 const MetaKey_TotalGap = "totalgap" const JobOutputFileName = "term" +func isJobManagerRunning(job *waveobj.Job) bool { + return job.JobManagerStatus == JobStatus_Running +} + var ( jobConnStates = make(map[string]string) jobConnStatesLock sync.Mutex @@ -257,7 +265,8 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { log.Printf("[job:%s] RemoteStartJobCommand failed: %v", jobId, err) errMsg := fmt.Sprintf("failed to start job: %v", err) wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.JobManagerStatus = JobStatus_Error + job.JobManagerStatus = JobStatus_Done + job.JobManagerDoneReason = JobDoneReason_StartupError job.JobManagerStartupError = errMsg }) return "", fmt.Errorf("failed to start remote job: %w", err) @@ -270,7 +279,6 @@ func StartJob(ctx context.Context, params StartJobParams) (string, error) { job.JobManagerPid = rtnData.JobManagerPid 
job.JobManagerStartTs = rtnData.JobManagerStartTs job.JobManagerStatus = JobStatus_Running - job.JobManagerRunning = true }) if err != nil { log.Printf("[job:%s] warning: failed to update job status to running: %v", jobId, err) @@ -335,7 +343,7 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if updateErr != nil { log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr) } - tryExitJobManager(ctx, jobId) + tryTerminateJobManager(ctx, jobId) break } @@ -349,54 +357,53 @@ func runOutputLoop(ctx context.Context, jobId string, reader *streamclient.Reade if updateErr != nil { log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr) } - tryExitJobManager(ctx, jobId) + tryTerminateJobManager(ctx, jobId) break } } } -func HandleJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobExitedData) error { - var finalStatus string +func HandleCmdJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobCmdExitedData) error { err := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.JobManagerStatus = JobStatus_Done job.CmdExitError = data.ExitErr job.CmdExitCode = data.ExitCode job.CmdExitSignal = data.ExitSignal job.CmdExitTs = data.ExitTs - finalStatus = job.JobManagerStatus }) if err != nil { return fmt.Errorf("failed to update job exit status: %w", err) } - - log.Printf("[job:%s] exited with code:%d signal:%q status:%s", jobId, data.ExitCode, data.ExitSignal, finalStatus) - tryExitJobManager(ctx, jobId) + tryTerminateJobManager(ctx, jobId) return nil } -func tryExitJobManager(ctx context.Context, jobId string) { +func tryTerminateJobManager(ctx context.Context, jobId string) { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { log.Printf("[job:%s] error getting job for termination check: %v", jobId, err) return } - jobExited := job.JobManagerStatus == JobStatus_Done || job.JobManagerStatus == JobStatus_Error || job.JobManagerStatus == 
JobStatus_Terminated + if job.JobManagerStatus != JobStatus_Running { + return + } + + cmdExited := job.CmdExitTs != 0 - if !jobExited || !job.StreamDone { - log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, jobExited, job.StreamDone) + if !cmdExited || !job.StreamDone { + log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, cmdExited, job.StreamDone) return } - log.Printf("[job:%s] both job exited and stream finished, exiting job manager", jobId) + log.Printf("[job:%s] both job cmd exited and stream finished, terminating job manager", jobId) - err = ExitJobManager(ctx, jobId) + err = TerminateJobManager(ctx, jobId) if err != nil { - log.Printf("[job:%s] error exiting job manager: %v", jobId, err) + log.Printf("[job:%s] error terminating job manager: %v", jobId, err) } } -func ExitJobManager(ctx context.Context, jobId string) error { +func TerminateJobManager(ctx context.Context, jobId string) error { job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId) if err != nil { return fmt.Errorf("failed to get job: %w", err) @@ -452,8 +459,8 @@ func remoteTerminateJobManager(ctx context.Context, job *waveobj.Job) error { } updateErr := wstore.DBUpdateFn(ctx, job.OID, func(job *waveobj.Job) { - job.JobManagerStatus = JobStatus_Terminated - job.JobManagerRunning = false + job.JobManagerStatus = JobStatus_Done + job.JobManagerDoneReason = JobDoneReason_Terminated job.TerminateOnReconnect = false if !job.StreamDone { job.StreamDone = true @@ -518,9 +525,10 @@ func ReconnectJob(ctx context.Context, jobId string) error { if !rtnData.Success { log.Printf("[job:%s] RemoteReconnectToJobManagerCommand returned error: %s", jobId, rtnData.Error) - if rtnData.JobManagerExited { + if rtnData.JobManagerGone { updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { - job.JobManagerRunning = false + job.JobManagerStatus = JobStatus_Done + job.JobManagerDoneReason = JobDoneReason_Gone }) if updateErr != nil { 
log.Printf("[job:%s] error updating job manager running status: %v", jobId, updateErr) @@ -560,7 +568,7 @@ func ReconnectJobsForConn(ctx context.Context, connName string) error { var jobsToReconnect []*waveobj.Job for _, job := range allJobs { - if job.Connection == connName && job.JobManagerRunning { + if job.Connection == connName && isJobManagerRunning(job) { jobsToReconnect = append(jobsToReconnect, job) } } @@ -632,7 +640,11 @@ func RestartStreaming(ctx context.Context, jobId string, knownConnected bool) er } if rtnData.HasExited { - log.Printf("[job:%s] job has already exited: code=%d signal=%q err=%q", jobId, rtnData.ExitCode, rtnData.ExitSignal, rtnData.ExitErr) + exitCodeStr := "nil" + if rtnData.ExitCode != nil { + exitCodeStr = fmt.Sprintf("%d", *rtnData.ExitCode) + } + log.Printf("[job:%s] job has already exited: code=%s signal=%q err=%q", jobId, exitCodeStr, rtnData.ExitSignal, rtnData.ExitErr) updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) { job.JobManagerStatus = JobStatus_Done job.CmdExitCode = rtnData.ExitCode @@ -662,7 +674,7 @@ func RestartStreaming(ctx context.Context, jobId string, knownConnected bool) er if rtnData.StreamDone && rtnData.HasExited { reader.Close() log.Printf("[job:%s] both stream done and job exited, calling tryExitJobManager", jobId) - tryExitJobManager(ctx, jobId) + tryTerminateJobManager(ctx, jobId) return nil } diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index 8e2754d723..bd647473bb 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -34,7 +34,7 @@ type JobCmd struct { cleanedUp bool ptyClosed bool processExited bool - exitCode int + exitCode *int exitSignal string exitErr error exitTs int64 @@ -86,16 +86,23 @@ func (jm *JobCmd) waitForProcess() { if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { if status.Signaled() { jm.exitSignal = status.Signal().String() - jm.exitCode = -1 + } else if status.Exited() { + code := status.ExitStatus() + jm.exitCode = 
&code } else { - jm.exitCode = status.ExitStatus() + log.Printf("Invalid WaitStatus, not exited or signaled: %v", status) } } } } else { - jm.exitCode = 0 + code := 0 + jm.exitCode = &code } - log.Printf("process exited: exitcode=%d, signal=%s, err=%v\n", jm.exitCode, jm.exitSignal, jm.exitErr) + exitCodeStr := "nil" + if jm.exitCode != nil { + exitCodeStr = fmt.Sprintf("%d", *jm.exitCode) + } + log.Printf("process exited: exitcode=%s, signal=%s, err=%v\n", exitCodeStr, jm.exitSignal, jm.exitErr) go WshCmdJobManager.sendJobExited() } @@ -125,13 +132,13 @@ func (jm *JobCmd) GetPGID() (int, error) { return pgid, nil } -func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobExitedData) { +func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobCmdExitedData) { jm.lock.Lock() defer jm.lock.Unlock() if !jm.processExited { return false, nil } - exitData := &wshrpc.CommandJobExitedData{ + exitData := &wshrpc.CommandJobCmdExitedData{ JobId: WshCmdJobManager.JobId, ExitCode: jm.exitCode, ExitSignal: jm.exitSignal, diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index bcdb18570d..afa015304f 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -98,8 +98,12 @@ func (jm *JobManager) sendJobExited() { return } - log.Printf("sendJobExited: sending exit notification to main server exitcode=%d signal=%s\n", exitData.ExitCode, exitData.ExitSignal) - err := wshclient.JobExitedCommand(attachedClient.WshRpc, *exitData, nil) + exitCodeStr := "nil" + if exitData.ExitCode != nil { + exitCodeStr = fmt.Sprintf("%d", *exitData.ExitCode) + } + log.Printf("sendJobExited: sending exit notification to main server exitcode=%s signal=%s\n", exitCodeStr, exitData.ExitSignal) + err := wshclient.JobCmdExitedCommand(attachedClient.WshRpc, *exitData, nil) if err != nil { log.Printf("sendJobExited: error sending exit notification: %v\n", err) } diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index 80f773ec67..ea46dbf149 100644 --- 
a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -327,9 +327,9 @@ type Job struct { TerminateOnReconnect bool `json:"terminateonreconnect,omitempty"` // job manager state - JobManagerStatus string `json:"jobmanagerstatus"` // init, running, done, error, terminated + JobManagerStatus string `json:"jobmanagerstatus"` // init, running, done + JobManagerDoneReason string `json:"jobmanagerdonereason,omitempty"` // startuperror, gone, terminated JobManagerStartupError string `json:"jobmanagerstartuperror,omitempty"` - JobManagerRunning bool `json:"jobmanagerrunning,omitempty"` JobManagerPid int `json:"jobmanagerpid,omitempty"` JobManagerStartTs int64 `json:"jobmanagerstartts,omitempty"` // exact process start time (milliseconds) @@ -337,9 +337,9 @@ type Job struct { CmdPid int `json:"cmdpid,omitempty"` // command process id CmdStartTs int64 `json:"cmdstartts,omitempty"` // exact command process start time (milliseconds from epoch) CmdTermSize TermSize `json:"cmdtermsize"` - CmdExitTs int64 `json:"cmdexitts,omitempty"` // timestamp (milliseconds) - CmdExitCode int `json:"cmdexitcode,omitempty"` - CmdExitSignal string `json:"cmdexitsignal,omitempty"` + CmdExitTs int64 `json:"cmdexitts,omitempty"` // timestamp (milliseconds) -- use CmdExitTs > 0 to check if command has exited + CmdExitCode *int `json:"cmdexitcode,omitempty"` // nil when CmdExitSignal is set. 
success exit is when CmdExitCode is 0 + CmdExitSignal string `json:"cmdexitsignal,omitempty"` // empty string if CmdExitCode is set CmdExitError string `json:"cmdexiterror,omitempty"` // output info diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index bebf0f06c4..7c4ba8658f 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -476,6 +476,12 @@ func GetWaveAIRateLimitCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) (*uctype return resp, err } +// command "jobcmdexited", wshserver.JobCmdExitedCommand +func JobCmdExitedCommand(w *wshutil.WshRpc, data wshrpc.CommandJobCmdExitedData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcmdexited", data, opts) + return err +} + // command "jobcontrollerattachjob", wshserver.JobControllerAttachJobCommand func JobControllerAttachJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobControllerAttachJobData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerattachjob", data, opts) @@ -488,6 +494,12 @@ func JobControllerConnectedJobsCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) return resp, err } +// command "jobcontrollerdeletejob", wshserver.JobControllerDeleteJobCommand +func JobControllerDeleteJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerdeletejob", data, opts) + return err +} + // command "jobcontrollerdetachjob", wshserver.JobControllerDetachJobCommand func JobControllerDetachJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerdetachjob", data, opts) @@ -506,6 +518,12 @@ func JobControllerExitJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.Rp return err } +// command "jobcontrollerlist", wshserver.JobControllerListCommand +func JobControllerListCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]*waveobj.Job, error) 
{ + resp, err := sendRpcRequestCallHelper[[]*waveobj.Job](w, "jobcontrollerlist", nil, opts) + return resp, err +} + // command "jobcontrollerreconnectjob", wshserver.JobControllerReconnectJobCommand func JobControllerReconnectJobCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobcontrollerreconnectjob", data, opts) @@ -524,24 +542,6 @@ func JobControllerStartJobCommand(w *wshutil.WshRpc, data wshrpc.CommandJobContr return resp, err } -// command "jobdebugdelete", wshserver.JobDebugDeleteCommand -func JobDebugDeleteCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobdebugdelete", data, opts) - return err -} - -// command "jobdebuglist", wshserver.JobDebugListCommand -func JobDebugListCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) ([]*waveobj.Job, error) { - resp, err := sendRpcRequestCallHelper[[]*waveobj.Job](w, "jobdebuglist", nil, opts) - return resp, err -} - -// command "jobexited", wshserver.JobExitedCommand -func JobExitedCommand(w *wshutil.WshRpc, data wshrpc.CommandJobExitedData, opts *wshrpc.RpcOpts) error { - _, err := sendRpcRequestCallHelper[any](w, "jobexited", data, opts) - return err -} - // command "jobinput", wshserver.JobInputCommand func JobInputCommand(w *wshutil.WshRpc, data wshrpc.CommandJobInputData, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "jobinput", data, opts) diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index efbd681b11..b0384da09d 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -285,9 +285,9 @@ func (impl *ServerImpl) RemoteReconnectToJobManagerCommand(ctx context.Context, } if proc == nil { return &wshrpc.CommandRemoteReconnectToJobManagerRtnData{ - Success: false, - JobManagerExited: true, - Error: fmt.Sprintf("job manager process (pid=%d) is not running", 
data.JobManagerPid), + Success: false, + JobManagerGone: true, + Error: fmt.Sprintf("job manager process (pid=%d) is not running", data.JobManagerPid), }, nil } diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 3e2c85f5e8..fe529e63e5 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -173,9 +173,11 @@ type WshRpcInterface interface { JobPrepareConnectCommand(ctx context.Context, data CommandJobPrepareConnectData) (*CommandJobConnectRtnData, error) JobStartStreamCommand(ctx context.Context, data CommandJobStartStreamData) error JobInputCommand(ctx context.Context, data CommandJobInputData) error - JobExitedCommand(ctx context.Context, data CommandJobExitedData) error // this is sent FROM the job manager => main server - JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) - JobDebugDeleteCommand(ctx context.Context, jobId string) error + JobCmdExitedCommand(ctx context.Context, data CommandJobCmdExitedData) error // this is sent FROM the job manager => main server + + // job controller + JobControllerDeleteJobCommand(ctx context.Context, jobId string) error + JobControllerListCommand(ctx context.Context) ([]*waveobj.Job, error) JobControllerStartJobCommand(ctx context.Context, data CommandJobControllerStartJobData) (string, error) JobControllerExitJobCommand(ctx context.Context, jobId string) error JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error @@ -733,9 +735,9 @@ type CommandRemoteReconnectToJobManagerData struct { } type CommandRemoteReconnectToJobManagerRtnData struct { - Success bool `json:"success"` - JobManagerExited bool `json:"jobmanagerexited"` - Error string `json:"error,omitempty"` + Success bool `json:"success"` + JobManagerGone bool `json:"jobmanagergone"` + Error string `json:"error,omitempty"` } type CommandRemoteDisconnectFromJobManagerData struct { @@ -768,14 +770,14 @@ type CommandJobConnectRtnData struct { StreamDone bool `json:"streamdone,omitempty"` StreamError 
string `json:"streamerror,omitempty"` HasExited bool `json:"hasexited,omitempty"` - ExitCode int `json:"exitcode,omitempty"` + ExitCode *int `json:"exitcode,omitempty"` ExitSignal string `json:"exitsignal,omitempty"` ExitErr string `json:"exiterr,omitempty"` } -type CommandJobExitedData struct { +type CommandJobCmdExitedData struct { JobId string `json:"jobid"` - ExitCode int `json:"exitcode"` + ExitCode *int `json:"exitcode"` ExitSignal string `json:"exitsignal,omitempty"` ExitErr string `json:"exiterr,omitempty"` ExitTs int64 `json:"exitts,omitempty"` diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 27d45434c4..6446b5ed25 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -1447,15 +1447,15 @@ func (ws *WshServer) GetSecretsLinuxStorageBackendCommand(ctx context.Context) ( return backend, nil } -func (ws *WshServer) JobExitedCommand(ctx context.Context, data wshrpc.CommandJobExitedData) error { - return jobcontroller.HandleJobExited(ctx, data.JobId, data) +func (ws *WshServer) JobCmdExitedCommand(ctx context.Context, data wshrpc.CommandJobCmdExitedData) error { + return jobcontroller.HandleCmdJobExited(ctx, data.JobId, data) } -func (ws *WshServer) JobDebugListCommand(ctx context.Context) ([]*waveobj.Job, error) { +func (ws *WshServer) JobControllerListCommand(ctx context.Context) ([]*waveobj.Job, error) { return wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job) } -func (ws *WshServer) JobDebugDeleteCommand(ctx context.Context, jobId string) error { +func (ws *WshServer) JobControllerDeleteJobCommand(ctx context.Context, jobId string) error { return jobcontroller.DeleteJob(ctx, jobId) } @@ -1471,7 +1471,7 @@ func (ws *WshServer) JobControllerStartJobCommand(ctx context.Context, data wshr } func (ws *WshServer) JobControllerExitJobCommand(ctx context.Context, jobId string) error { - return jobcontroller.ExitJobManager(ctx, jobId) + return 
jobcontroller.TerminateJobManager(ctx, jobId) } func (ws *WshServer) JobControllerDisconnectJobCommand(ctx context.Context, jobId string) error { From d2b9595536becbaccdf3c544ac87c5439cf13bd4 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:01:12 -0800 Subject: [PATCH 52/64] stub jobid change RPC call to FE block --- cmd/wsh/cmd/wshcmd-jobdebug.go | 3 +- frontend/app/store/wshclientapi.ts | 5 ++++ frontend/app/view/term/term-wsh.tsx | 5 ++++ frontend/types/gotypes.d.ts | 6 ++++ pkg/jobcontroller/jobcontroller.go | 45 ++++++++++++++++++++++++++--- pkg/wshrpc/wshclient/wshclient.go | 6 ++++ pkg/wshrpc/wshrpctypes.go | 6 ++++ pkg/wstore/wstore_dbops.go | 14 +++++++++ 8 files changed, 85 insertions(+), 5 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index d9b6275364..77c4a7da5e 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -78,6 +78,7 @@ var jobDebugGetOutputCmd = &cobra.Command{ var jobDebugStartCmd = &cobra.Command{ Use: "start", Short: "start a new job", + Args: cobra.MinimumNArgs(1), RunE: jobDebugStartRun, } @@ -162,7 +163,7 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("getting connected job ids: %w", err) } - log.Printf("connnected jobids: %v\n", connectedJobIds) + log.Printf("connected jobids: %v\n", connectedJobIds) connectedMap := make(map[string]bool) for _, jobId := range connectedJobIds { diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 87409f1866..3caeb0f201 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -717,6 +717,11 @@ class RpcApiType { return client.wshRpcCall("termgetscrollbacklines", data, opts); } + // command "termupdateattachedjob" [call] + TermUpdateAttachedJobCommand(client: WshClient, data: CommandTermUpdateAttachedJobData, opts?: RpcOpts): Promise { + return client.wshRpcCall("termupdateattachedjob", data, opts); + } + // 
command "test" [call] TestCommand(client: WshClient, data: string, opts?: RpcOpts): Promise { return client.wshRpcCall("test", data, opts); diff --git a/frontend/app/view/term/term-wsh.tsx b/frontend/app/view/term/term-wsh.tsx index 782a174913..16e31ae334 100644 --- a/frontend/app/view/term/term-wsh.tsx +++ b/frontend/app/view/term/term-wsh.tsx @@ -104,6 +104,11 @@ export class TermWshClient extends WshClient { } } + async handle_termupdateattachedjob(rh: RpcResponseHelper, data: CommandTermUpdateAttachedJobData): Promise { + console.log("term-update-attached-job", this.blockId, data); + // TODO: implement frontend logic to handle job attachment updates + } + async handle_termgetscrollbacklines( rh: RpcResponseHelper, data: CommandTermGetScrollbackLinesData diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index eca2305bf8..f9e08dbaff 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -614,6 +614,12 @@ declare global { lastupdated: number; }; + // wshrpc.CommandTermUpdateAttachedJobData + type CommandTermUpdateAttachedJobData = { + blockid: string; + jobid?: string; + }; + // wshrpc.CommandVarData type CommandVarData = { key: string; diff --git a/pkg/jobcontroller/jobcontroller.go b/pkg/jobcontroller/jobcontroller.go index f984a6a12b..66f928d920 100644 --- a/pkg/jobcontroller/jobcontroller.go +++ b/pkg/jobcontroller/jobcontroller.go @@ -728,7 +728,7 @@ func DeleteJob(ctx context.Context, jobId string) error { } func AttachJobToBlock(ctx context.Context, jobId string, blockId string) error { - return wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { + err := wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { err := wstore.DBUpdateFn(tx.Context(), blockId, func(block *waveobj.Block) { block.JobId = jobId }) @@ -736,8 +736,12 @@ func AttachJobToBlock(ctx context.Context, jobId string, blockId string) error { return fmt.Errorf("failed to update block: %w", err) } - err = wstore.DBUpdateFn(tx.Context(), jobId, func(job 
*waveobj.Job) { + err = wstore.DBUpdateFnErr(tx.Context(), jobId, func(job *waveobj.Job) error { + if job.AttachedBlockId != "" { + return fmt.Errorf("job %s already attached to block %s", jobId, job.AttachedBlockId) + } job.AttachedBlockId = blockId + return nil }) if err != nil { return fmt.Errorf("failed to update job: %w", err) @@ -746,16 +750,32 @@ func AttachJobToBlock(ctx context.Context, jobId string, blockId string) error { log.Printf("[job:%s] attached to block:%s", jobId, blockId) return nil }) + if err != nil { + return err + } + + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeFeBlockRouteId(blockId), + NoResponse: true, + } + bareRpc := wshclient.GetBareRpcClient() + wshclient.TermUpdateAttachedJobCommand(bareRpc, wshrpc.CommandTermUpdateAttachedJobData{ + BlockId: blockId, + JobId: jobId, + }, rpcOpts) + + return nil } func DetachJobFromBlock(ctx context.Context, jobId string, updateBlock bool) error { - return wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { + var blockId string + err := wstore.WithTx(ctx, func(tx *wstore.TxWrap) error { job, err := wstore.DBMustGet[*waveobj.Job](tx.Context(), jobId) if err != nil { return fmt.Errorf("failed to get job: %w", err) } - blockId := job.AttachedBlockId + blockId = job.AttachedBlockId if blockId == "" { return nil } @@ -782,6 +802,23 @@ func DetachJobFromBlock(ctx context.Context, jobId string, updateBlock bool) err log.Printf("[job:%s] detached from block:%s", jobId, blockId) return nil }) + if err != nil { + return err + } + + if blockId != "" { + rpcOpts := &wshrpc.RpcOpts{ + Route: wshutil.MakeFeBlockRouteId(blockId), + NoResponse: true, + } + bareRpc := wshclient.GetBareRpcClient() + wshclient.TermUpdateAttachedJobCommand(bareRpc, wshrpc.CommandTermUpdateAttachedJobData{ + BlockId: blockId, + JobId: "", + }, rpcOpts) + } + + return nil } func SendInput(ctx context.Context, data wshrpc.CommandJobInputData) error { diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go 
index 7c4ba8658f..62cd66d90c 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -859,6 +859,12 @@ func TermGetScrollbackLinesCommand(w *wshutil.WshRpc, data wshrpc.CommandTermGet return resp, err } +// command "termupdateattachedjob", wshserver.TermUpdateAttachedJobCommand +func TermUpdateAttachedJobCommand(w *wshutil.WshRpc, data wshrpc.CommandTermUpdateAttachedJobData, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "termupdateattachedjob", data, opts) + return err +} + // command "test", wshserver.TestCommand func TestCommand(w *wshutil.WshRpc, data string, opts *wshrpc.RpcOpts) error { _, err := sendRpcRequestCallHelper[any](w, "test", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index fe529e63e5..2634555396 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -152,6 +152,7 @@ type WshRpcInterface interface { // terminal TermGetScrollbackLinesCommand(ctx context.Context, data CommandTermGetScrollbackLinesData) (*CommandTermGetScrollbackLinesRtnData, error) + TermUpdateAttachedJobCommand(ctx context.Context, data CommandTermUpdateAttachedJobData) error // file WshRpcFileInterface @@ -653,6 +654,11 @@ type CommandTermGetScrollbackLinesRtnData struct { LastUpdated int64 `json:"lastupdated"` } +type CommandTermUpdateAttachedJobData struct { + BlockId string `json:"blockid"` + JobId string `json:"jobid,omitempty"` +} + type CommandElectronEncryptData struct { PlainText string `json:"plaintext"` } diff --git a/pkg/wstore/wstore_dbops.go b/pkg/wstore/wstore_dbops.go index 7b16dbcf73..e9a0289ee3 100644 --- a/pkg/wstore/wstore_dbops.go +++ b/pkg/wstore/wstore_dbops.go @@ -328,6 +328,20 @@ func DBUpdateFn[T waveobj.WaveObj](ctx context.Context, id string, updateFn func }) } +func DBUpdateFnErr[T waveobj.WaveObj](ctx context.Context, id string, updateFn func(T) error) error { + return WithTx(ctx, func(tx *TxWrap) error { + val, err := 
DBMustGet[T](tx.Context(), id) + if err != nil { + return err + } + err = updateFn(val) + if err != nil { + return err + } + return DBUpdate(tx.Context(), val) + }) +} + func DBInsert(ctx context.Context, val waveobj.WaveObj) error { oid := waveobj.GetOID(val) if oid == "" { From 30fe82eddcc910a8e0940fe9754fc9ac1dbc5d9c Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:01:40 -0800 Subject: [PATCH 53/64] force cirbuf shrinking if seteffectivewindow lowers the window size in async mode --- pkg/jobmanager/cirbuf.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/jobmanager/cirbuf.go b/pkg/jobmanager/cirbuf.go index 8d14bfef78..fae4063b85 100644 --- a/pkg/jobmanager/cirbuf.go +++ b/pkg/jobmanager/cirbuf.go @@ -33,7 +33,7 @@ func MakeCirBuf(maxSize int, initSyncMode bool) *CirBuf { // SetEffectiveWindow changes the sync mode and effective window size for flow control. // The windowSize is capped at the buffer size. -// When window shrinks: data is preserved, sync mode blocks writes, async mode maintains data size. +// When window shrinks: sync mode blocks new writes, async mode truncates old data to enforce limit. // When window increases: blocked writers are woken up if space becomes available. func (cb *CirBuf) SetEffectiveWindow(syncMode bool, windowSize int) { cb.lock.Lock() @@ -49,6 +49,13 @@ func (cb *CirBuf) SetEffectiveWindow(syncMode bool, windowSize int) { cb.windowSize = windowSize cb.syncMode = syncMode + // In async mode, enforce window size by truncating buffer if needed + if !syncMode && cb.count > windowSize { + excess := cb.count - windowSize + cb.readPos = (cb.readPos + excess) % maxSize + cb.count = windowSize + } + // Only sync mode blocks writers, so only wake if we were in sync mode. // Wake when window grows (more space available) or switching to async (no longer blocking). 
if oldSyncMode && (windowSize > oldWindowSize || !syncMode) { From 43a4af1356bb5bd3215c5cd80663fdc290fcfd8d Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:08:21 -0800 Subject: [PATCH 54/64] fix context for go routine --- pkg/wcore/block.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/wcore/block.go b/pkg/wcore/block.go index 32456951bd..fc66232a32 100644 --- a/pkg/wcore/block.go +++ b/pkg/wcore/block.go @@ -173,7 +173,9 @@ func DeleteBlock(ctx context.Context, blockId string, recursive bool) error { defer func() { panichandler.PanicHandler("DetachJobFromBlock", recover()) }() - err := jobcontroller.DetachJobFromBlock(ctx, block.JobId, false) + detachCtx, cancelFn := context.WithTimeout(context.Background(), 2*time.Second) + defer cancelFn() + err := jobcontroller.DetachJobFromBlock(detachCtx, block.JobId, false) if err != nil { log.Printf("error detaching job from block %s: %v", blockId, err) } From d5ffc4dd9f192694d03edeaabbabdc5398352a37 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:19:53 -0800 Subject: [PATCH 55/64] sync.once the cleanup code --- pkg/wshrpc/wshremote/wshremote_job.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index b0384da09d..32356854b2 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -12,6 +12,7 @@ import ( "os" "os/exec" "strings" + "sync" "syscall" "time" @@ -54,6 +55,15 @@ func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, m proxy := wshutil.MakeRpcProxy("jobmanager") linkId := impl.Router.RegisterUntrustedLink(proxy) + var cleanupOnce sync.Once + cleanup := func() { + cleanupOnce.Do(func() { + conn.Close() + impl.Router.UnregisterLink(linkId) + impl.removeJobManagerConnection(jobId) + }) + } + go func() { writeErr := wshutil.AdaptOutputChToStream(proxy.ToRemoteCh, conn) if writeErr 
!= nil { @@ -62,20 +72,12 @@ func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, m }() go func() { defer func() { - conn.Close() - impl.Router.UnregisterLink(linkId) close(proxy.FromRemoteCh) - impl.removeJobManagerConnection(jobId) + cleanup() }() wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) }() - cleanup := func() { - conn.Close() - impl.Router.UnregisterLink(linkId) - impl.removeJobManagerConnection(jobId) - } - routeId := wshutil.MakeLinkRouteId(linkId) authData := wshrpc.CommandAuthenticateToJobData{ JobAccessToken: mainServerJwtToken, From b298c3ff5e37dc20f4b97cb5bf07b06a067597a4 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:43:14 -0800 Subject: [PATCH 56/64] dont allow binding link routes --- pkg/wshutil/wshrouter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/wshutil/wshrouter.go b/pkg/wshutil/wshrouter.go index 310554f18d..0e7b47cbe7 100644 --- a/pkg/wshutil/wshrouter.go +++ b/pkg/wshutil/wshrouter.go @@ -615,7 +615,7 @@ func (router *WshRouter) UnregisterLink(linkId baseds.LinkId) { } func isBindableRouteId(routeId string) bool { - if routeId == "" || strings.HasPrefix(routeId, ControlPrefix) { + if routeId == "" || strings.HasPrefix(routeId, ControlPrefix) || strings.HasPrefix(routeId, RoutePrefix_Link) { return false } return true From 827d4471e3748922765724cde321ed8cf5d2bb1c Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 15:46:05 -0800 Subject: [PATCH 57/64] fix fd 3 check --- cmd/wsh/cmd/wshcmd-jobmanager.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobmanager.go b/cmd/wsh/cmd/wshcmd-jobmanager.go index 8ada93dc4d..bf5562c3a7 100644 --- a/cmd/wsh/cmd/wshcmd-jobmanager.go +++ b/cmd/wsh/cmd/wshcmd-jobmanager.go @@ -66,8 +66,9 @@ func jobManagerRun(cmd *cobra.Command, args []string) error { } readyFile := os.NewFile(3, "ready-pipe") - if readyFile == nil { - return fmt.Errorf("ready pipe (fd 3) not available") + _, 
err = readyFile.Stat() + if err != nil { + return fmt.Errorf("ready pipe (fd 3) not available: %v", err) } err = jobmanager.SetupJobManager(jobManagerClientId, jobManagerJobId, publicKeyBytes, jobAuthToken, readyFile) From d17bdb7c11d7e3c4f5b9086a2effcad1ebf5ff6b Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:04:03 -0800 Subject: [PATCH 58/64] do not create conn when checking if it is connected (update misleading comment) --- pkg/remote/conncontroller/conncontroller.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 0cc5ecaeb7..b042eb9693 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -856,20 +856,20 @@ func (conn *SSHConn) ClearWshError() { }) } -func getConnInternal(opts *remote.SSHOpts) *SSHConn { +func getConnInternal(opts *remote.SSHOpts, createIfNotExists bool) *SSHConn { globalLock.Lock() defer globalLock.Unlock() rtn := clientControllerMap[*opts] - if rtn == nil { + if rtn == nil && createIfNotExists { rtn = &SSHConn{Lock: &sync.Mutex{}, Status: Status_Init, WshEnabled: &atomic.Bool{}, Opts: opts, HasWaiter: &atomic.Bool{}} clientControllerMap[*opts] = rtn } return rtn } -// does NOT connect, can return nil if connection does not exist +// does NOT connect, does not return nil func GetConn(opts *remote.SSHOpts) *SSHConn { - conn := getConnInternal(opts) + conn := getConnInternal(opts, true) return conn } @@ -881,7 +881,7 @@ func IsConnected(connName string) (bool, error) { if err != nil { return false, fmt.Errorf("error parsing connection name: %w", err) } - conn := GetConn(connOpts) + conn := getConnInternal(connOpts, false) if conn == nil { return false, nil } @@ -917,7 +917,7 @@ func EnsureConnection(ctx context.Context, connName string) error { } func DisconnectClient(opts *remote.SSHOpts) error { - conn := getConnInternal(opts) + conn := 
getConnInternal(opts, false) if conn == nil { return fmt.Errorf("client %q not found", opts.String()) } From d62ef54c0b4a972d22bc19c13669c00cfed77304 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:04:52 -0800 Subject: [PATCH 59/64] fix json tag for attachedblockid --- pkg/waveobj/wtype.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/waveobj/wtype.go b/pkg/waveobj/wtype.go index ea46dbf149..8df86d3766 100644 --- a/pkg/waveobj/wtype.go +++ b/pkg/waveobj/wtype.go @@ -321,7 +321,7 @@ type Job struct { CmdArgs []string `json:"cmdargs,omitempty"` CmdEnv map[string]string `json:"cmdenv,omitempty"` JobAuthToken string `json:"jobauthtoken"` // job manger -> wave - AttachedBlockId string `json:"ownerblockid"` + AttachedBlockId string `json:"attachedblockid,omitempty"` // reconnect option (e.g. orphaned, so we need to kill on connect) TerminateOnReconnect bool `json:"terminateonreconnect,omitempty"` From d590d1ace862d3219926ffef044b4316001b9ae5 Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:06:12 -0800 Subject: [PATCH 60/64] comment why we can ignore the error from RegisterTrustedLeaf --- pkg/wshrpc/wshclient/barerpcclient.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/wshrpc/wshclient/barerpcclient.go b/pkg/wshrpc/wshclient/barerpcclient.go index 4a4d17dd24..d430266372 100644 --- a/pkg/wshrpc/wshclient/barerpcclient.go +++ b/pkg/wshrpc/wshclient/barerpcclient.go @@ -27,6 +27,7 @@ func GetBareRpcClient() *wshutil.WshRpc { waveSrvClient_Once.Do(func() { waveSrvClient_Singleton = wshutil.MakeWshRpc(wshrpc.RpcContext{}, &WshServerImpl, "bare-client") waveSrvClient_RouteId = fmt.Sprintf("bare:%s", uuid.New().String()) + // we can safely ignore the error from RegisterTrustedLeaf since the route is valid wshutil.DefaultRouter.RegisterTrustedLeaf(waveSrvClient_Singleton, waveSrvClient_RouteId) wps.Broker.SetClient(wshutil.DefaultRouter) }) From 1b35d1795b8190a675cb02b833f93f2453f42549 Mon Sep 17 00:00:00 2001 From: 
sawka Date: Wed, 21 Jan 2026 16:19:31 -0800 Subject: [PATCH 61/64] fix nits --- cmd/wsh/cmd/wshcmd-jobdebug.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cmd/wsh/cmd/wshcmd-jobdebug.go b/cmd/wsh/cmd/wshcmd-jobdebug.go index 77c4a7da5e..5ae68b7051 100644 --- a/cmd/wsh/cmd/wshcmd-jobdebug.go +++ b/cmd/wsh/cmd/wshcmd-jobdebug.go @@ -7,7 +7,6 @@ import ( "encoding/base64" "encoding/json" "fmt" - "log" "github.com/spf13/cobra" "github.com/wavetermdev/waveterm/pkg/wshrpc" @@ -163,8 +162,6 @@ func jobDebugListRun(cmd *cobra.Command, args []string) error { return fmt.Errorf("getting connected job ids: %w", err) } - log.Printf("connected jobids: %v\n", connectedJobIds) - connectedMap := make(map[string]bool) for _, jobId := range connectedJobIds { connectedMap[jobId] = true @@ -339,10 +336,6 @@ func jobDebugGetOutputRun(cmd *cobra.Command, args []string) error { } func jobDebugStartRun(cmd *cobra.Command, args []string) error { - if len(args) == 0 { - return fmt.Errorf("no command specified after --") - } - cmdToRun := args[0] cmdArgs := args[1:] From 0b839fce909ee769402890cca76a1542155a616b Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:23:29 -0800 Subject: [PATCH 62/64] exitcode is optional --- frontend/types/gotypes.d.ts | 4 ++-- pkg/wshrpc/wshrpctypes.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index f9e08dbaff..a4ec175c1f 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -352,7 +352,7 @@ declare global { // wshrpc.CommandJobCmdExitedData type CommandJobCmdExitedData = { jobid: string; - exitcode: number; + exitcode?: number; exitsignal?: string; exiterr?: string; exitts?: number; @@ -925,7 +925,7 @@ declare global { cmdargs?: string[]; cmdenv?: {[key: string]: string}; jobauthtoken: string; - ownerblockid: string; + attachedblockid?: string; terminateonreconnect?: boolean; jobmanagerstatus: string; jobmanagerdonereason?: string; 
diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 2634555396..c0d8d1214b 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -783,7 +783,7 @@ type CommandJobConnectRtnData struct { type CommandJobCmdExitedData struct { JobId string `json:"jobid"` - ExitCode *int `json:"exitcode"` + ExitCode *int `json:"exitcode,omitempty"` ExitSignal string `json:"exitsignal,omitempty"` ExitErr string `json:"exiterr,omitempty"` ExitTs int64 `json:"exitts,omitempty"` From 4866dc0ac38be05b682274ae7da4c12434d270fe Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:27:49 -0800 Subject: [PATCH 63/64] close readPipeWrite after start --- pkg/wshrpc/wshremote/wshremote_job.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index 32356854b2..12545c9cc1 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -171,6 +171,7 @@ func (impl *ServerImpl) RemoteStartJobCommand(ctx context.Context, data wshrpc.C if err := cmd.Start(); err != nil { return nil, fmt.Errorf("cannot start job manager: %w", err) } + readyPipeWrite.Close() log.Printf("RemoteStartJobCommand: job manager process started\n") jobAuthTokenLine := fmt.Sprintf("Wave-JobAccessToken:%s\n", data.JobAuthToken) From 15e35a72b017f5bd33a93c306cb8f32ae7d5594c Mon Sep 17 00:00:00 2001 From: sawka Date: Wed, 21 Jan 2026 16:31:18 -0800 Subject: [PATCH 64/64] fix nit, add todo --- pkg/jobmanager/jobcmd.go | 1 + pkg/wshutil/wshrouter.go | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pkg/jobmanager/jobcmd.go b/pkg/jobmanager/jobcmd.go index bd647473bb..2349e69b35 100644 --- a/pkg/jobmanager/jobcmd.go +++ b/pkg/jobmanager/jobcmd.go @@ -150,6 +150,7 @@ func (jm *JobCmd) GetExitInfo() (bool, *wshrpc.CommandJobCmdExitedData) { return true, exitData } +// TODO set up a single input handler loop + queue so we dont need to hold the lock but still get synchronized 
in-order execution func (jm *JobCmd) HandleInput(data wshrpc.CommandJobInputData) error { jm.lock.Lock() defer jm.lock.Unlock() diff --git a/pkg/wshutil/wshrouter.go b/pkg/wshutil/wshrouter.go index 0e7b47cbe7..32c889756b 100644 --- a/pkg/wshutil/wshrouter.go +++ b/pkg/wshutil/wshrouter.go @@ -36,6 +36,8 @@ const ( RoutePrefix_FeBlock = "feblock:" RoutePrefix_Builder = "builder:" RoutePrefix_Link = "link:" + RoutePrefix_Job = "job:" + RoutePrefix_Bare = "bare:" ) // this works like a network switch