Fixed Runaway Agent Supervisor Event

This commit is contained in:
2025-09-04 15:30:27 -06:00
parent f905a50501
commit ee7c6f3062
4 changed files with 233 additions and 11 deletions

View File

@@ -4,6 +4,9 @@ import time
import subprocess
import threading
import datetime
import json
import ctypes
from ctypes import wintypes
# Optional pywin32 imports for per-session launching
try:
@@ -24,10 +27,35 @@ BOREALIS_DIR = os.path.join(AGENT_DIR, 'Borealis')
# Supervisor log directory (<ROOT>/Logs/Agent); created at import time if missing.
LOG_DIR = os.path.join(ROOT, 'Logs', 'Agent')
os.makedirs(LOG_DIR, exist_ok=True)
# Supervisor log file written by log().
LOG_FILE = os.path.join(LOG_DIR, 'Supervisor.log')
# File holding the PID of the most recently launched script_agent.py child.
PID_FILE = os.path.join(LOG_DIR, 'script_agent.pid')
# Internal state for process + backoff
# Popen handle of the child started by ensure_script_agent(), or None.
_script_proc = None
_spawn_backoff = 5 # seconds (exponential backoff start)
_max_backoff = 300 # cap at 5 minutes
# time.time() value before which ensure_script_agent() will not attempt a spawn.
_next_spawn_time = 0.0
# Timestamps of the last "disabled by config" / "failed to launch" log lines,
# used by ensure_script_agent() to rate-limit those messages.
_last_disable_log = 0.0
_last_fail_log = 0.0
def log(msg: str):
try:
# simple size-based rotation (~1MB)
try:
if os.path.isfile(LOG_FILE) and os.path.getsize(LOG_FILE) > 1_000_000:
bak = LOG_FILE + '.1'
try:
if os.path.isfile(bak):
os.remove(bak)
except Exception:
pass
try:
os.replace(LOG_FILE, bak)
except Exception:
pass
except Exception:
pass
ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(LOG_FILE, 'a', encoding='utf-8') as f:
f.write(f"[{ts}] {msg}\n")
@@ -57,28 +85,153 @@ def venv_pythonw():
return venv_python()
def ensure_script_agent():
"""Ensure LocalSystem script_agent.py is running; restart if not."""
def _settings_path():
    """Return the path of ``agent_settings.json`` located directly under ROOT."""
    settings_name = 'agent_settings.json'
    return os.path.join(ROOT, settings_name)
def load_settings():
    """Load the agent settings file as a dict.

    Reads the JSON document at ``_settings_path()``.

    Returns:
        The parsed settings when the file exists and its top-level value is a
        JSON object; otherwise an empty dict. A missing file, an unreadable
        file, malformed JSON, and a non-object top level (list, string,
        number, ...) all yield ``{}`` so callers can always use ``.get``.
    """
    try:
        path = _settings_path()
        if not os.path.isfile(path):
            return {}
        with open(path, 'r', encoding='utf-8') as f:
            cfg = json.load(f)
    except Exception:
        # Best-effort: settings are optional, never let a bad file raise.
        return {}
    # json.load may return any JSON type; only an object is a valid settings map.
    return cfg if isinstance(cfg, dict) else {}
def _psutil_process_exists(pid: int) -> bool:
try:
# best-effort: avoid duplicate spawns
import psutil # type: ignore
for p in psutil.process_iter(['name', 'cmdline']):
if pid <= 0:
return False
p = psutil.Process(pid)
return p.is_running() and (p.status() != psutil.STATUS_ZOMBIE)
except Exception:
return False
def _win_process_exists(pid: int) -> bool:
try:
if pid <= 0:
return False
PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
OpenProcess = kernel32.OpenProcess
OpenProcess.restype = wintypes.HANDLE
OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
CloseHandle = kernel32.CloseHandle
CloseHandle.argtypes = (wintypes.HANDLE,)
h = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
if h:
try:
cl = (p.info.get('cmdline') or [])
if any('script_agent.py' in (part or '') for part in cl):
return
CloseHandle(h)
except Exception:
pass
return True
return False
except Exception:
return False
def process_exists(pid: int) -> bool:
    """Return True if either the psutil probe or the Win32 probe finds *pid*."""
    # psutil is consulted first; the Win32 API is the fallback.
    if _psutil_process_exists(pid):
        return True
    return _win_process_exists(pid)
def _read_pid_file() -> int:
    """Return the integer stored in PID_FILE, or 0 if it is missing or unreadable."""
    try:
        if not os.path.isfile(PID_FILE):
            return 0
        with open(PID_FILE, 'r', encoding='utf-8') as handle:
            text = handle.read()
        return int(text.strip())
    except Exception:
        # Unreadable or non-numeric content is treated the same as "no PID".
        return 0
def _write_pid_file(pid: int):
    """Overwrite PID_FILE with the text form of *pid*; failures are ignored."""
    pid_text = str(pid)
    try:
        with open(PID_FILE, 'w', encoding='utf-8') as handle:
            handle.write(pid_text)
    except Exception:
        # Best-effort: the PID file is advisory only.
        pass
def _clear_pid_file():
    """Delete PID_FILE if it exists; any error is silently ignored."""
    try:
        if not os.path.isfile(PID_FILE):
            return
        os.remove(PID_FILE)
    except Exception:
        # Best-effort cleanup only.
        pass
def ensure_script_agent():
    """Ensure LocalSystem script_agent.py is running; restart if not, with backoff and PID tracking.

    Steps, in order; each one may return early without spawning:

    1. If settings contain ``enable_system_script_agent`` set to a falsy value,
       do nothing (a notice is logged at most once per 60 seconds).
    2. If the child started by a previous call is still running, keep it.
       If it has exited, clear the PID file and forget the handle.
    3. If the PID file names a process that still exists, do nothing;
       otherwise remove the stale PID file.
    4. If the backoff window from a previous failed launch has not elapsed,
       do nothing.
    5. Launch ``script_agent.py`` exactly once, record its PID, and reset the
       backoff. On launch failure, log (at most once per 10 seconds), double
       the backoff up to ``_max_backoff`` and schedule the next attempt.
    """
    global _script_proc, _spawn_backoff, _next_spawn_time, _last_disable_log, _last_fail_log

    # Allow disabling via config
    try:
        cfg = load_settings()
        if not cfg.get('enable_system_script_agent', True):
            now = time.time()
            if now - _last_disable_log > 60:
                log('System script agent disabled by config (enable_system_script_agent=false)')
                _last_disable_log = now
            return
    except Exception:
        # Unreadable settings are treated as "enabled".
        pass

    # If we have a running child process, keep it
    try:
        if _script_proc is not None:
            if _script_proc.poll() is None:
                return
            # Child exited; clear PID file for safety
            _clear_pid_file()
            _script_proc = None
    except Exception:
        pass

    # If PID file points to a living process, don't spawn
    try:
        pid = _read_pid_file()
        if pid:
            # Probe once so the alive/stale decision is made on a single answer.
            if process_exists(pid):
                return
            _clear_pid_file()
    except Exception:
        pass

    # Honor backoff window
    if time.time() < _next_spawn_time:
        return

    py = venv_python()
    script = os.path.join(ROOT, 'Data', 'Agent', 'script_agent.py')
    try:
        proc = subprocess.Popen(
            [py, '-W', 'ignore::SyntaxWarning', script],
            # 0x08000000 is CREATE_NO_WINDOW; only meaningful on Windows.
            creationflags=(0x08000000 if os.name == 'nt' else 0),
        )
        _script_proc = proc
        _write_pid_file(proc.pid)
        log(f'Launched script_agent.py (pid {proc.pid})')
        # reset backoff on success
        # NOTE(review): "success" here means Popen did not raise; a child that
        # starts and exits immediately does not trigger the backoff - confirm
        # this is intended.
        _spawn_backoff = 5
        _next_spawn_time = 0.0
    except Exception as e:
        msg = f'Failed to launch script_agent.py: {e}'
        now = time.time()
        # rate-limit identical failure logs to once per 10s
        if now - _last_fail_log > 10:
            log(msg)
            _last_fail_log = now
        # exponential backoff
        _spawn_backoff = min(_spawn_backoff * 2, _max_backoff)
        _next_spawn_time = time.time() + _spawn_backoff
def _enable_privileges():
@@ -166,4 +319,3 @@ def main():
if __name__ == '__main__':
main()

View File

@@ -1065,6 +1065,8 @@ async def _run_powershell_via_user_task(content: str):
ps = "powershell.exe"
else:
return -999, '', 'Windows only'
path = None
out_path = None
try:
temp_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'Temp')
temp_dir = os.path.abspath(temp_dir)
@@ -1110,6 +1112,18 @@ Get-ScheduledTask -TaskName $task | Out-Null
return 0, out_data or '', ''
except Exception as e:
return -999, '', str(e)
finally:
# Best-effort cleanup of temp script and output files
try:
if path and os.path.isfile(path):
os.remove(path)
except Exception:
pass
try:
if out_path and os.path.isfile(out_path):
os.remove(out_path)
except Exception:
pass
# ---------------- Dummy Qt Widget to Prevent Exit ----------------
class PersistentWindow(QtWidgets.QWidget):

View File

@@ -6,12 +6,14 @@ import asyncio
import json
import subprocess
import tempfile
from typing import Optional
import socketio
import platform
import time
import uuid
import tempfile
import contextlib
def get_project_root():
@@ -54,6 +56,13 @@ def run_powershell_script_content(content: str):
return proc.returncode, proc.stdout or "", proc.stderr or ""
except Exception as e:
return -1, "", str(e)
finally:
# Best-effort cleanup of the ephemeral script
try:
if os.path.isfile(path):
os.remove(path)
except Exception:
pass
async def main():
@@ -201,10 +210,49 @@ Get-ScheduledTask -TaskName $task | Out-Null
# Cleanup task (best-effort)
cleanup_ps = f"try {{ Unregister-ScheduledTask -TaskName '{task_name}' -Confirm:$false }} catch {{}}"
subprocess.run([ps_exe, '-NoProfile', '-ExecutionPolicy', 'Bypass', '-Command', cleanup_ps], capture_output=True, text=True)
# Best-effort removal of temp script and output files
try:
if os.path.isfile(script_path):
os.remove(script_path)
except Exception:
pass
try:
if os.path.isfile(out_path):
os.remove(out_path)
except Exception:
pass
return 0, out_data or '', ''
except Exception as e:
return -999, '', str(e)
if __name__ == '__main__':
    # Ensure only a single instance of the script agent runs (Windows-only lock)
    def _acquire_singleton_lock() -> bool:
        """Try to take an exclusive one-byte lock on ``script_agent.lock``.

        Returns:
            True when the lock was acquired, or when locking could not be
            attempted at all (``msvcrt`` unavailable, lock file could not be
            created or opened) - in that case the agent is allowed to run.
            False only when the lock call itself fails, which is taken to
            mean another instance already holds the lock.
        """
        try:
            import msvcrt  # type: ignore
        except ImportError:
            # No Windows locking primitive available: do not prevent the agent.
            return True
        try:
            lock_dir = os.path.join(get_project_root(), 'Logs', 'Agent')
            os.makedirs(lock_dir, exist_ok=True)
            lock_path = os.path.join(lock_dir, 'script_agent.lock')
            # Keep handle open for process lifetime
            fh = open(lock_path, 'a')
        except Exception:
            # If we cannot establish a lock, continue (do not prevent agent)
            return True
        try:
            # The lock region starts at the current file position; pin it to
            # byte 0 so every instance contends for the same byte.
            fh.seek(0)
            # Lock 1 byte non-blocking; released on handle close/process exit
            msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
        except Exception:
            try:
                fh.close()
            except Exception:
                pass
            return False
        # Stash the handle in module globals so it is not garbage-collected
        # (closing it would release the lock).
        globals()['_LOCK_FH'] = fh
        return True

    if not _acquire_singleton_lock():
        print('[ScriptAgent] Another instance is running; exiting.')
        sys.exit(0)
    asyncio.run(main())