From ee7c6f3062716c2c5ac0b584973839aefd1dc28e Mon Sep 17 00:00:00 2001
From: Nicole Rappe
Date: Thu, 4 Sep 2025 15:30:27 -0600
Subject: [PATCH] Fixed Runaway Agent Supervisor Event

---
 Data/Agent/agent_supervisor.py                | 174 ++++++++++++++++--
 Data/Agent/borealis-agent.py                  |  14 ++
 Data/Agent/script_agent.py                    |  48 +++++
 .../Write Canary to Specific User Folder.ps1  |   8 +
 4 files changed, 233 insertions(+), 11 deletions(-)
 create mode 100644 Scripts/Examples/Write Canary to Specific User Folder.ps1

diff --git a/Data/Agent/agent_supervisor.py b/Data/Agent/agent_supervisor.py
index 1b717bf..ad326db 100644
--- a/Data/Agent/agent_supervisor.py
+++ b/Data/Agent/agent_supervisor.py
@@ -4,6 +4,9 @@ import time
 import subprocess
 import threading
 import datetime
+import json
+import ctypes
+from ctypes import wintypes
 
 # Optional pywin32 imports for per-session launching
 try:
@@ -24,10 +27,35 @@ BOREALIS_DIR = os.path.join(AGENT_DIR, 'Borealis')
 LOG_DIR = os.path.join(ROOT, 'Logs', 'Agent')
 os.makedirs(LOG_DIR, exist_ok=True)
 LOG_FILE = os.path.join(LOG_DIR, 'Supervisor.log')
+PID_FILE = os.path.join(LOG_DIR, 'script_agent.pid')
+
+# Internal state for the supervised child process + spawn backoff
+_script_proc = None
+_spawn_backoff = 5  # seconds (exponential backoff start)
+_max_backoff = 300  # cap at 5 minutes
+_next_spawn_time = 0.0
+_last_disable_log = 0.0
+_last_fail_log = 0.0
 
 
 def log(msg: str):
     try:
+        # simple size-based rotation (~1MB): keep exactly one previous log as LOG_FILE + '.1'
+        try:
+            if os.path.isfile(LOG_FILE) and os.path.getsize(LOG_FILE) > 1_000_000:
+                bak = LOG_FILE + '.1'
+                try:
+                    if os.path.isfile(bak):
+                        os.remove(bak)
+                except Exception:
+                    pass
+                try:
+                    os.replace(LOG_FILE, bak)
+                except Exception:
+                    pass
+        except Exception:
+            pass
+
         ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         with open(LOG_FILE, 'a', encoding='utf-8') as f:
             f.write(f"[{ts}] {msg}\n")
@@ -57,28 +85,153 @@ def venv_pythonw():
     return venv_python()
 
 
-def ensure_script_agent():
-    """Ensure LocalSystem script_agent.py is running; restart if not."""
+def _settings_path():
+    return os.path.join(ROOT, 'agent_settings.json')
+
+
+def load_settings():
+    cfg = {}
+    try:
+        path = _settings_path()
+        if os.path.isfile(path):
+            with open(path, 'r', encoding='utf-8') as f:
+                cfg = json.load(f)
+    except Exception:
+        cfg = {}
+    return cfg or {}
+
+
+def _psutil_process_exists(pid: int) -> bool:
     try:
-        # best-effort: avoid duplicate spawns
         import psutil  # type: ignore
-        for p in psutil.process_iter(['name', 'cmdline']):
+        if pid <= 0:
+            return False
+        p = psutil.Process(pid)
+        return p.is_running() and (p.status() != psutil.STATUS_ZOMBIE)
+    except Exception:
+        return False
+
+
+def _win_process_exists(pid: int) -> bool:
+    try:
+        if pid <= 0:
+            return False
+        PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
+        kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
+        OpenProcess = kernel32.OpenProcess
+        OpenProcess.restype = wintypes.HANDLE
+        OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
+        CloseHandle = kernel32.CloseHandle
+        CloseHandle.argtypes = (wintypes.HANDLE,)
+        h = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
+        if h:
             try:
-                cl = (p.info.get('cmdline') or [])
-                if any('script_agent.py' in (part or '') for part in cl):
-                    return
+                CloseHandle(h)
             except Exception:
                 pass
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def process_exists(pid: int) -> bool:
+    # True if either probe reports the PID alive: psutil first, then the Win32 OpenProcess probe
+    return _psutil_process_exists(pid) or _win_process_exists(pid)
+
+
+def _read_pid_file() -> int:
+    try:
+        if os.path.isfile(PID_FILE):
+            with open(PID_FILE, 'r', encoding='utf-8') as f:
+                s = f.read().strip()
+            return int(s)
     except Exception:
         pass
+    return 0
+
+
+def _write_pid_file(pid: int):
+    try:
+        with open(PID_FILE, 'w', encoding='utf-8') as f:
+            f.write(str(pid))
+    except Exception:
+        pass
+
+
+def _clear_pid_file():
+    try:
+        if os.path.isfile(PID_FILE):
+            os.remove(PID_FILE)
+    except Exception:
+        pass
+
+
+def ensure_script_agent():
+    """Ensure LocalSystem script_agent.py is running; restart if not, with backoff and PID tracking."""
+    global _script_proc, _spawn_backoff, _next_spawn_time, _last_disable_log, _last_fail_log
+
+    # Allow disabling via config (the notice is logged at most once per 60s)
+    try:
+        cfg = load_settings()
+        if not cfg.get('enable_system_script_agent', True):
+            now = time.time()
+            if now - _last_disable_log > 60:
+                log('System script agent disabled by config (enable_system_script_agent=false)')
+                _last_disable_log = now
+            return
+    except Exception:
+        pass
+
+    # If we have a running child process, keep it
+    try:
+        if _script_proc is not None:
+            if _script_proc.poll() is None:
+                return
+            else:
+                # Child exited; clear PID file for safety
+                _clear_pid_file()
+                _script_proc = None
+    except Exception:
+        pass
+
+    # If PID file points to a living process, don't spawn; otherwise drop the stale PID file
+    try:
+        pid = _read_pid_file()
+        if pid and process_exists(pid):
+            return
+        elif pid:
+            _clear_pid_file()
+    except Exception:
+        pass
+
+    # Honor backoff window
+    if time.time() < _next_spawn_time:
+        return
 
     py = venv_python()
     script = os.path.join(ROOT, 'Data', 'Agent', 'script_agent.py')
     try:
-        subprocess.Popen([py, '-W', 'ignore::SyntaxWarning', script], creationflags=(0x08000000 if os.name == 'nt' else 0))
-        log('Launched script_agent.py')
+        proc = subprocess.Popen(
+            [py, '-W', 'ignore::SyntaxWarning', script],
+            creationflags=(0x08000000 if os.name == 'nt' else 0),
+        )
+        _script_proc = proc
+        _write_pid_file(proc.pid)
+        log(f'Launched script_agent.py (pid {proc.pid})')
+        # reset backoff growth, but hold the next spawn one base interval so a child that exits at once is not relaunched every tick
+        _spawn_backoff = 5
+        _next_spawn_time = time.time() + _spawn_backoff
     except Exception as e:
-        log(f'Failed to launch script_agent.py: {e}')
+        msg = f'Failed to launch script_agent.py: {e}'
+        now = time.time()
+        # rate-limit identical failure logs to once per 10s
+        if now - _last_fail_log > 10:
+            log(msg)
+            _last_fail_log = now
+        # exponential backoff: double the wait after each failed launch, capped at _max_backoff
+        _spawn_backoff = min(_spawn_backoff * 2, _max_backoff)
+        _next_spawn_time = time.time() + _spawn_backoff
 
 
 def _enable_privileges():
@@ -166,4 +319,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-
diff --git a/Data/Agent/borealis-agent.py b/Data/Agent/borealis-agent.py
index 583006b..559739e 100644
--- a/Data/Agent/borealis-agent.py
+++ b/Data/Agent/borealis-agent.py
@@ -1065,6 +1065,8 @@ async def _run_powershell_via_user_task(content: str):
             ps = "powershell.exe"
     else:
         return -999, '', 'Windows only'
+    path = None
+    out_path = None
     try:
         temp_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'Temp')
         temp_dir = os.path.abspath(temp_dir)
@@ -1110,6 +1112,18 @@ Get-ScheduledTask -TaskName $task | Out-Null
         return 0, out_data or '', ''
     except Exception as e:
         return -999, '', str(e)
+    finally:
+        # Best-effort cleanup of temp script and output files (both names stay None until assigned)
+        try:
+            if path and os.path.isfile(path):
+                os.remove(path)
+        except Exception:
+            pass
+        try:
+            if out_path and os.path.isfile(out_path):
+                os.remove(out_path)
+        except Exception:
+            pass
 
 # ---------------- Dummy Qt Widget to Prevent Exit ----------------
 class PersistentWindow(QtWidgets.QWidget):
diff --git a/Data/Agent/script_agent.py b/Data/Agent/script_agent.py
index 8cc0a92..080dc18 100644
--- a/Data/Agent/script_agent.py
+++ b/Data/Agent/script_agent.py
@@ -6,12 +6,14 @@ import asyncio
 import json
 import subprocess
 import tempfile
+from typing import Optional
 
 import socketio
 import platform
 import time
 import uuid
 import tempfile
+import contextlib
 
 
 def get_project_root():
@@ -54,6 +56,13 @@ def run_powershell_script_content(content: str):
         return proc.returncode, proc.stdout or "", proc.stderr or ""
     except Exception as e:
         return -1, "", str(e)
+    finally:
+        # Best-effort cleanup of the ephemeral script
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+        except Exception:
+            pass
 
 
 async def main():
@@ -201,10 +210,49 @@ Get-ScheduledTask -TaskName $task | Out-Null
         # Cleanup task (best-effort)
         cleanup_ps = f"try {{ Unregister-ScheduledTask -TaskName '{task_name}' -Confirm:$false }} catch {{}}"
         subprocess.run([ps_exe, '-NoProfile', '-ExecutionPolicy', 'Bypass', '-Command', cleanup_ps], capture_output=True, text=True)
+        # Best-effort removal of temp script and output files
+        try:
+            if os.path.isfile(script_path):
+                os.remove(script_path)
+        except Exception:
+            pass
+        try:
+            if os.path.isfile(out_path):
+                os.remove(out_path)
+        except Exception:
+            pass
         return 0, out_data or '', ''
     except Exception as e:
         return -999, '', str(e)
 
 
 if __name__ == '__main__':
+    # Ensure only a single instance of the script agent runs (Windows-only lock)
+    def _acquire_singleton_lock() -> bool:
+        try:
+            import msvcrt  # type: ignore  # no msvcrt (non-Windows): ImportError reaches the outer handler and the agent runs unlocked
+            lock_dir = os.path.join(get_project_root(), 'Logs', 'Agent')
+            os.makedirs(lock_dir, exist_ok=True)
+            lock_path = os.path.join(lock_dir, 'script_agent.lock')
+            # Keep handle open for process lifetime
+            fh = open(lock_path, 'a')
+            try:
+                # Lock 1 byte non-blocking; released on handle close/process exit
+                msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
+                globals()['_LOCK_FH'] = fh
+                return True
+            except Exception:
+                try:
+                    fh.close()
+                except Exception:
+                    pass
+                return False
+        except Exception:
+            # If we cannot establish a lock, continue (do not prevent agent)
+            return True
+
+    if not _acquire_singleton_lock():
+        print('[ScriptAgent] Another instance is running; exiting.')
+        sys.exit(0)
+
     asyncio.run(main())
diff --git a/Scripts/Examples/Write Canary to Specific User Folder.ps1 b/Scripts/Examples/Write Canary to Specific User Folder.ps1
new file mode 100644
index 0000000..9746f16
--- /dev/null
+++ b/Scripts/Examples/Write Canary to Specific User Folder.ps1
@@ -0,0 +1,8 @@
+# Hard-coded Desktop path of one specific user (not detected dynamically); edit for the target machine
+$desktopPath = "C:\Users\nicole.rappe\Desktop"
+
+# Define the file path relative to the Desktop
+$filePath = Join-Path $desktopPath "Canary.txt"
+
+# Write some content into the file
+"USER Canary is alive." | Out-File -FilePath $filePath -Encoding UTF8