merge: resolve conflicts with main (psutil with fallback, use --dir flag)

2026-03-31 03:29:00 +00:00
parent e9583f92ee 08e40e5396
commit cd16ce19c6
25 changed files with 5350 additions and 75 deletions
--- a/tools/parallel-capacity-test/parallel_capacity_test.py
+++ b/tools/parallel-capacity-test/parallel_capacity_test.py
@@ -19,12 +19,78 @@ from typing import List, Optional

 try:
    import psutil
+
    HAS_PSUTIL = True
 except ImportError:
    HAS_PSUTIL = False
    print("[WARN] psutil not available - resource monitoring will be limited")


+def get_memory_percent() -> float:
+    """Get memory usage percent by reading /proc/meminfo (Linux)"""
+    try:
+        with open("/proc/meminfo", "r") as f:
+            meminfo = f.read()
+        total = 0
+        available = 0
+        for line in meminfo.splitlines():
+            if line.startswith("MemTotal:"):
+                total = int(line.split()[1])
+            elif line.startswith("MemAvailable:"):
+                available = int(line.split()[1])
+                break
+        if total > 0:
+            used = total - available
+            return (used / total) * 100
+    except (FileNotFoundError, PermissionError, ValueError):
+        pass
+    return 0.0
+
+
+def count_opencode_processes() -> int:
+    """Count opencode processes using pgrep or /proc scanning"""
+    try:
+        result = subprocess.run(
+            ["pgrep", "-c", "-x", "opencode"], capture_output=True, text=True, timeout=5
+        )
+        if result.returncode == 0:
+            return int(result.stdout.strip())
+    except (subprocess.TimeoutExpired, ValueError, subprocess.SubprocessError):
+        pass
+    try:
+        count = 0
+        for pid_dir in os.listdir("/proc"):
+            if not pid_dir.isdigit():
+                continue
+            try:
+                with open(f"/proc/{pid_dir}/comm", "r") as f:
+                    if "opencode" in f.read().lower():
+                        count += 1
+            except (PermissionError, FileNotFoundError):
+                continue
+        return count
+    except FileNotFoundError:
+        return 0
+    return 0
+
+
+def get_cpu_percent() -> float:
+    """Get CPU usage by reading /proc/stat"""
+    try:
+        with open("/proc/stat", "r") as f:
+            line = f.readline()
+        parts = line.split()
+        if parts[0] == "cpu":
+            values = [int(x) for x in parts[1:8]]
+            idle = values[3]
+            total = sum(values)
+            if total > 0:
+                return ((total - idle) / total) * 100
+    except (FileNotFoundError, PermissionError, ValueError, IndexError):
+        pass
+    return 0.0
+
+
@dataclass
 class AgentResult:
    agent_id: int
@@ -95,8 +161,13 @@ class ResourceMonitor:
    def _collect_sample(self) -> ResourceSample:
        timestamp = time.time()
        try:
-            opencode_procs = len([p for p in psutil.process_iter(['name']) 
-                                 if 'opencode' in p.info['name'].lower()])
+            opencode_procs = len(
+                [
+                    p
+                    for p in psutil.process_iter(["name"])
+                    if "opencode" in p.info["name"].lower()
+                ]
+            )
        except Exception:
            opencode_procs = 0

@@ -112,7 +183,7 @@ class ResourceMonitor:
            cpu_percent=cpu_percent,
            memory_percent=memory_percent,
            opencode_processes=opencode_procs,
-            agent_count=self._current_agent_count
+            agent_count=self._current_agent_count,
        )


@@ -135,36 +206,35 @@ class ParallelCapacityTester:

        try:
            result = subprocess.run(
-                ['opencode', 'run', task, '--dir', workdir],
+                ["opencode", "run", task, "--dir", workdir],
                capture_output=True,
                text=True,
-                timeout=self.timeout
+                timeout=self.timeout,
            )
            duration = time.time() - start_time
            output = result.stdout + result.stderr
-            success = 'PARALLEL_TEST_OK' in output
+            success = "PARALLEL_TEST_OK" in output

            return AgentResult(
                agent_id=agent_id,
                duration=duration,
-                status='success' if success else 'failed',
+                status="success" if success else "failed",
                return_code=result.returncode,
-                output=output[:500]
+                output=output[:500],
            )
        except subprocess.TimeoutExpired:
            return AgentResult(
                agent_id=agent_id,
                duration=self.timeout,
-                status='timeout',
-                return_code=-1
+                status="timeout",
+                return_code=-1,
            )
        except Exception as e:
            return AgentResult(
                agent_id=agent_id,
                duration=time.time() - start_time,
-                status='failed',
+                status="failed",
                return_code=-1,
-                error=str(e)
            )

    def _run_parallel_agents(self, num_agents: int) -> TestRun:
@@ -194,7 +264,7 @@ class ParallelCapacityTester:
            elapsed = int(time.time() - start_time)
            all_done = all(not t.is_alive() for t in threads)

-        subprocess.run(['pkill', '-f', 'opencode run'], capture_output=True)
+        subprocess.run(["pkill", "-f", "opencode run"], capture_output=True)

        for t in threads:
            t.join(timeout=5)
@@ -202,9 +272,9 @@ class ParallelCapacityTester:
        resource_samples = self.monitor.stop()
        total_duration = time.time() - start_time

-        success_count = sum(1 for r in results if r.status == 'success')
-        failed_count = sum(1 for r in results if r.status == 'failed')
-        timeout_count = sum(1 for r in results if r.status == 'timeout')
+        success_count = sum(1 for r in results if r.status == "success")
+        failed_count = sum(1 for r in results if r.status == "failed")
+        timeout_count = sum(1 for r in results if r.status == "timeout")

        durations = [r.duration for r in results]
        avg_duration = statistics.mean(durations) if durations else 0
@@ -221,7 +291,9 @@ class ParallelCapacityTester:
        else:
            peak_cpu = avg_cpu = peak_mem = avg_mem = peak_procs = 0

-        print(f"[RESULT] {num_agents} agents: {success_count} success, {failed_count} failed, {timeout_count} timeout")
+        print(
+            f"[RESULT] {num_agents} agents: {success_count} success, {failed_count} failed, {timeout_count} timeout"
+        )

        return TestRun(
            agent_count=num_agents,
@@ -237,11 +309,12 @@ class ParallelCapacityTester:
            avg_cpu_percent=avg_cpu,
            peak_memory_percent=peak_mem,
            avg_memory_percent=avg_mem,
-            peak_opencode_procs=peak_procs
+            peak_opencode_procs=peak_procs,
        )

-    def run_capacity_test(self, max_agents: int = 10, step: int = 1,
-                         quick: bool = False) -> List[TestRun]:
+    def run_capacity_test(
+        self, max_agents: int = 10, step: int = 1, quick: bool = False
+    ) -> List[TestRun]:
        if quick:
            agent_counts = [1, 2, 3, 5, 8]
        else:
@@ -253,7 +326,7 @@ class ParallelCapacityTester:
        self.results = []

        for count in agent_counts:
-            subprocess.run(['pkill', '-f', 'opencode run'], capture_output=True)
+            subprocess.run(["pkill", "-f", "opencode run"], capture_output=True)
            time.sleep(2)
            result = self._run_parallel_agents(count)
            self.results.append(result)
@@ -266,21 +339,25 @@ class ParallelCapacityTester:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        json_file = output_path / f"results_{timestamp}.json"
-        with open(json_file, 'w') as f:
+        with open(json_file, "w") as f:
            data = [asdict(run) for run in self.results]
            json.dump(data, f, indent=2)
        print(f"[INFO] Results saved to: {json_file}")

        csv_file = output_path / f"summary_{timestamp}.csv"
-        with open(csv_file, 'w') as f:
-            f.write("agents,duration,success,failed,timeout,avg_response,stddev,min_response,max_response,peak_cpu,avg_cpu,peak_mem,avg_mem,peak_procs\n")
+        with open(csv_file, "w") as f:
+            f.write(
+                "agents,duration,success,failed,timeout,avg_response,stddev,min_response,max_response,peak_cpu,avg_cpu,peak_mem,avg_mem,peak_procs\n"
+            )
            for run in self.results:
-                f.write(f"{run.agent_count},{run.total_duration:.2f},{run.success_count},"
-                       f"{run.failed_count},{run.timeout_count},{run.avg_response_time:.2f},"
-                       f"{run.stddev_response_time:.2f},{run.min_response_time:.2f},"
-                       f"{run.max_response_time:.2f},{run.peak_cpu_percent:.1f},"
-                       f"{run.avg_cpu_percent:.1f},{run.peak_memory_percent:.1f},"
-                       f"{run.avg_memory_percent:.1f},{run.peak_opencode_procs}\n")
+                f.write(
+                    f"{run.agent_count},{run.total_duration:.2f},{run.success_count},"
+                    f"{run.failed_count},{run.timeout_count},{run.avg_response_time:.2f},"
+                    f"{run.stddev_response_time:.2f},{run.min_response_time:.2f},"
+                    f"{run.max_response_time:.2f},{run.peak_cpu_percent:.1f},"
+                    f"{run.avg_cpu_percent:.1f},{run.peak_memory_percent:.1f},"
+                    f"{run.avg_memory_percent:.1f},{run.peak_opencode_procs}\n"
+                )
        print(f"[INFO] Summary saved to: {csv_file}")

        report_file = output_path / f"report_{timestamp}.md"
@@ -290,44 +367,62 @@ class ParallelCapacityTester:
        return str(json_file), str(csv_file), str(report_file)

    def _generate_markdown_report(self, output_file: Path):
-        with open(output_file, 'w') as f:
+        with open(output_file, "w") as f:
            f.write("# Parallel Capacity Test Report\n\n")
-            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(
+                f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+            )
            f.write("## Summary\n\n")
-            f.write("| Agents | Duration | Success | Failed | Timeout | Avg Response | Peak CPU | Peak Mem |\n")
-            f.write("|--------|----------|---------|--------|---------|--------------|----------|----------|\n")
+            f.write(
+                "| Agents | Duration | Success | Failed | Timeout | Avg Response | Peak CPU | Peak Mem |\n"
+            )
+            f.write(
+                "|--------|----------|---------|--------|---------|--------------|----------|----------|\n"
+            )
            for run in self.results:
-                f.write(f"| {run.agent_count} | {run.total_duration:.1f}s | "
-                       f"{run.success_count} | {run.failed_count} | "
-                       f"{run.timeout_count} | {run.avg_response_time:.1f}s | "
-                       f"{run.peak_cpu_percent:.1f}% | {run.peak_memory_percent:.1f}% |\n")
+                f.write(
+                    f"| {run.agent_count} | {run.total_duration:.1f}s | "
+                    f"{run.success_count} | {run.failed_count} | "
+                    f"{run.timeout_count} | {run.avg_response_time:.1f}s | "
+                    f"{run.peak_cpu_percent:.1f}% | {run.peak_memory_percent:.1f}% |\n"
+                )
            f.write("\n## Key Findings\n\n")
-            successful_runs = [r for r in self.results if r.success_count == r.agent_count]
+            successful_runs = [
+                r for r in self.results if r.success_count == r.agent_count
+            ]
            optimal = max(successful_runs, key=lambda r: r.agent_count, default=None)
            if optimal:
                f.write(f"### Optimal Configuration\n")
-                f.write(f"- **{optimal.agent_count} agents** achieved perfect success rate\n")
-                f.write(f"  - Average response time: {optimal.avg_response_time:.1f}s\n")
+                f.write(
+                    f"- **{optimal.agent_count} agents** achieved perfect success rate\n"
+                )
+                f.write(
+                    f"  - Average response time: {optimal.avg_response_time:.1f}s\n"
+                )
                f.write(f"  - Peak CPU: {optimal.peak_cpu_percent:.1f}%\n")
                f.write(f"  - Peak Memory: {optimal.peak_memory_percent:.1f}%\n\n")
            f.write("## Recommendations\n\n")
            if optimal:
-                f.write(f"1. **Recommended max agents:** {optimal.agent_count} for stable operation\n")
+                f.write(
+                    f"1. **Recommended max agents:** {optimal.agent_count} for stable operation\n"
+                )
            f.write("2. **Monitor closely:** 5+ agents\n")
-            f.write("3. **Implement circuit breaker** when failure rate exceeds threshold\n")
+            f.write(
+                "3. **Implement circuit breaker** when failure rate exceeds threshold\n"
+            )


 def main():
-    parser = argparse.ArgumentParser(description='Parallel Capacity Test Tool')
-    parser.add_argument('--agents', '-n', type=int, default=10)
-    parser.add_argument('--timeout', '-t', type=int, default=120)
-    parser.add_argument('--step', '-s', type=int, default=1)
-    parser.add_argument('--quick', '-q', action='store_true')
-    parser.add_argument('--output', '-o', type=str, default=None)
+    parser = argparse.ArgumentParser(description="Parallel Capacity Test Tool")
+    parser.add_argument("--agents", "-n", type=int, default=10)
+    parser.add_argument("--timeout", "-t", type=int, default=120)
+    parser.add_argument("--step", "-s", type=int, default=1)
+    parser.add_argument("--quick", "-q", action="store_true")
+    parser.add_argument("--output", "-o", type=str, default=None)
    args = parser.parse_args()

    script_dir = Path(__file__).parent
-    output_dir = args.output or str(script_dir / 'results')
+    output_dir = args.output or str(script_dir / "results")

    print("=" * 60)
    print("Parallel Capacity Test Tool for Hermes/OpenCode")
@@ -339,7 +434,9 @@ def main():
    tester = ParallelCapacityTester(timeout=args.timeout)

    try:
-        tester.run_capacity_test(max_agents=args.agents, step=args.step, quick=args.quick)
+        tester.run_capacity_test(
+            max_agents=args.agents, step=args.step, quick=args.quick
+        )
        json_file, csv_file, report_file = tester.save_results(output_dir)
        print("\n" + "=" * 60)
        print("TEST COMPLETE")
@@ -352,5 +449,5 @@ def main():
        sys.exit(1)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()