Files
kugetsu/skills/kugetsu/scripts/kugetsu-queue-daemon.sh
shokollm 51ec844365 fix(queue-daemon): implement timeout handling for long-running tasks
Check notified_at timestamp in check_task_completion() and mark tasks
as error if they exceed TASK_TIMEOUT_HOURS (defaults to 1 hour).

When timeout is detected:
- Kill the task process (PID) if running
- Stop the opencode session if exists
- Mark queue item as error state

Fixes #166
2026-04-07 12:53:35 +00:00

180 lines
6.8 KiB
Bash

#!/bin/bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/kugetsu-config.sh"
source "$SCRIPT_DIR/kugetsu-index.sh"
source "$SCRIPT_DIR/kugetsu-worktree.sh"
source "$SCRIPT_DIR/kugetsu-log.sh"
load_agent_env "pm-agent"
acquire_lock() {
local issue_ref="$1"
local lock_file="$QUEUE_DIR/locks/$(echo "$issue_ref" | sed 's/[\/:]/-/g' | sed 's/#/-/').lock"
mkdir -p "$(dirname "$lock_file")"
if [ -f "$lock_file" ]; then
local pid=$(cat "$lock_file" 2>/dev/null || echo "")
if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
return 1
fi
rm -f "$lock_file"
fi
echo $$ > "$lock_file"
return 0
}
release_lock() {
local issue_ref="$1"
local lock_file="$QUEUE_DIR/locks/$(echo "$issue_ref" | sed 's/[\/:]/-/g' | sed 's/#/-/').lock"
rm -f "$lock_file"
}
check_task_completion() {
local item="$1"
local queue_id=$(basename "$item" .json)
local state=$(python3 -c "import json; print(json.load(open('$item')).get('state', ''))" 2>/dev/null)
[ "$state" = "notified" ] || return 0
local session_id=$(python3 -c "import json; print(json.load(open('$item')).get('opencode_session_id', ''))" 2>/dev/null)
local issue_ref=$(python3 -c "import json; print(json.load(open('$item')).get('issue_ref', ''))" 2>/dev/null)
local pid=$(python3 -c "import json; print(json.load(open('$item')).get('pid', ''))" 2>/dev/null)
local notified_at=$(python3 -c "import json; print(json.load(open('$item')).get('notified_at', ''))" 2>/dev/null)
local timed_out=false
if [ -n "$notified_at" ]; then
local notified_epoch=$(date -d "$notified_at" +%s 2>/dev/null || echo "0")
local now_epoch=$(date +%s)
local hours_elapsed=$(( (now_epoch - notified_epoch) / 3600 ))
if [ "$hours_elapsed" -ge "${TASK_TIMEOUT_HOURS:-1}" ]; then
timed_out=true
echo "Task $queue_id ($issue_ref) timed out after ${hours_elapsed}h"
fi
fi
if [ "$timed_out" = true ]; then
if [ -n "$pid" ] && [ "$pid" != "None" ]; then
kill "$pid" 2>/dev/null || true
fi
if [ -n "$session_id" ]; then
opencode session stop "$session_id" 2>/dev/null || true
fi
update_queue_item_state "$queue_id" "error"
echo "Task $queue_id ($issue_ref) marked error — timeout after ${hours_elapsed}h"
release_lock "$issue_ref"
return
fi
if [ -n "$pid" ] && [ "$pid" != "None" ]; then
if ! kill -0 "$pid" 2>/dev/null; then
local worktree_path=$(issue_ref_to_worktree_path "$issue_ref" "$WORKTREES_DIR")
local has_commits=false
if [ -d "$worktree_path" ] && [ -d "$worktree_path/.git" ]; then
if [ -n "$(git -C "$worktree_path" log --oneline origin/main..HEAD 2>/dev/null)" ]; then
has_commits=true
fi
fi
if [ "$has_commits" = true ]; then
update_queue_item_state "$queue_id" "completed"
echo "Task $queue_id ($issue_ref) completed — new commits found"
else
update_queue_item_state "$queue_id" "error"
echo "Task $queue_id ($issue_ref) marked error — no commits found after session ended"
fi
release_lock "$issue_ref"
fi
else
if [ -n "$session_id" ] && ! opencode session list 2>/dev/null | grep -q "$session_id"; then
local worktree_path=$(issue_ref_to_worktree_path "$issue_ref" "$WORKTREES_DIR")
local has_commits=false
if [ -d "$worktree_path" ] && [ -d "$worktree_path/.git" ]; then
if [ -n "$(git -C "$worktree_path" log --oneline origin/main..HEAD 2>/dev/null)" ]; then
has_commits=true
fi
fi
if [ "$has_commits" = true ]; then
update_queue_item_state "$queue_id" "completed"
echo "Task $queue_id ($issue_ref) completed — new commits found"
else
update_queue_item_state "$queue_id" "error"
echo "Task $queue_id ($issue_ref) marked error — no commits found after session ended"
fi
release_lock "$issue_ref"
fi
fi
}
get_session_id_for_issue() {
local issue_ref="$1"
local session_file=$(issue_ref_to_filename "$issue_ref")
local session_path="$SESSIONS_DIR/$session_file"
if [ -f "$session_path" ]; then
python3 -c "import json; print(json.load(open('$session_path')).get('opencode_session_id', ''))" 2>/dev/null || echo ""
else
echo ""
fi
}
process_task() {
local item="$1"
local queue_id=$(basename "$item" .json)
local issue_ref=$(python3 -c "import json; print(json.load(open('$item')).get('issue_ref', ''))" 2>/dev/null)
local message=$(python3 -c "import json; print(json.load(open('$item')).get('message', ''))" 2>/dev/null)
if ! acquire_lock "$issue_ref"; then
echo "Task $queue_id ($issue_ref) skipped — another process is handling it"
return
fi
source "$SCRIPT_DIR/kugetsu-session.sh"
if worktree_exists "$issue_ref" "$WORKTREES_DIR" || [ -f "$SESSIONS_DIR/$(issue_ref_to_filename "$issue_ref").json" ]; then
log_file="$LOGS_DIR/delegate-$(date +%s).log"
if cmd_continue "$issue_ref" "$message" >> "$log_file" 2>&1; then
sleep 1
local session_id=$(get_session_id_for_issue "$issue_ref")
update_queue_item_state "$queue_id" "notified" "$session_id" ""
echo "Task $queue_id continued for $issue_ref"
else
update_queue_item_state "$queue_id" "error"
echo "Task $queue_id ($issue_ref) failed to continue"
fi
else
log_file="$LOGS_DIR/delegate-$(date +%s).log"
if cmd_start "$issue_ref" "$message" >> "$log_file" 2>&1; then
sleep 1
local session_id=$(get_session_id_for_issue "$issue_ref")
update_queue_item_state "$queue_id" "notified" "$session_id" ""
echo "Task $queue_id started for $issue_ref"
else
update_queue_item_state "$queue_id" "error"
echo "Task $queue_id ($issue_ref) failed to start"
fi
fi
release_lock "$issue_ref"
}
while true; do
if [ -d "$QUEUE_ITEMS_DIR" ]; then
for item in "$QUEUE_ITEMS_DIR"/*.json; do
[ -f "$item" ] || continue
check_task_completion "$item"
done
for item in "$QUEUE_ITEMS_DIR"/*.json; do
[ -f "$item" ] || continue
state=$(python3 -c "import json; print(json.load(open('$item')).get('state', ''))" 2>/dev/null)
if [ "$state" = "pending" ]; then
process_task "$item"
fi
done
fi
sleep "${QUEUE_DAEMON_INTERVAL_MINUTES:-5}m"
done