From dd0cdfdc130c37dd96c41aaa007d7bf90af9707a Mon Sep 17 00:00:00 2001 From: LLM Harness Date: Sat, 25 Apr 2026 23:15:58 +0200 Subject: [PATCH] Add: LLM harness MANIFEST, CLAUDE entry point, LOADER, and templates --- CLAUDE.md | 141 ++++++++++++++++++ LOADER.sh | 111 +++++++++++++++ MANIFEST.md | 145 +++++++++++++++++++ templates/MODULE-STRUCTURE.md.template | 190 +++++++++++++++++++++++++ templates/PROJECT-HARNESS.md.template | 139 ++++++++++++++++++ 5 files changed, 726 insertions(+) create mode 100644 CLAUDE.md create mode 100644 LOADER.sh create mode 100644 MANIFEST.md create mode 100644 templates/MODULE-STRUCTURE.md.template create mode 100644 templates/PROJECT-HARNESS.md.template diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6bff7bf --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,141 @@ +# CLAUDE.md - Claude Code Agent Harness + +**Entry point for Claude Code within the LLM Harness system.** + +--- + +## Who You Are (from agent's perspective) + +You are Claude Code running within a task-specific harness. +- You receive minimal, targeted context +- You execute ONE specific task +- You report results back to Bumblebee +- You iterate based on feedback + +--- + +## How to Run + +```bash +# Via LOADER +source LOADER.sh +brief_agent claude-code "architecture" "gravl" + +# Or direct CLI +claude-code \ + --task "architecture" \ + --project "gravl" \ + --model "opus" +``` + +--- + +## You Will Receive + +**The ONLY context loaded for your task:** + +1. **MANIFEST.md** (which task, which model) +2. **gravl/HARNESS.md** (project structure) +3. **gravl/modules/design/** (design-specific specs) +4. 
**NOT loaded:** + - Implementation details + - Testing modules + - Other projects + - Full codebase + +--- + +## Your Task Template + +You will be briefed like: + +``` +Task: Design Database Schema +Project: gravl +Module: design + +Context Loaded: +- ~/workspace/gravl/HARNESS.md +- ~/workspace/gravl/modules/design/SPEC.md +- ~/workspace/gravl/modules/design/CONSTRAINTS.md + +Output Expected: +- ~/workspace/gravl/modules/design/SCHEMA.md +- ~/workspace/gravl/modules/design/ER-DIAGRAM.md + +Quality Gate: +- All constraints satisfied +- Performance targets met (see CONSTRAINTS.md) +- Ready for implementation review +``` + +--- + +## Your Output Format + +**Always structure as:** + +```markdown +# [Task Name] + +## Overview +[Brief summary of what was designed] + +## Key Decisions +- Decision 1: Rationale +- Decision 2: Rationale + +## Deliverable +[The actual design/spec/PRD] + +## Validation +- ✓ Constraint 1: [how met] +- ✓ Constraint 2: [how met] +- ? Open question: [if any] + +## Next Steps +[What should happen next - implementation? review?] +``` + +--- + +## Working with Bumblebee + +**Bumblebee will:** +1. Load your context (minimally) +2. Brief you with this harness +3. Receive your output +4. Review it CRITICALLY +5. Give feedback if not approved +6. Loop until you get it right + +**You should:** +- Follow the task precisely +- Output in the template above +- Ask clarifying questions if ambiguous +- Validate against constraints +- Be ready to iterate + +--- + +## Model Assignment + +This harness will assign you to different models based on task: + +- **Opus**: Architecture, analysis, complex problems +- **Sonnet**: Implementation, straightforward tasks +- **Haiku**: Simple tasks, low-context work + +(See MANIFEST.md for full mapping) + +--- + +## Remember + +You're not working alone. 
You're part of a system where: +- Bumblebee plans and critiques +- You implement to specs +- Feedback loops improve quality +- Each task is focused and minimal + +Do one thing excellently, not many things poorly. diff --git a/LOADER.sh b/LOADER.sh new file mode 100644 index 0000000..fbd6db5 --- /dev/null +++ b/LOADER.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# LLM Harness Universal Loader +# Loads minimal necessary context for agent tasks +# Follows PSR-4 / module loading principle + +set -e + +HARNESS_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MANIFEST="${HARNESS_ROOT}/MANIFEST.md" + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# ============================================================================ +# CORE FUNCTIONS +# ============================================================================ + +get_agent_for_task() { + local task="$1" + + if [[ ! -f "$MANIFEST" ]]; then + echo -e "${RED}Error: MANIFEST.md not found${NC}" + return 1 + fi + + # Parse JSON from MANIFEST (simplified - would use jq in production) + echo "Task: $task → Check MANIFEST.md for assignment" +} + +get_model_for_task() { + local task="$1" + echo "Model assignment from MANIFEST.md" +} + +load_project_harness() { + local project="$1" + local harness_file="${HARNESS_ROOT}/../${project}/HARNESS.md" + + if [[ ! -f "$harness_file" ]]; then + echo -e "${RED}Error: ${project}/HARNESS.md not found${NC}" + return 1 + fi + + echo -e "${GREEN}✓ Loaded: ${project}/HARNESS.md${NC}" + cat "$harness_file" | head -50 +} + +load_module() { + local project="$1" + local module="$2" + local module_path="${HARNESS_ROOT}/../${project}/modules/${module}" + + if [[ ! 
-d "$module_path" ]]; then + echo -e "${RED}Error: Module ${project}/modules/${module} not found${NC}" + return 1 + fi + + echo -e "${GREEN}✓ Loading module: ${project}/${module}${NC}" + + # Load module files in order + [[ -f "${module_path}/APPROACH.md" ]] && echo "• APPROACH.md" + [[ -f "${module_path}/CHECKLIST.md" ]] && echo "• CHECKLIST.md" + [[ -f "${module_path}/SPEC.md" ]] && echo "• SPEC.md" + + echo "" +} + +brief_agent() { + local agent="$1" + local task="$2" + local project="$3" + + echo -e "${YELLOW}=== Agent Brief ===${NC}" + echo "Agent: $agent" + echo "Task: $task" + echo "Project: $project" + echo "" + echo "Loading context..." + + # Load agent entry point + local agent_file="${HARNESS_ROOT}/${agent^^}.md" + if [[ -f "$agent_file" ]]; then + echo -e "${GREEN}✓ Agent instructions loaded${NC}" + fi +} + +# ============================================================================ +# USAGE +# ============================================================================ + +if [[ $# -eq 0 ]]; then + echo "LLM Harness Loader" + echo "" + echo "Usage:" + echo " source LOADER.sh" + echo " get_agent_for_task " + echo " load_project_harness " + echo " load_module " + echo " brief_agent " + echo "" + echo "Example:" + echo " brief_agent claude-code architecture gravl" + exit 0 +fi + +# Execute command +"$@" diff --git a/MANIFEST.md b/MANIFEST.md new file mode 100644 index 0000000..08f90d5 --- /dev/null +++ b/MANIFEST.md @@ -0,0 +1,145 @@ +# LLM HARNESS MANIFEST + +**Universal task router for all LLM tools and models.** + +--- + +## Model/Agent Assignment Matrix + +```json +{ + "tasks": { + "architecture": { + "model": "opus", + "agent": "claude-code", + "use_case": "System design, planning, long-context analysis", + "token_budget": 100000 + }, + "implementation": { + "model": "sonnet", + "agent": "claude-code", + "use_case": "Code generation, feature building", + "token_budget": 50000 + }, + "code-review": { + "model": "opus", + "agent": "cursor", + 
"use_case": "Critical code review, quality gates", + "token_budget": 50000 + }, + "testing": { + "model": "sonnet", + "agent": "claude-code", + "use_case": "Test generation, validation", + "token_budget": 30000 + }, + "research": { + "model": "sonnet", + "agent": "gemini", + "use_case": "Topic research, knowledge synthesis", + "token_budget": 40000 + }, + "documentation": { + "model": "sonnet", + "agent": "claude-code", + "use_case": "Writing docs, guides, specifications", + "token_budget": 30000 + }, + "analysis": { + "model": "opus", + "agent": "claude-code", + "use_case": "Deep analysis, trade-off evaluation", + "token_budget": 60000 + } + }, + + "agents": { + "claude-code": { + "entry_point": "CLAUDE.md", + "cli": "claude-code", + "models": ["opus", "sonnet", "haiku"], + "strengths": ["architecture", "implementation", "detailed code work"] + }, + "cursor": { + "entry_point": "CURSOR.md", + "cli": "cursor", + "models": ["opus", "sonnet"], + "strengths": ["code review", "refactoring", "IDE integration"] + }, + "gemini": { + "entry_point": "GEMINI.md", + "cli": "gemini", + "models": ["gemini-3-pro", "gemini-3-flash"], + "strengths": ["research", "multimodal", "web search"] + } + }, + + "fallback": { + "if_agent_unavailable": "use_next_best_agent", + "if_model_unavailable": "downgrade_to_sonnet" + } +} +``` + +--- + +## Usage + +```bash +# Load manifest +source LOADER.sh + +# Get task assignment +get_agent_for_task "architecture" +# Output: opus + claude-code + +# Get module path for project +get_module_path "gravl" "design" +# Output: ~/workspace/gravl/modules/design +``` + +--- + +## Project Harness Structure + +Each project (gravl, job-portal, etc) should have: + +``` +~/workspace/[project]/ +├── HARNESS.md ← Project-specific structure +├── modules/ +│ ├── design/ +│ │ ├── APPROACH.md +│ │ └── specs/ +│ ├── implementation/ +│ ├── testing/ +│ └── docs/ +└── [project files] +``` + +--- + +## Minimal Data Feeding Principle + +When instructing an agent: +1. 
Load MANIFEST.md (which model/agent) +2. Load PROJECT/HARNESS.md (project structure) +3. Load ONLY relevant modules (task-specific data) +4. DON'T load: + - Other projects + - Unrelated modules + - Implementation details if doing architecture + - Full specs if doing design only + +--- + +## RAPL Loop Integration + +When task fails validation: +1. Capture error +2. Replan (fetch failing data) +3. Adjust approach +4. Log learning (store in `learnings/`) +5. Retry with new strategy + +Document: `~/workspace/learnings/YYYY-MM-DD.md` diff --git a/templates/MODULE-STRUCTURE.md.template b/templates/MODULE-STRUCTURE.md.template new file mode 100644 index 0000000..f2c5d91 --- /dev/null +++ b/templates/MODULE-STRUCTURE.md.template @@ -0,0 +1,190 @@ +# MODULE STRUCTURE TEMPLATE + +**Standard structure for any module within a project harness.** + +--- + +## Module Anatomy + +``` +modules/[module-name]/ +├── APPROACH.md ← How to execute this phase +├── CHECKLIST.md ← What to verify +├── SPEC.md ← What to build (input for agent) +└── [outputs]/ ← Generated by agent + ├── DESIGN.md + ├── SCHEMA.md + └── ... +``` + +--- + +## APPROACH.md + +**Purpose:** Tell agents HOW to execute this phase. + +```markdown +# [Phase Name] Approach + +## Goal +[What should be accomplished] + +## Process +1. [Step 1] +2. [Step 2] +3. [Step 3] + +## Constraints to Respect +- [Constraint 1] +- [Constraint 2] + +## Success Criteria +- [Criterion 1] +- [Criterion 2] + +## Output Files +- [file1.md] +- [file2.md] +``` + +--- + +## CHECKLIST.md + +**Purpose:** Verify the phase completed correctly. 
+ +```markdown +# [Phase Name] Checklist + +## Functional +- [ ] All requirements from SPEC.md are met +- [ ] All constraints are satisfied +- [ ] No blocking open questions + +## Quality +- [ ] Follows code/design standards +- [ ] Performance targets met +- [ ] Security considerations addressed + +## Documentation +- [ ] All decisions documented +- [ ] Rationales explained +- [ ] Ambiguities clarified + +## Ready for Next Phase? +- [ ] All checkboxes above passed +- [ ] No blockers remain +``` + +--- + +## SPEC.md (Example) + +**Purpose:** Define requirements for the agent. + +```markdown +# [Module] Specification + +## Overview +[What problem does this solve?] + +## Requirements +1. [Functional requirement 1] +2. [Functional requirement 2] +3. [Non-functional requirement] + +## Constraints (from project CONSTRAINTS.md) +- [Constraint 1] +- [Constraint 2] + +## Acceptance Criteria +- [Criterion 1] +- [Criterion 2] + +## Related Files +- Architecture: [link] +- Database: [link] +- Testing: [link] +``` + +--- + +## Module Loading Example + +When briefing Claude Code for the "design" phase: + +```bash +# Load these files (and ONLY these): +- ~/workspace/[project]/HARNESS.md (project structure) +- ~/workspace/[project]/modules/design/APPROACH.md (how to execute) +- ~/workspace/[project]/modules/design/SPEC.md (what to build) +- ~/workspace/[project]/modules/design/CHECKLIST.md (success criteria) +- ~/workspace/[project]/modules/design/CONSTRAINTS.md (from parent) + +# Output expected: +- ~/workspace/[project]/modules/design/ARCHITECTURE.md +- ~/workspace/[project]/modules/design/ER-DIAGRAM.md +- ~/workspace/[project]/modules/design/DECISION_LOG.md +``` + +--- + +## Minimal Data Principle + +✅ **DO load:** +- Module specs for THIS phase +- Related specs from same project +- Constraints that affect THIS phase + +❌ **DON'T load:** +- Future phase modules +- Implementation code +- Test files +- Other projects +- Unused historical context + +**Goal:** Agent has EXACTLY what it 
needs, nothing more. + +--- + +## Example: Design Module + +``` +modules/design/ +├── APPROACH.md ← "Design system architecture" +├── CHECKLIST.md ← "All constraints verified?" +├── SPEC.md ← "Build database schema + API spec" +├── CONSTRAINTS.md ← "Performance, security, scaling" +├── EXISTING_ARCHITECTURE.md ← "Current system overview" +└── [outputs]/ + ├── SCHEMA.md ← Agent creates this + ├── API_SPEC.md ← Agent creates this + ├── DECISIONS.md ← Agent creates this + └── ER_DIAGRAM.md ← Agent creates this +``` + +When briefing the agent for design: +``` +Load: +- APPROACH.md +- SPEC.md +- CONSTRAINTS.md +- EXISTING_ARCHITECTURE.md + +Don't load: +- implementation/ +- testing/ +- docs/ +``` + +--- + +## Best Practices + +1. **Keep specs focused** — One module = one task +2. **Document decisions** — Why, not just what +3. **Version constraints** — Tag major constraint changes +4. **Link related modules** — Show dependencies +5. **Checkpoint at boundaries** — Save state between phases + +This keeps context minimal and agents focused. 
diff --git a/templates/PROJECT-HARNESS.md.template b/templates/PROJECT-HARNESS.md.template new file mode 100644 index 0000000..47e5543 --- /dev/null +++ b/templates/PROJECT-HARNESS.md.template @@ -0,0 +1,139 @@ +# PROJECT HARNESS TEMPLATE + +**Copy this to your project and customize.** + +--- + +## [Project Name] Structure + +``` +~/workspace/[project]/ +├── HARNESS.md ← This file (project-specific) +├── modules/ +│ ├── design/ ← Architecture & planning phase +│ │ ├── SPEC.md ← What to build +│ │ ├── CONSTRAINTS.md ← Non-functional requirements +│ │ ├── SCHEMA.md ← Data model (if applicable) +│ │ └── ARCHITECTURE.md ← System design +│ │ +│ ├── implementation/ ← Building phase +│ │ ├── GUIDE.md ← How to build +│ │ ├── CHECKLIST.md ← What to verify +│ │ └── CODE_STANDARDS.md ← Style & patterns +│ │ +│ ├── testing/ ← QA phase +│ │ ├── TEST_PLAN.md ← What to test +│ │ ├── CASES.md ← Test cases +│ │ └── COVERAGE.md ← Coverage targets +│ │ +│ └── docs/ ← Documentation phase +│ ├── USER_GUIDE.md ← For end users +│ ├── API.md ← For developers +│ └── DEPLOYMENT.md ← For ops +│ +├── [project files/directories] +└── README.md ← Project overview +``` + +--- + +## Phases + +### Phase 1: Design +- **Agent:** claude-code (opus model) +- **Module:** design/ +- **Input:** SPEC.md, CONSTRAINTS.md +- **Output:** ARCHITECTURE.md, SCHEMA.md +- **Validation:** All constraints addressed + +### Phase 2: Implementation +- **Agent:** claude-code (sonnet model) +- **Module:** implementation/ +- **Input:** ARCHITECTURE.md from Phase 1 +- **Output:** Working code +- **Validation:** Passes CODE_STANDARDS.md + +### Phase 3: Review +- **Agent:** cursor (opus model) +- **Module:** [code to review] +- **Input:** Implementation from Phase 2 +- **Output:** Review report, approved/rejected +- **Validation:** Critical feedback addressed + +### Phase 4: Testing +- **Agent:** claude-code (sonnet model) +- **Module:** testing/ +- **Input:** TEST_PLAN.md +- **Output:** Test code, coverage report +- 
**Validation:** Coverage targets met + +### Phase 5: Documentation +- **Agent:** claude-code (sonnet model) +- **Module:** docs/ +- **Input:** Code + ARCHITECTURE.md +- **Output:** USER_GUIDE.md, API.md, DEPLOYMENT.md +- **Validation:** Complete, accurate, usable + +--- + +## Key Constraints + +**[List your project's non-functional requirements]** + +- Performance: [e.g., API responses <100ms p95] +- Scalability: [e.g., support 10k concurrent users] +- Security: [e.g., encryption at rest/transit] +- Availability: [e.g., 99.9% uptime SLA] +- Compatibility: [e.g., Node.js 20+, PostgreSQL 14+] + +--- + +## Technology Stack + +**Backend:** [e.g., Node.js + Express] +**Database:** [e.g., PostgreSQL + Redis] +**Frontend:** [e.g., React + Tailwind] +**DevOps:** [e.g., Docker + Kubernetes] + +--- + +## Module Loading Rules + +When briefing agents: +- Only load the CURRENT phase's module +- Don't load future phases +- Include CONSTRAINTS.md in every brief +- Include project README for context + +Example: When designing, DON'T load implementation/ or testing/ + +--- + +## Checkpoint & Recovery + +Each phase saves checkpoint: +```json +{ + "phase": "design", + "status": "completed|in_progress|blocked|error", + "timestamp": "2026-04-25T23:00:00Z", + "agent": "claude-code", + "model": "opus", + "result_file": "modules/design/ARCHITECTURE.md", + "next_phase": "implementation" +} +``` + +If phase fails, save error and replan. + +--- + +## Customization + +Change this template to fit YOUR project: +- Add/remove phases +- Change agent assignments +- Define your own constraints +- Add project-specific workflows + +Just keep the structure PSR-4 compatible.