Skip to content

Agent Evals (Nightly) #48

Agent Evals (Nightly)

Agent Evals (Nightly) #48

Workflow file for this run

# Agent eval workflow: runs on a nightly schedule and on manual dispatch,
# where the provider list and the eval suite path can be overridden.
name: Agent Evals (Nightly)

on:
  schedule:
    # Run at 3:00 AM UTC every day
    - cron: '0 3 * * *'
  workflow_dispatch:
    inputs:
      providers:
        description: 'Providers to run (comma-separated: mock,gemini,ollama)'
        required: false
        default: 'mock,gemini'
      suite:
        description: 'Eval suite path'
        required: false
        default: 'scripts/evals/agent-smoke.jsonl'

# Least privilege: the jobs only check out the repository; artifact upload
# and the step summary do not need additional GITHUB_TOKEN scopes.
permissions:
  contents: read

# Tool versions shared by every job; quoted so they stay strings.
env:
  NODE_VERSION: '20'
  PNPM_VERSION: '10'

jobs:
# Runs the eval suite against the mock provider and publishes the JSON
# report as an artifact. This job has no `if:` guard, so it runs on every
# trigger regardless of the `providers` input.
eval-mock:
  name: Eval (Mock)
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup pnpm
      uses: pnpm/action-setup@v4
      with:
        version: ${{ env.PNPM_VERSION }}

    - name: Setup Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        cache: 'pnpm'

    - name: Install dependencies
      run: pnpm install --frozen-lockfile

    - name: Run mock evals
      env:
        # The dispatch input is handed to the shell through an environment
        # variable instead of being expanded into the script text, so its
        # content cannot be interpreted as shell syntax. Scheduled runs have
        # no inputs and fall back to the default suite.
        EVAL_SUITE: ${{ github.event.inputs.suite || 'scripts/evals/agent-smoke.jsonl' }}
      run: |
        pnpm agent:eval \
          --provider mock \
          --suite "$EVAL_SUITE" \
          --vault src/data/demo-files \
          --json eval-results-mock.json

    - name: Upload mock results
      # Still upload when an earlier step failed so the report is not lost
      # (presumably the eval command exits non-zero on failing cases; confirm).
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: eval-results-mock
        path: eval-results-mock.json
        retention-days: 30
# Runs the eval suite against Gemini and publishes the JSON report as an
# artifact. Runs on every scheduled trigger, and on manual dispatch only
# when the `providers` input contains "gemini".
eval-gemini:
  name: Eval (Gemini)
  runs-on: ubuntu-latest
  if: ${{ github.event_name == 'schedule' || contains(github.event.inputs.providers, 'gemini') }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup pnpm
      uses: pnpm/action-setup@v4
      with:
        version: ${{ env.PNPM_VERSION }}

    - name: Setup Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        cache: 'pnpm'

    - name: Install dependencies
      run: pnpm install --frozen-lockfile

    - name: Run Gemini evals
      env:
        GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
        # The dispatch input is handed to the shell through an environment
        # variable instead of being expanded into the script text, so its
        # content cannot be interpreted as shell syntax. Scheduled runs have
        # no inputs and fall back to the default suite.
        EVAL_SUITE: ${{ github.event.inputs.suite || 'scripts/evals/agent-smoke.jsonl' }}
      run: |
        pnpm agent:eval \
          --provider gemini \
          --model gemini-2.0-flash \
          --suite "$EVAL_SUITE" \
          --vault src/data/demo-files \
          --json eval-results-gemini.json \
          --debug

    - name: Upload Gemini results
      # Still upload when an earlier step failed so the report is not lost
      # (presumably the eval command exits non-zero on failing cases; confirm).
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: eval-results-gemini
        path: eval-results-gemini.json
        retention-days: 30
# Runs the eval suite against a locally served Ollama model and publishes
# the JSON report as an artifact. Only runs when the `providers` dispatch
# input contains "ollama"; scheduled runs have no inputs and skip this job.
eval-ollama:
  name: Eval (Ollama)
  runs-on: ubuntu-latest
  if: ${{ contains(github.event.inputs.providers || '', 'ollama') }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup pnpm
      uses: pnpm/action-setup@v4
      with:
        version: ${{ env.PNPM_VERSION }}

    - name: Setup Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        cache: 'pnpm'

    - name: Install dependencies
      run: pnpm install --frozen-lockfile

    - name: Install Ollama
      run: |
        # NOTE(review): this pipes a remote install script straight into sh.
        curl -fsSL https://ollama.com/install.sh | sh
        ollama serve &
        # Poll until the server answers instead of trusting a fixed sleep,
        # so a slow start fails here rather than in the pull step.
        for attempt in $(seq 1 30); do
          if ollama list > /dev/null 2>&1; then
            exit 0
          fi
          sleep 1
        done
        echo "Ollama server did not become ready within 30 seconds" >&2
        exit 1

    - name: Pull model
      run: ollama pull llama3-groq-tool-use

    - name: Run Ollama evals
      env:
        # The dispatch input is handed to the shell through an environment
        # variable instead of being expanded into the script text, so its
        # content cannot be interpreted as shell syntax.
        EVAL_SUITE: ${{ github.event.inputs.suite || 'scripts/evals/agent-smoke.jsonl' }}
      run: |
        pnpm agent:eval \
          --provider ollama \
          --model llama3-groq-tool-use \
          --suite "$EVAL_SUITE" \
          --vault src/data/demo-files \
          --json eval-results-ollama.json \
          --debug

    - name: Upload Ollama results
      # Still upload when an earlier step failed so the report is not lost
      # (presumably the eval command exits non-zero on failing cases; confirm).
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: eval-results-ollama
        path: eval-results-ollama.json
        retention-days: 30
# Collects every eval-results-* artifact and renders a pass/fail table into
# the run's step summary.
summary:
  name: Summary
  runs-on: ubuntu-latest
  # Wait for all three eval jobs so no provider's artifact is missed;
  # `always()` keeps this job running when some of them failed or were skipped.
  needs: [eval-mock, eval-gemini, eval-ollama]
  if: always()
  steps:
    - name: Download all artifacts
      uses: actions/download-artifact@v4
      with:
        path: results
        pattern: eval-results-*
        # Keep one sub-directory per artifact; the script below derives the
        # provider name from that directory name.
        merge-multiple: false

    - name: Generate summary
      run: |
        {
          echo "## Agent Eval Results"
          echo ""
          echo "| Provider | Passed | Failed | Total |"
          echo "|----------|--------|--------|-------|"
          for dir in results/eval-results-*; do
            # Skips the literal, unexpanded glob when nothing was downloaded.
            [ -d "$dir" ] || continue
            artifact="${dir##*/}"
            provider="${artifact#eval-results-}"
            file="$dir/eval-results-${provider}.json"
            [ -f "$file" ] || continue
            passed=$(jq '[.results[] | select(.passed == true)] | length' "$file")
            failed=$(jq '[.results[] | select(.passed == false)] | length' "$file")
            total=$(jq '.results | length' "$file")
            echo "| $provider | $passed | $failed | $total |"
          done
          echo ""
          echo "Run at: $(date -u)"
        } >> "$GITHUB_STEP_SUMMARY"