Skip to content
This repository was archived by the owner on Apr 6, 2026. It is now read-only.

Generate Note Embeddings #45

Generate Note Embeddings

Generate Note Embeddings #45

name: Generate Note Embeddings
# Fetches notes from a Nostr relay, generates embeddings, builds an HNSW index
# and uploads the results to Google Cloud Storage.
#
# Required GitHub Secrets:
# - GCP_PROJECT_ID: Google Cloud Project ID
# - GCP_SA_KEY: Service Account JSON key with Storage Object Admin role
# - GCP_REGION: GCP region (default: us-central1)
#   NOTE(review): no step in this workflow reads GCP_REGION; confirm it is still needed.
#
# Required GitHub Variables (Settings -> Secrets and variables -> Actions -> Variables):
# - RELAY_URL: WebSocket URL of the Nostr relay (e.g., wss://nostr-relay-617806532906.us-central1.run.app)
#   REQUIRED: Must be set before first run. Workflow will fail without this variable.
on:
  schedule:
    # Run nightly at 3 AM UTC
    - cron: '0 3 * * *'
  # Manual runs from the Actions tab.
  workflow_dispatch:
    inputs:
      full_rebuild:
        # NOTE(review): no step in this workflow references inputs.full_rebuild;
        # confirm where the flag is supposed to take effect.
        description: 'Force full index rebuild'
        required: false
        # A boolean input takes a real YAML boolean as its default, not the string 'false'.
        default: false
        type: boolean
# Workflow-wide environment variables, visible to every step.
env:
  PYTHON_VERSION: '3.11'
  EMBEDDING_MODEL: 'sentence-transformers/all-MiniLM-L6-v2'
  # Quoted so YAML keeps it a string; environment variables are strings at runtime anyway.
  EMBEDDING_DIM: '384'
  # Cloud Storage bucket names may not contain uppercase letters, so the
  # previous value 'Nostr-BBS-vectors' could never name a real bucket.
  # NOTE(review): confirm this lowercase name matches the provisioned bucket.
  GCS_BUCKET_NAME: 'nostr-bbs-vectors'
jobs:
  generate-embeddings:
    name: Generate and Upload Embeddings
    runs-on: ubuntu-latest
    # Abort runs that hang (e.g. on the relay or a model download) after 30 minutes.
    timeout-minutes: 30
    # Least privilege for GITHUB_TOKEN: the job only needs to read the repository.
    # Google Cloud access uses the service-account key, not this token.
    permissions:
      contents: read
    steps:
# Check out the repository so the scripts under scripts/embeddings/ are available.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    # No later step in this workflow runs git, so do not leave the token in .git/config.
    persist-credentials: false
# Install the interpreter version pinned in the workflow-level env block.
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    python-version: ${{ env.PYTHON_VERSION }}
    cache: 'pip'
    # The pip cache key is derived from a dependency file. The default lookup
    # targets requirements.txt / pyproject.toml, which does not match the file
    # the install step uses, so name that file explicitly.
    cache-dependency-path: scripts/embeddings/requirements-gcp.txt
# Upgrade pip, then install the Python packages the embedding scripts need.
- name: Install dependencies
  run: |
    requirements_file=scripts/embeddings/requirements-gcp.txt
    pip install --upgrade pip
    pip install -r "$requirements_file"
# Authenticate with the service-account JSON key stored in the GCP_SA_KEY secret.
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    credentials_json: '${{ secrets.GCP_SA_KEY }}'
# Install the gcloud CLI on the runner.
- name: Set up Cloud SDK
  uses: google-github-actions/setup-gcloud@v2
# Fetch the manifest from the previous run, falling back to an empty one.
# download_manifest.py is presumably what writes manifest.json (this step prints
# the file afterwards) - confirm against the script.
- name: Download previous manifest
  env:
    GOOGLE_CLOUD_PROJECT: ${{ secrets.GCP_PROJECT_ID }}
  run: |
    if ! python scripts/embeddings/download_manifest.py; then
      # Still best-effort (e.g. first run, nothing uploaded yet), but make the
      # fallback visible: an auth or network failure lands here as well.
      echo "::warning::Could not download the previous manifest; starting from an empty manifest"
      echo '{"version": 0, "last_event_id": null}' > manifest.json
    fi
    cat manifest.json
# Fail early with an actionable message when the RELAY_URL repository variable is unset.
- name: Validate RELAY_URL
  env:
    # Passed through the environment instead of being interpolated into the
    # script text, so the value is never parsed as shell code.
    RELAY_URL: ${{ vars.RELAY_URL }}
  run: |
    if [ -z "$RELAY_URL" ]; then
      echo "::error::RELAY_URL repository variable is not set. Configure at Settings -> Secrets and variables -> Actions -> Variables"
      exit 1
    fi
# Pull notes from the relay into notes.json.
- name: Fetch notes from relay
  env:
    RELAY_URL: ${{ vars.RELAY_URL }}
  run: |
    # --since-event receives the manifest's last_event_id, or an empty string
    # when that field is null or missing (jq's `// empty`).
    python scripts/embeddings/fetch_notes.py \
      --relay "$RELAY_URL" \
      --since-event "$(jq -r '.last_event_id // empty' manifest.json)" \
      --output notes.json
    # Log the length of the top-level JSON value in notes.json.
    printf 'Fetched %s notes\n' "$(jq length notes.json)"
# Embed the fetched notes with $EMBEDDING_MODEL and write embeddings.npz.
- name: Generate embeddings
  run: |
    python scripts/embeddings/generate_embeddings.py \
      --input notes.json \
      --model "$EMBEDDING_MODEL" \
      --output embeddings.npz \
      --quantize int8
    # Log how many entries the "ids" array of the output archive holds.
    printf 'Generated embeddings for %s\n' "$(python -c 'import numpy as np; d=np.load("embeddings.npz"); print(len(d["ids"]))')"
# Build index.bin from embeddings.npz, reusing a previous index.bin when one is present.
- name: Build HNSW index
  run: |
    # Pass --existing-index only if the file is actually in the workspace.
    # The earlier inline `2>/dev/null || true` ended the python command at that
    # point, so --output/--m/--ef-construction were handed to `true` instead of
    # the script, and any build failure was hidden.
    existing_index_args=()
    if [ -f index.bin ]; then
      existing_index_args=(--existing-index index.bin)
    fi
    python scripts/embeddings/build_index.py \
      --embeddings embeddings.npz \
      "${existing_index_args[@]}" \
      --output index.bin \
      --m 16 \
      --ef-construction 200
# Rewrite manifest.json from this run's notes and embeddings.
# The folded scalar joins the lines below into a single command line.
- name: Update manifest
  run: >-
    python scripts/embeddings/update_manifest.py
    --notes notes.json
    --embeddings embeddings.npz
    --output manifest.json
# Upload the index, embeddings and manifest to the bucket under a "v<version>"
# prefix, where <version> is read from the updated manifest.
# The folded scalar joins the lines below into a single command line.
- name: Upload to Google Cloud Storage
  env:
    GOOGLE_CLOUD_PROJECT: ${{ secrets.GCP_PROJECT_ID }}
  run: >-
    python scripts/embeddings/upload_to_gcs.py
    --bucket "$GCS_BUCKET_NAME"
    --files index.bin embeddings.npz manifest.json
    --prefix "v$(jq -r '.version' manifest.json)"
# Append a Markdown report of this run to the job summary page.
- name: Summary
  run: |
    # One redirect for the whole group instead of one per line.
    # NOTE(review): the upload step passes a "v<version>" prefix; confirm that
    # something also publishes under latest/, as the Public URL line implies.
    {
      echo "## Embedding Generation Summary"
      echo ""
      echo "- **Model**: $EMBEDDING_MODEL"
      echo "- **Dimensions**: $EMBEDDING_DIM"
      echo "- **Notes processed**: $(jq length notes.json)"
      echo "- **Index version**: $(jq -r '.version' manifest.json)"
      echo "- **Total vectors**: $(jq -r '.total_vectors' manifest.json)"
      echo "- **Index size**: $(du -h index.bin | cut -f1)"
      echo ""
      echo "Files uploaded to GCS bucket: gs://$GCS_BUCKET_NAME"
      echo "Public URL: https://storage.googleapis.com/$GCS_BUCKET_NAME/latest/manifest.json"
    } >> "$GITHUB_STEP_SUMMARY"