This repository was archived by the owner on Apr 6, 2026. It is now read-only.
Generate Note Embeddings #45
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Generate Note Embeddings

# Configuration this workflow expects
#
# Required GitHub Secrets:
#   GCP_PROJECT_ID  Google Cloud Project ID
#   GCP_SA_KEY      Service Account JSON key with Storage Object Admin role
#   GCP_REGION      GCP region (default: us-central1)
#
# Required GitHub Variables
# (Settings -> Secrets and variables -> Actions -> Variables):
#   RELAY_URL       WebSocket URL of the Nostr relay
#                   (e.g., wss://nostr-relay-617806532906.us-central1.run.app)
#                   REQUIRED: must be set before the first run; the workflow
#                   fails without this variable.
# Triggers: a nightly schedule plus a manual run from the Actions tab.
on:
  schedule:
    # Run nightly at 3 AM UTC
    - cron: '0 3 * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): as far as this file shows, no step reads
      # inputs.full_rebuild yet, so toggling it currently has no effect.
      full_rebuild:
        description: 'Force full index rebuild'
        required: false
        # Real boolean to match `type: boolean` (was the string 'false').
        default: false
        type: boolean
# Workflow-wide settings, exposed to every step as environment variables.
env:
  PYTHON_VERSION: '3.11'
  EMBEDDING_MODEL: 'sentence-transformers/all-MiniLM-L6-v2'
  # Only reported in the run summary; keep in sync with EMBEDDING_MODEL.
  # Quoted because environment values are always strings.
  EMBEDDING_DIM: '384'
  # GCS bucket names may not contain uppercase letters, so the previous
  # value 'Nostr-BBS-vectors' could never resolve to a bucket.
  # NOTE(review): confirm this matches the bucket that actually exists.
  GCS_BUCKET_NAME: 'nostr-bbs-vectors'
jobs:
  # Single job: fetch notes from the relay, embed them, build the HNSW index,
  # update the manifest, and upload the artifacts to Google Cloud Storage.
  generate-embeddings:
    name: Generate and Upload Embeddings
    runs-on: ubuntu-latest
    timeout-minutes: 30
    # Least privilege: the job only reads the repository for checkout.
    # GCP access comes from the service-account key, not from GITHUB_TOKEN.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
          # Key the pip cache on the file that is installed below; the
          # default lookup only matches requirements.txt / pyproject.toml.
          cache-dependency-path: scripts/embeddings/requirements-gcp.txt

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r scripts/embeddings/requirements-gcp.txt

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Download previous manifest
        env:
          GOOGLE_CLOUD_PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          # Best effort: if the download script fails for any reason, fall
          # back to an empty manifest, but surface the failure as a warning
          # instead of hiding it.
          # NOTE(review): presumably the script writes manifest.json into the
          # working directory on success; confirm against the script.
          if ! python scripts/embeddings/download_manifest.py; then
            echo "::warning::Could not download previous manifest; starting from an empty one"
            echo '{"version": 0, "last_event_id": null}' > manifest.json
          fi
          cat manifest.json

      - name: Validate RELAY_URL
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the value is never parsed as shell code.
          RELAY_URL: ${{ vars.RELAY_URL }}
        run: |
          if [ -z "$RELAY_URL" ]; then
            echo "::error::RELAY_URL repository variable is not set. Configure at Settings -> Secrets and variables -> Actions -> Variables"
            exit 1
          fi

      - name: Fetch notes from relay
        env:
          RELAY_URL: ${{ vars.RELAY_URL }}
        run: |
          # `// empty` turns a null last_event_id into an empty argument.
          python scripts/embeddings/fetch_notes.py \
            --relay "$RELAY_URL" \
            --since-event "$(jq -r '.last_event_id // empty' manifest.json)" \
            --output notes.json
          echo "Fetched $(jq length notes.json) notes"

      - name: Generate embeddings
        run: |
          python scripts/embeddings/generate_embeddings.py \
            --input notes.json \
            --model "$EMBEDDING_MODEL" \
            --output embeddings.npz \
            --quantize int8
          echo "Generated embeddings for $(python -c 'import numpy as np; d=np.load("embeddings.npz"); print(len(d["ids"]))')"

      - name: Build HNSW index
        run: |
          # The previous form put `2>/dev/null || true` in the middle of the
          # line continuation, which ended the python command early: the
          # --output/--m/--ef-construction flags were handed to `true`, and
          # any build failure was silently ignored.
          index_args=(
            --embeddings embeddings.npz
            --output index.bin
            --m 16
            --ef-construction 200
          )
          # Only reference an existing index when one is actually present.
          # NOTE(review): nothing visible in this workflow downloads
          # index.bin; confirm whether an earlier script is meant to fetch it.
          if [ -f index.bin ]; then
            index_args+=(--existing-index index.bin)
          fi
          python scripts/embeddings/build_index.py "${index_args[@]}"

      - name: Update manifest
        run: |
          python scripts/embeddings/update_manifest.py \
            --notes notes.json \
            --embeddings embeddings.npz \
            --output manifest.json

      - name: Upload to Google Cloud Storage
        env:
          GOOGLE_CLOUD_PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          # Artifacts are uploaded under a prefix derived from the manifest
          # version (e.g. "v3").
          python scripts/embeddings/upload_to_gcs.py \
            --bucket "$GCS_BUCKET_NAME" \
            --files index.bin embeddings.npz manifest.json \
            --prefix "v$(jq -r '.version' manifest.json)"

      - name: Summary
        run: |
          # One grouped, quoted redirect instead of one append per line.
          # NOTE(review): the public URL points at latest/ while the upload
          # step uses a versioned prefix; confirm the upload script also
          # publishes latest/.
          {
            echo "## Embedding Generation Summary"
            echo ""
            echo "- **Model**: $EMBEDDING_MODEL"
            echo "- **Dimensions**: $EMBEDDING_DIM"
            echo "- **Notes processed**: $(jq length notes.json)"
            echo "- **Index version**: $(jq -r '.version' manifest.json)"
            echo "- **Total vectors**: $(jq -r '.total_vectors' manifest.json)"
            echo "- **Index size**: $(du -h index.bin | cut -f1)"
            echo ""
            echo "Files uploaded to GCS bucket: gs://$GCS_BUCKET_NAME"
            echo "Public URL: https://storage.googleapis.com/$GCS_BUCKET_NAME/latest/manifest.json"
          } >> "$GITHUB_STEP_SUMMARY"