llama.cpp/.github/workflows/server-self-hosted.yml at master · CISC/llama.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
name: Server (self-hosted)

# Triggers: manual dispatch, or a push to master that touches one of the paths below.
on:
  # Allows manual triggering.
  workflow_dispatch:
    inputs:
      # Optional; when set it is the first choice for the checkout `ref` in the Clone step.
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      # NOTE(review): no step in the jobs visible in this file reads this input;
      # the Tests step always runs with -m "not slow" - confirm whether a slow-test step is missing.
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths:
      - '.github/workflows/server-self-hosted.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      # NOTE(review): `**.*` only matches paths with a dot after tools/server/;
      # `tools/server/**` may have been intended - confirm before changing.
      - 'tools/server/**.*'

# Workflow-level environment, visible to every step of every job.
# The LLAMA_LOG_* names suggest llama.cpp logging controls (colors, prefix,
# timestamps, verbosity) - values are passed to the processes as strings.
env:
  LLAMA_LOG_COLORS: '1'
  LLAMA_LOG_PREFIX: '1'
  LLAMA_LOG_TIMESTAMPS: '1'
  LLAMA_LOG_VERBOSITY: '10'

# Concurrency group key: workflow name + ref + (head_ref if set, otherwise run_id).
# NOTE(review): github.head_ref is only populated for pull_request events and this
# workflow declares none, so the run_id fallback presumably makes every group unique
# and cancel-in-progress never cancels an earlier run - confirm that is intended.
concurrency:
  cancel-in-progress: true
  group: '${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}'

jobs:
  # Runs on the self-hosted macOS/ARM64 runner labelled `llama-server`,
  # once per matrix combination defined below.
  server-metal:
    name: 'server-metal (${{ matrix.wf_name }})'
    runs-on:
      - self-hosted
      - llama-server
      - macOS
      - ARM64

    strategy:
      # Let the remaining matrix jobs finish even if one of them fails.
      fail-fast: false
      matrix:
        # Base combination: Release / "GPUx1", with no extra_args defined.
        build_type:
          - Release
        wf_name:
          - 'GPUx1'
        # Every entry below sets a wf_name different from the base value, so under
        # GitHub's `include` rules each one becomes an additional job instead of
        # extending the base one. extra_args holds space-separated VAR=VALUE pairs
        # that the Tests step exports before running pytest.
        include:
          - wf_name: 'GPUx1, backend-sampling'
            build_type: Release
            extra_args: 'LLAMA_ARG_BACKEND_SAMPLING=1'
          - wf_name: 'GPUx2'
            build_type: Release
            extra_args: 'GGML_METAL_DEVICES=2'
          - wf_name: 'GPUx2, backend-sampling'
            build_type: Release
            extra_args: 'GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1'

    steps:
      # Check out with full history (fetch-depth: 0). The ref is the first non-empty
      # value of: manual `sha` input, PR head SHA, event SHA, head ref, ref name.
      - id: checkout
        name: Clone
        uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
          fetch-depth: 0

      # Node 24 with the npm cache keyed on the UI lockfile.
      # Presumably required by the build for the UI under tools/ui - TODO confirm.
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: '24'
          cache: 'npm'
          cache-dependency-path: 'tools/ui/package-lock.json'

      # Configure, then build only the llama-server target, using one build job
      # per logical CPU as reported by macOS `sysctl`.
      - id: cmake_build
        name: Build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server

      # Runs the server integration tests (everything not marked "slow") inside a
      # fresh virtualenv, with the matrix entry's extra_args exported into the environment.
      - name: Tests
        id: server_integration_tests
        # NOTE(review): matrix.disabled_on_pr is not defined by the matrix in this file and
        # no pull_request trigger is declared, so this condition looks always-true - confirm.
        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
        env:
          # Passed through the environment rather than interpolated into the script text.
          # Empty for the base "GPUx1" matrix entry, which defines no extra_args.
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          # A bare `export` would print every exported variable into the job log,
          # so only export when the matrix entry actually supplies VAR=VALUE pairs.
          # The expansion is left unquoted on purpose: the pairs are space-separated.
          if [ -n "${MATRIX_EXTRA_ARGS}" ]; then
            export ${MATRIX_EXTRA_ARGS}
          fi
          pytest -v -x -m "not slow"

  # TODO: provision CUDA runner
  #  server-cuda:
  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
  #
  #    name: server-cuda (${{ matrix.wf_name }})
  #    strategy:
  #      matrix:
  #        build_type: [Release]
  #        wf_name: ["GPUx1"]
  #        include:
  #          - build_type: Release
  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
  #            wf_name:    "GPUx1, backend-sampling"
  #      fail-fast: false
  #
  #    steps:
  #      - name: Clone
  #        id: checkout
  #        uses: actions/checkout@v6
  #        with:
  #          fetch-depth: 0
  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
  #
  #      - name: Build
  #        id: cmake_build
  #        run: |
  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
  #          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
  #
  #      - name: Tests
  #        id: server_integration_tests
  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
  #        run: |
  #          cd tools/server/tests
  #          python3 -m venv venv
  #          source venv/bin/activate
  #          pip install -r requirements.txt
  #          export ${{ matrix.extra_args }}
  #          pytest -v -x -m "not slow"