Skip to content
Draft
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b0c4ad6
[ci] move CI container images to GHCR
jameslamb Dec 23, 2025
30b8679
push
jameslamb Dec 23, 2025
a7fcbf8
fix env
jameslamb Dec 23, 2025
c6ccf66
empty commit to re-trigger CI
jameslamb Dec 23, 2025
6bfed04
lowercase repo
jameslamb Dec 23, 2025
5ba87cb
cannot use github.workspace
jameslamb Dec 23, 2025
8df5568
fix OS, reduce duplication
jameslamb Dec 23, 2025
5f99901
fix job names
jameslamb Dec 23, 2025
dbb7b96
work around newer CMake + old pocl
jameslamb Dec 23, 2025
22fca6d
try forcing C++11
jameslamb Dec 23, 2025
95f1462
upgrade to PoCL 7.1
jameslamb Dec 24, 2025
6671e4a
try building the same way on x86_64
jameslamb Dec 24, 2025
946bb7d
fix CPU flag
jameslamb Dec 24, 2025
45735f0
try images in CI
jameslamb Dec 24, 2025
c0ed153
use -dev label
jameslamb Dec 24, 2025
e56817f
fix image URIs
jameslamb Dec 24, 2025
5fef0f5
try new OpenCL headers
jameslamb Dec 24, 2025
d1151d5
temporarily skip check-wheel-contents (some OpenCL headers are gettin…
jameslamb Dec 24, 2025
dc3a465
align OpenCL versions
jameslamb Dec 28, 2025
193fd23
Merge branch 'master' of github.com:microsoft/LightGBM into ci/image-…
jameslamb Dec 29, 2025
f4f655a
try getting more information from 'clinfo'
jameslamb Dec 29, 2025
b09a916
try using LLVM instead
jameslamb Jan 1, 2026
7121551
try installing Intel OpenCL support
jameslamb Jan 1, 2026
9a58b2e
ensure the PoCL ICD loader gets installed
jameslamb Jan 1, 2026
9c99f6e
check LLC_HOST_CPU values, try skipping intel driver installation
jameslamb Jan 1, 2026
83732ef
target cortex-a53
jameslamb Jan 1, 2026
a4b8ace
try compiling support for more Arm CPUs
jameslamb Jan 2, 2026
4b6b1d8
comment out some CI, start adding docs
jameslamb Jan 2, 2026
08ef096
comment out even more CI
jameslamb Jan 2, 2026
d62591c
get more debugging information, install 'all' component
jameslamb Jan 2, 2026
aff8267
try hand-writing the ICD file
jameslamb Jan 2, 2026
0540af4
fix test code
jameslamb Jan 2, 2026
8f1b053
more fixes
jameslamb Jan 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 0 additions & 44 deletions .appveyor.yml

This file was deleted.

1 change: 1 addition & 0 deletions .ci/ci-images/manylinux_2_28_aarch64/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Exclude everything from the build context: the Dockerfile in this directory
# has no COPY/ADD instructions, so nothing from the repository needs to be sent to the builder.
*
73 changes: 73 additions & 0 deletions .ci/ci-images/manylinux_2_28_aarch64/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# syntax=docker/dockerfile:1

# Tag of the manylinux base image. Defaults to 'latest' (what an untagged FROM resolves to),
# but can be overridden with '--build-arg MANYLINUX_TAG=<tag>' to get a reproducible build.
ARG MANYLINUX_TAG=latest
FROM quay.io/pypa/manylinux_2_28_aarch64:${MANYLINUX_TAG}

# use 'bash' for RUN steps
#
#   -e          : stop at the first failing command (needed because heredoc lines are not '&&'-chained)
#   -u          : treat references to unset variables as errors
#   -o pipefail : a pipeline fails if any command in it fails
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# install system packages needed to build PoCL and to run LightGBM's CI scripts
RUN <<EOF
# pick up updates for packages already present in the base image
yum update -y

# installed in its own transaction so that the repository it enables
# is available to the 'yum install' below
yum install --nodocs -y \
    epel-release

# * clang / llvm / hwloc / ocl-icd: build dependencies for PoCL
# * sudo: .ci/setup.sh invokes 'sudo yum ...' inside this image
yum install --nodocs -y \
    clang-devel \
    gcc-c++ \
    hwloc-devel \
    llvm-devel \
    llvm-static \
    ocl-icd-devel \
    sudo

yum module install --nodocs -y \
    llvm-toolset

# clean up in the same layer, so package caches do not end up in the image.
# On EL8-based images 'yum' is provided by DNF, whose cache lives in /var/cache/dnf,
# so remove that directory as well as the legacy /var/cache/yum.
yum clean all

rm -rf \
    /var/cache/dnf \
    /var/cache/yum
EOF

# install CMake, then build and install PoCL from source
RUN <<EOF
# install a newer CMake than what the package manager has
#
# --fail: exit non-zero on an HTTP error instead of saving the error page as 'cmake.sh'
curl -fsSL https://cmake.org/files/v3.23/cmake-3.23.1-linux-aarch64.sh -o cmake.sh
chmod +x cmake.sh
./cmake.sh --prefix=/usr/local --exclude-subdir
rm -f ./cmake.sh

# build PoCL
#
# NOTE: If this is updated, check if CL_TARGET_OPENCL_VERSION in cmake/IntegratedOpenCL.cmake
#       needs to be updated (see comments there for links).
git clone \
    --depth 1 \
    --branch v7.1 \
    https://github.com/pocl/pocl.git

# explanations for some flags:
#
#   * -DCMAKE_{C,CXX}_COMPILER: DEVTOOLSET_ROOTPATH is where manylinux puts the gcc toolset
#   * -DLLC_HOST_CPU="generic": passed to clang's -march/-mcpu flag.
#
cmake \
    -B pocl/build \
    -S pocl \
    -DCMAKE_BUILD_TYPE=release \
    -DCMAKE_C_COMPILER="${DEVTOOLSET_ROOTPATH}/usr/bin/gcc" \
    -DCMAKE_CXX_COMPILER="${DEVTOOLSET_ROOTPATH}/usr/bin/g++" \
    -DENABLE_DOXYGEN=OFF \
    -DENABLE_EXAMPLES=OFF \
    -DENABLE_HOST_CPU_DEVICES=ON \
    -DENABLE_HWLOC=ON \
    -DENABLE_POCLCC=ON \
    -DENABLE_SPIRV=ON \
    -DENABLE_TESTS=OFF \
    -DENABLE_VALGRIND=OFF \
    -DINSTALL_OPENCL_HEADERS=OFF \
    -DLLC_HOST_CPU=generic \
    -DPOCL_DEBUG_MESSAGES=OFF \
    -DPOCL_INSTALL_ICD_VENDORDIR=/etc/OpenCL/vendors

cmake --build pocl/build -j4

cmake --install pocl/build

# the sources and build tree are not needed once PoCL is installed;
# remove them in this same layer so they do not bloat the image
rm -rf ./pocl
EOF
1 change: 1 addition & 0 deletions .ci/ci-images/manylinux_2_28_x86_64/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Exclude everything from the build context: the Dockerfile in this directory
# has no COPY/ADD instructions, so nothing from the repository needs to be sent to the builder.
*
102 changes: 102 additions & 0 deletions .ci/ci-images/manylinux_2_28_x86_64/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# syntax=docker/dockerfile:1

# Tag of the manylinux base image. Defaults to 'latest' (what an untagged FROM resolves to),
# but can be overridden with '--build-arg MANYLINUX_TAG=<tag>' to get a reproducible build.
ARG MANYLINUX_TAG=latest
FROM quay.io/pypa/manylinux_2_28_x86_64:${MANYLINUX_TAG}

# ensure that libraries like libc++ built in this image can be found by the linker
# (currently disabled)
#ENV LD_LIBRARY_PATH="/usr/local/lib64:${LD_LIBRARY_PATH}:/usr/local/lib"

# use 'bash' for RUN steps
#
#   -e          : stop at the first failing command (needed because heredoc lines are not '&&'-chained)
#   -u          : treat references to unset variables as errors
#   -o pipefail : a pipeline fails if any command in it fails
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# install system packages needed to build PoCL and to run LightGBM's CI scripts
RUN <<EOF
# pick up updates for packages already present in the base image
yum update -y

# installed in its own transaction so that the repository it enables
# is available to the 'yum install' below
yum install --nodocs -y \
    epel-release

# * clang / llvm / hwloc / ocl-icd: build dependencies for PoCL
# * sudo: .ci/setup.sh invokes 'sudo yum ...' inside this image
yum install --nodocs -y \
    clang-devel \
    gcc-c++ \
    hwloc-devel \
    llvm-devel \
    llvm-static \
    ocl-icd-devel \
    sudo

yum module install --nodocs -y \
    llvm-toolset

# clean up in the same layer, so package caches do not end up in the image.
# On EL8-based images 'yum' is provided by DNF, whose cache lives in /var/cache/dnf,
# so remove that directory as well as the legacy /var/cache/yum.
yum clean all

rm -rf \
    /var/cache/dnf \
    /var/cache/yum
EOF

# install CMake, then build and install PoCL from source
RUN <<EOF
# install a newer CMake than what the package manager has
#
# --fail: exit non-zero on an HTTP error instead of saving the error page as 'cmake.sh'
curl -fsSL "https://cmake.org/files/v3.23/cmake-3.23.1-linux-$(arch).sh" -o cmake.sh
chmod +x cmake.sh
./cmake.sh --prefix=/usr/local --exclude-subdir
rm -f ./cmake.sh

# build PoCL
#
# NOTE: If this is updated, check if CL_TARGET_OPENCL_VERSION in cmake/IntegratedOpenCL.cmake
#       needs to be updated (see comments there for links).
git clone \
    --depth 1 \
    --branch v7.1 \
    https://github.com/pocl/pocl.git

# explanations for some flags:
#
#   * -DCMAKE_{C,CXX}_COMPILER: DEVTOOLSET_ROOTPATH is where manylinux puts the gcc toolset
#   * -DLLC_HOST_CPU="x86-64": passed to clang's -march/-mcpu flag. see https://github.com/chromebrew/chromebrew/pull/9176#issuecomment-1891751465
#
cmake \
    -B pocl/build \
    -S pocl \
    -DCMAKE_BUILD_TYPE=release \
    -DCMAKE_C_COMPILER="${DEVTOOLSET_ROOTPATH}/usr/bin/gcc" \
    -DCMAKE_CXX_COMPILER="${DEVTOOLSET_ROOTPATH}/usr/bin/g++" \
    -DENABLE_DOXYGEN=OFF \
    -DENABLE_EXAMPLES=OFF \
    -DENABLE_HOST_CPU_DEVICES=ON \
    -DENABLE_HWLOC=ON \
    -DENABLE_POCLCC=ON \
    -DENABLE_SPIRV=ON \
    -DENABLE_TESTS=OFF \
    -DENABLE_VALGRIND=OFF \
    -DINSTALL_OPENCL_HEADERS=OFF \
    -DLLC_HOST_CPU="x86-64" \
    -DPOCL_DEBUG_MESSAGES=OFF \
    -DPOCL_INSTALL_ICD_VENDORDIR=/etc/OpenCL/vendors

cmake --build pocl/build -j4

cmake --install pocl/build

# the sources and build tree are not needed once PoCL is installed;
# remove them in this same layer so they do not bloat the image
rm -rf ./pocl
EOF

# Install Java (OpenJDK 8 development kit), then drop the package manager caches.
# The shell configured by SHELL runs with '-e', so a failed install aborts the step.
RUN <<EOF
yum install -y java-1.8.0-openjdk-devel.x86_64
yum clean all
EOF

# Location of the JDK. JAVA_HOME is defined in a separate instruction because
# a variable set by an ENV instruction is only visible to the instructions after it.
ENV JAVA_HOME_8_X64=/usr/lib/jvm/java
ENV JAVA_HOME=${JAVA_HOME_8_X64}

# Install SWIG (built from source, installed under /usr/local)
#
# curl flags:
#   --fail (-f)       : exit non-zero on an HTTP error instead of saving the error page as the tarball
#   --show-error (-S) : still print the reason for a failure even though --silent is used
#
# NOTE: '-k' (skip TLS certificate verification) is intentionally NOT used: this tarball is
#       compiled and installed into the image, so the download must come from a verified host.
RUN curl -fsSL https://sourceforge.net/projects/swig/files/swig/swig-4.0.2/swig-4.0.2.tar.gz/download -o swig.tar.gz \
    && tar -xzf swig.tar.gz \
    && cd swig-4.0.2 \
    && ./configure --prefix=/usr/local --without-pcre \
    && make \
    && make install \
    && cd .. \
    && rm -f ./swig.tar.gz \
    && rm -rf ./swig-4.0.2

# Install miniforge into /opt/miniforge
#
# --fail: exit non-zero on an HTTP error instead of saving the error page as 'miniforge.sh'
#
# NOTE(review): 'releases/latest' is not pinned, so rebuilding this image can pick up
#               a different Miniforge version — consider pinning a release.
# NOTE(review): 'chmod -R 777' makes the whole installation world-writable; presumably this is
#               so CI jobs running as a non-root user can create conda environments — confirm
#               whether narrower permissions would be enough.
RUN curl -fsSL "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-$(uname -m).sh" -o miniforge.sh \
    && chmod +x miniforge.sh \
    && ./miniforge.sh -b -p /opt/miniforge \
    && rm -f ./miniforge.sh \
    && /opt/miniforge/bin/conda clean -a -y \
    && chmod -R 777 /opt/miniforge

# root of the conda installation created above
ENV CONDA=/opt/miniforge/
5 changes: 4 additions & 1 deletion .ci/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,16 @@ else # Linux
sudo apt-get update
sudo apt-get install --no-install-recommends -y \
pocl-opencl-icd
elif [[ $(uname -m) == "x86_64" ]]; then
else # in manylinux image
sudo yum update -y
sudo yum install -y \
clinfo \
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Picking a somewhat-arbitrary place to start a thread.

Right now, the images are building successfully but Python tests with device="gpu" are all failing.

The gpu source job (where LightGBM's default device is set to "gpu") has 238 failures like this:

lightgbm.basic.LightGBMError: Check failed: (best_split_info.right_count) > (0) at /__w/LightGBM/LightGBM/src/treelearner/serial_tree_learner.cpp, line 869 .

Which looks, at a glance, like #3679

The bdist_wheel jobs (which just run a single test checking that OpenCL support was compiled in successfully) on both x86_64 and aarch64 are failing like this:

____________________________ test_cpu_and_gpu_work _____________________________

    @pytest.mark.skipif(
        os.environ.get("LIGHTGBM_TEST_DUAL_CPU_GPU", "0") != "1",
        reason="Set LIGHTGBM_TEST_DUAL_CPU_GPU=1 to test using CPU and GPU training from the same package.",
    )
    def test_cpu_and_gpu_work():
        # If compiled appropriately, the same installation will support both GPU and CPU.
        X, y = load_breast_cancer(return_X_y=True)
        data = lgb.Dataset(X, y)
    
        params_cpu = {"verbosity": -1, "num_leaves": 31, "objective": "binary", "device": "cpu"}
        cpu_bst = lgb.train(params_cpu, data, num_boost_round=10)
        cpu_score = log_loss(y, cpu_bst.predict(X))
    
        params_gpu = params_cpu.copy()
        params_gpu["device"] = "gpu"
        # Double-precision floats are only supported on x86_64 with PoCL
        params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
>       gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

tests/python_package_test/test_dual.py:32: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/root/miniforge/envs/test-env/lib/python3.13/site-packages/lightgbm/engine.py:297: in train
    booster = Booster(params=params, train_set=train_set)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/root/miniforge/envs/test-env/lib/python3.13/site-packages/lightgbm/basic.py:3615: in __init__
    _safe_call(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

ret = -1

    def _safe_call(ret: int) -> None:
        """Check the return value from C API call.
    
        Parameters
        ----------
        ret : int
            The return value from C API calls.
        """
        if ret != 0:
>           raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
E           lightgbm.basic.LightGBMError: No OpenCL device found

/root/miniforge/envs/test-env/lib/python3.13/site-packages/lightgbm/basic.py:310: LightGBMError

(build link)

Ideas I'm looking into:

I'm going to focus on the gpu source builds first, because those don't rely on anything in https://github.com/microsoft/LightGBM/blob/master/cmake/IntegratedOpenCL.cmake and so should be a more minimal way to investigate this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing that the CI job running with an NVIDIA GPU is working: https://github.com/microsoft/LightGBM/actions/runs/20581695041/job/59110391374?pr=7109

So I guess it's just that these jobs are no longer successfully targeting the host CPUs on the GitHub runners? I'll look into that.

ocl-icd-devel \
opencl-headers \
|| exit 1
fi
echo "--- clinfo: ---"
clinfo || true
fi
if [[ $TASK == "cuda" ]]; then
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
Expand Down
9 changes: 3 additions & 6 deletions .ci/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,7 @@ elif [[ $TASK == "bdist" ]]; then
cp "$(echo "dist/lightgbm-${LGB_VER}-py3-none-macosx"*.whl)" "${BUILD_ARTIFACTSTAGINGDIRECTORY}" || exit 1
fi
else
if [[ $ARCH == "x86_64" ]]; then
PLATFORM="manylinux_2_28_x86_64"
else
PLATFORM="manylinux2014_$ARCH"
fi
PLATFORM="manylinux_2_28_$ARCH"
sh ./build-python.sh bdist_wheel --integrated-opencl || exit 1
# rename wheel, to fix scikit-build-core choosing the platform 'linux_aarch64' instead of
# a manylinux tag
Expand All @@ -154,7 +150,8 @@ elif [[ $TASK == "bdist" ]]; then
mv \
./dist/tmp.whl \
"./dist/lightgbm-${LGB_VER}-py3-none-${PLATFORM}.whl" || exit 1
sh .ci/check-python-dists.sh ./dist || exit 1
# TODO(jameslamb): re-enable these checks before merging
#sh .ci/check-python-dists.sh ./dist || exit 1
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
cp "dist/lightgbm-${LGB_VER}-py3-none-${PLATFORM}.whl" "${BUILD_ARTIFACTSTAGINGDIRECTORY}" || exit 1
fi
Expand Down
63 changes: 63 additions & 0 deletions .github/workflows/ci-images.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Builds the container images defined under .ci/ci-images/ and pushes them to
# GitHub Container Registry as 'ghcr.io/<repository owner>/lightgbm:<tag>',
# then publishes a build-provenance attestation for each pushed image.
name: ci-images

on:
  # TODO(jameslamb): remove 'push' trigger before merging
  push:
  workflow_dispatch:
    inputs:
      tag-suffix:
        type: string
        default: "-dev"
        description: |
          Suffix for image tags, including a leading "-" if desired.
          Set to the empty string to overwrite the main images used by LightGBM's CI.

jobs:
  build-and-push-images:
    name: build-ci-images (${{ matrix.tag }})
    runs-on: ${{ matrix.os }}
    # 'packages: write' lets the job push to ghcr.io with the workflow token;
    # 'attestations: write' and 'id-token: write' are presumably what the
    # attestation step at the end of this job needs — confirm against that action's docs.
    permissions:
      contents: read
      packages: write
      attestations: write
      id-token: write
    strategy:
      # keep building the other image even if one of them fails
      fail-fast: false
      # NOTE: Cannot use "{{ github.repository }}" because that'd return "{org}/LightGBM" which results in
      # an error like "repository name must be lowercase".
      #
      # One matrix entry per image: each pairs a Dockerfile with the runner it is built on.
      matrix:
        include:
          - os: ubuntu-24.04-arm
            dockerfile: .ci/ci-images/manylinux_2_28_aarch64/Dockerfile
            # TODO(jameslamb): revert hard-coded tag before merging
            tag: ci-manylinux_2_28_aarch64-dev
            # tag: ci-manylinux_2_28_aarch64${{ inputs.tag-suffix }}
          - os: ubuntu-latest
            dockerfile: .ci/ci-images/manylinux_2_28_x86_64/Dockerfile
            # TODO(jameslamb): revert hard-coded tag before merging
            tag: ci-manylinux_2_28_x86_64-dev
            # tag: ci-manylinux_2_28_x86_64${{ inputs.tag-suffix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
      - name: Log in to the Container registry
        uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ github.token }}
      - name: Build and push Docker image
        # the step id is referenced by the attestation step, which reads this step's 'digest' output
        id: push
        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
        with:
          # NOTE(review): presumably set because the Dockerfiles copy nothing from the
          # build context — confirm how the action treats a null context.
          context: null
          file: ${{ matrix.dockerfile }}
          push: true
          tags: ghcr.io/${{ github.repository_owner }}/lightgbm:${{ matrix.tag }}
      # create an attestation to prove that this image came from a workflow in this repository
      - name: Generate artifact attestation
        uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2.4.0
        with:
          subject-name: ghcr.io/${{ github.repository_owner }}/lightgbm
          subject-digest: ${{ steps.push.outputs.digest }}
          push-to-registry: true
2 changes: 1 addition & 1 deletion .github/workflows/cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ jobs:
task: if-else
compiler: gcc
python_version: '3.11'
container: 'lightgbm.azurecr.io/vsts-agent:manylinux_2_28_x86_64'
container: 'ghcr.io/microsoft/lightgbm:ci-manylinux_2_28_x86_64-dev'
os-display-name: 'manylinux_2_28'
steps:
- name: Install packages used by third-party actions
Expand Down
Loading
Loading