-
Notifications
You must be signed in to change notification settings - Fork 19
Closed as not planned
Description
This originally came up in a discussion FluxML/NNop.jl#10 (comment).
julia> x = CuArray(randn(BFloat16, 4, 4))
4×4 CuArray{BFloat16, 2, CUDA.DeviceMemory}:
0.269531 -1.13281 0.402344 0.139648
1.63281 -1.35156 0.0893555 0.6875
-0.703125 1.00781 -0.8125 -1.09375
-1.47656 -0.318359 0.355469 -0.945312
julia> Float32.(x)
4×4 CuArray{Float32, 2, CUDA.DeviceMemory}:
2.2435f-41 6.8721f-41 2.253f-41 2.2262f-41
2.2893f-41 6.876f-41 2.2139f-41 2.2667f-41
6.8591f-41 2.2781f-41 6.863f-41 6.8714f-41
6.8783f-41 6.8388f-41 2.2496f-41 6.8678f-41
I'm puzzled by why this happens, because all I had to do to make my Microfloat type work on the GPU was to change a lookup table from a Vector to a Tuple:
julia> x = CuArray(randn(Float8_E4M3, 4, 4))
4×4 CuArray{Float8_E4M3, 2, CUDA.DeviceMemory}:
-0.234375 -0.625 0.9375 -0.5625
0.46875 -0.03125 -1.0 -0.3125
0.6875 -1.125 1.0 0.5
-0.3125 -0.625 -0.8125 -0.28125
julia> Float32.(x)
4×4 CuArray{Float32, 2, CUDA.DeviceMemory}:
-0.234375 -0.625 0.9375 -0.5625
0.46875 -0.03125 -1.0 -0.3125
0.6875 -1.125 1.0 0.5
-0.3125 -0.625 -0.8125 -0.28125
and it seemingly does arithmetic on par with Float16 and Float32:
julia> x = randn(Float8_E4M3, 256, 256000) |> CuArray;
julia> @be CUDA.@sync x .+ x
Benchmark: 57 samples with 1 evaluation
min 1.712 ms (113 allocs: 3.172 KiB)
median 1.754 ms (113 allocs: 3.172 KiB)
mean 1.753 ms (113 allocs: 3.172 KiB)
max 1.789 ms (113 allocs: 3.172 KiB)
julia> x = randn(Float32, 256, 256000) |> CuArray;
julia> @be CUDA.@sync x .+ x
Benchmark: 61 samples with 1 evaluation
min 1.475 ms (113 allocs: 3.172 KiB)
median 1.537 ms (113 allocs: 3.172 KiB)
mean 1.654 ms (113 allocs: 3.172 KiB, 3.04% gc time)
max 8.406 ms (113 allocs: 3.172 KiB, 96.15% gc time)
julia> x = randn(Float16, 256, 256000) |> CuArray;
julia> @be CUDA.@sync x .+ x
Benchmark: 72 samples with 1 evaluation
min 1.274 ms (113 allocs: 3.172 KiB)
median 1.336 ms (113 allocs: 3.172 KiB)
mean 1.391 ms (113 allocs: 3.172 KiB, 1.30% gc time)
max 6.255 ms (113 allocs: 3.172 KiB, 93.46% gc time)
Yet BFloat16 doesn't even compile:
BFloat16 attempt
julia> x = randn(BFloat16, 256, 256000) |> CuArray;
julia> @be CUDA.@sync x .+ x
ERROR: LLVM error: Cannot select: 0x4e06aef0: bf16 = fadd 0x4e06bba0, 0x4e06afd0, /home/anton/.julia/packages/BFloat16s/LIFRh/src/bfloat16.jl:236 @[ broadcast.jl:678 @[ broadcast.jl:651 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ]
0x4e06bba0: bf16,ch = load<(load (s16) from %ir.52, !tbaa !195, addrspace 1)> 0x4bf081f0, 0x4df90440, undef:i64, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4df90440: i64 = add 0x4df90a60, Constant:i64<-2>, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4df90a60: i64 = add 0x4d7e2560, 0x4df90c20, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4d7e2560: i64,ch = CopyFromReg 0x4bf081f0, Register:i64 %18
0x4df90fa0: i64 = Register %18
0x4df90c20: i64 = shl 0x4df90de0, Constant:i32<1>, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4df90de0: i64 = add 0x4df90f30, 0x4e06bb30
0x4df90f30: i64 = mul 0x4df90590, 0x4df90980, int.jl:88 @[ abstractarray.jl:3080 @[ abstractarray.jl:3080 @[ abstractarray.jl:3064 @[ abstractarray.jl:3048 @[ abstractarray.jl:1347 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x4df90590: i64 = add 0x4e06b970, Constant:i64<-1>, int.jl:86 @[ abstractarray.jl:3087 @[ abstractarray.jl:3080 @[ abstractarray.jl:3080 @[ abstractarray.jl:3064 @[ abstractarray.jl:3048 @[ abstractarray.jl:1347 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4e06b970: i64 = select 0x4d7e25d0, 0x4d912f30, 0x4dd581a0, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ]
0x4d7e25d0: i1 = truncate 0x4d910a40, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ]
0x4d912f30: i64 = add 0x4d912e50, Constant:i64<1>, int.jl:87 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4dd581a0: i64,ch = CopyFromReg 0x4d910880:1, Register:i64 %26
0x4df908a0: i64 = Constant<-1>
0x4df90980: i64,ch = CopyFromReg 0x4dd57330:1, Register:i64 %20
0x4df90670: i64 = Register %20
0x4e06bb30: i64 = select 0x4d7e2410, 0x4e06ae80, 0x4d910880, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ]
0x4d7e2410: i1 = truncate 0x4d910dc0, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ]
0x4d910dc0: i16,ch = CopyFromReg 0x4d910f10:1, Register:i16 %23
0x4d9129f0: i16 = Register %23
0x4e06ae80: i64 = add 0x4d913630, Constant:i64<1>, int.jl:87 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4d913630: i64 = add 0x4d913550, 0x4d913470, int.jl:87 @[ abstractarray.jl:3105 @[ abstractarray.jl:3091 @[ abstractarray.jl:3053 @[ abstractarray.jl:1382 @[ abstractarray.jl:1360 @[ abstractarray.jl:1353 @[ abstractarray.jl:1312 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ] ] ] ] ]
0x4d913550: i64 = mul 0x4e06b820, 0x4e06b2e0, int.jl:88 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4d913470: i64 = sub 0x4e06af60, 0x4e06b200, int.jl:86 @[ abstractarray.jl:3105 @[ abstractarray.jl:3091 @[ abstractarray.jl:3053 @[ abstractarray.jl:1382 @[ abstractarray.jl:1360 @[ abstractarray.jl:1353 @[ abstractarray.jl:1312 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ] ] ] ] ]
0x4d913710: i64 = Constant<1>
0x4d910880: i64,ch = CopyFromReg 0x4d910a40:1, Register:i64 %25
0x4dd57720: i64 = Register %25
0x4d911760: i32 = Constant<1>
0x4df90750: i64 = Constant<-2>
0x4e06bc80: i64 = undef
0x4e06afd0: bf16,ch = load<(load (s16) from %ir.63, !tbaa !195, addrspace 1)> 0x4bf081f0, 0x4e06b0b0, undef:i64, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4e06b0b0: i64 = add 0x4e06b190, Constant:i64<-2>, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4e06b190: i64 = add 0x4d7e23a0, 0x4e06b270, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4d7e23a0: i64,ch = CopyFromReg 0x4dd581a0:1, Register:i64 %27
0x4dd57e90: i64 = Register %27
0x4e06b270: i64 = shl 0x4e06b350, Constant:i32<1>, /home/anton/.julia/packages/LLVM/UFrs4/src/interop/base.jl:39 @[ none:0 @[ none:0 @[ /home/anton/.julia/packages/LLVM/UFrs4/src/interop/pointer.jl:85 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:96 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:90 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:175 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4e06b350: i64 = add 0x4e06b430, 0x4e06b6d0
0x4e06b430: i64 = mul 0x4e06b510, 0x4d7e2b80, int.jl:88 @[ abstractarray.jl:3080 @[ abstractarray.jl:3080 @[ abstractarray.jl:3064 @[ abstractarray.jl:3048 @[ abstractarray.jl:1347 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4e06b510: i64 = add 0x4d912ec0, Constant:i64<-1>, int.jl:86 @[ abstractarray.jl:3087 @[ abstractarray.jl:3080 @[ abstractarray.jl:3080 @[ abstractarray.jl:3064 @[ abstractarray.jl:3048 @[ abstractarray.jl:1347 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/device/array.jl:186 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
0x4d912ec0: i64 = select 0x4d910b90, 0x4d912f30, 0x4dd58050, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x4d910b90: i1 = truncate 0x4d911140, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x4d912f30: i64 = add 0x4d912e50, Constant:i64<1>, int.jl:87 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4dd58050: i64,ch = CopyFromReg 0x4d7e29c0:1, Register:i64 %35
0x4df908a0: i64 = Constant<-1>
0x4d7e2b80: i64,ch = CopyFromReg 0x4d9110d0:1, Register:i64 %29
0x4df90ec0: i64 = Register %29
0x4e06b6d0: i64 = select 0x4dd57e20, 0x4e06ae80, 0x4d7e29c0, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ]
0x4dd57e20: i1 = truncate 0x4df90910, essentials.jl:796 @[ broadcast.jl:590 @[ broadcast.jl:587 @[ broadcast.jl:644 @[ broadcast.jl:675 @[ broadcast.jl:674 @[ broadcast.jl:650 @[ broadcast.jl:610 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 @[ none:0 ] ] ] ] ] ] ] ] ]
0x4df90910: i16,ch = CopyFromReg 0x4dd57870:1, Register:i16 %32
0x4df90d00: i16 = Register %32
0x4e06ae80: i64 = add 0x4d913630, Constant:i64<1>, int.jl:87 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4d913630: i64 = add 0x4d913550, 0x4d913470, int.jl:87 @[ abstractarray.jl:3105 @[ abstractarray.jl:3091 @[ abstractarray.jl:3053 @[ abstractarray.jl:1382 @[ abstractarray.jl:1360 @[ abstractarray.jl:1353 @[ abstractarray.jl:1312 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ] ] ] ] ]
0x4d913550: i64 = mul 0x4e06b820, 0x4e06b2e0, int.jl:88 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:79 @[ ntuple.jl:49 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:75 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ]
0x4d913470: i64 = sub 0x4e06af60, 0x4e06b200, int.jl:86 @[ abstractarray.jl:3105 @[ abstractarray.jl:3091 @[ abstractarray.jl:3053 @[ abstractarray.jl:1382 @[ abstractarray.jl:1360 @[ abstractarray.jl:1353 @[ abstractarray.jl:1312 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/nditeration.jl:121 @[ /home/anton/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:184 @[ /home/anton/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:323 @[ none:0 ] ] ] ] ] ] ] ] ] ] ]
0x4d913710: i64 = Constant<1>
0x4d7e29c0: i64,ch = CopyFromReg 0x4d911140:1, Register:i64 %34
0x4d9109d0: i64 = Register %34
0x4d911760: i32 = Constant<1>
0x4df90750: i64 = Constant<-2>
0x4e06bc80: i64 = undef
In function: _Z30gpu_broadcast_kernel_cartesian16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES6_EE7NDRangeILi2ES0_S0_S8_S8_EE13CuDeviceArrayI8BFloat16Li2ELi1EE11BroadcastedI12CuArrayStyleILi2E12DeviceMemoryES7_1_S3_I8ExtrudedISE_S3_I4BoolSL_ES3_IS5_S5_EESO_EE
Stacktrace:
[1] handle_error(reason::Cstring)
@ LLVM ~/.julia/packages/LLVM/UFrs4/src/core/context.jl:194
[2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
@ LLVM.API ~/.julia/packages/LLVM/UFrs4/lib/16/libLLVM.jl:11138
[3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
@ LLVM ~/.julia/packages/LLVM/UFrs4/src/targetmachine.jl:118
[4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/mcgen.jl:75
[5] mcgen(job::GPUCompiler.CompilerJob{…}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ CUDA ~/.julia/packages/CUDA/Wfi8S/src/compiler/compilation.jl:127
[6] macro expansion
@ ~/.julia/packages/Tracy/slmNc/src/tracepoint.jl:163 [inlined]
[7] macro expansion
@ ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:406 [inlined]
[8] macro expansion
@ ~/.julia/packages/Tracy/slmNc/src/tracepoint.jl:163 [inlined]
[9] macro expansion
@ ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:403 [inlined]
[10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/utils.jl:116
[11] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:115
[12] compile_unhooked
@ ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:80 [inlined]
[13] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:67
[14] compile
@ ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:55 [inlined]
[15] #1181
@ ~/.julia/packages/CUDA/Wfi8S/src/compiler/compilation.jl:250 [inlined]
[16] JuliaContext(f::CUDA.var"#1181#1184"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:34
[17] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:25
[18] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/Wfi8S/src/compiler/compilation.jl:249
[19] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:245
[20] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:159
[21] macro expansion
@ ~/.julia/packages/CUDA/Wfi8S/src/compiler/execution.jl:373 [inlined]
[22] macro expansion
@ ./lock.jl:273 [inlined]
[23] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_cartesian#43", tt::Type{…}; kwargs::@Kwargs{…})
@ CUDA ~/.julia/packages/CUDA/Wfi8S/src/compiler/execution.jl:368
[24] macro expansion
@ ~/.julia/packages/CUDA/Wfi8S/src/compiler/execution.jl:112 [inlined]
[25] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:124
[26] Kernel
@ ~/.julia/packages/CUDA/Wfi8S/src/CUDAKernels.jl:110 [inlined]
[27] _copyto!
@ ~/.julia/packages/GPUArrays/u6tui/src/host/broadcast.jl:71 [inlined]
[28] copyto!
@ ~/.julia/packages/GPUArrays/u6tui/src/host/broadcast.jl:44 [inlined]
[29] copy
@ ~/.julia/packages/GPUArrays/u6tui/src/host/broadcast.jl:29 [inlined]
[30] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{…}, Nothing, typeof(+), Tuple{…}})
@ Base.Broadcast ./broadcast.jl:872
[31] macro expansion
@ ~/.julia/packages/CUDA/Wfi8S/src/utilities.jl:35 [inlined]
[32] (::var"#43#44")()
@ Main ~/.julia/packages/Chairmarks/yJzRJ/src/macro_tools.jl:52
[33] _benchmark_3(::var"#43#44", ::Int64, ::Bool)
@ Chairmarks ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:145
[34] (::Chairmarks.var"#29#31"{Nothing, Bool, Int64, Bool, Tuple{…}, Tuple{}, Base.RefValue{…}, Tuple{…}})(i::Int64)
@ Chairmarks ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:124
[35] ntuple
@ ./ntuple.jl:19 [inlined]
[36] _benchmark_2(args1::Any, setup::Any, teardown::Any, gc::Bool, evals::Int64, warmup::Bool, fs::Any)
@ Chairmarks ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:121
[37] _benchmark_1(init::Any, setup::Any, teardown::Any, evals::Union{…}, samples::Union{…}, seconds::Union{…}, gc::Bool, fs::Any)
@ Chairmarks ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:40
[38] #benchmark#8
@ ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:17 [inlined]
[39] benchmark
@ ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:12 [inlined]
[40] benchmark
@ ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:5 [inlined]
[41] benchmark (repeats 2 times)
@ ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:4 [inlined]
[42] benchmark(f::Function)
@ Chairmarks ~/.julia/packages/Chairmarks/yJzRJ/src/benchmarking.jl:3
[43] top-level scope
@ REPL[142]:1
Some type information was truncated. Use `show(err)` to see complete types.
Are some LLVM instructions not being translated for the GPU target, or something along those lines?
I want to help in any way I can to make BFloat16 work on GPUs in Julia, and at the moment it seems the easiest path is a fork, with no LLVM stuff.
Metadata
Metadata
Assignees
Labels
No labels