|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +use itertools::Itertools; |
| 5 | +use num_traits::AsPrimitive; |
| 6 | +use vortex_buffer::Buffer; |
| 7 | +use vortex_buffer::BufferMut; |
| 8 | +use vortex_buffer::ByteBuffer; |
| 9 | +use vortex_buffer::ByteBufferMut; |
| 10 | +use vortex_dtype::NativePType; |
| 11 | +use vortex_vector::binaryview::BinaryView; |
| 12 | + |
| 13 | +/// Convert an offsets buffer to a buffer of element lengths. |
| 14 | +#[inline] |
| 15 | +pub fn offsets_to_lengths<P: NativePType>(offsets: &[P]) -> Buffer<P> { |
| 16 | + offsets |
| 17 | + .iter() |
| 18 | + .tuple_windows::<(_, _)>() |
| 19 | + .map(|(&start, &end)| end - start) |
| 20 | + .collect() |
| 21 | +} |
| 22 | + |
/// Upper bound on the number of buffer bytes a single `BinaryView` can reference.
///
/// Equal to `i32::MAX`, i.e. 2^31 - 1 bytes.
pub const MAX_BUFFER_LEN: usize = i32::MAX.unsigned_abs() as usize;
| 25 | + |
| 26 | +/// Split a large buffer of input `bytes` holding string data |
| 27 | +pub fn build_views<P: NativePType + AsPrimitive<usize>>( |
| 28 | + start_buf_index: u32, |
| 29 | + max_buffer_len: usize, |
| 30 | + mut bytes: ByteBufferMut, |
| 31 | + lens: &[P], |
| 32 | +) -> (Vec<ByteBuffer>, Buffer<BinaryView>) { |
| 33 | + let mut views = BufferMut::<BinaryView>::with_capacity(lens.len()); |
| 34 | + |
| 35 | + let mut buffers = Vec::new(); |
| 36 | + let mut buf_index = start_buf_index; |
| 37 | + |
| 38 | + let mut offset = 0; |
| 39 | + for &len in lens { |
| 40 | + let len = len.as_(); |
| 41 | + assert!(len <= max_buffer_len, "values cannot exceed max_buffer_len"); |
| 42 | + |
| 43 | + if (offset + len) > max_buffer_len { |
| 44 | + // Roll the buffer every 2GiB, to avoid overflowing VarBinView offset field |
| 45 | + let rest = bytes.split_off(offset); |
| 46 | + |
| 47 | + buffers.push(bytes.freeze()); |
| 48 | + buf_index += 1; |
| 49 | + offset = 0; |
| 50 | + |
| 51 | + bytes = rest; |
| 52 | + } |
| 53 | + let view = BinaryView::make_view(&bytes[offset..][..len], buf_index, offset.as_()); |
| 54 | + // SAFETY: we reserved the right capacity beforehand |
| 55 | + unsafe { views.push_unchecked(view) }; |
| 56 | + offset += len; |
| 57 | + } |
| 58 | + |
| 59 | + if !bytes.is_empty() { |
| 60 | + buffers.push(bytes.freeze()); |
| 61 | + } |
| 62 | + |
| 63 | + (buffers, views.freeze()) |
| 64 | +} |
| 65 | + |
#[cfg(test)]
mod tests {
    use vortex_buffer::ByteBuffer;
    use vortex_buffer::ByteBufferMut;
    use vortex_vector::binaryview::BinaryView;

    use crate::arrays::build_views::build_views;

    /// Four 13-byte values with a 26-byte buffer limit must be split into two buffers
    /// holding two values each.
    #[test]
    fn test_to_canonical_large() {
        // The input holds four values stored back-to-back:
        //
        //   "a" * 13, "b" * 13, "c" * 13, "d" * 13
        //
        // All of this would normally fit in a single buffer. Passing a limit of 26 bytes
        // forces the splitting logic to run, so the output must consist of two buffers.
        let input = ByteBufferMut::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbbcccccccccccccddddddddddddd");
        let value_lens = [13u8; 4];

        let (buffers, views) = build_views(0, 26, input, &value_lens);

        let expected_buffers = vec![
            ByteBuffer::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbb"),
            ByteBuffer::copy_from("cccccccccccccddddddddddddd"),
        ];
        assert_eq!(buffers, expected_buffers);

        // Offsets restart at zero in the second buffer.
        let expected_views = [
            BinaryView::make_view(b"aaaaaaaaaaaaa", 0, 0),
            BinaryView::make_view(b"bbbbbbbbbbbbb", 0, 13),
            BinaryView::make_view(b"ccccccccccccc", 1, 0),
            BinaryView::make_view(b"ddddddddddddd", 1, 13),
        ];
        assert_eq!(views.as_slice(), &expected_views);
    }
}
0 commit comments