Skip to content
69 changes: 49 additions & 20 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1822,9 +1822,9 @@ def melt(
Arguments correspond to pandas.melt arguments.
"""
# TODO: Implement col_level and ignore_index
value_labels: pd.Index = pd.Index(
[self.col_id_to_label[col_id] for col_id in value_vars]
)
value_labels: pd.Index = self.column_labels[
[self.value_columns.index(col_id) for col_id in value_vars]
]
id_labels = [self.col_id_to_label[col_id] for col_id in id_vars]

unpivot_expr, (var_col_ids, unpivot_out, passthrough_cols) = unpivot(
Expand Down Expand Up @@ -3417,6 +3417,7 @@ def unpivot(
joined_array, (labels_mapping, column_mapping) = labels_array.relational_join(
array_value, type="cross"
)

new_passthrough_cols = [column_mapping[col] for col in passthrough_columns]
# Last column is offsets
index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]]
Expand All @@ -3426,20 +3427,24 @@ def unpivot(
unpivot_exprs: List[ex.Expression] = []
# Supports producing multiple stacked output columns for stacking only part of hierarchical index
for input_ids in unpivot_columns:
# row explode offset used to choose the input column
# we use offset instead of label as labels are not necessarily unique
cases = itertools.chain(
*(
(
ops.eq_op.as_expr(explode_offsets_id, ex.const(i)),
ex.deref(column_mapping[id_or_null])
if (id_or_null is not None)
else ex.const(None),
col_expr: ex.Expression
if not input_ids:
col_expr = ex.const(None, dtype=bigframes.dtypes.INT_DTYPE)
else:
# row explode offset used to choose the input column
# we use offset instead of label as labels are not necessarily unique
cases = itertools.chain(
*(
(
ops.eq_op.as_expr(explode_offsets_id, ex.const(i)),
ex.deref(column_mapping[id_or_null])
if (id_or_null is not None)
else ex.const(None),
)
for i, id_or_null in enumerate(input_ids)
)
for i, id_or_null in enumerate(input_ids)
)
)
col_expr = ops.case_when_op.as_expr(*cases)
col_expr = ops.case_when_op.as_expr(*cases)
unpivot_exprs.append(col_expr)

joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs)
Expand All @@ -3457,19 +3462,43 @@ def _pd_index_to_array_value(
Create an ArrayValue from a list of label tuples.
The last column will be row offsets.
"""
id_gen = bigframes.core.identifiers.standard_id_strings()
col_ids = [next(id_gen) for _ in range(index.nlevels)]
offset_id = next(id_gen)

rows = []
labels_as_tuples = utils.index_as_tuples(index)
for row_offset in range(len(index)):
id_gen = bigframes.core.identifiers.standard_id_strings()
row_label = labels_as_tuples[row_offset]
row_label = (row_label,) if not isinstance(row_label, tuple) else row_label
row = {}
for label_part, id in zip(row_label, id_gen):
row[id] = label_part if pd.notnull(label_part) else None
row[next(id_gen)] = row_offset
for label_part, col_id in zip(row_label, col_ids):
row[col_id] = label_part if pd.notnull(label_part) else None
row[offset_id] = row_offset
rows.append(row)

return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session)
if not rows:
dtypes_list = getattr(index, "dtypes", None)
if dtypes_list is None:
dtypes_list = (
[index.dtype] if hasattr(index, "dtype") else [pd.Float64Dtype()]
)

fields = []
for col_id, dtype in zip(col_ids, dtypes_list):
try:
pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(dtype)
except Exception:
pa_type = pa.string()
fields.append(pa.field(col_id, pa_type))
fields.append(pa.field(offset_id, pa.int64()))
schema = pa.schema(fields)
pt = pa.Table.from_pylist([], schema=schema)
else:
pt = pa.Table.from_pylist(rows)
pt = pt.rename_columns([*col_ids, offset_id])

return core.ArrayValue.from_pyarrow(pt, session=session)


def _resolve_index_col(
Expand Down
13 changes: 13 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5902,6 +5902,19 @@ def test_to_gbq_table_labels(scalars_df_index):
assert table.labels["test"] == "labels"


def test_to_gbq_obj_ref_persists(session):
    """Round-trip an Object Reference column through to_gbq/read_gbq and
    verify its dtype is retained on reload."""
    # Build a one-row frame whose "uris" column holds an ObjectRef.
    source_df = session.from_glob_path(
        "gs://cloud-samples-data/vision/ocr/*.jpg", name="uris"
    ).head(1)

    destination_table = "bigframes-dev.bigframes_tests_sys.test_obj_ref_persistence"
    source_df.to_gbq(destination_table, if_exists="replace")

    # Reading the table back should restore the ObjectRef dtype, not a plain
    # string/struct column.
    round_tripped = session.read_gbq(destination_table)
    assert round_tripped["uris"].dtype == dtypes.OBJ_REF_DTYPE


@pytest.mark.parametrize(
("col_names", "ignore_index"),
[
Expand Down
31 changes: 31 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1490,3 +1490,34 @@ def test_multiindex_eq_const(scalars_df_index, scalars_pandas_df_index):
bigframes.testing.utils.assert_index_equal(
pandas.Index(pd_result, dtype="boolean"), bf_result.to_pandas()
)


def test_count_empty_multiindex_columns(session):
    """count() on a frame whose column axis is an empty MultiIndex should
    compile to SQL and keep the two-level index structure."""
    empty_columns = pandas.MultiIndex.from_tuples([], names=["a", "b"])
    pandas_df = pandas.DataFrame([], index=[1, 2], columns=empty_columns)
    bdf = session.read_pandas(pandas_df)

    # count() operation unpivots columns, triggering the empty MultiIndex bug internally
    count_result = bdf.count()

    # The local fix ensures that empty unpivoted columns generate properly typed NULLs
    # rather than failing syntax validation downstream in BigQuery.
    # We compile to `.sql` to verify it succeeds locally without evaluating on BigQuery natively.
    _ = count_result.to_frame().sql

    # Assert structural layout is correct
    assert count_result.index.nlevels == 2
    assert list(count_result.index.names) == ["a", "b"]


def test_dataframe_melt_multiindex(session):
    """count() over MultiIndex columns should keep every column label through
    the internal melt/Arrow path (no MultiIndex rows dropped)."""
    pandas_df = pandas.DataFrame({"A": [1], "B": ["string"], "C": [3]})
    pandas_df.columns = pandas.MultiIndex.from_tuples(
        [("Group1", "A"), ("Group2", "B"), ("Group1", "C")]
    )
    bdf = session.read_pandas(pandas_df)

    # Three input columns must yield three counted entries.
    counted = bdf.count().to_pandas()
    assert counted.shape[0] == 3
Loading