Skip to content

Commit ba9eb99

Browse files
committed
feat: support conversion of BigQuery schema to Protobuf
Add functionality to convert BigQuery table schemas to protocol buffer descriptors. This enables schema conversion for BigQuery Storage Write API operations. - Add table_schema_to_proto_descriptor function - Support basic types, structs, and range fields - Implement field name sanitization and collision avoidance - Add comprehensive test coverage - Update documentation and changelog
1 parent 3928ddc commit ba9eb99

5 files changed

Lines changed: 903 additions & 1 deletion

File tree

packages/google-cloud-bigquery-storage/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44

55
[1]: https://pypi.org/project/google-cloud-bigquery-storage/#history
66

7+
## [2.37.0](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.1...google-cloud-bigquery-storage-v2.37.0) (2026-02-17)
8+
9+
10+
### Features
11+
12+
* support conversion of BigQuery schema to Protobuf ([2711330](https://github.com/googleapis/google-cloud-python/commit/2711330f0a096a2a9d1b02e51081d1af25a37501))
13+
714
## [2.36.1](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.0...google-cloud-bigquery-storage-v2.36.1) (2026-02-12)
815

916

packages/google-cloud-bigquery-storage/docs/bigquery_storage_v1/library.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ Bigquery Storage v1 API Library
88
.. automodule:: google.cloud.bigquery_storage_v1.reader
99
:members:
1010
:inherited-members:
11+
12+
.. automodule:: google.cloud.bigquery_storage_v1.schema
13+
:members:
14+
:inherited-members:

packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# this code path once we drop support for Python 3.7
2929
import importlib_metadata as metadata
3030

31-
from google.cloud.bigquery_storage_v1 import client, types
31+
from google.cloud.bigquery_storage_v1 import client, schema, types
3232

3333

3434
class BigQueryReadClient(client.BigQueryReadClient):
@@ -140,4 +140,6 @@ def _get_version(dependency_name):
140140
# google.cloud.bigquery_storage_v1.client
141141
"BigQueryReadClient",
142142
"BigQueryWriteClient",
143+
# google.cloud.bigquery_storage_v1.schema
144+
"schema",
143145
)
Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Utilities for converting BigQuery schemas to Protocol Buffer descriptors.
16+
17+
This module provides functionality to dynamically generate Protocol Buffer
18+
descriptors from BigQuery table schemas, eliminating the need to manually
19+
create and compile .proto files when using the BigQuery Storage Write API.
20+
"""
21+
22+
import re
23+
from typing import Dict, List, Tuple
24+
25+
from google.cloud.bigquery_storage_v1 import types
26+
from google.protobuf import descriptor_pb2
27+
28+
29+
# Mapping from BigQuery types to Protocol Buffer field types.
# STRUCT and RANGE are intentionally absent: they map to nested messages
# and are handled separately by the conversion functions below.
_BQ_TO_PROTO_TYPE_MAP: Dict[types.TableFieldSchema.Type, int] = {
    types.TableFieldSchema.Type.STRING: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    types.TableFieldSchema.Type.INT64: descriptor_pb2.FieldDescriptorProto.TYPE_INT64,
    types.TableFieldSchema.Type.BOOL: descriptor_pb2.FieldDescriptorProto.TYPE_BOOL,
    types.TableFieldSchema.Type.BYTES: descriptor_pb2.FieldDescriptorProto.TYPE_BYTES,
    types.TableFieldSchema.Type.DOUBLE: descriptor_pb2.FieldDescriptorProto.TYPE_DOUBLE,
    # DATE is represented as days since epoch
    types.TableFieldSchema.Type.DATE: descriptor_pb2.FieldDescriptorProto.TYPE_INT32,
    # DATETIME is represented as a formatted string
    types.TableFieldSchema.Type.DATETIME: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    # TIME is represented as a formatted string
    types.TableFieldSchema.Type.TIME: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    # TIMESTAMP is represented as microseconds since epoch
    types.TableFieldSchema.Type.TIMESTAMP: descriptor_pb2.FieldDescriptorProto.TYPE_INT64,
    # NUMERIC and BIGNUMERIC are represented as strings
    types.TableFieldSchema.Type.NUMERIC: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    types.TableFieldSchema.Type.BIGNUMERIC: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    # GEOGRAPHY is represented as WKT string
    types.TableFieldSchema.Type.GEOGRAPHY: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    # JSON is represented as a string
    types.TableFieldSchema.Type.JSON: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
    # INTERVAL is represented as a string
    types.TableFieldSchema.Type.INTERVAL: descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
}
54+
55+
56+
def _sanitize_field_name(field_name: str) -> str:
57+
"""Sanitize a field name to make it proto-compatible.
58+
59+
Args:
60+
field_name: The original field name.
61+
62+
Returns:
63+
The sanitized field name.
64+
"""
65+
# Replace invalid characters with underscores.
66+
sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', field_name)
67+
# If the first character is a digit, prepend an underscore.
68+
if sanitized and sanitized[0].isdigit():
69+
sanitized = '_' + sanitized
70+
# As a convention, field names are lowercased.
71+
return sanitized.lower()
72+
73+
74+
def _get_field_label(mode: types.TableFieldSchema.Mode) -> int:
    """Map a BigQuery field mode onto the matching proto2 field label.

    Args:
        mode: The BigQuery field mode (NULLABLE, REQUIRED, or REPEATED).

    Returns:
        The corresponding Protocol Buffer field label constant. Any mode
        other than REQUIRED/REPEATED (including MODE_UNSPECIFIED) falls
        back to LABEL_OPTIONAL.
    """
    label_by_mode = {
        types.TableFieldSchema.Mode.REQUIRED: descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED,
        types.TableFieldSchema.Mode.REPEATED: descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED,
    }
    return label_by_mode.get(
        mode, descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
    )
89+
90+
91+
def _convert_bq_field_to_proto_field(
    bq_field: types.TableFieldSchema,
    field_number: int,
    scope: str,
) -> descriptor_pb2.FieldDescriptorProto:
    """Build the FieldDescriptorProto for a single BigQuery column.

    Args:
        bq_field: The BigQuery field schema.
        field_number: The proto field number (position) to assign.
        scope: Fully scoped message-type name used for STRUCT/RANGE
            fields; ignored for primitive types.

    Returns:
        A populated FieldDescriptorProto for the field.

    Raises:
        ValueError: If the BigQuery type has no proto mapping.
    """
    proto_field = descriptor_pb2.FieldDescriptorProto()
    proto_field.name = _sanitize_field_name(bq_field.name)
    proto_field.number = field_number
    # An unset mode is treated as NULLABLE.
    proto_field.label = _get_field_label(
        bq_field.mode or types.TableFieldSchema.Mode.NULLABLE
    )

    # STRUCT and RANGE both become nested messages named by *scope*;
    # everything else is a scalar looked up in the type map.
    if bq_field.type_ in (
        types.TableFieldSchema.Type.STRUCT,
        types.TableFieldSchema.Type.RANGE,
    ):
        proto_field.type = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE
        proto_field.type_name = scope
    else:
        try:
            proto_field.type = _BQ_TO_PROTO_TYPE_MAP[bq_field.type_]
        except KeyError:
            raise ValueError(
                f"Unsupported BigQuery type: {bq_field.type_} for field {bq_field.name}"
            ) from None

    return proto_field
129+
130+
131+
def _convert_bq_table_schema_to_proto_descriptor_impl(
    table_schema: types.TableSchema,
    scope: str,
) -> Tuple[descriptor_pb2.DescriptorProto, List[descriptor_pb2.DescriptorProto]]:
    """Recursively build a proto descriptor for *table_schema*.

    Args:
        table_schema: The BigQuery table schema.
        scope: Scoped name for the message at this nesting level; nested
            STRUCT/RANGE messages append ``__<sanitized_field_name>``.

    Returns:
        A tuple of (descriptor, nested_descriptors):
        - descriptor: The DescriptorProto for this level
        - nested_descriptors: List of all nested DescriptorProto objects
          generated for STRUCT/RANGE fields beneath this level.

    Raises:
        ValueError: If the schema contains unsupported field types or invalid RANGE fields.
    """
    proto_fields: List[descriptor_pb2.FieldDescriptorProto] = []
    collected_nested: List[descriptor_pb2.DescriptorProto] = []

    # Proto field numbers are 1-based and follow schema order.
    for number, bq_field in enumerate(table_schema.fields, start=1):
        if bq_field.type_ == types.TableFieldSchema.Type.STRUCT:
            # Nested messages are named <parent_scope>__<field_name>.
            child_scope = f"{scope}__{_sanitize_field_name(bq_field.name)}"

            # Recurse into the struct's own fields.
            child, descendants = _convert_bq_table_schema_to_proto_descriptor_impl(
                types.TableSchema(fields=list(bq_field.fields)), child_scope
            )
            collected_nested.append(child)
            collected_nested.extend(descendants)

            proto_fields.append(
                _convert_bq_field_to_proto_field(bq_field, number, child_scope)
            )

        elif bq_field.type_ == types.TableFieldSchema.Type.RANGE:
            child_scope = f"{scope}__{_sanitize_field_name(bq_field.name)}"

            # A RANGE must declare its element type.
            if not bq_field.range_element_type or not bq_field.range_element_type.type_:
                raise ValueError(
                    f"RANGE field '{bq_field.name}' is missing range_element_type. "
                    f"RANGE fields must specify an element type (DATE, DATETIME, or TIMESTAMP)."
                )

            element_type = bq_field.range_element_type.type_
            if element_type not in (
                types.TableFieldSchema.Type.DATE,
                types.TableFieldSchema.Type.DATETIME,
                types.TableFieldSchema.Type.TIMESTAMP,
            ):
                raise ValueError(
                    f"Unsupported element type '{element_type}' for RANGE field '{bq_field.name}'. "
                    f"Supported types are DATE, DATETIME, and TIMESTAMP."
                )

            # A RANGE maps to a nested message with NULLABLE "start" and
            # "end" fields of the declared element type.
            endpoints = types.TableSchema(
                fields=[
                    types.TableFieldSchema(
                        name=endpoint,
                        type_=element_type,
                        mode=types.TableFieldSchema.Mode.NULLABLE,
                    )
                    for endpoint in ("start", "end")
                ]
            )
            range_descriptor, _ = _convert_bq_table_schema_to_proto_descriptor_impl(
                endpoints, child_scope
            )
            collected_nested.append(range_descriptor)

            proto_fields.append(
                _convert_bq_field_to_proto_field(bq_field, number, child_scope)
            )

        else:
            # Primitive column: no nested message, so the scope is empty.
            proto_fields.append(
                _convert_bq_field_to_proto_field(bq_field, number, "")
            )

    # Assemble the descriptor for this nesting level.
    level_descriptor = descriptor_pb2.DescriptorProto()
    level_descriptor.name = scope
    level_descriptor.field.extend(proto_fields)

    return level_descriptor, collected_nested
232+
233+
234+
def table_schema_to_proto_descriptor(
    table_schema: types.TableSchema,
    message_name: str = "root",
) -> descriptor_pb2.DescriptorProto:
    """Convert a BigQuery TableSchema to a Protocol Buffer DescriptorProto.

    Generates a Protocol Buffer descriptor usable with the BigQuery
    Storage Write API, removing the need to author and compile .proto
    files by hand. The descriptor uses proto2 wire format, as the Write
    API requires.

    Args:
        table_schema: The BigQuery table schema to convert.
        message_name: Optional name for the root message type. Defaults to "root".

    Returns:
        A DescriptorProto that can be used with ProtoSchema in the Write API.

    Raises:
        ValueError: If the schema contains unsupported field types or invalid RANGE fields.

    Example:
        >>> from google.cloud.bigquery_storage_v1 import schema, types
        >>>
        >>> # Define a BigQuery schema
        >>> table_schema = types.TableSchema(fields=[
        ...     types.TableFieldSchema(
        ...         name="id",
        ...         type_=types.TableFieldSchema.Type.INT64,
        ...         mode=types.TableFieldSchema.Mode.REQUIRED
        ...     ),
        ...     types.TableFieldSchema(
        ...         name="name",
        ...         type_=types.TableFieldSchema.Type.STRING
        ...     ),
        ... ])
        >>>
        >>> # Convert to proto descriptor
        >>> proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema)
        >>>
        >>> # Use with Write API
        >>> proto_schema = types.ProtoSchema()
        >>> proto_schema.proto_descriptor = proto_descriptor

    Note:
        For detailed information about BigQuery to Protocol Buffer type mappings,
        see: https://cloud.google.com/bigquery/docs/write-api#data_type_conversions
    """
    # Build the root plus every nested STRUCT/RANGE message, all named
    # with scope-based names rooted at *message_name*.
    descriptor, nested = _convert_bq_table_schema_to_proto_descriptor_impl(
        table_schema, message_name
    )

    # Attach the nested message types directly under the root.
    descriptor.nested_type.extend(nested)

    return descriptor
290+
291+
292+
# Public API of this module: only the top-level conversion entry point.
__all__ = ("table_schema_to_proto_descriptor",)

0 commit comments

Comments
 (0)