diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 14e0f315dc..f083887045 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,9 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in functions for use with DataFrame objects, -such as array functions: -https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ +""" +Access BigQuery-specific operations and namespaces within BigQuery DataFrames. + +This module provides specialized functions and sub-modules that expose BigQuery's +advanced capabilities to DataFrames and Series. It acts as a bridge between the +pandas-compatible API and the full power of BigQuery SQL. + +Key sub-modules include: + +* :mod:`bigframes.bigquery.ai`: Generative and predictive AI functions (Gemini, BQML). +* :mod:`bigframes.bigquery.ml`: Direct access to BigQuery ML model operations. +* :mod:`bigframes.bigquery.obj`: Support for BigQuery object tables. + +This module also provides direct access to optimized BigQuery functions for: + +* **JSON Processing:** High-performance functions like ``json_extract``, ``json_value``, + and ``parse_json`` for handling semi-structured data. +* **Geospatial Analysis:** Comprehensive geographic functions such as ``st_area``, + ``st_distance``, and ``st_centroid`` (``ST_`` prefixed functions). +* **Array Operations:** Tools for working with BigQuery arrays, including ``array_agg`` + and ``array_length``. +* **Vector Search:** Integration with BigQuery's vector search and indexing + capabilities for high-dimensional data. +* **Custom SQL:** The ``sql_scalar`` function allows embedding raw SQL snippets for + advanced operations not yet directly mapped in the API. 
+ +By using these functions, you can leverage BigQuery's high-performance engine for +domain-specific tasks while maintaining a Python-centric development experience. + +For the full list of BigQuery standard SQL functions, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-reference +""" import sys diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index bb24d5dc33..25a7df7781 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -12,9 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects, -such as AI.GENERATE_BOOL: -https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool""" +""" +Integrate BigQuery built-in AI functions into your BigQuery DataFrames workflow. + +The ``bigframes.bigquery.ai`` module provides a Pythonic interface to leverage BigQuery ML's +generative AI and predictive functions directly on BigQuery DataFrames and Series objects. +These functions enable you to perform advanced AI tasks at scale without moving data +out of BigQuery. + +Key capabilities include: + +* **Generative AI:** Use :func:`bigframes.bigquery.ai.generate` (Gemini) to + perform text analysis, translation, or + content generation. Specialized versions like + :func:`~bigframes.bigquery.ai.generate_bool`, + :func:`~bigframes.bigquery.ai.generate_int`, and + :func:`~bigframes.bigquery.ai.generate_double` are available for structured + outputs. +* **Embeddings:** Generate vector embeddings for text using + :func:`~bigframes.bigquery.ai.generate_embedding`, which are essential for + semantic search and retrieval-augmented generation (RAG) workflows. +* **Classification and Scoring:** Apply machine learning models to your data for + predictive tasks with :func:`~bigframes.bigquery.ai.classify` and + :func:`~bigframes.bigquery.ai.score`. 
+* **Forecasting:** Predict future values in time-series data using +  :func:`~bigframes.bigquery.ai.forecast`. + +**Example usage:** + +    >>> import bigframes.pandas as bpd +    >>> import bigframes.bigquery as bbq + +    >>> df = bpd.DataFrame({ +    ...     "text_input": [ +    ...         "Is this a positive review? The food was terrible.", +    ...     ], +    ... })  # doctest: +SKIP + +    >>> # Use Gemini to analyze each row of text +    >>> result = bbq.ai.generate(df["text_input"])  # doctest: +SKIP + +For more information on the underlying BigQuery ML syntax, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool +""" from bigframes.bigquery._operations.ai import ( classify, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fcb60bf778..4db900e776 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -12,7 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""BigQuery DataFrames provides a DataFrame API backed by the BigQuery engine.""" +""" +The primary entry point for the BigQuery DataFrames (BigFrames) pandas-compatible API. + +**BigQuery DataFrames** provides a Pythonic DataFrame and machine learning (ML) API +powered by the BigQuery engine. The ``bigframes.pandas`` module implements a large +subset of the pandas API, allowing you to perform large-scale data analysis +using familiar pandas syntax while the computations are executed in the cloud. + +**Key Features:** + +* **Petabyte-Scale Scalability:** Handle datasets that exceed local memory by +  offloading computation to the BigQuery distributed engine. 
+* **Pandas Compatibility:** Use common pandas methods like +  :func:`~bigframes.pandas.DataFrame.groupby`, +  :func:`~bigframes.pandas.DataFrame.merge`, +  :func:`~bigframes.pandas.DataFrame.pivot_table`, and more on BigQuery-backed +  :class:`~bigframes.pandas.DataFrame` objects. +* **Direct BigQuery Integration:** Read from and write to BigQuery tables and +  queries with :func:`bigframes.pandas.read_gbq` and +  :func:`bigframes.pandas.DataFrame.to_gbq`. +* **User-defined Functions (UDFs):** Effortlessly deploy Python +  functions using the :func:`bigframes.pandas.remote_function` and +  :func:`bigframes.pandas.udf` decorators. +* **Data Ingestion:** Support for various formats including CSV, Parquet, JSON, +  and Arrow via :func:`bigframes.pandas.read_csv`, +  :func:`bigframes.pandas.read_parquet`, etc., which are automatically uploaded +  to BigQuery for processing. Convert any pandas DataFrame into a BigQuery +  DataFrame using :func:`bigframes.pandas.read_pandas`. + +**Example usage:** + +    >>> import bigframes.pandas as bpd + +Initialize session and set options. + +    >>> bpd.options.bigquery.project = "your-project-id"  # doctest: +SKIP + +Load data from a BigQuery public dataset. + +    >>> df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")  # doctest: +SKIP + +Perform familiar pandas operations that execute in the cloud. + +    >>> top_names = ( +    ...     df.groupby("name") +    ...     .agg({"number": "sum"}) +    ...     .sort_values("number", ascending=False) +    ...     .head(10) +    ... )  # doctest: +SKIP + +Bring the final, aggregated results back to local memory if needed. + +    >>> local_df = top_names.to_pandas()  # doctest: +SKIP + +BigQuery DataFrames is designed for data scientists and analysts who need the +power of BigQuery with the ease of use of pandas. It eliminates the "data +movement bottleneck" by keeping your data in BigQuery for processing. 
+""" from __future__ import annotations diff --git a/docs/index.rst b/docs/index.rst index 00c59a6745..19b05bc1b6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,46 +1,52 @@ .. BigQuery DataFrames documentation main file -Welcome to BigQuery DataFrames -============================== +Scalable Python Data Analysis with BigQuery DataFrames (BigFrames) +================================================================== -**BigQuery DataFrames** (``bigframes``) provides a Pythonic interface for data analysis that scales to petabytes. It gives you the best of both worlds: the familiar API of **pandas** and **scikit-learn**, powered by the distributed computing engine of **BigQuery**. +.. meta:: + :description: BigQuery DataFrames (BigFrames) provides a scalable, pandas-compatible Python API for data analysis and machine learning on petabyte-scale datasets using the BigQuery engine. -BigQuery DataFrames consists of three main components: +**BigQuery DataFrames** (``bigframes``) is an open-source Python library that brings the power of **distributed computing** to your data science workflow. By providing a familiar **pandas** and **scikit-learn** compatible API, BigFrames allows you to analyze and model massive datasets where they live—directly in **BigQuery**. -* **bigframes.pandas**: A pandas-compatible API for data exploration and transformation. -* **bigframes.ml**: A scikit-learn-like interface for BigQuery ML, including integration with Gemini. -* **bigframes.bigquery**: Specialized functions for managing BigQuery resources and deploying custom logic. +Why Choose BigQuery DataFrames? +------------------------------- -Why BigQuery DataFrames? ------------------------- +BigFrames eliminates the "data movement bottleneck." Instead of downloading large datasets to a local environment, BigFrames translates your Python code into optimized SQL, executing complex transformations across the BigQuery fleet. -BigFrames allows you to process data where it lives. 
Instead of downloading massive datasets to your local machine, BigFrames translates your Python code into SQL and executes it across the BigQuery fleet. +* **Petabyte-Scale Scalability:** Effortlessly process datasets that far exceed local memory limits. +* **Familiar Python Ecosystem:** Use the same ``read_gbq``, ``groupby``, ``merge``, and ``pivot_table`` functions you already know from pandas. +* **Integrated Machine Learning:** Access BigQuery ML's powerful algorithms via a scikit-learn-like interface (``bigframes.ml``), including seamless **Gemini AI** integration. +* **Enterprise-Grade Security:** Maintain data governance and security by keeping your data within the BigQuery perimeter. +* **Hybrid Flexibility:** Easily move between distributed BigQuery processing and local pandas analysis with ``to_pandas()``. -* **Scalability:** Work with datasets that exceed local memory limits without complex refactoring. -* **Collaboration & Extensibility:** Bridge the gap between Python and SQL. Deploy custom Python functions to BigQuery, making your logic accessible to SQL-based teammates and data analysts. -* **Production-Ready Pipelines:** Move seamlessly from interactive notebooks to production. BigFrames simplifies data engineering by integrating with tools like **dbt** and **Airflow**, offering a simpler operational model than Spark. -* **Security & Governance:** Keep your data within the BigQuery perimeter. Benefit from enterprise-grade security, auditing, and data governance throughout your entire Python workflow. -* **Familiarity:** Use ``read_gbq``, ``merge``, ``groupby``, and ``pivot_table`` just like you do in pandas. +Core Components of BigFrames +---------------------------- -Quickstart ----------- +BigQuery DataFrames is organized into specialized modules designed for the modern data stack: -Install the library via pip: +1. :mod:`bigframes.pandas`: A high-performance, pandas-compatible API for scalable data exploration, cleaning, and transformation. +2. 
:mod:`bigframes.ml`: A scikit-learn-like interface for scalable machine learning powered by BigQuery ML. +3. :mod:`bigframes.bigquery`: Specialized utilities for direct BigQuery resource management, including integrations with Gemini and other AI models in the :mod:`bigframes.bigquery.ai` submodule. + + +Quickstart: Scalable Data Analysis in Seconds +--------------------------------------------- + +Install BigQuery DataFrames via pip: .. code-block:: bash pip install --upgrade bigframes -Load and aggregate a public dataset in just a few lines: +The following example demonstrates how to perform a distributed aggregation on a public dataset with millions of rows using just a few lines of Python: .. code-block:: python import bigframes.pandas as bpd - # Load data from BigQuery + # Initialize BigFrames and load a public dataset df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") - # Perform familiar pandas operations at scale + # Perform familiar pandas operations that execute in the cloud top_names = ( df.groupby("name") .agg({"number": "sum"}) @@ -48,32 +54,28 @@ Load and aggregate a public dataset in just a few lines: .head(10) ) + # Bring the final, aggregated results back to local memory if needed print(top_names.to_pandas()) -User Guide ---------- +Explore the Documentation +------------------------- .. toctree:: :maxdepth: 2 + :caption: User Documentation user_guide/index -API reference ------------- .. toctree:: - :maxdepth: 3 + :maxdepth: 2 + :caption: API Reference reference/index supported_pandas_apis -Changelog --------- - -For a list of all BigQuery DataFrames releases: - .. toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: Community & Updates changelog