diff --git a/.github/workflows/hamilton-core-main.yml b/.github/workflows/hamilton-core-main.yml new file mode 100644 index 000000000..f26ec478a --- /dev/null +++ b/.github/workflows/hamilton-core-main.yml @@ -0,0 +1,48 @@ +name: Unit tests (hamilton-core) + +on: + workflow_dispatch: + + pull_request: + branches: + - main + paths: + - '.github/**' + - 'hamilton/**' + - 'tests/**' + - 'pyproject.toml' + +jobs: + test: + name: "Unit Tests (hamilton-core)" + runs-on: ubuntu-latest + env: + UV_PRERELEASE: "allow" + HAMILTON_TELEMETRY_ENABLED: false + + steps: + - name: Install Graphviz on Linux + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install --yes --no-install-recommends graphviz + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: "3.12" # most popular Python version + enable-cache: true + cache-dependency-glob: "uv.lock" + activate-environment: true + + - name: Install dependencies + run: | + uv venv + . .venv/bin/activate + uv pip install ./hamilton-core[core-tests] + + # NOTE `test_caching.py` is the older caching mechanism + - name: Test hamilton main package + run: | + uv run pytest tests/ --ignore tests/integrations --ignore tests/plugins --ignore tests/test_caching.py diff --git a/hamilton-core/.gitignore b/hamilton-core/.gitignore new file mode 100644 index 000000000..750d8503c --- /dev/null +++ b/hamilton-core/.gitignore @@ -0,0 +1 @@ +hamilton/_hamilton diff --git a/hamilton-core/README.md b/hamilton-core/README.md new file mode 100644 index 000000000..52accb309 --- /dev/null +++ b/hamilton-core/README.md @@ -0,0 +1,41 @@ +# Read carefully + +> Use at your own risk + +This directory contains code for the package `sf-hamilton-core`. 
It is a drop-in replacement of `sf-hamilton`, with two changes: +- disable plugin autoloading +- make `pandas` and `numpy` optional dependencies; and remove `networkx` dependency (currently unused). + +This makes the Hamilton package a much lighter install and addresses the long library loading time. + +## As a user +If you want to try `sf-hamilton-core`, you need to: +1. Remove your current Hamilton installation: `pip uninstall sf-hamilton` +2. Install Hamilton core `pip install sf-hamilton-core` +3. Check installation `pip list` should only include `sf-hamilton-core`. + +This will install a different Python package with the name `hamilton` with the smaller dependencies and plugin autoloading disabled. + +It should be a drop-in replacement and your existing Hamilton code should just work. Though, if you're relying on plugins (e.g., parquet materializers, dataframe result builders), you will need to manually load them. + + +## How does it work + + +## Why is another package `sf-hamilton-core` necessary +This exists to prevent backwards incompatible changes for people who `pip install sf-hamilton` and use it in production. It is a temporary solution until a major release `sf-hamilton==2.0.0` could allow breaking changes and a more robust solution. + +### Disable plugin autoloading +Hamilton has a generous number of plugins (`pandas`, `polars`, `mlflow`, `spark`). To give a good user experience, Hamilton autoloads plugins based on the available Python libraries in the current Python environment. For example, `to.mlflow()` becomes available if `mlflow` is installed. Autoloaded features notably include materializers like `from_.parquet` and `to.parquet` and data validators (pydantic, pandera, etc.). + +The issue with this approach is that a Python environment with a lot of dependencies, common in data science, can be very slow to start because of all the imports. Currently, Hamilton allows disabling autoloading via a user config or Python code. 
This requires manual setup and is not the best default for some users. + +### `pandas` and `numpy` dependencies +Hamilton was initially created for workflows that used `pandas` and `numpy` heavily. For this reason, `numpy` and `pandas` are imported at the top-level of module `hamilton.base`. Because of the package structure, as a Hamilton user, you're importing `pandas` and `numpy` every time you import `hamilton`. + +A reasonable change would be to move `numpy` and `pandas` to a "lazy" location. Then, dependencies would only be imported when features requiring them are used and they could be removed from `pyproject.toml`. Unfortunately, plugin autoloading defaults make this solution a significant breaking change and unsatisfactory. + +Since plugins are loaded based on the Python packages available, removing `pandas` and `numpy` would disable the loading of these plugins. This would break popular CSV and parquet materializers. + +### `networkx` dependency +The `sf-hamilton[visualization]` extra currently includes `networkx` as a dependency, though it is never actually used. There's a single function requiring it and it could be implemented in pure Python. This has been made even easier with the addition of `graphlib` in the standard library in Python 3.9. 
diff --git a/hamilton-core/hamilton/__init__.py b/hamilton-core/hamilton/__init__.py new file mode 100644 index 000000000..469c2b01d --- /dev/null +++ b/hamilton-core/hamilton/__init__.py @@ -0,0 +1,70 @@ +import importlib.util +import pathlib +import sys +from types import ModuleType +from typing import Any + + +def _load_hamilton_module() -> ModuleType: + """Patch this relative import in the Hamilton core repository + + ```python + # hamilton/__init__.py + try: + from .version import VERSION as __version__ # noqa: F401 + except ImportError: + from version import VERSION as __version__ # noqa: F401 + ``` + """ + + origin_path = pathlib.Path(__file__).parent / "_hamilton" / "__init__.py" + origin_spec = importlib.util.spec_from_file_location("hamilton", origin_path) + origin_module = importlib.util.module_from_spec(origin_spec) + + # The following lines are only required if we don't modify `hamilton/__init__.py` + # source_segment = "from version import VERSION as __version__" + # # the namespace `hamilton._hamilton` is only temporarily available; it will be removed + # # by the end of this initialization + # patched_segment = "from hamilton._hamilton.version import VERSION as __version__" + + # source_code = pathlib.Path(origin_path).read_text() + # patched_code = source_code.replace(source_segment, patched_segment) + + # exec(patched_code, origin_module.__dict__) + # sys.modules["hamilton"] = origin_module + + origin_spec.loader.exec_module(origin_module) + return origin_module + + +def _load_hamilton_registry_module(): + module_path = pathlib.Path(__file__).parent / "_hamilton" / "registry.py" + module_spec = importlib.util.spec_from_file_location("hamilton.registry", module_path) + module = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(module) + return module + + +def _create_proxy_module() -> ModuleType: + proxy_module = ModuleType(__name__) + sys.modules[__name__] = proxy_module + return proxy_module + + +_registry_module = 
_load_hamilton_registry_module() +# disable plugin autoloading +_registry_module.disable_autoload() + +_origin_module = _load_hamilton_module() +_proxy_module = _create_proxy_module() + + +def __getattr__(name: str) -> Any: + try: + return getattr(_origin_module, name) + except AttributeError: + raise + + +# `getattr()` must be available to build the package +_proxy_module.__getattr__ = __getattr__ diff --git a/hamilton-core/setup.py b/hamilton-core/setup.py new file mode 100644 index 000000000..264eaea83 --- /dev/null +++ b/hamilton-core/setup.py @@ -0,0 +1,95 @@ +import os +import pathlib +import re +import shutil +import sys + +import tomllib +from setuptools import setup + +os.chdir(os.path.abspath(os.path.dirname(__file__))) + + +def copy_hamilton_library(): + setup_dir = pathlib.Path(__file__).resolve().parent + source_dir = (setup_dir.parent / "hamilton").resolve() + dest_dir = (setup_dir / "hamilton" / "_hamilton").resolve() + + # Safety checks + if not source_dir.is_dir(): + print(f"Error: Source directory does not exist: {source_dir}") + sys.exit(1) + + if not str(dest_dir).startswith(str(setup_dir)): + print(f"Error: Destination directory {dest_dir} is outside the setup directory {setup_dir}") + sys.exit(1) + + # Remove destination if it exists to avoid errors or stale files + if dest_dir.exists(): + print("delete: ", dest_dir) + shutil.rmtree(dest_dir) + + # Copy entire directory tree from source to destination + print(f"copy from: {source_dir}; to {dest_dir}") + shutil.copytree(source_dir, dest_dir) + + +def get_version(): + version_path = pathlib.Path(__file__).parent / "hamilton" / "_hamilton" / "version.py" + content = version_path.read_text() + match = re.search(r"^VERSION\s*=\s*\(([^)]+)\)", content, re.MULTILINE) + if match: + version_tuple_str = match.group(1) # "1, 88, 0" + # Parse tuple string into list of integers + version_parts = [part.strip() for part in version_tuple_str.split(",")] + version_str = ".".join(version_parts) + return 
version_str + + +copy_hamilton_library() + +pyproject_path = pathlib.Path(__file__).parents[1] / "pyproject.toml" +pyproject = tomllib.loads(pyproject_path.read_text()) +project = pyproject["project"] + +readme_file = project.get("readme", None) +console_scripts = [ + f"{name}={target}" + for name, target in project.get("entry-points", {}).get("console_scripts", {}).items() +] +install_requires = list(set(project.get("dependencies", [])).difference(set(["pandas", "numpy"]))) +extras_require = { + **project.get("optional-dependencies", {}), + **{"visualization": ["graphviz"]}, # drop networkx + **{ + "core-tests": [ # dependencies required to run unit tests; used in CI + "pytest", + "pytest-asyncio", + "pandas", + "typer", + "networkx", + "graphviz", + ] + }, +} + + +setup( + name="sf-hamilton-core", + version=get_version(), + description=project.get("description", ""), + long_description=pathlib.Path(readme_file).read_text() if readme_file else "", + long_description_content_type="text/markdown" if readme_file else None, + python_requires=project.get("requires-python", None), + license=project.get("license", {}).get("text", None), + keywords=project.get("keywords", []), + author=", ".join(a["name"] for a in project.get("authors", [])), + author_email=", ".join(a["email"] for a in project.get("authors", [])), + classifiers=project.get("classifiers", []), + install_requires=install_requires, + extras_require=extras_require, + entry_points={"console_scripts": console_scripts}, + project_urls=project.get("urls", {}), + packages=["hamilton"], + package_data={"hamilton": ["*.json", "*.md", "*.txt"]}, +) diff --git a/hamilton/__init__.py b/hamilton/__init__.py index a302407d2..4d3fd39f9 100644 --- a/hamilton/__init__.py +++ b/hamilton/__init__.py @@ -1,7 +1,7 @@ try: from .version import VERSION as __version__ # noqa: F401 except ImportError: - from version import VERSION as __version__ # noqa: F401 + from hamilton.version import VERSION as __version__ # noqa: F401 # this 
supposedly is required for namespace packages to work. __path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/hamilton/base.py b/hamilton/base.py index 02241aaca..77fb46a9d 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -20,21 +20,22 @@ It cannot import hamilton.graph, or hamilton.driver. """ +from __future__ import annotations + import abc import collections import logging -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import numpy as np -import pandas as pd -from pandas.core.indexes import extension as pd_extension +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from hamilton import htypes from hamilton.lifecycle import api as lifecycle_api -try: - from . import htypes, node -except ImportError: - import node +if TYPE_CHECKING: + import numpy as np + import pandas as pd + + import hamilton.node as node + logger = logging.getLogger(__name__) @@ -120,6 +121,8 @@ def pandas_index_types( :param outputs: the dict we're trying to create a result from. :return: dict of all index types, dict of time series/categorical index types, dict if there is no index """ + import pandas as pd + all_index_types = collections.defaultdict(list) time_indexes = collections.defaultdict(list) no_indexes = collections.defaultdict(list) @@ -131,6 +134,8 @@ def index_key_name(pd_object: Union[pd.DataFrame, pd.Series]) -> str: def get_parent_time_index_type(): """Helper to pull the right time index parent class.""" + from pandas.core.indexes import extension as pd_extension + if hasattr(pd_extension, "NDArrayBackedExtensionIndex"): index_type = pd_extension.NDArrayBackedExtensionIndex else: @@ -220,6 +225,8 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: :param outputs: the outputs to build a dataframe from. 
""" + import pandas as pd + # TODO check inputs are pd.Series, arrays, or scalars -- else error output_index_type_tuple = PandasDataFrameResult.pandas_index_types(outputs) # this next line just log warnings @@ -255,6 +262,7 @@ def build_dataframe_with_dataframes(outputs: Dict[str, Any]) -> pd.DataFrame: :param outputs: The outputs to build the dataframe from. :return: A dataframe with the outputs. """ + import pandas as pd def get_output_name(output_name: str, column_name: str) -> str: """Add function prefix to columns. @@ -300,6 +308,8 @@ def input_types(self) -> List[Type[Type]]: return [Any] def output_type(self) -> Type: + import pandas as pd + return pd.DataFrame @@ -365,6 +375,8 @@ def build_result(**outputs: Dict[str, Any]) -> np.matrix: :param outputs: function_name -> np.array. :return: numpy matrix """ + import numpy as np + # TODO check inputs are all numpy arrays/array like things -- else error num_rows = -1 columns_with_lengths = collections.OrderedDict() @@ -402,6 +414,8 @@ def input_types(self) -> List[Type[Type]]: return [Any] # Typing def output_type(self) -> Type: + import pandas as pd + return pd.DataFrame