A Python library for declarative data mapping and transformation that emphasizes reusability and composability. Define your target schemas once and bind them to different data sources with intuitive operator overloading.
- 🎯 Reusable Schema Definitions - Define once, use across multiple data sources
- 🔧 Intuitive Operators - `>>`, `|`, `&`, `@`, `+` for elegant data transformations
- 🏗️ Type Agnostic - Works with dicts, dataclasses, Pydantic models, any attributable objects
- 🧩 Composable Architecture - Mix and match field types and transformations
- 🛡️ Comprehensive Validation - Built-in error handling and validation
- 📊 Batch Processing - Transform lists of data efficiently
- 🎨 Clean API - Readable, maintainable transformation code
pip install schematix

from schematix import Schema, Field
# Define your target schema
class UserSchema(Schema):
id = Field(source='user_id', target='id')
email = Field(source='email_address', target='email', required=True)
name = Field(source='first_name') + Field(source='last_name')
# Transform data
data = {
'user_id': 123,
'email_address': 'john@example.com',
'first_name': 'John',
'last_name': 'Doe'
}
user = UserSchema().transform(data)
# Result: {'id': 123, 'email': 'john@example.com', 'name': 'John Doe'}

Schematix provides intuitive operators for common transformation patterns:
source_field >> target_field # Extract from source, assign to target
Field(source='email') | Field(source='contact_email') # Try email, fallback to contact_email
user_fields = Field(source='name') & Field(source='email') & Field(source='age')
Field(source='name') @ 'user.profile' # Extract name from data.user.profile.name
Field(source='first') + Field(source='last') # "John" + "Doe" = "John Doe"
Field(source='price') + Field(source='tax') # 100 + 15 = 115

class UserSchema(Schema):
id = Field()
email = Field(required=True)
name = Field()
# Bind to different data sources
reddit_users = UserSchema().bind({
'id': 'user_id',
'email': 'email_addr',
'name': ('username', str.title) # Extract username and titlecase it
})
api_users = UserSchema().bind({
'id': 'uid',
'email': 'contact.email',
'name': lambda data: f"{data['first']} {data['last']}"
})
# Transform from different sources
reddit_user = reddit_users.transform(reddit_data)
api_user = api_users.transform(api_data)

from schematix import SourceField, TargetField
# SourceField with fallbacks and conditions
email = SourceField(
source='primary_email',
fallbacks=['secondary_email', 'contact.email'],
condition=lambda data: data.get('active', True)
)
# TargetField with formatting and multiple targets
name = TargetField(
target='display_name',
formatter=str.title,
additionaltargets=['full_name', 'user_name']
)

from dataclasses import dataclass
@dataclass
class User:
id: int
email: str
name: str
# Convert directly to dataclass
user_obj = UserSchema().transform(data, typetarget=User)
print(type(user_obj)) # <class '__main__.User'>

# Merge schemas
BaseUserSchema = Schema.merge(ContactSchema, ProfileSchema)
# Copy with modifications
ExtendedUserSchema = BaseUserSchema.copy(
created_at=Field(source='registration_date'),
is_premium=Field(source='account_type', transform=lambda x: x == 'premium')
)
# Create subsets
PublicUserSchema = ExtendedUserSchema.subset('id', 'name', 'email')

# GitHub API to internal user format
class GitHubUserSchema(Schema):
id = Field(source='id')
username = Field(source='login')
name = Field(source='name') | Field(source='login') # Fallback to login
email = Field(source='email')
repos = Field(source='public_repos', default=0)
profile = Field(source='html_url')
github_user = GitHubUserSchema().transform(github_api_response)

# Normalize product data from different e-commerce sites
class ProductSchema(Schema):
name = Field()
price = Field(transform=lambda x: float(x.replace('$', '')))
rating = Field(default=0.0)
# Site-specific bindings
amazon_products = ProductSchema().bind({
'name': 'title',
'price': 'price.amount',
'rating': 'averageRating'
})
ebay_products = ProductSchema().bind({
'name': 'itemTitle',
'price': 'currentPrice.value',
'rating': ('feedbackScore', lambda x: x / 100) # Convert to 0-5 scale
})

# Database to data warehouse transformation
class AnalyticsUserSchema(Schema):
user_id = Field(source='id', required=True)
signup_date = Field(source='created_at', transform=parse_date)
lifetime_value = Field(source='orders', transform=calculate_ltv)
segment = (
Field(source='total_spent', transform=lambda x: 'premium' if x > 1000 else 'standard') |
Field(default='unknown')
)
# Batch processing
users = AnalyticsUserSchema().transformplural(user_records)

# Comprehensive validation
errors = UserSchema().validate(data)
if errors:
print(f"Validation errors: {errors}")
# Field-level error handling with fallbacks
safe_extraction = (
Field(source='primary_source', required=True) |
Field(source='backup_source') |
Field(default='fallback_value')
)

import schematix as sx
# Define fields using decorators
@sx.field
class UserID:
source = 'user_id'
required = True
@sx.field.accumulated
class FullName:
fields = [
sx.Field(source='first_name'),
sx.Field(source='last_name')
]
# Define schema using decorator
@sx.schema
class UserSchema:
id = UserID
email = sx.Field(source='email_address', required=True)
name = FullName
# Same transformation capability
user = UserSchema().transform(data)

Schematix now includes a powerful transform system for data processing pipelines:
from schematix.transforms import text, numbers, common
# Pipeline composition with >> operator
name_cleaner = text.strip >> text.title >> text.normalizewhitespace
# Fallback logic with | operator
safe_number = numbers.to.int | numbers.constant(0)
# Parallel processing with & operator
multi_format = numbers.format.currency & numbers.format.percent
# Real-world cleaning pipeline
email_processor = common.clean.email >> common.validate.email

- Text: String manipulation, regex, encoding, formatting (35+ transforms)
- Numbers: Math operations, formatting, validation (30+ transforms)
- Dates: Parsing, formatting, timezone handling (40+ transforms)
- Collections: List/dict operations, filtering, aggregation (25+ transforms)
- Validation: Format checking, cleaning, requirements (20+ transforms)
- Common: Pre-built patterns for real-world use cases (25+ transforms)
# Context-aware transforms
full_name = transforms.multifield(['first_name', 'last_name'],
lambda f, l: f"{f} {l}")
# Conditional transforms
format_price = transforms.conditional(
lambda x: x > 100,
numbers.format.currency(),
numbers.format.commas()
)
# Safe operations with fallbacks
safe_clean = common.clean.safe.email(default="unknown@example.com")

# Use transforms in field definitions
class UserSchema(Schema):
name = Field(source='full_name', transform=text.strip >> text.title)
email = Field(source='email_addr', transform=common.clean.email)
price = Field(source='amount', transform=numbers.to.float >> numbers.format.currency)
# Or use the short aliases
import schematix as sx
class ProductSchema(sx.Schema):
title = sx.Field(source='name', transform=sx.x.txt.title)
cost = sx.Field(source='price', transform=sx.x.num.format.currency())

Schematix is actively developed and production-ready:
- ✅ 173 passing tests with comprehensive coverage
- ✅ Type hints throughout for excellent IDE support
- ✅ Detailed documentation and examples
- ✅ Semantic versioning and changelog
- ✅ MIT License - use freely in commercial projects
Contributions are welcome! Please see CONTRIBUTING.md for guidelines.
MIT License - see LICENSE for details.