internal/nn/glu_test.go (2 changes: 1 addition & 1 deletion)
@@ -305,7 +305,7 @@ func TestSwiGLUFFN_DefaultFFNDim(t *testing.T) {
func TestNewLinearNoBias(t *testing.T) {
backend := autodiff.New(cpu.New())

linear := newLinearNoBias[Backend](128, 256, backend)
linear := NewLinear[Backend](128, 256, backend, WithBias(false))

// Check parameters
params := linear.Parameters()
internal/nn/linear.go (74 changes: 66 additions & 8 deletions)
@@ -6,17 +6,49 @@ import (
"github.com/born-ml/born/internal/tensor"
)

// LinearOption is a functional option for configuring a Linear layer.
type LinearOption func(*linearConfig)

// linearConfig holds configuration for Linear layer creation.
type linearConfig struct {
useBias bool
}

// defaultLinearConfig returns the default configuration.
func defaultLinearConfig() linearConfig {
return linearConfig{
useBias: true, // Default: use bias (backwards compatible)
}
}

// WithBias sets whether the Linear layer should use bias.
//
// Default is true. Set to false for architectures like LLaMA that don't use bias.
//
// Example:
//
// // Linear layer without bias (LLaMA-style)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
//
// // Linear layer with bias (default)
// layer := nn.NewLinear(784, 128, backend) // same as WithBias(true)
func WithBias(useBias bool) LinearOption {
return func(cfg *linearConfig) {
cfg.useBias = useBias
}
}

// Linear implements a fully connected (dense) layer.
//
// Performs the transformation: y = x @ W.T + b
// where:
// - x is the input tensor with shape [batch_size, in_features]
// - W is the weight matrix with shape [out_features, in_features]
// - b is the bias vector with shape [out_features]
// - b is the bias vector with shape [out_features] (optional, see WithBias)
// - y is the output tensor with shape [batch_size, out_features]
//
// Weights are initialized using Xavier/Glorot initialization.
// Biases are initialized to zeros.
// Biases are initialized to zeros (if enabled).
//
// Example:
//
@@ -25,6 +57,9 @@ import (
//
// input := tensor.Randn[float32](tensor.Shape{32, 784}, backend) // batch_size=32
// output := layer.Forward(input) // shape: [32, 128]
//
// // Without bias (for LLaMA-style models)
// lm_head := nn.NewLinear(512, vocab_size, backend, nn.WithBias(false))
type Linear[B tensor.Backend] struct {
inFeatures int
outFeatures int
@@ -36,24 +71,42 @@ type Linear[B tensor.Backend] struct {
// NewLinear creates a new Linear layer.
//
// Weights are initialized using Xavier/Glorot uniform distribution.
// Biases are initialized to zeros.
// Biases are initialized to zeros (if enabled).
//
// Parameters:
// - inFeatures: Number of input features
// - outFeatures: Number of output features
// - backend: Backend to use for tensor operations
// - opts: Optional configuration (see WithBias)
//
// Returns a new Linear layer.
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
//
// Example:
//
// // With bias (default)
// layer := nn.NewLinear(784, 128, backend)
//
// // Without bias (for LLaMA, attention projections, etc.)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B, opts ...LinearOption) *Linear[B] {
// Apply options
cfg := defaultLinearConfig()
for _, opt := range opts {
opt(&cfg)
}

// Weight: [out_features, in_features]
weightShape := tensor.Shape{outFeatures, inFeatures}
weightTensor := Xavier(inFeatures, outFeatures, weightShape, backend)
weight := NewParameter("weight", weightTensor)

// Bias: [out_features]
biasShape := tensor.Shape{outFeatures}
biasTensor := Zeros(biasShape, backend)
bias := NewParameter("bias", biasTensor)
// Bias: [out_features] (optional)
var bias *Parameter[B]
if cfg.useBias {
biasShape := tensor.Shape{outFeatures}
biasTensor := Zeros(biasShape, backend)
bias = NewParameter("bias", biasTensor)
}

return &Linear[B]{
inFeatures: inFeatures,
@@ -137,6 +190,11 @@ func (l *Linear[B]) OutFeatures() int {
return l.outFeatures
}

// HasBias returns true if this layer has a bias parameter.
func (l *Linear[B]) HasBias() bool {
return l.bias != nil
}

// StateDict returns a map of parameter names to raw tensors.
func (l *Linear[B]) StateDict() map[string]*tensor.RawTensor {
stateDict := make(map[string]*tensor.RawTensor)
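
For reference, a minimal usage sketch of the new option (not part of this PR; it assumes placement inside the internal nn package, so only identifiers shown in this diff are used): hidden layers keep the default bias, while the output head drops it, LLaMA-style.

// buildHead is an illustrative sketch, not part of this PR: a hidden layer
// with the default bias and an output head without one (LLaMA-style lm_head).
func buildHead[B tensor.Backend](inDim, hiddenDim, vocabSize int, backend B) (hidden, head *Linear[B]) {
	hidden = NewLinear[B](inDim, hiddenDim, backend)                    // 2 parameters: weight + bias
	head = NewLinear[B](hiddenDim, vocabSize, backend, WithBias(false)) // 1 parameter: weight only
	return hidden, head
}
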
internal/nn/nn_test.go (93 changes: 93 additions & 0 deletions)
@@ -163,6 +163,99 @@ func TestLinear_ForwardBatch(t *testing.T) {
}
}

// TestLinear_WithBias tests Linear layer WithBias option.
func TestLinear_WithBias(t *testing.T) {
backend := autodiff.New(cpu.New())

// Test 1: Default (with bias)
layerWithBias := nn.NewLinear(10, 5, backend)
if !layerWithBias.HasBias() {
t.Error("Default Linear should have bias")
}
if layerWithBias.Bias() == nil {
t.Error("Bias() should not be nil for default Linear")
}
if len(layerWithBias.Parameters()) != 2 {
t.Errorf("Default Linear should have 2 parameters, got %d", len(layerWithBias.Parameters()))
}

// Test 2: Explicit WithBias(true)
layerExplicitBias := nn.NewLinear(10, 5, backend, nn.WithBias(true))
if !layerExplicitBias.HasBias() {
t.Error("Linear with WithBias(true) should have bias")
}
if len(layerExplicitBias.Parameters()) != 2 {
t.Errorf("Linear with bias should have 2 parameters, got %d", len(layerExplicitBias.Parameters()))
}

// Test 3: WithBias(false)
layerNoBias := nn.NewLinear(10, 5, backend, nn.WithBias(false))
if layerNoBias.HasBias() {
t.Error("Linear with WithBias(false) should not have bias")
}
if layerNoBias.Bias() != nil {
t.Error("Bias() should be nil for Linear without bias")
}
if len(layerNoBias.Parameters()) != 1 {
t.Errorf("Linear without bias should have 1 parameter, got %d", len(layerNoBias.Parameters()))
}

// Verify weight is still properly initialized
weight := layerNoBias.Weight().Tensor()
expectedShape := tensor.Shape{5, 10}
if !weight.Shape().Equal(expectedShape) {
t.Errorf("Weight shape = %v, want %v", weight.Shape(), expectedShape)
}
}

// TestLinear_NoBias_Forward tests forward pass of Linear without bias.
func TestLinear_NoBias_Forward(t *testing.T) {
backend := autodiff.New(cpu.New())

// Create linear layer without bias
layer := nn.NewLinear(2, 2, backend, nn.WithBias(false))

// Set known weights: [[1, 2], [3, 4]] (out=2, in=2)
weightData := []float32{1, 2, 3, 4}
copy(layer.Weight().Tensor().Raw().AsFloat32(), weightData)

// Input: [[1, 1]] (batch=1, in=2)
input, _ := tensor.FromSlice([]float32{1, 1}, tensor.Shape{1, 2}, backend)

// Forward pass
output := layer.Forward(input)

// Expected:
// y = x @ W.T (no bias)
// W.T = [[1, 3], [2, 4]]
// x @ W.T = [1, 1] @ [[1, 3], [2, 4]] = [3, 7]
expected := []float32{3.0, 7.0}
actual := output.Raw().AsFloat32()

for i, exp := range expected {
if !floatEqual(actual[i], exp, 1e-5) {
t.Errorf("Output[%d] = %f, want %f", i, actual[i], exp)
}
}
}

// TestLinear_NoBias_StateDict tests StateDict for Linear without bias.
func TestLinear_NoBias_StateDict(t *testing.T) {
backend := autodiff.New(cpu.New())

layer := nn.NewLinear(4, 3, backend, nn.WithBias(false))

stateDict := layer.StateDict()

// Should have weight but no bias
if _, ok := stateDict["weight"]; !ok {
t.Error("StateDict should contain 'weight'")
}
if _, ok := stateDict["bias"]; ok {
t.Error("StateDict should not contain 'bias' for layer without bias")
}
}

// TestReLU_Forward tests ReLU activation.
func TestReLU_Forward(t *testing.T) {
backend := autodiff.New(cpu.New())
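
The forward-pass test above compares outputs with a floatEqual helper that already exists elsewhere in the test package and is not shown in this diff. A minimal sketch of such an absolute-tolerance comparison (an assumption; the real helper may differ) looks like:

// floatEqual reports whether a and b agree within eps (absolute tolerance).
// Sketch only: the helper actually used by nn_test.go may be implemented differently.
func floatEqual(a, b, eps float32) bool {
	d := a - b
	if d < 0 {
		d = -d
	}
	return d <= eps
}
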
internal/nn/swiglu_ffn.go (33 changes: 5 additions & 28 deletions)
@@ -91,19 +91,12 @@ func NewSwiGLUFFN[B tensor.Backend](cfg SwiGLUFFNConfig, backend B) *SwiGLUFFN[B
panic(fmt.Sprintf("SwiGLUFFN: unknown GLUVariant %q, expected swiglu/geglu/reglu/glu", cfg.GLUVariant))
}

// Create projections
// Create projections using the WithBias option
// Note: LLaMA doesn't use bias in FFN layers
var gateProj, upProj, downProj *Linear[B]

if cfg.UseBias {
gateProj = NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend)
upProj = NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend)
downProj = NewLinear[B](cfg.FFNDim, cfg.EmbedDim, backend)
} else {
gateProj = newLinearNoBias[B](cfg.EmbedDim, cfg.FFNDim, backend)
upProj = newLinearNoBias[B](cfg.EmbedDim, cfg.FFNDim, backend)
downProj = newLinearNoBias[B](cfg.FFNDim, cfg.EmbedDim, backend)
}
biasOpt := WithBias(cfg.UseBias)
gateProj := NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend, biasOpt)
upProj := NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend, biasOpt)
downProj := NewLinear[B](cfg.FFNDim, cfg.EmbedDim, backend, biasOpt)

return &SwiGLUFFN[B]{
gateProj: gateProj,
@@ -191,19 +184,3 @@ func (f *SwiGLUFFN[B]) UpProj() *Linear[B] {
func (f *SwiGLUFFN[B]) DownProj() *Linear[B] {
return f.downProj
}

// newLinearNoBias creates a Linear layer without bias.
func newLinearNoBias[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
// Initialize weight with Xavier/Glorot
weightShape := tensor.Shape{outFeatures, inFeatures}
weightTensor := Xavier(inFeatures, outFeatures, weightShape, backend)
weight := NewParameter("weight", weightTensor)

return &Linear[B]{
weight: weight,
bias: nil, // No bias
inFeatures: inFeatures,
outFeatures: outFeatures,
backend: backend,
}
}
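
The same option threads cleanly through any module that conditionally needs bias. A hypothetical sketch (not part of this PR) for attention-style projections that share one bias setting:

// newQKVProjections is a hypothetical example, not part of this PR, showing the
// WithBias option reused for query/key/value projections with a shared bias setting.
func newQKVProjections[B tensor.Backend](embedDim, headDim int, useBias bool, backend B) (q, k, v *Linear[B]) {
	biasOpt := WithBias(useBias)
	q = NewLinear[B](embedDim, headDim, backend, biasOpt)
	k = NewLinear[B](embedDim, headDim, backend, biasOpt)
	v = NewLinear[B](embedDim, headDim, backend, biasOpt)
	return q, k, v
}
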
nn/nn.go (25 changes: 23 additions & 2 deletions)
@@ -25,14 +25,35 @@
// Linear represents a fully connected (dense) layer.
type Linear[B tensor.Backend] = nn.Linear[B]

// LinearOption is a functional option for configuring a Linear layer.
type LinearOption = nn.LinearOption

// WithBias sets whether the Linear layer should use bias.
//
// Default is true. Set to false for architectures like LLaMA that don't use bias.
//
// Example:
//
// // Linear layer without bias (LLaMA-style)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
//
// // Linear layer with bias (default)
// layer := nn.NewLinear(784, 128, backend) // same as WithBias(true)
func WithBias(useBias bool) LinearOption {
return nn.WithBias(useBias)
}

Codecov / codecov/patch: added lines nn/nn.go#L42-L43 were not covered by tests.

// NewLinear creates a new linear layer with Xavier initialization.
//
// Example:
//
// backend := cpu.New()
// layer := nn.NewLinear(784, 128, backend)
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
return nn.NewLinear(inFeatures, outFeatures, backend)
//
// // Without bias (for LLaMA, attention projections, etc.)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B, opts ...LinearOption) *Linear[B] {
return nn.NewLinear(inFeatures, outFeatures, backend, opts...)
}

Codecov / codecov/patch: added lines nn/nn.go#L55-L56 were not covered by tests.

// Conv2D represents a 2D convolutional layer.
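
As a closing note on what WithBias(false) actually changes: a Linear layer holds out*in weight scalars, plus out bias scalars only when bias is enabled. An illustrative helper (not part of this PR):

// linearParamCount illustrates the parameter count of a Linear layer:
// out*in weights, plus out bias terms only when bias is enabled.
// For the 784 -> 128 layer used in the docs: 100352 without bias, 100480 with.
func linearParamCount(in, out int, useBias bool) int {
	n := out * in
	if useBias {
		n += out
	}
	return n
}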