internal/nn/glu_test.go (2 changes: 1 addition & 1 deletion)
@@ -305,7 +305,7 @@ func TestSwiGLUFFN_DefaultFFNDim(t *testing.T) {
func TestNewLinearNoBias(t *testing.T) {
backend := autodiff.New(cpu.New())

linear := newLinearNoBias[Backend](128, 256, backend)
linear := NewLinear[Backend](128, 256, backend, WithBias(false))

// Check parameters
params := linear.Parameters()
internal/nn/linear.go (74 changes: 66 additions & 8 deletions)
@@ -6,17 +6,49 @@ import (
"github.com/born-ml/born/internal/tensor"
)

// LinearOption is a functional option for configuring a Linear layer.
type LinearOption func(*linearConfig)

// linearConfig holds configuration for Linear layer creation.
type linearConfig struct {
useBias bool
}

// defaultLinearConfig returns the default configuration.
func defaultLinearConfig() linearConfig {
return linearConfig{
useBias: true, // Default: use bias (backwards compatible)
}
}

// WithBias sets whether the Linear layer should use bias.
//
// Default is true. Set to false for architectures like LLaMA that don't use bias.
//
// Example:
//
// // Linear layer without bias (LLaMA-style)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
//
// // Linear layer with bias (default)
// layer := nn.NewLinear(784, 128, backend) // same as WithBias(true)
func WithBias(useBias bool) LinearOption {
return func(cfg *linearConfig) {
cfg.useBias = useBias
}
}

// Linear implements a fully connected (dense) layer.
//
// Performs the transformation: y = x @ W.T + b
// where:
// - x is the input tensor with shape [batch_size, in_features]
// - W is the weight matrix with shape [out_features, in_features]
// - b is the bias vector with shape [out_features]
// - b is the bias vector with shape [out_features] (optional, see WithBias)
// - y is the output tensor with shape [batch_size, out_features]
//
// Weights are initialized using Xavier/Glorot initialization.
// Biases are initialized to zeros.
// Biases are initialized to zeros (if enabled).
//
// Example:
//
@@ -25,6 +57,9 @@ import (
//
// input := tensor.Randn[float32](tensor.Shape{32, 784}, backend) // batch_size=32
// output := layer.Forward(input) // shape: [32, 128]
//
// // Without bias (for LLaMA-style models)
// lm_head := nn.NewLinear(512, vocab_size, backend, nn.WithBias(false))
type Linear[B tensor.Backend] struct {
inFeatures int
outFeatures int
@@ -36,24 +71,42 @@ type Linear[B tensor.Backend] struct {
// NewLinear creates a new Linear layer.
//
// Weights are initialized using Xavier/Glorot uniform distribution.
// Biases are initialized to zeros.
// Biases are initialized to zeros (if enabled).
//
// Parameters:
// - inFeatures: Number of input features
// - outFeatures: Number of output features
// - backend: Backend to use for tensor operations
// - opts: Optional configuration (see WithBias)
//
// Returns a new Linear layer.
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
//
// Example:
//
// // With bias (default)
// layer := nn.NewLinear(784, 128, backend)
//
// // Without bias (for LLaMA, attention projections, etc.)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B, opts ...LinearOption) *Linear[B] {
// Apply options
cfg := defaultLinearConfig()
for _, opt := range opts {
opt(&cfg)
}

// Weight: [out_features, in_features]
weightShape := tensor.Shape{outFeatures, inFeatures}
weightTensor := Xavier(inFeatures, outFeatures, weightShape, backend)
weight := NewParameter("weight", weightTensor)

// Bias: [out_features]
biasShape := tensor.Shape{outFeatures}
biasTensor := Zeros(biasShape, backend)
bias := NewParameter("bias", biasTensor)
// Bias: [out_features] (optional)
var bias *Parameter[B]
if cfg.useBias {
biasShape := tensor.Shape{outFeatures}
biasTensor := Zeros(biasShape, backend)
bias = NewParameter("bias", biasTensor)
}

return &Linear[B]{
inFeatures: inFeatures,
@@ -137,6 +190,11 @@ func (l *Linear[B]) OutFeatures() int {
return l.outFeatures
}

// HasBias returns true if this layer has a bias parameter.
func (l *Linear[B]) HasBias() bool {
return l.bias != nil
}

// StateDict returns a map of parameter names to raw tensors.
func (l *Linear[B]) StateDict() map[string]*tensor.RawTensor {
stateDict := make(map[string]*tensor.RawTensor)
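
For reference, a minimal usage sketch of the new option (not part of this PR; it assumes placement inside the internal nn package, so only identifiers shown in this diff are used): hidden layers keep the default bias, while the output head drops it, LLaMA-style.

// buildHead is an illustrative sketch, not part of this PR: a hidden layer
// with the default bias and an output head without one (LLaMA-style lm_head).
func buildHead[B tensor.Backend](inDim, hiddenDim, vocabSize int, backend B) (hidden, head *Linear[B]) {
	hidden = NewLinear[B](inDim, hiddenDim, backend)                    // 2 parameters: weight + bias
	head = NewLinear[B](hiddenDim, vocabSize, backend, WithBias(false)) // 1 parameter: weight only
	return hidden, head
}
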
internal/nn/nn_test.go (93 changes: 93 additions & 0 deletions)
@@ -163,6 +163,99 @@ func TestLinear_ForwardBatch(t *testing.T) {
}
}

// TestLinear_WithBias tests Linear layer WithBias option.
func TestLinear_WithBias(t *testing.T) {
backend := autodiff.New(cpu.New())

// Test 1: Default (with bias)
layerWithBias := nn.NewLinear(10, 5, backend)
if !layerWithBias.HasBias() {
t.Error("Default Linear should have bias")
}
if layerWithBias.Bias() == nil {
t.Error("Bias() should not be nil for default Linear")
}
if len(layerWithBias.Parameters()) != 2 {
t.Errorf("Default Linear should have 2 parameters, got %d", len(layerWithBias.Parameters()))
}

// Test 2: Explicit WithBias(true)
layerExplicitBias := nn.NewLinear(10, 5, backend, nn.WithBias(true))
if !layerExplicitBias.HasBias() {
t.Error("Linear with WithBias(true) should have bias")
}
if len(layerExplicitBias.Parameters()) != 2 {
t.Errorf("Linear with bias should have 2 parameters, got %d", len(layerExplicitBias.Parameters()))
}

// Test 3: WithBias(false)
layerNoBias := nn.NewLinear(10, 5, backend, nn.WithBias(false))
if layerNoBias.HasBias() {
t.Error("Linear with WithBias(false) should not have bias")
}
if layerNoBias.Bias() != nil {
t.Error("Bias() should be nil for Linear without bias")
}
if len(layerNoBias.Parameters()) != 1 {
t.Errorf("Linear without bias should have 1 parameter, got %d", len(layerNoBias.Parameters()))
}

// Verify weight is still properly initialized
weight := layerNoBias.Weight().Tensor()
expectedShape := tensor.Shape{5, 10}
if !weight.Shape().Equal(expectedShape) {
t.Errorf("Weight shape = %v, want %v", weight.Shape(), expectedShape)
}
}

// TestLinear_NoBias_Forward tests forward pass of Linear without bias.
func TestLinear_NoBias_Forward(t *testing.T) {
backend := autodiff.New(cpu.New())

// Create linear layer without bias
layer := nn.NewLinear(2, 2, backend, nn.WithBias(false))

// Set known weights: [[1, 2], [3, 4]] (out=2, in=2)
weightData := []float32{1, 2, 3, 4}
copy(layer.Weight().Tensor().Raw().AsFloat32(), weightData)

// Input: [[1, 1]] (batch=1, in=2)
input, _ := tensor.FromSlice([]float32{1, 1}, tensor.Shape{1, 2}, backend)

// Forward pass
output := layer.Forward(input)

// Expected:
// y = x @ W.T (no bias)
// W.T = [[1, 3], [2, 4]]
// x @ W.T = [1, 1] @ [[1, 3], [2, 4]] = [3, 7]
expected := []float32{3.0, 7.0}
actual := output.Raw().AsFloat32()

for i, exp := range expected {
if !floatEqual(actual[i], exp, 1e-5) {
t.Errorf("Output[%d] = %f, want %f", i, actual[i], exp)
}
}
}

// TestLinear_NoBias_StateDict tests StateDict for Linear without bias.
func TestLinear_NoBias_StateDict(t *testing.T) {
backend := autodiff.New(cpu.New())

layer := nn.NewLinear(4, 3, backend, nn.WithBias(false))

stateDict := layer.StateDict()

// Should have weight but no bias
if _, ok := stateDict["weight"]; !ok {
t.Error("StateDict should contain 'weight'")
}
if _, ok := stateDict["bias"]; ok {
t.Error("StateDict should not contain 'bias' for layer without bias")
}
}

// TestReLU_Forward tests ReLU activation.
func TestReLU_Forward(t *testing.T) {
backend := autodiff.New(cpu.New())
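
The forward-pass test above compares outputs with a floatEqual helper that already exists elsewhere in the test package and is not shown in this diff. A minimal sketch of such an absolute-tolerance comparison (an assumption; the real helper may differ) looks like:

// floatEqual reports whether a and b agree within eps (absolute tolerance).
// Sketch only: the helper actually used by nn_test.go may be implemented differently.
func floatEqual(a, b, eps float32) bool {
	d := a - b
	if d < 0 {
		d = -d
	}
	return d <= eps
}
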
internal/nn/swiglu_ffn.go (33 changes: 5 additions & 28 deletions)
@@ -91,19 +91,12 @@ func NewSwiGLUFFN[B tensor.Backend](cfg SwiGLUFFNConfig, backend B) *SwiGLUFFN[B
panic(fmt.Sprintf("SwiGLUFFN: unknown GLUVariant %q, expected swiglu/geglu/reglu/glu", cfg.GLUVariant))
}

// Create projections
// Create projections using the WithBias option
// Note: LLaMA doesn't use bias in FFN layers
var gateProj, upProj, downProj *Linear[B]

if cfg.UseBias {
gateProj = NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend)
upProj = NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend)
downProj = NewLinear[B](cfg.FFNDim, cfg.EmbedDim, backend)
} else {
gateProj = newLinearNoBias[B](cfg.EmbedDim, cfg.FFNDim, backend)
upProj = newLinearNoBias[B](cfg.EmbedDim, cfg.FFNDim, backend)
downProj = newLinearNoBias[B](cfg.FFNDim, cfg.EmbedDim, backend)
}
biasOpt := WithBias(cfg.UseBias)
gateProj := NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend, biasOpt)
upProj := NewLinear[B](cfg.EmbedDim, cfg.FFNDim, backend, biasOpt)
downProj := NewLinear[B](cfg.FFNDim, cfg.EmbedDim, backend, biasOpt)

return &SwiGLUFFN[B]{
gateProj: gateProj,
@@ -191,19 +184,3 @@ func (f *SwiGLUFFN[B]) UpProj() *Linear[B] {
func (f *SwiGLUFFN[B]) DownProj() *Linear[B] {
return f.downProj
}

// newLinearNoBias creates a Linear layer without bias.
func newLinearNoBias[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
// Initialize weight with Xavier/Glorot
weightShape := tensor.Shape{outFeatures, inFeatures}
weightTensor := Xavier(inFeatures, outFeatures, weightShape, backend)
weight := NewParameter("weight", weightTensor)

return &Linear[B]{
weight: weight,
bias: nil, // No bias
inFeatures: inFeatures,
outFeatures: outFeatures,
backend: backend,
}
}
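
The same option threads cleanly through any module that conditionally needs bias. A hypothetical sketch (not part of this PR) for attention-style projections that share one bias setting:

// newQKVProjections is a hypothetical example, not part of this PR, showing the
// WithBias option reused for query/key/value projections with a shared bias setting.
func newQKVProjections[B tensor.Backend](embedDim, headDim int, useBias bool, backend B) (q, k, v *Linear[B]) {
	biasOpt := WithBias(useBias)
	q = NewLinear[B](embedDim, headDim, backend, biasOpt)
	k = NewLinear[B](embedDim, headDim, backend, biasOpt)
	v = NewLinear[B](embedDim, headDim, backend, biasOpt)
	return q, k, v
}
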
nn/nn.go (25 changes: 23 additions & 2 deletions)
@@ -25,14 +25,35 @@
// Linear represents a fully connected (dense) layer.
type Linear[B tensor.Backend] = nn.Linear[B]

// LinearOption is a functional option for configuring a Linear layer.
type LinearOption = nn.LinearOption

// WithBias sets whether the Linear layer should use bias.
//
// Default is true. Set to false for architectures like LLaMA that don't use bias.
//
// Example:
//
// // Linear layer without bias (LLaMA-style)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
//
// // Linear layer with bias (default)
// layer := nn.NewLinear(784, 128, backend) // same as WithBias(true)
func WithBias(useBias bool) LinearOption {
return nn.WithBias(useBias)
}

Codecov / codecov/patch: added lines nn/nn.go#L42-L43 were not covered by tests.

// NewLinear creates a new linear layer with Xavier initialization.
//
// Example:
//
// backend := cpu.New()
// layer := nn.NewLinear(784, 128, backend)
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B) *Linear[B] {
return nn.NewLinear(inFeatures, outFeatures, backend)
//
// // Without bias (for LLaMA, attention projections, etc.)
// lm_head := nn.NewLinear(hidden_size, vocab_size, backend, nn.WithBias(false))
func NewLinear[B tensor.Backend](inFeatures, outFeatures int, backend B, opts ...LinearOption) *Linear[B] {
return nn.NewLinear(inFeatures, outFeatures, backend, opts...)
}

Codecov / codecov/patch: added lines nn/nn.go#L55-L56 were not covered by tests.

// Conv2D represents a 2D convolutional layer.
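
As a closing note on what WithBias(false) actually changes: a Linear layer holds out*in weight scalars, plus out bias scalars only when bias is enabled. An illustrative helper (not part of this PR):

// linearParamCount illustrates the parameter count of a Linear layer:
// out*in weights, plus out bias terms only when bias is enabled.
// For the 784 -> 128 layer used in the docs: 100352 without bias, 100480 with.
func linearParamCount(in, out int, useBias bool) int {
	n := out * in
	if useBias {
		n += out
	}
	return n
}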