Fix some issues with Zeros option 2 #1379
Merged 2 commits, Nov 9, 2020
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,3 +1,6 @@
# v0.11.3
* Added the option to set `bias` to [false](https://github.com/FluxML/Flux.jl/pull/1379) to exclude `bias` from being trained.

# v0.11.2

* Adds the [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser.
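As a quick illustration of the new option (a minimal sketch, assuming the post-PR behaviour where `Flux.params` skips the `Zeros` stand-in):

```julia
using Flux

# With `bias = false`, the layer stores `Flux.Zeros()` instead of a trainable
# vector, so only the weight matrix is registered as a parameter.
layer = Dense(5, 2; bias = false)
length(Flux.params(layer))  # 1; the default `bias = true` would give 2
```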
1 change: 0 additions & 1 deletion docs/src/models/layers.md
@@ -24,7 +24,6 @@ ConvTranspose
CrossCor
SamePad
flatten
Flux.Zeros
Flux.convfilter
Flux.depthwiseconvfilter
```
13 changes: 9 additions & 4 deletions src/layers/basic.jl
@@ -83,7 +83,7 @@ extraChain(::Tuple{}, x) = ()


"""
Dense(in::Integer, out::Integer, σ = identity)
Dense(in::Integer, out::Integer, σ = identity; bias=true)

Create a traditional `Dense` layer with parameters `W` and `b`.

@@ -92,6 +92,8 @@ Create a traditional `Dense` layer with parameters `W` and `b`.
The input `x` must be a vector of length `in`, or a batch of vectors represented
as an `in × N` matrix. The output `y` will be a vector or batch of length `out`.

Setting `bias` to `false` will switch bias off for the layer.

# Example
```
julia> d = Dense(5, 2)
@@ -101,9 +103,12 @@ julia> d(rand(5))
2-element Array{Float32,1}:
-0.16210233
0.123119034

julia> d = Dense(5, 2; bias=false)
Dense(5, 2)
```
"""
struct Dense{F,S<:AbstractArray,T<:AbstractArray}
struct Dense{F,S<:AbstractArray,T<:Union{Zeros, AbstractVector}}
W::S
b::T
σ::F
@@ -112,8 +117,8 @@ end
Dense(W, b) = Dense(W, b, identity)

function Dense(in::Integer, out::Integer, σ = identity;
initW = glorot_uniform, initb = zeros)
return Dense(initW(out, in), initb(out), σ)
initW = glorot_uniform, initb = zeros, bias=true)
return Dense(initW(out, in), create_bias(bias, initb, out), σ)
end

@functor Dense
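A minimal sketch of what the new `Dense` constructor does with the `bias` keyword (field name `b` taken from the struct definition above):

```julia
using Flux

d_default = Dense(5, 2)               # bias = true  → initb(out), a trainable vector
d_nobias  = Dense(5, 2; bias = false) # bias = false → Flux.Zeros()

d_nobias.b isa Flux.Zeros   # true: bias is switched off
d_nobias(rand(Float32, 5))  # forward pass still works; `W*x .+ Zeros()` is just `W*x`
```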
52 changes: 28 additions & 24 deletions src/layers/conv.jl
@@ -46,7 +46,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -82,7 +82,7 @@ end

Constructs the convolutional layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -102,15 +102,16 @@ Conv(weight = weight,
σ = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function Conv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride)
return Conv(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return Conv(σ, w, bias, stride, pad, dilation)
end

function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function Conv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
@@ -131,7 +132,7 @@ convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, ch, init = init), bias = true) where N

Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
@@ -189,7 +190,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -215,7 +216,7 @@ end
Constructs the convolutional transpose layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -226,22 +227,23 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return ConvTranspose(σ, w, bias, stride, pad, dilation)
end

function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end

function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, reverse(ch), init = init), bias = true) where N

ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
@@ -307,7 +309,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -333,7 +335,7 @@ end
Constructs the `DepthwiseConv` layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -344,15 +346,16 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(DepthwiseConv, pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, prod(size(w)[N-1:end]))
return DepthwiseConv(σ, w, bias, stride, pad, dilation)
end

function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
@@ -373,7 +376,7 @@ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = depthwiseconvfilter(k, ch, init = init), bias = true) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"

return DepthwiseConv(
@@ -424,7 +427,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -461,7 +464,7 @@ end
Constructs the standard cross-correlation layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -472,22 +475,23 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function CrossCor(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return CrossCor(σ, w, bias, stride, pad, dilation)
end

function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end

function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, ch, init = init), bias = true) where N

CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
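All four convolutional layers now share this `bias` handling; a hedged sketch of the user-facing behaviour (parameter counts assume `params` skips `Zeros`):

```julia
using Flux

conv   = Conv((3, 3), 1 => 8)                        # default bias = true
conv0  = Conv((3, 3), 1 => 8; bias = false)          # bias switched off
tconv0 = ConvTranspose((3, 3), 1 => 8; bias = false)
xcorr0 = CrossCor((3, 3), 1 => 8; bias = false)

length(Flux.params(conv))   # 2 — weight and bias
length(Flux.params(conv0))  # 1 — weight only
```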
14 changes: 14 additions & 0 deletions src/utils.jl
@@ -176,6 +176,20 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
ones(dims...) = Base.ones(Float32, dims...)
zeros(dims...) = Base.zeros(Float32, dims...)

"""
create_bias(shallcreate::Bool, iftrue, dims...)
create_bias(x, ::Any...)

Return a bias parameter for a layer.

Handles the allowed input options for the `bias` keyword:
If `false`: return `Zeros()`, which switches bias off.
If `true`: return the result of `iftrue(dims...)`.
If not a boolean, return the input unchanged, handling the case `bias = somearray`.
"""
create_bias(shallcreate::Bool, iftrue, dims...) = shallcreate ? iftrue(dims...) : Zeros()
create_bias(x, ::Any...) = x

"""
unsqueeze(xs, dim)

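The dispatch of `create_bias` can be exercised directly; a minimal sketch (using Flux's own `zeros`, which defaults to `Float32`):

```julia
using Flux

Flux.create_bias(true, Flux.zeros, 4)   # a 4-element Float32 vector of zeros
Flux.create_bias(false, Flux.zeros, 4)  # Flux.Zeros(): bias switched off

b = randn(Float32, 4)
Flux.create_bias(b, Flux.zeros, 4) === b  # true: arrays pass through untouched
```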
109 changes: 26 additions & 83 deletions src/zeros.jl
@@ -1,10 +1,7 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
import Base: +, -, *, /, reshape, broadcasted

"""
Zeros()
Zeros(size...)
Zeros(Type, size...)

Acts as a stand-in for an array of zeros, which is ignored by the optimisers during training.
@@ -13,94 +10,40 @@ Useful to turn bias off for a forward pass of a layer.

## Examples

```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false

julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
```julia-repl
julia> bias_less_conv = Conv((2,2), 1=>3; bias = false)
Conv((2, 2), 1=>3)

julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> params(bias_less_conv) |> length
1

julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
julia> bias_less_conv.bias
Flux.Zeros()
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end

Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)

Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))

Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()

Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))

Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))

@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing

# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end

+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a

Base.copy(xs::Zeros{T,N}) where {T,N} = xs

# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end

broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
struct Zeros end
# To allow for things like Dense(10, 2, initb = Zeros)
Zeros(args...) = Zeros()

function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
Base.reshape(x::Zeros, dims...) = x

broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
+(::Zeros, b::AbstractArray) = b
+(a::AbstractArray, ::Zeros) = a
+(a::Zeros, ::Zeros) = a

for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
-(::Zeros, b::AbstractArray) = -b
-(a::AbstractArray, ::Zeros) = a
-(a::Zeros, ::Zeros) = a

# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(+), a::AbstractArray, b::Zeros) = a
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros) = a
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = -b
# Need adjoints for these, or else the gradient w.r.t. the non-Zeros argument will be nothing as well
@adjoint broadcasted(::typeof(*), a::AbstractArray, b::Zeros) = zero(a), _ -> (nothing, zero(a), nothing)
@adjoint broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b))
@adjoint broadcasted(::typeof(/), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b))
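The slimmed-down `Zeros` keeps just enough arithmetic to act as an additive identity; a sketch of the identities defined above (multiplication and division are only handled inside the gradient rules, so they are omitted here):

```julia
using Flux

z = Flux.Zeros()
a = Float32[1, 2, 3]

a + z == a   # plain `+` and `-` treat Zeros as the additive identity
z - a == -a
a .+ z == a  # broadcasting short-circuits to the array itself
z .- a == -a
Flux.Zeros(Float32, 3, 3)  # extra arguments are discarded: still Zeros()
```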