Fix some issues with Zeros option 2 #1379
Merged 2 commits, Nov 9, 2020
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,3 +1,6 @@
# v0.11.3
* Added the option to set `bias` to [false](https://github.com/FluxML/Flux.jl/pull/1379) to exclude `bias` from being trained.

# v0.11.2

* Adds the [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser.
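As a quick illustration of the new option (a minimal sketch, assuming the post-PR behaviour where `Flux.params` skips the `Zeros` stand-in):

```julia
using Flux

# With `bias = false`, the layer stores `Flux.Zeros()` instead of a trainable
# vector, so only the weight matrix is registered as a parameter.
layer = Dense(5, 2; bias = false)
length(Flux.params(layer))  # 1; the default `bias = true` would give 2
```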
1 change: 0 additions & 1 deletion docs/src/models/layers.md
@@ -24,7 +24,6 @@ ConvTranspose
CrossCor
SamePad
flatten
Flux.Zeros
Flux.convfilter
Flux.depthwiseconvfilter
```
13 changes: 9 additions & 4 deletions src/layers/basic.jl
@@ -83,7 +83,7 @@ extraChain(::Tuple{}, x) = ()


"""
Dense(in::Integer, out::Integer, σ = identity)
Dense(in::Integer, out::Integer, σ = identity; bias=true)

Create a traditional `Dense` layer with parameters `W` and `b`.

@@ -92,6 +92,8 @@ Create a traditional `Dense` layer with parameters `W` and `b`.
The input `x` must be a vector of length `in`, or a batch of vectors represented
as an `in × N` matrix. The output `y` will be a vector or batch of length `out`.

Setting `bias` to `false` will switch bias off for the layer.

# Example
```
julia> d = Dense(5, 2)
@@ -101,9 +103,12 @@ julia> d(rand(5))
2-element Array{Float32,1}:
-0.16210233
0.123119034

julia> d = Dense(5, 2; bias=false)
Dense(5, 2)
```
"""
struct Dense{F,S<:AbstractArray,T<:AbstractArray}
struct Dense{F,S<:AbstractArray,T<:Union{Zeros, AbstractVector}}
W::S
b::T
σ::F
@@ -112,8 +117,8 @@ end
Dense(W, b) = Dense(W, b, identity)

function Dense(in::Integer, out::Integer, σ = identity;
initW = glorot_uniform, initb = zeros)
return Dense(initW(out, in), initb(out), σ)
initW = glorot_uniform, initb = zeros, bias=true)
return Dense(initW(out, in), create_bias(bias, initb, out), σ)
end

@functor Dense
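A minimal sketch of what the new `Dense` constructor does with the `bias` keyword (field name `b` taken from the struct definition above):

```julia
using Flux

d_default = Dense(5, 2)               # bias = true  → initb(out), a trainable vector
d_nobias  = Dense(5, 2; bias = false) # bias = false → Flux.Zeros()

d_nobias.b isa Flux.Zeros   # true: bias is switched off
d_nobias(rand(Float32, 5))  # forward pass still works; `W*x .+ Zeros()` is just `W*x`
```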
52 changes: 28 additions & 24 deletions src/layers/conv.jl
@@ -46,7 +46,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -82,7 +82,7 @@ end

Constructs the convolutional layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -102,15 +102,16 @@ Conv(weight = weight,
σ = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function Conv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride)
return Conv(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return Conv(σ, w, bias, stride, pad, dilation)
end

function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function Conv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
@@ -131,7 +132,7 @@ convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, ch, init = init), bias = true) where N

Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
@@ -189,7 +190,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -215,7 +216,7 @@ end
Constructs the convolutional transpose layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -226,22 +227,23 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return ConvTranspose(σ, w, bias, stride, pad, dilation)
end

function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end

function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, reverse(ch), init = init), bias = true) where N

ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
@@ -307,7 +309,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -333,7 +335,7 @@ end
Constructs the `DepthwiseConv` layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -344,15 +346,16 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(DepthwiseConv, pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, prod(size(w)[N-1:end]))
return DepthwiseConv(σ, w, bias, stride, pad, dilation)
end

function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
@@ -373,7 +376,7 @@ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = depthwiseconvfilter(k, ch, init = init), bias = true) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"

return DepthwiseConv(
@@ -424,7 +427,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Setting `bias` to `false` will switch bias off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -461,7 +464,7 @@ end
Constructs the standard cross-correlation layer with user-defined weight and bias arrays.

Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Setting `bias` to `false` would switch `bias` off for the layer.

Takes the keyword arguments `pad`, `stride` and `dilation`.
For input dimension N,
@@ -472,22 +475,23 @@ indicating padding values for each spatial dimension at both the ends.

For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
function CrossCor(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation)
bias = create_bias(b, zeros, size(w, N))
return CrossCor(σ, w, bias, stride, pad, dilation)
end

function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end

function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
weight = convfilter(k, ch, init = init), bias = true) where N

CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
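All four convolutional layers now share this `bias` handling; a hedged sketch of the user-facing behaviour (parameter counts assume `params` skips `Zeros`):

```julia
using Flux

conv   = Conv((3, 3), 1 => 8)                        # default bias = true
conv0  = Conv((3, 3), 1 => 8; bias = false)          # bias switched off
tconv0 = ConvTranspose((3, 3), 1 => 8; bias = false)
xcorr0 = CrossCor((3, 3), 1 => 8; bias = false)

length(Flux.params(conv))   # 2 — weight and bias
length(Flux.params(conv0))  # 1 — weight only
```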
14 changes: 14 additions & 0 deletions src/utils.jl
@@ -176,6 +176,20 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
ones(dims...) = Base.ones(Float32, dims...)
zeros(dims...) = Base.zeros(Float32, dims...)

"""
create_bias(shallcreate::Bool, iftrue, dims...)
create_bias(x, ::Any...)

Return a bias parameter for a layer.

Handles the allowed input options for the `bias` keyword:
If `false`: return `Zeros()`, which switches bias off.
If `true`: return the result of `iftrue(dims...)`.
If not a boolean, return the input unchanged, handling the case `bias = somearray`.
"""
create_bias(shallcreate::Bool, iftrue, dims...) = shallcreate ? iftrue(dims...) : Zeros()
create_bias(x, ::Any...) = x

"""
unsqueeze(xs, dim)

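The dispatch of `create_bias` can be exercised directly; a minimal sketch (using Flux's own `zeros`, which defaults to `Float32`):

```julia
using Flux

Flux.create_bias(true, Flux.zeros, 4)   # a 4-element Float32 vector of zeros
Flux.create_bias(false, Flux.zeros, 4)  # Flux.Zeros(): bias switched off

b = randn(Float32, 4)
Flux.create_bias(b, Flux.zeros, 4) === b  # true: arrays pass through untouched
```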
109 changes: 26 additions & 83 deletions src/zeros.jl
@@ -1,10 +1,7 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
import Base: +, -, *, /, reshape, broadcasted

"""
Zeros()
Zeros(size...)
Zeros(Type, size...)

Acts as a stand-in for an array of zeros, which is ignored by the optimisers during training.
@@ -13,94 +10,40 @@ Useful to turn bias off for a forward pass of a layer.

## Examples

```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false

julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
```julia-repl
julia> bias_less_conv = Conv((2,2), 1=>3; bias = false)
Conv((2, 2), 1=>3)

julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> params(bias_less_conv) |> length
1

julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
julia> bias_less_conv.bias
Flux.Zeros()
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end

Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)

Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))

Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()

Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))

Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))

@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing

# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end

+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a

Base.copy(xs::Zeros{T,N}) where {T,N} = xs

# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end

broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
struct Zeros end
# To allow for things like Dense(10, 2, initb = Zeros)
Zeros(args...) = Zeros()

function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
Base.reshape(x::Zeros, dims...) = x

broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
+(::Zeros, b::AbstractArray) = b
+(a::AbstractArray, ::Zeros) = a
+(a::Zeros, ::Zeros) = a

for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
-(::Zeros, b::AbstractArray) = -b
-(a::AbstractArray, ::Zeros) = a
-(a::Zeros, ::Zeros) = a

# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(+), a::AbstractArray, b::Zeros) = a
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros) = a
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = -b
# Need adjoints for these, or else the gradient w.r.t. the non-Zeros argument will be nothing as well
@adjoint broadcasted(::typeof(*), a::AbstractArray, b::Zeros) = zero(a), _ -> (nothing, zero(a), nothing)
@adjoint broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b))
@adjoint broadcasted(::typeof(/), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b))
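The slimmed-down `Zeros` keeps just enough arithmetic to act as an additive identity; a sketch of the identities defined above (multiplication and division are only handled inside the gradient rules, so they are omitted here):

```julia
using Flux

z = Flux.Zeros()
a = Float32[1, 2, 3]

a + z == a   # plain `+` and `-` treat Zeros as the additive identity
z - a == -a
a .+ z == a  # broadcasting short-circuits to the array itself
z .- a == -a
Flux.Zeros(Float32, 3, 3)  # extra arguments are discarded: still Zeros()
```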