Chapter 5: Backpropagation

In the previous chapter, we discussed how neural networks learn. In this chapter, we turn to backpropagation, a method for computing the gradients of the weight parameters efficiently.

5.1 Computational Graphs

5.1.1 Solving Problems with Computational Graphs

5.1.2 Local Computation

5.1.3 Why Computational Graphs?

5.2 The Chain Rule

5.2.1 Backpropagation in a Computational Graph

5.2.3 The Chain Rule and Computational Graphs

5.3 Backpropagation

5.3.1 Backpropagation at an Addition Node

5.3.2 Backpropagation at a Multiplication Node

5.3.3 The Apple Example

5.4 Implementing Simple Layers

5.4.1 Implementing a Multiplication Layer

mutable struct MulLayer
    x
    y
end

function MulLayer()
    return MulLayer(nothing, nothing)
end

function forward(self::MulLayer, x, y)
    self.x = x
    self.y = y                
    out = x * y

    return out
end

function backward(self::MulLayer, dout)
    dx = dout * self.y # swap x and y
    dy = dout * self.x

    return dx, dy
end

Using this MulLayer, the "buy two apples" example can be written as a forward pass, and backward then recovers the derivative with respect to each input:

apple = 100
apple_num = 2
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = forward(mul_apple_layer, apple, apple_num)
price = forward(mul_tax_layer, apple_price, tax)

print(price) # 220.00000000000003

# backward
dprice = 1
dapple_price, dtax = backward(mul_tax_layer, dprice)
dapple, dapple_num = backward(mul_apple_layer, dapple_price)

print("$dapple, $dapple_num, $dtax") # 2.2, 110.00000000000001, 200

5.4.2 Implementing an Addition Layer

struct AddLayer end # no fields: an addition node has no state to remember

function forward(self::AddLayer, x, y)
    out = x + y

    return out
end

function backward(self::AddLayer, dout)
    dx = dout * 1
    dy = dout * 1

    return dx, dy
end

apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = forward(mul_apple_layer, apple, apple_num)  # (1)
orange_price = forward(mul_orange_layer, orange, orange_num)  # (2)
all_price = forward(add_apple_orange_layer, apple_price, orange_price)  # (3)
price = forward(mul_tax_layer, all_price, tax)  # (4)

# backward
dprice = 1
dall_price, dtax = backward(mul_tax_layer, dprice)  # (4)
dapple_price, dorange_price = backward(add_apple_orange_layer, dall_price)  # (3)
dorange, dorange_num = backward(mul_orange_layer, dorange_price)  # (2)
dapple, dapple_num = backward(mul_apple_layer, dapple_price)  # (1)

print(price) # 715.0000000000001
print(dapple_num, dapple, dorange, dorange_num, dtax) # 110.00000000000001 2.2 3.3000000000000003 165.0 650

5.5 Implementing Activation Function Layers

5.5.1 ReLU Layer

mutable struct Relu
    mask
end

Relu() = Relu(nothing)

function forward(self::Relu, x)
    self.mask = (x .<= 0)
    out = copy(x)
    out[self.mask] .= 0  # broadcast assignment zeroes the masked entries

    return out
end

function backward(self::Relu, dout)
    dout[self.mask] .= 0  # block the gradient where the input was <= 0
    dx = dout

    return dx
end

julia> x = [1.0 -0.5; -2.0 3.0]
2×2 Matrix{Float64}:
  1.0  -0.5
 -2.0   3.0

julia> mask = (x.<=0)
2×2 BitMatrix:
 0  1
 1  0
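
Putting the mask to work: the layer zeroes the signal where the input was non-positive and blocks the gradient at the same positions on the way back. A minimal sketch exercising the Relu layer defined above:

relu = Relu()
x = [1.0 -0.5; -2.0 3.0]
out = forward(relu, x)              # [1.0 0.0; 0.0 3.0]
dx = backward(relu, ones(size(x)))  # [1.0 0.0; 0.0 1.0]  (gradient blocked where x .<= 0)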

5.5.2 Sigmoid Layer

mutable struct Sigmoid
    out
end

Sigmoid() = Sigmoid(nothing)

function forward(self::Sigmoid, x)
    out = 1 ./ (1 .+ exp.(-x))  # sigmoid: note the minus sign
    self.out = out

    return out
end

function backward(self::Sigmoid, dout)
    dx = dout .* (1.0 .- self.out) .* self.out

    return dx
end
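
Because the sigmoid's derivative can be written entirely in terms of its output, y(1 - y), forward stores out and backward reuses it. A quick check of the layer above (the inputs are arbitrary):

sig = Sigmoid()
y = forward(sig, [0.0 2.0])     # [0.5 0.8807970779778823]
dx = backward(sig, ones(1, 2))  # y .* (1 .- y) ≈ [0.25 0.105]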

5.6 Implementing the Affine/Softmax Layers

5.6.1 Affine Layer

julia> X = rand(2);    # input

julia> W = rand(3, 2); # weights

julia> B = rand(3);    # bias

julia>

julia> size(X)
(2,)

julia> size(W)
(3, 2)

julia> size(B)
(3,)

julia>

julia> Y = W*X + B;

5.6.2 Batch Version of the Affine Layer

julia> X_dot_W = [0 0 0; 10 10 10]
2×3 Matrix{Int64}:
  0   0   0
 10  10  10

julia> B = [1 2 3]
1×3 Matrix{Int64}:
 1  2  3

julia> dY = [1 2 3; 4 5 6]
2×3 Matrix{Int64}:
 1  2  3
 4  5  6

julia> dB = sum(dY, dims=1)
1×3 Matrix{Int64}:
 5  7  9
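
For reference, the backward formulas implemented in the Affine layer below follow from the chain rule for a matrix product (a standard derivation, with rows as samples):

\[
Y = XW + B, \qquad
\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y}\, W^{\top}, \qquad
\frac{\partial L}{\partial W} = X^{\top}\, \frac{\partial L}{\partial Y}, \qquad
\frac{\partial L}{\partial B} = \sum_{i=1}^{N} \left(\frac{\partial L}{\partial Y}\right)_{i,:}
\]

The bias gradient sums over the batch axis because B was broadcast to every row in the forward pass; that is exactly the sum(dY, dims=1) computed above.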

mutable struct Affine
    W
    b
    x
    dW
    db
end

function Affine(W, b)
    Affine(W, b, nothing, nothing, nothing)
end

function forward(self::Affine, x)
    self.x = x
    out = self.x * self.W .+ self.b

    return out
end

function backward(self::Affine, dout)
    dx = dout * self.W'
    self.dW = self.x' * dout
    self.db = sum(dout, dims=1)  # sum over the batch axis

    return dx
end
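
As a quick shape check of the Affine layer above (a minimal sketch; the random values are placeholders):

W = rand(2, 3); b = rand(1, 3)
layer = Affine(W, b)
x = rand(4, 2)                            # a batch of 4 samples
out = forward(layer, x)                   # 4×3
dx = backward(layer, ones(size(out)))
size(dx), size(layer.dW), size(layer.db)  # ((4, 2), (2, 3), (1, 3))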

5.6.3 Softmax-with-Loss Layer

mutable struct SoftmaxWithLoss
    loss
    y # output of softmax
    t # teacher data (one-hot)
end

SoftmaxWithLoss() = SoftmaxWithLoss(nothing, nothing, nothing)

function forward(self::SoftmaxWithLoss, x, t)
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    
    return self.loss
end

function backward(self::SoftmaxWithLoss, dout=1)
    batch_size = size(self.t, 1)
    dx = (self.y - self.t) ./ batch_size

    return dx
end
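
Here softmax and cross_entropy_error are the batch-aware functions from the earlier chapters (provided by the common module). For reference, minimal versions consistent with this chapter's conventions (rows are samples, t is one-hot) might look like this sketch, not necessarily the repository's exact code:

function softmax(x)
    x = x .- maximum(x, dims=2)  # subtract each row's max for numerical stability
    return exp.(x) ./ sum(exp.(x), dims=2)
end

function cross_entropy_error(y, t)
    batch_size = size(y, 1)
    return -sum(t .* log.(y .+ 1e-7)) / batch_size  # 1e-7 guards against log(0)
end

Note how simple backward is: the softmax and cross-entropy derivatives combine into (y - t) / batch_size, which is precisely why the two are implemented together as a single layer.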

5.7 Implementing Backpropagation

5.7.1 The Overall Picture of Neural Network Training

5.7.2 Implementing a Neural Network That Supports Backpropagation

Now let's assemble the layers implemented so far into a two-layer network that computes its gradients with backpropagation.

include("common/commons.jl")

import  OrderedCollections: OrderedDict

import  .Gradient: numerical_gradient
using   .Layers


mutable struct TwoLayerNet
    params::Dict
    layers::OrderedDict  # insertion order matters: forward must visit Affine1 -> Relu1 -> Affine2
    lastLayer
end

function TwoLayerNet(input_size, hidden_size, output_size, weight_init_std = 0.01)
    # initialize the weights
    params = Dict(
        "W1" => weight_init_std * randn(input_size, hidden_size),
        "b1" => zeros(1, hidden_size),
        "W2" => weight_init_std * randn(hidden_size, output_size),
        "b2" => zeros(1, output_size)
    )

    # create the layers; an OrderedDict preserves the forward order
    # (the Affine layers share the very same arrays as params, so
    #  in-place parameter updates are seen by the layers)
    layers = OrderedDict(
        "Affine1" => Affine(params["W1"], params["b1"]),
        "Relu1" => Relu(),
        "Affine2" => Affine(params["W2"], params["b2"])
    )

    lastLayer = SoftmaxWithLoss()

    return TwoLayerNet(params, layers, lastLayer)
end
        
function predict(self::TwoLayerNet, x)
    for (_, layer) in self.layers
        x = forward(layer, x)
    end
    return x
end

# x: input data, t: teacher data
function loss(self::TwoLayerNet, x, t)
    y = predict(self, x)
    return forward(self.lastLayer, y, t)
end

function accuracy(self::TwoLayerNet, x, t::Vector{T}) where T <: Integer
    y = predict(self, x)
    y = [i[2] for i in argmax(y, dims=2)]  # column index of each row's max score

    accuracy = sum(vec(y) .== t) / size(x, 1)
    return accuracy
end

function accuracy(self::TwoLayerNet, x, t)
    y = predict(self, x)
    y = argmax(y, dims=2)
    t = argmax(t, dims=2)
    
    accuracy = sum(y .== t) / size(x, 1)
    return accuracy
end

# x: input data, t: teacher data
function numerical_gradient(self::TwoLayerNet, x, t)
    loss_W = (W) -> loss(self, x, t)  # W is a dummy: the params arrays are perturbed in place
    
    grads = Dict(
        "W1" => numerical_gradient(loss_W, self.params["W1"]),
        "b1" => numerical_gradient(loss_W, self.params["b1"]),
        "W2" => numerical_gradient(loss_W, self.params["W2"]),
        "b2" => numerical_gradient(loss_W, self.params["b2"])
    )
    return grads
end

function gradient(self::TwoLayerNet, x, t)
    # forward
    loss(self, x, t)

    # backward
    dout = 1
    dout = backward(self.lastLayer, dout)
    
    layers = reverse(collect(values(self.layers)))  # backward visits the layers in reverse order
    for layer in layers
        dout = backward(layer, dout)
    end

    # collect the gradients stored in each layer
    grads = Dict(
        "W1" => self.layers["Affine1"].dW,
        "b1" => self.layers["Affine1"].db,
        "W2" => self.layers["Affine2"].dW,
        "b2" => self.layers["Affine2"].db
    )
    return grads
end

5.7.3 Gradient Check for Backpropagation

So far we have seen two ways to compute gradients: numerical differentiation, which is simple but slow, and the analytic method via backpropagation, which is fast but easier to get wrong. The gradient check below compares the two on a small batch to confirm that the backpropagation implementation is correct.

include("dataset/mnist.jl")
include("ch05/two_layer_net.jl")

import Statistics: mean

import .MNIST: load_mnist
using  .TwoLayerNet_ch05 # TwoLayerNet

# load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=true, one_hot_label=true)

network = TwoLayerNet(784, 50, 10)

x_batch = x_train[1:3,:]
t_batch = t_train[1:3,:]

grad_numerical = numerical_gradient(network, x_batch, t_batch)
grad_backprop = gradient(network, x_batch, t_batch)

for (key,_) in grad_numerical
    diff = mean( abs.(grad_backprop[key] - grad_numerical[key]) )
    println("$key : $diff")
end

W2 : 5.124005846948428e-9
W1 : 4.1938210893942225e-10
b2 : 1.3958183245899036e-7
b1 : 2.3698598518540016e-9

Differences this small (between roughly 1e-7 and 1e-10) indicate that the backpropagation gradients agree with the numerical ones, so the implementation is almost certainly correct.

5.7.4 Training with Backpropagation

Finally, here is the implementation of neural network training that uses backpropagation. The only difference from the training loop in the previous chapter is that the gradients are computed with gradient rather than numerical_gradient, which makes each iteration dramatically faster.

include("dataset/mnist.jl")
include("ch05/two_layer_net.jl")

import Random: shuffle
import Printf: @sprintf

import .MNIST: load_mnist
using  .TwoLayerNet_ch05 # TwoLayerNet


# load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=true, one_hot_label=true)

network = TwoLayerNet(784, 50, 10)

iters_num = 10000
train_size = size(x_train, 1)
batch_size = 100
learning_rate = 0.1

train_loss_list = zeros(0)
train_acc_list = zeros(0)
test_acc_list = zeros(0)

iter_per_epoch = max(train_size / batch_size, 1)

for i in 0:iters_num-1
    batch_mask = shuffle(1:train_size)[1:batch_size]
    x_batch = x_train[batch_mask, :]
    t_batch = t_train[batch_mask, :]

    # compute the gradients by backpropagation
    #grad = numerical_gradient(network, x_batch, t_batch)
    grad = gradient(network, x_batch, t_batch)

    # update the parameters in place (.-= mutates the arrays shared with the layers)
    for (key, _) in network.params # key is one of "W1", "b1", "W2", "b2"
        network.params[key] .-= learning_rate * grad[key]
    end
    
    loss_val = loss(network, x_batch, t_batch)
    append!(train_loss_list, loss_val)
    
    if i % iter_per_epoch == 0
        train_acc = accuracy(network, x_train, t_train)
        test_acc = accuracy(network, x_test, t_test)
        append!(train_acc_list, train_acc)
        append!(test_acc_list, test_acc)
        println("$train_acc, $test_acc")
    end
end

5.8 Summary