Chapter 5: Backpropagation
In the previous chapter, we described how a neural network learns.
~
5.1 Computational Graphs
5.1.1 Solving Problems with Computational Graphs
5.1.2 Local Computation
5.1.3 Why Use Computational Graphs?
5.2 The Chain Rule
5.2.1 Backward Propagation in a Computational Graph
5.2.3 The Chain Rule and Computational Graphs
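The derivations are omitted here, but the core identity is worth restating. For a composite function z = t^2 with t = x + y (a standard textbook example), the chain rule gives

∂z/∂x = ∂z/∂t · ∂t/∂x = 2t · 1 = 2(x + y)

Every backward pass implemented below is just this rule applied node by node through the computational graph.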
5.3 Backpropagation
5.3.1 Backpropagation at an Addition Node
5.3.2 Backpropagation at a Multiplication Node
5.3.3 The Apple Example
5.4 Implementing Simple Layers
5.4.1 Implementing the Multiplication Layer
~
mutable struct MulLayer
    x
    y
end

function MulLayer()
    return MulLayer(nothing, nothing)
end

function forward(self::MulLayer, x, y)
    self.x = x
    self.y = y
    out = x * y
    return out
end

function backward(self::MulLayer, dout)
    dx = dout * self.y  # swap x and y
    dy = dout * self.x
    return dx, dy
end
apple = 100
apple_num = 2
tax = 1.1

# layers
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = forward(mul_apple_layer, apple, apple_num)
price = forward(mul_tax_layer, apple_price, tax)
println(price)  # 220.00000000000003
~
# backward
dprice = 1
dapple_price, dtax = backward(mul_tax_layer, dprice)
dapple, dapple_num = backward(mul_apple_layer, dapple_price)
print("$dapple, $dapple_num, $dtax") # 2.2, 110.00000000000001, 200
5.4.2 Implementing the Addition Layer
~
struct AddLayer end  # stateless, so no fields are needed (an abstract type could not be instantiated)

function forward(self::AddLayer, x, y)
    out = x + y
    return out
end

function backward(self::AddLayer, dout)
    dx = dout * 1  # an addition node passes the upstream gradient through unchanged
    dy = dout * 1
    return dx, dy
end
~
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layers
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = forward(mul_apple_layer, apple, apple_num)                # (1)
orange_price = forward(mul_orange_layer, orange, orange_num)            # (2)
all_price = forward(add_apple_orange_layer, apple_price, orange_price)  # (3)
price = forward(mul_tax_layer, all_price, tax)                          # (4)

# backward
dprice = 1
dall_price, dtax = backward(mul_tax_layer, dprice)                          # (4)
dapple_price, dorange_price = backward(add_apple_orange_layer, dall_price)  # (3)
dorange, dorange_num = backward(mul_orange_layer, dorange_price)            # (2)
dapple, dapple_num = backward(mul_apple_layer, dapple_price)                # (1)

println(price)  # 715.0000000000001
println("$dapple_num $dapple $dorange $dorange_num $dtax")  # 110.00000000000001 2.2 3.3000000000000003 165.0 650
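As a sanity check, each of these derivatives can be reproduced numerically with the central difference from the previous chapter. A minimal sketch (the helper total_price is hypothetical, introduced only for this check):

# Hypothetical helper: the whole forward computation as a single function
total_price(apple, apple_num, orange, orange_num, tax) =
    (apple * apple_num + orange * orange_num) * tax

h = 1e-4
numerical_dapple = (total_price(apple + h, apple_num, orange, orange_num, tax) -
                    total_price(apple - h, apple_num, orange, orange_num, tax)) / 2h
println(numerical_dapple)  # ≈ 2.2, matching dapple from backpropagation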
5.5 Implementing Activation Function Layers
5.5.1 The ReLU Layer
~
mutable struct Relu
    mask
end

Relu() = Relu(nothing)

function forward(self::Relu, x)
    self.mask = (x .<= 0)  # remember which elements were non-positive
    out = copy(x)
    out[self.mask] .= 0    # broadcast assignment; plain `=` would raise an error here
    return out
end

function backward(self::Relu, dout)
    dout[self.mask] .= 0
    dx = dout
    return dx
end
~
julia> x = [1.0 -0.5; -2.0 3.0]
2×2 Matrix{Float64}:
  1.0  -0.5
 -2.0   3.0

julia> mask = (x .<= 0)
2×2 BitMatrix:
 0  1
 1  0
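With the mask in hand, the layer behaves as expected (continuing from the x above):

julia> relu = Relu();

julia> forward(relu, x)
2×2 Matrix{Float64}:
 1.0  0.0
 0.0  3.0

julia> backward(relu, ones(2, 2))
2×2 Matrix{Float64}:
 1.0  0.0
 0.0  1.0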
5.5.2 The Sigmoid Layer
mutable struct Sigmoid
    out
end

Sigmoid() = Sigmoid(nothing)

function forward(self::Sigmoid, x)
    out = 1 ./ (1 .+ exp.(-x))  # note the minus sign: sigmoid(x) = 1 / (1 + e^(-x))
    self.out = out
    return out
end

function backward(self::Sigmoid, dout)
    dx = dout .* (1.0 .- self.out) .* self.out  # sigmoid'(x) = y(1 - y)
    return dx
end
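The backward pass relies on the identity sigmoid'(x) = y(1 - y) with y = sigmoid(x); a quick numerical spot check of that identity:

sigmoid(x) = 1 / (1 + exp(-x))

x0 = 0.5
h = 1e-4
numerical = (sigmoid(x0 + h) - sigmoid(x0 - h)) / 2h
analytic = sigmoid(x0) * (1 - sigmoid(x0))
println(isapprox(numerical, analytic; atol=1e-6))  # true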
5.6 Implementing the Affine/Softmax Layers
5.6.1 The Affine Layer
~
julia> X = rand(2); # input
julia> W = rand(3, 2); # weights
julia> B = rand(3); # bias
julia>
julia> size(X)
(2,)
julia> size(W)
(3, 2)
julia> size(B)
(3,)
julia>
julia> Y = W*X + B;
~
5.6.2 Batch-Based Affine Layer
~
julia> X_dot_W = [0 0 0; 10 10 10]
2×3 Matrix{Int64}:
  0   0   0
 10  10  10

julia> B = [1 2 3]
1×3 Matrix{Int64}:
 1  2  3
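Broadcasting with .+ then adds the bias to every row of the batch:

julia> X_dot_W .+ B
2×3 Matrix{Int64}:
  1   2   3
 11  12  13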
~
julia> dY = [1 2 3; 4 5 6]
2×3 Matrix{Int64}:
 1  2  3
 4  5  6

julia> dB = sum(dY, dims=1)
1×3 Matrix{Int64}:
 5  7  9
~
mutable struct Affine
    W
    b
    x
    dW
    db
end

function Affine(W, b)
    Affine(W, b, nothing, nothing, nothing)
end

function forward(self::Affine, x)
    self.x = x
    out = self.x * self.W .+ self.b  # broadcast the bias across the batch
    return out
end

function backward(self::Affine, dout)
    dx = dout * self.W'
    self.dW = self.x' * dout
    self.db = sum(dout, dims=1)      # sum over the batch axis
    return dx
end
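A quick shape check of the batch version, with arbitrary sizes (a minimal sketch; any dimensions consistent with x * W work):

W = rand(2, 3); b = rand(1, 3)
affine = Affine(W, b)

x = rand(4, 2)            # a batch of 4 two-dimensional inputs
out = forward(affine, x)
dx = backward(affine, ones(4, 3))

println(size(out))        # (4, 3)
println(size(dx))         # (4, 2) -- same shape as x
println(size(affine.dW))  # (2, 3) -- same shape as W
println(size(affine.db))  # (1, 3) -- same shape as b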
5.6.3 The Softmax-with-Loss Layer
~
mutable struct SoftmaxWithLoss
    loss
    y  # output of softmax
    t  # teacher data (one-hot)
end

SoftmaxWithLoss() = SoftmaxWithLoss(nothing, nothing, nothing)

function forward(self::SoftmaxWithLoss, x, t)
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss
end

function backward(self::SoftmaxWithLoss, dout=1)
    batch_size = size(self.t, 1)
    dx = (self.y - self.t) ./ batch_size  # the remarkably simple (y - t) / batch_size
    return dx
end
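The forward pass assumes the softmax and cross_entropy_error functions from chapters 3 and 4. For reference, minimal batch versions consistent with the row-per-sample convention used here could look like this (a sketch, not necessarily identical to the versions in common/):

function softmax(x)
    x = x .- maximum(x, dims=2)  # subtract the row max for numerical stability
    exp_x = exp.(x)
    return exp_x ./ sum(exp_x, dims=2)
end

function cross_entropy_error(y, t)  # t is one-hot
    batch_size = size(y, 1)
    return -sum(t .* log.(y .+ 1e-7)) / batch_size
end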
5.7 Implementing Backpropagation
5.7.1 The Overall Picture of Neural Network Training
5.7.2 Implementing a Neural Network That Supports Backpropagation
Now, let's move on to the implementation.
~
include("common/commons.jl")
import OrderedCollections: OrderedDict
import .Gradient: numerical_gradient
using .Layers
mutable struct TwoLayerNet
params::Dict
layers::Dict
lastLayer
end
function TwoLayerNet(input_size, hidden_size, output_size, weight_init_std = 0.01)
    # initialize the weights
    params = Dict(
        "W1" => weight_init_std * randn(input_size, hidden_size),
        "b1" => zeros(1, hidden_size),
        "W2" => weight_init_std * randn(hidden_size, output_size),
        "b2" => zeros(1, output_size)
    )
    # create the layers; an OrderedDict preserves insertion order
    layers = OrderedDict(
        "Affine1" => Affine(params["W1"], params["b1"]),
        "Relu1" => Relu(),
        "Affine2" => Affine(params["W2"], params["b2"])
    )
    lastLayer = SoftmaxWithLoss()
    return TwoLayerNet(params, layers, lastLayer)
end
function predict(self::TwoLayerNet, x)
    for (_, layer) in self.layers
        x = forward(layer, x)
    end
    return x
end

# x: input data, t: teacher data
function loss(self::TwoLayerNet, x, t)
    y = predict(self, x)
    return forward(self.lastLayer, y, t)
end
function accuracy(self::TwoLayerNet, x, t::Vector{T}) where T <: Integer
    y = predict(self, x)
    y = [i[2] for i in argmax(y, dims=2)]  # column index of the max score
    accuracy = sum(y .== eltype(y).(t)) / size(x, 1)
    return accuracy
end

function accuracy(self::TwoLayerNet, x, t)
    y = predict(self, x)
    y = argmax(y, dims=2)
    t = argmax(t, dims=2)
    accuracy = sum(y .== t) / size(x, 1)
    return accuracy
end
# x: input data, t: teacher data
function numerical_gradient(self::TwoLayerNet, x, t)
    loss_W = (W) -> loss(self, x, t)
    grads = Dict(
        "W1" => numerical_gradient(loss_W, self.params["W1"]),
        "b1" => numerical_gradient(loss_W, self.params["b1"]),
        "W2" => numerical_gradient(loss_W, self.params["W2"]),
        "b2" => numerical_gradient(loss_W, self.params["b2"])
    )
    return grads
end
function gradient(self::TwoLayerNet, x, t)
    # forward
    loss(self, x, t)

    # backward
    dout = 1
    dout = backward(self.lastLayer, dout)
    layers = reverse(collect(values(self.layers)))  # relies on OrderedDict ordering
    for layer in layers
        dout = backward(layer, dout)
    end

    # collect the gradients stored in each layer
    grads = Dict(
        "W1" => self.layers["Affine1"].dW,
        "b1" => self.layers["Affine1"].db,
        "W2" => self.layers["Affine2"].dW,
        "b2" => self.layers["Affine2"].db
    )
    return grads
end
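A quick smoke test with dummy data, assuming the layers above are in scope:

net = TwoLayerNet(784, 50, 10)
x = rand(3, 784)                # 3 dummy samples
t = zeros(3, 10); t[:, 1] .= 1  # dummy one-hot labels
println(loss(net, x, t))        # ≈ log(10) ≈ 2.30 at initialization
grads = gradient(net, x, t)
println(size(grads["W1"]))      # (784, 50)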
~
5.7.3 Gradient Check for Backpropagation
So far, we have described two ways to compute the gradient.
~
include("dataset/mnist.jl")
include("ch05/two_layer_net.jl")
import Statistics: mean
import .MNIST: load_mnist
using .TwoLayerNet_ch05 # TwoLayerNet
# データの読み込み
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=true, one_hot_label=true)
network = TwoLayerNet(784, 50, 10)
x_batch = x_train[1:3,:]
t_batch = t_train[1:3,:]
grad_numerical = numerical_gradient(network, x_batch, t_batch)
grad_backprop = gradient(network, x_batch, t_batch)
for (key,_) in grad_numerical
diff = mean( abs.(grad_backprop[key] - grad_numerical[key]) )
println("$key : $diff")
end
~
W2 : 5.124005846948428e-9
W1 : 4.1938210893942225e-10
b2 : 1.3958183245899036e-7
b1 : 2.3698598518540016e-9
~
5.7.4 Training with Backpropagation
Finally, here is an implementation of neural network training that uses backpropagation.
~
include("dataset/mnist.jl")
include("ch05/two_layer_net.jl")
import Random: shuffle
import Printf: @sprintf
import .MNIST: load_mnist
using .TwoLayerNet_ch05 # TwoLayerNet
# データの読み込み
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=true, one_hot_label=true)
network = TwoLayerNet(784, 50, 10)
iters_num = 10000
train_size = size(x_train, 1)
batch_size = 100
learning_rate = 0.1
train_loss_list = zeros(0)
train_acc_list = zeros(0)
test_acc_list = zeros(0)
iter_per_epoch = max(train_size / batch_size, 1)
for i in 0:iters_num
batch_mask = shuffle(1:train_size)[1:batch_size]
x_batch = x_train[batch_mask, :]
t_batch = t_train[batch_mask, :]
# 勾配
#grad = numerical_gradient(network, x_batch, t_batch)
grad = gradient(network, x_batch, t_batch)
# 更新
for (key, _)=network.params # key=("W1", "b1", "W2", "b2")
network.params[key] -= learning_rate * grad[key]
end
loss_val = loss(network, x_batch, t_batch)
append!(train_loss_list, loss_val)
if i % iter_per_epoch == 0
train_acc = accuracy(network, x_train, t_train)
test_acc = accuracy(network, x_test, t_test)
append!(train_acc_list, train_acc)
append!(test_acc_list, test_acc)
println("$train_acc, $test_acc")
end
end
~
5.8 Summary
~