Chapter 7: Convolutional Neural Networks

7.1 Overall Architecture

7.2 Convolutional Layers

7.2.1 Problems with Fully Connected Layers

7.2.2 The Convolution Operation

7.2.3 Padding

7.2.4 Stride

7.2.5 Convolution on 3D Data

7.2.6 Thinking in Blocks

7.2.7 Batch Processing

7.3 Pooling Layers

7.3.1 Characteristics of Pooling Layers

7.4 Implementing the Convolution/Pooling Layers

7.4.1 4D Arrays

julia> x = rand(10, 1, 28, 28); # generate random data

julia> size(x)
(10, 1, 28, 28)

julia> size(x[1, :, :, :]) # the first example
(1, 28, 28)

julia> size(x[2, :, :, :]) # the second example
(1, 28, 28)

julia> size(x[1, 1, :, :]) # the first channel of the first example
(28, 28)

7.4.2 Expansion by im2col

7.4.3 Implementing the Convolution Layer

im2col(input_data, filter_h, filter_w; stride=1, pad=0)
  • input_data ── input data as a 4D array of shape (number of examples, channels, height, width)
  • filter_h ── filter height
  • filter_w ── filter width
  • stride ── stride
  • pad ── padding

x1 = rand(1, 3, 7, 7);
col1 = im2col(x1, 5, 5, stride=1, pad=0)
println(size(col1)) # (9, 75)

x2 = rand(10, 3, 7, 7);
col2 = im2col(x2, 5, 5, stride=1, pad=0)
println(size(col2)) # (90, 75)
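
The im2col used above lives in the port's shared utilities and is not reproduced in this section. As a reference point, here is a minimal sketch of such a function, written for column-major Julia so that its output agrees with the shapes printed above and with the reshape calls in the layers that follow. The padding copy and the loop structure are assumptions about the helper's internals, not the port's actual code.

function im2col(input_data, filter_h, filter_w; stride=1, pad=0)
    N, C, H, W = size(input_data)
    out_h = (H + 2*pad - filter_h) ÷ stride + 1
    out_w = (W + 2*pad - filter_w) ÷ stride + 1

    # zero-pad the two spatial dimensions
    img = zeros(eltype(input_data), N, C, H + 2*pad, W + 2*pad)
    img[:, :, pad+1:pad+H, pad+1:pad+W] = input_data

    # gather every filter offset (y, x) across all output positions
    col = zeros(eltype(input_data), N, out_h, out_w, C, filter_h, filter_w)
    for x in 1:filter_w, y in 1:filter_h
        y_max = y + stride*(out_h - 1)
        x_max = x + stride*(out_w - 1)
        patch = img[:, :, y:stride:y_max, x:stride:x_max]  # (N, C, out_h, out_w)
        col[:, :, :, :, y, x] = permutedims(patch, (1, 3, 4, 2))
    end

    # rows index (example, output position); columns index (channel, filter offset)
    return reshape(col, N*out_h*out_w, :)
end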

mutable struct Convolution
    W
    b
    stride
    pad
    dW  # gradient of W, filled in by backward (omitted in this section)
    db  # gradient of b, filled in by backward (omitted in this section)
end

function Convolution(W, b; stride=1, pad=0)
    return Convolution(W, b, stride, pad, nothing, nothing)
end

function forward(self::Convolution, x)
    FN, C, FH, FW = size(self.W)
    N, C, H, W = size(x)
    out_h = (H + 2*self.pad - FH) ÷ self.stride + 1
    out_w = (W + 2*self.pad - FW) ÷ self.stride + 1

    col = im2col(x, FH, FW, stride=self.stride, pad=self.pad)
    col_W = reshape(self.W, (FN, :))'  # filters flattened to (FN, C*FH*FW), then transposed
    out = col * col_W .+ self.b

    # reshape to (N, out_h, out_w, FN) and reorder to (N, FN, out_h, out_w)
    out = permutedims(reshape(out, (N, out_h, out_w, :)), (1, 4, 2, 3))

    return out
end
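
As a quick shape check: a 30-filter 5×5 convolution applied to a batch of ten single-channel 28×28 images should yield 24×24 feature maps, since (28 + 0 - 5) ÷ 1 + 1 = 24. The weight and bias values below are illustrative only.

W = 0.01 * randn(30, 1, 5, 5)  # 30 filters of shape (1, 5, 5)
b = zeros(1, 30)
conv = Convolution(W, b, stride=1, pad=0)

x = rand(10, 1, 28, 28)
out = forward(conv, x)
println(size(out))  # (10, 30, 24, 24)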

7.4.4 Implementing the Pooling Layer

mutable struct Pooling
    pool_h
    pool_w
    stride
    pad
end

function Pooling(pool_h, pool_w; stride=2, pad=0)
    return Pooling(pool_h, pool_w, stride, pad)
end

function forward(self::Pooling, x)
    N, C, H, W = size(x)
    out_h = (H - self.pool_h) ÷ self.stride + 1
    out_w = (W - self.pool_w) ÷ self.stride + 1

    # expand (1)
    col = im2col(x, self.pool_h, self.pool_w, stride=self.stride, pad=self.pad)
    col = reshape(col, (:, self.pool_h*self.pool_w))

    # take the maximum of each row (2)
    out = maximum(col, dims=2)
    # reshape back to 4D (3)
    out = permutedims(reshape(out, (N, out_h, out_w, C)), (1, 4, 2, 3))

    return out
end
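
Chaining this after the convolution above: 2×2 max pooling with stride 2 halves each spatial dimension, since (24 - 2) ÷ 2 + 1 = 12.

pool = Pooling(2, 2, stride=2)

x = rand(10, 30, 24, 24)  # e.g. the convolution output from the previous example
out = forward(pool, x)
println(size(out))  # (10, 30, 12, 12)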

7.5 Implementing a CNN

using OrderedCollections # provides OrderedDict (also re-exported by DataStructures)

mutable struct SimpleConvNet
    params::Dict
    layers::OrderedDict
    last_layer
end

# keyword arguments, so this method cannot clash with the three-field
# default constructor used at the end of this function
function SimpleConvNet(; input_dim=(1, 28, 28),
                       conv_param=Dict("filter_num"=>30, "filter_size"=>5,
                                       "pad"=>0, "stride"=>1),
                       hidden_size=100, output_size=10, weight_init_std=0.01)
    filter_num = conv_param["filter_num"]
    filter_size = conv_param["filter_size"]
    filter_pad = conv_param["pad"]
    filter_stride = conv_param["stride"]
    input_size = input_dim[2]
    conv_output_size = (input_size - filter_size + 2*filter_pad) / filter_stride + 1
    pool_output_size = round(Int, filter_num * (conv_output_size/2) * (conv_output_size/2))

    params = Dict(
        "W1" => weight_init_std * randn(filter_num, input_dim[1], filter_size, filter_size),
        "b1" => zeros(1, filter_num),
        "W2" => weight_init_std * randn(pool_output_size, hidden_size),
        "b2" => zeros(1, hidden_size),
        "W3" => weight_init_std * randn(hidden_size, output_size),
        "b3" => zeros(1, output_size)
    )

    layers = OrderedDict(
        "Conv1" => Convolution(params["W1"], params["b1"],
                               stride=conv_param["stride"],
                               pad=conv_param["pad"]),
        "Relu1" => Relu(),
        "Pool1" => Pooling(2, 2, stride=2),
        "Affine1" => Affine(params["W2"], params["b2"]),
        "Relu2" => Relu(),
        "Affine2" => Affine(params["W3"], params["b3"]),
    )
    last_layer = SoftmaxWithLoss()

    return SimpleConvNet(params, layers, last_layer)
end
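
With the default arguments the size arithmetic works out as follows: the convolution maps 28×28 inputs to (28 - 5 + 0) ÷ 1 + 1 = 24, the 2×2 pooling halves that to 12, and Affine1 therefore receives 30 × 12 × 12 = 4320 features.

network = SimpleConvNet(input_dim=(1, 28, 28),
                        conv_param=Dict("filter_num"=>30, "filter_size"=>5,
                                        "pad"=>0, "stride"=>1),
                        hidden_size=100, output_size=10)
println(size(network.params["W2"]))  # (4320, 100)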

function predict(self::SimpleConvNet, x)
    for layer in values(self.layers)
        x = forward(layer, x)
    end
    return x
end

function loss(self::SimpleConvNet, x, t)
    y = predict(self, x)
    return forward(self.last_layer, y, t)
end

function gradient(self::SimpleConvNet, x, t)
    # forward
    loss(self, x, t)

    # backward
    dout = 1
    dout = backward(self.last_layer, dout)

    layers = reverse(collect(values(self.layers)))
    for layer in layers
        dout = backward(layer, dout)
    end

    # collect the gradients stored by each layer
    grads = Dict(
        "W1" => self.layers["Conv1"].dW,
        "b1" => self.layers["Conv1"].db,
        "W2" => self.layers["Affine1"].dW,
        "b2" => self.layers["Affine1"].db,
        "W3" => self.layers["Affine2"].dW,
        "b3" => self.layers["Affine2"].db
    )
    return grads
end
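
A single SGD step using these gradients might look like the sketch below, assuming the backward methods for each layer are defined (they are omitted in this section) and assuming one-hot labels, which is taken here as the convention of the SoftmaxWithLoss layer from the earlier chapters; real training would draw x_batch and t_batch from MNIST.

network = SimpleConvNet()
x_batch = rand(100, 1, 28, 28)  # stand-in for a MNIST mini-batch
t_batch = zeros(100, 10)        # random one-hot stand-in labels
for i in 1:100
    t_batch[i, rand(1:10)] = 1.0
end

grads = gradient(network, x_batch, t_batch)
for (key, grad) in grads
    network.params[key] .-= 0.1 .* grad  # plain SGD, learning rate 0.1
end

Because the layers hold references to the same arrays as network.params, the in-place update with .-= keeps the layer weights and the params dictionary consistent.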

7.6 Visualizing a CNN

7.6.1 Visualizing the First-Layer Weights

7.6.2 Hierarchical Information Extraction

7.7 Representative CNNs

7.7.1 LeNet

7.7.2 AlexNet

7.8 Summary