Notes on the Example: Neural Network

Reference: MATLAB's DeepLearningToolBox

CNN Example

load mnist_uint8 loads train_x <60000 x 784>, test_x <10000 x 784>, train_y <60000 x 10>, and test_y <10000 x 10>.

train_x = double(reshape(train_x', 28, 28, 60000)) / 255; converts train_x into a 3-D array. reshape() fills the output column by column, while each sample is stored as a row vector, so the transpose is required. Likewise: test_x = double(reshape(test_x', 28, 28, 10000)) / 255; train_y = double(train_y'); test_y = double(test_y');
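A toy check of this column-major behavior (made-up 2 x 2 "images", not the MNIST arrays):

    X = [1 2 3 4;       % sample 1, a flattened 2 x 2 image
         5 6 7 8];      % sample 2
    imgs = reshape(X', 2, 2, 2);    % transpose first, as above
    imgs(:, :, 1)       % ans = [1 3; 2 4]: sample 1 only, filled column by column
    reshape(X, 2, 2, 2) % without the transpose, the two samples get interleaved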

rand('state', 0) seeds the random number generator to a fixed state, so the random initialization below is reproducible.

cnn.layers = {
    struct('type', 'i')
    struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5)
    struct('type', 's', 'scale', 2)
    struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5)
    struct('type', 's', 'scale', 2)
    };
    % cnn.layers{i} shows the i-th layer, e.g. cnn.layers{2}: type 'c', outputmaps 6, kernelsize 5
    % in this network only the convolution layers have an activation, so the sub-sampling layers' activation derivative is 1
    % the sub-sampling layers have no parameters, so only the convolution layers' weights need updating

cnn = cnnsetup(cnn, train_x, train_y);

The cnnsetup function

function net = cnnsetup(net, x, y)
    inputmaps = 1;
    mapsize = size(squeeze(x(:, :, 1)));
    % size of one training image: mapsize = [28 28]
    % squeeze drops singleton dimensions (a no-op on a 2-D matrix); x(:, :, 1) is the first 28 x 28 image
    
    for l = 1 : numel(net.layers)                       % numel returns the number of elements, here 5
        if strcmp( net.layers{l}.type, 's')            % sub-sampling layer
                mapsize = mapsize / net.layers{l}.scale;    % scale = 2, so mapsize is halved
                % after layer 3: mapsize = [12 12]; after layer 5: mapsize = [4 4]
                for j = 1 : inputmaps
                        net.layers{l}.b{j} = 0;
                % layer 3 gets 6 zero biases (inputmaps = 6 there), layer 5 gets 12; all biases start at 0
                end
        end
        
        if strcmp( net.layers{l}.type, 'c' )
                mapsize = mapsize - net.layers{l}.kernelsize + 1;           % stride-1 'valid' convolution shrinks each map by kernelsize - 1
                fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;
                % fan-out of the convolution layer: layers{2}.outputmaps = 6, layers{4}.outputmaps = 12, kernelsize = 5
                for j = 1 : net.layers{l}.outputmaps
                        fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
                        % fan-in of the convolution layer, kernelsize = 5
                        for i = 1 : inputmaps
                                net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));
                                % initialize a kernel, layers{l}.k{i}{j} <5 x 5>, uniform in +/- sqrt(6 / (fan_in + fan_out))
                                % k{i}{j} connects the i-th feature map of layer l-1 to the j-th feature map of layer l, i * j kernels in total
                                % the kernels are stored pre-flipped (reversed), so convn needs no rot180 to make them upright
                                % a CNN's "convolution" is really correlation (kernel not flipped); true convolution flips the kernel
                        end
                        net.layers{l}.b{j} = 0; % initialize the bias; one bias per output map
                end
                inputmaps = net.layers{l}.outputmaps;
                % this layer's output map count becomes the next layer's input map count
        end
    end
    
    fvnum = prod(mapsize) * inputmaps;
    % prod multiplies the array elements; input size of the fully-connected layer: fvnum = 4 * 4 * 12 = 192
    onum = size(y, 1);
    % number of classes in train_y (transposed earlier), i.e. the fully-connected layer's output size, onum = 10
    
    net.ffb = zeros(onum, 1);
    % <10 x 1> bias added at the output of the fully-connected layer
    net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
    % <10 x 192> weight initialization
end
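Tracing the sizes through the loop above for this architecture:

    layer 1 ('i'):              mapsize = [28 28], 1 input map
    layer 2 ('c', kernel 5):    mapsize = 28 - 5 + 1 = 24, 6 maps
    layer 3 ('s', scale 2):     mapsize = 24 / 2 = 12, 6 maps
    layer 4 ('c', kernel 5):    mapsize = 12 - 5 + 1 = 8, 12 maps
    layer 5 ('s', scale 2):     mapsize = 8 / 2 = 4, 12 maps
    fully connected:            fvnum = 4 * 4 * 12 = 192, onum = 10

so net.ffW is <10 x 192>, matching the comments above.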

opts.alpha = 1; opts.batchsize = 50; opts.numepochs = 1; sets the training parameters.

cnn = cnntrain(cnn, train_x, train_y, opts); trains the network.

The cnntrain function

function net = cnntrain( net , x , y , opts)
    m = size(x, 3);             % number of samples, m = 60000
    numbatches = m / opts.batchsize;    % number of batches: batchsize = 50, so numbatches = 1200

    if rem(numbatches, 1) ~= 0
           error('numbatches not integer');
    end
    % make sure the samples split into a whole number of batches

    net.rL = [];        % running (smoothed) values of the loss function
    for i = 1 : opts.numepochs
            disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)]);    % display the current epoch; only one epoch is run here
            tic;
            kk = randperm(m);           % random permutation, e.g. [3 1 2 5 4]; samples are visited in shuffled order
            for l = 1 : numbatches
                    batch_x = x(:, :, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
                    % kk is the shuffled index; each batch takes 50 samples: batch_x <28 x 28 x 50>, indices 1-50, 51-100, ...
                    batch_y = y(:, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
                    % batch_y <10 x 50>
                
                    net = cnnff( net, batch_x);     % feed-forward; the three calls below run numbatches = 1200 times per epoch
                    net = cnnbp( net, batch_y);
                    net = cnnapplygrads( net, opts);
                
                    if isempty(net.rL)
                        net.rL(1) = net.L;          % seed with the initial loss value
                    end

                    net.rL(end + 1) = 0.99 * net.rL(end) + 0.01 * net.L;
                     % exponentially smoothed loss; one update per batch, so 1200 updates per epoch
            end
            toc;
    end
end
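The rL update is an exponential moving average of the per-batch loss; a minimal standalone sketch (losses here is a random stand-in for the 1200 per-batch values):

    losses = rand(1, 1200);     % stand-in for the per-batch loss values
    rL = losses(1);             % seed with the first loss, as cnntrain does
    for i = 1 : numel(losses)
        rL(end + 1) = 0.99 * rL(end) + 0.01 * losses(i);    % heavy smoothing
    end
    plot(rL);                   % a smooth curve of length 1201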

The cnnff function

function net = cnnff( net, x)
    n = numel( net.layers );    % number of layers, n = 5
    net.layers{1}.a{1} = x;     % a{1} = batch_x <28 x 28 x 50>
    inputmaps = 1;
                
    for l = 2 : n
        if strcmp( net.layers{l}.type, 'c')
            for j = 1 : net.layers{l}.outputmaps            % each map j of layer l sums over all i input maps of layer l-1
                    z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);
                    % net.layers{1}.a{1} <28 x 28 x 50> minus [4 4 0]
                    % net.layers{2}.a{j} <24 x 24 x 50>: 50 samples per map, 6 maps in total
                    for i = 1 : inputmaps
                            z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
                            % z sums the 'valid' convolutions over all i feature maps of layer l-1; after the activation it becomes the j-th feature map of layer l
                            % convn leaves its first argument unchanged and flips its second argument (the kernel)
                            % the stored kernel is reversed, so convn's flip turns it upright, and the result is the intended correlation
                    end
                    net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});
                    % layers{2}.b{j} is a scalar <1 x 1>, added to every element of a{j}
             end
             
             inputmaps = net.layers{l}.outputmaps;
             % this layer's output map count becomes the next layer's input map count
             
        elseif strcmp( net.layers{l}.type, 's')
            for j = 1 : inputmaps           % 6 or 12 maps
                   z = convn( net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2),   'valid');
                   % layers{3}.a{j} <12 x 12 x 50>, layers{5}.a{j} <4 x 4 x 50>
                   % mean-pooling: the pixels in each patch are summed and divided by scale^2
                   % the all-ones kernel makes convn compute patch sums; flipping an all-ones kernel changes nothing
                   net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);
                   % take every scale-th element of z to downsample, which also skips the overlapping convn outputs that are not needed
            end
        end
    end

    net.fv = [ ];
    for j = 1 : numel( net.layers{n}.a )    % numel( layers{5}.a ) = 12
            sa = size( net.layers{n}.a{j} );    % layers{5}.a{j} <4 x 4 x 50>
            net.fv = [ net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3)) ];
            % each reshape gives <16 x 50>; appending 12 of them builds fv <192 x 50>
            % per sample: 12 output maps <4 x 4> unfold into <192 x 1>; with 50 samples, <192 x 50>
    end

    net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
    % repmat tiles a matrix into a larger one: ffb <10 x 1> is expanded to <10 x 50>
    % <10 x 50> = sigm(<10 x 192> * <192 x 50> + <10 x 50>); one sample gives <10 x 1>, 50 samples give <10 x 50>
end
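The kernel-flipping behavior these comments keep referring to can be checked directly with conv2: convolution rotates the kernel 180 degrees before sliding it, so storing pre-flipped kernels turns convn into the correlation the network actually wants (a minimal sketch, independent of the toolbox):

    A = magic(4);                           % any 4 x 4 input
    k = [1 2; 3 4];                         % asymmetric kernel, so the flip is visible
    c1 = conv2(A, k, 'valid');              % true convolution: k is flipped internally
    c2 = conv2(A, rot90(k, 2), 'valid');    % pre-flipping k yields cross-correlation
    isequal(c1, c2)                         % 0: the two differ for an asymmetric kernel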

The cnnbp function

function net = cnnbp( net, y)
    n = numel(net.layers);     % number of layers, n = 5; n is used throughout below
    net.e = net.o - y;         % <10 x 50>: errors of the 50 samples
    net.L = 1/2 * sum( net.e(:) .^ 2 ) / size(net.e, 2);
    % net.e(:) stacks net.e column-wise into <500 x 1>; the summed squared error divided by the batch size gives the mean loss
    
    net.od = net.e .* ( net.o .* (1 - net.o));          % derivative of sigmoid: f(x) * (1 - f(x))
    % od is the output delta <10 x 50>
    net.fvd = (net.ffW' * net.od);
    % delta of the fully-connected layer: <192 x 50> ( <192 x 10> * <10 x 50> )
    % each column is one sample's error; one sample back-propagates to <192 x 1>, 50 samples give <192 x 50>
    
    if strcmp(net.layers{n}.type, 'c')
            net.fvd = net.fvd .* (net.fv .* (1 - net.fv));      % only when the last layer is convolutional; sub-sampling layers have no activation, so this is skipped here
    end
    
    sa = size(net.layers{n}.a{1});  % <4 x 4 x 50>
    fvnum = sa(1) * sa(2);  % 16, the number of pixels per feature map
    for j = 1 : numel(net.layers{n}.a)      % numel = 12
            net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
            % reshape the fully-connected delta fvd back into 12 maps of <4 x 4 x 50>, the sub-sampling layer's shape
    end
    
    for l = (n - 1) : -1 : 1
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                    net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
                    % net.layers{l}.d{j} is the delta of the j-th feature map of layer l, e.g. <8 x 8 x 50>
                    % layers{5}.d{j} <4 x 4 x 50> is upsampled by expand to <8 x 8 x 50>
                    % only the convolutional layers have an activation, so the sigmoid derivative a .* (1 - a) is applied here
            end
        elseif strcmp( net.layers{l}.type, 's' )
            for i = 1 : numel( net.layers{l}.a )        % 1:6; the i-th delta of layer l sums contributions from all j deltas of layer l+1
                    z = zeros(size(net.layers{l}.a{1}));    % layers{3}.a{1} <12 x 12 x 50>
                    for j = 1 : numel(net.layers{l + 1}.a)
                            z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');
                            % 'full' convolution: 8 + 5 - 1 = 12, giving <12 x 12 x 50>
                            % layers{l+1}.k{i}{j} is the kernel from the i-th map of layer l to the j-th map of layer l+1
                            % the back-prop formula passes layer l+1 back to layer l through k{i}{j}, summing full convolutions over all j maps to get the i-th map
                            % rot180(kernel) flips the kernel, but since the stored kernel is already flipped the result here is actually upright
                            % the delta back-prop needs convn's internal flip to act on a flipped kernel, so the upright kernel is the correct input
                    end
                    net.layers{l}.d{i} = z;
            end
        end
    end
    
    for l = 2 : n
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                for i = 1 : numel(net.layers{l - 1}.a)
                        net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);
                        % dk{i}{j} has the same size as k{i}{j}; flipall flips the array along all three dimensions (the third flip does not affect the result)
                        % <28 x 28 x 50> convolved with <24 x 24 x 50> gives <5 x 5>, summed over the 50 samples, hence the division by the sample count
                        % the textbook formula uses rot180(d{j}) so that convn does not flip d{j};
                        % since the kernels in this network are stored flipped, rot180 would then flip dk{i}{j} again to keep the same convention
                        % here d{j} is instead kept as-is and a{i} is flipped via flipall; convn then flips d{j},
                        % so d{j} and a{i} share the same flipped orientation and dk{i}{j} comes out flipped, matching k{i}{j}
                end
                net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);
                % <1 x 1>: gradient of the bias of the j-th feature map of layer l
                % db{j} sums all entries of d{j}, then divides by the 50 samples
            end
        end
    end
    
    net.dffW = net.od * (net.fv)' / size(net.od, 2);
    % <10 x 192> ( <10 x 50> * <50 x 192> / 50 ), averaged over the batch
    net.dffb = mean(net.od, 2);
    % <10 x 1>, the mean output delta
    
    function X = rot180(X)
            X = flipdim(flipdim(X, 1), 2);      % flip rows then columns: a 180-degree rotation
    end
end 
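A dimension check on the dk line above, with random stand-in data and flipall written out as an anonymous function (the toolbox helper is assumed to flip all three dimensions, as the comment says):

    a = rand(28, 28, 50);       % stand-in for layer l-1 activations
    d = rand(24, 24, 50);       % stand-in for layer l deltas
    flipall = @(x) flipdim(flipdim(flipdim(x, 1), 2), 3);   % flip all three dims
    dk = convn(flipall(a), d, 'valid') / size(d, 3);
    size(dk)                    % ans = [5 5]: 28-24+1 spatially, and 50-50+1 = 1
                                % in the sample dimension, which sums over the batch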

The cnnapplygrads function

function net = cnnapplygrads(net, opts)
    for l = 2 : numel(net.layers)       % update layer by layer
        if strcmp(net.layers{l}.type, 'c')
            for j = 1:numel(net.layers{l}.a)
                for ii = 1:numel(net.layers{l-1}.a)
                        net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - opts.alpha * net.layers{l}.dk{ii}{j};
                        % layers{2}.k{ii}{j} <5 x 5>: gradient-descent weight update
                end
                net.layers{l}.b{j} = net.layers{l}.b{j} - opts.alpha * net.layers{l}.db{j};
                % <1 x 1>, bias update
            end
        end
    end
    
    net.ffW = net.ffW - opts.alpha * net.dffW;
    % <10 x 192>, update the fully-connected weights
    net.ffb = net.ffb - opts.alpha * net.dffb;
    % <10 x 1>, update the fully-connected bias
end       

[er, bad] = cnntest(cnn, test_x, test_y);

The cnntest function

function [er, bad] = cnntest(net, x, y)

    net = cnnff(net, x);    % the forward pass produces net.o; labels are not needed
    [~, h] = max(net.o);
    % ~ discards the maximum values; h holds their row indices: net.o is <10 x 10000>, so h is <1 x 10000> of predicted classes
    [~, a] = max(y);
    % y is <10 x 10000>, so a is <1 x 10000> of true classes
    bad = find(h ~= a);
    % indices of the misclassified samples, e.g. <1 x 1113>

    er = numel(bad) / size(y, 2);
    % number of misclassified samples over the total, i.e. the error rate
end
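How the max calls turn net.o into class predictions (a toy sketch with 3 classes and 2 samples):

    o = [0.1 0.8;               % scores: 3 classes x 2 samples
         0.7 0.1;
         0.2 0.1];
    [~, h] = max(o)             % max works down each column: h = [2 1],
                                % so sample 1 is class 2 and sample 2 is class 1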

figure; plot(cnn.rL); cnn.rL is <1 x 1201>: one smoothed loss value per batch (plus the initial seed), so the plot shows the training curve.