Convolutional Neural Network Code Walkthrough
A commentary on the CNN example, with reference to MATLAB's DeepLearnToolbox.
CNN Example
load mnist_uint8
This provides train_x <60000 x 784>, test_x <10000 x 784>, train_y <60000 x 10> and test_y <10000 x 10>.
train_x = double( reshape( train_x', 28, 28, 60000)) / 255;
This reshapes train_x into a three-dimensional array. reshape() fills elements column by column, while each train_x sample is stored as a row vector, so the matrix must be transposed first. The other arrays are converted the same way:
test_x = double(reshape(test_x', 28, 28, 10000)) / 255;
train_y = double(train_y');
test_y = double(test_y');
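A minimal sketch (not part of the toolbox) of why the transpose matters, using a hypothetical pair of 2 x 2 images in place of the 28 x 28 digits:
rows = [1 2 3 4; 5 6 7 8];       % 2 samples, each row a flattened 2 x 2 image
imgs = reshape(rows', 2, 2, 2);  % transpose first, because reshape fills column by column
imgs(:, :, 1)                    % [1 3; 2 4] -- the first sample recovered as a matrix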
rand('state',0)
Seeds the legacy random number generator so the weight initialization below is reproducible (newer MATLAB code would use rng(0)).
cnn.layers = {
struct('type', 'i') //input layer
struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) //convolution layer
struct('type', 's', 'scale', 2) //sub-sampling layer
struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) //convolution layer
struct('type', 's', 'scale', 2) //sub-sampling layer
};
//Each layer can be inspected via cnn.layers{i}, e.g. cnn.layers{2}: type: 'c', outputmaps: 6, kernelsize: 5
//In this network only the convolution layers have an activation function, so the activation derivative of the sub-sampling layers is 1
// The subsampling layers have no learned kernels, so only the convolution layers (and the output layer) need weight updates; the resulting map sizes are traced in the sketch below
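Tracing the feature-map size through these layers confirms the shapes quoted throughout this walkthrough (a quick standalone sketch):
mapsize = [28 28];          % 'i'  input layer
mapsize = mapsize - 5 + 1;  % 'c'  kernelsize 5 -> 24 x 24
mapsize = mapsize / 2;      % 's'  scale 2      -> 12 x 12
mapsize = mapsize - 5 + 1;  % 'c'  kernelsize 5 ->  8 x  8
mapsize = mapsize / 2;      % 's'  scale 2      ->  4 x  4
fvnum = prod(mapsize) * 12  % 4 * 4 * 12 = 192 inputs to the output layer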
cnn = cnnsetup(cnn, train_x, train_y);
The cnnsetup function
function net = cnnsetup(net, x, y)
inputmaps = 1;
mapsize = size(squeeze(x(:, :, 1)));
//computes the image size of train_x: mapsize = [28 28]
//squeeze removes singleton dimensions (it has no effect on a 2-D matrix); x(:, :, 1) is 28 x 28 x 1, so squeeze(x(:, :, 1)) returns 28 x 28
for l = 1 : numel(net.layers) //numel returns the number of elements in the cell array, i.e. 5 layers
if strcmp( net.layers{l}.type, 's') //subsampling layer
mapsize = mapsize / net.layers{l}.scale; //net.layers{3}.scale = 2, so mapsize is halved
// after layer 3, mapsize is <12 x 12>; after layer 5, <4 x 4>
for j = 1 : inputmaps
net.layers{l}.b{j} = 0;
//sets the biases to zero: layers{3}.b{1..6} = 0 and layers{5}.b{1..12} = 0
end
end
if strcmp( net.layers{l}.type, 'c' )
mapsize = mapsize - net.layers{l}.kernelsize + 1; //the convolution stride is 1
fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;
// fan-out of the convolution layer: layers{2}.outputmaps = 6, layers{4}.outputmaps = 12, kernelsize = 5
for j = 1 : net.layers{l}.outputmaps
fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
// fan-in of the convolution layer, kernelsize = 5
for i = 1 : inputmaps
net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));
//initializes the kernel; layers{l}.k{i}{j} is <5 x 5> (the initialization range is sketched after this function)
//k{i}{j} connects the i-th feature map of layer l-1 to the j-th feature map of layer l; there are inputmaps * outputmaps kernels in total
// the kernels are stored flipped (reversed), so convn can be used without first applying rot180
// a CNN "convolution" is really cross-correlation (no kernel flip); a true convolution would flip the kernel
end
net.layers{l}.b{j} = 0; //initializes the bias; there is one bias per output map
end
inputmaps = net.layers{l}.outputmaps;
//the number of output maps of this layer becomes the number of input maps of the next
end
end
fvnum = prod(mapsize) * inputmaps;
//prod multiplies the array elements, giving the input size of the fully connected layer: fvnum = 4 * 4 * 12 = 192
onum = size(y, 1);
// number of classes in train_y (already transposed), i.e. the output size of the fully connected layer: onum = 10
net.ffb = zeros(onum, 1);
// <10 x 1>, the bias added at the output of the fully connected layer
net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
// <10 x 192>, weight initialization
end
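Both the kernel and ffW initializations draw uniformly from [-r, r] with r = sqrt(6 / (fan_in + fan_out)), the normalized ("Xavier") initialization. A standalone sketch for the first convolution layer:
fan_in  = 1 * 5 ^ 2;               % inputmaps * kernelsize^2 = 25
fan_out = 6 * 5 ^ 2;               % outputmaps * kernelsize^2 = 150
r = sqrt(6 / (fan_in + fan_out));  % half-width of the uniform range
k = (rand(5) - 0.5) * 2 * r;       % a 5 x 5 kernel with entries in [-r, r]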
opts.alpha = 1;
opts.batchsize = 50;
opts.numepochs = 1;
These set the training options: learning rate, mini-batch size and number of epochs.
cnn = cnntrain(cnn, train_x, train_y, opts);
Train the network.
The cnntrain function
function net = cnntrain( net , x , y , opts)
m = size(x, 3); //number of samples, m = 60000
numbatches = m / opts.batchsize; //number of batches: batchsize = 50, so numbatches = 1200
if rem(numbatches, 1) ~= 0
error('numbatches not integer');
end
// ensure the number of batches is an integer
net.rL = []; //running (smoothed) loss values
for i = 1 : opts.numepochs
disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)]); //print the current epoch; here there is only one
tic;
kk = randperm(m); //random permutation of 1..m so samples are visited in shuffled order, e.g. [3 1 2 5 4]
for l = 1 : numbatches
batch_x = x(:, :, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
// kk is the shuffled index sequence; each batch holds 50 samples, so batch_x is <28 x 28 x 50>, taking positions 1-50, 51-100, ...
batch_y = y(:, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
// batch_y is <10 x 50>
net = cnnff( net, batch_x); //forward pass; runs 1200 times per epoch, repeated for each epoch
net = cnnbp( net, batch_y); //backward pass
net = cnnapplygrads( net, opts); //apply the gradient updates
if isempty(net.rL)
net.rL(1) = net.L; //initial loss value
end
net.rL(end + 1) = 0.99 * net.rL(end) + 0.01 * net.L;
//exponential moving average of the loss; one update per batch, so 1200 updates in total (sketched after this function)
end
toc;
end
end
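The batching logic in isolation (a sketch reusing the names above): randperm shuffles the sample indices once per epoch and each batch takes a contiguous slice of the shuffled order, while net.rL keeps an exponential moving average of the batch loss with decay 0.99.
m = 60000; batchsize = 50;
kk = randperm(m);                                       % shuffled sample order
for l = 1 : m / batchsize
    idx = kk((l - 1) * batchsize + 1 : l * batchsize);  % 50 indices per batch
    % batch_x = x(:, :, idx); batch_y = y(:, idx);
end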
The cnnff function
function net = cnnff( net, x)
n = numel( net.layers ); //number of layers, n = 5
net.layers{1}.a{1} = x; //a{1}=batch_x <28 x 28 x 50>
inputmaps = 1;
for l = 2 : n
if strcmp( net.layers{l}.type, 'c')
for j = 1 : net.layers{l}.outputmaps //each map j of layer l sums contributions from all input maps i of layer l-1
z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);
// net.layers{1}.a{1}<28 x 28 x 50> - [4 4 0]
//net.layers{2}.a{j} is <24 x 24 x 50>, holding all 50 samples; there are 6 such maps
for i = 1 : inputmaps
z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
//z accumulates the convolutions over all input maps i of layer l-1; after the activation it becomes the j-th feature map of layer l
//convn leaves its first argument unflipped and flips the second argument (the kernel)
//since the stored kernel is already flipped, convn's internal flip restores it, so the net effect is a cross-correlation
end
net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});
// layers{l}.b{j} is a scalar, added to every element of z
end
inputmaps = net.layers{l}.outputmaps;
//the output map count of this layer becomes the input map count of the next
elseif strcmp( net.layers{l}.type, 's')
for j = 1 : inputmaps // 6 or 12 maps
z = convn( net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid');
// layer{3}.a{j}<12 x 12 x 50>, layer{5}.a{j}<4 x 4 x 50>
// mean-pooling: the pixels in each patch are summed and divided by scale^2
// convn with an all-ones kernel implements the patch sum; flipping an all-ones kernel has no effect
net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);
// sample z at stride scale, which downsamples the map and conveniently skips the overlapping windows convn also computed (see the sketch after this function)
end
end
end
net.fv = [ ];
for j = 1 : numel( net.layers{n}.a ) //numel( layer{5}.a ) = 12
sa = size( net.layers{n}.a{j} ); // layer{5}.a{i} <4 x 4 x 50>
net.fv = [ net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3)) ];
// each reshape yields <16 x 50>; appending all 12 maps to fv gives <192 x 50>
//for a single sample, the 12 <4 x 4> output maps unroll to <192 x 1>; with 50 samples this becomes <192 x 50>
end
net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
//repmat tiles a matrix: ffb <10 x 1> is expanded to <10 x 50>
//<10 x 50> = sigm(<10 x 192> * <192 x 50> + <10 x 50>); one sample gives <10 x 1>, 50 samples give <10 x 50>
end
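The pooling trick is worth isolating: convolving with ones(scale) / scale^2 gives the mean of every scale x scale window, and the strided indexing keeps only the non-overlapping windows. A minimal 2-D sketch:
a = magic(4);                        % a 4 x 4 'feature map'
z = convn(a, ones(2) / 4, 'valid');  % 3 x 3: the mean of every 2 x 2 window
pooled = z(1 : 2 : end, 1 : 2 : end) % 2 x 2; pooled(1,1) == mean(mean(a(1:2, 1:2)))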
The cnnbp function
function net = cnnbp( net, y)
n = numel(net.layers); //number of layers, n = 5
net.e = net.o - y; // <10 x 50>, the error for each of the 50 samples
net.L = 1/2 * sum( net.e(:) .^2 ) / size(net.e, 2);
// net.e(:) unrolls net.e column-wise into <500 x 1>; summing the squares and dividing by the batch size gives the mean squared-error loss
net.od = net.e .* ( net.o .* (1 - net.o)); //derivative of the sigmoid: f(x) * (1 - f(x))
// od is the output delta, <10 x 50>
net.fvd = (net.ffW' * net.od);
//delta of the fully connected layer: <192 x 50> = <192 x 10> * <10 x 50>
//each column is one sample's error; one sample backpropagates to <192 x 1>, 50 samples to <192 x 50>
if strcmp(net.layers{n}.type, 'c')
net.fvd = net.fvd .* (net.fv .* (1 - net.fv)); //only if the last layer is convolutional; layer 5 is a subsampling layer with no activation, so this branch is skipped here
end
sa = size(net.layers{n}.a{1}); //<4 x 4 x 50>
fvnum = sa(1) * sa(2); //16, the number of pixels per output map
for j = 1 : numel(net.layers{n}.a) //12 maps
net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
// reshapes the feature-vector delta (the fully connected layer's error) back into the 12 <4 x 4 x 50> maps of the subsampling layer
end
for l = (n - 1) : -1 : 1
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
// net.layers{l}.d{j} is the delta of the j-th feature map of layer l, here <8 x 8 x 50> for layer 4
// layers{5}.d{j} <4 x 4 x 50> is upsampled by the expand function to <8 x 8 x 50> (see the sketch after this function)
//only the convolutional layers have an activation function, so the sigmoid derivative a .* (1 - a) appears here
end
elseif strcmp( net.layers{l}.type, 's' )
for i = 1 : numel( net.layers{l}.a ) // 1..6; the i-th delta of layer l is the sum of convolutions with all j deltas of layer l+1
z = zeros(size(net.layers{l}.a{1})); // layers{3}.a{1}<12 x 12 x 50>
for j = 1 : numel(net.layers{l + 1}.a)
z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');
//'full' convolution grows the map: 8 + 5 - 1 = 12, giving <12 x 12 x 50>
//layers{l+1}.k{i}{j} is the kernel connecting the i-th feature map of layer l to the j-th feature map of layer l+1
//to propagate from layer l+1 back to layer l, the convolutions with all j delta maps are accumulated into the i-th delta map
// rot180(kernel) flips the kernel; since the stored kernel is already flipped, this yields the upright kernel
// convn then flips it again internally, which is exactly the flipped kernel the backward 'full' convolution requires
end
net.layers{l}.d{i} = z;
end
end
end
for l = 2 : n
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
for i = 1 : numel(net.layers{l - 1}.a)
net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);
// dk{i}{j} has the same size as k{i}{j}; flipall flips the array along all three dimensions (the flip along the third, sample, dimension does not change the summed result)
// convolving <28 x 28 x 50> with <24 x 24 x 50> ('valid') gives <5 x 5 x 1>, accumulated over all 50 samples, hence the division by the batch size
// the textbook formula would rot180 d{j} (so convn does not flip it) and then rot180 the resulting dk{i}{j} to keep it in the same flipped form as the kernels
// here d{j} is left as-is and a{i} is flipped via flipall instead: convn flips d{j} internally, so d{j} and a{i} end up in the same flipped orientation and dk{i}{j} comes out flipped, matching k{i}{j}
end
net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);
//scalar: the bias gradient for the j-th feature map of layer l
// db{j} sums all entries of d{j} and averages over the 50 samples
end
end
end
net.dffW = net.od * (net.fv)' / size(net.od, 2);
// <10 x 192> = <10 x 50> * <50 x 192> / 50, i.e. averaged over the batch
net.dffb = mean(net.od, 2);
//<10 x 1>, the mean over the batch
function X = rot180(X)
X = flipdim(flipdim(X, 1), 2);
end
end
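The expand call (from the toolbox's util/expand.m) replicates each pooled delta over its scale x scale window, and dividing by scale^2 matches the averaging done in the forward pass. For a single 2-D slice, the same effect can be sketched with kron:
d = [1 2; 3 4];             % a 2 x 2 delta map from an 's' layer
up = kron(d, ones(2)) / 4;  % replicate to 4 x 4 and split each gradient evenly
% up = [0.25 0.25 0.50 0.50
%       0.25 0.25 0.50 0.50
%       0.75 0.75 1.00 1.00
%       0.75 0.75 1.00 1.00]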
The cnnapplygrads function
function net = cnnapplygrads(net, opts)
for l = 2 : numel(net.layers) // update layer by layer
if strcmp(net.layers{l}.type, 'c')
for j = 1:numel(net.layers{l}.a)
for ii = 1:numel(net.layers{l-1}.a)
net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - opts.alpha * net.layers{l}.dk{ii}{j};
// layers{l}.k{ii}{j} <5 x 5>: gradient-descent weight update
end
net.layers{l}.b{j} = net.layers{l}.b{j} - opts.alpha * net.layers{l}.db{j};
// scalar, update the bias
end
end
end
net.ffW = net.ffW - opts.alpha*net.dffW;
// <10 x 192>, update the fully connected weights
net.ffb = net.ffb - opts.alpha*net.dffb;
// <10 x 1>, update the fully connected bias
end
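Every update above is plain gradient descent with a fixed learning rate, theta = theta - alpha * dtheta. A toy sketch with shapes matching net.ffW (the variable names here are illustrative):
alpha = 1;                  % opts.alpha in this example
W  = rand(10, 192);         % a parameter matrix, e.g. net.ffW
dW = 0.01 * rand(10, 192);  % its batch-averaged gradient, e.g. net.dffW
W  = W - alpha * dW;        % the same rule updates k{i}{j}, b{j}, ffW and ffb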
[er, bad] = cnntest(cnn, test_x, test_y);
The cnntest function
function [er, bad] = cnntest(net, x, y)
net = cnnff(net, x); //a forward pass produces net.o; the labels are not needed here
[~, h] = max(net.o);
// the max values themselves are discarded (~); h holds their positions: net.o is <10 x 10000>, so h is <1 x 10000> of predicted class indices
[~, a] = max(y);
// y is <10 x 10000>, so a is <1 x 10000> of true class indices
bad = find(h ~= a);
// indices of the misclassified samples, e.g. <1 x 1113>
er = numel(bad) / size(y,2);
// number of misclassified samples divided by the total, i.e. the error rate
end
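A toy example of the max-based decoding: the second output of max over dimension 1 is the index of the winning class, for both the network outputs and the one-hot labels.
o = [0.9 0.1; 0.05 0.8; 0.05 0.1];     % outputs: 3 classes x 2 samples
y = [1 0; 0 0; 0 1];                   % one-hot labels
[~, h] = max(o);                       % predicted classes: [1 2]
[~, a] = max(y);                       % true classes:      [1 3]
er = numel(find(h ~= a)) / size(y, 2)  % error rate: 1/2 = 0.5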
figure; plot(cnn.rL);
cnn.rL is <1 x 1201>: the initial loss plus one smoothed loss value per batch (numbatches = 1200 of them). Plotting it shows the smoothed training-loss curve.