backprop a per
TRANSCRIPT
% make sure that at least 3 layers have been specified and that the
% dimensions of the specified input layer and output layer are
% equivalent to the dimensions of the input vectors and desired output
if length(L) < 3
    error('backprop:invalidNetworkStructure', ...
        'The network must have at least 3 layers');
else
    if N ~= L(1)
        e = sprintf('Dimension of input (%d) does not match input layer (%d)',N,L(1));
        error('backprop:invalidLayerSize', e);
    elseif M ~= L(end)
        e = sprintf('Dimension of output (%d) does not match output layer (%d)',M,L(end));
        error('backprop:invalidLayerSize', e);
    end
end

%%%%% INITIALIZATION PHASE %%%%%
nLayers = length(L); % we'll use the number of layers often
% randomize the weight matrices (uniform random values in [-1 1]); there
% is a weight matrix between each layer of nodes. Each layer (excluding the
% output layer) has a bias node whose activation is always 1, that is, the
% node function is C(net) = 1. Furthermore, there is a link from each node
% in layer i to the bias node in layer j (the last row of each matrix)
% because it is less computationally expensive than the alternative. The
% weights of all links to bias nodes are irrelevant and are defined as 0
w = cell(nLayers-1,1); % a weight matrix between each layer
for i=1:nLayers-2
    w{i} = [1 - 2.*rand(L(i+1),L(i)+1) ; zeros(1,L(i)+1)];
end
w{end} = 1 - 2.*rand(L(end),L(end-1)+1);
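% (Illustrative example, assuming a structure L = [2 4 1] that is not part of
% the original listing: w{1} would be 5-by-3, i.e. 4 hidden nodes plus the
% zeroed bias row by 2 inputs plus the bias column, and w{end} would be 1-by-5.)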
% initialize stopping conditions
mse = Inf; % assume the initial weight matrices are bad
epochs = 0;

%%%%% PREALLOCATION PHASE %%%%%
% for faster computation, preallocate activation, net, prev_dw and sum_dw

% Activation: there is an activation matrix a{i} for each layer in the
% network such that a{1} = the network input and a{end} = network output.
% Since we're doing batch mode, each activation matrix a{i} is a
% P-by-K (P = num of samples, K = nodes at layer i) matrix such that
% a{i}(j,:) denotes the activation vector of layer i for the jth input and
% a{i}(j,k) is the activation (output) of the kth node in layer i for the jth
% input
a = cell(nLayers,1); % one activation matrix for each layer
a{1} = [X ones(P,1)]; % a{1} is the input + '1' for the bias node activation
% a{1} remains the same throughout the computation
for i=2:nLayers-1
    a{i} = ones(P,L(i)+1); % inner layers include a bias node (P-by-Nodes+1)
end
a{end} = ones(P,L(end)); % no bias node at output layer
% Net: like activation, there is a net matrix net{i} for each layer
% excluding the input, such that net{i} = sum(w(i,j) * a(j)) for j = i-1,
% and each net matrix net{i} is a P-by-K matrix such that net{i}(j,:) denotes
% the net vector at layer i for the jth sample and net{i}(j,k) denotes the
% net input at node k of the ith layer for the jth sample
net = cell(nLayers-1,1); % one net matrix for each layer excluding the input
for i=1:nLayers-2
    net{i} = ones(P,L(i+1)+1); % affix bias node
end
net{end} = ones(P,L(end));
% Since we're using batch mode and momentum, two additional cell arrays are
% needed: prev_dw holds the delta weight matrices at time (t-1) and sum_dw
% holds the sum of the delta weights over each presentation of the input.
% The notation here follows net and activation, except that prev_dw{i} and
% sum_dw{i} have the same dimensions as the weight matrix w{i}: prev_dw{i}
% is the delta weight matrix at layer i for time (t-1) and sum_dw{i} is
% the sum of the delta weights at layer i over all samples
prev_dw = cell(nLayers-1,1);
sum_dw = cell(nLayers-1,1);
for i=1:nLayers-1
    prev_dw{i} = zeros(size(w{i})); % prev_dw starts at 0
    sum_dw{i} = zeros(size(w{i}));
end
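% (Worked form of the batch update performed below, stated here for clarity:
%  for each weight matrix i, sum_dw{i} = n * (sum over samples of delta * a{i}),
%  then prev_dw{i} = sum_dw{i}/P + m * prev_dw{i} and w{i} = w{i} + prev_dw{i},
%  so prev_dw carries the momentum term between epochs.)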
% loop until computational bounds are exceeded or the network has converged
% to a satisfactory condition. We allow for 30000 epochs here; it may be
% necessary to increase or decrease this bound depending on the number of
% training samples
while mse > smse && epochs < 30000
    % FEEDFORWARD PHASE: calculate input/output of each layer for all samples
    for i=1:nLayers-1
        net{i} = a{i} * w{i}'; % compute inputs to current layer
        % compute activation (output) of current layer; for all layers
        % except the output, the last node is the bias node and
        % its activation is 1
        if i < nLayers-1 % inner layers
            a{i+1} = [2./(1+exp(-net{i}(:,1:end-1)))-1 ones(P,1)];
        else % output layer
            a{i+1} = 2 ./ (1 + exp(-net{i})) - 1;
        end
    end
    % calculate sum squared error of all samples
    err = (D-a{end}); % save this for later
    sse = sum(sum(err.^2)); % sum of the error for all samples, and all nodes
    % BACKPROPAGATION PHASE: calculate the modified error at the output layer:
    % S'(Output) * (D-Output), in this case S'(Output) = (1+Output)*(1-Output);
    % then, starting at the output layer, calculate the sum of the weight
    % matrices for all samples: LearningRate * ModifiedError * Activation;
    % then backpropagate the error such that the modified error for this
    % layer is: S'(Activation) * ModifiedError * weight matrix
    delta = err .* (1 + a{end}) .* (1 - a{end});
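    % (Note: for the bipolar node function S(net) = 2/(1+exp(-net)) - 1 the exact
    % derivative is (1+S)*(1-S)/2; the factor of 1/2 is omitted here and in the
    % per-sample listing below, which is equivalent to rescaling the learning rate.)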
    for i=nLayers-1:-1:1
        sum_dw{i} = n * delta' * a{i};
        if i > 1
            delta = (1+a{i}) .* (1-a{i}) .* (delta*w{i});
        end
    end
    % update prev_dw, the weight matrices, epoch count and mse
    for i=1:nLayers-1
        % we have the sum of the delta weights; divide through by the
        % number of samples, add momentum * delta weight at (t-1) and,
        % finally, update the weight matrices
        prev_dw{i} = (sum_dw{i} ./ P) + (m * prev_dw{i});
        w{i} = w{i} + prev_dw{i};
    end
    epochs = epochs + 1;
    mse = sse/(P*M); % mse = 1/P * 1/M * summed squared error
end
% return the trained network
Network.structure = L;
Network.weights = w;
Network.epochs = epochs;
Network.mse = mse;
========================================================
% backprop - per-period backpropagation training for a multilayer feedforward
% neural network.
% Network = backprop(Layers,N,M,SatisfactoryMSE,Input,Desired) returns
% Network, a structure of the form Network.structure = Layers
% and Network.weights, where weights is a cell array specifying the final
% weight matrices computed by minimizing the mean squared error between
% the Desired output and the actual output of the network given a set of
% training samples: Input and the SatisfactoryMSE (satisfactory mean
% squared error)
%
% Input:
%   Layers - a vector of integers specifying the number of nodes at each
%     layer, i.e. for all i, Layers(i) = number of nodes at layer i; there
%     must be at least three layers and the input layer Layers(1) must
%     equal the dimension of each vector in Input; likewise, Layers(end)
%     must be equal to the dimension of each vector in Desired
%   N - training rate for network learning (0.1 - 0.9)
%   M - momentum for the weight update rule [0.1 - 0.9)
%   SatisfactoryMSE - the mse at which to terminate computation
%   Input - the training samples, a P-by-N matrix, where each Input[p] is
%     a training vector
%   Desired - the desired outputs, a P-by-M matrix where each Desired[p]
%     is the desired output for the corresponding input Input[p]
%
% This algorithm uses the hyperbolic tangent node function
% 2/(1+e^(-net)) - 1, for use with bipolar data
%
% NOTE: due to its generality this algorithm is not as efficient as one
% designed for a specific problem. If the number of desired layers is
% known ahead of time, it is better to a) 'unfold' the loops inside the
% loop presenting the data, that is, calculate the input and output of each
% layer explicitly one by one and subsequently the modified error and weight
% matrix modifications, and b) remove momentum and training rate as parameters
% if they are known
%
% Author: Dale Patterson
% $Version: 2.2.1 $ $Date: 2.25.06 $
%
function Network = backprop(L,n,m,smse,X,D)
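% (Illustrative sketch of the 'unfolding' mentioned in the NOTE above, for a
% hypothetical fixed three-layer network; the weight shapes here drop the
% zeroed bias rows and this snippet is not part of the original function:
%   net1 = w1 * [x; 1];             % w1 is Hidden-by-(Inputs+1)
%   a1   = 2./(1+exp(-net1)) - 1;   % hidden activations
%   net2 = w2 * [a1; 1];            % w2 is Outputs-by-(Hidden+1)
%   y    = 2./(1+exp(-net2)) - 1;   % network output
% )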
% determine the number of input samples and desired outputs, and their dimensions
[P,N] = size(X);
[Pd,M] = size(D);

% make sure that each input vector has a corresponding desired output
if P ~= Pd
    error('backprop:invalidTrainingAndDesired', ...
        'The number of input vectors and desired outputs do not match');
end
% make sure that at least 3 layers have been specified and that the
% dimensions of the specified input layer and output layer are
% equivalent to the dimensions of the input vectors and desired output
if length(L) < 3
    error('backprop:invalidNetworkStructure', ...
        'The network must have at least 3 layers');
elseif N ~= L(1)
    e = sprintf('Dimension of input (%d) does not match input layer (%d)',N,L(1));
    error('backprop:invalidLayerSize', e);
elseif M ~= L(end)
    e = sprintf('Dimension of output (%d) does not match output layer (%d)',M,L(end));
    error('backprop:invalidLayerSize', e);
end
% will use the number of layers often, so save the number here
nLayers = length(L);
% randomize the weight matrices (uniform random values in [-.5 .5]); there
% is a weight matrix between each layer of nodes. Each layer (excluding the
% output layer) has a bias node whose activation is always 1, that is, the
% node function is C(net) = 1. Furthermore, there is a link from each node
% in layer i to the bias node in layer j (the last row of each matrix)
% because this is less computationally expensive than the alternative.
% NOTE: below, the weights of all links to bias nodes are defined as
% zero
w = cell(nLayers-1,1); % a weight matrix between each layer
for i=1:nLayers-2
    w{i} = [.5 - rand(L(i+1),L(i)+1) ; zeros(1,L(i)+1)];
end
w{end} = .5 - rand(L(end),L(end-1)+1);
X = [X ones(P,1)]; % affix the column of bias activations to the input layer
% preallocate activation, net vectors and delta weight matrices for faster
% computation
% activation vectors; all but the output layer include a bias activation
a = cell(nLayers,1);
for i=1:nLayers-1
    a{i} = ones(L(i)+1,1);
end
a{end} = ones(L(end),1);
% net vectors, one for each node in that layer, but there is
% no net for the input layer
net = cell(nLayers-1,1);
for i=1:nLayers-2
    net{i} = ones(L(i+1)+1,1);
end
net{end} = ones(L(end),1);
% delta weight matrices
dw = cell(nLayers-1,1);
for i=1:nLayers-1
    dw{i} = zeros(size(w{i}));
end
% initialize stopping conditions
mse = Inf; % assume the initial weight matrices are bad
presentations = 0; % we'll measure by presentation instead of epoch
% loop until computational bounds are exceeded or the network has converged
% to a satisfactory condition. We allow for 10000 epochs' worth of
% presentations; it may be necessary to reduce this if the number of
% training samples is large
while mse > smse && presentations < P * 10000
    sse = 0; % running total of squared error
    for p=1:P
        % get the current input vector and desired output
        a{1} = X(p,:)';
        Dp = D(p,:)';
        % compute the inputs and outputs of each layer
        for i=1:nLayers-1
            % compute inputs to this layer
            net{i} = w{i} * a{i};
            % compute outputs of this layer
            % for all layers but the output layer, the last node is the
            % bias node and its activation is 1
            if i < nLayers-1
                a{i+1} = [2./(1+exp(-net{i}(1:end-1)))-1 ; 1];
            else
                a{i+1} = 2./(1+exp(-net{i})) - 1;
            end
        end
        % accumulate the squared error
        sse = sse + sum((Dp-a{end}).^2);
        % calculate the modified error at each layer and update the weight
        % matrices accordingly. First calculate delta, the modified error
        % for the output nodes (S'(Output[net]) * (Dp-Output[Activation])),
        % then for each weight matrix, add n * delta * activation and
        % propagate delta to the previous layer
        delta = (Dp-a{end}) .* (1+a{end}) .* (1-a{end});
        for i=nLayers-1:-1:1
            dw{i} = n * delta * a{i}' + (m .* dw{i});
            w{i} = w{i} + dw{i};
            if i > 1 % don't compute modified error for the input layer
                delta = (1+a{i}).*(1-a{i}).*(delta'*w{i})';
            end
        end
    end
    presentations = presentations + P;
    mse = sse/(P*M); % mse = 1/P * 1/M * summed squared error
end
% return the trained network
Network.structure = L;
Network.weights = w;
Network.mse = mse;
Network.presentations = presentations;
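A minimal call sketch follows; the bipolar XOR data, layer sizes, rates and target MSE below are illustrative assumptions, not taken from the listing above:

X = [-1 -1; -1 1; 1 -1; 1 1];       % four bipolar training vectors (P-by-N)
D = [-1; 1; 1; -1];                 % desired bipolar outputs (P-by-M)
Network = backprop([2 4 1], 0.5, 0.5, 0.01, X, D); % 2-4-1 structure
disp(Network.mse)                   % final mean squared error reached
disp(Network.presentations)         % number of sample presentations used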