backprop a per



% make sure that at least 3 layers have been specified and that the
% dimensions of the specified input layer and output layer are
% equivalent to the dimensions of the input vectors and desired output
if length(L) < 3
    error('backprop:invalidNetworkStructure','The network must have at least 3 layers');
else
    if N ~= L(1)
        e = sprintf('Dimensions of input (%d) does not match input layer (%d)',N,L(1));
        error('backprop:invalidLayerSize', e);
    elseif M ~= L(end)
        e = sprintf('Dimensions of output (%d) does not match output layer (%d)',M,L(end));
        error('backprop:invalidLayerSize', e);
    end
end

%%%%% INITIALIZATION PHASE %%%%%
nLayers = length(L); % we'll use the number of layers often

% randomize the weight matrices (uniform random values in [-1 1]); there
% is a weight matrix between each layer of nodes. Each layer (excluding the
% output layer) has a bias node whose activation is always 1, that is, the
% node function is C(net) = 1. Furthermore, there is a link from each node
% in layer i to the bias node in layer j (the last row of each matrix)
% because it is less computationally expensive than the alternative. The
% weights of all links to bias nodes are irrelevant and are defined as 0
w = cell(nLayers-1,1); % a weight matrix between each layer
for i=1:nLayers-2
    w{i} = [1 - 2.*rand(L(i+1),L(i)+1) ; zeros(1,L(i)+1)];
end
w{end} = 1 - 2.*rand(L(end),L(end-1)+1);

% initialize stopping conditions
mse = Inf; % assuming the initial weight matrices are bad
epochs = 0;

%%%%% PREALLOCATION PHASE %%%%%
% for faster computation preallocate activation, net, prev_dw and sum_dw

% Activation: there is an activation matrix a{i} for each layer in the
% network such that a{1} = the network input and a{end} = the network output.
% Since we're doing batch mode, each activation matrix a{i} is a
% P-by-K (P = num of samples, K = nodes at layer i) matrix such that
% a{i}(j,:) denotes the activation vector of layer i for the jth input and
% a{i}(j,k) is the activation (output) of the kth node in layer i for the jth
% input
a = cell(nLayers,1); % one activation matrix for each layer
a{1} = [X ones(P,1)]; % a{1} is the input + '1' for the bias node activation

% a{1} remains the same throughout the computation
for i=2:nLayers-1
    a{i} = ones(P,L(i)+1); % inner layers include a bias node (P-by-Nodes+1)
end
a{end} = ones(P,L(end)); % no bias node at output layer

% Net: like activation, there is a net matrix net{i} for each layer
% excluding the input, such that net{i} = sum(w(i,j) * a(j)) for j = i-1,
% and each net matrix net{i} is a P-by-K matrix such that net{i}(j,:) denotes
% the net vector at layer i for the jth sample and net{i}(j,k) denotes
% the net input at node k of the ith layer for the jth sample
net = cell(nLayers-1,1); % one net matrix for each layer, excluding the input
for i=1:nLayers-2
    net{i} = ones(P,L(i+1)+1); % affix bias node
end
net{end} = ones(P,L(end));

% Since we're using batch mode and momentum, two additional matrices are
% needed: prev_dw is the delta weight matrices at time (t-1) and sum_dw
% is the sum of the delta weights for each presentation of the input;
% the notation here is the same as net and activation, that is, prev_dw{i}
% is a P-by-K matrix where prev_dw{i} is the delta weight matrix for all samples
% at time (t-1) and sum_dw{i} is a P-by-K matrix where sum_dw{i} is the
% sum of the weight matrix at layer i for all samples
prev_dw = cell(nLayers-1,1);
sum_dw = cell(nLayers-1,1);
for i=1:nLayers-1
    prev_dw{i} = zeros(size(w{i})); % prev_dw starts at 0
    sum_dw{i} = zeros(size(w{i}));
end

% loop until computational bounds are exceeded or the network has converged
% to a satisfactory condition. We allow for 30000 epochs here; it may be
% necessary to increase or decrease this bound depending on the number of
% training samples
while mse > smse && epochs < 30000

    % FEEDFORWARD PHASE: calculate the input/output of each layer for all samples
    for i=1:nLayers-1
        net{i} = a{i} * w{i}'; % compute inputs to current layer

        % compute the activation (output) of the current layer; for all layers
        % except the output, the last node is the bias node and
        % its activation is 1
        if i < nLayers-1 % inner layers
            a{i+1} = [2./(1+exp(-net{i}(:,1:end-1)))-1 ones(P,1)];
        else % output layer
            a{i+1} = 2 ./ (1 + exp(-net{i})) - 1;
        end
    end

    % calculate the sum squared error of all samples
    err = (D-a{end}); % save this for later
    sse = sum(sum(err.^2)); % sum of the squared error for all samples and all nodes

    % BACKPROPAGATION PHASE: calculate the modified error at the output layer:
    % S'(Output) * (D-Output); in this case S'(Output) = (1+Output)*(1-Output).
    % Then, starting at the output layer, calculate the sum of the weight
    % matrices for all samples: LearningRate * ModifiedError * Activation,
    % then backpropagate the error such that the modified error for this
    % layer is: S'(Activation) * ModifiedError * weight matrix
    delta = err .* (1 + a{end}) .* (1 - a{end});


    for i=nLayers-1:-1:1
        sum_dw{i} = n * delta' * a{i};
        if i > 1
            delta = (1+a{i}) .* (1-a{i}) .* (delta*w{i});
        end
    end

    % update prev_dw, the weight matrices, the epoch count and the mse
    for i=1:nLayers-1
        % we have the sum of the delta weights; divide through by the
        % number of samples and add momentum * delta weight at (t-1);
        % finally, update the weight matrices
        prev_dw{i} = (sum_dw{i} ./ P) + (m * prev_dw{i});
        w{i} = w{i} + prev_dw{i};
    end
    epochs = epochs + 1;
    mse = sse/(P*M); % mse = 1/P * 1/M * summed squared error
end

% return the trained network
Network.structure = L;
Network.weights = w;
Network.epochs = epochs;
Network.mse = mse;
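
The listing above trains the network but does not show how the returned structure is used afterwards. Below is a minimal sketch of a forward pass through the trained weights, mirroring the feedforward phase above; the helper name feedforward and its signature are assumptions for illustration, not part of the original code.

function Y = feedforward(Network, X)
% FEEDFORWARD (assumed helper) computes network outputs for inputs X (P-by-N)
% using the weight matrices returned by backprop, mirroring the batch
% feedforward phase above
    w = Network.weights;
    nLayers = length(Network.structure);
    P = size(X,1);
    a = [X ones(P,1)]; % affix the bias activation to the input
    for i=1:nLayers-1
        net = a * w{i}'; % net input to layer i+1
        if i < nLayers-1 % hidden layers keep a bias column of 1s
            a = [2./(1+exp(-net(:,1:end-1)))-1 ones(P,1)];
        else % output layer has no bias node
            a = 2./(1+exp(-net)) - 1;
        end
    end
    Y = a;
end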

    ========================================================


% backprop a per-period backpropagation training for a multilayer feedforward
% neural network.
% Network = backprop(Layers,N,M,SatisfactoryMSE,Input,Desired) returns
% Network, a two field structure of the form Network.structure = Layers
% and Network.weights, where weights is a cell array specifying the final
% weight matrices computed by minimizing the mean squared error between
% the Desired output and the actual output of the network given a set of
% training samples: Input and the SatisfactoryMSE (satisfactory mean
% squared error)
%
% Input:
% Layers - a vector of integers specifying the number of nodes at each
%   layer, i.e. for all i, Layers(i) = number of nodes at layer i; there
%   must be at least three layers and the input layer Layers(1) must
%   equal the dimension of each vector in Input; likewise, Layers(end)
%   must be equal to the dimension of each vector in Desired
% N - training rate for network learning (0.1 - 0.9)
% M - momentum for the weight update rule [0.1 - 0.9)
% SatisfactoryMSE - the mse at which to terminate computation
% Input - the training samples, a P-by-N matrix, where each Input[p] is
%   a training vector
% Desired - the desired outputs, a P-by-M matrix where each Desired[p]
%   is the desired output for the corresponding input Input[p]
%
% This algorithm uses the hyperbolic tangent node function
% 2/(1+e^(-net)) - 1, for use with bipolar data
%
% NOTE: due to its generality this algorithm is not as efficient as one
% designed for a specific problem. If the number of desired layers is
% known ahead of time, it is better to a) 'unfold' the loops inside the
% loop presenting the data, that is, calculate the input and output of each
% layer explicitly one by one and subsequently the modified error and weight
% matrix modifications, and b) remove momentum and training rate as
% parameters if they are known
%
% Author: Dale Patterson
% $Version: 2.2.1 $ $Date: 2.25.06 $
%
function Network = backprop(L,n,m,smse,X,D)

% determine the number of input samples, desired outputs and their dimensions
[P,N] = size(X);
[Pd,M] = size(D);

% make sure that each input vector has a corresponding desired output
if P ~= Pd
    error('backprop:invalidTrainingAndDesired', ...
        'The number of input vectors and desired outputs do not match');
end

% make sure that at least 3 layers have been specified and that the
% dimensions of the specified input layer and output layer are
% equivalent to the dimensions of the input vectors and desired output
if length(L) < 3
    error('backprop:invalidNetworkStructure','The network must have at least 3 layers');
else
    if N ~= L(1)
        e = sprintf('Dimensions of input (%d) does not match input layer (%d)',N,L(1));
        error('backprop:invalidLayerSize', e);
    elseif M ~= L(end)
        e = sprintf('Dimensions of output (%d) does not match output layer (%d)',M,L(end));
        error('backprop:invalidLayerSize', e);
    end
end

% will use the number of layers often, so save the number here
nLayers = length(L);

% randomize the weight matrices (uniform random values in [-.5 .5]); there
% is a weight matrix between each layer of nodes. Each layer (excluding the
% output layer) has a bias node whose activation is always 1, that is, the
% node function is C(net) = 1. Furthermore, there is a link from each node
% in layer i to the bias node in layer j (the last row of each matrix)
% because this is less computationally expensive than the alternative.
% NOTE below that the weights of all links to bias nodes are defined as
% zero
w = cell(nLayers-1,1); % a weight matrix between each layer
for i=1:nLayers-2
    w{i} = [.5 - rand(L(i+1),L(i)+1) ; zeros(1,L(i)+1)];
end
w{end} = .5 - rand(L(end),L(end-1)+1);

X = [X ones(P,1)]; % affix the column of bias activations to the input layer

% preallocate activation, net vectors and delta weight matrices for faster
% computation
% activation vectors, all but the output layer include a bias activation
a = cell(nLayers,1);
for i=1:nLayers-1
    a{i} = ones(L(i)+1,1);
end
a{end} = ones(L(end),1);

% net vectors, one for each node in that layer, but there is
% no net for the input layer
net = cell(nLayers-1,1);
for i=1:nLayers-2
    net{i} = ones(L(i+1)+1,1);
end
net{end} = ones(L(end),1);

% delta weight matrices
dw = cell(nLayers-1,1);
for i=1:nLayers-1
    dw{i} = zeros(size(w{i}));
end

% initialize stopping conditions
mse = Inf; % assuming the initial weight matrices are bad
presentations = 0; % we'll measure by epoch instead of presentation


% loop until computational bounds are exceeded or the network has converged
% to a satisfactory condition. We allow for 10000 epochs (P * 10000
% presentations); it may be necessary to reduce this if the number of
% training samples is large
while mse > smse && presentations < P * 10000

    sse = 0; % running total of squared error
    for p=1:P

        % get the current input vector and desired output
        a{1} = X(p,:)';
        Dp = D(p,:)';

        % compute the inputs and outputs to each layer
        for i=1:nLayers-1
            % compute inputs to this layer
            net{i} = w{i} * a{i};

            % compute outputs of this layer
            % for all layers but the output layer, the last node is the
            % bias node and its activation is 1
            if i < nLayers-1
                a{i+1} = [2./(1+exp(-net{i}(1:end-1)))-1 ; 1];
            else
                a{i+1} = 2./(1+exp(-net{i})) - 1;
            end
        end

        % accumulate the squared error
        sse = sse + sum((Dp-a{end}).^2);

        % calculate the modified error at each layer and update the weight
        % matrices accordingly: first calculate delta, the modified error
        % for the output nodes (S'(Output[net]) * (Dp-Output[Activation])),
        % then for each weight matrix add n * delta * activation and
        % propagate delta to the previous layer
        delta = (Dp-a{end}) .* (1+a{end}) .* (1-a{end});
        for i=nLayers-1:-1:1
            dw{i} = n * delta * a{i}' + (m .* dw{i});
            w{i} = w{i} + dw{i};
            if i > 1 % don't compute the modified error for the input layer
                delta = (1+a{i}).*(1-a{i}).*(delta'*w{i})';
            end
        end
    end
    presentations = presentations + P;
    mse = sse/(P*M); % mse = 1/P * 1/M * summed squared error
end

% return the trained network
Network.structure = L;
Network.weights = w;
Network.mse = mse;
Network.presentations = presentations;
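
As a usage illustration, the per-period function above might be invoked on a small bipolar problem such as XOR. This is a minimal sketch; the layer sizes, training rate, momentum and satisfactory MSE below are assumed values, not recommendations from the original author, and the final line reuses the assumed feedforward helper sketched after the first listing.

X = [-1 -1; -1 1; 1 -1; 1 1]; % bipolar training inputs (P-by-N)
D = [-1; 1; 1; -1];           % bipolar desired outputs (P-by-M)
L = [2 4 1];                  % input, hidden and output layer sizes
Network = backprop(L, 0.3, 0.5, 0.01, X, D); % train until mse <= 0.01
disp(Network.mse);            % final mean squared error
disp(Network.presentations);  % number of sample presentations used
Y = feedforward(Network, X);  % outputs of the trained network (assumed helper)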