function [E, Y, VOut, WOut] = lyngby_nn_ctrain(X, T, VOld, WOld, Reg, ...
    arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ...
    arg11, arg12)

% lyngby_nn_ctrain     - Classifier neural network training
%
%       function [E, Y, VOut, WOut] = lyngby_nn_ctrain(X, T, VOld, ...
%                 WOld, Reg, 'PropertyName', 'PropertyValue')
%
%       Input:  X       Neural network input
%               T       Target output
%               VOld    Old input weights
%               WOld    Old output weights
%               Reg     Regularization (weight decay)
%               Property:
%                 MaxIteration  {200} Iteration stop criterion
%                 MinCost       {0} Iteration stop criterion
%                 MinGradient   {10^(-6)} Iteration stop criterion
%                 Method        {65} Optimization type (single number):
%                                 12  Gradient descent
%                                 15  Gradient descent, soft linesearch
%                                 16  As 15, with adaptive stepsize
%                                 17  As 15, plain gradient step if
%                                     the linesearch is unsuccessful
%                                 18  As 16, plain gradient step if
%                                     the linesearch is unsuccessful
%                                 35  Momentum, soft linesearch
%                                 36  Momentum interpolated with a
%                                     small gradient step
%                                 37  Hybrid: momentum, then gradient
%                                     descent on the output layer
%                                 40  Conjugate gradient
%                                 65  Hybrid: symmetric diagonal second
%                                     order, then gradient descent
%                                 66  Diagonal second order with
%                                     adaptive damping
%                                 67  As 65 (after 50 iterations), with
%                                     asymmetric diagonal second order
%                                     derivative for the output weights
%                 WeightAcc     [ {off} | on ] Accumulate weights
%                 Info          [ {0} | 1 ] Reporting of
%                               costfunction and gradient. Zero
%                               means off
%
%       Output: E       Entropic error (cost without regularization)
%               Y       Computed Outputs
%               WOut    New trained output weights or accumulated
%                       weights (depending on 'WeightAcc')
%               VOut    New trained hidden weights or accumulated
%                       weights (depending on 'WeightAcc')
%
%       This function trains a neural network, either pruned or fully
%       connected. Weights that are exactly zero in VOld or WOld are
%       treated as pruned: the steps of the gradient based methods
%       are masked so these weights stay at zero. Training continues
%       until one of the stop criteria is met: maxIteration is the
%       number of epochs (optimization steps), minCost is the highest
%       acceptable value for the cost function, minGradient is the
%       highest acceptable value for the norm of the gradient. For
%       method 40 the gradient norm is not available, so the gradient
%       criterion never triggers.
%
%       E and Y are computed by the forward pass at the start of the
%       last iteration, i.e., with the weights before the final
%       update (method 66 replaces them with the values for the new
%       weights when its step is accepted).
%
%       With 'WeightAcc' set to 'on' VOut and WOut have one column
%       per performed iteration, each column being the vectorized
%       weights after that iteration.
%
%       See also: LYNGBY, LYNGBY_NN_CMAIN, LYNGBY_NN_CFORWARD,
%                 LYNGBY_NN_CERROR.

% cvs : $Id: lyngby_nn_ctrain.m,v 1.6 2000/10/23 14:37:40 fnielsen Exp $
%       $Revision: 1.6 $

    % Default stop criteria and options
    maxIteration = 200;
    minCost      = 0;
    minGradient  = 10^(-6);
    method       = 65;
    bWeightAcc   = 0;
    bInfo        = 0;

    % Parse the 'PropertyName', PropertyValue pairs. The optional
    % arguments are named arg1..arg12, hence the eval-based access.
    n = 1;
    while n < nargin-5
      eval(sprintf('arg = lower(arg%d);', n));
      if strcmp(arg, 'maxiteration')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if ~isstr(arg)
          if arg > 0
            maxIteration = arg;
          else
            error([ 'Argument to ''MaxIteration'' should larger ' ...
              'than 0.']);
          end
        else
          error('''MaxIteration'' PropertyName should be followed by a value');
        end

      elseif strcmp(arg, 'mincost')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if ~isstr(arg)
          % Fixed: the original tested an undefined variable here
          if length(arg) == 1
            minCost = arg;
          else
            error(['''MinCost'' PropertyName should be followed by a ' ...
                  'a single value']);
          end
        else
          error('''MinCost'' PropertyName should be followed by a value');
        end

      elseif strcmp(arg, 'mingradient')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if ~isstr(arg)
          if length(arg) == 1
            minGradient = arg;
          else
            error(['''MinGradient'' PropertyName should be followed by a ' ...
                  'a single value']);
          end
        else
          error('''MinGradient'' PropertyName should be followed by a value');
        end

      elseif strcmp(arg, 'method')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if ~isstr(arg)
          if length(arg) == 1
            method = arg;
          else
            error(['''Method'' PropertyName should be followed by a ' ...
                  'a single value']);
          end
        else
          error('''Method'' PropertyName should be followed by a value');
        end

      elseif strcmp(arg, 'weightacc')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if isstr(arg)
          if strcmp(lower(arg), 'on')
            bWeightAcc = 1;
          elseif strcmp(lower(arg), 'off')
            bWeightAcc = 0;
          else
            error(['''WeightAcc'' PropertyName should be followed by a ' ...
                  'a string, - either ''on'' or ''off''']);
          end
        else
          % Fixed: the message used to name the 'Method' property
          error(['''WeightAcc'' PropertyName should be followed by a ' ...
                'string']);
        end

      elseif strcmp(arg, 'info')
        n = n + 1;
        eval(sprintf('arg = arg%d;', n));
        if ~isstr(arg)
          if arg
            bInfo = 1;
          else
            bInfo = 0;
          end
        else
          error('''Info'' PropertyName should be followed by a value');
        end
      else
        error(sprintf('Invalid property: %s', arg));
      end
      n = n + 1;
    end

    % Reject optimization types without an implementation up front.
    % Previously such values (e.g., 68 and 75, which were never
    % implemented) failed later with an undefined variable error.
    supportedMethods = [ 12 15 16 17 18 35 36 37 40 65 66 67 ];
    if ~any(method == supportedMethods)
      error(sprintf('Unsupported ''Method'': %g', method));
    end

    % Optimization parameters
    stepsize = 0.1;
    stepsize2 = stepsize;           % Separate stepsize for method 37
    stepsizeIncrease = 1.25;
    stepsizeDecrease = 0.5;
    momentum = 0.9;
    interpolate = 100;              % Damping used by method 36 and 66

    % Sizes
    [Np,  Ni] = size(X);
    [Nii, Nh]  = size(VOld);
    [Nhh, No]  = size(WOld);
    Nw = No*Nhh;
    Nv = Nh*Nii;

    % Pruning Mask: zero where the old weight is exactly zero
    VMask = ones(size(VOld));
    WMask = ones(size(WOld));
    VMaskI = find(VOld==0);
    WMaskI = find(WOld==0);
    VMask(VMaskI) = zeros(size(VMaskI));
    WMask(WMaskI) = zeros(size(WMaskI));

    % Apply Mask to Input: zero the inputs that have no unpruned
    % weight
    if any(size(VMask) == [1 1])
      X = X .* (ones(Np,1) * VMask');
    else
      X = X .* (ones(Np,1) * any(VMask'));
    end

    % Accumulation of weights
    if bWeightAcc
      VOut = zeros(prod(size(VOld)), maxIteration);
      WOut = zeros(prod(size(WOld)), maxIteration);
    end

    % Training
    V = VOld;
    W = WOld;
    DV = zeros(size(V));
    DW = zeros(size(W));

    HB = ones(Np, 1);               % Bias column appended to H

    iteration = 0;
    cost      = Inf;
    gradient  = Inf;
    while (iteration < maxIteration) & (cost >= minCost) & (gradient >= minGradient)
      logString = '';

      [Y,O,H] = lyngby_nn_cforward(X, V, W);
      E       = lyngby_nn_cerror(T, Y, O);
      C       = lyngby_nn_cost(E, V, W, Reg);
      EY      = Y - T;

      if any(method == [ 12 15 16 17 18 35 36 37 65 66 67])
        % First order derivative

        % First order derivative of Regularization
        dRUv  = lyngby_nn_cdru(V, W, Reg);
        dRV   = reshape(dRUv(1:Nv), Nii, Nh);
        dRW   = reshape(dRUv(Nv+(1:Nw)), Nhh, No);

        % Output weights, First order derivative
        dEW       = lyngby_nn_cdew(EY, [H HB], W);
        dCW       = dEW + dRW;

        % Hidden weights, First order derivative
        [dEV,H12] = lyngby_nn_cdev(EY, H, X, V, W);
        dCV       = dEV + dRV;
      end

      if any(method == [ 65 66 67 ])
        % Regularization, Diagonal Second order derivative

        ddRUv = lyngby_nn_cddru(V, W, Reg);
        ddRV  = reshape(ddRUv(1:Nv), Nii, Nh);
        ddRW  = reshape(ddRUv(Nv+(1:Nw)), Nhh, No);
      end

      if any(method == [ 65 66])
        % Output weights, Diagonal second order derivative

        ddEW = lyngby_nn_cddewds(Y, H, W);
        ddCW = ddEW + ddRW;
      end

      if any(method == [ 65 66 67])
        % Input weights, Symmetric diagonal second order derivative

        ddEV = lyngby_nn_cddevds(X, Y, V, W, H12);
        ddCV = ddEV + ddRV;
      end

      if any(method == [ 67 ])
        % Output weights, Asymmetric diagonal second order derivative

        ddEW = lyngby_nn_cddewda(Y, T, H, W);
        ddCW = ddEW + ddRW;
      end

      % Not implemented (rejected by the method check above):
      %   68  Input weights, asymmetric diagonal second order
      %       derivative:
      %         ddEV = lyngby_nn_cddevda(X, Y, H, W, H12);
      %         ddCV = ddEV + ddRV;
      %   75  Full hessian second order derivative:
      %         ddEW = lyngby_nn_cddewf(Y, H);
      %         ddCW = ddEW + diag(Reg);

      % Actual optimization

      if method == 12
        % First order: gradient descent

        DV = - stepsize * dCV .* VMask;
        DW = - stepsize * dCW .* WMask;
        succes = 1;

        VNew = V + DV;
        WNew = W + DW;

        if bInfo
          % Cost at the new weights (fixed: was evaluated at the old
          % weights). Separate variables keep the returned Y intact.
          [YNew,ONew] = lyngby_nn_cforward(X, VNew, WNew);
          ENew        = lyngby_nn_cerror(T, YNew, ONew);
          CNew        = lyngby_nn_cost(ENew, VNew, WNew, Reg);

          logString = sprintf('Succes: %2d, %s', succes, logString);
        else
          % Skip the extra forward pass; the cost used for the stop
          % criterion is then the one from before the step.
          ENew    = E;
          CNew    = C;
        end

      elseif method == 15
        % First order: softline gradient descent

        DV = - stepsize * dCV .* VMask;
        DW = - stepsize * dCW .* WMask;
        [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
            Reg, DV, DW, C, 100);
        logString = sprintf('Succes: %2d, %s', succes, logString);

      elseif method == 16
        % First order: softline gradient descent with adaptive stepsize

        DV = - stepsize * dCV .* VMask;
        DW = - stepsize * dCW .* WMask;
        [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
            Reg, DV, DW, C, 10);
        if succes == 1
          stepsize = stepsize * stepsizeIncrease;
        elseif succes == 0
          stepsize = stepsize * stepsizeDecrease;
        else
          stepsize = stepsize * 2^(-succes+1);
        end
        logString = sprintf('Succes: %2d, Stepsize: %5g, %s', succes, stepsize, logString);

      elseif method == 17
        % First order: softline gradient descent, with gradient if
        % unsuccesful

        DV = - stepsize * dCV .* VMask;
        DW = - stepsize * dCW .* WMask;
        [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
            Reg, DV, DW, C, 10);
        if ~succes
          VNew = V + DV;
          WNew = W + DW;
          succes = 1;

          % Cost at the new weights (fixed: was evaluated at the old
          % weights)
          [YNew,ONew] = lyngby_nn_cforward(X, VNew, WNew);
          ENew        = lyngby_nn_cerror(T, YNew, ONew);
          CNew        = lyngby_nn_cost(ENew, VNew, WNew, Reg);
        end
        logString = sprintf('Succes: %2d, %s', succes, logString);

      elseif method == 18
        % First order: softline gradient descent with adaptive
        % stepsize, with gradient descent if unsuccesful

        DV = - stepsize * dCV .* VMask;
        DW = - stepsize * dCW .* WMask;
        [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
            Reg, DV, DW, C, 10);
        if succes == 1
          stepsize = stepsize * stepsizeIncrease;
        elseif succes == 0
          stepsize = stepsize * stepsizeDecrease;

          VNew = V + DV;
          WNew = W + DW;
          succes = 1;

          % Cost at the new weights (fixed: was evaluated at the old
          % weights)
          [YNew,ONew] = lyngby_nn_cforward(X, VNew, WNew);
          ENew        = lyngby_nn_cerror(T, YNew, ONew);
          CNew        = lyngby_nn_cost(ENew, VNew, WNew, Reg);
        else
          stepsize = stepsize * 2^(-succes+1);
        end
        logString = sprintf('Succes: %2d, Stepsize: %5g, %s', succes, stepsize, logString);

      elseif method >= 30 & method < 40
        % First order with momentum

        DVOld = DV;
        DWOld = DW;

        if method == 35
          % Straight momentum gradient descent with soft linesearch

          DV = (- stepsize * dCV + momentum * DVOld) .* VMask;
          DW = (- stepsize * dCW + momentum * DWOld) .* WMask;
          [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
              Reg, DV, DW, C, 30);
          logString = sprintf('Succes: %2d, Stepsize: %5g, %s', succes, stepsize, logString);

        elseif method == 36
          % Poor-mans Levenberg-Marquardt:
          % Interpolating between momentum calculated step and an
          % infinitely small gradient step, with soft linesearch.
          % realmin guards against division by zero.

          DV =  -dCV ./ (-dCV ./ (- dCV + momentum * DVOld + ...
              realmin) + interpolate + realmin) .* VMask;
          DW =  -dCW ./ (-dCW ./ (- dCW + momentum * DWOld + ...
              realmin) + interpolate + realmin) .* WMask;
          [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
              Reg, DV, DW, C, 1);
          if succes == 1
            interpolate = interpolate * 1.25;
          elseif succes == 0
            interpolate = interpolate * 0.75;
          end
          logString = sprintf('Succes: %2d, Interpolate: %5g, %s', succes, interpolate, logString);

        elseif method == 37
          % Hybrid optimization:
          %   1. Momentum gradient descent with soft linesearch
          %   2. Gradient descent with output layer

          DV = (- stepsize2 * dCV + momentum * DVOld) .* VMask;
          DW = (- stepsize2 * dCW + momentum * DWOld) .* WMask;
          [CNew, VNew, WNew, succes, DV, DW] = lyngby_nn_csoftline(X, T, V, W, ...
              Reg, DV, DW, C, 10);
          if succes == 1
            stepsize2 = stepsize2 * stepsizeIncrease;
          elseif succes == 0
            stepsize2 = stepsize2 * stepsizeDecrease;
          else
            stepsize2 = stepsize2 * 2^(-succes+1);
          end

          if ~succes
            % Only the output weights are moved in the fallback step:
            % the hidden weight step is explicitly zeroed.
            DV = - stepsize * dCV .* VMask;
            DV = zeros(size(DV));
            DW = - stepsize * dCW .* WMask;
            % DW = zeros(size(DW));
            [CNew, VNew, WNew, succes] = lyngby_nn_csoftline(X, T, V, W, ...
                Reg, DV, DW, C, 20);
            if succes == 1
              stepsize = stepsize * stepsizeIncrease;
            elseif succes == 0
              stepsize = stepsize * stepsizeDecrease;
            else
              stepsize = stepsize * 2^(-succes+1);
            end
            logString = sprintf('first %s', logString);

          end
          logString = sprintf('Succes: %2d, Stepsize: %5g, %s', ...
              succes, stepsize, logString);
        end

      elseif method == 40
        % Conjugate gradient (the pruning masks are not applied here)

        N = [ [Nii Nhh No ] ; [Ni Nh No] ];
        UOldv = [ V(:) ; W(:) ];

        func = 'lyngby_nn_ccdc';
        tol = 0.001;

        [UNewv, CNew, iter, nbeval] = lyngby_opt_conjgrad(UOldv, func, ...
            tol, X, T, Reg, N);
        succes = 1;
        [VNew, WNew] = lyngby_nn_u2vw(UNewv, N);

        logString = sprintf(['Conjgrad, Iterations: %4d, '...
              '%s'], iter, logString);

      elseif method == 65
        % Hybrid optimization:
        %   1. Symmetrical diagonal second order, soft linesearch
        %      (only after the first 20 iterations)
        %   2. Gradient descent, soft linesearch

        if iteration > 20
          % realmin guards against division by zero
          DV = - (dCV ./ (ddCV + realmin)) .* VMask;
          DW = - (dCW ./ (ddCW + realmin)) .* WMask;

          [CNew, VNew, WNew, succes, DV, DW] = lyngby_nn_csoftline(X, ...
              T, V, W, Reg, DV, DW, C, 10);
        else
          succes = 0;
        end
        if ~succes
          DV = - dCV .* VMask;
          DW = - dCW .* WMask;
          [CNew, VNew, WNew, succes, DV, DW] = lyngby_nn_csoftline(X, T, V, W, ...
              Reg, DV, DW, C, 20);
          logString = sprintf('first %d, %s', succes, logString);
        end
        logString = sprintf(['Succes: %2d, '...
              '%s'], succes, logString);

      elseif method == 66
        % Poor mans levenberg Marquardt: diagonal second order step
        % damped by 'interpolate', which is decreased after an
        % accepted step and increased after a rejected one

        DV = - (dCV ./ (ddCV + interpolate)) .* VMask;
        DW = - (dCW ./ (ddCW + interpolate)) .* WMask;

        VNew = (V + DV) .* VMask;
        WNew = (W + DW) .* WMask;

        [YNew,ONew,HNew] = lyngby_nn_cforward(X, VNew, WNew);
        ENew             = lyngby_nn_cerror(T, YNew, ONew);
        CNew             = lyngby_nn_cost(ENew, VNew, WNew, Reg);
        % Fixed: the residual was computed from the old output Y
        EYNew = YNew - T;

        if (CNew < C)
          % Accept the step
          interpolate = interpolate * 0.9;
          Y = YNew;
          O = ONew;
          H = HNew;
          E = ENew;
          EY = EYNew;
          succes = 1;
        else
          % Reject the step and keep the current weights
          interpolate = interpolate * 2;
          VNew = V;
          WNew = W;
          CNew = C;
          succes = 2;
        end

        if bInfo
          logString = sprintf(['Succes %d , Interpolate: %5g, '...
                '%s'],  succes, interpolate, logString);
        end

      elseif method == 67
        % Hybrid optimization:
        %   1. Diagonal second order (asymmetric for the output
        %      weights), soft linesearch (only after the first 50
        %      iterations)
        %   2. Gradient descent, soft linesearch

        if iteration > 50
          % realmin guards against division by zero
          DV = - (dCV ./ (ddCV + realmin)) .* VMask;
          DW = - (dCW ./ (ddCW + realmin)) .* WMask;

          [CNew, VNew, WNew, succes, DV, DW] = lyngby_nn_csoftline(X, ...
              T, V, W, Reg, DV, DW, C, 10);
        else
          succes = 0;
        end
        if ~succes
          DV = - dCV .* VMask;
          DW = - dCW .* WMask;
          [CNew, VNew, WNew, succes, DV, DW] = lyngby_nn_csoftline(X, T, V, W, ...
              Reg, DV, DW, C, 20);
          logString = sprintf('first %s', logString);
        end
        logString = sprintf(['Succes: %2d, '...
              '%s'], succes, logString);

      end

      W = WNew;
      V = VNew;
      C = CNew;
      if (succes == 0)
        % disp('Warning: could not find a better solution');
      end

      if bWeightAcc
        VOut(:,iteration+1) = VNew(:);
        WOut(:,iteration+1) = WNew(:);
      end

      % Update stop criteria
      cost = C;
      if method == 40
        % No gradient is computed for conjugate gradient
        gradient = Inf;
      else
        dCUv = [dCV(:) ; dCW(:) ];
        gradient = norm(dCUv);
      end
      if bInfo
        disp(sprintf('Iteration: %5d, Cost: %6f, Gradient: %8f, %s', ...
            iteration+1, cost, gradient, logString));
      end
      iteration = iteration + 1;
    end

    if bWeightAcc
      % Drop the unused columns if training stopped before
      % maxIteration
      VOut = VOut(:,1:iteration);
      WOut = WOut(:,1:iteration);
    else
      VOut = VNew;
      WOut = WNew;
    end


















