function [W, Z, G, eigs] = sdpp(X, Y, W0, G0, varargin)
%SDPP Supervised distance preserving projections (SDPP)
% [W, Z, G, EIGS] = SDPP(X, Y) computes the supervised distance preserving
% projections of an N by D matrix X with the N by M response matrix Y.
%
% [...] = SDPP(X, Y, W0, G0, 'Name', Value, ...) additionally supplies an
% initial guess W0 and a neighborhood matrix G0 (either may be []),
% followed by optional name/value parameters.
%
% [...] = SDPP(X, Y, 'Name', Value, ...) is also accepted: when the third
% argument is not numeric, the third and fourth arguments are treated as
% the first name/value pair.
%
% outputs
% W    - the projection matrix: the left singular vectors of the solution
%        scaled by the corresponding singular values.
% Z    - the projected data, X*W (Kx*W when kernels are used).
% G    - the neighborhood matrix (as returned by the solver for 'cg').
% EIGS - row vector with the singular values that scale the columns of W.
%
% positional arguments
% W0   - initial guess for the projection matrix. If empty and no kernel
%        is used, the PCA coefficients of X are used.
% G0   - the neighborhood matrix. This is an affinity matrix that can be
%        weighted. The weights tell how much any particular distances
%        affect the cost function. Values 0 mean that the distances are
%        not taken into consideration at all. If empty, the neighborhood
%        is calculated from the data.
%
% optional parameter values
% 'Algorithm'       - the algorithm sdpp uses to perform supervised
%                     distance preserving projections.
%       'sqlp'      - sequential quadratic programming. Recommended method
%                     if the size of the problem d is smaller than 100.
%       'cg'        - conjugate gradient search. Recommended if the size of
%                     the problem is large (default).
% 'Neighborhood.Type'   - the type of neighborhood sdpp uses. One of
%                     'knn', 'mnn', 'epsilon', 'nn', 'nc'.
%       'knn'       - linear k nearest neighbors. (default)
%       'mnn'       - mutual nearest neighbors = knn, but keep only the
%                     neighbors that are mutual neighbors.
%       'epsilon'   - all points within a supplied epsilon distance to x_0
%                     are neighbors to x_0.
%       'nn'        - natural neighbors. The natural neighbors of a test
%                     point are defined to be the training points whose
%                     voronoi cells are adjacent to the cell containing the
%                     test point.
%       'nc'        - nearest correlation based neighborhood. Tries to
%                     find three points that are aligned on a line.
% 'Neighborhood.k'         - supply if you use 'knn' neighborhood
%                            (default round(log(N))).
% 'Neighborhood.epsilon'   - supply if you use 'epsilon' neighborhood
% 'Neighborhood.useTolerance' - supply if you use 'nc' neighborhood
%       0                  - use angles to calculate nearest correlation
%                            neighborhood (default).
%       1                  - use distances to calculate nearest correlation
%                            neighborhoods.
% 'Neighborhood.tolerance' - supply if you use 'nc' neighborhood.
% 'Neighborhood.angle'     - supply if you use 'nc' neighborhood.
%
% 'Classification'   - a boolean value.
%       0            - sdpp projects the data according to a
%                      regression task. (default)
%       1            - sdpp projects the data as a classification task.
% 'Kernel.precomputed' - a boolean value.
%       0            - the supplied x and y are data samples. Kernel
%                      matrices are computed.
%       1            - the supplied x and y were kernels; no new kernel
%                      matrices are computed. The kernel PCA
%                      initialization of W0 is also skipped. Only honored
%                      when both X and Y are square.
% 'Kernel.x.KernelType'    - the kernel calculated for x.
%       'gaussian'         - constructs kernels of the type
%                            e^{-(|x1-x2|^2)/2t^2}. The width of the
%                            gaussians fitted are given in Kernel.x.t. If
%                            no kernel type is given, a gaussian kernel
%                            is used with the width determined by
%                            silvermans rule of thumb.
%       'polynomial'       - kernels of the type (x1'*x2)^d. The degree is
%                            given in Kernel.x.d.
%       'polyplus'         - kernels of the type (x1'*x2+1)^d. The degree is
%                            given in Kernel.x.d.
%       'linear'           - calculates a linear kernel x1'*x2.
%       'delta'            - returns a kernel that equals zero for features
%                            that have zero euclidean distance.
% 'Kernel.y.KernelType'    - similar options for y. Uses the same values as
%                            Kernel.x if not supplied.
% 'Kernel.w.KernelType'    - similar options for kernel PCA. Uses the same
%                            values as Kernel.x if nothing supplied. Kernel
%                            PCA is used to initialize the solution for the
%                            conjugate gradient solver.
% 'Sparse'                  - 0/1. If the neighborhood matrix is sparse,
%                             the function evaluations can be calculated
%                             faster, by taking advantage of the sparsity.
%                             The sparse version is not an approximated
%                             solution; it produces the same solution as
%                             the dense solution. Sparsity can be used in
%                             the conjugate gradient version of the
%                             algorithm. Sparse argument is ignored in the
%                             SQLP version. (default -1)
% 'Dimension'               - positive integer. How many dimensions should
%                             the final solution have? The optimization can
%                             be performed with fewer variables if this is
%                             specified. (default D)
% 'Lambda'                  - Ridge regression regularization size. Typical
%                             values are between 1e-1 and 1e-8. Passed to
%                             the conjugate gradient solver. (default 1e-6)
% 'Iterations'              - iteration setting passed to the solver.
%                             (default 100)
% 'Verbose'                 - 0/1. Print progress messages. (default 0)
%
%   example run:
% X = rand(100,5); Y = rand(100,2);
% nn.Type = 'knn'; nn.k = 7;
% sdpp(X,Y, [],[], 'Algorithm','cg','Neighborhood',nn)
% Mika Juuti et al. 2014-09-22

if nargin<1
    % if called with no arguments, then the user probably
    % needs help. Give it to them.
    help sdpp
    return
end

if nargin < 2 || ~ismatrix(X) || ~ismatrix(Y)
    error('sdpp:argNum', 'A matrix X and its response matrix Y must be supplied.')
end

[n, d] = size(X);
[n2, m] = size(Y);

% Every sample in X needs exactly one response row in Y. Fail early with a
% clear message instead of somewhere deep inside the solvers.
if n ~= n2
    error('sdpp:rowMismatch', ...
        'X and Y must have the same number of rows (got %d and %d).', n, n2)
end

%% Sort out the positional arguments W0 and G0
% W0 and G0 are optional. Each one is defaulted independently, so that a
% call such as sdpp(X,Y,W0) does not leave G0 undefined.
nameValueArgs = varargin;
if nargin < 3
    W0 = [];
    G0 = [];
elseif ~isnumeric(W0)
    % The caller skipped W0/G0 and started the name/value list right away:
    % put the two displaced arguments back in front of the list.
    if nargin < 4
        nameValueArgs = {W0}; % a name without a value; rejected by parseArgs
    else
        nameValueArgs = [{W0, G0}, varargin];
    end
    W0 = [];
    G0 = [];
elseif nargin < 4
    G0 = [];
end

%% Parse the name/value parameters
defaultNeighborhood.Type = 'knn'; defaultNeighborhood.k = round(log(n));
defaultNeighborhood.symmetric = 0;

paramNames = {'Neighborhood', 'Algorithm', 'Verbose', ...
    'Classification','Lambda','Iterations','Kernel','Sparse','Dimension','Options'};
% one default per parameter name; 'Options' is accepted but not used here
defaults   = {defaultNeighborhood,'cg', 0, 0, 1e-6,100,[],-1,d,[]};

[vNeighborhood, vAlgorithm, verbose, ...
    vClassification, vLambda, vIterations, vKernel, vSparse, vDimension] ...
    = internal.stats.parseArgs(paramNames, defaults, nameValueArgs{:});

% A user supplied Neighborhood struct may be partial: fill in type and k.
if ~isfield(vNeighborhood,'Type'); vNeighborhood.Type = defaultNeighborhood.Type; end
if ~isfield(vNeighborhood,'k'); vNeighborhood.k = defaultNeighborhood.k; end

% Validate String value for  Algorithm value
AlgorithmNames = {'sqlp','cg'};
vAlgorithm = internal.stats.getParamVal(vAlgorithm,AlgorithmNames,...
    '''Algorithm''');

options.Neighborhood = vNeighborhood;
options.Neighborhood.useKernel = 0;
options.Kernel = vKernel;
options.Verbose = verbose;
useKernel = ~isempty(options.Kernel);

% Large problems are always solved with conjugate gradient.
problemIsLarge = d > 100 || (useKernel && n >= 100);
if problemIsLarge && ~strcmp(vAlgorithm,'cg')
    if verbose
        disp('the problem size is larger than 100: algorithm automatically switched to conjugate gradient method');
        pause(2) % leave the message on screen for a moment
    end
    vAlgorithm = 'cg';
end

% The SQLP solver relies on SDPT3; install it if svec is not visible yet.
if ~exist('svec') && strcmp(vAlgorithm,'sqlp')
    run('SDPT3-4.0/Installmex')
    addpath(genpath('SDPT3-4.0'));
end

if ~isnumeric(W0)
    error('sdpp:w0', 'Supply an initial guess w0 or a [] as the third argument.')
end

if ~useKernel
    %% Calculate the neighborhood
    if isempty(G0)
        G = neighborhood(X, options);
    else
        G = G0;
    end

    % perform PCA with X if no initial value for W is supplied.
    if isempty(W0)
        if n > 1; W0 = pca(X); else W0 = 1; end
    end
else
    %% Construct kernel
    isPrecomputed = isfield(options.Kernel,'precomputed') ...
        && options.Kernel.precomputed && n == d && n2 == m;
    if isPrecomputed
        % X and Y already are kernel matrices: avoid computing the kernels.
        % NOTE(review): W0 is used as supplied here; nothing initializes it
        % when it is empty - confirm that the solvers cope with that.
        Kx = X;
        Ky = Y;
    else
        if verbose
            disp('calculating kernels...')
        end
        if ~isfield(options.Kernel,'x') || ~isfield(options.Kernel.x,'KernelType') % use defaults
            options.Kernel.x.KernelType = 'gaussian';
            options.Kernel.x.t = silverman(X);
        end
        Kx = constructKernel(X,[],options.Kernel.x);
        H = eye(n)- ones(n,1)*ones(1,n)/n; % centering matrix
        Kx = H*Kx*H;
        if ~isfield(options.Kernel,'y') || ~isfield(options.Kernel.y,'KernelType') % use defaults
            options.Kernel.y = options.Kernel.x;
        end
        % Ky is deliberately not centered: doing the centralization of Ky
        % messes up the classification: there are no values with 0's or
        % 1's, just 0-mean(y) and 1-mean(y).
        Ky = constructKernel(Y,[],options.Kernel.y);
        if ~isfield(options.Kernel,'w') || ~isfield(options.Kernel.w,'KernelType') % use defaults
            options.Kernel.w = options.Kernel.x;
        end
        if d > n
            options.Kernel.w.m = n;
        end
        % kernel PCA supplies the initial solution (replaces any given W0)
        [~, W0] = kernel_pca(X,options.Kernel.w);
    end

    % Calculate neighborhood in kernel space
    if isempty(G0)
        options.Neighborhood.useKernel = 1;
        G = neighborhood(Kx, options);
    else
        G = G0;
    end
end

%% Calculate the projections
% if the user supplies an algorithm a check is made
% the CG solution is fast for sparse matrices.
% if the matrix is dense and the dimensionality of the data is
% sufficiently small (d<100), SQLP is recommended (requires mex files).
switch lower(vAlgorithm)
    case 'sqlp'
        if verbose
            disp('Using quadractic semidefinite programming (sqlp) for calculations.')
        end
        if useKernel
            P = SDPP_sqlp(Kx, Ky, W0, G, vClassification, 1, vIterations,vSparse);
        else
            P = SDPP_sqlp(X, Y, W0, G, vClassification, 0, vIterations,vSparse);
        end
        [lsv, eigs]=svd(P);
        % never ask for more components than the decomposition provides
        vDimension = min([vDimension, size(P)]);
        W = lsv(:,1:vDimension)*eigs(1:vDimension,1:vDimension);
        eigs = diag(eigs(1:vDimension,1:vDimension))';% extract the diagonal
    case 'cg'
        if verbose
            disp('Using conjugate gradient descent for calculations.')
        end

        vDimension = min(size(W0,2),vDimension);
        if useKernel
            [W0, G] = SDPP_grad(Kx, Ky, W0, G, 1, vIterations, vLambda, ...
                vClassification, vSparse, vDimension);
        else
            [W0, G] = SDPP_grad(X, Y, W0, G, 0, vIterations, vLambda, ...
                vClassification, vSparse, vDimension);
        end
        [U,S] = svd(W0,0);
        W = U*S;
        eigs = diag(S)';% extract the diagonal
end

if verbose
    disp('The eigenvalues of the d first projections vectors are:')
    disp(eigs')
end

%% Project the data
if useKernel
    Z = Kx*W;
else
    Z = X*W;
end

end