% Example 5.7 - Gap Statistic


% First step is to get the clusters
% for 1 to K clusters.

% Read in the lungB data set.
load lungB
% The stored matrix has one observation per column, so transpose
% it to get one observation per row.
X = lungB';
[n,p] = size(X);
% Scale every column by its own standard deviation
% (the columns are not centered).
colScale = zeros(1,p);
for j = 1:p
    colScale(j) = std(X(:,j));
end
X = X ./ repmat(colScale,n,1);
% Observed within-cluster dispersion W_k for k = 1,...,K, based on
% complete-linkage agglomerative clustering of Euclidean distances.
% Test for a maximum of 10 clusters.
K = 10;
Y = pdist(X,'euclidean');
Z = linkage(Y,'complete');
% W_k pools, over the k clusters, the sum of squared Euclidean
% interpoint distances in a cluster divided by twice its size.
% Preallocate W so that a W left in the workspace by an earlier
% run cannot contribute stale or extra entries.
W = zeros(1,K);
% One cluster: all observations are in the same group, and Y
% already holds their pairwise Euclidean distances.
W(1) = sum(Y.^2)/(2*n);
for k = 2:K
    % Cluster labels from cutting the tree into at most k groups.
    inds = cluster(Z,'maxclust',k);
    % Start from a clean dispersion vector for this k. Without the
    % reset, entries left in D by earlier code (for example a
    % previous run of this script, which leaves K entries behind)
    % would be added into W(k).
    D = zeros(1,k);
    for r = 1:k
        indr = find(inds==r);
        nr = length(indr);
        % Empty or singleton clusters have no interpoint distances
        % and contribute zero; skipping them also avoids 0/0.
        if nr > 1
            % Squared Euclidean distances within cluster r.
            ynr = pdist(X(indr,:)).^2;
            D(r) = sum(ynr)/(2*nr);
        end
    end
    W(k) = sum(D);
end

% Now estimate the expected values of log(W_k) from B reference
% data sets that have no cluster structure.
B = 10;
% Range of each column of X, used by the gap-uniform method.
minX = min(X,[],1);
maxX = max(X,[],1);
% Wb(b,k) is the within-cluster dispersion of reference sample b
% when it is split into k clusters.
Wb = zeros(B,K);
for b = 1:B
    % Echo the counter to show progress.
    b
    % Gap-uniform reference sample: each column is drawn uniformly
    % over the observed range of the matching column of X.
    Xb = zeros(n,p);
    for j = 1:p
        Xb(:,j) = unifrnd(minX(j),maxX(j),n,1);
    end
    Yb = pdist(Xb,'euclidean');
    Zb = linkage(Yb,'complete');
    % Dispersion of the reference sample, using the squared
    % Euclidean distance. One cluster first; Yb already holds the
    % pairwise distances.
    Wb(b,1) = sum(Yb.^2)/(2*n);
    for k = 2:K
        % Cluster labels from cutting the tree into at most k groups.
        inds = cluster(Zb,'maxclust',k);
        % Reset the per-cluster dispersions for this k. D otherwise
        % keeps the K entries written during the previous k = K
        % pass, and sum(D) would add those stale values to every
        % Wb(b,k) with k < K.
        D = zeros(1,k);
        for r = 1:k
            indr = find(inds==r);
            nr = length(indr);
            % Empty or singleton clusters have no interpoint
            % distances and contribute zero; this also avoids 0/0.
            if nr > 1
                % Squared Euclidean distances within cluster r.
                ynr = pdist(Xb(indr,:)).^2;
                D(r) = sum(ynr)/(2*nr);
            end
        end
        Wb(b,k) = sum(D);
    end
end
% Gap statistic: expected minus observed log(W_k).
Wobs = log(W);
% Average over the B reference samples (rows of Wb). The dimension
% is given explicitly so that B = 1 is not collapsed to a scalar.
muWb = mean(log(Wb),1);
% Standard deviation of the reference log(W_k) normalized by B
% rather than B-1. The weight flag 1 selects that normalization;
% scaling the default std by (B-1)/B does not give it.
sdk = std(log(Wb),1,1);
gap = muWb - Wobs;
% Inflate the standard deviation to account for the simulation
% error in the estimated expectation.
sk = sdk*sqrt(1 + 1/B);
gapsk = gap - sk;
% Estimated number of clusters: the smallest k such that
% gap(k) >= gap(k+1) - sk(k+1).
ineq = gap(1:K-1) - gapsk(2:K);
ind = find(ineq >= 0, 1);
if isempty(ind)
    % No k in 1..K-1 satisfies the rule; fall back to the largest
    % number of clusters tested instead of failing on ind(1).
    warning('Gap criterion not met for k < %d; using khat = %d.',K,K);
    khat = K;
else
    khat = ind;
end

% Observed and expected log(W_k) curves on common axes.
kvals = 1:K;
plot(kvals,Wobs,'o-',kvals,muWb,'x-')
legend({'Observed';'Expected'})
xlabel('Number of Clusters k')
ylabel('Observed and Expected log(W_k)')

% Gap curve in a separate figure window.
figure
plot(kvals,gap,'o-')
title('Gap')
xlabel('Number of Clusters k')
ylabel('Gap')
