% Complexity pursuit. Ata Kaban and Ella Bingham % Load newsgroup data and preprocess by removing empty documents. terms = textread('newsterms2106','%s'); load -ascii newsmatrix2106; %has 5000 terms T = 5000; termDoc = reshape(newsmatrix, T, size(newsmatrix,1)/T); termDoc = sparse(termDoc); nonempty = find(sum(termDoc,1)~=0); termDoc = termDoc(:,nonempty); % Make data binary termDoc = (termDoc > 0); [T,N] = size(termDoc); n_cl = 10; %number of latent components estimated K = 100; %dimensionality of SVD preprocessing % Preprocess by singular value decomposition [U,E,V] = svds(termDoc,K); % Then do the complexity pursuit step = 5; arn = 1; %degree of autoregressive model assumed usedNlinearity = 2; %must be 1 (sign) or 2 (tanh), otherwise linear Z = sqrt(N)*V'; Zd=[zeros(K,step), Z(:,1:(N-step))]; %time-delayed data C1=Z*Zd'/N; mu=1; resid = zeros(n_cl,N); W=randn(n_cl,K); W=inv(sqrtm(W*W'))*W; epsilon = 0.01; diff = epsilon*prod(size(W)); rounds = 0; maxrounds = 200; %might need adjustment while (diff > epsilon); %symmetric approach rounds = rounds + 1; if rounds > maxrounds, break, end Wold = W; alpha = zeros(n_cl, max(arn,1)); for i=1:n_cl if arn == 1; alpha(i)=W(i,:)*C1*W(i,:)'; resid(i,:)=W(i,:)*(Z-alpha(i)*Zd); if usedNlinearity == 1; %sign W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-sign(resid(i,:)))')'/N; elseif usedNlinearity == 2; %tanh W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-tanh(resid(i,:)))')'/N; else; %linear g W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-resid(i,:))')'/N; end; elseif arn > 1; u = W*Z; th = ar(u(i,:)', arn); % Estimate the autoregr. constants a = th2poly(th); alpha(i,:) = -a(2:length(a)); Z_minus_sum = Z; for tau = 1:arn Z_minus_sum = Z_minus_sum - alpha(i,tau) * ... [zeros(size(Z,1),tau) Z(:,1:size(Z,2)-tau)]; end; %for tau resid(i,:) = W(i,:)*Z_minus_sum; if usedNlinearity == 1; %sign W(i,:) = W(i,:)+mu*(Z_minus_sum*(-sign(resid(i,:)))')'/N; elseif usedNlinearity == 2; %tanh W(i,:) = W(i,:)+mu*(Z_minus_sum*(-tanh(resid(i,:)))')'/N; else % linear g W(i,:) = W(i,:)+mu*(Z_minus_sum*(-resid(i,:))')'/N; end; else; %ordinary ICA: arn = 0 alpha(i) = 0; resid(i,:) = W(i,:)*Z; if usedNlinearity == 1; %sign W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-sign(resid(i,:)))')'/N; elseif usedNlinearity == 2; %tanh W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-tanh(resid(i,:)))')'/N; else %linear g W(i,:)=W(i,:)+mu*((Z-alpha(i)*Zd)*(-resid(i,:))')'/N; end; % here Zd is obsolete as alpha=0 end; %if arn end %for i %if rounds == 1, resz = Z-alpha(1)*Zd; alpha1 = alpha; end W=inv(sqrtm(W*W'))*W; diff = 0; for i=1:n_cl diff = diff + min(norm(W(i,:)-Wold(i,:)), norm(W(i,:)+Wold(i,:))); end; diff end %while sterm=W*E*U'; %sterm=W*U might also do, depending on the data. % Turn around the negatively skewed source signals (if any) negskew = find(skewness(sterm')<0) sdoc = W*V'; S = [sterm, sdoc]; S(negskew,:) = -S(negskew,:);