%DX = discretize(X, dataType, discrProp, scaling)
%
%Discretizes a given matrix X. The output DX is an input for swap-method.
%
%
%Data type: 
% s = similar features (default, if others are not obviously true)
% d = dissimilar features (columns), large values in different features are comparable 
% c = dissimilar features (columns), small and large values are comparable
% n = nominal data or prediscretized numerical data with similar features (only for reasonable number of different values)
%
%Discretization properties (three properties each having a set of disjoint options, concatenate):
% u = unique tolerance ranges for each value (default)
% d = discrete, shared tolerance ranges
%
% g = global-based calculation of the tolerance ranges by using all values (default)
% l = local-based calculation of the tolerance ranges by using values in the same row or column
% s = same global discretization for row and column tolerance ranges
%
% n = fixed number of neighbors (default)
% e = fixed length eps of the tolerance range
% m = maximum union of the neighbor and eps tolerance ranges
%
%Scaling: 
% Numerical value s that is used to scale the size of the tolerance ranges, default is s = 1
%
%
%Examples:
%
% DX = discretize(X)                  %discretizes with default parameters
% DX = discretize(X,'s','ugn',1)   %this equals the default for most matrices
% DX = discretize(X,'s','de')         %this produces approximately the same randomization used in an earlier journal paper
% DX = discretize(X,'d','',0.5)         %dissimilar features with smaller tolerance ranges

% Implemented by Markus Ojala in 2010

function DX = discretize(X,dataType,discrProp,scaling,dispoff)
%Parses the arguments, guesses the data type when it is not given, and
%builds the discretization structure DX for the matrix X.
%
% X         = matrix to discretize (full or sparse numeric matrix, or a cell array for nominal data)
% dataType  = 's', 'd', 'c' or 'n' (only the first character is used, case-insensitive);
%             guessed from X when missing or empty
% discrProp = combination of the property letters [ud], [gls] and [nem] (case-insensitive);
%             a group that is not mentioned gets its default 'u', 'g' or 'n'
% scaling   = scale factor of the tolerance ranges, default 1
% dispoff   = if a fifth argument is passed (its value is not inspected), the
%             informative messages are not printed
%
% DX        = struct produced by discretizeMatrixWithSimilarFeatures; the fields
%             dataType, origNominalValues, discrProp, scaling and sortedColValues
%             are set or overwritten depending on the data type

%Parse the input parameters and assign default values for missing arguments

%Data type
dataTypeNotGiven = nargin<2 || isempty(dataType);
if(dataTypeNotGiven)    
    if(iscell(X) || numel(unique(X)) < 4*sqrt(nnz(X)/sqrt(numel(X)))) %nominal
        dataType = 'n'; 
    elseif(hasDissimilarFeatures(X))
        dataType = 'd';
    elseif(hasDissimilarFeatures(X',1/3)) %1/3 is for being not so strict to report an error
        %the message names the actual parameter (dataType) the user has to set
        error('Features seems to be on rows and dissimilar. In such case, transpose X and call the method again. Otherwise, explicitly give the similarity parameter dataType=''s''.');        
    else
        dataType = 's';
    end
end
dataType = lower(dataType(1));
if(~ismember(dataType, 'ndcs')), error(['Unknown data type: ' dataType]); end

%Discretization properties
if(nargin<3 || isempty(discrProp)), discrProp = ''; end
%lower-case first and remove duplicates after that, so that e.g. 'Uu' is one
%property and not two contradicting ones
discrProp = unique(lower(discrProp));
if(isempty(regexp(discrProp,'[ud]', 'once'))), discrProp = [discrProp 'u']; end %unique tolerance ranges
if(isempty(regexp(discrProp,'[gls]', 'once'))), discrProp = [discrProp 'g']; end %global tolerance ranges
if(isempty(regexp(discrProp,'[nem]', 'once'))), discrProp = [discrProp 'n']; end %fixed number of neighbors
%more than three letters means an unknown letter or two letters from the same group
if(length(discrProp)>3), error(['Unknown discretization property or contradicting properties: ' discrProp]); end
discrProp = sort(discrProp);

%Scaling
if(nargin<4 || isempty(scaling)), scaling = 1; end

%Check the validity of default data type from the user
if(dataTypeNotGiven && nargin<5)
    disp('Data type (check the validity):')
    disp(' ');
    if(dataType == 's'), disp('  s = similar features'); end
    if(dataType == 'd'), disp('  d = dissimilar features (columns), large values in different features are comparable'); end
    if(dataType == 'c'), disp('  c = dissimilar features (columns), small and large values are comparable'); end
    if(dataType == 'n'), disp('  n = nominal data or prediscretized numerical data with similar features (only for reasonable number of different values)'); end
end

%Check the sparsity of the matrix and inform the user
if(nargin<5 &&  issparse(X) && nnz(X)/numel(X)>0.5), disp('Note: the matrix is almost full - making it full may improve the performance.'); end
if(nargin<5 && ~issparse(X) && nnz(X)/numel(X)<0.5), disp('Note: the matrix is quite sparse - making it sparse may improve the performance.'); end

%Perform the discretization for each data type separately
switch dataType    
    case 's' %similar        
        DX = discretizeMatrixWithSimilarFeatures(X,discrProp,scaling);
        
    case 'n' %nominal        
        %replace each value with the index of the value in the sorted list of
        %the different values
        if(iscell(X))            
            [origNominalValues,dump,Y] = unique(X);
            Y = reshape(Y,size(X));
        else
            if(~issparse(X))
                [origNominalValues,dump,Y] = unique(X);
                Y = reshape(Y,size(X));
            else
                %only the stored nonzero elements are indexed, the sparsity pattern is kept
                [i,j,v]=find(X);
                [origNominalValues,dump,w] = unique(v);
                Y = sparse(i,j,w,size(X,1),size(X,2));                                              
            end
        end
        
        DX = discretizeMatrixWithSimilarFeatures(Y,'sun',0); %only equal values in Y are put to same class
        
        DX.dataType = dataType;
        DX.origNominalValues = origNominalValues; 
        DX.discrProp = '';
        DX.scaling = [];
                
    case {'d','c'} %dissimilar
        
        if(~issparse(X))            
            sortedColValues = cell(1,size(X,2));
            for j=1:size(X,2)
                i = ~isnan(X(:,j)); %remove nans
                [sortedColValues{j},rank2element] = sort(X(i,j)); %find column values and element ranks
                element2fixedRanks = fixEqualElementsRanks(sortedColValues{j},rank2element); %ranks in range [1,n], equal rank for equal values
                %a column with a single non-nan element would give 0/0 = nan,
                %so the denominator is kept at least 1 (the element gets the value 0)
                rankSpan = max(1,numel(sortedColValues{j})-1);
                X(i,j) = (element2fixedRanks-1)/rankSpan; %replace elements with their proportional ranks in range [0,1]
            end

            if(dataType=='c') %make small and large values equal, i.e., 0 -> 0, 1 -> 0, 0.5 -> 1
                X = min(2*X,2-2*X);
            end
        else
            sortedColValues = cell(1,size(X,2));
            for j=1:size(X,2)
                i = find(X(:,j)); %find nonzero elements from column j
                i = i(~isnan(X(i,j))); %remove nans
                [sortedColValues{j},rank2element] = sort(X(i,j)); %find column values and element ranks                
                element2fixedRanks = fixEqualElementsRanks(sortedColValues{j},rank2element); %ranks in range [1,n], equal rank for equal values
                %same guard as in the full case: a single element gets the value 1
                %instead of nan; the +1 keeps the stored elements nonzero
                rankSpan = max(1,numel(sortedColValues{j})-1);
                X(i,j) = (element2fixedRanks-1)/rankSpan+1; %replace elements with their proportional ranks in range [1,2]
            end
            
            if(dataType=='c') %make small and large values equal, i.e., 1 -> 1, 2 -> 1, 1.5 -> 2
                X = spfun(@(i) min(2*(i-1)+1,3-2*(i-1)),X);
            end            
        end 
        
        DX = discretizeMatrixWithSimilarFeatures(X,discrProp,scaling);
        
        DX.dataType = dataType;
        DX.sortedColValues = sortedColValues;            
end

end

%X is supposed to contain dissimilar features if the median of each column
%is not contained in the value range of 1/sqrt(d) of the neighboring
%columns in sorted order of min and max values. This guarantees that there
%are enough possible swaps
function t = hasDissimilarFeatures(X,scale)
%Returns true if, after sorting the column medians, maxima and minima
%separately, some median is larger than the maximum that is k positions
%earlier or smaller than the minimum that is k positions later, where
%k = ceil(sqrt(number of columns)/2*scale). The default of scale is 1.
%(this was: prctile(min(X),85) > prctile(max(X),15))

if(nargin < 2), scale = 1; end

%nan-ignoring column statistics, each sorted independently of the others
colMedians = sort(nanmedian(X));
colMaxima  = sort(nanmax(X));
colMinima  = sort(nanmin(X));

%offset between the compared positions in the sorted statistics
shift = ceil(sqrt(size(X,2))/2*scale);

%a median above a maximum 'shift' positions earlier ...
t = any(colMedians(shift+1:end) > colMaxima(1:end-shift));
if(~t)
    %... or a median below a minimum 'shift' positions later
    t = any(colMedians(1:end-shift) < colMinima(shift+1:end));
end

end

function DX = discretizeMatrixWithSimilarFeatures(X,discrProp,scaling)
%Builds the discretization structure DX of X by treating all elements of X
%as values of one common scale.
%
% X         = full or sparse numeric matrix, may contain nans
% discrProp = property letters; the letter of the group [sgl] selects the
%             branch below, the string is also passed on to the helpers
% scaling   = scale factor passed on to the helpers
%
% DX fields:
%   sortedValues       = the elements of X in ascending order (for a sparse X
%                        only the stored nonzero elements), nans at the end
%   pos2elem           = (full X) for each position of X the index in sortedValues
%   elem2row, elem2col = (sparse X) row and column of each sorted element
%   elem2rowIndexRange,
%   elem2colIndexRange = int32 matrices with one row [first last] per sorted element
%   dataType, discrProp, scaling, and for a sparse X also rows and cols

rows = size(X,1); cols = size(X,2);

%Handle sparsity
if(~issparse(X))
    DX = struct('pos2elem',0,'elem2rowIndexRange',0,'elem2colIndexRange',0,'sortedValues',0,'dataType','s','discrProp',discrProp,'scaling',scaling);
    [DX.sortedValues,order] = sort(X(:));
    DX.pos2elem = zeros(size(X),'int32');
    DX.pos2elem(order) = 1:numel(X);
else    
    DX = struct('elem2row',0,'elem2col',0,'elem2rowIndexRange',0,'elem2colIndexRange',0,'sortedValues',0,'dataType','s','discrProp',discrProp,'scaling',scaling,'rows',rows,'cols',cols);
    [i,j,v] = find(X);
    [DX.sortedValues,order] = sort(v);
    DX.elem2row = int32(i(order));
    DX.elem2col = int32(j(order));
end

%Remove nans temporarily (the ascending sort has placed them last)
numNan = sum(isnan(DX.sortedValues));
sortedValuesNonNans = DX.sortedValues(1:end-numNan);
numNonNan = numel(sortedValuesNonNans);

%Perform the discretization, check the globality
switch(regexp(discrProp,'[sgl]','match','once'))
    case 's' %same discretization
        DX.elem2rowIndexRange = globalDiscretization(sortedValuesNonNans,ceil(sqrt(numel(X))),discrProp,scaling);        
        DX.elem2colIndexRange = DX.elem2rowIndexRange;
            
    case 'g' %global discretization
        DX.elem2rowIndexRange = globalDiscretization(sortedValuesNonNans,rows,discrProp,scaling);        
        DX.elem2colIndexRange = globalDiscretization(sortedValuesNonNans,cols,discrProp,scaling);
        
    case 'l' %local discretization
        %globalRanks holds at each position the rank of the element among the
        %non-nan values; zero means that there is no element to discretize
        if(~issparse(X))
            globalRanks = DX.pos2elem;
            globalRanks(globalRanks > numNonNan) = 0; %the nans have the largest ranks
        else
            %The nan elements are left out here as well: their ranks would
            %exceed numel(sortedValuesNonNans) and be used as out-of-range
            %indices in localColumnDiscretization.
            globalRanks = sparse(double(DX.elem2row(1:numNonNan)),double(DX.elem2col(1:numNonNan)),1:numNonNan,rows,cols);
        end            
        
        %the columns of the transpose are the rows of X
        DX.elem2rowIndexRange = localColumnDiscretization(sortedValuesNonNans,globalRanks',discrProp,scaling);        
        DX.elem2colIndexRange = localColumnDiscretization(sortedValuesNonNans,globalRanks,discrProp,scaling);
end

%assign nans to one class
nanRange = repmat([numNonNan+1, numel(DX.sortedValues)],numNan,1);
DX.elem2rowIndexRange = int32([DX.elem2rowIndexRange; nanRange]);    
DX.elem2colIndexRange = int32([DX.elem2colIndexRange; nanRange]);

end



function indexRange = globalDiscretization(sortedValues,samples,discrProp,scaling)
%Calculates for each element of sortedValues an index range [first last]
%in sortedValues.
%
% sortedValues = values in ascending order
% samples      = number of samples (rows or columns) the values are divided into
% discrProp    = property letters: [ud] selects unique or discrete ranges,
%                [nem] selects neighbor, eps or combined sizing
% scaling      = scale factor of the range size
%
% indexRange   = matrix with one row [first last] per element

numElems = numel(sortedValues);

%av number of nnz in each sample = numElems/samples
%appropriate number of classes = sqrt(above)
%the average class size is numElems/classes = sqrt(numElems*samples)
expectedNbrs = scaling*sqrt(numElems*samples); %average/expected number of global neighbors for elements in some sense

rangeKind = regexp(discrProp,'[ud]','match','once');
sizeKind  = regexp(discrProp,'[nem]','match','once');

switch(rangeKind)
    case 'u' %unique tolerance ranges
        switch(sizeKind)
            case 'n' %fixed number of global neighbors
                halfWidth = floor(expectedNbrs/2);
                positions = (1:numElems)';
                indexRange = [max(1,positions - halfWidth), min(numElems,positions + halfWidth)];

                %Equal values would otherwise get different neighborhoods
                %depending on their position in the sorted order.
                indexRange = fixEqualElementsIndexRange(sortedValues,indexRange);

            case 'e' %global tolerance = maximum distance to acceptable elements
                tol = expectedNbrs * median(diff(sortedValues)) / 2;
                indexRange = valueRange2indexRange(sortedValues,[sortedValues-tol, sortedValues+tol]);

            case 'm' %union of the neighbor and the eps ranges
                rangeByEps = globalDiscretization(sortedValues,samples,'ue',scaling);
                rangeByNbr = globalDiscretization(sortedValues,samples,'un',scaling);
                indexRange = [min(rangeByNbr(:,1),rangeByEps(:,1)), max(rangeByNbr(:,2),rangeByEps(:,2))];
        end

    case 'd' %discrete tolerance ranges
        switch(sizeKind)
            case 'n' %discrete classes with equal number of elements = discrete neighbors
                numClasses = round(numElems/expectedNbrs);
                classOfElem = ceil(numClasses*(1:numElems)/numElems);
                %with scaling 0 only the elements of the same class share a range
                indexRange = globalDiscretization(classOfElem,samples,'un',0);

            case 'e' %discrete classes with equal length = discrete eps
                classWidth = expectedNbrs * median(diff(sortedValues));
                valueSpan = sortedValues(end)-sortedValues(1);
                numClasses = round(valueSpan/classWidth);
                classOfElem = min(numElems,max(1,ceil((sortedValues-sortedValues(1))/valueSpan*numClasses)));
                indexRange = globalDiscretization(classOfElem,samples,'un',0);

            case 'm' %union of the discrete neighbor and the discrete eps ranges
                rangeByEps = globalDiscretization(sortedValues,samples,'de',scaling);
                rangeByNbr = globalDiscretization(sortedValues,samples,'dn',scaling);
                indexRange = [min(rangeByNbr(:,1),rangeByEps(:,1)), max(rangeByNbr(:,2),rangeByEps(:,2))];
        end
end

end


function indexRange = localColumnDiscretization(sortedValues,globalRanks,discrProp,scaling)
%Calculates the index ranges so that the elements of each column of
%globalRanks are discretized only against the other elements of that column.
%
% sortedValues = all values in ascending order
% globalRanks  = matrix that has at each position the index of the element in
%                sortedValues, or zero where there is no element
% discrProp    = property letters passed on to globalDiscretization
% scaling      = scale factor passed on to globalDiscretization
%
% indexRange   = matrix with one row [first last] of indices to sortedValues
%                per element of sortedValues

elems = numel(sortedValues);
indexRange = zeros(elems,2);

%discretize each column separately by using global discretization
for j = 1:size(globalRanks,2)
    sortedColRanks = sort(nonzeros(globalRanks(:,j))); %the global column ranks of column j in sorted order    
    if(isempty(sortedColRanks)), continue; end %nothing to discretize in an empty column
    
    sortedColValues = sortedValues(sortedColRanks); %the corresponding values    
    colIndexRange = globalDiscretization(sortedColValues,1,discrProp,scaling); %discretize these values by using the global discretization with one sample, result is local column ranks   
    
    %Transform the obtained column ranks to global ranks. A range that reaches
    %the smallest (largest) element of the column is extended to the global
    %lower (upper) bound. The lower and the upper bounds need separate maps:
    %with a single shared map an upper bound equal to the first column rank
    %would become the global rank 1 and a lower bound equal to the last column
    %rank would become elems, and the range of an element of a short column
    %could exclude the element itself.
    colRank2GlobalLower = sortedColRanks; colRank2GlobalLower(1) = 1;
    colRank2GlobalUpper = sortedColRanks; colRank2GlobalUpper(end) = elems;
    indexRange(sortedColRanks,1) = colRank2GlobalLower(colIndexRange(:,1));
    indexRange(sortedColRanks,2) = colRank2GlobalUpper(colIndexRange(:,2));
end

% If the range ends in the area where there are same values, we
% extend the indexRange to the end of the area with the same values:
indexRange = valueRange2indexRange(sortedValues,sortedValues(indexRange));
end







%Next, there are a few functions for doing small general things


% If there are elements having exactly the same value, we assign to each
% of them the same rank, namely the median of their ranks
function element2fixedRanks = fixEqualElementsRanks(sortedValues,rank2element)
% sortedValues       = values in sorted order
% rank2element       = for each sorted position the index of the original element
% element2fixedRanks = for each original element its rank; the elements of a
%                      run of equal values all get the mean of the first and
%                      the last position of the run
element2fixedRanks = zeros(size(rank2element));
total = numel(sortedValues);

runStart = 1; %first sorted position of the current run of equal values
for pos = 1:total
    if pos < total && sortedValues(runStart) == sortedValues(pos+1)
        continue; %the next value still belongs to the current run
    end
    %pos is the last position of the run
    element2fixedRanks(rank2element(runStart:pos)) = (runStart+pos)/2;
    runStart = pos+1;
end
end


% If there are elements having exactly the same value, the same values will
% have different neighborhoods. We need to do two things to fix this: 
function indexRange = fixEqualElementsIndexRange(sortedValues,indexRange)
% sortedValues = values in ascending order
% indexRange   = one row [first last] of indices to sortedValues per element;
%                returned with equal rows for equal values

% 1. A range that ends inside a run of equal values is extended to cover
% the whole run, by going through the values at the range ends:
indexRange = valueRange2indexRange(sortedValues,sortedValues(indexRange));

% 2. All elements of a run of equal values get the lower bound of the
% lower middle element and the upper bound of the upper middle element
% of the run
total = numel(sortedValues);
runStart = 1; %first position of the current run of equal values
for pos = 1:total
    if pos < total && sortedValues(runStart) == sortedValues(pos+1)
        continue; %the next value still belongs to the current run
    end
    %pos is the last position of the run
    middle = (runStart+pos)/2;
    indexRange(runStart:pos,1) = indexRange(floor(middle),1);
    indexRange(runStart:pos,2) = indexRange(ceil(middle),2);
    runStart = pos+1;
end
end


function indexRange = valueRange2indexRange(sortedValues,valueRange)
%Transforms value ranges to index ranges of sortedValues.
%
% sortedValues = values in ascending order
% valueRange   = matrix with one row [low high] per range
%
% indexRange   = matrix with one row [first last] per range, where
%                first = smallest index with sortedValues(first) >= low
%                        (the last index, if there is no such value)
%                last  = largest index with sortedValues(last) <= high
%                        (the index 1, if there is no such value)
%
%The number of ranges may differ from the number of values.
%NOTE: sortedValues is presumably never empty when valueRange has rows; in
%that case the returned lower bounds are 0 and the upper bounds are 1.

numRanges = size(valueRange,1);
numValues = numel(sortedValues);
indexRange = zeros(numRanges,2);

%lower bounds: the bounds are handled in descending order, so the index j
%only moves downwards and both sweeps are linear after the sorting
[lowBound,order] = sort(valueRange(:,1));
j = numValues;
for i = numRanges:-1:1
    while j>1 && lowBound(i) <= sortedValues(j-1)
        j = j-1;
    end    
    indexRange(order(i),1) = j;
end    

%upper bounds: the bounds are handled in ascending order, the index j only
%moves upwards
[upBound,order] = sort(valueRange(:,2));
j = 1;
for i = 1:numRanges
    while j<numValues && upBound(i) >= sortedValues(j+1)
        j = j+1;
    end    
    indexRange(order(i),2) = j;
end 
end


function xValueRange = yValueRange2xValueRange(x,f,yValueRange)
% x           = row vector of n values (it is padded below with -inf and inf)
% f           = read as f(i) for i = 1..n and compared with the y-bounds
% yValueRange = matrix with n rows [low high]
%
% xValueRange = matrix with n rows; each entry is an element of the padded x
%               that is selected by the two sweeps below
%NOTE(review): both sweeps move the index in one direction only, which
%presumably expects f and the y-bounds to be non-decreasing - confirm
count = numel(x);
paddedY = yValueRange([1 1:count count],:); %the first and the last row are repeated
paddedX = [-inf x inf];
xValueRange = zeros(count,2);

%upper y-bound -> lower x-bound
ptr = count+2;
for k = count:-1:1
    %move down while the upper y-bound of the previous row is at least f(k)
    while(ptr>1 && paddedY(ptr-1,2) >= f(k))
        ptr = ptr-1;
    end
    xValueRange(k,1) = paddedX(ptr);
end

%lower y-bound -> upper x-bound
ptr = 1;
for k = 1:count
    %move up while the lower y-bound of the next row is at most f(k)
    while(ptr<count+2 && paddedY(ptr+1,1) <= f(k))
        ptr = ptr+1;
    end
    xValueRange(k,2) = paddedX(ptr);
end
end
    
