# Load data
paleo <- read.table("paleo.txt");

# Define functions
## General functions
support <- function(data, itemset) {
  sum(apply(data[,itemset],1,prod))/nrow(data)
}

support.items <- function(data, itemset) {
  supp <- as.array(apply(data[,itemset],2,sum)/nrow(data));
  dimnames(supp) <- NULL;
  supp
}

lift <- function(data, itemset) {
  num <- support(data, itemset);
  den <- prod(support.items(data, itemset));
  num/den
}

## Mining closed patterns with positive lift
frequent.patterns <- function(data, minsup) {
  # Init
  frequent <- list();
  nfreq <- 0;
  # Level-1
  L1.cand <- 1:ncol(data);
  L1.support <- support.items(data, L1.cand);
  L1 <- L1.cand[L1.support >= minsup];
  L1.support <- L1.support[L1.support >= minsup];
  # Store Level-1
  if(length(L1) > 0) {
    Lprev <- as.list(L1);
  } else {
    Lprev <- list();
  }
  # Level-2 and up
  while(length(Lprev) > 0) {
    Lnext <- generate.candidates(Lprev, data);
    Lnext.support <- matrix(0,length(Lnext),1);
    if(length(Lnext) > 0) {
      for (i in 1:length(Lnext)) {
        Lnext.support[i] <- support(data, Lnext[[i]]);
      }
      Lnext <- Lnext[Lnext.support >= minsup];
      Lnext.support <- Lnext.support[Lnext.support >= minsup];
    }
    # Store Next Level 
    if(length(Lnext) > 0) {
      for (i in 1:length(Lnext)) {
        X <- list();
        X[[1]] <- Lnext[[i]];
        X[[2]] <- Lnext.support[i];
        nfreq <- nfreq + 1;
        frequent[[nfreq]] <- X;
      }
      Lprev <- Lnext;
    } else {
      Lprev <- list();
    }
  }
  # Return
  frequent
}

generate.candidates <- function(itemsets, data) {
  size <- length(itemsets[[1]]); # It is assumed all given itemsets have equal size
  nvars <- ncol(data);
  new <- list();
  ncand <- 0;
  for (i in 1:length(itemsets)) {
    set <- itemsets[[i]];
    if(set[size] < nvars) {
      for (j in {set[size]+1}:nvars) {
        ncand <- ncand + 1;
        new[[ncand]] <- c(set,j);
      }
    }
  }
  new
}

prune.closed <- function(frequent) {
  prune <- logical(length(frequent));
  for (i in 2:length(frequent)) {
    itemsupp.i <- frequent[[i]];
    itemset.i <- itemsupp.i[[1]];
    supp.i <- itemsupp.i[[2]];
    for (j in 1:{i-1}) {
      itemsupp.j <- frequent[[j]];
      itemset.j <- itemsupp.j[[1]];
      supp.j <- itemsupp.j[[2]];
      if(supp.i == supp.j) {
        if(length(intersect(itemset.i,itemset.j))==length(itemset.j)) {
          prune[i] <- TRUE;
        }
      }
    }
  }
  frequent[!prune]
}

closed.patterns <- function(data, minsup) {
  frequent <- frequent.patterns(data, minsup);
  closed <- prune.closed(frequent);
  closed
}

input.patterns <- function(data, minsup) {
  # Mine closed frequent patterns
  closed <- closed.patterns(data, minsup);
  # Select only patterns with positive lift
  poslift <- logical(length=length(closed));
  for (i in 1:length(closed)) {
    itemsup <- closed[[i]];
    itemset <- itemsup[[1]];
    poslift[i] <- lift(data, itemset) >= 1.0;
  }
  # Return
  closed[poslift]
}

flatten.patterns <- function(patterns) {
  flat <- list();
  for (i in 1:length(patterns)) {
    itemsupp <- patterns[[i]];
    flat[[i]] <- itemsupp[[1]];
  }
  flat
}

## Test statistic and randomization
test.statistic <- function(data, itemsets) {
  T = 0;
  for (i in 1:length(itemsets)) {
    T = T + lift(data,itemsets[[i]]);
  }
  T
}

frequency.dist <- function(data1, data2, constraints) {
  dist <- 0;
  if(length(constraints) > 0) {
    for (i in 1:length(constraints)) {
      supp1 <- support(data1, constraints[[i]]);
      supp2 <- support(data2, constraints[[i]]);
      dist <- dist + abs(supp1 - supp2);
    }
  }
  dist
}

find.ones <- function(data) {
  index <- matrix(0,sum(data),2);
  n <- 0;
  for (i in 1:nrow(data)) {
    for (j in 1:ncol(data)) {
      if(data[i,j] == 1) {
        n <- n + 1;
        index[n,1] <- i;
        index[n,2] <- j;
      }
    }
  }
  index
}

index.constraints <- function(constraints, ncolumns) {
  index <- list();
  for (i in 1:ncolumns) {
    index[[i]] <- vector(mode = "integer");
  }
  if(length(constraints) > 0) {
    for (i in 1:length(constraints)) {
      cnstr <- constraints[[i]];
      for (j in 1:length(cnstr)) {
        column <- cnstr[j];
        index[[column]] <- c(index[[column]], i);
      }
    }
  }
  index
}

random.data <- function(data, index, constraints, supplist, constraints.index, nswaps, w) {
  # Uses Itemset-Swap as introduced in Hanhijarvi et al. (KDD 2009).
  # Init
  data.rand <- data;
  index.rand <- index;
  supplist.rand <- supplist;
  nones <- nrow(index.rand);
  # Do nswaps iterations
  for (i in 1:nswaps) {
    # Pick ones
    ones <- sample(nones,2,replace=TRUE);
    r1 <- index.rand[ones[1],1];
    c1 <- index.rand[ones[1],2];
    r2 <- index.rand[ones[2],1];
    c2 <- index.rand[ones[2],2];
    # Check if 'cross' indices are zeros
    if(data.rand[r1,c2] == 0 & data.rand[r2,c1] == 0) {
      # Swap ones and zeros
      data.rand[r1,c1] <- 0;
      data.rand[r2,c2] <- 0;
      data.rand[r1,c2] <- 1;
      data.rand[r2,c1] <- 1;
      # Accept if distance is less or equal, or with some probability
      freqdist.change <- 0;
      supplist.new <- supplist.rand;
      if(length(constraints) > 0) {
        columns <- c(c1,c2);
        relevant.constraints <- union(constraints.index[[c1]],constraints.index[[c2]]);
        if(length(relevant.constraints) > 0) {
          for (i in relevant.constraints) {
            cnstr.all <- constraints[[i]];
            cnstr.changed <- intersect(cnstr.all,columns);
            if(length(cnstr.changed) == 1) { # If both columns are in the constraint, it cannot support the constraint
              cnstr.other <- setdiff(cnstr.all,cnstr.changed);
              # Check if other columns in constraint are supported, in this case we have change of support
              supp.r1 <- prod(data.rand[r1,cnstr.other]) == 1;
              supp.r2 <- prod(data.rand[r2,cnstr.other]) == 1;
              if(supp.r1 & !supp.r2) {
                if(cnstr.changed == c1) {
                  # Support of this itemset is decreased by one
                  supplist.new[i] <- supplist.new[i] - 1;
                  if(supplist.rand[i] < supplist[i]) {
                    freqdist.change <- freqdist.change + 1;
                  } else {
                    freqdist.change <- freqdist.change - 1;
                  }
                } else {
                  # Support of this itemset is increased by one
                  supplist.new[i] <- supplist.new[i] + 1;
                  if(supplist.rand[i] > supplist[i]) {
                    freqdist.change <- freqdist.change + 1;
                  } else {
                    freqdist.change <- freqdist.change - 1;
                  }
                }
              } else if(supp.r2 & !supp.r1) {
                if(cnstr.changed == c1) {
                  # Support of this itemset is increased by one
                  supplist.new[i] <- supplist.new[i] + 1;
                  if(supplist.rand[i] > supplist[i]) {
                    freqdist.change <- freqdist.change + 1;
                  } else {
                    freqdist.change <- freqdist.change - 1;
                  }
                } else {
                  # Support of this itemset is decreased by one
                  supplist.new[i] <- supplist.new[i] - 1;
                  if(supplist.rand[i] < supplist[i]) {
                    freqdist.change <- freqdist.change + 1;
                  } else {
                    freqdist.change <- freqdist.change - 1;
                  }
                }
              }
            }
          }
        }
      }
      dist.difference <- exp(-w*{freqdist.change});
      if(runif(1) < dist.difference) {
        # Store
        index.rand[ones[1],2] <- c2;
        index.rand[ones[2],2] <- c1;
        supplist.rand <- supplist.new;
      }
      else {
        # Undo
        data.rand[r1,c1] <- 1;
        data.rand[r2,c2] <- 1;
        data.rand[r1,c2] <- 0;
        data.rand[r2,c1] <- 0;
      }
    }
  }
  # Return
  data.rand
}
  
p.value <- function(data, patterns, constraints, nrand, nswaps, w) {
  t.input = test.statistic(data, patterns);
  t.rand = array(0,c(nrand,1));
  index <- find.ones(data);
  supplist <- c();
  if(length(constraints) > 0) {
    for (i in 1:length(constraints)) {
      supplist <- c(supplist, support(data,constraints[[i]]));
    }
  }
  constraints.index <- index.constraints(constraints, ncol(data));
  for (i in 1:nrand) {
    data.rand <- random.data(data,index,constraints,supplist,constraints.index,nswaps,w);
    t.rand[i] <- test.statistic(data.rand, patterns);
  }
  pnorm(-t.input,-mean(t.rand),sd(t.rand),lower.tail=TRUE,log=TRUE)
}

find.constraints <- function(data, patterns, constraints.possible, constraints.impose, nrand, nswaps, w) {
  p.best <- -Inf;
  index.best <- 0;
  for (i in constraints.possible) {
    constraints <- patterns[c(constraints.impose, i)];
    p <- p.value(data,patterns,constraints,nrand,nswaps,w);
    if(p > p.best) {
      index.best <- i;
      p.best <- p;
    }
  }
  c(index.best, p.best)
}

# Compute set of patterns/constraints
patterns.full <- input.patterns(paleo,0.1);
patterns <- flatten.patterns(patterns.full);

## Run experiment
# Params
nrand <- 100;
nswaps <- 8000;
w <- 4; # As in Hanhijarvi et al. (KDD 2009)
k <- 118;

# Init
X <- as.matrix(paleo);
results <- matrix(0,k+1,2);
results[1,1] <- -1; # To make clear no constraint is used initially
results[1,2] <- p.value(X, patterns, list(), nrand, nswaps, w); # Initial p-value
constraints.imposed <- c();
constraints.possible <- 1:length(patterns);

# Run iterative algorithm
t1 = proc.time();
print(results[1,]);
for (i in 1:k) {
  results[i+1,] <- find.constraints(X, patterns, constraints.possible, constraints.imposed, nrand, nswaps, w);
  print(results[i+1,]);
  constraints.imposed <- c(constraints.imposed, results[i+1,1]);
  print(constraints.imposed);
  constraints.possible <- setdiff(constraints.possible, results[i+1,1]);
}
t2 = proc.time();

# Save data
save.image("/users/lijffijt/Pattern/p/src/sigset11/results118.RData")

# Load data
load("/users/lijffijt/Pattern/p/src/sigset11/results118.RData")

## Analyze results
# Create extended results matrix
results.extended <- matrix(0,119,4);
results.extended[,1:2] <- results;
for (i in 1:119) {
  results.extended[i,2] <- log10(exp(results[i,2]));
  if(i > 1) {
    itemset_i <- patterns[[results[i,1]]];
    results.extended[i,3] <- support(X, itemset_i);
    results.extended[i,4] <- lift(X, itemset_i);
  }
}
rm(i,itemset_i)
results.extended

# Write to LaTeX table
library(xtable)
Table <- matrix(0,10,8);
Table[1:10,1:4] <- results.extended[2:11,];
Table[1:10,5:8] <- results.extended[12:21,];
xtable(Table)
rm(Table);

# Plot p-values vs. constraints
plot(0:118,10^results.extended[,2],type="b",xlab="Number of constraints",ylab="p-value",log="y",yaxt="n");
axis(2,10^c(-300,-200,-100,0));

