// Copyright 2005,2006 Jouni K. Seppnen          -*- coding: iso-8859-1 -*-
// Distributed under the Boost Software License, Version 1.0.
// See accompanying file LICENSE.

#define VERSION "1.0"

#include "dense.h"
#include "util.h"

#include <cassert>
#include <cmath>
#include <cstdlib>

#include <algorithm>
#include <functional>
#include <fstream>
#include <set>
#include <sstream>

#include <boost/detail/algorithm.hpp>
#include <boost/lambda/bind.hpp>
#include <boost/lambda/construct.hpp>
#include <boost/lambda/lambda.hpp>

namespace dense {

using std::adjacent_find;
using std::cerr;
using std::map;
using std::min;
using std::ostream;
using std::set;
using std::string;
using std::vector;

using boost::is_sorted;
using boost::lambda::_1;
using boost::lambda::_2;
using boost::lambda::var;
using boost::lambda::bind;
using boost::lambda::delete_array;


// Printable view of a candidate itemset: a pointer to `level` item
// indices. The struct does not own the array it points to.
struct ct {
  ulong *candidate;  // first of `level` item indices
  int level;         // number of indices readable through `candidate`
  ct(ulong *candidate_, int level_) 
    : candidate(candidate_), level(level_) {}
};

// Writes the item indices of `candidate` separated by single spaces.
// No separator is written before the first or after the last element
// (the previous separator test compared against `level` instead of
// `level-1`, so a trailing space was always emitted). Returns `output`.
std::ostream &operator<<(std::ostream &output, const ct &candidate)
{
  for (int i = 0; i < candidate.level; i++) {
    if (i > 0)
      output << ' ';
    output << candidate.candidate[i];
  }
  return output;
}

// Strict-weak ordering on candidate itemsets stored as arrays of
// `level` item indices: lexicographic comparison of the first `level`
// elements.
struct candidateLessThan {
  int level;
  candidateLessThan(int level_) : level(level_) {}

  inline bool 
  operator()(const ulong *first, const ulong *second) const
  {
    // Skip the common prefix, then decide on the first differing slot.
    int pos = 0;
    while (pos < level && first[pos] == second[pos])
      ++pos;
    return pos < level && first[pos] < second[pos];
  }

};

// Equality predicate on candidate itemsets stored as arrays of
// `level` item indices: true when the first `level` elements agree.
struct candidateEqual {
  int level;
  candidateEqual(int level_) : level(level_) {}

  inline bool 
  operator()(const ulong *first, const ulong *second) const
  {
    // Advance while the arrays agree; equal iff the whole prefix matched.
    int pos = 0;
    while (pos < level && first[pos] == second[pos])
      ++pos;
    return pos >= level;
  }

};


ulong DenseItemsetSearch::findOrMakeMapping(const string &word)
{
  using std::make_pair;
  map<string, ulong>::iterator where(wordMapping.find(word));
  if(where == wordMapping.end()) {
    where = wordMapping.insert(make_pair(word, nextWord++)).first;
    words.push_back(word);
    ulong *s = new ulong[1];
    s[0] = 0;
    istat.push_back(s);
  }
  return where->second;
}

void DenseItemsetSearch::scan1(void)
{
  using std::ifstream;
  using std::stringstream;

  ifstream in(inFilename.c_str());
  string line;
  for(getline(in, line); !in.eof(); getline(in, line)) {
    dataSize++;
    stringstream ss(line);
    string word;
    set<ulong> tuple;
    while (ss >> word)
      tuple.insert(findOrMakeMapping(word));
    for_each(tuple.begin(), tuple.end(), var(istat)[_1][0]++);
  }
  in.close();
}

// Level-1 pruning. Sets tupleThreshold = ceil(sigma*dataSize) and keeps
// every item whose count istat[i][0] reaches ceil(tupleThreshold*delta).
// For each kept item one line "<weak density> <word>" is written to
// `output`, where the weak density is count/tupleThreshold capped at 1.
// Counters of dropped items are freed; istat, words and wordMapping are
// replaced by compacted versions in which kept items are renumbered in
// their original order.
void DenseItemsetSearch::pruneAndPrint1(ostream &output)
{
  using std::ceil;

  tupleThreshold = (ulong)ceil(sigma*dataSize);
  ulong threshold = (ulong)ceil(tupleThreshold*delta);

  vector<ulong*> keptStat;
  vector<string> keptWords;
  map<string, ulong> keptMapping;

  for (vector<ulong*>::size_type idx = 0; idx < istat.size(); ++idx) {
    ulong *counter = istat[idx];
    if (counter[0] < threshold) {
      // Not dense, hence cannot belong to any dense set: forget it.
      delete[] counter;
      continue;
    }
    double weakDensity = min(1.0, (double)counter[0]/tupleThreshold);
    output << weakDensity << ' ' << words[idx] << '\n';
    keptStat.push_back(counter);
    keptWords.push_back(words[idx]);
    // The new index is the position just appended to keptWords.
    keptMapping.insert(make_pair(words[idx], keptWords.size()-1));
  }

  istat.swap(keptStat);
  words.swap(keptWords);
  wordMapping.swap(keptMapping);
}

void DenseItemsetSearch::makeCandidates2(void)
{
  ulong nfreq = istat.size();

  candidates.resize(nfreq*(nfreq-1)/2, NULL);
  ulong idx=0;
  for (ulong i=0; i<nfreq; i++)
    for (ulong j=i+1; j<nfreq; j++) {
      ulong *c = new ulong[2];
      c[0] = i; c[1] = j;
      candidates[idx++] = c;
    }
  assert(idx==nfreq*(nfreq-1)/2);

  assert(is_sorted(candidates.begin(), candidates.end(),
		   candidateLessThan(2)));
  assert(adjacent_find(candidates.begin(), candidates.end(),
		       candidateEqual(2))
	 == candidates.end());

  level = 2;

  for_each(istat.begin(), istat.end(), bind(delete_array(), _1));
  istat = vector<ulong*>(idx);
  for (ulong i = 0; i < idx; i++) {
    ulong *s = new ulong[3];
    s[0] = s[1] = s[2] = 0;
    istat[i] = s;
  }
}

void DenseItemsetSearch::scan(void)
{
  std::ifstream in(inFilename.c_str());
  string line;

  for (getline(in, line); !in.eof(); getline(in, line)) {
    std::stringstream ss(line);
    string word;
    set<ulong> tuple;
    while (ss >> word) {
      map<string, ulong>::iterator mapping(wordMapping.find(word));
      if(mapping != wordMapping.end())
	tuple.insert(mapping->second);
    }
    bool debug = false;
    for (ulong i = 0; i < candidates.size(); i++) {
      int size = intersectionSize(tuple.begin(), tuple.end(), 
				  candidates[i], candidates[i]+level);
      istat[i][size]++;
    }
  }
  in.close();
}

// Prunes the current candidates by weak density. For each candidate,
// istat[i][j] is read as the number of tuples counted under index j.
// Walking j from `level` down to 0, tuples and j-weighted item counts
// are accumulated until more than tupleThreshold tuples are covered;
// the surplus tuples are then removed again at weight j. The weak
// density is items / tupleThreshold / level, capped at 1. Candidates
// with density >= delta are printed as "<density> <word> ..." and kept;
// the others are freed.
void DenseItemsetSearch::pruneAndPrint(ostream &output)
{
  vector<ulong*> survivors;
  for (ulong c = 0; c < candidates.size(); ++c) {
    ulong items = 0, tuples = 0;
    int j = level;
    while (j >= 0) {
      ulong count = istat[c][j];
      tuples += count;
      items += j * count;
      if (tuples > tupleThreshold) {
        // Only the first tupleThreshold tuples may contribute.
        assert(items >= j * (tuples-tupleThreshold));
        items -= j * (tuples-tupleThreshold);
        break;
      }
      --j;
    }

    double weakDensity = min(1.0, (double)items/tupleThreshold/level);
    if (!(weakDensity >= delta)) {
      delete[] candidates[c];
      continue;
    }

    output << weakDensity;
    for (int pos = 0; pos < level; ++pos)
      output << ' ' << words[candidates[c][pos]];
    output << '\n';
    survivors.push_back(candidates[c]);
  }
  candidates.swap(survivors);
}

// Generates the candidates of size level+1 from the current (sorted,
// duplicate-free) candidates of size `level`, then increments `level`
// and replaces the istat arrays by one zeroed array of level+1 counters
// per new candidate. The old candidate arrays and the old istat arrays
// are freed.
void DenseItemsetSearch::makeCandidates(void)
{
  using std::copy;
  using std::binary_search;

  // Precondition: binary_search below relies on this ordering.
  assert(is_sorted(candidates.begin(), candidates.end(),
		   candidateLessThan(level)));
  assert(adjacent_find(candidates.begin(), candidates.end(),
		       candidateEqual(level))
	 == candidates.end());

  // Compute new candidates
  vector<ulong*> newCandidates;
  vector<ulong*>::iterator i = candidates.begin(), j, k, l;
  // Scratch buffers: cand holds the level+1 items of the set being
  // assembled, subset holds one of its size-`level` subsets.
  ulong *cand = new ulong[level+1], *subset = new ulong[level];
  while (i != candidates.end()) {
    // Extend j past every candidate sharing the first level-1 items of *i.
    j = i+1;
    while (j != candidates.end() && std::equal(*i, *i + level-1, *j))
      j++;
    // Now the interval candidates[i,j) has sets differing only in
    // their last element; all unions of them are possible new
    // candidates.
    if (j == i+1) {
      // A group of one cannot be joined with anything.
      i++;
      continue;
    }
    // The shared prefix is the same for every pair in the group.
    copy(*i, *i + level-1, cand);
    for(k = i; k != j-1; k++)
      for(l = k+1; l != j; l++) {
	// cand = shared prefix + last item of *k + last item of *l.
	cand[level-1] = (*k)[level-1];
	cand[level] = (*l)[level-1];

	// Check if all subsets of size level are existing
	// candidates. Don't need to check *k and *l.
	// subset starts as cand without cand[0]; after each test
	// subset[m] = cand[m] turns it into cand without cand[m+1].
	// Iteration m therefore tests cand without cand[m], for
	// m = 0..level-1. The subset without cand[level] (which is *k)
	// is never tested; the last iteration tests the subset equal
	// to *l, which is redundant but harmless.
	copy(cand+1, cand+level+1, subset);
	bool ok = true;
	for (int m = 0; m < level; m++) {
	  if (!binary_search(candidates.begin(), candidates.end(),
			     subset, candidateLessThan(level))) {
	    ok = false;
	    break;
	  }
	  subset[m] = cand[m];
	}
	if (ok) {
	  // Store a private copy; cand is reused for the next pair.
	  ulong *nc = new ulong[level+1];
	  copy(cand, cand+level+1, nc);
	  newCandidates.push_back(nc);
	}
      }
    i = j;
  }
  delete[] cand;
  delete[] subset;

  // Destroy the old candidates and replace by the new ones.
  for_each(candidates.begin(), candidates.end(), 
	   bind(delete_array(), _1));
  candidates.swap(newCandidates);

  level++;

  // The new candidates are expected to be sorted and duplicate-free
  // as generated.
  assert(is_sorted(candidates.begin(), candidates.end(),
		   candidateLessThan(level)));
  assert(adjacent_find(candidates.begin(), candidates.end(),
		       candidateEqual(level))
	 == candidates.end());

  // Finally reinitialize the istat arrays.
  // Each new array has level+1 counters, indices 0..level.
  for_each(istat.begin(), istat.end(), bind(delete_array(), _1));
  istat = vector<ulong*>(candidates.size());
  for (ulong i = 0; i < candidates.size(); i++) {
    ulong *s = new ulong[level+1];
    for (int j = 0; j < level+1; j++)
      s[j] = 0;
    istat[i] = s;
  }

}

// Drives the whole search. Level 1 is handled by scan1 and
// pruneAndPrint1, the level-2 candidates come from makeCandidates2, and
// then scan / pruneAndPrint / makeCandidates repeat for as long as
// `candidates` is non-empty. Result lines go to `output`, progress
// messages to `status`.
void DenseItemsetSearch::run(ostream &output, ostream &status)
{
  status << "Reading input...";
  scan1();

  status << "done.\nPruning items...";
  pruneAndPrint1(output);

  status << "done.\nBuilding level 2 candidates...";
  makeCandidates2();

  for (;;) {
    if (candidates.empty())
      break;

    status << candidates.size() << " candidates.\nReading input...";
    scan();

    status << "done.\nPruning itemsets...";
    pruneAndPrint(output);

    status << candidates.size() << " dense itemsets.\n";
    status << "Building level " << level+1 << " candidates...";
    makeCandidates();
  }

  status << "no candidates. Stopping.\n";
}

}

// Entry point: dense filename sigma delta
//
// With exactly three arguments, runs the search on `filename`, writing
// results to std::cout and progress to std::cerr, and returns
// EXIT_SUCCESS. With any other argument count, prints the usage text to
// std::cerr and returns EXIT_FAILURE (previously 0, which made a usage
// error look like success to calling scripts).
//
// sigma and delta are parsed with strtod and not validated: text that
// is not a number yields 0.0.
int main(int argc, char **argv)
{
  if (argc != 4) {
    std::cerr << "Usage: dense filename sigma delta\n"
              << "\n"
              << "dense.cc version " << VERSION 
              << ", Copyright (C) 2005,2006 Jouni K. Seppnen\n"
              << "This program comes with ABSOLUTELY NO WARRANTY.\n"
              << "This is free software, and you are welcome to redistribute it\n"
              << "under certain conditions. See the file LICENSE for details\n"
              << "on the lack of warranty and your right to redistribute.\n";
    return EXIT_FAILURE;
  }

  dense::DenseItemsetSearch 
    D(argv[1], strtod(argv[2], NULL), strtod(argv[3], NULL));
  // filename,       sigma,                  delta
  D.run(std::cout, std::cerr);
  return EXIT_SUCCESS;
}
