types.c

Go to the documentation of this file.
00001 /*
00002 Type and hapax accumulation curves
00003 Copyright (C) 2007  Jukka Suomela
00004 
00005 This program is free software; you can redistribute it and/or
00006 modify it under the terms of the GNU General Public License
00007 as published by the Free Software Foundation; either version 2
00008 of the License, or (at your option) any later version.
00009 
00010 This program is distributed in the hope that it will be useful,
00011 but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 GNU General Public License for more details.
00014 
00015 You should have received a copy of the GNU General Public License
00016 along with this program; if not, write to the Free Software
00017 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
00018 
00019 To contact the author, email jukka.suomela@cs.helsinki.fi
00020 or see http://www.cs.helsinki.fi/jukka.suomela/ for further
00021 details. See http://www.cs.helsinki.fi/jukka.suomela/types/ for
00022 more information on this program.
00023 */
00024 
00036 // #define NDEBUG
00037 #include <assert.h>
00038 #include <errno.h>
00039 #include <limits.h>
00040 #include <math.h>
00041 #include <stdarg.h>
00042 #include <stdbool.h>
00043 #include <stdint.h>
00044 #include <stdio.h>
00045 #include <stdlib.h>
00046 #include <string.h>
00047 
00050 
00052 #define VERSION "2007-05-15"
00053 
00055 #define YEAR "2007"
00056 
00060 
00062 
00063 #define LEVELS_LIST 0.0001, 0.001, 0.01, 0.05, 0.10
00064 
00069 
00070 #ifndef WORD_BITS
00072 
00073 #  define WORD_BITS 32
00074 #endif
00075 
00076 #ifndef BITCOUNT_BITS
00078 
00079 #  define BITCOUNT_BITS 11
00080 #endif
00081 
00083 
00084 typedef unsigned bitcount_t;
00085 
00089 
00090 #if WORD_BITS == 32
00092 typedef uint_fast32_t word_t;
00094 #  define WORD_ONE UINT32_C(1)
00096 #  define MSB_SHIFT 5
00097 #elif WORD_BITS == 64
00098 typedef uint_fast64_t word_t;
00099 #  define WORD_ONE UINT64_C(1)
00100 #  define MSB_SHIFT 6
00101 #else
00102 #  error "Unsupported WORD_BITS"
00103 #endif
00104 
00106 #define LSB_MASK ((WORD_ONE << MSB_SHIFT) - WORD_ONE)
00107 
00109 
00110 inline static size_t
00111 get_msb_index(size_t i)
00112 {
00113     return i >> MSB_SHIFT;
00114 }
00115 
00117 
00118 inline static word_t
00119 get_lsb_bit(size_t i)
00120 {
00121     return WORD_ONE << (i & LSB_MASK);
00122 }
00123 
00128 
00130 #define STRINGIFY2(x) #x
00132 #define STRINGIFY(x) STRINGIFY2(x)
00133 
00135 #define MIN(x, y) ((x) < (y) ? (x) : (y))
00137 #define MAX(x, y) ((x) > (y) ? (x) : (y))
00138 
00143 
00145 #define MYMALLOC(target, type, count) \
00146     do { \
00147         type *my_result = mymalloc(size_multiply(sizeof(type), count)); \
00148         target = my_result; \
00149     } while (0)
00150 
00152 #define MYMALLOCZ(target, type, count, init) \
00153     do { \
00154         size_t my_count = count; \
00155         type *my_result = mymalloc(size_multiply(sizeof(type), my_count)); \
00156         for (size_t my_i = 0; my_i < my_count; my_i++) { \
00157             my_result[my_i] = init; \
00158         } \
00159         target = my_result; \
00160     } while (0)
00161 
00163 #define MYMALLOC2Z(target, type, count1, count2, init) \
00164     MYMALLOCZ(target, type, size_multiply(count1, count2), init)
00165 
00167 static size_t
00168 size_multiply(size_t a, size_t b)
00169 {
00170     size_t x = a;
00171     x *= b;
00172     if (x / b != a) {
00173         fprintf(stderr, "size overflow (%zu * %zu > %zu)\n", a, b, SIZE_MAX);
00174         exit(EXIT_FAILURE);
00175     }
00176     return x;
00177 }
00178 
00180 static void *
00181 mymalloc(size_t s)
00182 {
00183     void *p = malloc(s);
00184     if (p == NULL) {
00185         fprintf(stderr, "malloc failed (%zu bytes)\n", s);
00186         exit(EXIT_FAILURE);
00187     }
00188     return p;
00189 }
00190 
00194 
00196 
00197 static void
00198 myfscanf(int expected, const char *description, FILE *stream, const char *format, ...)
00199 {
00200     va_list ap;
00201     va_start(ap, format);
00202     int result = vfscanf(stream, format, ap);
00203     va_end(ap);
00204 
00205     if (expected != result) {
00206         if (result == EOF) {
00207             fprintf(stderr, "%s: expected %d fields, got EOF: %s\n", description, expected, format);
00208         } else if (expected == EOF) {
00209             fprintf(stderr, "%s: expected EOF, got %d fields: %s\n", description, result, format);
00210         } else {
00211             fprintf(stderr, "%s: expected %d fields, got %d fields: %s\n", description, expected, result, format);
00212         }
00213         exit(EXIT_FAILURE);
00214     }
00215 }
00216 
00218 static void
00219 bad_uint(const char *s)
00220 {
00221     fprintf(stderr, "not a valid unsigned integer: %s\n", s);
00222     exit(EXIT_FAILURE);
00223 }
00224 
00226 static unsigned
00227 get_uint(const char *s)
00228 {
00229     if (*s == '\0') {
00230         bad_uint(s);
00231     }
00232     char *pend;
00233     errno = 0;
00234     unsigned long v = strtoul(s, &pend, 10);
00235     if (*pend != '\0') {
00236         bad_uint(s);
00237     }
00238     if (v == ULONG_MAX && errno == ERANGE) {
00239         bad_uint(s);
00240     }
00241 #if UINT_MAX < ULONG_MAX
00242     if (v > UINT_MAX) {
00243         bad_uint(s);
00244     }
00245 #endif
00246     return (unsigned int)v;
00247 }
00248 
00253 
00255 #define BUILTIN_RNG_BITS 15
00256 
00258 #define BUILTIN_RNG_MAX ((1U << BUILTIN_RNG_BITS) - 1)
00259 
00261 #define BUILTIN_RNG_LARGE_BITS (BUILTIN_RNG_BITS * 2)
00262 
00264 #define BUILTIN_RNG_LARGE_MAX ((1U << BUILTIN_RNG_LARGE_BITS) - 1)
00265 
00267 typedef uint_fast32_t builtin_rng_state_t;
00268 
00270 inline static void
00271 builtin_rng_init(builtin_rng_state_t *state)
00272 {
00273     *state = 1;
00274 }
00275 
00277 
00278 inline static uint_fast16_t
00279 builtin_rng(builtin_rng_state_t *state)
00280 {
00281     *state = *state * 1103515245 + 12345;
00282     return (*state >> 16) & BUILTIN_RNG_MAX;
00283 }
00284 
00286 
00287 inline static uint_fast32_t
00288 builtin_rng_large(builtin_rng_state_t *state)
00289 {
00290     uint_fast32_t value = builtin_rng(state) << BUILTIN_RNG_BITS;
00291     value |= builtin_rng(state);
00292     return value;
00293 }
00294 
00296 inline static unsigned
00297 builtin_rng_n(builtin_rng_state_t *state, unsigned n)
00298 {
00299     return (unsigned)((double)n * (builtin_rng_large(state) / (BUILTIN_RNG_LARGE_MAX + 1.0)));
00300 }
00301 
00305 
00307 
00308 inline static unsigned
00309 myrand_n(unsigned n)
00310 {
00311     return (unsigned)((double)n * (rand() / (RAND_MAX + 1.0)));
00312 }
00313 
00315 inline static void
00316 myswap(unsigned i, unsigned j, unsigned * restrict table)
00317 {
00318     unsigned tmp = table[j];
00319     table[j] = table[i];
00320     table[i] = tmp;
00321 }
00322 
00324 inline static void
00325 identity_permutation(unsigned n, unsigned * restrict table)
00326 {
00327     for (unsigned i = 0; i < n; i++) {
00328         table[i] = i;
00329     }
00330 }
00331 
00333 
00334 inline static void
00335 rand_permutation(unsigned n, unsigned * restrict table)
00336 {
00337     identity_permutation(n, table);
00338     for (unsigned i = 0; i < n; i++) {
00339         unsigned j = myrand_n(n - i) + i;
00340         myswap(i, j, table);
00341     }
00342 }
00343 
00345 
00346 inline static void
00347 builtin_rng_permutation(builtin_rng_state_t * restrict state, unsigned n, unsigned * restrict table)
00348 {
00349     identity_permutation(n, table);
00350     for (unsigned i = 0; i < n; i++) {
00351         unsigned j = builtin_rng_n(state, n - i) + i;
00352         myswap(i, j, table);
00353     }
00354 }
00355 
00359 
00361 
00362 inline static unsigned
00363 naive_bitcount(unsigned w)
00364 {
00365     unsigned c = 0;
00366     while (w != 0) {
00367         c += (w & 1U);
00368         w >>= 1;
00369     }
00370     return c;
00371 }
00372 
00374 static bitcount_t bitcounts[1U << BITCOUNT_BITS];
00375 
00377 static void
00378 init_bitcount(void)
00379 {
00380     for (unsigned i = 0; i < (1U << BITCOUNT_BITS); i++) {
00381         bitcounts[i] = naive_bitcount(i);
00382     }
00383 }
00384 
00386 inline static unsigned
00387 bitcount(word_t w)
00388 {
00389     return
00390 #if BITCOUNT_BITS == 8
00391         bitcounts[w & 0xFFU]
00392         + bitcounts[(w >> 8) & 0xFFU]
00393         + bitcounts[(w >> 16) & 0xFFU]
00394         + bitcounts[(w >> 24) & 0xFFU]
00395 #  if WORD_BITS == 64
00396         + bitcounts[(w >> 32) & 0xFFU]
00397         + bitcounts[(w >> 40) & 0xFFU]
00398         + bitcounts[(w >> 48) & 0xFFU]
00399         + bitcounts[(w >> 56) & 0xFFU]
00400 #  endif
00401 #elif BITCOUNT_BITS == 11
00402         bitcounts[w & 0x7FFU]
00403         + bitcounts[(w >> 11) & 0x7FFU]
00404         + bitcounts[(w >> 22) & 0x7FFU]
00405 #  if WORD_BITS == 64
00406         + bitcounts[(w >> 33) & 0x7FFU]
00407         + bitcounts[(w >> 44) & 0x7FFU]
00408         + bitcounts[(w >> 55) & 0x7FFU]
00409 #  endif
00410 #elif BITCOUNT_BITS == 16
00411         bitcounts[w & 0xFFFFU]
00412         + bitcounts[(w >> 16) & 0xFFFFU]
00413 #  if WORD_BITS == 64
00414         + bitcounts[(w >> 32) & 0xFFFFU]
00415         + bitcounts[(w >> 48) & 0xFFFFU]
00416 #  endif
00417 #else
00418 #  error "Unsupported BITCOUNT_BITS"
00419 #endif
00420         ;
00421 }
00422 
00424 typedef struct {
00426     word_t at_least_1;
00428     word_t at_least_2;
00429 } zom_t;
00430 
00432 inline static zom_t zom_add(zom_t x, zom_t y)
00433 {
00434     zom_t z;
00435     z.at_least_1 = x.at_least_1 | y.at_least_1;
00436     z.at_least_2 = x.at_least_2 | y.at_least_2 | (x.at_least_1 & y.at_least_1);
00437     return z;
00438 }
00439 
00441 inline static word_t zom_exactly_1(zom_t x)
00442 {
00443     return x.at_least_1 & ~x.at_least_2;
00444 }
00445 
00447 inline static void
00448 word_clear(word_t * restrict array, size_t n)
00449 {
00450     for (unsigned i = 0; i < n; i++) {
00451         array[i] = 0;
00452     }
00453 }
00454 
00456 inline static void
00457 zom_clear(zom_t * restrict array, size_t n)
00458 {
00459     for (unsigned i = 0; i < n; i++) {
00460         array[i].at_least_1 = 0;
00461         array[i].at_least_2 = 0;
00462     }
00463 }
00464 
00469 
00471 static void
00472 usage(const char *name)
00473 {
00474     printf(
00475         "Type and hapax accumulation curves\n"
00476         "\n"
00477         "usage:\n"
00478         "  %s [OPTION]... ITERATIONS SLOTS\n"
00479         "  %s [OPTION]... ITERATIONS --slot-size SLOT_SIZE\n"
00480         "\n"
00481         "  --help  This help.\n"
00482         "  --version  Version and copyright information.\n"
00483         "  --slot-size  Spesify the size of a slot instead of the number of the slots.\n"
00484         "  --hapax  Count hapaxes only.\n"
00485         "  --items-from-table  Item counts from the incidence matrix.\n"
00486         "  --brief  Skip redundant identical rows in output.\n"
00487         "  --raw-data  Output data for each permutation.\n"
00488         "  --nonrandom  For debugging: use the identity permutation.\n"
00489         "  --builtin-rng  For debugging: use the built-in pseudorandom number generator.\n"
00490         "\n"
00491         "  ITERATIONS  The number of iterations.\n"
00492         "  SLOTS  The number of slots.\n"
00493         "  SLOT_SIZE  The size of each slot.\n"
00494         "\n"
00495         "Input format (fields separated by any whitespace):\n"
00496         "\n"
00497         "  <number-of-samples> <number-of-types>\n"
00498         "  <sample-1-label> <sample-1-total-items> ...\n"
00499         "  <sample-m-label> <sample-m-total-items>\n"
00500         "  <type-1-label> ...\n"
00501         "  <type-n-label>\n"
00502         "  <count-sample-1-type-1> ... <count-sample-1-type-n> ...\n"
00503         "  <count-sample-m-type-1> ... <count-sample-m-type-n> ...\n"
00504         "\n"
00505         "Output format (one line for each slot):\n"
00506         "\n"
00507         "  <item-count> <lower-bounds> ... <upper-bounds> ...\n"
00508         "  ...\n"
00509         "\n"
00510         "Lower and upper bounds are printed for the following quantiles:\n"
00511         "\n"
00512         "  " STRINGIFY((LEVELS_LIST)) "\n"
00513         "\n"
00514         "Output format for --raw-data:\n"
00515         "\n"
00516         "  <item-count> <value>\n"
00517         "  ...\n"
00518         "\n",
00519         name, name);
00520 
00521     exit(EXIT_SUCCESS);
00522 }
00523 
00525 static void
00526 version(void)
00527 {
00528     printf(
00529         "Type and hapax accumulation curves, version " VERSION "\n"
00530         "Copyright (C) " YEAR "  Jukka Suomela\n"
00531         "\n"
00532         "This program comes with ABSOLUTELY NO WARRANTY.\n"
00533         "This is free software, and you are welcome to redistribute it\n"
00534         "under the terms of the GNU General Public License\n"
00535         "<http://www.gnu.org/licenses/gpl.html>.\n\n"
00536     );
00537     exit(EXIT_SUCCESS);
00538 }
00539 
00541 typedef struct {
00543 
00544     word_t * restrict incidenceb;
00546 
00547     zom_t * restrict incidencezom;
00549 
00550     unsigned iterations;
00552 
00553     unsigned slots;
00555 
00556     unsigned requested_slot_size;
00558 
00559     unsigned nsample;
00561 
00562     unsigned ntype;
00564 
00565     unsigned type_words;
00567 
00568     unsigned total_items;
00570 
00571     unsigned *sample_items;
00573     bool slot_size;
00575     bool items_from_table;
00577 
00578     bool hapax;
00580 
00581     bool brief;
00583 
00584     bool raw_data;
00586     bool nonrandom;
00588     bool builtin_rng;
00589 } input_t;
00590 
00592 
00593 static void
00594 parse_command_line(input_t * restrict pinput, int argc, char **argv)
00595 {
00596     pinput->iterations = 0;
00597     pinput->slots = 0;
00598     pinput->requested_slot_size = 0;
00599     pinput->hapax = false;
00600     pinput->slot_size = false;
00601     pinput->items_from_table = false;
00602     pinput->brief = false;
00603     pinput->raw_data = false;
00604     pinput->nonrandom = false;
00605     pinput->builtin_rng = false;
00606 
00607     for (int i = 1; i < argc; i++) {
00608         if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
00609             usage(argv[0]);
00610         } else if (strcmp(argv[i], "--version") == 0 || strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "-V") == 0) {
00611             version();
00612         } else if (strcmp(argv[i], "--hapax") == 0) {
00613             pinput->hapax = true;
00614         } else if (strcmp(argv[i], "--slot-size") == 0) {
00615             pinput->slot_size = true;
00616         } else if (strcmp(argv[i], "--items-from-table") == 0) {
00617             pinput->items_from_table = true;
00618         } else if (strcmp(argv[i], "--raw-data") == 0) {
00619             pinput->raw_data = true;
00620         } else if (strcmp(argv[i], "--brief") == 0) {
00621             pinput->brief = true;
00622         } else if (strcmp(argv[i], "--nonrandom") == 0) {
00623             pinput->nonrandom = true;
00624         } else if (strcmp(argv[i], "--builtin-rng") == 0) {
00625             pinput->builtin_rng = true;
00626         } else if (pinput->iterations == 0) {
00627             pinput->iterations = get_uint(argv[i]);
00628             if (pinput->iterations == 0) {
00629                 fprintf(stderr, "invalid arguments: the number of iterations cannot be 0\n");
00630                 exit(EXIT_FAILURE);
00631             }
00632         } else if (!pinput->raw_data && !pinput->slot_size && pinput->slots == 0) {
00633             pinput->slots = get_uint(argv[i]);
00634             if (pinput->slots < 2) {
00635                 fprintf(stderr, "invalid arguments: the number of slots has to be at least 2\n");
00636                 exit(EXIT_FAILURE);
00637             }
00638         } else if (!pinput->raw_data && pinput->slot_size && pinput->requested_slot_size == 0) {
00639             pinput->requested_slot_size = get_uint(argv[i]);
00640             if (pinput->requested_slot_size == 0) {
00641                 fprintf(stderr, "invalid arguments: the slot size cannot be 0\n");
00642                 exit(EXIT_FAILURE);
00643             }
00644         } else {
00645             fprintf(stderr, "too many command line arguments: '%s'\n", argv[i]);
00646             exit(EXIT_FAILURE);
00647         }
00648     }
00649 
00650     if (pinput->iterations == 0) {
00651         fprintf(stderr, "invalid arguments: the number of iterations not given\n");
00652         exit(EXIT_FAILURE);
00653     }
00654     if (!pinput->raw_data) {
00655         if (pinput->slot_size) {
00656             if (pinput->requested_slot_size == 0) {
00657                 fprintf(stderr, "invalid arguments: the slot size not given\n");
00658                 exit(EXIT_FAILURE);
00659             }
00660         } else {
00661             if (pinput->slots == 0) {
00662                 fprintf(stderr, "invalid arguments: the number of iterations not given\n");
00663                 exit(EXIT_FAILURE);
00664             }
00665         }
00666     }
00667 }
00668 
00670 
00671 static void
00672 process_input(input_t * restrict pinput)
00673 {
00674     myfscanf(2, "<number-of-samples> <number-of-types>", stdin, "%u %u", &pinput->nsample, &pinput->ntype);
00675     if (pinput->nsample == 0) {
00676         fprintf(stderr, "invalid input: the number of samples has to be at least 1\n");
00677         exit(EXIT_FAILURE);
00678     }
00679     if (pinput->ntype == 0) {
00680         fprintf(stderr, "invalid input: the number of types has to be at least 1\n");
00681         exit(EXIT_FAILURE);
00682     }
00683 
00684     pinput->type_words = (pinput->ntype + WORD_BITS - 1) / WORD_BITS;
00685     pinput->total_items = 0;
00686     MYMALLOC(pinput->sample_items, unsigned, pinput->nsample);
00687     for (unsigned i = 0; i < pinput->nsample; i++) {
00688         unsigned v;
00689         myfscanf(1, "<sample-label> <sample-total-items>", stdin, "%*s %u", &v);
00690         if (pinput->items_from_table) {
00691             pinput->sample_items[i] = 0;
00692         } else {
00693             if (v == 0) {
00694                 fprintf(stderr, "invalid input in sample %u: the number of items has to be at least 1\n", i);
00695                 exit(EXIT_FAILURE);
00696             }
00697             pinput->sample_items[i] = v;
00698             pinput->total_items += pinput->sample_items[i];
00699         }
00700     }
00701     for (unsigned i = 0; i < pinput->ntype; i++) {
00702         myfscanf(0, "<type-label>", stdin, "%*s");
00703     }
00704     if (pinput->hapax) {
00705         const zom_t zero = { 0, 0 };
00706         MYMALLOC2Z(pinput->incidencezom, zom_t, pinput->nsample, pinput->type_words, zero);
00707     } else {
00708         MYMALLOC2Z(pinput->incidenceb, word_t, pinput->nsample, pinput->type_words, 0);
00709     }
00710     for (unsigned i = 0; i < pinput->nsample; i++) {
00711         for (unsigned j = 0; j < pinput->ntype; j++) {
00712             unsigned v;
00713             myfscanf(1, "<count-sample-type>", stdin, "%u", &v);
00714             if (v > 0) {
00715                 size_t pos = (size_t)i * pinput->type_words + get_msb_index(j);
00716                 word_t mask = get_lsb_bit(j);
00717                 if (pinput->hapax) {
00718                     pinput->incidencezom[pos].at_least_1 |= mask;
00719                     if (v > 1) {
00720                         pinput->incidencezom[pos].at_least_2 |= mask;
00721                     }
00722                 } else {
00723                     pinput->incidenceb[pos] |= mask;
00724                 }
00725             }
00726             if (pinput->items_from_table) {
00727                 pinput->sample_items[i] += v;
00728                 pinput->total_items += v;
00729             }
00730         }
00731     }
00732     myfscanf(EOF, "end of input", stdin, "%*s");
00733 
00734     if (pinput->total_items == 0) {
00735         fprintf(stderr, "invalid input: the total number of items has to be at least 1\n");
00736         exit(EXIT_FAILURE);
00737     }
00738 }
00739 
00741 static void
00742 free_input(const input_t *pinput)
00743 {
00744     free(pinput->sample_items);
00745     if (pinput->hapax) {
00746         free(pinput->incidencezom);
00747     } else {
00748         free(pinput->incidenceb);
00749     }
00750 }
00751 
00755 
00757 typedef struct {
00759     unsigned * restrict slot_threshold;
00761 
00762     unsigned * restrict lower_bound;
00764 
00765     unsigned * restrict upper_bound;
00766 } output_t;
00767 
00769 typedef struct {
00771     unsigned lower;
00773     unsigned upper;
00774 } bounds_t;
00775 
00777 
00778 static void
00779 prepare_slots(input_t * restrict pinput, output_t * restrict poutput)
00780 {
00781     if (pinput->slot_size) {
00782         assert(pinput->slots == 0);
00783         pinput->slots = (pinput->total_items + pinput->requested_slot_size - 1) / pinput->requested_slot_size + 1;
00784     }
00785     MYMALLOC(poutput->slot_threshold, unsigned, pinput->slots);
00786     if (pinput->slot_size) {
00787         for (unsigned i = 0; i < pinput->slots - 1; i++) {
00788             poutput->slot_threshold[i] = i * pinput->requested_slot_size;
00789         }
00790     } else {
00791         for (unsigned i = 0; i < pinput->slots; i++) {
00792             poutput->slot_threshold[i] = (unsigned)lround((double)pinput->total_items * (double)i / (double)(pinput->slots - 1));
00793         }
00794     }
00795     poutput->slot_threshold[0] = 0;
00796     poutput->slot_threshold[pinput->slots - 1] = pinput->total_items;
00797 }
00798 
00800 inline static void
00801 next_permutation(builtin_rng_state_t * restrict rng_state, const input_t *pinput, unsigned * restrict sample_order)
00802 {
00803     if (pinput->nonrandom) {
00804         identity_permutation(pinput->nsample, sample_order);
00805     } else if (pinput->builtin_rng) {
00806         builtin_rng_permutation(rng_state, pinput->nsample, sample_order);
00807     } else {
00808         rand_permutation(pinput->nsample, sample_order);
00809     }
00810 }
00811 
00813 inline static void
00814 calculate_bounds_normal(const input_t *pinput, unsigned sample, word_t * restrict accum, unsigned * restrict p_accum_types, bounds_t * restrict pb_sample)
00815 {
00816     const word_t * restrict vector = pinput->incidenceb + (size_t)sample * pinput->type_words;
00817 
00818     unsigned types = 0;
00819     for (unsigned j = 0; j < pinput->type_words; j++) {
00820         accum[j] |= vector[j];
00821         types += bitcount(accum[j]);
00822     }
00823 
00824     assert(*p_accum_types <= types);
00825 
00826     pb_sample->lower = *p_accum_types;
00827     pb_sample->upper = types;
00828     *p_accum_types = types;
00829 }
00830 
00832 inline static void
00833 calculate_bounds_hapax(const input_t *pinput, unsigned sample, zom_t * restrict accum, unsigned * restrict p_accum_hapaxes, bounds_t * restrict pb_sample)
00834 {
00835     const zom_t * restrict vector = pinput->incidencezom + (size_t)sample * pinput->type_words;
00836 
00837     unsigned removed = 0;
00838     unsigned created = 0;
00839     unsigned temporary = 0;
00840     for (unsigned j = 0; j < pinput->type_words; j++) {
00841         const zom_t current = vector[j];
00842         const zom_t accum_old = accum[j];
00843         const zom_t accum_new = zom_add(accum_old, current);
00844         accum[j] = accum_new;
00845 
00846         const word_t hapax_old = zom_exactly_1(accum_old);
00847         const word_t hapax_new = zom_exactly_1(accum_new);
00848         const word_t hapax_temporary = ~accum_old.at_least_1 & current.at_least_2;
00849         assert((hapax_old & hapax_temporary) == 0);
00850         assert((hapax_new & hapax_temporary) == 0);
00851         temporary += bitcount(hapax_temporary);
00852         removed += bitcount(hapax_old & ~hapax_new);
00853         created += bitcount(~hapax_old & hapax_new);
00854     }
00855 
00856     assert(*p_accum_hapaxes >= removed);
00857     assert(*p_accum_hapaxes + created + temporary <= pinput->ntype);
00858 
00859     pb_sample->lower = *p_accum_hapaxes - removed;
00860     pb_sample->upper = *p_accum_hapaxes + created + temporary;
00861     *p_accum_hapaxes = *p_accum_hapaxes + created - removed;
00862 }
00863 
00865 
00866 inline static void
00867 update_bounds_normal(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
00868 {
00869     // The number of accumulated types is nondecreasing.
00870     assert(pb_this_gap->lower <= pb_sample->lower);
00871     assert(pb_sample->lower <= pb_this_gap->upper);
00872     assert(pb_this_gap->upper <= pb_sample->upper);
00873     pb_this_gap->upper = pb_sample->upper;
00874 }
00875 
00877 
00878 inline static void
00879 update_bounds_hapax(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
00880 {
00881     pb_this_gap->lower = MIN(pb_sample->lower, pb_this_gap->lower);
00882     pb_this_gap->upper = MAX(pb_sample->upper, pb_this_gap->upper);
00883 }
00884 
00886 inline static void
00887 record(unsigned slots, const output_t *poutput, bounds_t b_slot, unsigned current_slot)
00888 {
00889     poutput->lower_bound[(size_t)b_slot.lower * slots + current_slot]++;
00890     poutput->upper_bound[(size_t)b_slot.upper * slots + current_slot]++;
00891 }
00892 
00894 
00895 inline static void
00896 record_normal(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
00897 {
00898     // The number of accumulated types is nondecreasing.
00899     assert(b_prev_gap.lower <= b_this_gap.lower);
00900     assert(b_prev_gap.upper <= b_this_gap.upper);
00901     const bounds_t b_slot = {
00902         b_prev_gap.lower,
00903         b_this_gap.upper
00904     };
00905     record(slots, poutput, b_slot, current_slot);
00906 }
00907 
00909 
00910 inline static void
00911 record_hapax(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
00912 {
00913     // Here we need to compute the minimums and maximums.
00914     const bounds_t b_slot = {
00915         MIN(b_prev_gap.lower, b_this_gap.lower),
00916         MAX(b_prev_gap.upper, b_this_gap.upper)
00917     };
00918     record(slots, poutput, b_slot, current_slot);
00919 }
00920 
00922 #define DEF_CALCULATE_STATISTICS(suffix, representation) \
00923 static void \
00924 calculate_statistics_ ## suffix(const input_t * restrict pinput, const output_t *poutput) \
00925 { \
00926     builtin_rng_state_t rng_state; \
00927     builtin_rng_init(&rng_state); \
00928     \
00929     for (unsigned iteration = 0; iteration < pinput->iterations; iteration++) { \
00930         unsigned sample_order[pinput->nsample]; \
00931         next_permutation(&rng_state, pinput, sample_order); \
00932         \
00933         representation ## _t accum[pinput->type_words]; \
00934         representation ## _clear(accum, pinput->type_words); \
00935         \
00936         unsigned accum_items = 0; \
00937         unsigned accum_value = 0; \
00938         unsigned current_slot = 0; \
00939         \
00940         bounds_t b_prev_gap = { 0, 0 }; \
00941         bounds_t b_this_gap = { 0, 0 }; \
00942         \
00943         for (unsigned i = 0; i < pinput->nsample; i++) { \
00944             const unsigned sample = sample_order[i]; \
00945             accum_items += pinput->sample_items[sample]; \
00946             \
00947             bounds_t b_sample; \
00948             calculate_bounds_ ## suffix(pinput, sample, accum, &accum_value, &b_sample); \
00949             update_bounds_ ## suffix(&b_this_gap, &b_sample); \
00950             \
00951             while (current_slot + 1 < pinput->slots && accum_items >= poutput->slot_threshold[current_slot + 1]) { \
00952                 record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
00953                 current_slot++; \
00954                 b_prev_gap = b_this_gap; \
00955                 b_this_gap = b_sample; \
00956             } \
00957             if (accum_items == poutput->slot_threshold[current_slot]) { \
00958                 b_this_gap.lower = accum_value; \
00959                 b_this_gap.upper = accum_value; \
00960             } \
00961         } \
00962         \
00963         assert(current_slot == pinput->slots - 1); \
00964         record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
00965     } \
00966 }
00967 
00969 DEF_CALCULATE_STATISTICS(normal, word)
00970 
00971 
00972 DEF_CALCULATE_STATISTICS(hapax, zom)
00973 
00975 static void
00976 calculate_statistics(const input_t *pinput, output_t * restrict poutput)
00977 {
00978     MYMALLOC2Z(poutput->lower_bound, unsigned, pinput->ntype+1, pinput->slots, 0);
00979     MYMALLOC2Z(poutput->upper_bound, unsigned, pinput->ntype+1, pinput->slots, 0);
00980 
00981     if (pinput->hapax) {
00982         calculate_statistics_hapax(pinput, poutput);
00983     } else {
00984         calculate_statistics_normal(pinput, poutput);
00985     }
00986 }
00987 
00989 static void
00990 free_output(const output_t *poutput)
00991 {
00992     free(poutput->slot_threshold);
00993     free(poutput->lower_bound);
00994     free(poutput->upper_bound);
00995 }
00996 
01000 
01002 static void
01003 print_raw_data(unsigned accum_items, unsigned accum_value)
01004 {
01005     printf("%8u %4u\n", accum_items, accum_value);
01006 }
01007 
01009 #define DEF_CALCULATE_RAW_DATA(suffix, representation) \
01010 static void \
01011 calculate_raw_data_ ## suffix(const input_t *pinput) \
01012 { \
01013     builtin_rng_state_t rng_state; \
01014     builtin_rng_init(&rng_state); \
01015     \
01016     for (unsigned iteration = 0; iteration < pinput->iterations; iteration++) { \
01017         unsigned sample_order[pinput->nsample]; \
01018         next_permutation(&rng_state, pinput, sample_order); \
01019         \
01020         representation ## _t accum[pinput->type_words]; \
01021         representation ## _clear(accum, pinput->type_words); \
01022         \
01023         unsigned accum_items = 0; \
01024         unsigned accum_value = 0; \
01025         \
01026         print_raw_data(accum_items, accum_value); \
01027         for (unsigned i = 0; i < pinput->nsample; i++) { \
01028             const unsigned sample = sample_order[i]; \
01029             accum_items += pinput->sample_items[sample]; \
01030             \
01031             bounds_t b_sample; \
01032             calculate_bounds_ ## suffix(pinput, sample, accum, &accum_value, &b_sample); \
01033             \
01034             print_raw_data(accum_items, accum_value); \
01035         } \
01036     } \
01037 }
01038 
01040 DEF_CALCULATE_RAW_DATA(normal, word)
01041 
01042 
01043 DEF_CALCULATE_RAW_DATA(hapax, zom)
01044 
01046 static void
01047 calculate_raw_data(const input_t *pinput)
01048 {
01049     if (pinput->hapax) {
01050         calculate_raw_data_hapax(pinput);
01051     } else {
01052         calculate_raw_data_normal(pinput);
01053     }
01054 }
01055 
01059 
01061 const double LEVELS[] = { LEVELS_LIST };
01062 
01064 #define NLEVELS (sizeof LEVELS / sizeof LEVELS[0])
01065 
01067 static void
01068 print_row(unsigned slot_threshold, const unsigned *lower_bounds, const unsigned *upper_bounds)
01069 {
01070     printf("%8u", slot_threshold);
01071     for (unsigned l = 0; l < NLEVELS; l++) {
01072         printf(" %4u", lower_bounds[l]);
01073     }
01074     for (unsigned l = 0; l < NLEVELS; l++) {
01075         printf(" %4u", upper_bounds[NLEVELS - 1 - l]);
01076     }
01077     printf("\n");
01078 }
01079 
01081 static void
01082 print_result(const input_t *pinput, const output_t *poutput)
01083 {
01084     unsigned prev_lower_bounds[NLEVELS];
01085     unsigned prev_upper_bounds[NLEVELS];
01086     bool prev_valid = false;
01087     bool prev_delayed = false;
01088 
01089     for (unsigned i = 0; i < pinput->slots; i++) {
01090         unsigned lower_bounds[NLEVELS];
01091         unsigned cum = 0;
01092         unsigned next_level = 0;
01093         for (unsigned j = 0; j <= pinput->ntype; j++) {
01094             cum += poutput->lower_bound[(size_t)j * pinput->slots + i];
01095             double fract = (double)cum / (double)pinput->iterations;
01096             while (next_level < NLEVELS && LEVELS[next_level] < fract) {
01097                 lower_bounds[next_level] = j;
01098                 next_level++;
01099             }
01100         }
01101         assert(next_level == NLEVELS);
01102 
01103         unsigned upper_bounds[NLEVELS];
01104         cum = 0;
01105         next_level = 0;
01106         for (unsigned j = 0; j <= pinput->ntype; j++) {
01107             cum += poutput->upper_bound[(size_t)(pinput->ntype - j) * pinput->slots + i];
01108             double fract = (double)cum / (double)pinput->iterations;
01109             while (next_level < NLEVELS && LEVELS[next_level] < fract) {
01110                 upper_bounds[next_level] = pinput->ntype - j;
01111                 next_level++;
01112             }
01113         }
01114         assert(next_level == NLEVELS);
01115 
01116         if (pinput->brief) {
01117             if (prev_valid) {
01118                 bool same = true;
01119                 for (unsigned l = 0; l < NLEVELS; l++) {
01120                     if (prev_lower_bounds[l] != lower_bounds[l] || prev_upper_bounds[l] != upper_bounds[l]) {
01121                         same = false;
01122                         break;
01123                     }
01124                 }
01125                 if (same) {
01126                     prev_delayed = true;
01127                 } else if (prev_delayed) {
01128                     print_row(poutput->slot_threshold[i - 1], prev_lower_bounds, prev_upper_bounds);
01129                     prev_valid = false;
01130                 } else {
01131                     prev_valid = false;
01132                 }
01133             }
01134             if (!prev_valid) {
01135                 print_row(poutput->slot_threshold[i], lower_bounds, upper_bounds);
01136                 for (unsigned l = 0; l < NLEVELS; l++) {
01137                     prev_lower_bounds[l] = lower_bounds[l];
01138                     prev_upper_bounds[l] = upper_bounds[l];
01139                 }
01140                 prev_valid = true;
01141                 prev_delayed = false;
01142             }
01143         } else {
01144             print_row(poutput->slot_threshold[i], lower_bounds, upper_bounds);
01145         }
01146     }
01147 
01148     if (pinput->brief && prev_valid && prev_delayed) {
01149         print_row(poutput->slot_threshold[pinput->slots - 1], prev_lower_bounds, prev_upper_bounds);
01150     }
01151 }
01152 
01156 
01158 int
01159 main(int argc, char **argv)
01160 {
01161     init_bitcount();
01162 
01163     input_t input;
01164     parse_command_line(&input, argc, argv);
01165     process_input(&input);
01166 
01167     if (input.raw_data) {
01168         calculate_raw_data(&input);
01169     } else {
01170         output_t output;
01171         prepare_slots(&input, &output);
01172         calculate_statistics(&input, &output);
01173         print_result(&input, &output);
01174         free_output(&output);
01175     }
01176     free_input(&input);
01177     return EXIT_SUCCESS;
01178 }
01179