1 files changed, 345 insertions, 0 deletions
diff --git a/huffman.c b/huffman.c
new file mode 100644
index 0000000..3fada10
--- /dev/null
+++ b/huffman.c
@@ -0,0 +1,345 @@
+/*
+ * huffman.c: implementation of huffman.h.
+ */
+
+#include <assert.h>
+
+#include "halibut.h"
+#include "huffman.h"
+
+static const unsigned fibonacci[] = {
+    0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
+    2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
+    317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465,
+    14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
+    433494437, 701408733, 1134903170, 1836311903, 2971215073,
+};
+
+/*
+ * Binary heap functions used by buildhuf(). Each one assumes the
+ * heap to be stored in an array of ints, with two ints per node
+ * (user data and key). They take in the old heap length, and
+ * return the new one.
+ */
+#define HEAPPARENT(x) (((x)-2)/4*2)
+#define HEAPLEFT(x) ((x)*2+2)
+#define HEAPRIGHT(x) ((x)*2+4)
+static int addheap(int *heap, int len, int userdata, int key)
+{
+    int me, dad, tmp;
+
+    me = len;
+    heap[len++] = userdata;
+    heap[len++] = key;
+
+    while (me > 0) {
+	dad = HEAPPARENT(me);
+	if (heap[me+1] < heap[dad+1]) {
+	    tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp;
+	    tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp;
+	    me = dad;
+	} else
+	    break;
+    }
+
+    return len;
+}
+static int rmheap(int *heap, int len, int *userdata, int *key)
+{
+    int me, lc, rc, c, tmp;
+
+    len -= 2;
+    *userdata = heap[0];
+    *key = heap[1];
+    heap[0] = heap[len];
+    heap[1] = heap[len+1];
+
+    me = 0;
+
+    while (1) {
+	lc = HEAPLEFT(me);
+	rc = HEAPRIGHT(me);
+	if (lc >= len)
+	    break;
+	else if (rc >= len || heap[lc+1] < heap[rc+1])
+	    c = lc;
+	else
+	    c = rc;
+	if (heap[me+1] > heap[c+1]) {
+	    tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp;
+	    tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp;
+	} else
+	    break;
+	me = c;
+    }
+
+    return len;
+}
+
+struct hufscratch {
+    int *parent, *length, *heap;
+};
+
+/*
+ * The core of the Huffman algorithm: takes an input array of
+ * symbol frequencies, and produces an output array of code
+ * lengths.
+ *
+ * We cap the output code lengths to fit in an unsigned char (which is
+ * safe since our clients will impose some smaller limit on code
+ * length anyway). So if you see 255 in the output, it means '255 or
+ * more' and is a sign that whatever limit you really wanted has
+ * certainly been overflowed.
+ */
+static void buildhuf(struct hufscratch *sc, const int *freqs,
+                     unsigned char *lengths, int nsyms)
+{
+    int heapsize;
+    int i, j, n;
+    int si, sj;
+
+    for (i = 0; i < nsyms; i++)
+        sc->parent[i] = 0;
+
+    /*
+     * Begin by building the heap.
+     */
+    heapsize = 0;
+    for (i = 0; i < nsyms; i++)
+	if (freqs[i] > 0)	       /* leave unused symbols out totally */
+	    heapsize = addheap(sc->heap, heapsize, i, freqs[i]);
+
+    /*
+     * Now repeatedly take two elements off the heap and merge
+     * them.
+     */
+    n = nsyms;
+    while (heapsize > 2) {
+	heapsize = rmheap(sc->heap, heapsize, &i, &si);
+	heapsize = rmheap(sc->heap, heapsize, &j, &sj);
+	sc->parent[i] = n;
+	sc->parent[j] = n;
+	heapsize = addheap(sc->heap, heapsize, n, si + sj);
+	n++;
+    }
+
+    /*
+     * Now we have our tree, in the form of a link from each node
+     * to the index of its parent. Count back down the tree to
+     * determine the code lengths.
+     */
+    for (i = 0; i < 2*nsyms+1; i++)
+        sc->length[i] = 0;
+    /* The tree root has length 0 after that, which is correct. */
+    for (i = n-1; i-- ;)
+	if (sc->parent[i] > 0)
+	    sc->length[i] = 1 + sc->length[sc->parent[i]];
+
+    /*
+     * And that's it. (Simple, wasn't it?) Copy the lengths into
+     * the output array and leave.
+     * 
+     * Here we cap lengths to fit in unsigned char.
+     */
+    for (i = 0; i < nsyms; i++)
+	lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]);
+}
+
+/*
+ * Wrapper around buildhuf() which enforces the restriction on code
+ * length.
+ */
+void build_huffman_tree(int *freqs, unsigned char *lengths,
+                        int nsyms, int limit)
+{
+    struct hufscratch hsc, *sc = &hsc;
+    int smallestfreq, totalfreq, nactivesyms;
+    int num, denom, adjust;
+    int i;
+    int maxprob;
+
+    sc->parent = snewn(2*nsyms+1, int);
+    sc->length = snewn(2*nsyms+1, int);
+    sc->heap = snewn(2*nsyms, int);
+
+    /*
+     * Nasty special case: if the frequency table has fewer than
+     * two non-zero elements, we must invent some, because we can't
+     * have fewer than one bit encoding a symbol.
+     */
+    assert(nsyms >= 2);
+    {
+	int count = 0;
+	for (i = 0; i < nsyms; i++)
+	    if (freqs[i] > 0)
+		count++;
+	if (count < 2) {
+	    for (i = 0; i < nsyms && count > 0; i++)
+		if (freqs[i] == 0) {
+		    freqs[i] = 1;
+		    count--;
+		}
+	}
+    }
+
+    /*
+     * First, try building the Huffman table the normal way. If
+     * this works, it's optimal, so we don't want to mess with it.
+     */
+    buildhuf(sc, freqs, lengths, nsyms);
+
+    for (i = 0; i < nsyms; i++)
+	if (lengths[i] > limit)
+	    break;
+
+    if (i == nsyms)
+        goto cleanup;                  /* OK */
+
+    /*
+     * The Huffman algorithm can only ever generate a code length
+     * of N bits or more if there is a symbol whose probability is
+     * less than the reciprocal of the (N+2)th Fibonacci number
+     * (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or
+     * 1/55 for N=8. (This is a necessary though not sufficient
+     * condition.)
+     *
+     * Why is this? Well, consider the input symbol with the
+     * smallest probability. Let that probability be x. In order
+     * for this symbol to have a code length of at least 1, the
+     * Huffman algorithm will have to merge it with some other
+     * node; and since x is the smallest probability, the node it
+     * gets merged with must be at least x. Thus, the probability
+     * of the resulting combined node will be at least 2x. Now in
+     * order for our node to reach depth 2, this 2x-node must be
+     * merged again. But what with? We can't assume the node it
+     * merges with is at least 2x, because this one might only be
+     * the _second_ smallest remaining node. But we do know the
+     * node it merges with must be at least x, so our order-2
+     * internal node is at least 3x.
+     *
+     * How small a node can merge with _that_ to get an order-3
+     * internal node? Well, it must be at least 2x, because if it
+     * was smaller than that then it would have been one of the two
+     * smallest nodes in the previous step and been merged at that
+     * point. So at least 3x, plus at least 2x, comes to at least
+     * 5x for an order-3 node.
+     *
+     * And so it goes on: at every stage we must merge our current
+     * node with a node at least as big as the bigger of this one's
+     * two parents, and from this starting point that gives rise to
+     * the Fibonacci sequence. So we find that in order to have a
+     * node n levels deep (i.e. a maximum code length of n), the
+     * overall probability of the root of the entire tree must be
+     * at least F_{n+2} times the probability of the rarest symbol.
+     * In other words, since the overall probability is 1, it is a
+     * necessary condition for a code length of 16 or more that
+     * there must be at least one symbol with probability <=
+     * 1/F_18.
+     *
+     * (To demonstrate that a probability this big really can give
+     * rise to a code length of 16, consider the set of input
+     * frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,
+     * 89, 144, 233, 377, 610, 987 }, for arbitrarily small
+     * epsilon.)
+     *
+     * So here buildhuf() has returned us an overlong code. So to
+     * ensure it doesn't do it again, we add a constant to all the
+     * (non-zero) symbol frequencies, causing them to become more
+     * balanced and removing the danger. We can then feed the
+     * results back to the standard buildhuf() and be
+     * assert()-level confident that the resulting code lengths
+     * contain nothing outside the permitted range.
+     */
+    assert(limit+3 < (int)lenof(fibonacci));
+    maxprob = fibonacci[limit+3];
+    totalfreq = nactivesyms = 0;
+    smallestfreq = -1;
+    for (i = 0; i < nsyms; i++) {
+	if (freqs[i] == 0)
+	    continue;
+	if (smallestfreq < 0 || smallestfreq > freqs[i])
+	    smallestfreq = freqs[i];
+	totalfreq += freqs[i];
+	nactivesyms++;
+    }
+    assert(smallestfreq <= totalfreq / maxprob);
+
+    /*
+     * We want to find the smallest integer `adjust' such that
+     * (totalfreq + nactivesyms * adjust) / (smallestfreq +
+     * adjust) is less than maxprob. A bit of algebra tells us
+     * that the threshold value is equal to
+     *
+     *   totalfreq - maxprob * smallestfreq
+     *   ----------------------------------
+     *          maxprob - nactivesyms
+     *
+     * rounded up, of course. And we'll only even be trying
+     * this if
+     */
+    num = totalfreq - smallestfreq * maxprob;
+    denom = maxprob - nactivesyms;
+    adjust = (num + denom - 1) / denom;
+
+    /*
+     * Now add `adjust' to all the input symbol frequencies.
+     */
+    for (i = 0; i < nsyms; i++)
+	if (freqs[i] != 0)
+	    freqs[i] += adjust;
+
+    /*
+     * Rebuild the Huffman tree...
+     */
+    buildhuf(sc, freqs, lengths, nsyms);
+
+    /*
+     * ... and this time it ought to be OK.
+     */
+    for (i = 0; i < nsyms; i++)
+	assert(lengths[i] <= limit);
+
+  cleanup:
+    /*
+     * Finally, free our scratch space.
+     */
+    sfree(sc->parent);
+    sfree(sc->length);
+    sfree(sc->heap);
+}
+
+#define MAXCODELEN 31                  /* codes must fit in an int */
+
+int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms)
+{
+    unsigned count[MAXCODELEN], startcode[MAXCODELEN], code;
+    int maxlen, i;
+
+    /* Count the codes of each length. */
+    maxlen = 0;
+    for (i = 1; i < MAXCODELEN; i++)
+	count[i] = 0;
+    for (i = 0; i < nsyms; i++) {
+	count[lengths[i]]++;
+	if (maxlen < lengths[i])
+	    maxlen = lengths[i];
+    }
+
+    /* Determine the starting code for each length block. */
+    code = 0;
+    for (i = 1; i < MAXCODELEN; i++) {
+	startcode[i] = code;
+	code += count[i];
+	if (code > (1U << i))
+	    maxlen = -1;	       /* overcommitted */
+	code <<= 1;
+    }
+    if (code < (1U << MAXCODELEN))
+	maxlen = -2;		       /* undercommitted */
+
+    /* Determine the code for each symbol. */
+    for (i = 0; i < nsyms; i++)
+	codes[i] = startcode[lengths[i]]++;
+
+    return maxlen;
+}