summaryrefslogtreecommitdiff
path: root/huffman.c
diff options
context:
space:
mode:
Diffstat (limited to 'huffman.c')
-rw-r--r--huffman.c345
1 files changed, 345 insertions, 0 deletions
diff --git a/huffman.c b/huffman.c
new file mode 100644
index 0000000..3fada10
--- /dev/null
+++ b/huffman.c
@@ -0,0 +1,345 @@
+/*
+ * huffman.c: implementation of huffman.h.
+ */
+
+#include <assert.h>
+
+#include "halibut.h"
+#include "huffman.h"
+
+static const unsigned fibonacci[] = {
+ 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
+ 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
+ 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465,
+ 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
+ 433494437, 701408733, 1134903170, 1836311903, 2971215073,
+};
+
+/*
+ * Binary heap functions used by buildhuf(). Each one assumes the
+ * heap to be stored in an array of ints, with two ints per node
+ * (user data and key). They take in the old heap length, and
+ * return the new one.
+ */
+#define HEAPPARENT(x) (((x)-2)/4*2)
+#define HEAPLEFT(x) ((x)*2+2)
+#define HEAPRIGHT(x) ((x)*2+4)
+static int addheap(int *heap, int len, int userdata, int key)
+{
+ int me, dad, tmp;
+
+ me = len;
+ heap[len++] = userdata;
+ heap[len++] = key;
+
+ while (me > 0) {
+ dad = HEAPPARENT(me);
+ if (heap[me+1] < heap[dad+1]) {
+ tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp;
+ tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp;
+ me = dad;
+ } else
+ break;
+ }
+
+ return len;
+}
+static int rmheap(int *heap, int len, int *userdata, int *key)
+{
+ int me, lc, rc, c, tmp;
+
+ len -= 2;
+ *userdata = heap[0];
+ *key = heap[1];
+ heap[0] = heap[len];
+ heap[1] = heap[len+1];
+
+ me = 0;
+
+ while (1) {
+ lc = HEAPLEFT(me);
+ rc = HEAPRIGHT(me);
+ if (lc >= len)
+ break;
+ else if (rc >= len || heap[lc+1] < heap[rc+1])
+ c = lc;
+ else
+ c = rc;
+ if (heap[me+1] > heap[c+1]) {
+ tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp;
+ tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp;
+ } else
+ break;
+ me = c;
+ }
+
+ return len;
+}
+
+struct hufscratch {
+ int *parent, *length, *heap;
+};
+
+/*
+ * The core of the Huffman algorithm: takes an input array of
+ * symbol frequencies, and produces an output array of code
+ * lengths.
+ *
+ * We cap the output code lengths to fit in an unsigned char (which is
+ * safe since our clients will impose some smaller limit on code
+ * length anyway). So if you see 255 in the output, it means '255 or
+ * more' and is a sign that whatever limit you really wanted has
+ * certainly been overflowed.
+ */
+static void buildhuf(struct hufscratch *sc, const int *freqs,
+ unsigned char *lengths, int nsyms)
+{
+ int heapsize;
+ int i, j, n;
+ int si, sj;
+
+ for (i = 0; i < nsyms; i++)
+ sc->parent[i] = 0;
+
+ /*
+ * Begin by building the heap.
+ */
+ heapsize = 0;
+ for (i = 0; i < nsyms; i++)
+ if (freqs[i] > 0) /* leave unused symbols out totally */
+ heapsize = addheap(sc->heap, heapsize, i, freqs[i]);
+
+ /*
+ * Now repeatedly take two elements off the heap and merge
+ * them.
+ */
+ n = nsyms;
+ while (heapsize > 2) {
+ heapsize = rmheap(sc->heap, heapsize, &i, &si);
+ heapsize = rmheap(sc->heap, heapsize, &j, &sj);
+ sc->parent[i] = n;
+ sc->parent[j] = n;
+ heapsize = addheap(sc->heap, heapsize, n, si + sj);
+ n++;
+ }
+
+ /*
+ * Now we have our tree, in the form of a link from each node
+ * to the index of its parent. Count back down the tree to
+ * determine the code lengths.
+ */
+ for (i = 0; i < 2*nsyms+1; i++)
+ sc->length[i] = 0;
+ /* The tree root has length 0 after that, which is correct. */
+ for (i = n-1; i-- ;)
+ if (sc->parent[i] > 0)
+ sc->length[i] = 1 + sc->length[sc->parent[i]];
+
+ /*
+ * And that's it. (Simple, wasn't it?) Copy the lengths into
+ * the output array and leave.
+ *
+ * Here we cap lengths to fit in unsigned char.
+ */
+ for (i = 0; i < nsyms; i++)
+ lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]);
+}
+
+/*
+ * Wrapper around buildhuf() which enforces the restriction on code
+ * length.
+ */
+void build_huffman_tree(int *freqs, unsigned char *lengths,
+ int nsyms, int limit)
+{
+ struct hufscratch hsc, *sc = &hsc;
+ int smallestfreq, totalfreq, nactivesyms;
+ int num, denom, adjust;
+ int i;
+ int maxprob;
+
+ sc->parent = snewn(2*nsyms+1, int);
+ sc->length = snewn(2*nsyms+1, int);
+ sc->heap = snewn(2*nsyms, int);
+
+ /*
+ * Nasty special case: if the frequency table has fewer than
+ * two non-zero elements, we must invent some, because we can't
+ * have fewer than one bit encoding a symbol.
+ */
+ assert(nsyms >= 2);
+ {
+ int count = 0;
+ for (i = 0; i < nsyms; i++)
+ if (freqs[i] > 0)
+ count++;
+ if (count < 2) {
+ for (i = 0; i < nsyms && count > 0; i++)
+ if (freqs[i] == 0) {
+ freqs[i] = 1;
+ count--;
+ }
+ }
+ }
+
+ /*
+ * First, try building the Huffman table the normal way. If
+ * this works, it's optimal, so we don't want to mess with it.
+ */
+ buildhuf(sc, freqs, lengths, nsyms);
+
+ for (i = 0; i < nsyms; i++)
+ if (lengths[i] > limit)
+ break;
+
+ if (i == nsyms)
+ goto cleanup; /* OK */
+
+ /*
+ * The Huffman algorithm can only ever generate a code length
+ * of N bits or more if there is a symbol whose probability is
+ * less than the reciprocal of the (N+2)th Fibonacci number
+ * (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or
+ * 1/55 for N=8. (This is a necessary though not sufficient
+ * condition.)
+ *
+ * Why is this? Well, consider the input symbol with the
+ * smallest probability. Let that probability be x. In order
+ * for this symbol to have a code length of at least 1, the
+ * Huffman algorithm will have to merge it with some other
+ * node; and since x is the smallest probability, the node it
+ * gets merged with must be at least x. Thus, the probability
+ * of the resulting combined node will be at least 2x. Now in
+ * order for our node to reach depth 2, this 2x-node must be
+ * merged again. But what with? We can't assume the node it
+ * merges with is at least 2x, because this one might only be
+ * the _second_ smallest remaining node. But we do know the
+ * node it merges with must be at least x, so our order-2
+ * internal node is at least 3x.
+ *
+ * How small a node can merge with _that_ to get an order-3
+ * internal node? Well, it must be at least 2x, because if it
+ * was smaller than that then it would have been one of the two
+ * smallest nodes in the previous step and been merged at that
+ * point. So at least 3x, plus at least 2x, comes to at least
+ * 5x for an order-3 node.
+ *
+ * And so it goes on: at every stage we must merge our current
+ * node with a node at least as big as the bigger of this one's
+ * two parents, and from this starting point that gives rise to
+ * the Fibonacci sequence. So we find that in order to have a
+ * node n levels deep (i.e. a maximum code length of n), the
+ * overall probability of the root of the entire tree must be
+ * at least F_{n+2} times the probability of the rarest symbol.
+ * In other words, since the overall probability is 1, it is a
+ * necessary condition for a code length of 16 or more that
+ * there must be at least one symbol with probability <=
+ * 1/F_18.
+ *
+ * (To demonstrate that a probability this big really can give
+ * rise to a code length of 16, consider the set of input
+ * frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,
+ * 89, 144, 233, 377, 610, 987 }, for arbitrarily small
+ * epsilon.)
+ *
+ * So here buildhuf() has returned us an overlong code. So to
+ * ensure it doesn't do it again, we add a constant to all the
+ * (non-zero) symbol frequencies, causing them to become more
+ * balanced and removing the danger. We can then feed the
+ * results back to the standard buildhuf() and be
+ * assert()-level confident that the resulting code lengths
+ * contain nothing outside the permitted range.
+ */
+ assert(limit+3 < (int)lenof(fibonacci));
+ maxprob = fibonacci[limit+3];
+ totalfreq = nactivesyms = 0;
+ smallestfreq = -1;
+ for (i = 0; i < nsyms; i++) {
+ if (freqs[i] == 0)
+ continue;
+ if (smallestfreq < 0 || smallestfreq > freqs[i])
+ smallestfreq = freqs[i];
+ totalfreq += freqs[i];
+ nactivesyms++;
+ }
+ assert(smallestfreq <= totalfreq / maxprob);
+
+ /*
+ * We want to find the smallest integer `adjust' such that
+ * (totalfreq + nactivesyms * adjust) / (smallestfreq +
+ * adjust) is less than maxprob. A bit of algebra tells us
+ * that the threshold value is equal to
+ *
+ * totalfreq - maxprob * smallestfreq
+ * ----------------------------------
+ * maxprob - nactivesyms
+ *
+ * rounded up, of course. And we'll only even be trying
+ * this if
+ */
+ num = totalfreq - smallestfreq * maxprob;
+ denom = maxprob - nactivesyms;
+ adjust = (num + denom - 1) / denom;
+
+ /*
+ * Now add `adjust' to all the input symbol frequencies.
+ */
+ for (i = 0; i < nsyms; i++)
+ if (freqs[i] != 0)
+ freqs[i] += adjust;
+
+ /*
+ * Rebuild the Huffman tree...
+ */
+ buildhuf(sc, freqs, lengths, nsyms);
+
+ /*
+ * ... and this time it ought to be OK.
+ */
+ for (i = 0; i < nsyms; i++)
+ assert(lengths[i] <= limit);
+
+ cleanup:
+ /*
+ * Finally, free our scratch space.
+ */
+ sfree(sc->parent);
+ sfree(sc->length);
+ sfree(sc->heap);
+}
+
+#define MAXCODELEN 31 /* codes must fit in an int */
+
+int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms)
+{
+ unsigned count[MAXCODELEN], startcode[MAXCODELEN], code;
+ int maxlen, i;
+
+ /* Count the codes of each length. */
+ maxlen = 0;
+ for (i = 1; i < MAXCODELEN; i++)
+ count[i] = 0;
+ for (i = 0; i < nsyms; i++) {
+ count[lengths[i]]++;
+ if (maxlen < lengths[i])
+ maxlen = lengths[i];
+ }
+
+ /* Determine the starting code for each length block. */
+ code = 0;
+ for (i = 1; i < MAXCODELEN; i++) {
+ startcode[i] = code;
+ code += count[i];
+ if (code > (1U << i))
+ maxlen = -1; /* overcommitted */
+ code <<= 1;
+ }
+ if (code < (1U << MAXCODELEN))
+ maxlen = -2; /* undercommitted */
+
+ /* Determine the code for each symbol. */
+ for (i = 0; i < nsyms; i++)
+ codes[i] = startcode[lengths[i]]++;
+
+ return maxlen;
+}