diff options
Diffstat (limited to 'huffman.c')
| -rw-r--r-- | huffman.c | 345 |
1 files changed, 345 insertions, 0 deletions
diff --git a/huffman.c b/huffman.c new file mode 100644 index 0000000..3fada10 --- /dev/null +++ b/huffman.c @@ -0,0 +1,345 @@ +/* + * huffman.c: implementation of huffman.h. + */ + +#include <assert.h> + +#include "halibut.h" +#include "huffman.h" + +static const unsigned fibonacci[] = { + 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, + 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, + 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465, + 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296, + 433494437, 701408733, 1134903170, 1836311903, 2971215073, +}; + +/* + * Binary heap functions used by buildhuf(). Each one assumes the + * heap to be stored in an array of ints, with two ints per node + * (user data and key). They take in the old heap length, and + * return the new one. + */ +#define HEAPPARENT(x) (((x)-2)/4*2) +#define HEAPLEFT(x) ((x)*2+2) +#define HEAPRIGHT(x) ((x)*2+4) +static int addheap(int *heap, int len, int userdata, int key) +{ + int me, dad, tmp; + + me = len; + heap[len++] = userdata; + heap[len++] = key; + + while (me > 0) { + dad = HEAPPARENT(me); + if (heap[me+1] < heap[dad+1]) { + tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp; + tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp; + me = dad; + } else + break; + } + + return len; +} +static int rmheap(int *heap, int len, int *userdata, int *key) +{ + int me, lc, rc, c, tmp; + + len -= 2; + *userdata = heap[0]; + *key = heap[1]; + heap[0] = heap[len]; + heap[1] = heap[len+1]; + + me = 0; + + while (1) { + lc = HEAPLEFT(me); + rc = HEAPRIGHT(me); + if (lc >= len) + break; + else if (rc >= len || heap[lc+1] < heap[rc+1]) + c = lc; + else + c = rc; + if (heap[me+1] > heap[c+1]) { + tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp; + tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp; + } else + break; + me = c; + } + + return len; +} + +struct hufscratch { + int *parent, *length, *heap; +}; + +/* + * The core of the Huffman algorithm: takes an input array of + * symbol frequencies, and produces an output array of code + * lengths. + * + * We cap the output code lengths to fit in an unsigned char (which is + * safe since our clients will impose some smaller limit on code + * length anyway). So if you see 255 in the output, it means '255 or + * more' and is a sign that whatever limit you really wanted has + * certainly been overflowed. + */ +static void buildhuf(struct hufscratch *sc, const int *freqs, + unsigned char *lengths, int nsyms) +{ + int heapsize; + int i, j, n; + int si, sj; + + for (i = 0; i < nsyms; i++) + sc->parent[i] = 0; + + /* + * Begin by building the heap. + */ + heapsize = 0; + for (i = 0; i < nsyms; i++) + if (freqs[i] > 0) /* leave unused symbols out totally */ + heapsize = addheap(sc->heap, heapsize, i, freqs[i]); + + /* + * Now repeatedly take two elements off the heap and merge + * them. + */ + n = nsyms; + while (heapsize > 2) { + heapsize = rmheap(sc->heap, heapsize, &i, &si); + heapsize = rmheap(sc->heap, heapsize, &j, &sj); + sc->parent[i] = n; + sc->parent[j] = n; + heapsize = addheap(sc->heap, heapsize, n, si + sj); + n++; + } + + /* + * Now we have our tree, in the form of a link from each node + * to the index of its parent. Count back down the tree to + * determine the code lengths. + */ + for (i = 0; i < 2*nsyms+1; i++) + sc->length[i] = 0; + /* The tree root has length 0 after that, which is correct. */ + for (i = n-1; i-- ;) + if (sc->parent[i] > 0) + sc->length[i] = 1 + sc->length[sc->parent[i]]; + + /* + * And that's it. (Simple, wasn't it?) Copy the lengths into + * the output array and leave. + * + * Here we cap lengths to fit in unsigned char. + */ + for (i = 0; i < nsyms; i++) + lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]); +} + +/* + * Wrapper around buildhuf() which enforces the restriction on code + * length. + */ +void build_huffman_tree(int *freqs, unsigned char *lengths, + int nsyms, int limit) +{ + struct hufscratch hsc, *sc = &hsc; + int smallestfreq, totalfreq, nactivesyms; + int num, denom, adjust; + int i; + int maxprob; + + sc->parent = snewn(2*nsyms+1, int); + sc->length = snewn(2*nsyms+1, int); + sc->heap = snewn(2*nsyms, int); + + /* + * Nasty special case: if the frequency table has fewer than + * two non-zero elements, we must invent some, because we can't + * have fewer than one bit encoding a symbol. + */ + assert(nsyms >= 2); + { + int count = 0; + for (i = 0; i < nsyms; i++) + if (freqs[i] > 0) + count++; + if (count < 2) { + for (i = 0; i < nsyms && count > 0; i++) + if (freqs[i] == 0) { + freqs[i] = 1; + count--; + } + } + } + + /* + * First, try building the Huffman table the normal way. If + * this works, it's optimal, so we don't want to mess with it. + */ + buildhuf(sc, freqs, lengths, nsyms); + + for (i = 0; i < nsyms; i++) + if (lengths[i] > limit) + break; + + if (i == nsyms) + goto cleanup; /* OK */ + + /* + * The Huffman algorithm can only ever generate a code length + * of N bits or more if there is a symbol whose probability is + * less than the reciprocal of the (N+2)th Fibonacci number + * (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or + * 1/55 for N=8. (This is a necessary though not sufficient + * condition.) + * + * Why is this? Well, consider the input symbol with the + * smallest probability. Let that probability be x. In order + * for this symbol to have a code length of at least 1, the + * Huffman algorithm will have to merge it with some other + * node; and since x is the smallest probability, the node it + * gets merged with must be at least x. Thus, the probability + * of the resulting combined node will be at least 2x. Now in + * order for our node to reach depth 2, this 2x-node must be + * merged again. But what with? We can't assume the node it + * merges with is at least 2x, because this one might only be + * the _second_ smallest remaining node. But we do know the + * node it merges with must be at least x, so our order-2 + * internal node is at least 3x. + * + * How small a node can merge with _that_ to get an order-3 + * internal node? Well, it must be at least 2x, because if it + * was smaller than that then it would have been one of the two + * smallest nodes in the previous step and been merged at that + * point. So at least 3x, plus at least 2x, comes to at least + * 5x for an order-3 node. + * + * And so it goes on: at every stage we must merge our current + * node with a node at least as big as the bigger of this one's + * two parents, and from this starting point that gives rise to + * the Fibonacci sequence. So we find that in order to have a + * node n levels deep (i.e. a maximum code length of n), the + * overall probability of the root of the entire tree must be + * at least F_{n+2} times the probability of the rarest symbol. + * In other words, since the overall probability is 1, it is a + * necessary condition for a code length of 16 or more that + * there must be at least one symbol with probability <= + * 1/F_18. + * + * (To demonstrate that a probability this big really can give + * rise to a code length of 16, consider the set of input + * frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, + * 89, 144, 233, 377, 610, 987 }, for arbitrarily small + * epsilon.) + * + * So here buildhuf() has returned us an overlong code. So to + * ensure it doesn't do it again, we add a constant to all the + * (non-zero) symbol frequencies, causing them to become more + * balanced and removing the danger. We can then feed the + * results back to the standard buildhuf() and be + * assert()-level confident that the resulting code lengths + * contain nothing outside the permitted range. + */ + assert(limit+3 < (int)lenof(fibonacci)); + maxprob = fibonacci[limit+3]; + totalfreq = nactivesyms = 0; + smallestfreq = -1; + for (i = 0; i < nsyms; i++) { + if (freqs[i] == 0) + continue; + if (smallestfreq < 0 || smallestfreq > freqs[i]) + smallestfreq = freqs[i]; + totalfreq += freqs[i]; + nactivesyms++; + } + assert(smallestfreq <= totalfreq / maxprob); + + /* + * We want to find the smallest integer `adjust' such that + * (totalfreq + nactivesyms * adjust) / (smallestfreq + + * adjust) is less than maxprob. A bit of algebra tells us + * that the threshold value is equal to + * + * totalfreq - maxprob * smallestfreq + * ---------------------------------- + * maxprob - nactivesyms + * + * rounded up, of course. And we'll only even be trying + * this if + */ + num = totalfreq - smallestfreq * maxprob; + denom = maxprob - nactivesyms; + adjust = (num + denom - 1) / denom; + + /* + * Now add `adjust' to all the input symbol frequencies. + */ + for (i = 0; i < nsyms; i++) + if (freqs[i] != 0) + freqs[i] += adjust; + + /* + * Rebuild the Huffman tree... + */ + buildhuf(sc, freqs, lengths, nsyms); + + /* + * ... and this time it ought to be OK. + */ + for (i = 0; i < nsyms; i++) + assert(lengths[i] <= limit); + + cleanup: + /* + * Finally, free our scratch space. + */ + sfree(sc->parent); + sfree(sc->length); + sfree(sc->heap); +} + +#define MAXCODELEN 31 /* codes must fit in an int */ + +int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms) +{ + unsigned count[MAXCODELEN], startcode[MAXCODELEN], code; + int maxlen, i; + + /* Count the codes of each length. */ + maxlen = 0; + for (i = 1; i < MAXCODELEN; i++) + count[i] = 0; + for (i = 0; i < nsyms; i++) { + count[lengths[i]]++; + if (maxlen < lengths[i]) + maxlen = lengths[i]; + } + + /* Determine the starting code for each length block. */ + code = 0; + for (i = 1; i < MAXCODELEN; i++) { + startcode[i] = code; + code += count[i]; + if (code > (1U << i)) + maxlen = -1; /* overcommitted */ + code <<= 1; + } + if (code < (1U << MAXCODELEN)) + maxlen = -2; /* undercommitted */ + + /* Determine the code for each symbol. */ + for (i = 0; i < nsyms; i++) + codes[i] = startcode[lengths[i]]++; + + return maxlen; +} |