237 lines
6.8 KiB
Go
237 lines
6.8 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package bzip2
|
|
|
|
import "sort"
|
|
|
|
// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
|
|
// symbol.
|
|
type huffmanTree struct {
|
|
// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
|
|
// root of the tree and nextNode contains the index of the next element
|
|
// of nodes to use when the tree is being constructed.
|
|
nodes []huffmanNode
|
|
nextNode int
|
|
}
|
|
|
|
// A huffmanNode is a node in the tree. left and right contain indexes into the
|
|
// nodes slice of the tree. If left or right is invalidNodeValue then the child
|
|
// is a left node and its value is in leftValue/rightValue.
|
|
//
|
|
// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
|
|
// tree, but also two magic values for run-length encoding and an EOF symbol.
|
|
// Thus there are more than 256 possible symbols.
|
|
type huffmanNode struct {
|
|
left, right uint16
|
|
leftValue, rightValue uint16
|
|
}
|
|
|
|
// invalidNodeValue is an invalid index which marks a leaf node in the tree.
|
|
const invalidNodeValue = 0xffff
|
|
|
|
// Decode reads bits from the given bitReader and navigates the tree until a
|
|
// symbol is found.
|
|
func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
|
|
nodeIndex := uint16(0) // node 0 is the root of the tree.
|
|
|
|
for {
|
|
node := &t.nodes[nodeIndex]
|
|
|
|
var bit uint16
|
|
if br.bits > 0 {
|
|
// Get next bit - fast path.
|
|
br.bits--
|
|
bit = uint16(br.n>>(br.bits&63)) & 1
|
|
} else {
|
|
// Get next bit - slow path.
|
|
// Use ReadBits to retrieve a single bit
|
|
// from the underling io.ByteReader.
|
|
bit = uint16(br.ReadBits(1))
|
|
}
|
|
|
|
// Trick a compiler into generating conditional move instead of branch,
|
|
// by making both loads unconditional.
|
|
l, r := node.left, node.right
|
|
|
|
if bit == 1 {
|
|
nodeIndex = l
|
|
} else {
|
|
nodeIndex = r
|
|
}
|
|
|
|
if nodeIndex == invalidNodeValue {
|
|
// We found a leaf. Use the value of bit to decide
|
|
// whether is a left or a right value.
|
|
l, r := node.leftValue, node.rightValue
|
|
if bit == 1 {
|
|
v = l
|
|
} else {
|
|
v = r
|
|
}
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// newHuffmanTree builds a Huffman tree from a slice containing the code
|
|
// lengths of each symbol. The maximum code length is 32 bits.
|
|
func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
|
|
// There are many possible trees that assign the same code length to
|
|
// each symbol (consider reflecting a tree down the middle, for
|
|
// example). Since the code length assignments determine the
|
|
// efficiency of the tree, each of these trees is equally good. In
|
|
// order to minimize the amount of information needed to build a tree
|
|
// bzip2 uses a canonical tree so that it can be reconstructed given
|
|
// only the code length assignments.
|
|
|
|
if len(lengths) < 2 {
|
|
panic("newHuffmanTree: too few symbols")
|
|
}
|
|
|
|
var t huffmanTree
|
|
|
|
// First we sort the code length assignments by ascending code length,
|
|
// using the symbol value to break ties.
|
|
pairs := make([]huffmanSymbolLengthPair, len(lengths))
|
|
for i, length := range lengths {
|
|
pairs[i].value = uint16(i)
|
|
pairs[i].length = length
|
|
}
|
|
|
|
sort.Slice(pairs, func(i, j int) bool {
|
|
if pairs[i].length < pairs[j].length {
|
|
return true
|
|
}
|
|
if pairs[i].length > pairs[j].length {
|
|
return false
|
|
}
|
|
if pairs[i].value < pairs[j].value {
|
|
return true
|
|
}
|
|
return false
|
|
})
|
|
|
|
// Now we assign codes to the symbols, starting with the longest code.
|
|
// We keep the codes packed into a uint32, at the most-significant end.
|
|
// So branches are taken from the MSB downwards. This makes it easy to
|
|
// sort them later.
|
|
code := uint32(0)
|
|
length := uint8(32)
|
|
|
|
codes := make([]huffmanCode, len(lengths))
|
|
for i := len(pairs) - 1; i >= 0; i-- {
|
|
if length > pairs[i].length {
|
|
length = pairs[i].length
|
|
}
|
|
codes[i].code = code
|
|
codes[i].codeLen = length
|
|
codes[i].value = pairs[i].value
|
|
// We need to 'increment' the code, which means treating |code|
|
|
// like a |length| bit number.
|
|
code += 1 << (32 - length)
|
|
}
|
|
|
|
// Now we can sort by the code so that the left half of each branch are
|
|
// grouped together, recursively.
|
|
sort.Slice(codes, func(i, j int) bool {
|
|
return codes[i].code < codes[j].code
|
|
})
|
|
|
|
t.nodes = make([]huffmanNode, len(codes))
|
|
_, err := buildHuffmanNode(&t, codes, 0)
|
|
return t, err
|
|
}
|
|
|
|
// huffmanSymbolLengthPair contains a symbol and its code length.
|
|
type huffmanSymbolLengthPair struct {
|
|
value uint16
|
|
length uint8
|
|
}
|
|
|
|
// huffmanCode contains a symbol, its code and code length.
|
|
type huffmanCode struct {
|
|
code uint32
|
|
codeLen uint8
|
|
value uint16
|
|
}
|
|
|
|
// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
|
|
// the Huffman tree at the given level. It returns the index of the newly
|
|
// constructed node.
|
|
func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
|
|
test := uint32(1) << (31 - level)
|
|
|
|
// We have to search the list of codes to find the divide between the left and right sides.
|
|
firstRightIndex := len(codes)
|
|
for i, code := range codes {
|
|
if code.code&test != 0 {
|
|
firstRightIndex = i
|
|
break
|
|
}
|
|
}
|
|
|
|
left := codes[:firstRightIndex]
|
|
right := codes[firstRightIndex:]
|
|
|
|
if len(left) == 0 || len(right) == 0 {
|
|
// There is a superfluous level in the Huffman tree indicating
|
|
// a bug in the encoder. However, this bug has been observed in
|
|
// the wild so we handle it.
|
|
|
|
// If this function was called recursively then we know that
|
|
// len(codes) >= 2 because, otherwise, we would have hit the
|
|
// "leaf node" case, below, and not recursed.
|
|
//
|
|
// However, for the initial call it's possible that len(codes)
|
|
// is zero or one. Both cases are invalid because a zero length
|
|
// tree cannot encode anything and a length-1 tree can only
|
|
// encode EOF and so is superfluous. We reject both.
|
|
if len(codes) < 2 {
|
|
return 0, StructuralError("empty Huffman tree")
|
|
}
|
|
|
|
// In this case the recursion doesn't always reduce the length
|
|
// of codes so we need to ensure termination via another
|
|
// mechanism.
|
|
if level == 31 {
|
|
// Since len(codes) >= 2 the only way that the values
|
|
// can match at all 32 bits is if they are equal, which
|
|
// is invalid. This ensures that we never enter
|
|
// infinite recursion.
|
|
return 0, StructuralError("equal symbols in Huffman tree")
|
|
}
|
|
|
|
if len(left) == 0 {
|
|
return buildHuffmanNode(t, right, level+1)
|
|
}
|
|
return buildHuffmanNode(t, left, level+1)
|
|
}
|
|
|
|
nodeIndex = uint16(t.nextNode)
|
|
node := &t.nodes[t.nextNode]
|
|
t.nextNode++
|
|
|
|
if len(left) == 1 {
|
|
// leaf node
|
|
node.left = invalidNodeValue
|
|
node.leftValue = left[0].value
|
|
} else {
|
|
node.left, err = buildHuffmanNode(t, left, level+1)
|
|
}
|
|
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
if len(right) == 1 {
|
|
// leaf node
|
|
node.right = invalidNodeValue
|
|
node.rightValue = right[0].value
|
|
} else {
|
|
node.right, err = buildHuffmanNode(t, right, level+1)
|
|
}
|
|
|
|
return
|
|
}
|