// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.
// Package bzip2 implements bzip2 decompression.
package bzip2import// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot// of guessing: https://en.wikipedia.org/wiki/Bzip2// The source code to pyflate was useful for debugging:// http://www.paul.sladen.org/projects/pyflate// A StructuralError is returned when the bzip2 data is found to be// syntactically invalid.typeStructuralErrorstringfunc ( StructuralError) () string {return"bzip2 data invalid: " + string()}// A reader decompresses bzip2 compressed data.type reader struct { br bitReader fileCRC uint32 blockCRC uint32 wantBlockCRC uint32 setupDone bool// true if we have parsed the bzip2 header. eof bool blockSize int// blockSize in bytes, i.e. 900 * 1000. c [256]uint// the ``C'' array for the inverse BWT. tt []uint32// mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. tPos uint32// Index of the next output byte in tt. preRLE []uint32// contains the RLE data still to be processed. preRLEUsed int// number of entries of preRLE used. lastByte int// the last byte value seen. byteRepeats uint// the number of repeats of lastByte seen. repeats uint// the number of copies of lastByte to output.}// NewReader returns an io.Reader which decompresses bzip2 data from r.// If r does not also implement [io.ByteReader],// the decompressor may read more data than necessary from r.func ( io.Reader) io.Reader { := new(reader) .br = newBitReader()return}const bzip2FileMagic = 0x425a// "BZ"const bzip2BlockMagic = 0x314159265359const bzip2FinalMagic = 0x177245385090// setup parses the bzip2 header.func ( *reader) ( bool) error { := &.brif { := .ReadBits(16)if != bzip2FileMagic {returnStructuralError("bad magic value") } } := .ReadBits(8)if != 'h' {returnStructuralError("non-Huffman entropy encoding") } := .ReadBits(8)if < '1' || > '9' {returnStructuralError("invalid compression level") } .fileCRC = 0 .blockSize = 100 * 1000 * ( - '0')if .blockSize > len(.tt) { .tt = make([]uint32, .blockSize) }returnnil}func ( *reader) ( []byte) ( int, error) {if .eof {return0, io.EOF }if !.setupDone { = .setup(true) := .br.Err()if != nil { = }if != nil {return0, } .setupDone = true } , = .read() := .br.Err()if != nil { = }return}func ( *reader) ( []byte) int {// bzip2 is a block based compressor, except that it has a run-length // preprocessing step. The block based nature means that we can // preallocate fixed-size buffers and reuse them. However, the RLE // preprocessing would require allocating huge buffers to store the // maximum expansion. Thus we process blocks all at once, except for // the RLE which we decompress as required. := 0for (.repeats > 0 || .preRLEUsed < len(.preRLE)) && < len() {// We have RLE data pending.// The run-length encoding works like this: // Any sequence of four equal bytes is followed by a length // byte which contains the number of repeats of that byte to // include. (The number of repeats can be zero.) Because we are // decompressing on-demand our state is kept in the reader // object.if .repeats > 0 { [] = byte(.lastByte) ++ .repeats--if .repeats == 0 { .lastByte = -1 }continue } .tPos = .preRLE[.tPos] := byte(.tPos) .tPos >>= 8 .preRLEUsed++if .byteRepeats == 3 { .repeats = uint() .byteRepeats = 0continue }if .lastByte == int() { .byteRepeats++ } else { .byteRepeats = 0 } .lastByte = int() [] = ++ }return}func ( *reader) ( []byte) (int, error) {for { := .readFromBlock()if > 0 || len() == 0 { .blockCRC = updateCRC(.blockCRC, [:])return , nil }// End of block. Check CRC.if .blockCRC != .wantBlockCRC { .br.err = StructuralError("block checksum mismatch")return0, .br.err }// Find next block. := &.brswitch .ReadBits64(48) {default:return0, StructuralError("bad magic value found")casebzip2BlockMagic:// Start of block. := .readBlock()if != nil {return0, }casebzip2FinalMagic:// Check end-of-file CRC. := uint32(.ReadBits64(32))if .err != nil {return0, .err }if .fileCRC != { .err = StructuralError("file checksum mismatch")return0, .err }// Skip ahead to byte boundary. // Is there a file concatenated to this one? // It would start with BZ.if .bits%8 != 0 { .ReadBits(.bits % 8) } , := .r.ReadByte()if == io.EOF { .err = io.EOF .eof = truereturn0, io.EOF }if != nil { .err = return0, } , := .r.ReadByte()if != nil {if == io.EOF { = io.ErrUnexpectedEOF } .err = return0, }if != 'B' || != 'Z' {return0, StructuralError("bad magic value in continuation file") }if := .setup(false); != nil {return0, } } }}// readBlock reads a bzip2 block. The magic number should already have been consumed.func ( *reader) () ( error) { := &.br .wantBlockCRC = uint32(.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. .blockCRC = 0 .fileCRC = (.fileCRC<<1 | .fileCRC>>31) ^ .wantBlockCRC := .ReadBits(1)if != 0 {returnStructuralError("deprecated randomized files") } := uint(.ReadBits(24))// If not every byte value is used in the block (i.e., it's text) then // the symbol set is reduced. The symbols used are stored as a // two-level, 16x16 bitmap. := .ReadBits(16) := make([]bool, 256) := 0for := uint(0); < 16; ++ {if &(1<<(15-)) != 0 { := .ReadBits(16)for := uint(0); < 16; ++ {if &(1<<(15-)) != 0 { [16*+] = true ++ } } } }if == 0 {// There must be an EOF symbol.returnStructuralError("no symbols in input") }// A block uses between two and six different Huffman trees. := .ReadBits(3)if < 2 || > 6 {returnStructuralError("invalid number of Huffman trees") }// The Huffman tree can switch every 50 symbols so there's a list of // tree indexes telling us which tree to use for each 50 symbol block. := .ReadBits(15) := make([]uint8, )// The tree indexes are move-to-front transformed and stored as unary // numbers. := newMTFDecoderWithRange()for := range { := 0for { := .ReadBits(1)if == 0 {break } ++ }if >= {returnStructuralError("tree index too large") } [] = .Decode() }// The list of symbols for the move-to-front transform is taken from // the previously decoded symbol bitmap. := make([]byte, ) := 0for := 0; < 256; ++ {if [] { [] = byte() ++ } } := newMTFDecoder() += 2// to account for RUNA and RUNB symbols := make([]huffmanTree, )// Now we decode the arrays of code-lengths for each tree. := make([]uint8, )for := range {// The code lengths are delta encoded from a 5-bit base value. := .ReadBits(5)for := range {for {if < 1 || > 20 {returnStructuralError("Huffman length out of range") }if !.ReadBit() {break }if .ReadBit() { -- } else { ++ } } [] = uint8() } [], = newHuffmanTree()if != nil {return } } := 1// the next tree index to useiflen() == 0 {returnStructuralError("no tree selectors given") }ifint([0]) >= len() {returnStructuralError("tree selector out of range") } := [[0]] := 0// indexes bz2.buf, the output buffer.// The output of the move-to-front transform is run-length encoded and // we merge the decoding into the Huffman parsing loop. These two // variables accumulate the repeat count. See the Wikipedia page for // details. := 0 := 0// The `C' array (used by the inverse BWT) needs to be zero initialized.clear(.c[:]) := 0// counts the number of symbols decoded by the current tree.for {if == 50 {if >= {returnStructuralError("insufficient selector indices for number of symbols") }ifint([]) >= len() {returnStructuralError("tree selector out of range") } = [[]] ++ = 0 } := .Decode() ++if < 2 {// This is either the RUNA or RUNB symbol.if == 0 { = 1 } += << <<= 1// This limit of 2 million comes from the bzip2 source // code. It prevents repeat from overflowing.if > 2*1024*1024 {returnStructuralError("repeat count too large") }continue }if > 0 {// We have decoded a complete run-length so we need to // replicate the last output symbol.if > .blockSize- {returnStructuralError("repeats past end of block") }for := 0; < ; ++ { := .First() .tt[] = uint32() .c[]++ ++ } = 0 }ifint() == -1 {// This is the EOF symbol. Because it's always at the // end of the move-to-front list, and never gets moved // to the front, it has this unique value.break }// Since two metasymbols (RUNA and RUNB) have values 0 and 1, // one would expect |v-2| to be passed to the MTF decoder. // However, the front of the MTF list is never referenced as 0, // it's always referenced with a run-length of 1. Thus 0 // doesn't need to be encoded and we have |v-1| in the next // line. := .Decode(int( - 1))if >= .blockSize {returnStructuralError("data exceeds block size") } .tt[] = uint32() .c[]++ ++ }if >= uint() {returnStructuralError("origPtr out of bounds") }// We have completed the entropy decoding. Now we can perform the // inverse BWT and setup the RLE buffer. .preRLE = .tt[:] .preRLEUsed = 0 .tPos = inverseBWT(.preRLE, , .c[:]) .lastByte = -1 .byteRepeats = 0 .repeats = 0returnnil}// inverseBWT implements the inverse Burrows-Wheeler transform as described in// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.// In that document, origPtr is called “I” and c is the “C” array after the// first pass over the data. It's an argument here because we merge the first// pass with the Huffman decoding.//// This also implements the “single array” method from the bzip2 source code// which leaves the output, still shuffled, in the bottom 8 bits of tt with the// index of the next byte in the top 24-bits. The index of the first byte is// returned.func inverseBWT( []uint32, uint, []uint) uint32 { := uint(0)for := 0; < 256; ++ { += [] [] = - [] }for := range { := [] & 0xff [[]] |= uint32() << 8 []++ }return [] >> 8}// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,// causing the bits in the input to be processed in the reverse of the usual order.var crctab [256]uint32func init() {const = 0x04C11DB7for := rangecrctab { := uint32() << 24for := 0; < 8; ++ {if &0x80000000 != 0 { = ( << 1) ^ } else { <<= 1 } }crctab[] = }}// updateCRC updates the crc value to incorporate the data in b.// The initial value is 0.func updateCRC( uint32, []byte) uint32 { := ^for , := range { = crctab[byte(>>24)^] ^ ( << 8) }return ^}
The pages are generated with Goldsv0.7.3. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.