// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package bzip2 implements bzip2 decompression.
package bzip2 import // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot // of guessing: https://en.wikipedia.org/wiki/Bzip2 // The source code to pyflate was useful for debugging: // http://www.paul.sladen.org/projects/pyflate // A StructuralError is returned when the bzip2 data is found to be // syntactically invalid. type StructuralError string func ( StructuralError) () string { return "bzip2 data invalid: " + string() } // A reader decompresses bzip2 compressed data. type reader struct { br bitReader fileCRC uint32 blockCRC uint32 wantBlockCRC uint32 setupDone bool // true if we have parsed the bzip2 header. eof bool blockSize int // blockSize in bytes, i.e. 900 * 1000. c [256]uint // the ``C'' array for the inverse BWT. tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. tPos uint32 // Index of the next output byte in tt. preRLE []uint32 // contains the RLE data still to be processed. preRLEUsed int // number of entries of preRLE used. lastByte int // the last byte value seen. byteRepeats uint // the number of repeats of lastByte seen. repeats uint // the number of copies of lastByte to output. } // NewReader returns an io.Reader which decompresses bzip2 data from r. // If r does not also implement [io.ByteReader], // the decompressor may read more data than necessary from r. func ( io.Reader) io.Reader { := new(reader) .br = newBitReader() return } const bzip2FileMagic = 0x425a // "BZ" const bzip2BlockMagic = 0x314159265359 const bzip2FinalMagic = 0x177245385090 // setup parses the bzip2 header. func ( *reader) ( bool) error { := &.br if { := .ReadBits(16) if != bzip2FileMagic { return StructuralError("bad magic value") } } := .ReadBits(8) if != 'h' { return StructuralError("non-Huffman entropy encoding") } := .ReadBits(8) if < '1' || > '9' { return StructuralError("invalid compression level") } .fileCRC = 0 .blockSize = 100 * 1000 * ( - '0') if .blockSize > len(.tt) { .tt = make([]uint32, .blockSize) } return nil } func ( *reader) ( []byte) ( int, error) { if .eof { return 0, io.EOF } if !.setupDone { = .setup(true) := .br.Err() if != nil { = } if != nil { return 0, } .setupDone = true } , = .read() := .br.Err() if != nil { = } return } func ( *reader) ( []byte) int { // bzip2 is a block based compressor, except that it has a run-length // preprocessing step. The block based nature means that we can // preallocate fixed-size buffers and reuse them. However, the RLE // preprocessing would require allocating huge buffers to store the // maximum expansion. Thus we process blocks all at once, except for // the RLE which we decompress as required. := 0 for (.repeats > 0 || .preRLEUsed < len(.preRLE)) && < len() { // We have RLE data pending. // The run-length encoding works like this: // Any sequence of four equal bytes is followed by a length // byte which contains the number of repeats of that byte to // include. (The number of repeats can be zero.) Because we are // decompressing on-demand our state is kept in the reader // object. if .repeats > 0 { [] = byte(.lastByte) ++ .repeats-- if .repeats == 0 { .lastByte = -1 } continue } .tPos = .preRLE[.tPos] := byte(.tPos) .tPos >>= 8 .preRLEUsed++ if .byteRepeats == 3 { .repeats = uint() .byteRepeats = 0 continue } if .lastByte == int() { .byteRepeats++ } else { .byteRepeats = 0 } .lastByte = int() [] = ++ } return } func ( *reader) ( []byte) (int, error) { for { := .readFromBlock() if > 0 || len() == 0 { .blockCRC = updateCRC(.blockCRC, [:]) return , nil } // End of block. Check CRC. if .blockCRC != .wantBlockCRC { .br.err = StructuralError("block checksum mismatch") return 0, .br.err } // Find next block. := &.br switch .ReadBits64(48) { default: return 0, StructuralError("bad magic value found") case bzip2BlockMagic: // Start of block. := .readBlock() if != nil { return 0, } case bzip2FinalMagic: // Check end-of-file CRC. := uint32(.ReadBits64(32)) if .err != nil { return 0, .err } if .fileCRC != { .err = StructuralError("file checksum mismatch") return 0, .err } // Skip ahead to byte boundary. // Is there a file concatenated to this one? // It would start with BZ. if .bits%8 != 0 { .ReadBits(.bits % 8) } , := .r.ReadByte() if == io.EOF { .err = io.EOF .eof = true return 0, io.EOF } if != nil { .err = return 0, } , := .r.ReadByte() if != nil { if == io.EOF { = io.ErrUnexpectedEOF } .err = return 0, } if != 'B' || != 'Z' { return 0, StructuralError("bad magic value in continuation file") } if := .setup(false); != nil { return 0, } } } } // readBlock reads a bzip2 block. The magic number should already have been consumed. func ( *reader) () ( error) { := &.br .wantBlockCRC = uint32(.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. .blockCRC = 0 .fileCRC = (.fileCRC<<1 | .fileCRC>>31) ^ .wantBlockCRC := .ReadBits(1) if != 0 { return StructuralError("deprecated randomized files") } := uint(.ReadBits(24)) // If not every byte value is used in the block (i.e., it's text) then // the symbol set is reduced. The symbols used are stored as a // two-level, 16x16 bitmap. := .ReadBits(16) := make([]bool, 256) := 0 for := uint(0); < 16; ++ { if &(1<<(15-)) != 0 { := .ReadBits(16) for := uint(0); < 16; ++ { if &(1<<(15-)) != 0 { [16*+] = true ++ } } } } if == 0 { // There must be an EOF symbol. return StructuralError("no symbols in input") } // A block uses between two and six different Huffman trees. := .ReadBits(3) if < 2 || > 6 { return StructuralError("invalid number of Huffman trees") } // The Huffman tree can switch every 50 symbols so there's a list of // tree indexes telling us which tree to use for each 50 symbol block. := .ReadBits(15) := make([]uint8, ) // The tree indexes are move-to-front transformed and stored as unary // numbers. := newMTFDecoderWithRange() for := range { := 0 for { := .ReadBits(1) if == 0 { break } ++ } if >= { return StructuralError("tree index too large") } [] = .Decode() } // The list of symbols for the move-to-front transform is taken from // the previously decoded symbol bitmap. := make([]byte, ) := 0 for := 0; < 256; ++ { if [] { [] = byte() ++ } } := newMTFDecoder() += 2 // to account for RUNA and RUNB symbols := make([]huffmanTree, ) // Now we decode the arrays of code-lengths for each tree. := make([]uint8, ) for := range { // The code lengths are delta encoded from a 5-bit base value. := .ReadBits(5) for := range { for { if < 1 || > 20 { return StructuralError("Huffman length out of range") } if !.ReadBit() { break } if .ReadBit() { -- } else { ++ } } [] = uint8() } [], = newHuffmanTree() if != nil { return } } := 1 // the next tree index to use if len() == 0 { return StructuralError("no tree selectors given") } if int([0]) >= len() { return StructuralError("tree selector out of range") } := [[0]] := 0 // indexes bz2.buf, the output buffer. // The output of the move-to-front transform is run-length encoded and // we merge the decoding into the Huffman parsing loop. These two // variables accumulate the repeat count. See the Wikipedia page for // details. := 0 := 0 // The `C' array (used by the inverse BWT) needs to be zero initialized. clear(.c[:]) := 0 // counts the number of symbols decoded by the current tree. for { if == 50 { if >= { return StructuralError("insufficient selector indices for number of symbols") } if int([]) >= len() { return StructuralError("tree selector out of range") } = [[]] ++ = 0 } := .Decode() ++ if < 2 { // This is either the RUNA or RUNB symbol. if == 0 { = 1 } += << <<= 1 // This limit of 2 million comes from the bzip2 source // code. It prevents repeat from overflowing. if > 2*1024*1024 { return StructuralError("repeat count too large") } continue } if > 0 { // We have decoded a complete run-length so we need to // replicate the last output symbol. if > .blockSize- { return StructuralError("repeats past end of block") } for := 0; < ; ++ { := .First() .tt[] = uint32() .c[]++ ++ } = 0 } if int() == -1 { // This is the EOF symbol. Because it's always at the // end of the move-to-front list, and never gets moved // to the front, it has this unique value. break } // Since two metasymbols (RUNA and RUNB) have values 0 and 1, // one would expect |v-2| to be passed to the MTF decoder. // However, the front of the MTF list is never referenced as 0, // it's always referenced with a run-length of 1. Thus 0 // doesn't need to be encoded and we have |v-1| in the next // line. := .Decode(int( - 1)) if >= .blockSize { return StructuralError("data exceeds block size") } .tt[] = uint32() .c[]++ ++ } if >= uint() { return StructuralError("origPtr out of bounds") } // We have completed the entropy decoding. Now we can perform the // inverse BWT and setup the RLE buffer. .preRLE = .tt[:] .preRLEUsed = 0 .tPos = inverseBWT(.preRLE, , .c[:]) .lastByte = -1 .byteRepeats = 0 .repeats = 0 return nil } // inverseBWT implements the inverse Burrows-Wheeler transform as described in // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. // In that document, origPtr is called “I” and c is the “C” array after the // first pass over the data. It's an argument here because we merge the first // pass with the Huffman decoding. // // This also implements the “single array” method from the bzip2 source code // which leaves the output, still shuffled, in the bottom 8 bits of tt with the // index of the next byte in the top 24-bits. The index of the first byte is // returned. func inverseBWT( []uint32, uint, []uint) uint32 { := uint(0) for := 0; < 256; ++ { += [] [] = - [] } for := range { := [] & 0xff [[]] |= uint32() << 8 []++ } return [] >> 8 } // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, // causing the bits in the input to be processed in the reverse of the usual order. var crctab [256]uint32 func init() { const = 0x04C11DB7 for := range crctab { := uint32() << 24 for := 0; < 8; ++ { if &0x80000000 != 0 { = ( << 1) ^ } else { <<= 1 } } crctab[] = } } // updateCRC updates the crc value to incorporate the data in b. // The initial value is 0. func updateCRC( uint32, []byte) uint32 { := ^ for , := range { = crctab[byte(>>24)^] ^ ( << 8) } return ^ }