Intro
(See the version A Java program for compressing/decompressing files via Huffman algorithms (version 1.1.1).)
(The full repository is here. It contains also some unit tests that do not fit in this post due to size limits.)
I am making my very first steps in data compression, and so, decided to practice by writing a simple command line program for compressing and decompressing files via Huffman algorithms. On this HTML file, the non-compressed file size is 3 871 657 bytes, and its compressed size is 2 187 451 bytes (57 percent of the original size):

Limitations
Due to the fact that there is no way of allocating in Java a byte array larger than \$2^{31} - 1\$ array components, the program cannot handle files larger than around 2 GB.
What comes to the compression times, Linux zip took around 400 milliseconds, and my compressor twice as much. Also, on the same .html file, zip had a better compression ratio then my implementation by a factor of roughly two. A comparable scenario happens for compessing binary .dll files.
Code
(I will not post the unit tests, as they don't fit in 64kB.)
package io.github.coderodde.compressor.app;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* This class implements a Huffman compressor for binary (byte-wise) data.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025))
*/
public final class App {
/**
* The exit status for abnormal termination.
*/
private static final int EXIT_FAILURE = 1;
/**
* This extension is added to the compressed files.
*/
private static final String COMPRESSED_FILE_EXTENSION = ".huf";
public static void main(String[] args) {
try {
if (args.length == 1) {
compressFile(args[0]);
} else if (args.length == 2) {
decompressFile(args[0], args[1]);
} else {
printUsage();
}
} catch (final IOException ex) {
error(ex.getMessage());
System.exit(EXIT_FAILURE);
}
}
private static void printUsage() {
final String jarName = getJarFileName();
System.out.printf(
String.format(
"Usage: %s FILE - to compress FILE into FILE.huf\n",
jarName));
System.out.printf(
String.format(
" %s FILE.huf OUTPUT_FILE - " +
"to decompress FILE.huf into OUTPUT_FILE\n",
jarName));
}
private static void compressFile(final String inputFileName) throws IOException {
final File inputFile = new File(inputFileName);
if (!inputFile.exists()) {
error(String.format("The input file '%s' does not exist.\n",
inputFileName));
System.exit(EXIT_FAILURE);
}
if (inputFileName.endsWith(COMPRESSED_FILE_EXTENSION)) {
error(String.format("Input file '%s' already seems to be compressed.\n",
inputFileName));
System.exit(EXIT_FAILURE);
}
final Path path = inputFile.toPath();
final byte[] rawData = Files.readAllBytes(path);
final byte[] compressedData = HuffmanByteCompressor.compress(rawData);
final File outputFile =
new File(inputFileName + COMPRESSED_FILE_EXTENSION);
Files.write(outputFile.toPath(),
compressedData);
}
private static void decompressFile(final String compressedFileName,
final String outputFileName) throws IOException {
final File compressedFile = new File(compressedFileName);
final File outputFile = new File(outputFileName);
if (!compressedFile.exists()) {
error(String.format("Compressed file '%s' does not exist.\n",
compressedFileName));
System.exit(EXIT_FAILURE);
}
if (!outputFile.exists()) {
outputFile.createNewFile();
}
final Path compressedFilePath = compressedFile.toPath();
final byte[] compressedData = Files.readAllBytes(compressedFilePath);
final Path outputFilePath = outputFile.toPath();
final byte[] originalData =
HuffmanByteDecompressor.decompress(compressedData);
Files.write(outputFilePath, originalData);
}
private static void error(final String message) {
System.err.printf("[ERROR] %s", message);
}
/**
* Obtains the current name of this JAR-file.
*
* @return the name of this JAR-file.
*/
private static String getJarFileName() {
return new File(App.class.getProtectionDomain().getCodeSource()
.getLocation()
.getPath())
.getName();
}
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
* This class is responsible for decompressing the actual compressed data to the
* compression source data.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 17, 2025)
* @since 1.0.0 (Nov 17, 2025)
*/
public final class ByteArrayCompressedDataReader {
/**
* The resultant decompressed data.
*/
private final byte[] outputRawData;
/**
* The input compressed data.
*/
private final byte[] inputCompressedData;
/**
* The index of the bit where compressed data begins.
*/
private final long startingBitIndex;
/**
* The (Huffman) decoder tree.
*/
private final HuffmanDecoderTree<Byte> decoderTree;
/**
* Constructs this compressed data reader/decompressor.
*
* @param outputRawData the resultant decompressed data.
* @param inputCompressedData the input compressed data.
* @param startingBitIndex the index of the first bit to decompress right
* after the header.
* @param decoderTree the decoder tree.
*/
public ByteArrayCompressedDataReader(final byte[] outputRawData,
final byte[] inputCompressedData,
final long startingBitIndex,
final HuffmanDecoderTree<Byte>
decoderTree) {
this.outputRawData =
Objects.requireNonNull(
outputRawData,
"The output raw data is null");
this.inputCompressedData =
Objects.requireNonNull(
inputCompressedData,
"The input compressed data is null");
this.decoderTree =
Objects.requireNonNull(
decoderTree,
"The input decoder tree is null");
this.startingBitIndex = startingBitIndex;
}
/**
* Decompresses and reads the compressed data.
*/
public void read() {
final int totalBytes = outputRawData.length;
long currentBitIndex = startingBitIndex;
for (int byteIndex = 0;
byteIndex != totalBytes;
byteIndex++) {
final Byte dataByte = decoderTree.decode(inputCompressedData,
currentBitIndex);
outputRawData[byteIndex] = dataByte;
currentBitIndex += decoderTree.getPreviousCodeLength();
}
}
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
* This class is responsible for writing the actual compressed data to a byte
* array. This class does not handle the compression header; it is handled by
* {@link io.github.coderodde.compressor.app.ByteArrayHeaderWriter}.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 17, 2025)
* @since 1.0.0 (Nov 17, 2025)
*/
public final class ByteArrayCompressedDataWriter {
/**
* The target byte array to which all the compressed data will end up.
*/
private final byte[] compressedOutputData;
/**
* The actual data to be compressed.
*/
private final byte[] inputRawData;
/**
* The index of the first bit in the compressed data. We need this in order
* to omit the compression header.
*/
private final long startingBitIndex;
/**
* The actual (Huffman) code table.
*/
private final HuffmanCodeTable<Byte> codeTable;
/**
* Constructs this writer.
*
* @param compressedOutputData the compressed data byte array.
* @param inputRawData the input raw data byte array.
* @param startingBitIndex the starting bit index for the writing.
* @param codeTable the byte encoding table.
*/
public ByteArrayCompressedDataWriter(
final byte[] compressedOutputData,
final byte[] inputRawData,
final long startingBitIndex,
final HuffmanCodeTable<Byte> codeTable) {
this.compressedOutputData =
Objects.requireNonNull(
compressedOutputData,
"The output compressed data is null");
this.inputRawData =
Objects.requireNonNull(
inputRawData,
"The input raw data is null");
this.codeTable =
Objects.requireNonNull(
codeTable,
"The input code table is null");
this.startingBitIndex = startingBitIndex;
}
/**
* Writes the entire compressed data of {@code inputRawData}.
*/
public void write() {
long currentBitIndex = startingBitIndex;
for (final byte b : inputRawData) {
final CodeWord codeword = codeTable.getCodeword(b).reverse();
final int codewordLength = codeword.length();
writeCodeWord(compressedOutputData,
currentBitIndex,
codeword);
currentBitIndex += codewordLength;
}
}
/**
* Writes a single codeword to the compressed output data byte array.
*
* @param compressedOutputData the compressed output data byte array.
* @param currentBitIndex the current bit index.
* @param codeword the codeword to write.
*/
private static void writeCodeWord(final byte[] compressedOutputData,
final long currentBitIndex,
final CodeWord codeword) {
int byteIndex = (int) (currentBitIndex / Byte.SIZE);
int bitIndex = (int) (currentBitIndex % Byte.SIZE);
final int codewordLength = codeword.length();
for (int codewordBitIndex = 0;
codewordBitIndex < codewordLength;
codewordBitIndex++) {
if (codeword.get(codewordBitIndex)) {
setBit(compressedOutputData,
byteIndex,
bitIndex);
}
bitIndex++;
if (bitIndex == Byte.SIZE) {
bitIndex = 0;
byteIndex++;
}
}
}
/**
* Sets the {@code bitIndex}th bit in
* {@code compressedOutputData[byteIndex]}.
*
* @param compressedOutputData the compressed output data byte array.
* @param byteIndex the target byte index.
* @param bitIndex the target bit index.
*/
private static void setBit(final byte[] compressedOutputData,
final int byteIndex,
final int bitIndex) {
final byte mask = (byte)(1 << bitIndex);
compressedOutputData[byteIndex] |= mask;
}
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_RAW_DATA_LENGTH;
import java.nio.ByteBuffer;
import java.util.Objects;
import java.nio.ByteOrder;
import java.util.Arrays;
/**
* This class implements the reader returning the file header data such as the
* length of the raw data being compressed and its decoding Huffman tree.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class ByteArrayHeaderReader {
/**
* The compressed data byte array containing the header.
*/
private final byte[] compressedData;
/**
* We cache this in order users of this class can query the length of the
* raw data that would result from decompression.
*/
private final int rawDataLength;
/**
* The code table being read.
*/
private final HuffmanCodeTable<Byte> codeTable;
public ByteArrayHeaderReader(final byte[] compressedData) {
this.compressedData =
Objects.requireNonNull(compressedData,
"The input compressed data is null");
final int codeTableSize = readCodeTableSize();
this.rawDataLength = readRawDataLength();
this.codeTable = readCodeTable(codeTableSize);
}
public int getRawDataLength() {
return rawDataLength;
}
public HuffmanCodeTable<Byte> getCodeTable() {
return codeTable;
}
private int readCodeTableSize() {
final byte[] codeTableSizeBytes = new byte[BYTES_PER_CODE_SIZE];
System.arraycopy(compressedData,
0,
codeTableSizeBytes,
0,
codeTableSizeBytes.length);
final ByteBuffer byteBuffer = ByteBuffer.wrap(codeTableSizeBytes);
return byteBuffer.order(ByteOrder.LITTLE_ENDIAN).getInt();
}
private int readRawDataLength() {
final byte[] rawDataLengthBytes = new byte[BYTES_PER_RAW_DATA_LENGTH];
System.arraycopy(compressedData,
BYTES_PER_CODE_SIZE,
rawDataLengthBytes,
0,
rawDataLengthBytes.length);
final ByteBuffer byteBuffer = ByteBuffer.wrap(rawDataLengthBytes);
return byteBuffer.order(ByteOrder.LITTLE_ENDIAN).getInt();
}
private HuffmanCodeTable<Byte> readCodeTable(final int codeTableSize) {
final HuffmanCodeTable<Byte> codeTable = new HuffmanCodeTable<>();
final int codeEntryLength = Utils.getCodeEntryLength();
int byteCursor = BYTES_PER_CODE_SIZE + BYTES_PER_RAW_DATA_LENGTH;
for (int codeIndex = 0; codeIndex < codeTableSize; ++codeIndex) {
readCodeEntry(codeTable,
compressedData,
byteCursor);
byteCursor += codeEntryLength;
}
return codeTable;
}
private static void readCodeEntry(final HuffmanCodeTable<Byte> codeTable,
final byte[] compressedData,
final int byteCursor) {
final byte value = compressedData[byteCursor];
final byte length = compressedData[byteCursor + 1];
final byte[] codeEntryData =
Arrays.copyOfRange(compressedData,
byteCursor + 2,
byteCursor + 2 + BYTES_PER_CODEWORD_MAX);
final CodeWord codeword = inferCodeWord(length, codeEntryData);
codeTable.linkSymbolToCodeword(value, codeword);
}
private static CodeWord inferCodeWord(final int length,
final byte[] codeData) {
final int bits = ByteBuffer.wrap(codeData)
.order(ByteOrder.LITTLE_ENDIAN)
.getInt();
final CodeWord codeword = new CodeWord(length);
int mask = 1;
for (int i = 0; i < length; ++i) {
if ((bits & mask) != 0) {
codeword.set(i);
}
mask <<= 1;
}
return codeword;
}
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_RAW_DATA_LENGTH;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Map;
import java.util.Objects;
/**
* This class writes the file header to the compressed file.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class ByteArrayHeaderWriter {
/**
* The minimum length of the raw data byte array.
*/
private static final int MINIMUM_RAW_DATA_LENGTH = 1;
/**
* The length of the raw data in bytes.
*/
private final int rawDataLength;
/**
* The output data containing the compressed file.
*/
private final byte[] outputData;
/**
* The index of the bit in the compressed data byte array at which writing
* compressed data must begin.
*/
private long dataStartBitIndex;
/**
* The code table to write to the compressed file header.
*/
private final HuffmanCodeTable<Byte> codeTable;
public ByteArrayHeaderWriter(final int rawDataLength,
final byte[] outputData,
final HuffmanCodeTable<Byte> codeTable) {
checkRawDataLength(rawDataLength);
Objects.requireNonNull(outputData, "The output data array is null");
Objects.requireNonNull(codeTable, "The input code table is null");
checkCodeTable(codeTable);
this.rawDataLength = rawDataLength;
this.outputData = outputData;
this.codeTable = codeTable;
}
public void write() {
writeCodeSize();
writeRawDataLength();
writeCodeTable();
}
public long getDataStartBitIndex() {
return dataStartBitIndex;
}
/**
* Writes the code size to the very first 32-bit integer of the compressed
* file.
*/
private void writeCodeSize() {
final byte[] codeSizeBytes =
ByteBuffer.allocate(BYTES_PER_CODE_SIZE)
.order(ByteOrder.LITTLE_ENDIAN)
.putInt(codeTable.size())
.array();
System.arraycopy(codeSizeBytes,
0,
outputData,
0,
codeSizeBytes.length);
}
/**
* Writes the length of the raw data file to the second 32-bit word in the
* compressed file.
*/
private void writeRawDataLength() {
final byte[] rawDataLengthBytes =
ByteBuffer.allocate(BYTES_PER_RAW_DATA_LENGTH)
.order(ByteOrder.LITTLE_ENDIAN)
.putInt(rawDataLength)
.array();
System.arraycopy(rawDataLengthBytes,
0,
outputData,
BYTES_PER_CODE_SIZE,
rawDataLengthBytes.length);
}
/**
* Writes the actual code table to the compressed file header to starting
* from the 8th byte.
*/
private void writeCodeTable() {
int currentByteIndex = BYTES_PER_CODE_SIZE
+ BYTES_PER_RAW_DATA_LENGTH;
for (final Map.Entry<Byte, CodeWord> entry : codeTable) {
outputData[currentByteIndex++] = entry.getKey();
outputData[currentByteIndex++] = (byte) entry.getValue().length();
final byte[] codewordBytes = entry.getValue().toByteArray();
System.arraycopy(codewordBytes,
0,
outputData,
currentByteIndex,
codewordBytes.length);
currentByteIndex += BYTES_PER_CODEWORD_MAX;
}
this.dataStartBitIndex = currentByteIndex * Byte.SIZE;
}
private static void checkRawDataLength(final int rawDataLength) {
if (rawDataLength < MINIMUM_RAW_DATA_LENGTH) {
throw new TooShortRawDataLengthException(
String.format(
"The length of the raw data is too small: %d. " +
"Must be at least %d.",
rawDataLength,
MINIMUM_RAW_DATA_LENGTH));
}
}
private static void checkCodeTable(final HuffmanCodeTable<Byte> codeTable) {
if (codeTable.isEmpty()) {
throw new EmptyCodeTableException();
}
}
}
package io.github.coderodde.compressor.app;
import java.util.HashMap;
import java.util.Map;
/**
* This class provides a method for building instances of
* {@link io.github.coderodde.compressor.app.WeightDistribution} over byte-wise
* data.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class ByteWeightDistributionBuilder {
private static final int BYTE_ALPHABET_SIZE = 256;
private ByteWeightDistributionBuilder() {
}
/**
* Builds and returns the weight distribution of the input raw data.
*
* @param rawData the byte array holding the data to compress.
*
* @return the weight distribution.
*/
public static WeightDistribution<Byte>
buildByteWeightDistribution(final byte[] rawData) {
final WeightDistribution<Byte> weightDistribution =
new WeightDistribution<>();
final Map<Byte, Integer> frequencyMap =
new HashMap<>(BYTE_ALPHABET_SIZE);
for (final byte b : rawData) {
frequencyMap.put(b, frequencyMap.getOrDefault(b, 0) + 1);
}
for (final Map.Entry<Byte, Integer> entry : frequencyMap.entrySet()) {
weightDistribution.associateSymbolWithWeight(entry.getKey(),
entry.getValue());
}
return weightDistribution;
}
}
package io.github.coderodde.compressor.app;
import java.util.BitSet;
/**
* This class implements a <b>binary</b> code word in data compression
* scenarios.
*
* @author Rodion "rodde" Efremov
* @version 1.1.0 (Nov 13, 2025)
* @since 1.0.0 (Oct 28, 2025)
*/
public class CodeWord {
private int length;
private final BitSet bits;
public CodeWord(final int length) {
checkLength(length);
this.length = length;
this.bits = new BitSet(length);
}
public CodeWord reverse() {
final CodeWord reversed = new CodeWord(length);
for (int i = length - 1, j = 0; i >= 0; --i, ++j) {
if (get(i)) {
reversed.set(j);
}
}
return reversed;
}
public byte[] toByteArray() {
return bits.toByteArray();
}
public void prependBit(final boolean bit) {
if (bit) {
bits.set(length);
}
++length;
}
public int length() {
return length;
}
public boolean get(final int index) {
checkIndex(index);
return bits.get(index);
}
public void set(final int index) {
checkIndex(index);
bits.set(index);
}
public void unset(final int index) {
checkIndex(index);
bits.set(index, false);
}
@Override
public boolean equals(final Object object) {
if (object == this) {
return true;
}
if (object == null) {
return false;
}
if (!getClass().equals(object.getClass())) {
return false;
}
final CodeWord other = (CodeWord) object;
if (length() != other.length()) {
return false;
}
for (int i = 0; i < length(); ++i) {
if (get(i) != other.get(i)) {
return false;
}
}
return true;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(length);
for (int i = length - 1; i >= 0; --i) {
sb.append(get(i) ? "1" : "0");
}
return sb.toString();
}
private void checkIndex(final int index) {
if (index < 0) {
throw new IndexOutOfBoundsException(
String.format("index(%d) < 0", index));
}
if (index >= this.length) {
throw new IndexOutOfBoundsException(
String.format("index(%d) >= length(%d)",
index,
length));
}
}
private static void checkLength(final int length) {
if (length < 0) {
throw new IllegalArgumentException(
String.format("length(%d) < 1", length));
}
}
}
package io.github.coderodde.compressor.app;
/**
* The instances of this class are thrown when dealing with empty code tables.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class EmptyCodeTableException extends RuntimeException {
public EmptyCodeTableException() {
super("The code table is empty");
}
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
* This class implements a method for compressing byte-wise files via Huffman-
* coding.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025)
*/
public final class HuffmanByteCompressor {
/**
* Specifies how many bytes to use in order to communicate the size of the
* Huffman code.
*/
static final int BYTES_PER_CODE_SIZE = 4;
/**
* Specifies how many bytes to use in order to communicate the actual length
* (in bytes) of the input byte array.
*/
static final int BYTES_PER_RAW_DATA_LENGTH = 4;
/**
* Specifies how many bytes to reserve for describing the byte being
* encoded.
*/
static final int BYTES_PER_BYTE_DESCRIPTOR = 1;
/**
* Specifies how many bytes to reserve for signalling the codeword length.
*/
static final int BYTES_PER_CODEWORD_LENGTH = 1;
/**
* Specifies how many bytes to use for the codeword.
*/
static final int BYTES_PER_CODEWORD_MAX = 4;
/**
* Compresses the {@code rawData} {@code byte}-array using the input
* {@code weightDistribution}.
*
* @param rawData the raw data to compress.
*
* @return the full binary {@code byte}-array containing all the data needed
* to decompress the compressed file.
*/
public static byte[] compress(final byte[] rawData) {
Objects.requireNonNull(rawData);
if (rawData.length == 0) {
throw new IllegalArgumentException("The input byte array is empty");
}
final WeightDistribution<Byte> byteWeightDistribution =
ByteWeightDistributionBuilder
.buildByteWeightDistribution(rawData);
final HuffmanCodeTable<Byte> codeTable =
HuffmanCodeBuilder.buildCode(byteWeightDistribution);
final int countNumberOfBytesInCodeHeader =
Utils.countBytesInCodeHeader(codeTable.size());
final long countNumberOfBytesInRawData =
Utils.countBitsInRawData(codeTable,
rawData);
final byte[] outputData =
new byte[(int)(countNumberOfBytesInCodeHeader +
countNumberOfBytesInRawData)];
final ByteArrayHeaderWriter headerWriter =
new ByteArrayHeaderWriter(rawData.length,
outputData,
codeTable);
headerWriter.write();
final long startingDataBitIndex = headerWriter.getDataStartBitIndex();
final ByteArrayCompressedDataWriter dataWriter =
new ByteArrayCompressedDataWriter(outputData,
rawData,
startingDataBitIndex,
codeTable);
dataWriter.write();
return outputData;
}
}
package io.github.coderodde.compressor.app;
/**
* This class implements a method for <b>decompressing</b> byte-wise files via
* Huffman-coding.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025)
*/
public final class HuffmanByteDecompressor {
public static byte[] decompress(final byte[] compressedData) {
final ByteArrayHeaderReader headerReader =
new ByteArrayHeaderReader(compressedData);
final int rawDataLength = headerReader.getRawDataLength();
final byte[] rawData = new byte[rawDataLength];
final HuffmanCodeTable<Byte> codeTable = headerReader.getCodeTable();
final HuffmanDecoderTree<Byte> decoder =
new HuffmanDecoderTree<>(codeTable);
final int startingBitIndex =
Utils.countBytesInCodeHeader(codeTable.size()) * Byte.SIZE;
final ByteArrayCompressedDataReader dataReader =
new ByteArrayCompressedDataReader(rawData,
compressedData,
startingBitIndex,
decoder);
dataReader.read();
return rawData;
}
}
package io.github.coderodde.compressor.app;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
/**
* This class implements the Huffman code builder over weight distributions.
*
* @author Rodion "rodde" Efremov
* @version 2.0.1 (Nov 14, 2025)
* @since 1.0.0 (Nov 12, 2025)
*/
public final class HuffmanCodeBuilder {
private HuffmanCodeBuilder() {
// Hide constructor.
}
public static <S> HuffmanCodeTable<S>
buildCode(final WeightDistribution<S> weightDistribution) {
Objects.requireNonNull(weightDistribution,
"The input probability distribution is null");
final HuffmanCodeTable<S> codeTable = new HuffmanCodeTable<>();
if (weightDistribution.isEmpty()) {
return codeTable;
}
final Queue<WeightedSymbolSet<S>> queue = new PriorityQueue<>();
for (final Map.Entry<S, Double> entry : weightDistribution) {
final double weight = entry.getValue();
final S symbol = entry.getKey();
final Set<S> set = new HashSet<>();
set.add(symbol);
queue.add(new WeightedSymbolSet<>(set, weight));
}
for (final Map.Entry<S, Double> weightEntry : weightDistribution) {
codeTable.linkSymbolToCodeword(weightEntry.getKey(),
new CodeWord(0));
}
while (queue.size() > 1) {
final WeightedSymbolSet<S> entry1 = queue.remove();
final WeightedSymbolSet<S> entry2 = queue.remove();
for (final S symbol : entry1.getSet()) {
codeTable.getCodeword(symbol).prependBit(true);
}
for (final S symbol : entry2.getSet()) {
codeTable.getCodeword(symbol).prependBit(false);
}
entry1.getSet().addAll(entry2.getSet());
queue.add(new WeightedSymbolSet<>(
entry1.getSet(),
entry1.getTotalWeight() +
entry2.getTotalWeight()));
}
return codeTable;
}
}
package io.github.coderodde.compressor.app;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
/**
* This class implements the Huffman encoding table.
*
* @author Rodion "rodde" Efremov
* @param <S> the alphabet symbol type.
*
* @version 1.0.0 (Nov 13, 2025)
* @since 1.0.0 (Nov 13, 2025)
*/
public final class HuffmanCodeTable<S>
implements Iterable<Map.Entry<S, CodeWord>> {
/**
* The actual encoding table.
*/
private final Map<S, CodeWord> symbolToCodeWordMap = new HashMap<>();
/**
* Returns the number of symbol to codeword mappings.
*
* @return the size of this encoding table.
*/
public int size() {
return symbolToCodeWordMap.size();
}
/**
* Returns {@code true} if and only if this encoding table has no mappings.
*
* @return a Boolean flag indicating whether this encoding table is empty.
*/
public boolean isEmpty() {
return symbolToCodeWordMap.isEmpty();
}
public CodeWord getCodeword(final S symbol) {
Objects.requireNonNull(symbol, "The input symbol is null");
if (!symbolToCodeWordMap.containsKey(symbol)) {
throw new SymbolNotFoundException(
String.format(
"Symbol '%s' is not present in this encoding table",
symbol));
}
return symbolToCodeWordMap.get(symbol);
}
public void linkSymbolToCodeword(final S symbol, final CodeWord codeword) {
Objects.requireNonNull(symbol, "The input symbol is null");
Objects.requireNonNull(codeword, "The input codeword is null");
symbolToCodeWordMap.put(symbol, codeword);
}
@Override
public int hashCode() {
return symbolToCodeWordMap.hashCode();
}
@Override
public boolean equals(final Object object) {
if (object == this) {
return true;
}
if (object == null) {
return false;
}
if (!getClass().equals(object.getClass())) {
return false;
}
final HuffmanCodeTable<S> other = (HuffmanCodeTable<S>) object;
if (size() != other.size()) {
return false;
}
for (final Map.Entry<S, CodeWord> entry : this) {
final S symbol = entry.getKey();
final CodeWord codeword = entry.getValue();
final CodeWord codewordFromOther = other.getCodeword(symbol);
if (!codeword.equals(codewordFromOther)) {
return false;
}
}
return true;
}
@Override
public Iterator<Map.Entry<S, CodeWord>> iterator() {
return new SymbolCodeWordIterator();
}
@Override
public String toString() {
return symbolToCodeWordMap.toString();
}
/**
* The symbol/codeword entry.
*
* @param <S> the symbol type.
*/
private static final class SymbolCodeWordEntry<S>
implements Map.Entry<S, CodeWord> {
private final Map.Entry<S, CodeWord> mapEntry;
SymbolCodeWordEntry(final Map.Entry<S, CodeWord> mapEntry) {
this.mapEntry = mapEntry;
}
@Override
public S getKey() {
return mapEntry.getKey();
}
@Override
public CodeWord getValue() {
return mapEntry.getValue();
}
@Override
public CodeWord setValue(final CodeWord codeword) {
Objects.requireNonNull(codeword, "The input codeword is null");
final CodeWord old = mapEntry.getValue();
mapEntry.setValue(codeword);
return old;
}
}
/**
* Wraps the actual symbol table iterator.
*/
private final class SymbolCodeWordIterator
implements Iterator<Map.Entry<S, CodeWord>> {
private final Iterator<Map.Entry<S, CodeWord>> iterator;
public SymbolCodeWordIterator() {
this.iterator = symbolToCodeWordMap.entrySet().iterator();
}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public SymbolCodeWordEntry<S> next() {
return new SymbolCodeWordEntry<>(iterator.next());
}
}
}
package io.github.coderodde.compressor.app;
import java.util.Map;
import java.util.Objects;
/**
* This class implements the Huffman decoding tree.
*
* @author Rodion "rodde" Efremov
* @param <S> the alphabet symbol type.
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025)
*/
public final class HuffmanDecoderTree<S> {
/**
* This static inner class implements the Huffman decoding tree node.
*
* @param <S> the alphabet symbol type.
*/
private static final class TreeNode<S> {
S symbol;
TreeNode<S> zeroChild;
TreeNode<S> oneChild;
}
/**
* The root node of the tree.
*/
private final TreeNode<S> root = new TreeNode<>();
/**
* Caches the code length of the previously decoded symbol.
*/
private long previousCodeLength = -1L;
/**
* Constructs this Huffman decoding tree.
*
* @param codeTable the code table for which to construct the decoding tree.
*/
public HuffmanDecoderTree(final HuffmanCodeTable<S> codeTable) {
Objects.requireNonNull(codeTable, "The input code table is null");
for (final Map.Entry<S, CodeWord> entry : codeTable) {
final S symbol = entry.getKey();
final CodeWord codeword = entry.getValue();
insert(symbol, codeword);
}
}
/**
* Decodes a codeword to a symbol.
*
* @param compressedData the compressed data for decompression.
* @param bitIndex the index of the starting bit of a codeword to
* scan.
* @return the encoded symbol.
*/
public S decode(final byte[] compressedData, long bitIndex) {
Objects.requireNonNull(compressedData, "The input raw data is null");
previousCodeLength = 0;
TreeNode<S> node = root;
while (node.symbol == null) {
final boolean bit = readBit(compressedData,
bitIndex);
node = bit ? node.oneChild : node.zeroChild;
++bitIndex;
++previousCodeLength;
}
return node.symbol;
}
public long getPreviousCodeLength() {
return previousCodeLength;
}
private static boolean readBit(final byte[] rawData, final long bitIndex) {
final int byteIndex = (int) (bitIndex / Byte.SIZE);
final byte targetByte = rawData[byteIndex];
final byte mask = (byte)(1 << (bitIndex % Byte.SIZE));
return ((mask & targetByte) != 0);
}
/**
* Inserts the symbol/codeword pair into this tree.
*
* @param symbol the symbol to insert.
* @param codeword the codeword associated with the input symbol.
*/
private void insert(final S symbol, final CodeWord codeword) {
TreeNode<S> node = root;
for (int i = codeword.length() - 1; i >= 0; --i) {
final boolean bit = codeword.get(i);
if (bit) {
// Bit is set to 1.
if (node.oneChild == null) {
node.oneChild = new TreeNode<>();
}
node = node.oneChild;
} else {
// Bit is set to 0.
if (node.zeroChild == null) {
node.zeroChild = new TreeNode<>();
}
node = node.zeroChild;
}
}
if (node.symbol != null) {
throw new IllegalStateException(
String.format(
"Dublicate codeword [%s]: " +
"two symbols share the same code.",
codeword.toString()));
}
node.symbol = symbol;
}
}
package io.github.coderodde.compressor.app;
/**
* This exception class implements an exception that is thrown if
* {@link io.github.coderodde.compressor.app.WeightDistribution} does not contain a
* requested symbol.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 13, 2025)
* @since 1.0.0 (Nov 13, 2025)
*/
public class SymbolNotFoundException extends RuntimeException {
public SymbolNotFoundException(final String exceptionMessage) {
super(exceptionMessage);
}
}
package io.github.coderodde.compressor.app;
/**
* The instances of this class denote the situations where the compressed raw
* data is too short.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class TooShortRawDataLengthException extends RuntimeException {
public TooShortRawDataLengthException(final String exceptionMessage) {
super(exceptionMessage);
}
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_BYTE_DESCRIPTOR;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODEWORD_LENGTH;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.HuffmanByteCompressor.BYTES_PER_RAW_DATA_LENGTH;
/**
* This class contains some various helper methods.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 16, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class Utils {
private Utils() {
}
public static long countBitsInRawData(final HuffmanCodeTable<Byte> code,
final byte[] rawData) {
long bits = 0;
for (final byte b : rawData) {
bits += code.getCodeword(b).length();
}
return bits / Byte.SIZE + (bits % Byte.SIZE != 0 ? 1L : 0L);
}
public static int getCodeEntryLength() {
return BYTES_PER_BYTE_DESCRIPTOR +
BYTES_PER_CODEWORD_LENGTH +
BYTES_PER_CODEWORD_MAX;
}
public static int countBytesInCodeHeader(final int codeSize) {
final int codeEntryLength = getCodeEntryLength();
return (codeSize * codeEntryLength) + BYTES_PER_CODE_SIZE
+ BYTES_PER_RAW_DATA_LENGTH;
}
}
package io.github.coderodde.compressor.app;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
/**
* This class encapsulates a weight distribution.
*
* @author Rodion "rodde" Efremov
* @param <S> the alphabet symbol type.
*
* @version 1.0.0 (Nov 13, 2025)
* @since 1.0.0 (Nov 13, 2025)
*/
public final class WeightDistribution<S> implements Iterable<Map.Entry<S, Double>> {
/**
* The actual dictionary mapping symbols to their weights.
*/
private final Map<S, Double> symbolWeightMap = new HashMap<>();
/**
* Returns the number of symbol/weight mappings.
*
* @return the size of this distribution.
*/
public int size() {
return symbolWeightMap.size();
}
/**
* Returns {@code true} if and only if this distribution is empty.
*
* @return a Boolean flag indicating whether this distribution is empty.
*/
public boolean isEmpty() {
return symbolWeightMap.isEmpty();
}
/**
* Associates the input {@code symbol} with {@code weight}.
*
* @param symbol the symbol to associate.
* @param weight the associated weight.
*/
public void associateSymbolWithWeight(final S symbol, final double weight) {
Objects.requireNonNull(symbol, "The input symbol is null");
checkWeight(weight);
symbolWeightMap.put(symbol, weight);
}
/**
* Returns the weight that is associated with the {@code symbol}.
*
* @param symbol the symbol to query.
* @return the weight of {@code symbol}.
*/
public double getSymbolWeight(final S symbol) {
Objects.requireNonNull(symbol, "The input symbol is null");
if (!symbolWeightMap.containsKey(symbol)) {
throw new SymbolNotFoundException(
String.format("Symbol '%s' is unknown", symbol));
}
return symbolWeightMap.get(symbol);
}
private static void checkWeight(final double weight) {
if (Double.isNaN(weight)) {
throw new IllegalArgumentException("weight is NaN");
}
if (weight <= 0.0) {
throw new IllegalArgumentException(
String.format("weight(%f) <= 0.0", weight));
}
if (Double.isInfinite(weight)) {
throw new IllegalArgumentException("weight is Infinity");
}
}
/**
* Returns the iterator over this distribution's entries.
*
* @return the iterator.
*/
@Override
public Iterator<Map.Entry<S, Double>> iterator() {
return new WeightDistributionIterator();
}
@Override
public String toString() {
return symbolWeightMap.toString();
}
/**
* The weighted entry.
*
* @param <S> the symbol type.
*/
private static final class WeightedDistributionEntry<S>
implements Map.Entry<S, Double> {
private final Map.Entry<S, Double> mapEntry;
WeightedDistributionEntry(final Map.Entry<S, Double> mapEntry) {
this.mapEntry = mapEntry;
}
@Override
public S getKey() {
return mapEntry.getKey();
}
@Override
public Double getValue() {
return mapEntry.getValue();
}
@Override
public Double setValue(final Double weight) {
Objects.requireNonNull(weight, "The input weight is null");
checkWeight(weight);
final Double old = mapEntry.getValue();
mapEntry.setValue(weight);
return old;
}
}
/**
* Wraps the actual symbol table iterator.
*/
private final class WeightDistributionIterator
implements Iterator<Map.Entry<S, Double>> {
private final Iterator<Map.Entry<S, Double>> iterator;
public WeightDistributionIterator() {
this.iterator = symbolWeightMap.entrySet().iterator();
}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public WeightedDistributionEntry<S> next() {
return new WeightedDistributionEntry<>(iterator.next());
}
}
}
package io.github.coderodde.compressor.app;
import java.util.Set;
/**
* The instances of this class hold sets of symbols with the sum of their
* frequencies.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025)
*/
final class WeightedSymbolSet<S> implements Comparable<WeightedSymbolSet<S>> {
private final Set<S> set;
private final double totalSetWeight;
WeightedSymbolSet(final Set<S> set,
final double totalSetWeight) {
this.set = set;
this.totalSetWeight = totalSetWeight;
}
Set<S> getSet() {
return set;
}
double getTotalWeight() {
return totalSetWeight;
}
@Override
public int compareTo(final WeightedSymbolSet<S> o) {
return Double.compare(totalSetWeight, o.totalSetWeight);
}
}
Critique request
I have only two concerns: (1) efficiency, (2) maintainability of the code base. Please, tell me whatever comes to mind.