Huffman coding, using vector<bool>

Question

This is my draft implementation of Huffman encoding & decoding. I'd appreciate any feedback, but have a couple specific questions:

I wrote the code assuming vector represents each bool with a single bit. Is there a standard way to check whether the STL used by the compiler actually does that? With concepts would be best, but I don't know how.
I've profiled the code on VS2022 on x86, and for compress & decompress, most time is spent in compress in the line doing std::copy. Any way to optimize it?

 #include <map> #include <queue> #include <vector> #include <cstddef> #include <fstream> #include <unordered_map> class Huffman { public: struct freqPair { char val; uint32_t count; }; class compressed { public: std::vector<freqPair> frequencies; std::vector<bool> data; bool isFull() { return data.size() > 0; } bool isEmpty() { return data.size() == 0; } void save(const std::string& fileName) { std::ofstream outfile(fileName, std::ofstream::binary); uint64_t word = frequencies.size(); outfile.write((const char*)&word, sizeof(uint64_t)); for (auto p : frequencies) { outfile.write(&p.val, sizeof(char)); outfile.write((const char*)&p.count, sizeof(uint32_t)); } word = data.size(); outfile.write((const char*)&word, sizeof(uint64_t)); uint_fast8_t bits = 0; for (size_t i = 0; i < data.size(); i++) { word = word << 1 | data[i]; bits++; if (bits == sizeof(uint64_t) * CHAR_BIT) { outfile.write((const char*)&word, sizeof(uint64_t)); bits = 0; } } if (bits != 0) { word <<= (sizeof(uint64_t) * CHAR_BIT - bits); outfile.write((const char*)&word, sizeof(uint64_t)); } outfile.close(); } void load(const std::string& fileName) { data.clear(); frequencies.clear(); std::ifstream infile(fileName, std::ofstream::binary); uint64_t word; infile.read((char*)&word, sizeof(uint64_t)); freqPair p; for (uint64_t i = 0; i < word; i++) { infile.read(&p.val, sizeof(char)); infile.read((char*)&p.count, sizeof(uint32_t)); frequencies.push_back(p); } uint64_t totalBits, mask = 0; infile.read((char*)&totalBits, sizeof(uint64_t)); for (size_t i = 0; i < totalBits; i++) { if (mask == 0) { infile.read((char*)&word, sizeof(uint64_t)); mask = 1ULL << 63; } data.push_back((word & mask) != 0); mask >>= 1; } infile.close(); } }; private: class Tree { public: enum class status : std::uint8_t { Success, Duplicate, NotFound }; struct node { node(char v = '\0', uint32_t c = 0, node* l = nullptr, node* r = nullptr) : val(v), count(c), left(l), right(r) {} char val; uint32_t count; node *left, *right; bool isLeaf() { return left == nullptr && right == nullptr; } }; node* Root; Tree() : Root(nullptr) {} Tree(node* r) : Root(r) {} Tree(Tree&& rhs) noexcept : Root(rhs.Root) { rhs.Root = nullptr; } ~Tree() { if (Root) { std::queue<node*> nodes; nodes.push(Root); Root = nullptr; while (nodes.size()) { auto head = nodes.front(); nodes.pop(); if (!head->isLeaf()) { nodes.push(head->left); nodes.push(head->right); } delete head; } } } std::map<char, std::vector<bool>> createDictionary() { std::vector<bool> prefix; std::map<char, std::vector<bool>> dictionary; if (Root) createDictionary(Root, prefix, dictionary); return dictionary; } private: void createDictionary(const node* Node, std::vector<bool>& currPrefix, std::map<char, std::vector<bool>>& dictionary) { if (Node->left) { currPrefix.push_back(false); createDictionary(Node->left, currPrefix, dictionary); currPrefix.pop_back(); currPrefix.push_back(true); createDictionary(Node->right, currPrefix, dictionary); currPrefix.pop_back(); } else dictionary.insert(std::make_pair(Node->val, currPrefix)); } }; class nodeCompare { public: bool operator()(const Tree::node* lhs, const Tree::node* rhs) { if (lhs->count != rhs->count) return lhs->count > rhs->count; else return lhs->val > rhs->val; } }; Tree createTree(const std::unordered_map<char, uint32_t>& frequencies) { std::priority_queue<Tree::node*, std::vector<Tree::node*>, nodeCompare> priQ; for (auto& p : frequencies) priQ.push(new Tree::node(p.first, p.second)); while (priQ.size() > 1) { auto left = priQ.top(); priQ.pop(); auto right = priQ.top(); priQ.pop(); auto newNode = new Tree::node(left->val, left->count + right->count, left, right); priQ.push(newNode); } Tree t(priQ.top()); priQ.pop(); return t; } public: compressed compress(const std::string& data) { compressed retVal; if (data.length()) { std::unordered_map<char, uint32_t> frequencies; for (auto c : data) frequencies[c]++; auto tree = createTree(frequencies); auto dictionary = tree.createDictionary(); for (auto& f : frequencies) retVal.frequencies.push_back(freqPair(f.first, f.second)); for (auto ch : data) std::copy(dictionary[ch].begin(), dictionary[ch].end(), std::back_inserter(retVal.data)); } return retVal; } std::string decompress(const compressed& cdata) { std::string retVal; if (cdata.frequencies.size() > 0 && cdata.data.size() > 0) { std::unordered_map<char, uint32_t> frequencies; for (auto& p : cdata.frequencies) frequencies[p.val] = p.count; auto tree = createTree(frequencies); Tree::node *currNode = tree.Root; for (auto bit : cdata.data) { currNode = bit ? currNode->right : currNode->left; if (currNode->isLeaf()) { retVal.push_back(currNode->val); currNode = tree.Root; } } } return retVal; } };

user555045 · Accepted Answer · 2022-03-05 20:36:55Z

`std::vector<bool>`

I wrote the code assuming vector represents each bool with a single bit. Is there a standard way to check whether the STL used by the compiler actually does that?

I don't know how to do that or whether it's even possible, but because you're also asking for performance improvements, I have a kind-of answer to this: avoid std::vector<bool> in the first place. Then it's also not important whether it's packing the bools tightly or not.

It's not surprising that the std::copy into the back inserter of a std::vector<bool> takes a lot of time, especially if it is tightly packing the bools. Hypothetically it could work in some clever way that makes use of the fact that both the source and destination are tightly packed vector of booleans, using some bitwise operations to copy big blocks of bits at once.. but std::vector<bool> and "clever" don't really go together, in my experience the most likely outcome of writing code like that is the bools will be extracted one-by-one from the source vectors and inserted one-by-one into the destination vector. Not good. Later in the code, those bits are then extracted one-by-one once again, while being inserted into buffer, where they are tightly packed again. Lots of unpacking and then repacking.

So, my recommendation is to avoid std::vector<bool>. Another way to represent a code is with two integers (or an integer and an std::bitset<MAXLEN> where MAXLEN is some appropriately chosen constant), one integer to store the bits of the code and one integer to store the length of the code. Using that representation, a whole code (in some cases a partial code, when the buffer is nearly full) can be appended to the buffer in one go with some simple bitwise operations, with no loops over individual bits.

Efficient decoding

The current decoding algorithm is bit-by-bit tree-walking, which is inherently not efficient because it's bit-by-bit. An alternative to that is table-based decoding, in which you take MAXLEN bits and use those bits as an index into a 2^MAXLEN entry table that tells you what the first symbol in those bits is and how many bits to advance. For this purpose it is beneficial to set MAXLEN to some modest value between 14 and 18, which is commonly done for practical Huffman coding. BTW there are more advanced approaches that work with a smaller table.

There are various ways to set a limit on the code length, a simple one is just to detect whether any code is too long, and if so then divide your frequencies by 2 (while ensuring that non-zero freqs stay non-zero) and rebuild the Huffman tree. That "flattens" the tree so it results in making the longest code shorter, at the cost of making some other codes longer. That slightly affects the compression ratio, but practically not much. There is little practical benefit to supporting arbitrarily long codes.

Here are some references for table-based decoding and limiting the code lengths in other ways:

Introduction to table based Huffman decoding
An efficient algorithm of Huffman decoder with nearly constant decoding time
Huffman revisited - Part 2 : the Decoder
A Fast and Space - Economical Algorithm for Length - Limited Coding (for a way to generate the code lengths with a length limit)

`new Tree::node`

Up to 511 nodes could be used, 256 leafs and 255 internal nodes. All these nodes could be pre-allocated as a block, rather than being individually created with new. That would make the destructor of Tree a lot simpler too, or even unnecessary at all.

Maps and unordered maps with `char` as key

An std::map or an std::unordered_map with a char as key is more "heavy duty" than it needs to be. All of these can be arrays or vectors.

Representing the tree in the file header

Saving the vector of frequencies mostly works, though it relies on the decoder building the Huffman tree in exactly the same way as the encoder did. Huffman trees are not in general unique for a given vector of frequencies, but your program works because it is deterministic so it will build the same tree every time when given the same frequencies. A more effective approach though, is to use the canonical Huffman code, which then lets you get away with storing just the code lengths (which can be stored much more compactly in the file header, compared to the frequencies) and is unambiguous. A code book can be formed directly from the array of code lengths, and a decoding table for table-based decoding can be formed directly from the code book, so the decoder does not need to create a tree at all. You could still choose to create the tree from the code book however, so you could keep the bit-by-bit tree-walking decoder but also use canonical Huffman codes to get a smaller file header.

Well said. Many people consider the inclusion of vector <bool> into C++ an error in retrospect; this includes a large part of the ISO standard’s committee. — Aganju, CommentedMar 5, 2022 at 21:24
@Aganju: I like Howard Hinnant's take on it: A bit-array is a useful data structure for some things, and one that a standard library can usefully specialize algorithms like std::find for to give big speedups. The problem is that std::vector<bool> is a terrible name for that data structure; the mistake was providing it under that name, as a specialization making a normal vector unavailable. — Peter Cordes, CommentedMar 13, 2022 at 13:32
Since you discuss the question's concern that std::bool might not be bit-packed, the standard does leave open that possibility for some reason: 24.3.12 class vector<bool> (3)There is no requirement that the data be stored as a contiguous allocation of bool values. A space-optimized representation of bits is recommended instead. Perhaps one could check if std::vector<bool>::reference is the same type as bool&, if one was using vector<bool> instead of a custom bit-array. — Peter Cordes, CommentedMar 13, 2022 at 13:41

G. Sliepen · Accepted Answer · 2022-03-05 22:53:08Z

Missing I/O error checking

Your code does not check whether reading from or writing to files has succeeded, with the worst possible outcome being silent data corruption.

The proper way to check for errors is right at the end of reading or writing a file. For example, for reading, check that infile.eof() is true. If not, it means you did not succesfully read until the end of the file. However, in this case you might also want to check whether individual I/O operations, in particular after you have read the size of the frequency table or the total number of bits from the file, as otherwise variables like word and totalBits might contain incorrect data.

After writing all data to a file, call outfile.close() and then check that outfile.good() is true.

Avoid manual `new`/`delete`

It is easy to make mistakes with new and delete, and C++ has several ways to avoid you having to do manual memory management. You could consider using std::unique_ptr to store Tree::nodes:

std::priority_queue<std::unique_ptr<Tree::node>, ...> priQ; for (auto& p: frequencies) priQ.push(std::make_unique<Tree::node>(p.first, p.second)); while (priQ.size() > 1) { auto left = std::move(priQ.top()); priQ.pop(); auto right = std::move(priQ.top()); priQ.pop(); priQ.push(std::make_unique<Tree::node>( left->val, left->count + right->count, std::move(left), std::move(right) )); } return std::move(priQ.top());

It's a little bit more verbose, and now you have to make Tree, Tree::node and nodeCompare() deal with std::unique_ptrs to nodes, but the upshot is that you can remove the destructor for Tree.

Or, as harold suggested, find a way to avoid having to allocate individual nodes.

Stack Exchange Network

Huffman coding, using vector<bool>

2 Answers 2

`std::vector<bool>`

Efficient decoding

`new Tree::node`

Maps and unordered maps with `char` as key

Representing the tree in the file header

Missing I/O error checking

Avoid manual `new`/`delete`

Hot Network Questions

Huffman coding, using vector<bool>

2 Answers 2

std::vector<bool>

Efficient decoding

new Tree::node

Maps and unordered maps with char as key

Representing the tree in the file header

Missing I/O error checking

Avoid manual new/delete

Related

Hot Network Questions

`std::vector<bool>`

`new Tree::node`

Maps and unordered maps with `char` as key

Avoid manual `new`/`delete`