I am writing a program that compresses and decompresses data using the Huffman Coding algorithm.
About compression: this program will export 2 files: header file (contains characters' frequency for re-constructing the Huffman Code Tree for decompressing data) and actual compressed data.
About decompression: After joining both files above, the program will decompress the data and write to file.
I am a beginner in C++ and would love to hear some feedback about my code style as well as my code design.
Thank you.
hzip.hpp (class for building Huffman Code Tree)
/* * Author: Harry Nguyen * Date: 23/05/2018 */ #ifndef _HZIP_HPP #define _HZIP_HPP #include <string> #include <vector> #include <map> #include <queue> // Data type for a Huffman char typedef unsigned short hchar_t; // Data type for frequency typedef unsigned int hfreq_t; // Data type for frequency table typedef std::map<hchar_t, hfreq_t> frmap_t; // Data type for huffman code map typedef std::map<hchar_t, std::string> hmap_t; // Data type for a bit typedef char bit_t; const hchar_t PSEUDO_EOF = 256; // A virtual end of file const hchar_t NOT_A_CHAR = 257; // Write to internal nodes const bit_t MAX_NUMBER_OF_BITS = 8; /* Convert string of bits ( "0" or "1") * to real byte * Return converted bytes in the form of strings */ std::string stobyte(std::string& sBits); /* * A Huffman node contains a character, its frequency * and 2 children node */ struct NODE { hchar_t _character; hfreq_t _freq; NODE* _lChild; NODE* _rChild; NODE(hchar_t character, hfreq_t freq): _character(character), _freq(freq), _lChild(nullptr), _rChild(nullptr) {} ~NODE() { delete _lChild; delete _rChild; } }; // ADT for min heap implementation struct Comparator { bool operator()(NODE* lChild, NODE* rChild) { return lChild->_freq > rChild->_freq; } }; class HZip { public: HZip(); /* * Build a frequency table from input string * and return a frequency map. */ frmap_t buildFrequencyTable(const std::string& input); /* * Build a Huffman tree from frequency table * and return the root. */ NODE* buildEncodingTree(frmap_t freqTable); /* * Get encoded map from the root of encoded tree * and return encoded map. */ hmap_t getEncodedMap(NODE* encodingTree); /* * Read characters from input string * and write bytes to output. * ------------------------------------- * Used for compression */ void encodeData(std::string& input, const frmap_t freqMap, hmap_t encodedMap, std::vector<std::string>& output); /* * Construct frequency table from the header * of the input string. * ------------------------------------------ * Used for decompression */ frmap_t headerProcessing(std::string& input); /* * Read bits by bits from the input string * and write character to the output. * ---------------------------------------- * Used for decompression */ void decodeData(std::string& input, NODE* encodingTree, std::string& output); protected: /* * Build Huffman encoding map. * ---------------------------- * Recursive approach. */ void buildEncodingMap(NODE* encodingTree, std::string sCode); protected: hmap_t p_encodedMap; private: std::priority_queue<NODE*, std::vector<NODE*>, Comparator> m_minHeap; }; #endif
hzip.cpp
#include "hzip.hpp" /* Convert string of bits to bytes */ std::string stobyte(std::string& sBits) { std::string sBytes; std::size_t numberOfBits = 0; bit_t bit = 0; /* Iterate through string of bits * If there are not enough 8 bits (to construct 1 byte) * when reaching the end of string bits then * 0 will be filled. That's the purpose of "+ numberOfBits" */ for(std::size_t i = 0; i < sBits.size() + numberOfBits; ++i) { // Convert character in form of binary to real bits. (sBits[i] == '0') ? (bit = 0) : (bit = 1); // Initialise byteChar once static char byteChar = 0; // then get 1 bit byteChar = (byteChar << 1) | bit; ++numberOfBits; // If reaching 8 bits (1 byte) if (numberOfBits == MAX_NUMBER_OF_BITS) { // then add that byte to the string sBytes += byteChar; // and reset byteChar and numberOfBits for the next iteration byteChar = 0; numberOfBits = 0; } } return sBytes; } HZip::HZip() {} frmap_t HZip::buildFrequencyTable(const std::string& input) { frmap_t freqTable; for (std::size_t i = 0; i < input.size(); ++i) ++freqTable[hchar_t(input[i])]; // cast input[i] from char to hchar_t // push a PSEDO_EOF to frequency table with freq = 1 freqTable[PSEUDO_EOF] = 1; return freqTable; } NODE* HZip::buildEncodingTree(frmap_t freqTable) { // push each char and its frequency to min heap for (auto it = freqTable.begin(); it != freqTable.end(); ++it) m_minHeap.push(new NODE(it->first, it->second)); // push until there is just 1 node left (root) while (m_minHeap.size() > 1) { // get least frequency node NODE* lChild = m_minHeap.top(); m_minHeap.pop(); // get next least frequency node NODE* rChild = m_minHeap.top(); m_minHeap.pop(); /* * Make a parent node with * key is a char or a NOT_A_CHAR which indicates an internal node * value is the sum of 2 least frequency */ NODE* parent = new NODE(NOT_A_CHAR, lChild->_freq + rChild->_freq); // Link to its children parent->_lChild = lChild; parent->_rChild = rChild; m_minHeap.push(parent); } // Top of min heap is the root of built tree return m_minHeap.top(); } /* * Traverse binary tree recursively * and push char and its huffman code to the map */ void HZip::buildEncodingMap(NODE* encodingTree, std::string sCode) { if (!encodingTree) return ; // If leaf node if (encodingTree->_character != NOT_A_CHAR) p_encodedMap.insert(std::pair<hchar_t, std::string> (encodingTree->_character, sCode)); HZip::buildEncodingMap(encodingTree->_lChild, sCode + "0"); HZip::buildEncodingMap(encodingTree->_rChild, sCode + "1"); } /* * Get Huffman coding Map */ hmap_t HZip::getEncodedMap(NODE* encodingTree) { HZip::buildEncodingMap(encodingTree, ""); delete encodingTree; return p_encodedMap; } void HZip::encodeData(std::string& input, const frmap_t freqMap, hmap_t encodedMap, std::vector<std::string>& output) { /* * Construct the header * Format: {,<hchart>$<hfreq>...} */ std::string header; header = "{"; for (auto it = freqMap.begin(); it != freqMap.end(); ++it) header += "," + std::to_string(it->first) + "$" + std::to_string(it->second); header += "}"; /* * Reconstruct the document with string of '0' and '1' */ std::string sBits; for (std::size_t i = 0; i < input.size(); ++i) { sBits += encodedMap[hchar_t(input[i])]; } // Put a PSEUDO_EOF at the end of the input string sBits += encodedMap[PSEUDO_EOF]; // Vector output contains 2 parts: the header and actual compressed data output.push_back(header); output.push_back(stobyte(sBits)); } frmap_t HZip::headerProcessing(std::string& input) { frmap_t freqTable; /* Get header */ std::size_t endOfHeader = input.find_first_of("}"); std::string header = input.substr(0, endOfHeader + 1); /* Positions for constructing key/value from header */ std::size_t start = header.find_first_of("{,") + 2; std::size_t end = start; while (start < header.size()) { end = header.find_first_of("$", start); // get character from header std::string character = header.substr(start, end - start); hchar_t hCharacter = std::stoi(character); // convert string to integer start = end + 1; end = header.find_first_of(",}", start); // get frequency from header std::string freq = header.substr(start, end - start); hfreq_t hFreq = std::stoi(freq); // convert char(freq) to integer start = end + 1; // Reconstruct freqTable freqTable.insert(std::pair<hchar_t, hfreq_t> (hCharacter, hFreq)); } return freqTable; } void HZip::decodeData(std::string& input, NODE* rootOfEncodedTree, std::string& output) { // initialise current node NODE* curNode = rootOfEncodedTree; // flag for break 2 loops bool flagBreak = false; /* Get position right after the header */ std::size_t endOfHeader = input.find_first_of("}"); std::size_t pos = endOfHeader + 1; while (pos < input.size()) { /* Read 1 bit at a time from input */ for (bit_t j = MAX_NUMBER_OF_BITS - 1; j >= 0; --j) { // Get bit bit_t bit = (input[pos] >> j) & 0x01; /* If bit = 0 then go to the left child * else go to the right child */ (bit == 0) ? (curNode = curNode->_lChild) : (curNode = curNode->_rChild); // Reach the end of input if (curNode->_character == PSEUDO_EOF) { // turn on flagBreak for breaking the while loop flagBreak = true; // break the for loop break; } // If reaching leaf node else if (curNode->_character != NOT_A_CHAR) { // print node's character and add to the output output += curNode->_character; // Scan from root again curNode = rootOfEncodedTree; } } // Used for break the outer loop (while loop) if (flagBreak) break; // next position of input string ++pos; } }
hfile.hpp (Header file for file and string processing)
#ifndef _HFILE_HPP_ #define _HFILE_HPP_ #include "hzip.hpp" #include <fstream> #include <sstream> #include <string> const unsigned char HEADER = 0; const unsigned char ACTUAL_COMPRESSED_DATA = 1; /* Namespace for file extensions */ namespace h_extension { const std::string hzip = ".hzip"; const std::string har = ".har"; } /* * Namespace for path manipulation * ---------------------------------------- * Can be used for constructing the path of * compressed and decompressed file */ namespace hfile { std::string getFileName(const std::string& path_to_file); std::string getParentDicrectory(const std::string& path_to_file); /* Get the extension of file need to be compressed * Example: raw file: test.txt -> .txt * -------------------------------------------------- * Used for constructing the path to compressed file */ std::string getSourceFileExtension(const std::string& path_to_file); /* Get the extension of original file * Example: compressed file: test.txt.har -> txt * --------------------------------------------------- * Used for constructing the path to decompressed file */ std::string getOriginalFileExtension(const std::string& path_to_file); std::string getSrarExtension(const std::string& path_to_file); } /* Compress inFile and split into 2 files: * the header and actual compressed file. * --------------------------------------------------- * dest_file_path is the path to the directory you want to write * your compressed file to, no need to include file name. */ void compress(const std::ifstream& inFile, std::ofstream& outHeaderFile, std::ofstream& outDataFile, const std::string& source_file_path, const std::string& dest_file_path, const std::string& extension); /* Join 2 files inFile_1 and inFile_2 and write to outFile. * ----------------------------------------------------------------------- * dest_file_path must include file name you want to create after joining */ void joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2, std::ofstream& outFile, const std::string& source_file_path, const std::string& dest_file_path); /* Decompress inFile and write to outFile * ------------------------------------------------------------------- * dest_file_path is the path to the directory you want to write * your compressed file to, no need to include file name. */ void decompress(const std::ifstream& inFile, std::ofstream& outFile, const std::string& source_file_path, const std::string& dest_file_path); #endif
hfile.cpp
#include "hfile.hpp" std::string hfile::getFileName(const std::string& path_to_file) { std::size_t start = 0; /* In case there is no parent directory at all */ if (path_to_file.find_last_of("/") != std::string::npos) start = path_to_file.find_last_of("/") + 1; else start = 0; std::size_t end = path_to_file.find_first_of(".", start); std::string file_name = path_to_file.substr(start, end - start); return file_name; } std::string hfile::getParentDicrectory(const std::string& path_to_file) { std::size_t end = 0; /* In case there is no parent directory at all */ if(path_to_file.find_last_of("/") != std::string::npos) end = path_to_file.find_last_of("/") + 1; else end = 0; std::string parent_directory = path_to_file.substr(0, end); return parent_directory; } std::string hfile::getSourceFileExtension(const std::string& path_to_file) { std::size_t start = path_to_file.find_last_of("."); std::size_t end = path_to_file.size(); std::string source_file_extension = path_to_file.substr(start, end - start); return source_file_extension; } std::string hfile::getOriginalFileExtension(const std::string& path_to_file) { std::size_t endOfPath = 0; /* In case there is no parent directory at all */ if (path_to_file.find_last_of("/") != std::string::npos) endOfPath = path_to_file.find_last_of("/"); else endOfPath = 0; // find the distance between 2 "." in the path // example: test.txt.har -> 4 (.txt) std::size_t start = path_to_file.find_first_of(".", endOfPath); // get next position of "." std::size_t end = path_to_file.find_first_of(".", start + 1); std::string original_file_extension = path_to_file.substr(start, end - start); return original_file_extension; } std::string hfile::getSrarExtension(const std::string& path_to_file) { std::size_t start = path_to_file.find_last_of("."); std::size_t end = path_to_file.size(); std::string srarExtension = path_to_file.substr(start, end - start); return srarExtension; } void compress(const std::ifstream& inFile, std::ofstream& outHeaderFile, std::ofstream& outDataFile, const std::string& source_file_path, const std::string& dest_file_path, const std::string& extension) { /* Read whole string in inFile * and store to inDoc */ std::stringstream buf; buf << inFile.rdbuf(); std::string inDoc = buf.str(); /* Read more on Huffman coding for more info */ HZip haz; frmap_t freqMap = haz.buildFrequencyTable(inDoc); NODE* root = haz.buildEncodingTree(freqMap); hmap_t encodedMap = haz.getEncodedMap(root); // data vector contains header and actual compressed data std::vector<std::string> data; haz.encodeData(inDoc, freqMap, encodedMap, data); /* Construct the path of header file * header_file_path = dest_dir_path + "h@"+ source_filename + * source_file_extension + huffman_extension */ std::string header_file_path = hfile::getParentDicrectory(dest_file_path) + "h@" + hfile::getFileName(source_file_path) + hfile::getSourceFileExtension(source_file_path) + extension; /* Write to header file */ outHeaderFile.open(header_file_path, std::ios::binary); outHeaderFile << data[HEADER]; /* Construct the path of actual compressed data file * similar to header_file_path, except for "d@" + filename instead of "h@" */ std::string data_file_path = hfile::getParentDicrectory(dest_file_path) + "d@" + hfile::getFileName(source_file_path) + hfile::getSourceFileExtension(source_file_path) + extension; /* Write to actual compressed file */ outDataFile.open(data_file_path, std::ios::binary); outDataFile << data[ACTUAL_COMPRESSED_DATA]; } void decompress(const std::ifstream& inFile, std::ofstream& outFile, const std::string& source_file_path, const std::string& dest_file_path) { /* Read whole string in inFile * and store to inDoc */ std::stringstream buf; buf << inFile.rdbuf(); std::string inDoc = buf.str(); /* Read more on Huffman coding decompression for more info */ HZip haz; frmap_t freqTable = haz.headerProcessing(inDoc); NODE* root = haz.buildEncodingTree(freqTable); std::string decompressedDoc; haz.decodeData(inDoc, root, decompressedDoc); // Free allocated root delete root; /* Construct the path of decompressed file * decompressed_file_path = dest_dir_path + <filename> + source_originalfile_extension */ std::string decompressed_file_path = hfile::getParentDicrectory(dest_file_path) + hfile::getFileName(source_file_path) + hfile::getOriginalFileExtension(source_file_path); /* Write to outFile */ outFile.open(decompressed_file_path, std::ios::binary); outFile << decompressedDoc; } void joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2, std::ofstream& outFile, const std::string& source_file_path, const std::string& dest_file_path) { /* Read whole inFile_1 and store to string inDoc_1 */ std::stringstream buf1; buf1 << inFile_1.rdbuf(); std::string inDoc_1 = buf1.str(); /* Read whole inFile_2 and store to string inDoc_2 */ std::stringstream buf2; buf2 << inFile_2.rdbuf(); std::string inDoc_2 = buf2.str(); // Join 2 strings std::string outDoc = inDoc_1 + inDoc_2; // Get raw file name (still contains "h@" and "d@") std::string source_file_name = hfile::getFileName(source_file_path); /* Remove "h@" or "d@" for getting actual file name */ std::size_t start = source_file_name.find_first_of("@") + 1; std::size_t end = source_file_name.size(); std::string joined_file_name = source_file_name.substr(start, end - start); /* joined_file_path = dest_dir + source_file_name + * original_source_file_extension + srar_extension */ std::string joined_file_path = hfile::getParentDicrectory(dest_file_path) + joined_file_name + hfile::getOriginalFileExtension(source_file_path) + hfile::getSrarExtension(source_file_path); /* Write joined string to dest_file_path */ outFile.open(joined_file_path, std::ios::binary); outFile << outDoc; }