Please review my C++ CSV parsing class.
I have some specific questions:
- Should `get_next_record` be a static function?
- `CsvParser` implies values will be separated by commas, so is a constructor that takes a field separator over the top?
- I call `record.clear()` at the beginning of `get_next_record`. Are there any other ways of solving the problem of removing the previous record? I realise that you could return the record, but then you have the problem of how to deal with EOF or a stream error.
CsvParser.hpp
#ifndef CSV_PARSER_HPP_
#define CSV_PARSER_HPP_

#include <iostream>
#include <string>
#include <vector>

/// Text of a single field (one cell) of a record.
using Field = std::string;
/// One record: its fields in the order they were read.
using Record = std::vector<Field>;
/// A sequence of records.
using Records = std::vector<Record>;

/// Reads separator-delimited records from an input stream, one record per call.
///
/// The only state held by the parser is the field separator character; the
/// stream and the output record are supplied on every call.
class CsvParser {
public:
    /// Creates a parser that splits fields on @p field_separator.
    ///
    /// Marked `explicit` so that a plain `char` is never silently converted
    /// into a parser object.
    ///
    /// @param field_separator character that separates fields (default ',').
    explicit CsvParser(char field_separator = ',');

    /// Reads the next record from @p istrm into @p record.
    ///
    /// @p record is cleared first, so it only ever holds the fields of the
    /// record read by this call. A record ends at a line feed outside quotes
    /// or at end of input. Fields wrapped in double quotes may contain the
    /// separator and line breaks.
    ///
    /// @param istrm  stream the characters are read from.
    /// @param record receives the fields of the record.
    /// @return true if a record was read; false if the stream was not
    ///         readable or end of input was reached before any field data.
    bool get_next_record(std::istream& istrm, Record& record) const;

private:
    char field_separator_char;  // separator passed to the constructor
};

#endif // CSV_PARSER_HPP_
CsvParser.cpp
#include "CsvParser.hpp"

/// Stores the character that separates fields within a record.
CsvParser::CsvParser(char field_separator)
    : field_separator_char(field_separator) {}

/// Reads one record from @p istrm into @p record.
///
/// Parsing rules:
///  - @p record is cleared first, so it never carries over fields from a
///    previous call.
///  - A record ends at a line feed outside quotes, or at end of input.
///  - A double quote outside quotes opens a quoted section, in which the
///    separator, CR and LF are ordinary field characters.
///  - A double quote inside a quoted section closes it only when the next
///    character is the separator, CR, LF or end of input; otherwise it is kept
///    as a literal quote. This is a heuristic, not RFC 4180 quote doubling.
///  - Carriage returns outside quotes are discarded, so CRLF and LF line
///    endings give the same fields.
///
/// @param istrm  stream the characters are read from.
/// @param record receives the fields of the record.
/// @return true if a record was read; false if the stream was not readable on
///         entry or end of input was reached before any field data.
bool CsvParser::get_next_record(std::istream& istrm, Record& record) const {
    using Traits = std::istream::traits_type;

    // Use the stream's own end-of-file value rather than the C `EOF` macro,
    // which would need <cstdio>.
    const Traits::int_type eof = Traits::eof();
    // get()/peek() return characters converted via to_int_type; convert the
    // separator the same way so separators >= 0x80 still compare equal.
    const Traits::int_type separator = Traits::to_int_type(field_separator_char);

    record.clear();
    Field field;
    bool in_quotes = false;

    while (istrm) {
        const Traits::int_type ch = istrm.get();

        if (ch == eof || (ch == '\n' && !in_quotes)) {
            // End of input with nothing collected means "no more records".
            if (ch == eof && record.empty() && field.empty()) {
                return false;
            }
            record.push_back(field);
            return true;
        }

        if (ch == separator && !in_quotes) {
            record.push_back(field);
            field.clear();
        } else if (ch == '"') {
            if (!in_quotes) {
                in_quotes = true;
            } else {
                // Decide between a closing quote and an embedded one by
                // looking at what follows. '\r' is accepted so that a quoted
                // field at the end of a CRLF-terminated line is closed.
                const Traits::int_type next = istrm.peek();
                const bool closes_field = next == separator || next == '\n' ||
                                          next == '\r' || next == eof;
                if (closes_field) {
                    in_quotes = false;
                } else {
                    field += Traits::to_char_type(ch);
                }
            }
        } else if (ch != '\r' || in_quotes) {
            // Everything else is field data, except a CR outside quotes.
            field += Traits::to_char_type(ch);
        }
    }

    return false;
}
Exercising using google test:
#include <gtest/gtest.h>
#include "CsvParser.hpp"
#include <sstream>
#include <string>

namespace {

// Reads the next record from `strm` and checks that it equals `expected`.
//
// The read itself is a fatal assertion and the fields are compared as a whole
// record, so a record with the wrong number of fields produces a normal test
// failure instead of an out-of-bounds access on the vector.
void expect_next_record(const CsvParser& parser, std::istream& strm,
                        const Record& expected) {
    Record record;
    ASSERT_TRUE(parser.get_next_record(strm, record));
    EXPECT_EQ(record, expected);
}

}  // namespace

// Fixture providing a parser with the default (comma) separator.
class CsvParserTest : public ::testing::Test {
public:
    CsvParser parser;
};

// Empty input yields no record and leaves the output empty.
TEST_F(CsvParserTest, EmptyRecord) {
    std::istringstream strm("");
    Record record;
    EXPECT_FALSE(parser.get_next_record(strm, record));
    EXPECT_EQ(record.size(), 0u);
}

TEST_F(CsvParserTest, SimpleSingleRecord) {
    std::istringstream strm("AA,BB,CC");
    expect_next_record(parser, strm, {"AA", "BB", "CC"});
}

TEST_F(CsvParserTest, SimpleTwoRecord) {
    std::istringstream strm("AA,BB,CC\r\nDD,EE,FF");
    expect_next_record(parser, strm, {"AA", "BB", "CC"});
    expect_next_record(parser, strm, {"DD", "EE", "FF"});
}

TEST_F(CsvParserTest, SimpleQuotedField) {
    std::istringstream strm("\"A\",BB,CCC");
    expect_next_record(parser, strm, {"A", "BB", "CCC"});
}

TEST_F(CsvParserTest, QuotesEmbeddedInQuotedField) {
    std::istringstream strm("\"\"A\"\",BB,CCC");
    expect_next_record(parser, strm, {"\"A\"", "BB", "CCC"});
}

TEST_F(CsvParserTest, LinefeedEmbeddedInQuotedField) {
    std::istringstream strm("\"\"A\n\"\",BB,CCC");
    expect_next_record(parser, strm, {"\"A\n\"", "BB", "CCC"});
}

TEST_F(CsvParserTest, CommaEmbeddedInQuotedField) {
    std::istringstream strm(R"(""A,"",BB,CCC)");
    expect_next_record(parser, strm, {R"("A,")", "BB", "CCC"});
}

TEST_F(CsvParserTest, EmptyRow) {
    std::istringstream strm(",,");
    expect_next_record(parser, strm, {"", "", ""});
}

TEST_F(CsvParserTest, QuotedFollowedByTwoEmptyFields) {
    std::istringstream strm("\"A\n\n\nB\",,");
    expect_next_record(parser, strm, {"A\n\n\nB", "", ""});
}

TEST_F(CsvParserTest, EmptyThenQuotedThenEmptyField) {
    std::istringstream strm(",\"A\n\n\nB\",");
    expect_next_record(parser, strm, {"", "A\n\n\nB", ""});
}

TEST_F(CsvParserTest, EmptyEmptyThenQuoted) {
    std::istringstream strm(",,\"A\n\n\nB\"");
    expect_next_record(parser, strm, {"", "", "A\n\n\nB"});
}

TEST_F(CsvParserTest, CRLFEndOfLIne) {
    std::istringstream strm("A,B,C\r\nD,E,F");
    expect_next_record(parser, strm, {"A", "B", "C"});
    expect_next_record(parser, strm, {"D", "E", "F"});
}

TEST_F(CsvParserTest, EmbeddedCRLF) {
    std::istringstream strm("A,\"B\r\nC\",D\r\nE,F,G");
    expect_next_record(parser, strm, {"A", "B\r\nC", "D"});
    expect_next_record(parser, strm, {"E", "F", "G"});
}

// Several records mixing plain, quoted, empty and multi-line fields.
TEST_F(CsvParserTest, Complex) {
    // Shared between the input and the expectation so the two cannot drift.
    const std::string long_notes =
        "These are my long\nnotes on a load\nof stuff\n"
        " fancy some commas:,,,,,,,,,,,,,,,,,,,,,,,,,,,";
    const std::string csv =
        "AAA,BB,CCC\n"
        "DDD,EE,FFF\n"
        "\"A A\",\"B\nB\",CC\n"
        "\"A,B,C\",\"D E\",F\n"
        "\"Billy \"Da Man\" Hooker\",,\n"
        ",,\n"
        ",,\"Yo bitches!\"\n"
        ",,\"Holler if you luv dem \"hat\" bitches\"\n"
        ",\"" + long_notes + "\",";
    std::istringstream strm(csv);

    expect_next_record(parser, strm, {"AAA", "BB", "CCC"});
    expect_next_record(parser, strm, {"DDD", "EE", "FFF"});
    expect_next_record(parser, strm, {"A A", "B\nB", "CC"});
    expect_next_record(parser, strm, {"A,B,C", "D E", "F"});
    expect_next_record(parser, strm, {"Billy \"Da Man\" Hooker", "", ""});
    expect_next_record(parser, strm, {"", "", ""});
    expect_next_record(parser, strm, {"", "", "Yo bitches!"});
    expect_next_record(parser, strm,
                       {"", "", "Holler if you luv dem \"hat\" bitches"});
    expect_next_record(parser, strm, {"", long_notes, ""});
}

// A parser constructed with a different separator splits on that character.
TEST_F(CsvParserTest, TabSeparated) {
    std::istringstream strm("AA\tBB\tCC\nDD\tEE\tFF");
    CsvParser tab_parser('\t');
    expect_next_record(tab_parser, strm, {"AA", "BB", "CC"});
    expect_next_record(tab_parser, strm, {"DD", "EE", "FF"});
}