2
\$\begingroup\$

I have implemented a CSV reader. I think I did pretty well. Since CSV is a loosely defined format to begin with I decided to allow some malformations, like anything but a delimiter after an enclosed value.

Maybe someone could point out improvements to this class, I would be happy to know them.

using System; using System.Collections.Generic; using System.Text; using System.IO; namespace ConsoleApplication49 { public class CsvReader { private const char Sym_Escape = '"'; private static int StandardInitialRowSize = 16; private StreamReader reader; private char delimiter; private char[] buffer; private int bufferSize; private int bufferBound; private int bufferPos; private bool endReached; private bool boundReached; private bool returnImplicitRow; private int initialRowSize; private int valuePos; private StringBuilder valueBuilder; public CsvReader(Stream stream, char delimiter = ',', int bufferSize = 4096) { #region check if (stream == null) { throw new ArgumentNullException("stream"); } if (delimiter == Sym_Escape || delimiter == '\r') { throw new ArgumentException("Invalid delimiter", "delimiter"); } if (bufferSize < 128) { throw new ArgumentException("Invalid buffer size", "bufferSize"); } #endregion this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize); this.delimiter = delimiter; this.buffer = new char[bufferSize]; this.bufferSize = bufferSize; this.initialRowSize = StandardInitialRowSize; this.valueBuilder = new StringBuilder(128); if (reader.BaseStream.Length == 0) { returnImplicitRow = true; } } public bool Read(out string[] outRow) { Assert(); if (endReached) { if (returnImplicitRow) { returnImplicitRow = false; outRow = new string[1]; return true; } else { outRow = null; return false; } } string[] row = new string[initialRowSize]; int rowSize = initialRowSize; int rowPos = 0; bool newlineReached = false; do { Assert(); if (endReached) { goto SetValue; } char chr = buffer[bufferPos++]; if (chr == Sym_Escape) { Assert(); if (endReached) { goto SetValue; } valuePos = bufferPos; chr = buffer[bufferPos++]; while (true) { #region Regular assertion if (bufferPos == bufferBound) { valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1); if (reader.EndOfStream) { endReached = true; } else { bufferBound = reader.Read(buffer, 0, bufferSize); bufferPos = 0; valuePos = 0; } boundReached = true; } else { boundReached = false; } #endregion if (chr == Sym_Escape) { if (endReached) { goto SetValue; } chr = buffer[bufferPos]; if (chr == Sym_Escape) { if (boundReached) { valueBuilder.Append(Sym_Escape); } else { valueBuilder.Append(buffer, valuePos, bufferPos - valuePos); } bufferPos++; valuePos = bufferPos; Assert(); } else { if (!boundReached) { valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1); valuePos = bufferPos; } bufferPos++; break; } } else if (boundReached) { valueBuilder.Append(chr); } if (endReached) { goto SetValue; } chr = buffer[bufferPos++]; } } while (true) { #region Regular assertion if (bufferPos == bufferBound) { valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1); if (reader.EndOfStream) { endReached = true; } else { bufferBound = reader.Read(buffer, 0, bufferSize); bufferPos = 0; valuePos = 0; } boundReached = true; } else { boundReached = false; } #endregion if (chr == delimiter) { if (!boundReached) { valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1); valuePos = bufferPos; } endReached = false; break; } else if (chr == '\r' && !endReached && buffer[bufferPos] == '\n') { if (!boundReached) { valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1); } bufferPos++; valuePos = bufferPos; Assert(); if (endReached) { returnImplicitRow = true; } newlineReached = true; break; } else if (boundReached) { valueBuilder.Append(chr); } if (endReached) { break; } chr = buffer[bufferPos++]; } SetValue: if (rowPos == rowSize) { rowSize *= 2; Array.Resize(ref row, rowSize); } row[rowPos++] = valueBuilder.ToString(); valueBuilder.Length = 0; } while (!endReached && !newlineReached); if (rowPos < rowSize) { Array.Resize(ref row, rowPos); } outRow = row; initialRowSize = rowPos; return true; } private void Assert() { if (bufferPos == bufferBound) { if (reader.EndOfStream) { endReached = true; } else { bufferBound = reader.Read(buffer, 0, bufferSize); bufferPos = 0; valuePos = 0; } boundReached = true; } else { boundReached = false; } } } } 

Here is some code to do testing

using System; using System.Collections.Generic; using System.Text; using System.IO; namespace ConsoleApplication49 { class Program { static void Main(string[] args) { CsvReader csvReader = new CsvReader(OpenFile(@"D:\Users\Administrator\desktop\test.csv")); string[] row; while (csvReader.Read(out row)) { int len = row.Length - 1; for (int i = 0; i <= len; i++) { Console.Write(Filter(row[i], Char.IsControl)); if (i < len) { Console.Write('|'); } } Console.WriteLine(); } Console.ReadLine(); } public static string Filter(string str, Func<char, bool> invalidator) { StringBuilder sb = new StringBuilder(); foreach (char c in str) { if (!invalidator.Invoke(c)) { sb.Append(c); } } return sb.ToString(); } public static FileStream OpenFile(string filePath) { return OpenFile(filePath, FileAccess.ReadWrite, FileShare.None); } public static FileStream OpenFile(string filePath, FileAccess fileAccess, FileShare fileShare) { FileStream fs = null; try { fs = File.Open(filePath, FileMode.Open, fileAccess, fileShare); } catch (Exception) { } return fs; } } } 
\$\endgroup\$
17
  • \$\begingroup\$Can I ask why you didn't just import microsoft.visualbasic.fileio and use the native textfieldparser class?\$\endgroup\$
    – JohnP
    CommentedJul 2, 2014 at 22:14
  • \$\begingroup\$@JohnP I would rather not rely on utilities hiding in a language namespace. But eitherway it doesn't sound like an interesting approach.\$\endgroup\$CommentedJul 2, 2014 at 22:19
  • \$\begingroup\$imgs.xkcd.com/comics/goto.png\$\endgroup\$CommentedJul 2, 2014 at 22:22
  • 3
    \$\begingroup\$People have already made sure to critique the hundreds of lines method that includes a goto, and the fact that you havent disposed your stream. But they seem to have not noticed yet that you also have catch(Exception) { }, which is arguably worse. Let's catch and continue when we get an OutOfMemoryException! Those aren't important! Yeah, nice plan!\$\endgroup\$
    – Magus
    CommentedJul 2, 2014 at 23:17
  • 5
    \$\begingroup\$Leopold - This question is designed to be controversial. Your opening paragraph makes broad, opinionated statements that have no facts to back them (yet). Your opinions about goto are also controversial. This question feels like it is trolling. Please remove the hyperbole, or give examples of where your code is more complete than other libraries, more performant, etc. I will be happy to go though and edit this myself in an hour or so.\$\endgroup\$
    – rolfl
    CommentedJul 3, 2014 at 1:29

5 Answers 5

14
\$\begingroup\$

Including the signature, scope opening and closing braces, and all the superfluous vertical whitespace you've got here, the Read method has 223 lines of code, checks if the end was reached in 7 places and includes 4 goto instructions.

Other answers already stated it, but I don't think it can ever be stressed enough: goto is a synonym for bad control flow; the 220-some lines of code in the method reinforce that: Read is, in reality, doing much more than just reading, and your code could really benefit from being broken down into several smaller, private methods.

But let's start with the class itself, and its members:

private const char Sym_Escape = '"'; 

I don't like the name of that constant. Even if it were SymbolToEscape I still wouldn't like it. It's a constant, it's never going to need to change. People know what to expect of a constant that's called FourtyTwo, it's never going to change. A double quote can reasonably be expected to remain represented by a " - I'd call it DoubleQuote, and a line like this would be much easier to understand:

valueBuilder.Append(Sym_Escape); 

Versus:

valueBuilder.Append(DoubleQuote); 

The StandardInitialRowSize looks like it could be any arbitrary value. Where's the 16 coming from? Why isn't it 0 or 32? Why is it static? Why isn't it readonly?

private static int StandardInitialRowSize = 16; ... private int initialRowSize; ... this.initialRowSize = StandardInitialRowSize; 

Not making this value a constant is probably a good design decision. However one shouldn't reasonably expect the standard initial row size to change at any point during the lifetime of an instance of that class, so making it readonly would clarify the intent:

private static readonly int StandardInitialRowSize = 16; 
this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize); 

Looking at how the reader is used:

CsvReader csvReader = new CsvReader(OpenFile(@"D:\Users\Administrator\desktop\test.csv")); 

I think it's nice that you provide a constructor that takes a Stream parameter, but as a consumer of that class I would also expect to be able to instantiate it like this:

var reader = new CsvReader(@"D:\Users\Administrator\desktop\test.csv"); 

...which would require chained constructors, one of which taking in a string path parameter:

public CsvReader(string path, char delimiter = ',', int bufferSize = 4096) : this(File.Open(filePath, FileMode.Open, FileAccess.ReadWrite, FileShare.None) { } 

And let the client deal with any exceptions that stem from an invalid file name, or any other exceptions that might be thrown. Your code currently swallows all of these exceptions and potentially sends the constructor a null reference, which you're guarding against by throwing an ArgumentNullException - that's nice and it keeps your class in a consistent/workable state, but it doesn't hold any trace of why this situation is happening in the first place - is the file inexistent? is the file locked for editing by another process? by another thread in the same process? by another instance of your reader? does the user have permissions to access that file? By swallowing the exception that held that information, you failed to fail fast and lost very valuable information, and that makes debugging much harder than it needs to be.

The object is a reader. Does the file really need to be opened with ReadWrite access? It seems pretty awkward to have a reader lock a file for writing when you know from the start that you're not going to write anything.


if (delimiter == Sym_Escape || delimiter == '\r') { throw new ArgumentException("Invalid delimiter", "delimiter"); } 

I always chuckle when I see Comma-Separated-Values data with a delimiter other than a comma. But you're being flexible here, and that's good. However this guard clause is likely to evolve into a bunch of ... || ... || ..., a more maintainable approach would be to use an inline array:

if (new[] { Sym_Escape, '\r' }.Contains(delimiter)) { throw new ArgumentException("Invalid delimiter", "delimiter"); } 

Now if you want to consider \n an invalid delimiter, you can just add it to the array and be done with it.

I don't like the #region around the guard clauses, but that's my own personal opinion. I have a strong bias against #region, because I found that whenever I thought I needed a #region in a method, it turned out it was because the method was doing too many things and, after refactoring into smaller methods, regions weren't needed anymore; when I thought I needed a #region in a class, it turned out it was because the class was doing too many things and, after refactoring into smaller classes, regions weren't needed anymore.

I don't like labels (what goto points to) either, pretty much for the same reason: whenever I thought I needed to use a goto and jump to a label, it turned out after refactoring and extracting the methods and DRYing up my code, the execution flow untangled before my eyes, and goto wasn't needed anymore; without a goto to jump to it, a label is pretty much useless, so I stopped using those too.

Yes, goto can make your code work. But unless proven otherwise goto is always a missed refactoring opportunity.


On this line:

this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize); 

You are creating a StreamReader instance. The "typical" usage is normally as follows:

using (var reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize)) { int c; while ((c = reader.Read()) != -1) { // ... } } 

The using block ensures proper disposal of the StreamReader. It's important to dispose of streams and usually, a using block does that automagically.

The only alternative is to call the Dispose method manually. And this is what your code is not doing. Please see Using an object that implements IDisposable.

The primary use of this interface is to release unmanaged resources. The garbage collector automatically releases the memory allocated to a managed object when that object is no longer used. However, it is not possible to predict when garbage collection will occur. Furthermore, the garbage collector has no knowledge of unmanaged resources such as window handles, or open files and streams.

Use the Dispose method of this interface to explicitly release unmanaged resources in conjunction with the garbage collector. The consumer of an object can call this method when the object is no longer needed.

(http://msdn.microsoft.com/en-us/library/system.idisposable.aspx))

An easy rule to follow, is that the object that creates the disposable, is also responsible for disposing the disposable.

In this case the CsvReader class is responsible for creating the reader, using the stream that was passed through the constructor. The reader gets "pinned" into a private field, at instance level - and the reason you need to do this is to enable that while (reader.Read()) client loop. That's a pretty good reason.

You can't call Dispose in the constructor, Read would throw an ObjectDisposedException the minute you try to access the encapsulated reader. You're encapsulating an object that implements IDisposable - the only reasonable thing to do is to implement IDisposable too, and call the Dispose method of the reader instance there.

Then you also enable the using block the client code would rightfully be expecting to be able to use.


You're using a StringBuilder, that's awesome:

this.valueBuilder = new StringBuilder(128); 

Where's the 128 coming from? It's probably best to promote that magic number to a private static readonly field.


I've only scratched the surface, and this answer is getting pretty long already. You have a nice little spaghetti plate there. Refactor, my friend. Refactor. Extract smaller methods: if you can look at a block of code and put a comment above it that says what it does (don't do that, it's a bad commenting habit to have), you'd find that everytime you have such a block of code, it can be extracted into its own method pretty neatly.

I have to say something about your Assert method though:

  1. Bad, bad, bad, bad name.
  2. Bad. Name.
  3. Bad name.
  4. Assert says nothing about what that method is doing, and without a parameter and without a return type, only reading the code will tell.
\$\endgroup\$
15
  • \$\begingroup\$Impressive work :)\$\endgroup\$
    – Nikita B
    CommentedJul 3, 2014 at 7:59
  • 7
    \$\begingroup\$Assert clashes with Debug.Assert, takes no parameters and returns nothing. This is Code Review; you're asking for other programmers' point of view. If I read your code and say "WTF" you can explain all you want, I still WTF'd and that means you have written something that increases your code's "WTF-per-minute" count (the only valid code quality measurement, right?), and got something to fix to make your intent clearer so you don't have to explain everything your code itself could tell all by itself. What you call "regular assertions" are guard clauses.\$\endgroup\$CommentedJul 3, 2014 at 10:42
  • 2
    \$\begingroup\$And a symbol to escape can change, a double quote can't. Pi is by definition a constant, you're pulling my leg. A constant shouldn't be called with some vague name that makes it look like a variable, otherwise just make it static readonly instead of a const. But who cares, it's private. Know that changing a public constant's value is a breaking change that requires recompilation of all client code that refers to it. This isn't just semantics.\$\endgroup\$CommentedJul 3, 2014 at 10:53
  • 2
    \$\begingroup\$@LeopoldAsperger: Nothing is more arbitrary than your decision that the mathematical constant Pi is just as variable as Sym_Escape which, I might add, has an underscore in the middle of it, which is almost universally bad in C#. Additionally, you're mentioning performance considerations such as inlining and not using an array for a character comparison. If the performance difference was noticeable, you might have a point. Mostly, it makes your code more brittle and ugly. If you truly value performance over code quality, try C. All C is not bad, but it'd let you be as bad as you want.\$\endgroup\$
    – Magus
    CommentedJul 3, 2014 at 15:14
  • 1
    \$\begingroup\$@LeopoldAsperger: In any case, define symbolic constants according to their function, and comments that document the EXACT rules of the quoting algorithm you are following.\$\endgroup\$
    – Ben Voigt
    CommentedJul 3, 2014 at 16:00
6
\$\begingroup\$

First of all, GOTOs are evil. You should never touch them, as Dijkstra said, go To Statement Considered Harmful.

Dont forget to dispose your StreamReader, you can implement IDisposable to dispose it.

public class CsvReader : IDisposable { void Dispose() { if(null != this.reader) { this.reader.Dispose() } } } 
\$\endgroup\$
8
  • \$\begingroup\$If there is any legitimate reason not to use goto, it must be to avoid prejudgement. Not using goto because it looks prettier the other way is superficial. I didn't implement IDisposable because disposing the stream is the responsibility of the caller. When the caller lends its stream doesn't mean that it should be claimt.\$\endgroup\$CommentedJul 2, 2014 at 22:51
  • 2
    \$\begingroup\$@LeopoldAsperger I didn't understand the first 2 sentences. Anyways, you shouldn't assume that the caller is a nice chap and he is gonna close the stream\$\endgroup\$CommentedJul 2, 2014 at 22:57
  • 3
    \$\begingroup\$@LeopoldAsperger: It's yourStreamReader, held in a private member. How can it be the caller's responsibility to dispose what he can't access? You can use this constructor if you don't want to pull the rug out from under your caller by closing his stream. Even better, provide a boolean parameter of your own to allow the caller to select between automatic and manual disposal.\$\endgroup\$
    – Ben Voigt
    CommentedJul 2, 2014 at 22:58
  • 1
    \$\begingroup\$@LeopoldAsperger: It does that only if you used the wrong constructor. Don't count on dispose not doing anything else.\$\endgroup\$
    – Ben Voigt
    CommentedJul 2, 2014 at 23:08
  • 2
    \$\begingroup\$@LeopoldAsperger: 1. You have a stream. 2. You should close it.\$\endgroup\$
    – Magus
    CommentedJul 2, 2014 at 23:09
6
\$\begingroup\$

If you understand your toolbox (standard library functions) you can make them perform well and not need to reinvent them.

This is crazy:

// in constructor this.initialRowSize = StandardInitialRowSize; // per-line string[] row = new string[initialRowSize]; int rowSize = initialRowSize; int rowPos = 0; // later if (rowPos == rowSize) { rowSize *= 2; Array.Resize(ref row, rowSize); } row[rowPos++] = valueBuilder.ToString(); valueBuilder.Length = 0; // later if (rowPos < rowSize) { Array.Resize(ref row, rowPos); } outRow = row; initialRowSize = rowPos; 

Instead you should be using a List<string> for accumulating fields. It remembers how many items are stored, so you won't need rowPos. And it remembers its capacity, so you won't need rowSize. As well, clearing it doesn't change the capacity, so you won't need initialRowSize. And it already grows exponentially, as needed, so you can get rid of all the tests comparing rowPos to rowSize.

Much better:

//class variable private readonly List<string> workingRow = new List<string>(StandardInitialRowSize); // at beginning of row processing workingRow.Clear(); // resets Length to zero, retaining capacity and allocated array // adding value to row workingRow.Add(valueBuilder.ToString()); valueBuilder.Length = 0; // at end outRow = workingRow.ToArray(); 

That even gets rid of the need for goto SetValue, because now the value-setting code is so dirt simple it can safely be repeated.

I notice you already did this exact thing with valueBuilder, so why not with the string[]? List<string> is to string[] as StringBuilder is to string.

\$\endgroup\$
8
  • \$\begingroup\$I know someone's going to tell me that new List<string> is expensive and restarts the growth. That's why I suggested a method that doesn't create a new list for each row!\$\endgroup\$
    – Ben Voigt
    CommentedJul 3, 2014 at 16:15
  • \$\begingroup\$Lists are alot more expensive overall, not just initialization. Reusing the same object representing a row is not an option. An unreliable program could corrupt the collection causing the reader to malfunction. Besides when a utilizer retrieves a row, it may directly rely on it, changing the collection is unacceptable.\$\endgroup\$CommentedJul 3, 2014 at 17:23
  • 1
    \$\begingroup\$1. No they are not more expensive. They are very thin wrappers around array access that gets inlined. You may be thinking of costs of the IList interface, which I did not recommend using. 2. The list is not exposed to the caller anywhere. 3. If you aren't going to fully read the reviews, don't bother posting on CR. It is clear that you focused on "use List" and skipped 2/3rds of my answer.\$\endgroup\$
    – Ben Voigt
    CommentedJul 3, 2014 at 17:33
  • \$\begingroup\$Fyi, I have not skipped any part of your answer. I responded to your comment. And regarding the lists, I have benchmarked this, you obviously have not.\$\endgroup\$CommentedJul 3, 2014 at 17:41
  • \$\begingroup\$I used StringBuilder because of performance. Unlike char[], StringBuilder is capable of of calling the internal copy methods of string.\$\endgroup\$CommentedJul 3, 2014 at 17:48
5
\$\begingroup\$

GOTO is nice if used appropriately

That's a big IF. Throwing it into a method which is several hundred lines and already has a cyclomatic complexity in the double digits is not appropriate.

\$\endgroup\$
6
  • \$\begingroup\$Again, prejudice. My benchmark tells me otherwise.\$\endgroup\$CommentedJul 2, 2014 at 23:16
  • 7
    \$\begingroup\$@LeopoldAsperger: Your benchmark says nothing of the sort. Perhaps it shows that you got great performance from code containing GOTO. It does not prove that the same performance cannot be achieved from code without GOTO.\$\endgroup\$
    – Ben Voigt
    CommentedJul 2, 2014 at 23:45
  • 1
    \$\begingroup\$Performance has nothing to do with the goto statements.\$\endgroup\$CommentedJul 3, 2014 at 0:12
  • 1
    \$\begingroup\$This seems to be a critique of a comment and not the OP's code. (Seriously though, GOTOs are the devil. [99.999999999% of the time])\$\endgroup\$CommentedJul 3, 2014 at 3:43
  • \$\begingroup\$@LeopoldAsperger As far as I can see, you were the one who started talking about performance, seemingly implying that gotos make your code faster.\$\endgroup\$
    – svick
    CommentedJul 4, 2014 at 16:30
5
\$\begingroup\$

Bug fix

As I mentioned in the comments, outRow = new string[1] should be outRow = new string[0].

API

The only time you return false, is when you set outRow to null. I would suggest changing the function signature to

public string[] Read() 

Then client code would look like

string[] row; while ((row = csvReader.Read()) != null) 

How do you know that the proper encoding is UTF8? Let the client pass in their own StreamReader and use that.

You could use the more general TextReader in place of StreamReader, by checking for the end of the input using Read instead of EndOfStream.

StreamReader

I find it surprising that nowhere do you call this.reader.ReadLine(), as CSV is a line-based format.

Data structures

You're resizing your row array, which suggests a List<string> would be a better option.

Naming

The method Assert does not contain any assertions, so I would suggest renaming it.

Control flow

There are parts in the code that are of the form

if (...) { ... return; } else if (...) { } 

or

if (...) { ... break; } else if (...) { } 

In which case, you don't need the else.

Other

valueBuilder, reader, buffer, bufferSize, and delimiter can be made readonly.

StringBuilder.Clear is a convenience method for sb.Length = 0, and reads a bit nicer.

\$\endgroup\$
3
  • \$\begingroup\$new string[1] is not a "bug", it represents an implicit row. Having a row with no value doesn't make sense, therefore it has one empty value. Assert does cause assertion, but it doesn't return a boolean value. I've used else in those cases for readability, it will compile to the same IL. StringBuilder.Clear is only available in .NET 4.0. Everything else is too opinion-based.\$\endgroup\$CommentedJul 3, 2014 at 8:24
  • \$\begingroup\$@Magus: The array is the row. The number of elements are the columns/fields/values.\$\endgroup\$
    – Ben Voigt
    CommentedJul 3, 2014 at 15:37
  • \$\begingroup\$I do not intend to intialize an empty array. What I intend to do is initialize an array with 1 empty string value.\$\endgroup\$CommentedJul 3, 2014 at 15:38

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.