I have implemented a CSV reader, and I think it turned out pretty well. Since CSV is a loosely defined format to begin with, I decided to tolerate some malformed input, such as anything other than a delimiter following an enclosed (quoted) value.
If anyone can point out improvements to this class, I would be happy to hear them.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

namespace ConsoleApplication49
{
    /// <summary>
    /// Forward-only CSV reader that decodes a stream as UTF-8 (with byte-order-mark
    /// detection) and returns one row of values per call to <see cref="Read"/>.
    /// </summary>
    /// <remarks>
    /// Parsing rules implemented by this class:
    /// <list type="bullet">
    /// <item><description>A value may be enclosed in double quotes; inside an enclosed value a doubled quote stands for one literal quote.</description></item>
    /// <item><description>Rows are terminated by the two-character sequence "\r\n" only; a lone '\r' or '\n' is treated as ordinary data.</description></item>
    /// <item><description>Characters between a closing quote and the next delimiter are tolerated and appended to the value.</description></item>
    /// <item><description>Input that ends directly after "\r\n" produces one additional row holding a single empty value, as does input containing no characters at all.</description></item>
    /// </list>
    /// Disposing the reader also disposes the underlying stream.
    /// </remarks>
    public class CsvReader : IDisposable
    {
        private const char Sym_Escape = '"';
        private const int StandardInitialRowSize = 16;

        private readonly StreamReader reader;
        private readonly char delimiter;
        private readonly char[] buffer;
        private readonly int bufferSize;
        private readonly StringBuilder valueBuilder;

        // Number of valid characters in <buffer> and the index of the next one to consume.
        private int bufferBound;
        private int bufferPos;

        // Index in <buffer> of the first character of the current value that has not
        // yet been copied into <valueBuilder>.
        private int valuePos;

        // Set once the buffer is exhausted and the stream has no more characters.
        private bool endReached;

        // Result of the most recent buffer check: true when the buffer was exhausted
        // (and therefore refilled, or the end was detected) during that check.
        private bool boundReached;

        // True when the next call to Read must return the trailing single-value row.
        private bool returnImplicitRow;

        // True once Read has been called; used to recognise completely empty input.
        private bool readStarted;

        // Capacity used for the next row array; follows the size of the previous row.
        private int initialRowSize;

        /// <summary>
        /// Creates a reader over <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">Source stream; it does not have to be seekable.</param>
        /// <param name="delimiter">Value separator. Must not be the quote character or '\r'.</param>
        /// <param name="bufferSize">Size of the character buffer; at least 128.</param>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="delimiter"/> or <paramref name="bufferSize"/> is invalid.
        /// </exception>
        public CsvReader(Stream stream, char delimiter = ',', int bufferSize = 4096)
        {
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }
            if (delimiter == Sym_Escape || delimiter == '\r')
            {
                throw new ArgumentException("Invalid delimiter", "delimiter");
            }
            if (bufferSize < 128)
            {
                throw new ArgumentException("Invalid buffer size", "bufferSize");
            }

            this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize);
            this.delimiter = delimiter;
            this.buffer = new char[bufferSize];
            this.bufferSize = bufferSize;
            this.initialRowSize = StandardInitialRowSize;
            this.valueBuilder = new StringBuilder(128);

            // Whether the input is empty is decided lazily on the first Read call.
            // Querying Stream.Length here would throw NotSupportedException for
            // non-seekable streams (network, pipe, compression streams).
        }

        /// <summary>
        /// Reads the next row.
        /// </summary>
        /// <param name="outRow">
        /// Receives the values of the row, or null when no row is left.
        /// Values are never null; an empty value is the empty string.
        /// </param>
        /// <returns>true if a row was read; false once the input is exhausted.</returns>
        public bool Read(out string[] outRow)
        {
            bool firstCall = !readStarted;
            readStarted = true;

            FillBuffer();
            if (endReached)
            {
                if (firstCall)
                {
                    // No characters at all: report a single row with one empty value.
                    returnImplicitRow = true;
                }

                if (returnImplicitRow)
                {
                    returnImplicitRow = false;
                    // Use an empty string rather than a null element so that the
                    // implicit row looks like every other empty value.
                    outRow = new string[] { string.Empty };
                    return true;
                }

                outRow = null;
                return false;
            }

            string[] row = new string[initialRowSize];
            int rowSize = initialRowSize;
            int rowPos = 0;
            bool newlineReached = false;

            do
            {
                FillBuffer();
                if (endReached)
                {
                    // Input ended right after a delimiter: emit the trailing empty value.
                    goto SetValue;
                }

                char chr = buffer[bufferPos++];

                if (chr == Sym_Escape)
                {
                    // Enclosed value: consume up to the closing quote.
                    FillBuffer();
                    if (endReached)
                    {
                        goto SetValue;
                    }

                    valuePos = bufferPos;
                    chr = buffer[bufferPos++];

                    while (true)
                    {
                        FlushAndFillBuffer();

                        if (chr == Sym_Escape)
                        {
                            if (endReached)
                            {
                                // Closing quote was the last character of the input.
                                goto SetValue;
                            }

                            // Peek at the next character to tell an escaped quote
                            // from the closing quote.
                            chr = buffer[bufferPos];
                            if (chr == Sym_Escape)
                            {
                                if (boundReached)
                                {
                                    // Everything before the quote was already flushed.
                                    valueBuilder.Append(Sym_Escape);
                                }
                                else
                                {
                                    // Copy the pending text including the first quote.
                                    valueBuilder.Append(buffer, valuePos, bufferPos - valuePos);
                                }

                                bufferPos++;
                                valuePos = bufferPos;
                                FillBuffer();
                            }
                            else
                            {
                                if (!boundReached)
                                {
                                    valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                                    valuePos = bufferPos;
                                }

                                // Consume the peeked character; it is handled by the
                                // unquoted loop below.
                                bufferPos++;
                                break;
                            }
                        }
                        else if (boundReached)
                        {
                            // The flush excluded the current character; add it now.
                            valueBuilder.Append(chr);
                        }

                        if (endReached)
                        {
                            goto SetValue;
                        }

                        chr = buffer[bufferPos++];
                    }
                }

                // Unquoted text (or whatever follows a closing quote) up to the next
                // delimiter, row terminator, or end of input.
                while (true)
                {
                    FlushAndFillBuffer();

                    if (chr == delimiter)
                    {
                        if (!boundReached)
                        {
                            valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                            valuePos = bufferPos;
                        }

                        // A delimiter as the very last character still starts one more
                        // (empty) value, so keep the outer loop running.
                        endReached = false;
                        break;
                    }
                    else if (chr == '\r' && !endReached && buffer[bufferPos] == '\n')
                    {
                        if (!boundReached)
                        {
                            valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                        }

                        bufferPos++;
                        valuePos = bufferPos;
                        FillBuffer();
                        if (endReached)
                        {
                            // Input ends right after the row terminator.
                            returnImplicitRow = true;
                        }

                        newlineReached = true;
                        break;
                    }
                    else if (boundReached)
                    {
                        valueBuilder.Append(chr);
                    }

                    if (endReached)
                    {
                        break;
                    }

                    chr = buffer[bufferPos++];
                }

            SetValue:
                if (rowPos == rowSize)
                {
                    rowSize *= 2;
                    Array.Resize(ref row, rowSize);
                }

                row[rowPos++] = valueBuilder.ToString();
                valueBuilder.Length = 0;
            }
            while (!endReached && !newlineReached);

            if (rowPos < rowSize)
            {
                Array.Resize(ref row, rowPos);
            }

            outRow = row;
            // Size the next row array after this one.
            initialRowSize = rowPos;
            return true;
        }

        /// <summary>
        /// Releases the internal <see cref="StreamReader"/>, which also closes the
        /// stream passed to the constructor.
        /// </summary>
        public void Dispose()
        {
            reader.Dispose();
        }

        /// <summary>
        /// When the buffer is exhausted, refills it from the stream or records that the
        /// end of the input was reached. Updates <c>boundReached</c> accordingly.
        /// </summary>
        private void FillBuffer()
        {
            if (bufferPos == bufferBound)
            {
                if (reader.EndOfStream)
                {
                    endReached = true;
                }
                else
                {
                    bufferBound = reader.Read(buffer, 0, bufferSize);
                    bufferPos = 0;
                    valuePos = 0;
                }

                boundReached = true;
            }
            else
            {
                boundReached = false;
            }
        }

        /// <summary>
        /// Same as <see cref="FillBuffer"/>, but first copies the pending characters of
        /// the current value (all except the most recently consumed one) into the value
        /// builder, because a refill overwrites them.
        /// </summary>
        private void FlushAndFillBuffer()
        {
            if (bufferPos == bufferBound)
            {
                valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
            }

            FillBuffer();
        }
    }
}
Here is some code for testing it:
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

namespace ConsoleApplication49
{
    /// <summary>
    /// Console harness that prints every row of a CSV file with the values
    /// separated by '|' and control characters removed.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Dumps the CSV file named by the first command-line argument, or a fixed
        /// default path when no argument is given, then waits for Enter.
        /// </summary>
        static void Main(string[] args)
        {
            string path = (args != null && args.Length > 0)
                ? args[0]
                : @"D:\Users\Administrator\desktop\test.csv";

            // The file is only read, so open it read-only and let other readers in.
            // The using block releases the handle even if parsing throws.
            using (FileStream stream = OpenFile(path, FileAccess.Read, FileShare.Read))
            {
                if (stream == null)
                {
                    // Report the failure here instead of passing null on to CsvReader,
                    // which would fail with an unrelated ArgumentNullException.
                    Console.Error.WriteLine("Could not open '" + path + "'.");
                }
                else
                {
                    CsvReader csvReader = new CsvReader(stream);
                    string[] row;
                    while (csvReader.Read(out row))
                    {
                        for (int i = 0; i < row.Length; i++)
                        {
                            if (i > 0)
                            {
                                Console.Write('|');
                            }
                            Console.Write(Filter(row[i], Char.IsControl));
                        }
                        Console.WriteLine();
                    }
                }
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Returns <paramref name="str"/> without the characters for which
        /// <paramref name="invalidator"/> returns true.
        /// </summary>
        /// <param name="str">Text to filter; null is treated as empty.</param>
        /// <param name="invalidator">Predicate selecting the characters to drop.</param>
        /// <returns>The filtered text; never null.</returns>
        public static string Filter(string str, Func<char, bool> invalidator)
        {
            if (str == null)
            {
                // A row may carry a null value; treat it as empty rather than crash.
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder(str.Length);
            foreach (char c in str)
            {
                if (!invalidator(c))
                {
                    sb.Append(c);
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Opens an existing file for reading and writing with exclusive access.
        /// </summary>
        /// <returns>The opened stream, or null if the file could not be opened.</returns>
        public static FileStream OpenFile(string filePath)
        {
            return OpenFile(filePath, FileAccess.ReadWrite, FileShare.None);
        }

        /// <summary>
        /// Opens an existing file with the given access and sharing mode.
        /// </summary>
        /// <returns>
        /// The opened stream, or null if the file is missing, locked, or not
        /// accessible. The caller owns the stream and must dispose it.
        /// </returns>
        public static FileStream OpenFile(string filePath, FileAccess fileAccess, FileShare fileShare)
        {
            try
            {
                return File.Open(filePath, FileMode.Open, fileAccess, fileShare);
            }
            catch (IOException)
            {
                // Missing file or directory, sharing violation, path too long, ...
                return null;
            }
            catch (UnauthorizedAccessException)
            {
                // Insufficient permissions, or the path is a directory / read-only file.
                return null;
            }
            // Anything else (OutOfMemoryException, invalid arguments, ...) is not an
            // "expected" failure to open a file and is deliberately left to propagate.
        }
    }
}
goto
, and the fact that you haven't disposed your stream. But they seem not to have noticed yet that you also have `catch(Exception) { }`
, which is arguably worse. Let's catch and continue when we get an `OutOfMemoryException`
! Those aren't important! Yeah, nice plan!
`goto`
are also controversial. This question feels like it is trolling. Please remove the hyperbole, or give examples of where your code is more complete than other libraries, more performant, etc. I will be happy to go through and edit this myself in an hour or so.