I am trying to split a large file (comma delimited, each field wrapped in double quotes) into many smaller files, based on the key, which is the first field in each record; there is usually more than one record with the same key.
The large file can range from 1 GB to 2 GB, and the number of files generated can range from 10,000 to 30,000, each in a subfolder named after its key.
In C# I'm reading the file with StreamReader.ReadLine(), concatenating lines until I reach a different key (which signals the end of the data for the previous key), then calling a function to write that block out asynchronously. I call Windows sort first to make the keys contiguous (so I only have to open each output file once), but the whole operation still takes about 20 minutes. Is there any way to speed this up?
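The pre-sort is just a shell-out to the Windows sort utility before the parsing pass, roughly like this (a minimal sketch with placeholder paths, not the exact call in my code):

// Sketch: shell out to Windows sort so records with the same key end up on
// consecutive lines (the key is the first field, so plain line-by-line
// sorting is enough). The paths below are placeholders.
string unsortedPath = @"C:\feeds\raw\input.csv";
string sortedPath = @"C:\feeds\raw\input.sorted.csv";

var sortInfo = new System.Diagnostics.ProcessStartInfo
{
    FileName = "sort.exe",
    Arguments = "\"" + unsortedPath + "\" /O \"" + sortedPath + "\"",
    UseShellExecute = false,
    CreateNoWindow = true
};
using (var sortProcess = System.Diagnostics.Process.Start(sortInfo))
{
    sortProcess.WaitForExit();
}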
sfd = new SaveFileDataDelegate(this.SaveFileData);
private void CSVParse(string filename, string unzippedFilePath, string feedname)
{
    StreamReader filestream = null;
    FileStream readerStream = null;
    try
    {
        readerStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.None, 120000, FileOptions.SequentialScan);
        filestream = new StreamReader(readerStream, Encoding.UTF8, false, 120000);

        string tempstring = "";
        string buffer = "";
        string lastlotkey = "";
        IAsyncResult result = null;

        activityLog.Log("Parsing File: " + filename);

        while (((tempstring = filestream.ReadLine()) != null) || buffer != "")
        {
            if (tempstring == null)
            {
                tempstring = "";
            }

            string lotkey = tempstring.Replace("\"", "").Split(',').First();

            // a non-empty line with no quotes or commas: treat as end of data
            if (lotkey == tempstring && tempstring != "")
            {
                break;
            }
            // skip the header row
            if (lotkey == "DealerID")
            {
                continue;
            }
            // first data row: initialize the current key
            if (lastlotkey == "")
            {
                lastlotkey = lotkey;
            }
            // key changed: hand the previous key's block to the async writer
            if (lotkey != lastlotkey && buffer.Length > 0)
            {
                result = sfd.BeginInvoke(
                    outputDirectory + @"\" + feedname + @"\" + lastlotkey + @"\" + (filename.Split('\\').Last()).Split('.').First() + ".txt",
                    buffer,
                    outputDirectory + @"\" + feedname + @"\" + lastlotkey,
                    null, null);
                lastlotkey = lotkey;
                buffer = "";
                if (tempstring == "")
                {
                    continue;
                }
            }
            if (buffer.Length > 0)
            {
                buffer = buffer + "\r\n";
            }
            buffer = buffer + tempstring;
        }

        filestream.Close();
        readerStream.Close();

        // wait for the last asynchronous write to finish
        if (result != null)
        {
            result.AsyncWaitHandle.WaitOne(-1);
        }
        return;
    }
    catch (Exception e)
    {
        activityLog.Log("Error Occurred: " + e.ToString());
        if (filestream != null)
        {
            filestream.Close();
        }
        hadError = true;
        return;
    }
}
private void SaveFileData(string file, string buffer, string directory)
{
    // create file from last lot key with data from parsing, write, close, update lastlotkey
    Directory.CreateDirectory(directory);
    FileStream fs = null;
    StreamWriter temp = null;
    try
    {
        if (!File.Exists(file))
        {
            fs = new FileStream(file, FileMode.OpenOrCreate, FileAccess.Write, FileShare.None, 120000);
        }
        else
        {
            fs = new FileStream(file, FileMode.Truncate, FileAccess.Write, FileShare.None, 120000);
        }
        temp = new StreamWriter(fs, Encoding.UTF8, 120000);
        temp.AutoFlush = false;
        temp.WriteLine(headerLine);
        temp.Write(buffer);
        temp.Flush();
        temp.Close();
        fs.Close();
    }
    catch (Exception e)
    {
        activityLog.Log("Error Occurred: " + e.ToString());
        if (fs != null)
        {
            fs.Close();
        }
        if (temp != null)
        {
            temp.Close();
        }
        hadError = true;
        return;
    }
}
EDIT
I crawled Stack Overflow and the deepest bowels of the internet, and after profiling line by line I found that the string concatenation was actually the heavy lifting of the parsing routine (after the file copy and the Windows sort). Replacing it with a StringBuilder made a tremendous improvement: total processing time dropped from 20 minutes (copy + sort + parse) to 5 minutes of copy + sort plus 2 minutes of parsing, 7 minutes in total. A speed improvement of 130%.
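For anyone who hits the same wall, the change amounts to swapping the string buffer in the loop for a StringBuilder, along these lines (a sketch against the loop above, not the full routine; outputFile and outputDir stand in for the concatenated paths built in the original code):

// Sketch only: the buffering from CSVParse with the string replaced by a
// StringBuilder (System.Text). outputFile/outputDir are shorthand for the
// path strings built in the original code.
StringBuilder buffer = new StringBuilder();

// ... inside the while (ReadLine) loop:
if (lotkey != lastlotkey && buffer.Length > 0)
{
    result = sfd.BeginInvoke(outputFile, buffer.ToString(), outputDir, null, null);
    lastlotkey = lotkey;
    buffer.Length = 0;          // reset in place (or buffer.Clear() on .NET 4+)
    if (tempstring == "")
    {
        continue;
    }
}
if (buffer.Length > 0)
{
    buffer.Append("\r\n");      // Append works in place, no copy of the whole buffer
}
buffer.Append(tempstring);

The difference is that Append extends the existing buffer, whereas buffer = buffer + tempstring copies the entire accumulated string on every line, which gets quadratic once a key's block grows to many kilobytes.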