.NET
C# text file deduping based on trimmed lines
A while ago, I needed to analyze a bunch of files based on the unique trimmed lines in them.
I based my code on the C# Tee filter and the StackOverflow example of C# deduping based on split.
It is a bit more extensive than strictly needed, as it has a few more command-line arguments that come in handy when processing files on the console:
DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output
Lines are terminated by CRLF sequences
C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),
DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]
Example:
DeDupe --trim file0.txt file1.txt
Dedupes the appended content of file0.txt and file1.txt into standard output
-t | --trim Will trim the lines before considering duplicates
-f | --flush Flushes files every CRLF
(setting is per tee instance)
-i | --ignore Ignore cancel Ctrl+C keypress: see UnixUtils tee
-l | --literal Stop recognizing flags, force all following filenames literally
-? | --h | /? | --help Displays this message and immediately quits
Duplicate filenames are quietly ignored.
If no input filenames are specified, then standard input is used
Press Ctrl+Z (End of File character) then Enter to abort.
Here is the source code:
using System;
using System.IO;
using System.Collections.Generic;
namespace DeDupe
{
    /// <summary>
    /// Console filter that copies the lines of its input (the named files, in
    /// order, or standard input when no file is named) to standard output,
    /// keeping only the first occurrence of every distinct line.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Writes the usage text to standard error.
        /// </summary>
        static void help()
        {
            Console.Error.WriteLine("DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output");
            Console.Error.WriteLine("Lines are terminated by CRLF sequences");
            Console.Error.WriteLine("C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]");
            Console.Error.WriteLine(" Example:");
            Console.Error.WriteLine(" DeDupe --trim file0.txt file1.txt");
            Console.Error.WriteLine(" Dedupes the appended content of file0.txt and file1.txt into standard output");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("-t | --trim Will trim the lines before considering duplicates");
            Console.Error.WriteLine("-f | --flush Flushes files every CRLF");
            Console.Error.WriteLine(" (setting is per tee instance)");
            Console.Error.WriteLine("-i | --ignore Ignore cancel Ctrl+C keypress: see UnixUtils tee");
            Console.Error.WriteLine("-l | --literal Stop recognizing flags, force all following filenames literally");
            Console.Error.WriteLine("-? | --h | /? | --help Displays this message and immediately quits");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("Duplicate filenames are quietly ignored.");
            Console.Error.WriteLine("If no input filenames are specified, then standard input is used");
            Console.Error.WriteLine("Press Ctrl+Z (End of File character) then Enter to abort.");
        }

        /// <summary>
        /// Ctrl+C handler installed when -i / --ignore is given: marks the
        /// cancel event as handled so the process is not terminated.
        /// </summary>
        static void OnCancelKeyPressed(Object sender, ConsoleCancelEventArgs args)
        {
            args.Cancel = true;
        }

        // Input files in the order they were first named on the command line.
        static List<String> filenames = new List<String>();

        /// <summary>
        /// Registers an input file; a name that was already registered
        /// (exact, case-sensitive match) is ignored.
        /// </summary>
        /// <param name="value">File name exactly as given on the command line.</param>
        static void addFilename(string value)
        {
            if (!filenames.Contains(value))
            {
                filenames.Add(value);
            }
        }

        // Option state filled in by Main while parsing the command line.
        static bool trimLines = false;             // -t | --trim
        static bool flushFiles = false;            // -f | --flush
        static bool stopInterpretingFlags = false; // -l | --literal
        static bool ignoreCtrlC = false;           // -i | --ignore

        /// <summary>
        /// Tells whether an argument asks for the usage text. The usage text
        /// advertises "--h" while the parser historically only knew "-h";
        /// both spellings are accepted so the documented flag works.
        /// </summary>
        private static bool isHelpFlag(string arg)
        {
            return arg.Equals("/?") || arg.Equals("-?") || arg.Equals("-h")
                || arg.Equals("--h") || arg.Equals("--help");
        }

        /// <summary>
        /// Parses the command line, then dedupes the named files (or standard
        /// input when none are named) into standard output.
        /// </summary>
        /// <param name="args">Flags and file names; see <c>help</c> for the list.</param>
        /// <returns>
        /// 0 on success; 1 when the usage text was requested or when
        /// processing failed (the failure message is written to standard error).
        /// </returns>
        static int Main(string[] args)
        {
            try
            {
                foreach (string arg in args)
                {
                    if (stopInterpretingFlags)
                    {
                        // After -l / --literal everything is a file name, even "-t".
                        addFilename(arg);
                    }
                    else if (isHelpFlag(arg))
                    {
                        help();
                        return 1; // Quit immediately
                    }
                    else if (arg.Equals("-t") || arg.Equals("--trim"))
                    {
                        trimLines = true;
                    }
                    else if (arg.Equals("-f") || arg.Equals("--flush"))
                    {
                        flushFiles = true;
                    }
                    else if (arg.Equals("-i") || arg.Equals("--ignore"))
                    {
                        ignoreCtrlC = true;
                    }
                    else if (arg.Equals("-l") || arg.Equals("--literal"))
                    {
                        stopInterpretingFlags = true;
                    }
                    else
                    {
                        // Not a recognized flag, so it is a file name.
                        addFilename(arg);
                    }
                }

                if (ignoreCtrlC)
                {
                    // Only swallow Ctrl+C when asked to (mirrors UnixUtils tee).
                    Console.CancelKeyPress += OnCancelKeyPressed;
                }

                // One key set shared by all inputs: a line seen in an earlier
                // file is also a duplicate in a later one.
                HashSet<string> keys = new HashSet<string>();
                Int64 index = 0;
                using (StreamWriter writer = new StreamWriter(Console.OpenStandardOutput()))
                {
                    if (filenames.Count == 0)
                    {
                        using (StreamReader reader = new StreamReader(Console.OpenStandardInput()))
                        {
                            processInputFileReader(keys, writer, reader, ref index);
                        }
                    }
                    else
                    {
                        foreach (String filename in filenames)
                        {
                            using (StreamReader reader = new StreamReader(filename))
                            {
                                processInputFileReader(keys, writer, reader, ref index);
                            }
                        }
                    }
                    writer.Flush();
                }
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine(String.Concat("DeDupe: ", ex.Message)); // Send error messages to stderr
                // Report the failure to the caller instead of claiming success.
                return 1;
            }
            return 0;
        }

        /// <summary>
        /// Copies every not-yet-seen line from <paramref name="reader"/> to
        /// <paramref name="writer"/>. Reading continues until the end of the
        /// stream; an empty line is an ordinary line and is deduplicated like
        /// any other instead of ending the input.
        /// </summary>
        /// <param name="keys">Lines already written; newly written lines are added.</param>
        /// <param name="writer">Destination for the unique lines.</param>
        /// <param name="reader">Source of the lines.</param>
        /// <param name="index">
        /// Running total of characters written, counting one
        /// <c>Environment.NewLine</c> per line.
        /// </param>
        private static void processInputFileReader(HashSet<string> keys, StreamWriter writer, StreamReader reader, ref Int64 index)
        {
            string line = readLine(reader);
            // Only null marks the end of the stream; testing for "null or empty"
            // would stop at the first blank line and drop the rest of the input.
            while (line != null)
            {
                if (keys.Add(line))
                {
                    writer.WriteLine(line);
                    index += line.Length + Environment.NewLine.Length;
                    if (flushFiles)
                    {
                        writer.Flush();
                    }
                }
                line = readLine(reader);
            }
        }

        /// <summary>
        /// Reads the next line, trimmed when -t / --trim is active. Because the
        /// trimmed text is what gets returned, it is also what is written out.
        /// </summary>
        /// <returns>The next line, or null at the end of the stream.</returns>
        private static string readLine(StreamReader reader)
        {
            string line = reader.ReadLine();
            if (line != null && trimLines)
            {
                line = line.Trim();
            }
            return line;
        }
    }
}
Reference: C# text file deduping based on trimmed lines (via: Stack Overflow) from our NCG partner Jeroen Pluimers at The Wiert Corner blog.