.NET

C# text file deduping based on trimmed lines

A while ago, I needed to analyze a bunch of files based on the unique trimmed lines in them.

I based my code on the C# Tee filter and the StackOverflow example of C# deduping based on split.

It is a bit more extensive than strictly needed, as it has a few more commandline arguments that come in handy when processing files on the console:

DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output
Lines are terminated by CRLF sequences
C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),
DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]
   Example:
 DeDupe --trim file0.txt file1.txt
   Dedupes the appended content of file0.txt and file1.txt into standard output
-t | --trim                  Will trim the lines before considering duplicates
-f | --flush                 Flushes files every CRLF
                               (setting is per tee instance)
-i | --ignore                Ignore cancel Ctrl+C keypress: see UnixUtils tee
-l | --literal               Stop recognizing flags, force all following filenames literally
-? | --h  | /? | --help      Displays this message and immediately quits
Duplicate filenames are quietly ignored.
If no input filenames are specified, then standard input is used
Press Ctrl+Z (End of File character) then Enter to abort.

Here is the source code:

using System;
using System.IO;
using System.Collections.Generic;
namespace DeDupe
{
    class Program
    {
        static void help()
        {
            Console.Error.WriteLine("DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output");
            Console.Error.WriteLine("Lines are terminated by CRLF sequences");
            Console.Error.WriteLine("C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]");
            Console.Error.WriteLine("   Example:");
            Console.Error.WriteLine(" DeDupe --trim file0.txt file1.txt");
            Console.Error.WriteLine("   Dedupes the appended content of file0.txt and file1.txt into standard output");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("-t | --trim                  Will trim the lines before considering duplicates");
            Console.Error.WriteLine("-f | --flush                 Flushes files every CRLF");
            Console.Error.WriteLine("                               (setting is per tee instance)");
            Console.Error.WriteLine("-i | --ignore                Ignore cancel Ctrl+C keypress: see UnixUtils tee");
            Console.Error.WriteLine("-l | --literal               Stop recognizing flags, force all following filenames literally");
            Console.Error.WriteLine("-? | --h  | /? | --help      Displays this message and immediately quits");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("Duplicate filenames are quietly ignored.");
            Console.Error.WriteLine("If no input filenames are specified, then standard input is used");
            Console.Error.WriteLine("Press Ctrl+Z (End of File character) then Enter to abort.");
        }
        static void OnCancelKeyPressed(Object sender, ConsoleCancelEventArgs args)
        {
            // Set the Cancel property to true to prevent the process from
            // terminating.
            args.Cancel = true;
        }
        static List<String> filenames = new List<String>();
        static void addFilename(string value)
        {
            if (-1 == filenames.IndexOf(value))
                filenames.Add(value);
        }
        static bool trimLines = false;
        static bool flushFiles = false;
        static bool stopInterpretingFlags = false;
        static bool ignoreCtrlC = false;
        static int Main(string[] args)
        {
            try
            {
                foreach (string arg in args)
                {
                    //Since we're already parsing.... might as well check for flags:
                    if (stopInterpretingFlags)  //Stop interpreting flags, assume is filename
                    {
                        addFilename(arg);
                    }
                    else if (arg.Equals("/?") || arg.Equals("-?") || arg.Equals("-h") || arg.Equals("--help"))
                    {
                        help();
                        return 1; //Quit immediately
                    }
                    else if (arg.Equals("-t") || arg.Equals("--trim"))
                    {
                        trimLines = true;
                    }
                    else if (arg.Equals("-f") || arg.Equals("--flush"))
                    {
                        flushFiles = true;
                    }
                    else if (arg.Equals("-i") || arg.Equals("--ignore"))
                    {
                        ignoreCtrlC = true;
                    }
                    else if (arg.Equals("-l") || arg.Equals("--literal"))
                    {
                        stopInterpretingFlags = true;
                    }
                    else
                    { //If it isn't any of the above, it's a filename
                        addFilename(arg);
                    }
                    //Add more flags as necessary, just remember to SKIP adding them to the file processing stream!
                }
                if (ignoreCtrlC) //Implement the Ctrl+C fix selectively (mirror UnixUtils tee behavior)
                    Console.CancelKeyPress += OnCancelKeyPressed;
                HashSet<string> keys = new HashSet<string>();
                Int64 index = 0;
                using (StreamWriter writer = new StreamWriter(Console.OpenStandardOutput()))
                {
                    if (filenames.Count == 0)
                        using (StreamReader reader = new StreamReader(Console.OpenStandardInput()))
                        {
                            processInputFileReader(keys, writer, reader, ref index);
                        }
                    else
                        foreach (String filename in filenames)
                        {
                            using (StreamReader reader = new StreamReader(filename))
                            {
                                processInputFileReader(keys, writer, reader, ref index);
                            }
                        }
                    writer.Flush();
                }
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine(String.Concat("DeDupe: ", ex.Message));  // Send error messages to stderr
            }
            return 0;
        }
        private static void processInputFileReader(HashSet<string> keys, StreamWriter writer, StreamReader reader, ref Int64 index)
        {
            string line = readLine(reader);
            while (!string.IsNullOrEmpty(line))
            {
                string candidate = line;
                if (keys.Add(candidate))
                {
                    writer.WriteLine(line);
                    index += line.Length + Environment.NewLine.Length;
                    if (flushFiles)
                        writer.Flush();
                }
                line = readLine(reader);
            }
        }
        private static string readLine(StreamReader reader)
        {
            string line = reader.ReadLine();
            if (null != line)
                if (trimLines)
                    line = line.Trim();
            return line;
        }
    }
}

Reference: C# text file deduping based on trimmed lines (via: Stack Overflow) from our NCG partner Jeroen Pluimers at the The Wiert Corner blog.

Related Articles

Subscribe
Notify of
guest

This site uses Akismet to reduce spam. Learn how your comment data is processed.

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
Back to top button