Best way to detect similar email addresses?
- by Chris
I have a list of ~20,000 email addresses, some of which I know to be fraudulent attempts to get around a "1 per e-mail" limit. ([email protected], [email protected], [email protected], etc...). I want to find similar email addresses for evaluation. Currently I'm using a levenshtein algorithm to check each e-mail against the others in the list and report any with an edit distance of less than 2. However, this is painstakingly slow. Is there a more efficient approach?
The test code I'm using now is:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Threading;
namespace LevenshteinAnalyzer
{
    class Program
    {
        const string INPUT_FILE = @"C:\Input.txt";
        const string OUTPUT_FILE = @"C:\Output.txt";
        static void Main(string[] args)
        {
            var inputWords = File.ReadAllLines(INPUT_FILE);
            var outputWords = new SortedSet<string>();
            for (var i = 0; i < inputWords.Length; i++)
            {
                if (i % 100 == 0) 
                    Console.WriteLine("Processing record #" + i);
                var word1 = inputWords[i].ToLower();
                for (var n = i + 1; n < inputWords.Length; n++)
                {
                    if (i == n) continue;
                    var word2 = inputWords[n].ToLower();
                    if (word1 == word2) continue;
                    if (outputWords.Contains(word1)) continue;
                    if (outputWords.Contains(word2)) continue;
                    var distance = LevenshteinAlgorithm.Compute(word1, word2);
                    if (distance <= 2)
                    {
                        outputWords.Add(word1);
                        outputWords.Add(word2);
                    }
                }
            }
            File.WriteAllLines(OUTPUT_FILE, outputWords.ToArray());
            Console.WriteLine("Found {0} words", outputWords.Count);
        }
    }
}