The Computer Language
Benchmarks Game

regex-redux C# .NET Core #4 program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/
 * 
 * Regex-Redux
 * by Josh Goldfoot
 *
*/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using System.Text.RegularExpressions;

/// <summary>
/// regex-redux benchmark driver: reads a FASTA file from standard input,
/// counts matches of each DNA variant pattern, and reports the sequence
/// length before and after a series of regex substitutions.
/// </summary>
class regexredux
{
    /// <summary>
    /// Program entry point. Reads all of standard input, strips FASTA
    /// description lines and new-lines, then prints one "pattern count" line
    /// per variant followed by the raw input length, the cleaned sequence
    /// length and the length after substitution.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // read FASTA sequence
        string input = Console.In.ReadToEnd();
        int initialLength = input.Length;

        // remove FASTA sequence descriptions and new-lines
        string sequence = Regex.Replace(input, ">.*\n|\n", "");
        int codeLength = sequence.Length;

        // The substitution pipeline is independent of the counting below,
        // so run it concurrently on the thread pool.
        Task<int> substitution = Task.Run(() => SubstitutedLength(sequence));

        // Search the WHOLE sequence once per pattern, in parallel across
        // patterns. Slicing the sequence into per-core pieces and searching
        // each piece separately would lose every match that straddles a
        // slice boundary and make the result depend on the core count.
        string[] variants = Variants.variantsCopy();
        int[] counts = new int[variants.Length];
        Parallel.For(0, variants.Length, i =>
        {
            // each iteration writes only its own slot, so no locking is needed
            counts[i] = CountMatches(sequence, variants[i]);
        });

        for (int i = 0; i < variants.Length; i++)
        {
            Console.WriteLine("{0} {1}", variants[i], counts[i]);
        }

        // Blocking on the task is fine here: a console Main has no
        // synchronization context and nothing else left to do.
        Console.WriteLine("\n{0}\n{1}\n{2}",
           initialLength, codeLength, substitution.Result);
    }

    /// <summary>
    /// Applies the benchmark's five regex substitutions in order and returns
    /// the length of the resulting string.
    /// </summary>
    /// <param name="sequence">The cleaned sequence.</param>
    /// <returns>Length of the sequence after all substitutions.</returns>
    private static int SubstitutedLength(string sequence)
    {
        // regex substitution; order matters because later patterns match
        // the "<n>" markers produced by earlier ones
        string newseq = Regex.Replace(sequence, "tHa[Nt]", "<4>");
        newseq = Regex.Replace(newseq, "aND|caN|Ha[DS]|WaS", "<3>");
        newseq = Regex.Replace(newseq, "a[NSt]|BY", "<2>");
        newseq = Regex.Replace(newseq, "<[^>]*>", "|");
        newseq = Regex.Replace(newseq, "\\|[^|][^|]*\\|", "-");
        return newseq.Length;
    }

    /// <summary>
    /// Counts the successive (non-overlapping) matches of
    /// <paramref name="pattern"/> in <paramref name="sequence"/>.
    /// </summary>
    /// <param name="sequence">The text to search.</param>
    /// <param name="pattern">The regular expression to look for.</param>
    /// <returns>The number of matches found.</returns>
    private static int CountMatches(string sequence, string pattern)
    {
        // regex match
        return Regex.Matches(sequence, pattern).Count;
    }
}

/// <summary>
/// Holds the nine DNA variant patterns counted by the regex-redux benchmark.
/// </summary>
public class Variants
{
    /// <summary>
    /// Builds a new array containing the nine variant regular expressions.
    /// A fresh array is allocated on every call, so callers may keep or
    /// modify their copy without affecting anyone else.
    /// </summary>
    /// <returns>The nine patterns, in reporting order.</returns>
    public static string[] variantsCopy()
    {
        // Each entry pairs an 8-mer with the alternative listed beside it;
        // entries after the first allow a character class at one position.
        string[] patterns =
        {
            "agggtaaa|tttaccct",
            "[cgt]gggtaaa|tttaccc[acg]",
            "a[act]ggtaaa|tttacc[agt]t",
            "ag[act]gtaaa|tttac[agt]ct",
            "agg[act]taaa|ttta[agt]cct",
            "aggg[acg]aaa|ttt[cgt]ccct",
            "agggt[cgt]aa|tt[acg]accct",
            "agggta[cgt]a|t[acg]taccct",
            "agggtaa[cgt]|[acg]ttaccct",
        };
        return patterns;
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
2.0.0-preview1-005977
"System.GC.Server": true


Thu, 08 Jun 2017 19:59:13 GMT

MAKE:
cp regexredux.csharpcore-4.csharpcore Program.cs
cp Include/csharpcore/tmp.csproj .
cp Include/csharpcore/runtimeconfig.template.json .
mkdir obj
cp Include/csharpcore/tmp.csproj.nuget.g.props ./obj
cp Include/csharpcore/tmp.csproj.nuget.g.targets ./obj
/usr/bin/dotnet build -c Release
Microsoft (R) Build Engine version 15.3.117.23532
Copyright (C) Microsoft Corporation. All rights reserved.

  tmp -> /home/dunham/benchmarksgame_quadcore/regexredux/tmp/bin/Release/netcoreapp2.0/tmp.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:03.70

4.22s to complete and log all make actions

COMMAND LINE:
/usr/bin/dotnet ./bin/Release/netcoreapp2.0/tmp.dll 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361