The Computer Language
Benchmarks Game

regex-dna C# Mono LLVM #8 program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/
 *
 * contributed by Jimmy Tang
 * modified by Josh Goldfoot (2016)
 * modified by Jan de Vaan (compile regex, small stuff)
 */
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

class regexdna
{
    /// <summary>
    /// Reads all of standard input, concatenating every line that is not a
    /// '>' header line into one string with the line breaks removed.
    /// </summary>
    /// <param name="seqLength">Receives the length of the returned string.</param>
    /// <param name="inputLength">
    /// Receives <paramref name="seqLength"/> plus the characters that were
    /// dropped: the full text of each header line and one character per line
    /// read (this assumes single-character line terminators).
    /// </param>
    /// <returns>The concatenated sequence text.</returns>
    static string readStdIn(out int seqLength, out int inputLength)
    {
        StringBuilder sb = new StringBuilder();
        int commentLength = 0;
        String line;

        while ((line = Console.ReadLine()) != null)
        {
            // Check the length first: a blank line is an empty string, and
            // indexing line[0] on it would throw IndexOutOfRangeException.
            if (line.Length > 0 && line[0] == '>')
                commentLength += line.Length + 1;
            else
            {
                // A blank line appends nothing but still accounts for its
                // line terminator, so inputLength keeps matching the input.
                sb.Append(line);
                commentLength += 1;
            }
        }
        seqLength = sb.Length;
        inputLength = seqLength + commentLength;
        return sb.ToString();
    }

    /// <summary>
    /// Reads the sequence from standard input, prints the match count of each
    /// variant pattern, then prints three lengths: the original input length,
    /// the stripped sequence length, and the length the sequence would have
    /// if every ambiguity code were replaced by its alternation string.
    /// </summary>
    static void Main()
    {

        string[] variants = {
           "agggtaaa|tttaccct"
          ,"[cgt]gggtaaa|tttaccc[acg]"
          ,"a[act]ggtaaa|tttacc[agt]t"
          ,"ag[act]gtaaa|tttac[agt]ct"
          ,"agg[act]taaa|ttta[agt]cct"
          ,"aggg[acg]aaa|ttt[cgt]ccct"
          ,"agggt[cgt]aa|tt[acg]accct"
          ,"agggta[cgt]a|t[acg]taccct"
          ,"agggtaa[cgt]|[acg]ttaccct"
        };

        int seqLength, initialLength;
        var sequence = readStdIn(out seqLength, out initialLength);

        // The substituted length is computed on a separate task so it can run
        // while the variant counts below are being produced.
        var newSequenceLength = Task.Run(() =>
            {
                // Indexed by character code. Each entry is how much the
                // sequence grows when that one character is replaced by the
                // given alternation (replacement length minus the one
                // character removed). 'Z' (90) slots cover indices up to 'Y'.
                var table = new int['Z'];
                table['D'] = "(a|g|t)".Length - 1;
                table['H'] = "(a|c|t)".Length - 1;
                table['K'] = "(g|t)".Length - 1;
                table['M'] = "(a|c)".Length - 1;
                table['N'] = "(a|c|g|t)".Length - 1;
                table['R'] = "(a|g)".Length - 1;
                table['S'] = "(c|g)".Length - 1;
                table['V'] = "(a|c|g)".Length - 1;
                table['W'] = "(a|t)".Length - 1;
                table['Y'] = "(c|t)".Length - 1;
                table['B'] = "(c|g|t)".Length - 1;

                var r = new Regex("[WYKMSRBDVHN]", RegexOptions.Compiled);

                int length = sequence.Length;

                // No replaced string is built; only the resulting length is
                // accumulated, one match at a time.
                for (Match m = r.Match(sequence); m.Success; m = m.NextMatch())
                {
                    length += table[m.Value[0]];
                }

                return length;
            });

        // Each iteration writes only its own slot, so the results stay in
        // variant order regardless of which iteration finishes first.
        var output = new string[variants.Length];
        Parallel.For(0, variants.Length, i =>
        {
            Regex r = new Regex(variants[i], RegexOptions.Compiled);
            output[i] = r.ToString() + " " + r.Matches(sequence).Count;
        });

        foreach (var s in output)
            Console.WriteLine(s);

        // Reading Result blocks until the length task has finished.
        Console.WriteLine("\n{0}\n{1}\n{2}", initialLength, seqLength, newSequenceLength.Result);
    }
}
    

notes, command-line, and program output

NOTES:
32-bit Ubuntu one core
Mono JIT compiler version 4.5.1 (master/3e844dd Fri May  6 19:24:07 PDT 2016)
	LLVM:          yes(3.6.0svn-mono-master/9f79399)
	GC:            sgen



Thu, 06 Oct 2016 20:10:08 GMT

MAKE:
mv regexdna.csharpllvm-8.csharpllvm regexdna.csharpllvm-8.cs
/usr/local/bin/mcs  -optimize+ -platform:x86 -out:regexdna.csharpllvm-8.csharpllvm_run regexdna.csharpllvm-8.cs
rm regexdna.csharpllvm-8.cs
3.64s to complete and log all make actions

COMMAND LINE:
/usr/local/bin/mono --llvm --gc=sgen regexdna.csharpllvm-8.csharpllvm_run 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214