The Computer Language
Benchmarks Game

regex-redux F# .NET Core #3 program

source code

// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// modified version of Valentin Kraevskiy's regex-dna program
// contributed by Vassil Keremidchiev
// converted from the regex-dna program

open System.Text.RegularExpressions
open System.Threading

/// Compiles the pattern `s` into a reusable .NET regular expression.
let regex s = new Regex (s, RegexOptions.Compiled)

/// Everything available on standard input, read in one go.
let input = stdin.ReadToEnd ()

/// `input` with every span from a '>' to the end of its line (newline included) removed.
let withoutComments =
    let headerLine = regex ">.*\n"
    headerLine.Replace (input, "")

/// `withoutComments` with all remaining line breaks removed.
let text =
    let lineBreak = regex "\n"
    lineBreak.Replace (withoutComments, "")

/// Number of characters in `text`.
let textSize = String.length text

/// Block length used when splitting `text` for the pattern counts: half of `textSize`.
let blockSize = textSize / 2

/// Splits a string into consecutive blocks of at most `blockSize` characters,
/// where each block starts `overlapSize` characters before the end of the
/// previous one. Returns the blocks in order; an empty string yields no blocks.
///
/// If `blockSize - overlapSize` is not positive, splitting cannot advance, so
/// the remaining string is returned as a single block instead of throwing
/// from `Substring` or recursing forever.
let onblocks overlapSize blockSize =
    // Distance between the starts of two successive blocks.
    let step = blockSize - overlapSize
    // Blocks are accumulated in reverse and flipped once at the end.
    let rec split acc (s: string) =
        if s.Length = 0 then List.rev acc
        elif s.Length <= blockSize || step <= 0 then List.rev (s :: acc)
        else split (s.Substring (0, blockSize) :: acc) (s.Substring step)
    split []

/// Splitter producing non-overlapping blocks sized so that `textSize`
/// characters are spread across `System.Environment.ProcessorCount` blocks.
let onProcBlocks =
    let perProcessor = textSize / System.Environment.ProcessorCount + 1
    onblocks 0 perProcessor

/// Regular-expression patterns whose occurrences are counted in the input.
/// Each pattern is two alternatives separated by '|'.
let DNAcodes =
    [
        "agggtaaa|tttaccct"
        "[cgt]gggtaaa|tttaccc[acg]"
        "a[act]ggtaaa|tttacc[agt]t"
        "ag[act]gtaaa|tttac[agt]ct"
        "agg[act]taaa|ttta[agt]cct"
        "aggg[acg]aaa|ttt[cgt]ccct"
        "agggt[cgt]aa|tt[acg]accct"
        "agggta[cgt]a|t[acg]taccct"
        "agggtaa[cgt]|[acg]ttaccct"
    ]

// Calculate all chunks in parallel.
// Result: an array of (pattern, match count) pairs, one pair per pattern per block of `text`.
let chunksCounts = 
    // Number of characters one match of the pattern occupies. Both alternatives
    // of every pattern have the same width, so the first alternative is measured
    // with each [...] character class collapsed to a single character.
    let matchWidth (matchStr:string) =
        Regex.Replace(matchStr.Split('|').[0], @"\[[^\]]*\]", "x").Length

    let chunkedMatch (matchStr:string) = 
        // Neighbouring blocks share exactly (match width - 1) characters: enough to
        // catch a match straddling the boundary, too few to contain a whole match
        // twice. Using the pattern's text length here made the shared region large
        // enough to hold a complete match, which was then counted in both blocks.
        text |> onblocks (matchWidth matchStr - 1) blockSize 
             |> List.map (fun t -> async { return matchStr, ((regex matchStr).Matches t).Count })

    DNAcodes |> List.collect chunkedMatch |> Async.Parallel |> Async.RunSynchronously 

// Print one line per DNA code: the pattern followed by the total of its
// per-block counts collected in chunksCounts.
for key in DNAcodes do
    let total =
        chunksCounts
        |> Array.sumBy (fun (pattern, cnt) -> if pattern = key then cnt else 0)
    printfn "%s %i" key total
  
/// Applies the substitution rules below to `text`, one after another in the
/// listed order, and returns the length of the resulting string.
let lengthAfterReplace text =
    let rules =
        [ "tHa[Nt]", "<4>"
          "aND|caN|Ha[DS]|WaS", "<3>"
          "a[NSt]|BY", "<2>"
          // Collapses each <...> marker produced by the rules above into a single '|'.
          // The angle brackets must be part of the pattern: a bare "[^>]*" matches
          // arbitrary runs of text (and empty strings) and wrecks the final length.
          "<[^>]*>", "|"
          "\\|[^|][^|]*\\|", "-" ]
    rules
    |> List.fold (fun (s: string) (code, alt) -> (regex code).Replace (s, alt)) text
    |> String.length

/// Length of the sequence after all substitution rules have been applied.
///
/// The rules are run over the whole text in one piece. Splitting the text into
/// per-processor blocks loses every match that straddles a block boundary and
/// restarts the pairing of '|' characters done by the last rule in each block,
/// so the sum of per-block lengths varies with the processor count and does
/// not equal the length obtained from the unsplit text.
let replacedSize = lengthAfterReplace text

// Emit a blank line, then the raw input length, the cleaned sequence length,
// and the length after substitutions, each on its own line.
(input.Length, textSize, replacedSize) |||> printf "\n%i\n%i\n%i\n"
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
F# 4.1.0
dotnet 1.0.1 005db40cd1
"System.GC.Server": true


Sun, 16 Apr 2017 19:30:13 GMT

MAKE:
cp regexredux.fsharpcore-3.fsharpcore Program.fs
cp Include/fsharpcore/tmp.fsproj .
cp Include/fsharpcore/runtimeconfig.template.json .
mkdir obj
cp Include/fsharpcore/project.assets.json ./obj
cp Include/fsharpcore/tmp.fsproj.nuget.g.props ./obj
cp Include/fsharpcore/tmp.fsproj.nuget.g.targets ./obj
/usr/bin/dotnet build -c Release
Microsoft (R) Build Engine version 15.1.548.43366
Copyright (C) Microsoft Corporation. All rights reserved.

  tmp -> /home/dunham/benchmarksgame_quadcore/regexredux/tmp/bin/Release/netcoreapp1.1/tmp.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:11.22
11.70s to complete and log all make actions

COMMAND LINE:
/usr/bin/dotnet ./bin/Release/netcoreapp1.1/tmp.dll 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

13c13
< 35392
---
> 273927

PROGRAM OUTPUT:
agggtaaa|tttaccct 3
[cgt]gggtaaa|tttaccc[acg] 12
a[act]ggtaaa|tttacc[agt]t 43
ag[act]gtaaa|tttac[agt]ct 27
agg[act]taaa|ttta[agt]cct 58
aggg[acg]aaa|ttt[cgt]ccct 16
agggt[cgt]aa|tt[acg]accct 15
agggta[cgt]a|t[acg]taccct 18
agggtaa[cgt]|[acg]ttaccct 20

508411
500000
35392