The Computer Language
Benchmarks Game

regex-dna F# Mono LLVM #4 program

source code

// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Contributed by David Grenier

open System.Text.RegularExpressions
open System.Threading

/// Swaps the first two arguments of a curried function: `flip f a b` evaluates `f b a`.
let inline flip f a b = a |> f b
/// Turns a function that takes a pair into one that takes its two components
/// one at a time: `curry f a b` evaluates `f (a, b)`.
let inline curry f a b = (a, b) |> f

/// Builds a `Regex` for pattern `s` with the `RegexOptions.Compiled` option set.
let regex s = new Regex (pattern = s, options = RegexOptions.Compiled)
/// `replace rx replacement input` returns `input` with every match of the
/// pattern `rx` replaced by `replacement`.
/// The regex is built as soon as `rx` is supplied, so a partial application
/// such as `replace "B"` can be reused without rebuilding it.
let replace rx =
    let compiled = regex rx
    fun (replacement: string) (input: string) -> compiled.Replace (input, replacement)
/// `matchCount rx input` returns the number of matches of pattern `rx` in `input`.
/// The regex is built as soon as `rx` is supplied.
let matchCount rx =
    let compiled = regex rx
    fun (input: string) -> compiled.Matches(input).Count

// Entire standard input, read in one go; its length is reported at the end.
let input = System.Console.In.ReadToEnd ()
// The input with every ">...\n" line removed first, then all remaining newlines removed.
let text = replace "\n" "" (replace ">.*\n" "" input)

module Array =
    /// Small parallel array helpers built on `Async.Parallel`, with one worker
    /// per logical processor.
    module Parallel =
        /// Calls `f i` exactly once for every index `i` in `[0, count)`.
        /// Indices are handed out through a shared atomic counter, so the order
        /// in which they are processed is unspecified. Returns only after all
        /// workers have finished. Does nothing when `count <= 0`.
        let loop (count: int) f =
            // Shared countdown: every Interlocked.Decrement claims one unprocessed index.
            let remaining = ref count
            let rec worker () =
                async {
                    match Interlocked.Decrement remaining with
                    | i when i < 0 -> return ()
                    | i ->
                        f i
                        // Continue with a tail call instead of starting a nested
                        // async via Async.StartImmediate: stack use stays constant
                        // however many indices this worker claims, and an exception
                        // raised by `f` is reported through Async.Parallel to the
                        // caller of `loop`.
                        return! worker ()
                }
            Array.init System.Environment.ProcessorCount (fun _ -> worker ())
            |> Async.Parallel
            |> Async.RunSynchronously
            |> ignore


        /// Creates an array of length `len` whose element at index `i` is `f i`,
        /// computing the elements in parallel via `loop`.
        let init len f =
            let result = Array.zeroCreate len
            // Each index is claimed by exactly one worker, so no two workers
            // write to the same slot.
            loop len (fun i -> result.[i] <- f i)
            result

        /// Returns a new array holding `f x` for every element `x` of `arr`,
        /// in the same positions, computed in parallel.
        let map f arr = init (Array.length arr) (fun i -> f arr.[i])

// Count the matches of every variant pattern in `text` in parallel, then
// print one "<pattern> <count>" line per pattern in the order listed here.
[|
    "agggtaaa|tttaccct"
    "[cgt]gggtaaa|tttaccc[acg]"
    "a[act]ggtaaa|tttacc[agt]t"
    "ag[act]gtaaa|tttac[agt]ct"
    "agg[act]taaa|ttta[agt]cct"
    "aggg[acg]aaa|ttt[cgt]ccct"
    "agggt[cgt]aa|tt[acg]accct"
    "agggta[cgt]a|t[acg]taccct"
    "agggtaa[cgt]|[acg]ttaccct"
|]
|> Array.Parallel.map (fun variant -> variant, matchCount variant text)
|> Array.iter (fun (variant, hits) -> printfn "%s %d" variant hits)

/// Splits `arr` into `windowCount` consecutive chunks, applies `f` to each
/// chunk in parallel and returns the results in chunk order.
///
/// The first `windowCount - 1` chunks each hold `len / (windowCount - 1)`
/// elements; the last chunk holds the remaining `len % (windowCount - 1)`
/// elements and may be empty. With `windowCount = 1` the whole array is the
/// single chunk (this used to divide by zero).
///
/// Raises `ArgumentException` when `windowCount` is less than 1; a zero
/// window count used to discard the data silently.
let chunkedMap windowCount f arr =
    if windowCount < 1 then
        invalidArg "windowCount" "windowCount must be at least 1"
    let len = Array.length arr
    if windowCount = 1 then
        // Hand `f` a copy, as the multi-window path does via Array.sub.
        [| f (Array.sub arr 0 len) |]
    else
        let fullWindows = windowCount - 1
        let size = len / fullWindows
        Array.Parallel.init windowCount (fun i ->
            // The final window collects whatever the equal-sized windows left over.
            let chunkLength = if i = fullWindows then len % fullWindows else size
            f (Array.sub arr (i * size) chunkLength))

/// Rewrites a string by replacing each listed code letter with its
/// alternation group, applying the replacements one after another in the
/// order of the table (first entry first).
let applyPatterns =
    [
        "B", "(c|g|t)"
        "D", "(a|g|t)"
        "H", "(a|c|t)"
        "K", "(g|t)"
        "M", "(a|c)"
        "N", "(a|c|g|t)"
        "R", "(a|g)"
        "S", "(c|g)"
        "V", "(a|c|g)"
        "W", "(a|t)"
        "Y", "(c|t)"
    ]
    |> List.map (fun (code, expansion) -> replace code expansion)
    // Compose left to right so the first table entry is applied first.
    |> List.reduce (>>)

// Expand the code letters chunk by chunk in 16 windows, add up the lengths of
// the expanded chunks, and print a blank line followed by three lengths:
// raw input, cleaned text, expanded text.
text.ToCharArray()
|> chunkedMap 16 (fun chunk -> (System.String chunk |> applyPatterns).Length)
|> Array.sum
|> fun expandedLength -> printfn "\n%i\n%i\n%i" input.Length text.Length expandedLength
    

notes, command-line, and program output

NOTES:
32-bit Ubuntu one core
F# Compiler for F# 4.0 (Open Source Edition)
Mono JIT compiler version 4.5.1 (master/3e844dd Fri May  6 19:24:07 PDT 2016)
	LLVM:          yes(3.6.0svn-mono-master/9f79399)
	GC:            sgen



Sat, 07 May 2016 19:23:26 GMT

MAKE:
mv regexdna.fsharp-4.fsharp regexdna.fsharp-4.fs
/usr/local/bin/fsharpc --target:exe --platform:x86 -O  -o regexdna.fsharp-4.fsharp_run.exe regexdna.fsharp-4.fs
F# Compiler for F# 4.0 (Open Source Edition)
Freely distributed under the Apache 2.0 Open Source License

/home/dunham/benchmarksgame_onecore/regexdna/tmp/regexdna.fsharp-4.fs(6,1): warning FS0221: The declarations in this file will be placed in an implicit module 'Regexdna.fsharp-4' based on the file name 'regexdna.fsharp-4.fs'. However this is not a valid F# identifier, so the contents will not be accessible from other files. Consider renaming the file or adding a 'module' or 'namespace' declaration at the top of the file.
rm regexdna.fsharp-4.fs
4.65s to complete and log all make actions

COMMAND LINE:
/usr/local/bin/mono --llvm --gc=sgen regexdna.fsharp-4.fsharp_run.exe 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214