The Computer Language
Benchmarks Game

regex-dna F# Mono LLVM program

source code

// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Modified version of Valentin Kraevskiy
// Contributed by Vassil Keremidchiev

open System.Text.RegularExpressions
open System.Threading

// Builds a Regex for pattern `s` with the RegexOptions.Compiled option set.
let regex (s: string) : Regex =
    new Regex(s, RegexOptions.Compiled)
// Everything available on standard input, read in one go.
let input = stdin.ReadToEnd ()

// `input` with every span from a '>' through the following newline removed.
let withoutComments =
    let headerLine = regex ">.*\n"
    headerLine.Replace (input, "")

// `withoutComments` with all remaining newline characters removed.
let text =
    let lineBreak = regex "\n"
    lineBreak.Replace (withoutComments, "")

// Splits `s` into consecutive blocks of at most 1024*4096 characters and
// returns `res` followed by those blocks, in order.
//   res - blocks already collected; they are kept in front of the new ones
//   s   - the string to split; an empty string contributes no block
// The blocks are sliced by start index and appended to `res` once, so the
// remaining tail of `s` is not re-copied and the list is not re-appended on
// every step.
let onblocks res (s: string) =
    let size = 1024*4096
    let blocks =
        [ for start in 0 .. size .. s.Length - 1 ->
            // The last block may be shorter than `size`.
            s.Substring (start, min size (s.Length - start)) ]
    res @ blocks

// For each pattern below, count its matches in `text` and print one
// "<pattern> <count>" line. Each pattern is counted in its own async job;
// the jobs run in parallel and the lines are printed in the order listed.
[| "agggtaaa|tttaccct"
   "[cgt]gggtaaa|tttaccc[acg]"
   "a[act]ggtaaa|tttacc[agt]t"
   "ag[act]gtaaa|tttac[agt]ct"
   "agg[act]taaa|ttta[agt]cct"
   "aggg[acg]aaa|ttt[cgt]ccct"
   "agggt[cgt]aa|tt[acg]accct"
   "agggta[cgt]a|t[acg]taccct"
   "agggtaa[cgt]|[acg]ttaccct" |]
|> Array.map (fun pattern -> async {
        let hits = (regex pattern).Matches(text).Count
        return System.String.Format("{0} {1}", pattern, hits) })
|> Async.Parallel
|> Async.RunSynchronously
|> Array.iter (fun line -> printfn "%s" line)

// Single-letter codes paired with the alternation that replaces each one.
// The regex for every code is built once here and reused by every call to
// newTextLength, instead of being rebuilt on each call.
let codeSubstitutions =
    ["B", "(c|g|t)"
     "D", "(a|g|t)"
     "H", "(a|c|t)"
     "K", "(g|t)"
     "M", "(a|c)"
     "N", "(a|c|g|t)"
     "R", "(a|g)"
     "S", "(c|g)"
     "V", "(a|c|g)"
     "W", "(a|t)"
     "Y", "(c|t)"]
    |> List.map (fun (code, alt) -> (regex code, alt))

// Applies every substitution to `t`, in the order listed above, and returns
// the length of the resulting string.
//   t - the string to rewrite
// NOTE(review): the Regex instances are shared by the parallel jobs that call
// this function; .NET documents Regex as safe to share between threads.
let newTextLength t =
    codeSubstitutions
    |> List.fold (fun (s: string) (re: Regex, alt: string) -> re.Replace (s, alt)) t
    |> String.length

// Sum of newTextLength over every block of `text`. Each block gets its own
// async job and the jobs run in parallel.
let newText =
    let jobs =
        onblocks [] text
        |> List.map (fun block -> async { return newTextLength block })
    jobs
    |> Async.Parallel
    |> Async.RunSynchronously
    |> Array.fold (+) 0

// Prints a blank line, then the length of the raw input, the length of the
// cleaned text, and the summed length after substitution, one per line.
(input.Length, text.Length, newText) |||> printf "\n%i\n%i\n%i\n"
    

notes, command-line, and program output

NOTES:
32-bit Ubuntu one core
F# Compiler for F# 4.0 (Open Source Edition)
Mono JIT compiler version 4.5.1 (master/3e844dd Fri May  6 19:24:07 PDT 2016)
	LLVM:          yes(3.6.0svn-mono-master/9f79399)
	GC:            sgen



Sat, 07 May 2016 19:42:05 GMT

MAKE:
mv regexdna.fsharp regexdna.fs
/usr/local/bin/fsharpc --target:exe --platform:x86 -O  -o regexdna.fsharp_run.exe regexdna.fs
F# Compiler for F# 4.0 (Open Source Edition)
Freely distributed under the Apache 2.0 Open Source License
rm regexdna.fs
5.98s to complete and log all make actions

COMMAND LINE:
/usr/local/bin/mono --llvm --gc=sgen regexdna.fsharp_run.exe 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214