The Computer Language
Benchmarks Game

k-nucleotide F# .NET Core #3 program

source code

(* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/
 *
 * Contributed by Don Syme
 * Based on C# contribution by Isaac Gouy, Antti Lankila, A.Nahr, The Anh Tran
 * Uses one byte array rather than strings.
 *)

open System
open System.IO
open System.Collections.Generic
open System.Text
open System.Threading

let input = Console.In
let toBytes (s: string) = s.ToCharArray() |> Array.map byte
let toString (s: byte []) = String(s |> Array.map char)
let prefixes = [| "ggt"; "ggta"; "ggtatt"; "ggtattttaatt"; "ggtattttaatttatagt" |]

let prefixBytes = 
    [| for p in prefixes -> toBytes p |]

let prefixLengths = 
    [| for p in prefixBytes -> p.Length |]

let prefixOffsets = Array.scan (+) 0 prefixLengths
let dnaStart = prefixOffsets.[prefixLengths.Length]

let dna = 
    seq { while true do yield input.ReadLine() }
    |> Seq.takeWhile (fun x -> x <> null)
    |> Seq.skipWhile (fun x -> not (x.StartsWith(">THREE")))
    |> Seq.skip 1
    |> String.concat ""
    |> toBytes
    |> Array.append (Array.concat prefixBytes)

let keyHash (k, n) = 
    let mutable h = 0
    for i in 0..n - 1 do
        h <- h * 31 + int dna.[k + i]
    h

let keyText (k, n) = toString(dna.[k..k + n - 1]).ToUpper()

type KeyData = 
    { mutable count: int
      key: int * int }

let generateFrequencies(frameSize) = 
    let freq = new Dictionary<int, KeyData>()
    let mutable v = Unchecked.defaultof<KeyData>
    for i in dnaStart..dna.Length - frameSize do
        let h = keyHash (i, frameSize)
        if freq.TryGetValue(h, &v) then v.count <- v.count + 1
        else freq.[h] <- { count = 1; key = (i, frameSize) }
    freq

let writeCount(n) = 
    let freq = generateFrequencies(prefixLengths.[n])
    let hash = keyHash(prefixOffsets.[n], prefixLengths.[n])
    
    let count = 
        if freq.ContainsKey(hash) then freq.[hash].count
        else 0
    String.Format("{0}\t{1}", count, prefixes.[n].ToUpper())

type Pair = KeyValuePair<int, KeyData>

let writeFrequencies(nucleotideLength) = 
    let freq = generateFrequencies(nucleotideLength)
    let items = new ResizeArray<Pair>(freq)
    items.Sort(fun (kv1: Pair) (kv2: Pair) -> 
        let c = kv2.Value.count - kv1.Value.count
        if c = 0 then keyText(kv1.Value.key).CompareTo(keyText(kv2.Value.key))
        else c)
    let buf = new StringBuilder()
    let sum = dna.Length - dnaStart - nucleotideLength + 1
    for element in items do
        let percent = double element.Value.count * 100.0 / double sum
        buf.AppendFormat("{0} {1:f3}\n", keyText(element.Value.key), percent) |> ignore
    buf.ToString()

Async.Parallel [ yield async { return writeFrequencies(1) }
                 yield async { return writeFrequencies(2) }
                 for i in 0..prefixes.Length - 1 do
                     yield async { return writeCount(i) } ]
|> Async.RunSynchronously
|> Array.iter Console.Out.WriteLine

    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
F# 4.1.0
dotnet 1.0.1 005db40cd1
"System.GC.Server": true


Sun, 16 Apr 2017 18:57:59 GMT

MAKE:
cp knucleotide.fsharpcore-3.fsharpcore Program.fs
cp Include/fsharpcore/tmp.fsproj .
cp Include/fsharpcore/runtimeconfig.template.json .
mkdir obj
cp Include/fsharpcore/project.assets.json ./obj
cp Include/fsharpcore/tmp.fsproj.nuget.g.props ./obj
cp Include/fsharpcore/tmp.fsproj.nuget.g.targets ./obj
/usr/bin/dotnet build -c Release
Microsoft (R) Build Engine version 15.1.548.43366
Copyright (C) Microsoft Corporation. All rights reserved.

  tmp -> /home/dunham/benchmarksgame_quadcore/knucleotide/tmp/bin/Release/netcoreapp1.1/tmp.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:12.53
13.01s to complete and log all make actions

COMMAND LINE:
/usr/bin/dotnet ./bin/Release/netcoreapp1.1/tmp.dll 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT