The Computer Language
Benchmarks Game

k-nucleotide OCaml program

source code

(* 
 * The Computer Language Benchmarks Game
 * http://benchmarksgame.alioth.debian.org/
 *
 * contributed by Troestler Christophe
 * modified by Mauricio Fernandez
 *)

module S = struct
  type t = string

  let size = 0x40000

  let equal (s1:string) s2 = (s1 = s2)

  let hash s =
    let h = ref 0 in
    for i = 0 to String.length s - 1 do h := !h * 5 + Char.code s.[i] done;
    !h
end

module H = Hashtbl.Make(S)

(* [counts k dna] fills and return the hashtable [count] of
   k-nucleotide keys and count values for a particular reading-frame
   of length [k] of the string [dna].  Keys point to mutable values
   for speed (to avoid looking twice the same key to reinsert the
   value). *)
let count = H.create S.size
let counts k dna =
  H.clear count;
  let key = String.create k in
    for i = 0 to String.length dna - k do
      String.unsafe_blit dna i key 0 k;
      try incr(H.find count key) with Not_found -> H.add count (String.copy key) (ref 1)
    done;
    count

(* [write_frequencies k dna] writes the frequencies for a
   reading-frame of length [k] sorted by descending frequency and then
   ascending k-nucleotide key. *)
let compare_freq ((k1:string),(f1:float)) (k2, f2) =
  if f1 > f2 then -1 else if f1 < f2 then 1 else compare k1 k2

let write_frequencies k dna =
  let cnt = counts k dna in
  let tot = float(H.fold (fun _ n t -> !n + t) cnt 0) in
  let frq = H.fold (fun k n l -> (k, 100. *. float !n /. tot) :: l) cnt [] in
  let frq = List.sort compare_freq frq in
  List.iter (fun (k,f) -> Printf.printf "%s %.3f\n" k f) frq;
  print_string "\n"

let write_count seq dna =
  let cnt = counts (String.length seq) dna in
  Printf.printf "%d\t%s\n" (try !(H.find cnt seq) with Not_found -> 0) seq

(* Extract DNA sequence "THREE" from stdin *)
let dna_three =
  let is_not_three s = try String.sub s 0 6 <> ">THREE" with _ -> true in
  while is_not_three(input_line stdin) do () done;
  let buf = Buffer.create 1000 in
  (* Skip possible comment *)
  (try while true do
     let line = input_line stdin in
     if line.[0] <> ';' then
       (Buffer.add_string buf (String.uppercase line); raise Exit)
   done with _ -> ());
  (* Read the DNA sequence *)
  (try while true do
       let line = input_line stdin in
       if line.[0] = '>' then raise End_of_file;
       Buffer.add_string buf (String.uppercase line)
   done with End_of_file -> ());
  Buffer.contents buf

let () = Gc.set { (Gc.get()) with Gc.minor_heap_size = 1024 * 2048 }

let () =
  List.iter (fun i -> write_frequencies i dna_three) [1; 2];
  List.iter (fun k -> write_count k dna_three)
    ["GGT"; "GGTA"; "GGTATT"; "GGTATTTTAATT"; "GGTATTTTAATTTATAGT"]
    

notes, command-line, and program output

NOTES:
32-bit Ubuntu one core
The OCaml native-code compiler, version 4.03.0


Tue, 26 Apr 2016 01:38:22 GMT

MAKE:
mv knucleotide.ocaml knucleotide.ml
/usr/local/bin/ocamlopt -noassert -unsafe -nodynlink -inline 100 unix.cmxa knucleotide.ml -o knucleotide.ocaml_run
File "knucleotide.ml", line 32, characters 12-25:
Warning 3: deprecated: String.create
Use Bytes.create instead.
File "knucleotide.ml", line 35, characters 64-75:
Warning 3: deprecated: String.copy
File "knucleotide.ml", line 66, characters 31-47:
Warning 3: deprecated: String.uppercase
Use String.uppercase_ascii instead.
File "knucleotide.ml", line 72, characters 30-46:
Warning 3: deprecated: String.uppercase
Use String.uppercase_ascii instead.
rm knucleotide.ml
0.46s to complete and log all make actions

COMMAND LINE:
./knucleotide.ocaml_run 0 < knucleotide-input25000000.txt

PROGRAM FAILED 


PROGRAM OUTPUT:

Fatal error: exception Failure("Buffer.add: cannot grow buffer")