The Computer Language
Benchmarks Game

k-nucleotide OCaml #3 program

source code

(* The Computer Language Benchmarks Game
 * http://benchmarksgame.alioth.debian.org/
 *
 * contributed by Troestler Christophe
 * modified by Mauricio Fernandez
 * optimized by Fabrice Le Fessant
 *)

let tab = Array.create 256 0
let _ =
  tab.(Char.code 'A') <- 0;
  tab.(Char.code 'a') <- 0;
  tab.(Char.code 'T') <- 1;
  tab.(Char.code 't') <- 1;
  tab.(Char.code 'C') <- 2;
  tab.(Char.code 'c') <- 2;
  tab.(Char.code 'g') <- 3;
  tab.(Char.code 'G') <- 3

let uppercase line =
  let len = String.length line in
    for i = 0 to len- 1 do
      let c =  line.[i] in
	line.[i] <- Char.unsafe_chr tab.(Char.code c)
    done

    (* Extract DNA sequence "THREE" from stdin *)
    let dna =
      let is_not_three s = String.length s < 6 || String.sub s 0 6 <> ">THREE" in
	while is_not_three(input_line stdin) do () done;
	let buf = Buffer.create 130_000_000 in
	  (* Skip possible comment *)
	  (try
	     while true do
	       let line = input_line stdin in
		 if line.[0] <> ';' then begin
		   uppercase line;
		   Buffer.add_string buf line;
		   raise Exit
		 end
	     done with _ -> ());
	  (* Read the DNA sequence *)
	  (try while true do
	     let line = input_line stdin in
	       if line.[0] = '>' then raise End_of_file;
	       uppercase line;
	       Buffer.add_string buf line
	   done with End_of_file -> ());
	  Buffer.contents buf


    module K15 = struct
      type t = int
      let equal k1 k2 = k1 = k2
      let hash n = n
    end

    module K16 = struct
      type t = int * int
      let equal (a1,a2) (b1,b2) = a1 = b1 && a2 = b2
      let hash (a1, _) = a1
    end

    type entry = {
      mutable count : int;
    }

    let threshold15 =
      match Sys.word_size with
	  32 -> 15
	| 64 -> 31
	| _ -> assert false
    let threshold16 = threshold15 + 1

    let c = 0x40000-1
    module H15 = Hashtbl.Make(K15)
    module H16 = Hashtbl.Make(K16)
    let h15 = H15.create c
    let h16 = H16.create c

    let rec pack_word n k h =
      let b = Char.code dna.[n] in
      let h = h * 4 + b in
	if k > 1 then
	  pack_word (n+1) (k-1) h
	else h

    let pack15 k n =
      pack_word n k 0

    let pack16 k n =
      let h1 = pack_word n threshold15 0 in
      let h2 = pack_word (n+ threshold15) (k- threshold15) 0 in
	(h1, h2)

    let rec pack_word_in dna n k h =
      let b = dna.[n] in
      let b = tab.(Char.code b) in
      let h = h * 4 + b in
	if k > 1 then
	  pack_word_in dna (n+1) (k-1) h
	else h

    let pack_key15 seq =
      let k = String.length seq in
	pack_word_in seq 0 k 0

    let pack_key16 seq =
      let k = String.length seq in
      let h1 = pack_word_in seq 0 threshold15 0 in
      let h2 = pack_word_in seq threshold15 (k- threshold15) 0 in
	(h1, h2)

    let char = [| 'A'; 'T'; 'C'; 'G' |]

    let rec unpack h s pos k =
      let pos = pos - 1 in
	s.[pos] <- char.(h land 3);
	if k > 1 then
	  unpack (h lsr 2) s pos (k-1)

    let unpack15 k h1 =
      let s = String.create k in
	unpack h1 s k k;
	s

    let unpack16 k (h1, h2) =
      let s = String.create k in
	unpack h1 s threshold15 threshold15;
	unpack h2 s k (k- threshold15);
	s

    let count15 k =
      for i = 0 to String.length dna - k - 1 do
	let packed = pack15 k i in
	  try
	    let key = H15.find h15 packed in
	      key.count <- key.count + 1
	  with Not_found ->
	    H15.add h15 packed { count = 1 }
      done

    let count16 k =
      for i = 0 to String.length dna - k - 1 do
	let packed = pack16 k i in
	  try
	    let key = H16.find h16 packed in
	      key.count <- key.count + 1
	  with Not_found ->
	    H16.add h16 packed { count = 1 }
      done

    let count k =
      if k < threshold16 then count15 k else count16 k

    let compare_freq ((k1:string),(f1:float)) (k2, f2) =
      if f1 > f2 then -1 else if f1 < f2 then 1 else String.compare k1 k2

    let write_frequencies15 k =
      count15 k;
      let tot = float(H15.fold (fun _ n t -> n.count + t) h15 0) in
      let frq =
	H15.fold (fun h n l ->
		  (unpack15 k h, 100. *. float n.count /. tot) :: l) h15 [] in
      let frq = List.sort compare_freq frq in
	String.concat ""
	  (List.map (fun (k,f) -> Printf.sprintf "%s %.3f\n" k f) frq)

    let write_frequencies16 k =
      count16 k;
      let tot = float(H16.fold (fun _ n t -> n.count + t) h16 0) in
      let frq =
	H16.fold (fun h n l ->
		  (unpack16 k h, 100. *. float n.count /. tot) :: l) h16 [] in
      let frq = List.sort compare_freq frq in
	String.concat ""
	  (List.map (fun (k,f) -> Printf.sprintf "%s %.3f\n" k f) frq)

    let write_count15 k seq =
	count15 k;
	Printf.sprintf "%d\t%s" (try (H15.find h15 (pack_key15 seq)).count with Not_found -> 0) seq

    let write_count16 k seq =
	count16 k;
	Printf.sprintf "%d\t%s" (try (H16.find h16 (pack_key16 seq)).count with Not_found -> 0) seq

    let write_frequencies k =
	if k < threshold16 then write_frequencies15 k
	else write_frequencies16 k

    let write_count seq =
      let k = String.length seq in
	if k < threshold16 then write_count15 k seq
	else write_count16 k seq

    type t = Size of int | Dna of string

let invoke (f : t -> string) x : unit -> string =
  let input, output = Unix.pipe() in
  match Unix.fork() with
  | -1 -> Unix.close input; Unix.close output; (let v = f x in fun () -> v)
  | 0 ->
      Unix.close input;
      let output = Unix.out_channel_of_descr output in
        Marshal.to_channel output (f x) [];
        close_out output;
        exit 0
  | pid ->
      Unix.close output;
      let input = Unix.in_channel_of_descr input in fun () ->
        let v = Marshal.from_channel input in
        ignore (Unix.waitpid [] pid);
        close_in input;
	v

let parallelize f l =
  let list = List.map (invoke f) (List.rev l) in
  List.iter (fun g -> print_endline (g ())) (List.rev list)

let () =
  parallelize
    (fun i ->
       match i with
	   Size i ->
             write_frequencies i
	 | Dna k ->
             write_count k
    ) [Size 1;
       Size 2;
       Dna "GGT";
       Dna "GGTA";
       Dna "GGTATT";
       Dna "GGTATTTTAATT";
       Dna "GGTATTTTAATTTATAGT"]
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
The OCaml native-code compiler, version 4.04.0


Tue, 18 Apr 2017 04:44:13 GMT

MAKE:
mv knucleotide.ocaml-3.ocaml knucleotide.ocaml-3.ml
/usr/local/bin/ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 unix.cmxa knucleotide.ocaml-3.ml -o knucleotide.ocaml-3.ocaml_run
File "knucleotide.ocaml-3.ml", line 9, characters 10-22:
Warning 3: deprecated: Array.create
Use Array.make instead.
File "knucleotide.ocaml-3.ml", line 24, characters 1-46:
Warning 3: deprecated: String.unsafe_set
File "knucleotide.ocaml-3.ml", line 118, characters 1-27:
Warning 3: deprecated: String.unsafe_set
File "knucleotide.ocaml-3.ml", line 123, characters 14-27:
Warning 3: deprecated: String.create
Use Bytes.create instead.
File "knucleotide.ocaml-3.ml", line 128, characters 14-27:
Warning 3: deprecated: String.create
Use Bytes.create instead.
rm knucleotide.ocaml-3.ml
0.28s to complete and log all make actions

COMMAND LINE:
./knucleotide.ocaml-3.ocaml_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT