/mobile Handheld Friendly website

 performance measurements

Each table row shows performance measurements for this Racket program with a particular command-line input value N.

 N  CPU secs Elapsed secs Memory KB Code B ≈ CPU Load
250,000Bad Output  881   

Read the ↓ make, command line, and program output logs to see how this program was run.

Read k-nucleotide benchmark to see what this program should do.

 notes

Welcome to Racket v6.0.

 k-nucleotide Racket #4 program source code

#lang racket/base
(require racket/fixnum)
(require racket/generator)
(require racket/sequence)
;;;
;;; The Computer Language Benchmarks Game
;;; http://benchmarksgame.alioth.debian.org/

;;; contributed by Matthew Flatt, modified by
;;; modified by James Bergstra

;;; Notes on the implementation: the strategy is to map the DNA letters to the
;;; bytes 0, 1, 2, 3, and then create a hash function that is simply the
;;; concatenation of these two-byte codes. This is handy because the slow part
;;; of this test is building the hash table, and this hash function means that
;;; we can take advantage of overlapping DNA sub-sequences to get a
;;; constant-time hash function (that does not depend on the sequence length).
;;;
;;; The bottleneck in this code seems to be Racket's hash table. The time to
;;; create the last hash table (for the len-18 string) seems to be about half
;;; the runtime of the whole program.

;; Map A->0, C->1, G->2 T->3 (and lowercase too)
(define dna->num
  (let ([tbl (make-bytes 256 255)])
    (for ([ch (in-list (bytes->list #"ACGTacgt"))]
          [ii (in-list '(0 1 2 3 0 1 2 3))])
      (bytes-set! tbl ch ii))
    (lambda (ch) (bytes-ref tbl ch))))

;;; map a hash key back to a string (needed for printing)
(define (unhash key len)
  (let ([rval (make-string len)])
    (sequence-fold
      (lambda (key pos)
        (string-set! rval pos (string-ref "ACGT" (bitwise-and key 3)))
        (arithmetic-shift key -2))
      key
      (in-range len))
    rval))

;;; Ideally this would serve all-counts, but my attempt to do that
;;; was horribly slow.
(define (hashes keylen dna as-codes)
  (generator ()
    (let ([key 0] [ishift (* 2 keylen)] [thresh (sub1 keylen)])
      (for
        ([bb (in-bytes dna)]
         [ii (in-range (bytes-length dna))])
        (set! key (arithmetic-shift (+ key (arithmetic-shift (if as-codes bb (dna->num bb) ) ishift)) -2))
        (if (>= ii thresh) (yield key) #f))
      )))

(define (all-counts keylen dna)
  (let ([table (make-hasheq)]
        [key 0]
        [ishift (* 2 keylen)]
        )
    (for
      ([bb (in-bytes dna)]
       [ii (in-range (bytes-length dna))])
      (set! key (arithmetic-shift (+ key (arithmetic-shift bb ishift)) -2))
      (if (>= ii (- keylen 1)) (hash-set! table key (add1 (hash-ref table key 0))) #f)
      )
    table))

(define (readbuf in foo)
  (let ([s (open-output-bytes)])
    ;; Skip to ">THREE ..."
    (regexp-match #rx#"(?m:^>THREE.*$)" in)
    ;; Copy everything but newlines
    (for ([l (in-bytes-lines in)])
      (write-bytes l s))
    ;; Replace letters with numbers 0, 1, 2, 3
    (let ([actg (get-output-bytes s)])
      (for ([ii (in-range (bytes-length actg))])
           (bytes-set! actg ii (foo (bytes-ref actg ii))))
      actg)))

(define (write-freqs table len)
  (let* ([content (hash-map table cons)]
         [total (exact->inexact (apply + (map cdr content)))])
    (for ([a (sort content > #:key cdr)])
      (printf "~a ~a\n" 
              (unhash (car a) len)
              (real->decimal-string (* 100 (/ (cdr a) total)) 3)))))

(define (write-one-freq table key)
  (let ([cnt (hash-ref table ((hashes (bytes-length key) key #f)) 0)])
    (printf "~a\t~a\n" cnt key)))

(define dna (readbuf (current-input-port) dna->num))

(write-freqs (all-counts 1 dna) 1)
(newline)

(write-freqs (all-counts 2 dna) 2)
(newline)

;; Specific sequences:
(for ([seq '(#"GGT" #"GGTA" #"GGTATT" #"GGTATTTTAATT" #"GGTATTTTAATTTATAGT")]) 
  (write-one-freq (all-counts (bytes-length seq) dna) seq))

 make, command-line, and program output logs

Fri, 28 Feb 2014 05:08:39 GMT

COMMAND LINE:
/usr/local/src/racket-6.0/bin/racket  knucleotide.racket-4.racket 0 < knucleotide-input250000.txt

UNEXPECTED OUTPUT 

27c27
< 0	GGTATTTTAATTTATAGT
---
> 9	GGTATTTTAATTTATAGT

PROGRAM OUTPUT:
A 30.298
T 30.157
C 19.793
G 19.752

AA 9.177
TA 9.137
AT 9.136
TT 9.094
AC 6.000
CA 5.999
GA 5.986
AG 5.985
TC 5.970
CT 5.970
GT 5.957
TG 5.956
CC 3.915
CG 3.910
GC 3.908
GG 3.902

14717	GGT
4463	GGTA
472	GGTATT
9	GGTATTTTAATT
0	GGTATTTTAATTTATAGT

Revised BSD license

  Home   Conclusions   License   Play