The Computer Language
Benchmarks Game

regex-redux Clojure #3 program

source code

;;   The Computer Language Benchmarks Game
;;   http://benchmarksgame.alioth.debian.org/

;; regex-dna program contributed by Andy Fingerhut
;; converted from regex-dna program

(ns regexredux
  (:gen-class)
  (:require [clojure.string :as str])
  (:import (java.util.regex Pattern)))


;; Slightly modified from standard library slurp so that it can read
;; from standard input.

(defn slurp-std-input
  ;; Drains the reader currently bound to *in* and returns everything
  ;; it produced as one string.  The reader is closed on the way out.
  ;; NOTE(review): enc is accepted but never consulted in this body;
  ;; the zero-argument arity merely forwards the name of the JVM
  ;; default charset.  Decoding is whatever *in* already applies.
  ([] (slurp-std-input (.name (java.nio.charset.Charset/defaultCharset))))
  ([^String enc]
     (with-open [rdr (java.io.BufferedReader. *in*)]
       (let [buf (StringBuilder.)]
         ;; .read yields a negative value at end of stream.
         (loop []
           (let [ch (.read rdr)]
             (when-not (neg? ch)
               (.append buf (char ch))
               (recur))))
         (.toString buf)))))


;; Regex sources whose occurrences -main counts in the cleaned
;; sequence.  Each entry is an alternation of two eight-character
;; variants.  Held in a list; the patterns are reported in this order.
(def dna-seq-regexes
  (list "agggtaaa|tttaccct"
        "[cgt]gggtaaa|tttaccc[acg]"
        "a[act]ggtaaa|tttacc[agt]t"
        "ag[act]gtaaa|tttac[agt]ct"
        "agg[act]taaa|ttta[agt]cct"
        "aggg[acg]aaa|ttt[cgt]ccct"
        "agggt[cgt]aa|tt[acg]accct"
        "agggta[cgt]a|t[acg]taccct"
        "agggtaa[cgt]|[acg]ttaccct"))


;; Ordered [regex-source replacement] pairs.  -main folds them over
;; the sequence with one-replacement, so each pair sees the output of
;; the pairs before it.
(def iub-codes
  (list ["tHa[Nt]" "<4>"]
        ["aND|caN|Ha[DS]|WaS" "<3>"]
        ["a[NSt]|BY" "<2>"]
        ["<[^>]*>" "|"]
        ["\\|[^|][^|]*\\|" "-"]))


(defn one-replacement
  ;; Returns str with every match of the regex whose source is the
  ;; first element of the pair replaced by the pair's second element.
  ;; The parameter named str shadows clojure.core/str inside this
  ;; body; str/replace still resolves through the namespace alias.
  [str [pattern-src substitute]]
  (let [compiled (re-pattern pattern-src)]
    (str/replace str compiled substitute)))


(defn count-regex-occurrences
  [re s]
  ;; Returns the pair [re n], where n is how many matches of re
  ;; re-seq finds in s.  The leading (?i) flag tells the Java regex
  ;; engine to ignore case.
  (let [ci-pattern (re-pattern (str "(?i)" re))
        hits (re-seq ci-pattern s)]
    [re (count hits)]))


(defn -main
  ;; Program entry point; args is ignored.
  ;;   1. Reads all of standard input and records its length.
  ;;   2. Removes every ">" header line and every newline, and records
  ;;      the length of what remains.
  ;;   3. Counts each pattern in dna-seq-regexes (via pmap) and prints
  ;;      one "pattern count" line per pattern, in list order.
  ;;   4. Applies the iub-codes replacements in order and prints a
  ;;      blank line followed by the three lengths.
  [& args]
  (let [content (slurp-std-input)
        original-len (count content)
        ;; (?m) makes ^ match at the start of every line, not only at
        ;; the start of the input, so a ">" header is removed wherever
        ;; it begins a line -- including a header that directly follows
        ;; another header, whose preceding newline has already been
        ;; consumed by the previous match.  Bare newlines are removed
        ;; by the same pattern with the optional group absent.
        content (str/replace content #"(?m)(^>.*)?\n" "")
        dna-seq-only-len (count content)]

    (doseq [[re num-matches] (pmap #(count-regex-occurrences % content)
                                   dna-seq-regexes)]
      (printf "%s %d\n" re num-matches))

    (let [content (reduce one-replacement content iub-codes)]
      (printf "\n%d\n%d\n%d\n" original-len dna-seq-only-len (count content))))
  ;; printf does not flush on its own.
  (flush)
  ;; pmap runs its work as futures; shutting the agent pools down lets
  ;; the JVM exit without waiting for idle pool threads to time out.
  (shutdown-agents))
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Clojure 1.8.0
java version "1.8.0_45"
Java(TM) SE Runtime Environment (build 1.8.0_45-b14)
Java HotSpot(TM) 64-Bit Server VM (build 25.45-b02, mixed mode)


Mon, 20 Mar 2017 18:01:05 GMT

MAKE:
mv regexredux.clojure-3.clojure regexredux.clj
/usr/local/src/jdk1.8.0_121/bin/java -Dclojure.compiler.direct-linking=true -Dclojure.compile.path=. -cp .:/usr/local/src/clojure/clojure-1.8.0.jar clojure.lang.Compile regexredux
Compiling regexredux to .
1.54s to complete and log all make actions

COMMAND LINE:
/usr/local/src/jdk1.8.0_121/bin/java -Xmx1024m -cp .:/usr/local/src/clojure/clojure-1.8.0.jar regexredux 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361