The Computer Language
Benchmarks Game

regex-redux Scala program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

  regex-dna program contributed by Isaac Gouy
  modified and updated for 2.8 by Rex Kerr
  converted from regex-dna program
*/

import java.io._

/** regex-redux benchmark: reads FASTA-formatted text from standard input,
  * counts matches of a fixed set of nucleotide patterns, applies a fixed
  * chain of regex substitutions, and prints the counts followed by three
  * sequence lengths.
  */
object regexredux {
  import java.util.regex.Pattern

  /** Patterns whose occurrences are counted, in the order they are reported. */
  private val variants = Array(
    "agggtaaa|tttaccct",
    "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t",
    "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct",
    "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct",
    "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct"
  )

  /** (pattern, replacement) pairs. They are applied strictly in this order,
    * each one to the result of the previous substitution.
    */
  private val substitutions = Array(
    ("tHa[Nt]", "<4>"),
    ("aND|caN|Ha[DS]|WaS", "<3>"),
    ("a[NSt]|BY", "<2>"),
    ("<[^>]*>", "|"),
    ("\\|[^|][^|]*\\|", "-")
  )

  /** Program entry point.
    *
    * Prints one line per variant pattern (`<pattern> <count>`), then a blank
    * line, then the length of the raw input, the length after FASTA
    * descriptions and new-lines are removed, and the length after all
    * substitutions have been applied.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    val input = readFully()
    val initialLength = input.length

    // remove FASTA sequence descriptions and new-lines
    val sequence = Pattern.compile(">.*\n|\n").matcher(input).replaceAll("")
    val codeLength = sequence.length

    // regex match: foreach, not map, because only the printing matters
    variants.foreach { variant =>
      println(s"$variant ${countMatches(variant, sequence)}")
    }

    // regex substitution: thread the text through every replacement in order
    val reduced = substitutions.foldLeft(sequence) {
      case (text, (regex, replacement)) =>
        Pattern.compile(regex).matcher(text).replaceAll(replacement)
    }

    println(s"\n$initialLength\n$codeLength\n${reduced.length}")
  }

  /** Counts the successive matches of `regex` found in `text` by repeated
    * calls to `Matcher.find()`.
    */
  private def countMatches(regex: String, text: String): Int = {
    val m = Pattern.compile(regex).matcher(text)
    var count = 0
    while (m.find()) count += 1
    count
  }

  /** Reads everything available on standard input into a single string.
    *
    * Bytes are decoded with the platform default charset (no charset is
    * passed to the reader). The reader, and with it `System.in`, is closed
    * before returning, including when a read fails.
    *
    * @return the complete contents of standard input
    */
  def readFully(): String = {
    val block = new Array[Char](10240)
    // single-threaded use, so the unsynchronized builder is sufficient
    val buffer = new java.lang.StringBuilder
    val r = new InputStreamReader(System.in)

    try {
      Iterator.
        continually(r.read(block)).
        takeWhile(_ > -1).
        foreach(n => buffer.append(block, 0, n))
    } finally {
      r.close()
    }

    buffer.toString
  }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Scala compiler version 2.12.1 -- Copyright 2002-2016, LAMP/EPFL and Lightbend, Inc.
java version "1.8.0_121"
Java(TM) SE Runtime Environment (build 1.8.0_121-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)


Mon, 20 Mar 2017 18:05:33 GMT

MAKE:
mv regexredux.scala regexredux.scala
mv: 'regexredux.scala' and 'regexredux.scala' are the same file
/home/dunham/benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:599: recipe for target 'regexredux.scala_run' failed
make: [regexredux.scala_run] Error 1 (ignored)
/usr/local/src/scala-2.12.1/bin/scalac -optimise -target:jvm-1.8 regexredux.scala
warning: there was one deprecation warning; re-run with -deprecation for details
one warning found
6.20s to complete and log all make actions

COMMAND LINE:
env JAVA_OPTS=-Xmx1024m /usr/local/src/jdk1.8.0_121/bin/java  -Xbootclasspath/a:/usr/local/src/scala-2.12.1/lib/scala-library.jar:/usr/local/src/scala-2.12.1/lib/scala-actors-2.11.0.jar:/usr/local/src/scala-2.12.1/lib/akka-actor_2.11-2.3.4.jar:/usr/local/src/scala-2.12.1/lib/config-1.2.1.jar regexredux 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361