The Computer Language
Benchmarks Game

k-nucleotide Scala #2 program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   Contributed by Jimmy Lu
*/

import java.io.InputStream

import scala.annotation.switch
import scala.collection.immutable.SortedSet
import scala.collection.mutable
import scala.collection.parallel.ParSeq
import scala.io.Source

object knucleotide {
  def main(args: Array[String]): Unit = {
    val sequence = extractSequence(System.in, "THREE")
    val (cs1, cs2) = ParSeq(18, 12, 6, 4, 3, 2, 1).map {
      count(sequence, _)
    }.seq.reverse.splitAt(2)
    for ((c, i) <- cs1.zipWithIndex) {
      for ((s, freq) <- frequency(i + 1, c))
        printf("%s %.3f%n", s.toUpperCase, freq * 100)
      println()
    }
    for ((c, s) <- (cs2, Seq("ggt",
                             "ggta",
                             "ggtatt",
                             "ggtattttaatt",
                             "ggtattttaatttatagt")).zipped) {
      val n = c.get(encode(s.getBytes, 0, s.length)).fold(0)(_.n)
      printf("%d\t%s%n", n, s.toUpperCase)
    }
  }

  def extractSequence(input: InputStream, name: String): Array[Byte] = {
    val description = ">" + name
    val builder = Array.newBuilder[Byte]
    builder.sizeHint(4 << 20)
    val lines = Source.fromInputStream(input).getLines().dropWhile {
      !_.startsWith(description)
    }.drop(1).takeWhile(!_.startsWith(">"))
    lines.foreach(builder ++= _.getBytes)
    builder.result()
  }

  class Counter(var n: Int)

  def count(sequence: Array[Byte], length: Int): mutable.LongMap[Counter] = {
    val m = mutable.LongMap.empty[Counter]
    var i = 0
    val end = sequence.length - length + 1
    while (i < end) {
      val k = encode(sequence, i, length)
      try {
        val c = m(k)
        c.n += 1
      } catch {
        case _: NoSuchElementException => m.update(k, new Counter(1))
      }
      i += 1
    }
    m
  }

  def frequency(
    length: Int,
    count: collection.Map[Long, Counter]
  ): Iterable[(String, Double)] = {
    val sum = count.values.map(_.n).sum.asInstanceOf[Double]
    val builder =
      SortedSet.newBuilder(Ordering.by[(String, Double), Double](_._2).reverse)
    for ((k, v) <- count)
      builder += ((new String(decode(k, length)), v.n / sum))
    builder.result()
  }

  def encode(sequence: Array[Byte], offset: Int, length: Int): Long = {
    // assert(length <= 32)
    var n = 0L
    var i = 0
    while (i < length) {
      val m = (sequence(offset + i): @switch) match {
        case 'a' => 0
        case 'c' => 1
        case 'g' => 2
        case 't' => 3
      }
      n <<= 2
      n |= m
      i += 1
    }
    n
  }

  def decode(n: Long, length: Int): Array[Byte] = {
    val bs = Array.ofDim[Byte](length)
    var nn = n
    var i = length - 1
    while (i >= 0) {
      bs(i) = ((nn & 3).asInstanceOf[Int]: @switch) match {
        case 0 => 'a'
        case 1 => 'c'
        case 2 => 'g'
        case 3 => 't'
      }
      nn >>= 2
      i -= 1
    }
    bs
  }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Scala compiler version 2.12.1 -- Copyright 2002-2016, LAMP/EPFL and Lightbend, Inc.
java version "1.8.0_121"
Java(TM) SE Runtime Environment (build 1.8.0_121-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)


Mon, 06 Feb 2017 05:26:09 GMT

MAKE:
mv knucleotide.scala-2.scala knucleotide.scala
/usr/local/src/scala-2.12.1/bin/scalac -optimise -target:jvm-1.8 knucleotide.scala
warning: there was one deprecation warning; re-run with -deprecation for details
one warning found
7.78s to complete and log all make actions

COMMAND LINE:
env JAVA_OPTS=-Xmx3G /usr/local/src/jdk1.8.0_121/bin/java  -Xbootclasspath/a:/usr/local/src/scala-2.12.1/lib/scala-library.jar:/usr/local/src/scala-2.12.1/lib/scala-actors-2.11.0.jar:/usr/local/src/scala-2.12.1/lib/akka-actor_2.11-2.3.4.jar:/usr/local/src/scala-2.12.1/lib/config-1.2.1.jar knucleotide 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT