The Computer Language
Benchmarks Game

regex-dna Java #6 program

source code

/*
   The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   contributed by Michael Stover
   modified by Stefan Feldbinder
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads all of standard input as US-ASCII text, removes every {@code '>'}-to-end-of-line
 * span and every newline, then prints:
 * <ol>
 *   <li>for each of nine fixed regular expressions, the expression followed by the number
 *       of matches found in the stripped text;</li>
 *   <li>an empty line;</li>
 *   <li>the length of the raw input, the length of the stripped text, and the length of
 *       the stripped text after each of the upper-case letters {@code WYKMSRBDVHN} has
 *       been replaced by a parenthesised alternation.</li>
 * </ol>
 */
public final class regexdna {

  /**
   * Maps a one-letter upper-case code to the alternation group that replaces it.
   *
   * @param in the matched text; expected to be a single character
   * @return the replacement group, or {@code null} when {@code in} is not one of the
   *     eleven handled codes
   */
  private static String getReplacement(final String in) {
    // Only one-character strings can equal any of the handled codes.
    if (in.length() != 1) {
      return null;
    }
    switch (in.charAt(0)) {
      case 'W':
        return "(a|t)";
      case 'Y':
        return "(c|t)";
      case 'K':
        return "(g|t)";
      case 'M':
        return "(a|c)";
      case 'S':
        return "(c|g)";
      case 'R':
        return "(a|g)";
      case 'B':
        return "(c|g|t)";
      case 'D':
        return "(a|g|t)";
      case 'V':
        return "(a|c|g)";
      case 'H':
        return "(a|c|t)";
      case 'N':
        return "(a|c|g|t)";
      default:
        return null;
    }
  }

  /**
   * Drains {@code System.in} in 4 KiB chunks and decodes the collected bytes as US-ASCII.
   *
   * @return everything that was read from standard input
   * @throws IOException if reading fails
   */
  private static String readStdin() throws IOException {
    final ByteArrayOutputStream collected = new ByteArrayOutputStream();
    final byte[] chunk = new byte[4 * 1024];
    for (int n = System.in.read(chunk); n > 0; n = System.in.read(chunk)) {
      collected.write(chunk, 0, n);
    }
    return collected.toString("US-ASCII");
  }

  /**
   * Counts the successive (non-overlapping) matches of {@code regex} in {@code text}.
   *
   * @param regex the regular expression to search for
   * @param text the text to scan
   * @return the number of times {@link Matcher#find()} succeeded
   */
  private static int countMatches(final String regex, final String text) {
    final Matcher matcher = Pattern.compile(regex).matcher(text);
    int hits = 0;
    while (matcher.find()) {
      hits++;
    }
    return hits;
  }

  /**
   * Produces a copy of {@code text} in which every letter matched by
   * {@code [WYKMSRBDVHN]} is replaced by the result of {@link #getReplacement(String)}.
   *
   * @param text the text to expand
   * @return the buffer holding the expanded text
   */
  private static StringBuffer expandCodes(final String text) {
    final StringBuffer expanded = new StringBuffer();
    final Matcher codes = Pattern.compile("[WYKMSRBDVHN]").matcher(text);
    while (codes.find()) {
      // Copy the unmatched text preceding this code, dropping the code itself ...
      codes.appendReplacement(expanded, "");
      // ... then append its expansion literally.
      expanded.append(getReplacement(codes.group()));
    }
    codes.appendTail(expanded);
    return expanded;
  }

  /**
   * Program entry point; see the class comment for the output format.
   *
   * @param args ignored
   * @throws IOException if standard input cannot be read
   */
  public static void main(String[] args) throws IOException {
    final String input = readStdin();

    // Same effect as input.replaceAll(...): drop '>'-to-end-of-line spans and newlines.
    final String sequence = Pattern.compile(">.*\n|\n").matcher(input).replaceAll("");

    final String[] variants = {
      "agggtaaa|tttaccct",
      "[cgt]gggtaaa|tttaccc[acg]",
      "a[act]ggtaaa|tttacc[agt]t",
      "ag[act]gtaaa|tttac[agt]ct",
      "agg[act]taaa|ttta[agt]cct",
      "aggg[acg]aaa|ttt[cgt]ccct",
      "agggt[cgt]aa|tt[acg]accct",
      "agggta[cgt]a|t[acg]taccct",
      "agggtaa[cgt]|[acg]ttaccct"
    };

    for (int i = 0; i < variants.length; i++) {
      final String variant = variants[i];
      System.out.println(variant + " " + countMatches(variant, sequence));
    }

    final int expandedLength = expandCodes(sequence).length();

    System.out.println();
    System.out.println(input.length());
    System.out.println(sequence.length());
    System.out.println(expandedLength);
  }
}
    

notes, command-line, and program output

NOTES:
32-bit Ubuntu one core
java version "1.8.0_92"
Java(TM) SE Runtime Environment (build 1.8.0_92-b14)
Java HotSpot(TM) Server VM (build 25.92-b14, mixed mode)


Sun, 22 May 2016 18:28:09 GMT

MAKE:
mv regexdna.java-6.java regexdna.java
/usr/local/src/jdk1.8.0_92/bin/javac regexdna.java
2.67s to complete and log all make actions

COMMAND LINE:
/usr/local/src/jdk1.8.0_92/bin/java  -server -XX:+TieredCompilation -XX:+AggressiveOpts regexdna 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214