The Computer Language
Benchmarks Game

regex-redux Java #3 program

source code

/*
   The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   contributed by Francois Green
*/

import java.io.*;

import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.Map.Entry;
import java.util.function.*;
import java.util.regex.*;

import static java.util.stream.Collectors.*;

public class regexredux {

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    {
        byte[] buf = new byte[65536];
        int count;
        while ((count = System.in.read(buf)) > 0) {
            baos.write(buf, 0, count);
        }
    }
    final String input = baos.toString("US-ASCII");

    final int initialLength = input.length();

    final String sequence = input.replaceAll(">.*\n|\n", "");

    CompletableFuture<String> replacements = CompletableFuture.supplyAsync(() -> {
        final Map<String, String> iub = new LinkedHashMap<>();
        iub.put("tHa[Nt]", "<4>");
        iub.put("aND|caN|Ha[DS]|WaS", "<3>");
        iub.put("a[NSt]|BY", "<2>");
        iub.put("<[^>]*>", "|");
        iub.put("\\|[^|][^|]*\\|", "-");

        String buffer = sequence;
        for (Map.Entry<String, String> entry : iub.entrySet()) {
            buffer = Pattern.compile(entry.getKey()).matcher(buffer).replaceAll(entry.getValue());
        }
        return buffer;
    });

    final int codeLength = sequence.length();

    final List<String> variants = Arrays.asList("agggtaaa|tttaccct",
                                                "[cgt]gggtaaa|tttaccc[acg]",
                                                "a[act]ggtaaa|tttacc[agt]t",
                                                "ag[act]gtaaa|tttac[agt]ct",
                                                "agg[act]taaa|ttta[agt]cct",
                                                "aggg[acg]aaa|ttt[cgt]ccct",
                                                "agggt[cgt]aa|tt[acg]accct",
                                                "agggta[cgt]a|t[acg]taccct",
                                                "agggtaa[cgt]|[acg]ttaccct");

    BiFunction<String, String, Entry<String, Long>> counts = (v, s) -> {
      Long count = Pattern.compile(v).splitAsStream(s).count() - 1; //Off by one
      return new AbstractMap.SimpleEntry<>(v, count);
    };

    final Map<String, Long> results = variants.parallelStream()
                                              .map(variant -> counts.apply(variant, sequence))
                                              .collect(toMap(Map.Entry::getKey, Map.Entry::getValue));

    variants.forEach(variant -> System.out.println(variant + " " + results.get(variant)));

    System.out.println();
    System.out.println(initialLength);
    System.out.println(codeLength);
    System.out.println(replacements.join().length());
  }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
java version "1.8.0_121"
Java(TM) SE Runtime Environment (build 1.8.0_121-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)


Mon, 20 Mar 2017 17:55:12 GMT

MAKE:
mv regexredux.java-3.java regexredux.java
/usr/local/src/jdk1.8.0_121/bin/javac -d .  regexredux.java
4.09s to complete and log all make actions

COMMAND LINE:
/usr/local/src/jdk1.8.0_121/bin/java   regexredux 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361