performance measurements

Each table row shows performance measurements for this Erlang HiPE program with a particular command-line input value N.

 N  CPU secs Elapsed secs Memory KB Code B ≈ CPU Load
50,0000.200.20?1198  0% 0% 5% 100%
500,0000.840.8425,7001198  0% 0% 1% 100%
5,000,0007.387.39145,8401198  0% 1% 0% 100%

Read the ↓ make, command line, and program output logs to see how this program was run.

Read regex-dna benchmark to see what this program should do.

 notes

Erlang R16B (erts-5.10.1) [source] [async-threads:10] [hipe] [kernel-poll:false]

The program rewrites the regular expressions as string variants and then uses sub-string search and replace (ala Boyer-More and Aho-Corassick).

 regex-dna Erlang HiPE #7 program source code

% The Computer Language Benchmarks Game
% http://benchmarksgame.alioth.debian.org/
% Contributed by: Hynek Vychodil 2010
% Inspired by regex-dna Erlang HiPE #5 program
%    by Sergei Matusevich 2007 and Thanassis Avgerinos 2009

% Main changes:
%   1/ Very fast Port line input instead stdio (~5x)
%   2/ Faster IUB code alternatives explicit expansion
%      using binary instead lists (~5x)
%   3/ Precompile regexps in data loading phase
%   4/ Simpler dispatch and result join code
%   5/ Use binary:matches for pattern match


-module(regexdna).

-compile([native, {hipe, [o3]}]).

-export([main/0, main/1]).

main(_) -> main().

main() -> do(), halt().

do() ->
  S = self(),
  Worker = spawn_link(fun () -> work(S) end),
  Worker ! {data, read()},
  receive finish -> ok end.

work(Master) ->
  S = self(),
  Patterns = [{Pat, compile_pattern(Pat)}
    || Pat <- patterns()],
  {RawSize, [B3, B2, B1 | _]} = receive
    {data, Data} -> Data
  end,
  [L1, L2, L3] = L = [size(X) || X <- [B1, B2, B3]],
  Size = lists:sum(L),
  PIDS = [{spawn_link(matcher(S, B2, B3, MR)),
      printer(Pat)}
    || {Pat, MR} <- Patterns],
  ExpandedSize = L1 + L3 + size(expand(B2, L2, 0, <<>>)),
  results(PIDS),
  io:format("~n~b~n~b~n~b~n",
    [RawSize, Size, ExpandedSize]),
  Master ! finish.

expand(B, S, I, R) when I < S ->
  case B of
    <<_:I/binary, $B, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(c|g|t)">>);
    <<_:I/binary, $D, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|g|t)">>);
    <<_:I/binary, $H, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|c|t)">>);
    <<_:I/binary, $K, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(g|t)">>);
    <<_:I/binary, $M, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|c)">>);
    <<_:I/binary, $N, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|c|g|t)">>);
    <<_:I/binary, $R, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|g)">>);
    <<_:I/binary, $S, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(c|g)">>);
    <<_:I/binary, $V, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|c|g)">>);
    <<_:I/binary, $W, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(a|t)">>);
    <<_:I/binary, $Y, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, "(c|t)">>);
    <<_:I/binary, X, _/binary>> ->
      expand(B, S, I + 1, <<R/binary, X>>)
  end;
expand(_, _, _, R) -> R.

matcher(S, B2, B3, MR) ->
  fun () ->
      S !
      {self(), countMatches(B2, MR) + countMatches(B3, MR)}
  end.

printer(Pat) ->
  fun (Num) -> io:format("~s ~b~n", [Pat, Num]) end.

countMatches(Data, RE) ->
  length(binary:matches(Data,RE)).

results([{PID, Fin} | R]) ->
  receive {PID, Ret} -> Fin(Ret), results(R) end;
results([]) -> ok.

patterns() ->
  ["agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t",
    "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct",
    "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct",
    "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct"].

read() ->
   Port = open_port({fd, 0, 1}, [in, binary, {line, 256}]),
   read(Port, 0, [], []).

read(Port, Size, Seg, R) ->
  receive
    {Port, {data, {eol, <<$>:8, _/binary>> = Line}}} ->
      read(Port, Size + size(Line) + 1, [],
        [iolist_to_binary(lists:reverse(Seg, [])) | R]);
    {Port, {data, {eol, Line}}} ->
      read(Port, Size + size(Line) + 1, [Line | Seg], R);
    {'EXIT', Port, normal} ->
      {Size, [list_to_binary(lists:reverse(Seg, [])) | R]};
    Other ->
      io:format(">>>>>>> Wrong! ~p~n", [Other]),
      exit(bad_data)
  end.

compile_pattern(RE) ->
  binary:compile_pattern(
    lists:append([multbin(X) || X<-string:tokens(RE, "|")])
  ).

multbin(X) ->
  [list_to_binary(Y) || Y <- mult(X, [])].

mult([$[|R], P) ->
  {C, Suffix} = var(R, []),
  [ lists:reverse(P, [X|Y])
    || X<-C, Y<-mult(Suffix, [])];
mult([X|R], P) -> mult(R, [X|P]);
mult([], P) -> [lists:reverse(P)].

var([$]|R], C) -> {C, R};
var([X|R], C) -> var(R, [X|C]).

 make, command-line, and program output logs

Mon, 04 Mar 2013 22:44:38 GMT

MAKE:
mv regexdna.hipe-7.hipe regexdna.erl
/usr/local/src/otp_src_R16B_nosmp/bin/erlc +native +"{hipe, [o3]}"  regexdna.erl
3.34s to complete and log all make actions

COMMAND LINE:
/usr/local/src/otp_src_R16B_nosmp/bin/erl -smp disable -noshell -run -noinput -run regexdna main 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214

Revised BSD license

  Home   Conclusions   License   Play