/mobile Handheld Friendly website

 performance measurements

Each table row shows performance measurements for this Pascal Free Pascal program with a particular command-line input value N.

 N  CPU secs Elapsed secs Memory KB Code B ≈ CPU Load
50,0000.190.213961199  91% 0% 5% 5%
500,0001.921.955,6641199  0% 1% 100% 0%
5,000,00019.5519.56110,5641199  0% 0% 100% 0%

Read the ↓ make, command line, and program output logs to see how this program was run.

Read regex-dna benchmark to see what this program should do.

 notes

Free Pascal Compiler version 2.6.0 [2011/12/23] for x86_64

Don't split pattern at |

 regex-dna Pascal Free Pascal program source code

(*
  The Computer Language Benchmarks Game
  http://benchmarksgame.alioth.debian.org

  contributed by Vitaly Trifonov
*)

{$mode objfpc}


uses sysutils;

(******************************    pcre wrap   *****************************)

const
  libpcre = 'pcre';
  PCRE_CASELESS = $00000001;

type
  pcre = Pointer;
  pcre_extra = Pointer;
  PPChar = ^PChar;


function pcre_compile( const pattern: PChar;
                       options: Integer;
                       const errptr: PPChar;
                       erroffset: PInteger;
                       const tableptr: PChar ): pcre; cdecl; external libpcre;

function pcre_exec( const code: pcre;
                    const extra: pcre_extra;
                    const subject: PChar;
                    length, startoffset, options: Integer;
                    ovector: PInteger;
                    ovecsize: Integer ): Integer; cdecl; external libpcre;

function pcre_study( const external_re: pcre;
                     options: integer;
                     errorptr: PPChar ): pcre_extra; cdecl; external libpcre;

(***************************************************************************)

const
  patt: array[0..10] of PChar = ('B','D','H','K','M','N','R','S','V','W','Y');
  repl: array[0..10] of PChar = ('(c|g|t)', '(a|g|t)', '(a|c|t)', '(g|t)',
      '(a|c)','(a|c|g|t)', '(a|g)', '(c|g)', '(a|c|g)', '(a|t)', '(c|t)');

var
  patterns: array[0..8] of PChar =
    (
      'agggtaaa|tttaccct',
      '[cgt]gggtaaa|tttaccc[acg]',
      'a[act]ggtaaa|tttacc[agt]t',
      'ag[act]gtaaa|tttac[agt]ct',
      'agg[act]taaa|ttta[agt]cct',
      'aggg[acg]aaa|ttt[cgt]ccct',
      'agggt[cgt]aa|tt[acg]accct',
      'agggta[cgt]a|t[acg]taccct',
      'agggtaa[cgt]|[acg]ttaccct'
    );


(* Count match with pattern of regexp in seq buffer. *)
function count( const pattern, seq: PChar; len: Integer ): Longint;
var
  cre: pcre;
  cre_ex: pcre_extra;
  err: PChar;
  ofs: Integer;
  ind: Longint = 0;
  m: array[0..2] of Integer;
begin
  cre := pcre_compile(pattern, 0, @err, @ofs, nil);
  cre_ex := pcre_study(cre, 0, @err);
  m[1] := 0;

  while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    ind += 1;

  count := ind
end;

function split_count ( const pattern, seq: PChar; len: Integer ): Longint; inline;
var
  split: PChar;
  vcount: Longint;
begin
  split := strscan(pattern, '|');
  Byte(split^) := 0;

  vcount := count(pattern, seq, len);
  vcount += count(@split[1], seq, len);

  split^ := '|';
  split_count := vcount
end;

(* Substitute pattern of regexp with repl, return new length. *)
function subst( const pattern, repl: PChar; var seq: PChar; len: Integer ): Longint;
var
  cre: pcre;
  cre_ex: pcre_extra;
  err: PChar;
  ofs: Integer;
  size_repl, size, bsize, pos: Longint;
  m: array[0..2] of Integer;
  newSeq, otmpseq: PChar;
begin
  cre := pcre_compile(pattern, 0, @err, @ofs, nil);
  cre_ex := pcre_study(cre, 0, @err);
  size_repl := strlen(repl);
  m[1] := 0; size := 0;

(* Calculate required size for malloc. *)
  while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    size += size_repl - m[1] + m[0];
  size += len;

  GetMem(newSeq, SizeOf(Char)*size);

(* Do substitute. *)
  m[1] := 0; pos := 0;
  otmpseq := newSeq;


  if size_repl <> 0 then
    while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    begin
      bsize := m[0] - pos;
      strlcopy(otmpseq, @seq[pos], bsize);

      otmpseq := @otmpseq[bsize];
      pos := m[1];

      otmpseq := strecopy(otmpseq, repl);
    end
  else
    while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    begin
      bsize := m[0] - pos;
      strlcopy(otmpseq, @seq[pos], bsize);

      otmpseq := @otmpseq[bsize];
      pos := m[1];
    end;

  strcopy(otmpseq, @seq[pos]);

  FreeMem(seq);
  seq := newSeq;

  subst := size
end;


var
  readLen: Longint = 0;
  maxSeqLen: Longint = 6000000;
  seqLen: Longint = 0;
  seq, newSeq: PChar;
  ch: Char;
  i: Longint;
begin
  GetMem(seq, SizeOf(Char)*(maxSeqLen+1));

(* Read FASTA format file from stdin and count length. *)
  while not eof do
  begin
    if readLen = maxSeqLen then
    begin
      maxSeqLen += 3000000;
      seq := ReAllocMem(seq, SizeOf(Char)*(maxSeqLen+1));
    end;
    read(seq[readLen]);
    readLen += 1
  end;
  Byte(seq[readLen]) := 0; //end read data


(* Remove FASTA sequence descriptions and all linefeed characters.  *)
  seqLen := subst('>.*|\n', '', seq, readLen);


(* Count all matches of patterns[i] in  seq buffer. *)
  for i := 0 to 8 do
    writeln(patterns[i], ' ', split_count(patterns[i], seq, seqLen));
    //writeln(patterns[i], ' ', count(patterns[i], seq, seqLen));

  writeln;
  writeln(readLen);

  writeln(seqLen);
  //writeln(strlen(seq));

(* All IUB substitutes. *)
  for i := 0 to 10 do
    seqLen := subst(patt[i], repl[i], seq, seqLen);

  writeln(seqLen);
  //writeln(strlen(seq));

  FreeMem(seq);
end.

 make, command-line, and program output logs

Sat, 02 Feb 2013 00:48:50 GMT

MAKE:
mv regexdna.fpascal regexdna.pas
/usr/local/src/fpc-2.6.0.x86_64-linux/bin/fpc -FuInclude/fpascal -XXs -O3 -Tlinux  -oFPASCAL_RUN regexdna.pas
Free Pascal Compiler version 2.6.0 [2011/12/23] for x86_64
Copyright (c) 1993-2011 by Florian Klaempfl and others
Target OS: Linux for x86-64
Compiling regexdna.pas
regexdna.pas(161,8) Note: Local variable "newSeq" not used
regexdna.pas(162,3) Note: Local variable "ch" not used
Linking FPASCAL_RUN
/usr/bin/ld: warning: link.res contains output sections; did you forget -T?
204 lines compiled, 0.2 sec 
2 note(s) issued
mv FPASCAL_RUN regexdna.fpascal_run
rm regexdna.pas
0.21s to complete and log all make actions

COMMAND LINE:
./regexdna.fpascal_run 0 < regexdna-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
66800214

Revised BSD license

  Home   Conclusions   License   Play