The Computer Language
Benchmarks Game

regex-redux Go #9 program

source code

/* The Computer Language Benchmarks Game
 * http://benchmarksgame.alioth.debian.org/
 *
 * regex-dna program contributed by The Go Authors.
 * modified by Tylor Arndt.
 * modified by Chandra Sekar S to use optimized PCRE binding.
 * modified by Bert Gijsbers.
 * converted from regex-dna program
 */

package main

import (
    "fmt"
    "io/ioutil"
    "os"
    "runtime"
    "sync"

    "github.com/gijsbers/go-pcre"
)

const (
    comFlags = pcre.NEVER_UTF
    jitFlags = pcre.STUDY_JIT_COMPILE
)

var variants = []string{
    "agggtaaa|tttaccct",
    "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t",
    "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct",
    "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct",
    "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct",
}

type Substitution struct {
    pattern string
    replace string
    regexp  *pcre.Regexp
}

var substs = []Substitution{
    {"tHa[Nt]", "<4>", nil},
    {"aND|caN|Ha[DS]|WaS", "<3>", nil},
    {"a[NSt]|BY", "<2>", nil},
    {"<[^>]*>", "|", nil},
    {"\\|[^|][^|]*\\|", "-", nil},
}

func countMatches(pattern string, bytes []byte) (count int) {
    m := pcre.MustCompileJIT(pattern, comFlags, jitFlags).Matcher(bytes, 0)
    for f := m.Matches(); f; f = m.Match(bytes, 0) {
        count++
        bytes = bytes[m.Index()[1]:]
    }
    return
}

func readInput() (bytes []byte) {
    bytes, err := ioutil.ReadAll(os.Stdin)
    if err != nil {
        fmt.Fprintf(os.Stderr, "can't read input: %s\n", err)
        os.Exit(2)
    }
    return
}

// cleanData removes descriptions and newlines
func cleanData(bytes []byte) []byte {
    var regexp = pcre.MustCompileJIT("^>.*|\n", comFlags, jitFlags)
    var wg sync.WaitGroup
    ncpu := runtime.NumCPU()
    done := make([][]byte, ncpu)
    blen := len(bytes)
    todo := blen
    for i := 0; i < ncpu; i++ {
        size := todo / (ncpu - i)
        for size < todo && bytes[blen-todo+size-1] != '\n' {
            size++
        }
        work := bytes[blen-todo : blen-todo+size]
        wg.Add(1)
        go func(work []byte, index int) {
            done[index] = regexp.ReplaceAll(work, []byte{}, 0)
            wg.Done()
        }(work, i)
        todo -= size
    }
    wg.Wait()
    size := 0
    for i := 0; i < ncpu; i++ {
        size += len(done[i])
    }
    clean := make([]byte, 0, size)
    for i := 0; i < ncpu; i++ {
        clean = append(clean, done[i]...)
    }
    return clean
}

func substitute(bytes []byte, resultChan chan<- int) {
    var wg sync.WaitGroup
    ncpu := runtime.NumCPU()
    done := make([][]byte, ncpu)
    blen := len(bytes)
    todo := blen
    for i := 0; i < ncpu; i++ {
        size := todo / (ncpu - i)
        work := bytes[blen-todo : blen-todo+size]
        wg.Add(1)
        go func(work []byte, index int) {
            for i := range substs {
                sub := &substs[i]
                work = sub.regexp.ReplaceAll(work, []byte(sub.replace), 0)
            }
            done[index] = work
            wg.Done()
        }(work, i)
        todo -= size
    }
    wg.Wait()
    var sum int
    for i := 0; i < ncpu; i++ {
        sum += len(done[i])
    }
    resultChan <- sum
}

func precompile() {
    for i := range substs {
        re := pcre.MustCompileJIT(substs[i].pattern, comFlags, jitFlags)
        substs[i].regexp = &re
    }
}

func main() {
    var wg sync.WaitGroup
    wg.Add(1)
    go func() {
        precompile()
        wg.Done()
    }()

    bytes := readInput()
    inputLength := len(bytes)
    bytes = cleanData(bytes)
    cleanLength := len(bytes)

    wg.Wait()
    varResults := make([]chan int, len(variants))
    for i := range variants {
        varResults[i] = make(chan int, 1)
        go func(k int) {
            varResults[k] <- countMatches(variants[k], bytes)
        }(i)
    }

    subResult := make(chan int, 1)
    go substitute(bytes, subResult)

    for i := range variants {
        fmt.Printf("%s %d\n", variants[i], <-varResults[i])
    }

    fmt.Printf("\n%d\n%d\n%d\n", inputLength, cleanLength, <-subResult)
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
go version go1.8 linux/amd64


Mon, 20 Mar 2017 18:50:58 GMT

MAKE:
/usr/local/src/go/bin/go build -o regexredux.go-9.go_run
1.66s to complete and log all make actions

COMMAND LINE:
./regexredux.go-9.go_run 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

13c13
< 274070
---
> 273927

PROGRAM OUTPUT:
agggtaaa|tttaccct 3
[cgt]gggtaaa|tttaccc[acg] 12
a[act]ggtaaa|tttacc[agt]t 43
ag[act]gtaaa|tttac[agt]ct 27
agg[act]taaa|ttta[agt]cct 58
aggg[acg]aaa|ttt[cgt]ccct 16
agggt[cgt]aa|tt[acg]accct 15
agggta[cgt]a|t[acg]taccct 18
agggtaa[cgt]|[acg]ttaccct 20

508411
500000
274070