The Computer Language
Benchmarks Game

k-nucleotide C++ g++ program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   contributed by Malte Skarupke
*/

#include <iostream>
#include <iomanip>
#include <cstdint>
#include <cstdio>
#include <string>
#include <cstring>
#include <algorithm>
#include <future>
#include <unistd.h>
#include <vector>
#include <ext/pb_ds/assoc_container.hpp>

constexpr const unsigned char tochar[8] =
{
   'A', 'A',
   'C', 'C',
   'T', 'T',
   'G', 'G'
};

// the mask used in the running hash. for a string of size 1
// this is 0b111, for a string of size 2 this is 0b11111,
// for a string of size 4 it's 0b111111111 etc.
// so it sets the lower 2n+1 bits to 1.
//
// this makes it so that the running hash can just keep on shifting
// to the left, and this mask will remove higher bits
uint64_t mask_for_size(unsigned size)
{
   return (1llu << ((size << 1llu) + 1llu)) - 1llu;
}

// incremental hash function that can hash strings of the
// characters 'a', 'A', 'c', 'C', 't', 'T', 'g', 'G'
// it works by realizing that the second and third last bit
// of these are all different. 'a' ends with 0b001,
// 'c' ends with 0b011, 't' ends with 0b100 and 'g'
// ends with 0b111. so we use the mask 0b110 to tell
// them all apart
struct RunningHash
{
   RunningHash()
      : data(0)
   {
   }
   RunningHash(const std::string& s)
      : data(0)
   {
      uint64_t mask = mask_for_size(s.size());
      for (char c : s)
      {
         push(c, mask);
      }
   }
   void push(char c, uint64_t mask)
   {
      // shift old characters over
      data <<= 2;
      // mask out oldest character
      data &= mask;
      // add two bits from the new character
      data |= c & 0b110;
   }
   bool operator<(const RunningHash & in)const
   {
      return data < in.data;
   }
   bool operator==(const RunningHash & in)const
   {
      return data == in.data;
   }
   std::string to_string(unsigned size)const
   {
      std::string tmp;
      tmp.reserve(size);
      uint64_t tmp1 = data;
      for(unsigned i = 0; i != size; ++i)
      {
         tmp += tochar[tmp1 & 0b110];
         tmp1 >>= 2;
      }
      std::reverse(tmp.begin(), tmp.end());
      return tmp;
   }
   struct hash
   {
      uint64_t operator()(const RunningHash & t) const
      {
         return t.data;
      }
   };
   uint64_t data;
};

// comment this next line to run a single-threaded version
#define MULTI_THREADED
#ifdef MULTI_THREADED
std::launch launch_policy = std::launch::async;
#else
std::launch launch_policy = std::launch::deferred;
#endif

typedef __gnu_pbds::cc_hash_table
<
   RunningHash,
   unsigned,
   RunningHash::hash
> HashMap;

HashMap calculate(const char * begin, const char * end, unsigned size)
{
   HashMap frequencies;
   RunningHash tmp;
   uint64_t mask = mask_for_size(size);
   // push the first n-1 characters without adding them to the hash
   // table. need to do this to initialize the incremental hash
   for (const char * init_end = begin + size - 1; begin != init_end; ++begin)
   {
      tmp.push(*begin, mask);
   }
   // add one characer at a time, counting how often each pattern occurs
   for (; begin != end; ++begin)
   {
      tmp.push(*begin, mask);
      ++frequencies[RunningHash(tmp)];
   }
   return frequencies;
}

void write_frequencies(const HashMap & f, unsigned input_size, unsigned size)
{
   std::vector<std::pair<unsigned, std::string>> freq_sorted;
   freq_sorted.reserve(f.size());
   for(auto && i: f)
   {
      freq_sorted.emplace_back(i.second, i.first.to_string(size));
   }
   std::sort(freq_sorted.begin(), freq_sorted.end(),
         [](const auto & l, const auto & r)
   {
      return l.first > r.first;
   });
   unsigned sum = input_size + 1 - size;
   for(auto && i : freq_sorted)
      std::cout << i.second << ' ' << (double(100 * i.first) / sum) << '\n';
   std::cout << '\n';
}

unsigned compute_count(const std::string & input, const std::string& string)
{
   // compute all values, but then only read one value. the problem description
   // requires that we compute all values
   const char * c_str_end = input.c_str() + input.size();
   return calculate(input.c_str(), c_str_end, string.size())[string];
}

void write_single_count(unsigned count, const char * string)
{
   std::cout << count << '\t' << string << '\n';
}

int main()
{
   std::string input;
   char buffer[256];
   while (fgets(buffer, 100, stdin) && memcmp(">THREE", buffer, 6) != 0)
   {
   }
   while (fgets(buffer, 100, stdin) && buffer[0] != '>')
   {
      if (buffer[0] != ';')
      {
         input.append(buffer, strlen(buffer) - 1);
      }
   }

   std::cout << std::setprecision(3) << std::setiosflags(std::ios::fixed);

   // start all tasks at once on background threads
   std::future<unsigned> GGTATTTTAATTTATAGT
      = std::async(launch_policy, compute_count, input, "GGTATTTTAATTTATAGT");
   std::future<unsigned> GGTATTTTAATT
      = std::async(launch_policy, compute_count, input, "GGTATTTTAATT");
   std::future<unsigned> GGTATT
      = std::async(launch_policy, compute_count, input, "GGTATT");
   std::future<unsigned> GGTA
      = std::async(launch_policy, compute_count, input, "GGTA");
   std::future<unsigned> GGT
      = std::async(launch_policy, compute_count, input, "GGT");
   const char * begin = input.data();
   const char * end = input.data() + input.size();
   std::future<HashMap> freq2
      = std::async(launch_policy, calculate, begin, end, 2);
   // do one task on the main thread
   write_frequencies(calculate(begin, end, 1), input.size(), 1);
   // get and write results from background threads
   write_frequencies(freq2.get(), input.size(), 2);
   write_single_count(GGT.get(), "GGT");
   write_single_count(GGTA.get(), "GGTA");
   write_single_count(GGTATT.get(), "GGTATT");
   write_single_count(GGTATTTTAATT.get(), "GGTATTTTAATT");
   write_single_count(GGTATTTTAATTTATAGT.get(), "GGTATTTTAATTTATAGT");
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
g++ (Ubuntu 6.3.0-12ubuntu2) 6.3.0 20170406


Fri, 14 Apr 2017 01:51:29 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native  -std=c++14 knucleotide.c++ -o knucleotide.c++.o &&  \
        /usr/bin/g++ knucleotide.c++.o -o knucleotide.gpp_run -Wl,--no-as-needed -lpthread 
rm knucleotide.c++
4.15s to complete and log all make actions

COMMAND LINE:
./knucleotide.gpp_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT