/mobile Handheld Friendly website

 performance measurements

Each table row shows performance measurements for this C++ g++ program with a particular command-line input value N.

 N  CPU secs Elapsed secs Memory KB Code B ≈ CPU Load
1,0000.040.05?1239  0% 0% 0% 100%
4,0000.670.681,9921239  3% 1% 6% 100%
16,00010.4710.4829,4481239  0% 1% 1% 100%

Read the ↓ make, command line, and program output logs to see how this program was run.

Read mandelbrot benchmark to see what this program should do.

 notes

gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)

 mandelbrot C++ g++ program source code

/*   The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   contributed by Paolo Bonzini
   further optimized by Jason Garrett-Glaser
   OpenMP by The Anh Tran
*/

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <sched.h>
#include <memory.h>

// need "-fopenmp" flag when compile
#include <omp.h>

#define L2_CACHE_LINE   64
#define BYTE_A_TIME      L2_CACHE_LINE
#define COLUMN_FETCH    (BYTE_A_TIME * 8)


typedef double   v2d   __attribute__ ((vector_size(16))); // vector of two doubles

const v2d v10   = { 1.0, 1.0 };
const v2d v15   = { 1.5, 1.5 };
const v2d v40   = { 4.0, 4.0 };

v2d inv_2n;   // {2.0/N, 2.0/N}
v2d inv_4n;   // {4.0/N, 4.0/N}


int 
GetThreadCount()
{
   cpu_set_t cs;
   CPU_ZERO(&cs);
   sched_getaffinity(0, sizeof(cs), &cs);

   int count = 0;
   for (int i = 0; i < CPU_SETSIZE; ++i)
   {
      if (CPU_ISSET(i, &cs))
         ++count;
   }
   return count;
}


struct MB_Element
{
private:
   v2d   Crv, Civ, Zrv, Ziv, Trv, Tiv;
   
public:
   // Z1 is point [x, y],   Z2 is point [x+1, y]
   // r = 3 <=> |Z2| <= 4   |Z1| <= 4
   // r = 2 <=> |Z2| > 4   |Z1| <= 4
   // r = 1 <=> |Z2| <= 4   |Z1| > 4
   // r = 0 <=> |Z2| > 4    |Z1| > 4
   int result;
   
   // construct 2 elements from C.real & C.img
   // C.real = Coordinate.x * 2 / N -1.5
   // C.img = Coordinate.y * 2 / N -1.0
   MB_Element(int r, v2d cimg)
   {
      double tmp[2] = {r+1, r};
      Crv = __builtin_ia32_loadupd(tmp);
      
      Crv = Crv * inv_2n - v15;
      Civ = cimg;

      Zrv = Crv;
      Ziv = cimg;

      Trv = Crv * Crv;
      Tiv = cimg * cimg;

      result = 3; // assume that 2 elements belong to MB set
   }

   // construct 2 elements, next to passed MB_Element object
   // Passed object: Tuple(Z1 = {x, y}, Z2 = {x+1, y})
   // Newly construct object: Tuple({x+2, y}, {x+3, y})
   MB_Element(MB_Element const& o)
   {
      Crv = o.Crv + inv_4n;   // c2 = (c1+2)*N = c1*N + 2*N
      Civ = o.Civ;
      
      Zrv = Crv;
      Ziv = o.Ziv;
      
      Trv = Crv * Crv;
      Tiv = o.Tiv;
      
      result = 3;
   }

   int
   eval()
   {
      v2d ZZ = Zrv * Ziv;
      Zrv = Trv - Tiv + Crv;
      Ziv = ZZ + ZZ + Civ;
      Trv = Zrv * Zrv;
      Tiv = Ziv * Ziv;

      // delta = (Trv + Tiv) <= 4.0 ? 0xff : 0x00
      v2d delta = (v2d)__builtin_ia32_cmplepd( (Trv + Tiv), v40 );
      // mask-out elements that goes outside MB set
      result &= __builtin_ia32_movmskpd(delta);

      return result;
   }
};

void 
mandelbrot(int N, char* data)
{
   // counter of each line, how many columns are processed
   __attribute__ ((aligned(L2_CACHE_LINE))) int jobs[N];
   memset(jobs, 0, sizeof(jobs));

   #pragma omp parallel default(shared) firstprivate(data) num_threads(GetThreadCount())
   {
      // foreach line
      for (int y = 0; y < N; ++y, data += (N >> 3)) 
      {
         // Calculate C.img = y*2/N -1.0
         v2d Civ = {y, y};
         Civ = Civ * inv_2n - v10;

         // Divide task for each thread here:
         // claim that me (this thread) will handle K-not-yet-process columns
         // K/8 bytes output should fit cache line size.
         int x;
         while ((x = __sync_fetch_and_add(jobs + y, COLUMN_FETCH)) < N)
         {
            int limit = std::min(x +COLUMN_FETCH, N);
            // unroll loop, evaluate 8 columns at once
            for (; x < limit; x += 8)
            {
               // each MB_Element represents 2 mandelbrot points
               MB_Element   e1(x, Civ), e2(e1), e3(e2), e4(e3);
               
               int i = 1;
               while ( (e1.result || e2.result || e3.result || e4.result) 
                     && (i++ < 50) )
               {
                  e1.eval();
                  e2.eval();
                  e3.eval();
                  e4.eval();
               }   
               
               int byte_acc = (e1.result << 6)|(e2.result << 4)|(e3.result << 2)|e4.result;
               data[x >> 3] = static_cast<char>(byte_acc);
            } // end foreach (column)
         }
      } // end foreach (line)
   } // end parallel region
}


int 
main (int argc, char **argv)
{
   int N = (argc == 2) ? atoi(argv[1]) : 200;
   assert((N % 8) == 0);

   printf("P4\n%d %d\n", N, N);
   int width_bytes = N >> 3;

   {
      double t[2];
      t[0] = t[1] = 2.0 / N;
      inv_2n = __builtin_ia32_loadupd(t);
      inv_4n = inv_2n + inv_2n;   // 4.0/N
   }

   char* data = 0;
   assert(   posix_memalign(reinterpret_cast<void**>(&data), L2_CACHE_LINE, width_bytes * N)
         == 0);

   mandelbrot(N, data);

   fwrite( data, width_bytes, N, stdout);
   free(data);

   return 0;
}

 make, command-line, and program output logs

Mon, 28 Oct 2013 19:41:06 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse2 -fopenmp -mfpmath=sse -msse2 mandelbrot.c++ -o mandelbrot.c++.o &&  \
        /usr/bin/g++ mandelbrot.c++.o -o mandelbrot.gpp_run -fopenmp 
rm mandelbrot.c++
0.27s to complete and log all make actions

COMMAND LINE:
./mandelbrot.gpp_run 16000

(BINARY) PROGRAM OUTPUT NOT SHOWN

Revised BSD license

  Home   Conclusions   License   Play