performance measurements

Each table row shows performance measurements for this C++ g++ program with a particular command-line input value N.

 N      CPU secs  Elapsed secs  Memory KB  Code B  ≈ CPU Load
 500    0.14      0.14          ?          1330    0% 0% 0% 93%
 3,000  4.88      4.88          644        1330    1% 1% 0% 100%
 5,500  16.34     16.34         644        1330    0% 1% 1% 100%

Read the ↓ make, command line, and program output logs to see how this program was run.

Read spectral-norm benchmark to see what this program should do.

 notes

gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)

 spectral-norm C++ g++ #2 program source code

// The Computer Language Benchmarks Game 
// http://shootout.alioth.debian.org/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran
// Additional SSE optimization by Krzysztof Jakubowski

// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 ./spec.c++ -o ./spec.run

#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <sched.h>
#include <omp.h>
#include <emmintrin.h>

// Returns (i + j) * (i + j + 1) / 2 + i + 1, the denominator used for
// entry (i, j) by EvalPart's scalar tail.
//
// The intermediate product is formed in 64 bits: with plain int arithmetic
// (i + j) * (i + j + 1) overflows (undefined behaviour) as soon as
// i + j >= 46341, even though the final result still fits in an int for
// every i + j up to 65534.
inline int Index(int i, int j) {
   const long long diagonal = static_cast<long long>(i) + j;
   const long long triangular = (diagonal * (diagonal + 1)) >> 1;
   return static_cast<int>(triangular + i + 1);
}

// Multiplies the eight unsigned 16-bit lanes of a and b into full 32-bit
// products. outA receives the products of lanes 0..3 and outB those of
// lanes 4..7, each as four 32-bit lanes.
inline void MultiplyShorts(__m128i a, __m128i b, __m128i &outA, __m128i &outB) {
   // Upper and lower 16 bits of every 16x16 product, computed separately.
   const __m128i highHalves = _mm_mulhi_epu16(a, b);
   const __m128i lowHalves = _mm_mullo_epi16(a, b);

   // Interleaving low/high halves rebuilds each 32-bit product in place.
   outA = _mm_unpacklo_epi16(lowHalves, highHalves);
   outB = _mm_unpackhi_epi16(lowHalves, highHalves);
}

template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst, int begin, int end, int length) __attribute((noinline));

// For every i in [begin, end) stores into dst[i] the sum over j in
// [0, length) of src[j] / D, where D is Index(i, j) when modei is true and
// Index(j, i) otherwise.
//
// Rows are handled eight at a time with SSE2; leftover rows go through a
// scalar loop. The vector path forms i + j and i + j + 1 in 16-bit lanes and
// multiplies them as unsigned 16-bit values, so it assumes i + j + 1 never
// exceeds 65535.
template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst, int begin, int end, int length) {
   const __m128i one16 = _mm_set1_epi16(1);
   int row = begin;

   // Vector path: eight consecutive output rows per pass.
   while(row + 7 < end) {
      const __m128i rows16 = _mm_set_epi16(row + 7, row + 6, row + 5, row + 4, row + 3, row + 2, row + 1, row + 0);
      const __m128i rowsNext16 = _mm_add_epi16(rows16, one16);
      // Per-row additive term (i + 1) for rows 0..3 and 4..7 of this group.
      const __m128i rowTermLo = _mm_set_epi32(row + 4, row + 3, row + 2, row + 1);
      const __m128i rowTermHi = _mm_set_epi32(row + 8, row + 7, row + 6, row + 5);

      // One accumulator per pair of rows.
      __m128d acc01 = _mm_set1_pd(0.0);
      __m128d acc23 = _mm_set1_pd(0.0);
      __m128d acc45 = _mm_set1_pd(0.0);
      __m128d acc67 = _mm_set1_pd(0.0);

      for(int col = 0; col < length; ++col) {
         const __m128i col16 = _mm_set1_epi16(col);

         // (i + j) * (i + j + 1) for all eight rows, widened to 32 bits.
         __m128i denomLo, denomHi;
         MultiplyShorts(_mm_add_epi16(rows16, col16), _mm_add_epi16(rowsNext16, col16), denomLo, denomHi);

         // Halve the product, then add i + 1 (modei) or j + 1 (otherwise).
         denomLo = _mm_srli_epi32(denomLo, 1);
         denomHi = _mm_srli_epi32(denomHi, 1);
         if(modei) {
            denomLo = _mm_add_epi32(denomLo, rowTermLo);
            denomHi = _mm_add_epi32(denomHi, rowTermHi);
         }
         else {
            const __m128i colTerm = _mm_set1_epi32(col + 1);
            denomLo = _mm_add_epi32(denomLo, colTerm);
            denomHi = _mm_add_epi32(denomHi, colTerm);
         }

         // Convert the integer denominators to doubles, two lanes at a time;
         // the shuffle moves lanes 2 and 3 down into lanes 0 and 1.
         const __m128d div01 = _mm_cvtepi32_pd(denomLo);
         const __m128d div23 = _mm_cvtepi32_pd(_mm_shuffle_epi32(denomLo, 2 + (3 << 2)));
         const __m128d div45 = _mm_cvtepi32_pd(denomHi);
         const __m128d div67 = _mm_cvtepi32_pd(_mm_shuffle_epi32(denomHi, 2 + (3 << 2)));

         const __m128d numer = _mm_set1_pd(src[col]);
         acc01 = _mm_add_pd(acc01, _mm_div_pd(numer, div01));
         acc23 = _mm_add_pd(acc23, _mm_div_pd(numer, div23));
         acc45 = _mm_add_pd(acc45, _mm_div_pd(numer, div45));
         acc67 = _mm_add_pd(acc67, _mm_div_pd(numer, div67));
      }

      double *const out = dst + row;
      _mm_storeu_pd(out + 0, acc01);
      _mm_storeu_pd(out + 2, acc23);
      _mm_storeu_pd(out + 4, acc45);
      _mm_storeu_pd(out + 6, acc67);

      row += 8;
   }

   // Scalar tail for the rows that do not fill a group of eight.
   for(; row < end; ++row) {
      double acc = 0;
      for(int col = 0; col < length; ++col) {
         const int denom = modei ? Index(row, col) : Index(col, row);
         acc += src[col] / double(denom);
      }
      dst[row] = acc;
   }
}

// Returns the number of threads to spawn: the count of CPUs in the calling
// thread's affinity mask, as reported by sched_getaffinity().
//
// The result is never less than 1. It is fed to an OpenMP num_threads()
// clause, which requires a positive value, so a failed affinity query or an
// empty mask falls back to a single thread instead of yielding 0.
int GetThreadCount() {
   cpu_set_t cs;
   CPU_ZERO(&cs);
   if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
      return 1;

   const int count = CPU_COUNT(&cs);
   return (count > 0) ? count : 1;
}

// Starts from u[i] = 1.0 and, ten times over, runs the sequence
//    tmp = EvalPart<1>(u), v = EvalPart<0>(tmp),
//    tmp = EvalPart<1>(v), u = EvalPart<0>(tmp)
// across an OpenMP team, then returns sqrt(sum(u[i] * v[i]) / sum(v[i] * v[i])).
//
// The three work vectors are variable-length arrays of N doubles on the
// caller's stack, shared by all threads of the team.
double spectral_game(int N) {
   __attribute__((aligned(64))) double u[N];
   __attribute__((aligned(64))) double tmp[N];
   __attribute__((aligned(64))) double v[N];

   // Totals accumulated from every thread's partial sums.
   double vv = 0.0;
   double vBv = 0.0;

   #pragma omp parallel default(shared) num_threads(GetThreadCount())
   {
      // Everything declared inside this region is private to each thread.
      const int teamSize = omp_get_num_threads();
      const int self = omp_get_thread_num();
      const int share = N / teamSize;

      // Each thread owns the rows [first, last); the final thread also takes
      // whatever remains after the even split.
      const int first = self * share;
      int last = N;
      if (self < (teamSize - 1))
         last = first + share;

      for (int k = first; k < last; ++k)
         u[k] = 1.0;
      #pragma omp barrier

      // Every EvalPart call reads the whole source vector, so all threads
      // must finish writing their slice before the next step starts.
      for (int round = 0; round < 10; ++round) {
         EvalPart<1>(u, tmp, first, last, N);
         #pragma omp barrier
         EvalPart<0>(tmp, v, first, last, N);
         #pragma omp barrier
         EvalPart<1>(v, tmp, first, last, N);
         #pragma omp barrier
         EvalPart<0>(tmp, u, first, last, N);
         #pragma omp barrier
      }

      // Partial dot products over this thread's own slice.
      double localVv = 0.0;
      double localVBv = 0.0;
      for (int k = first; k < last; ++k) {
         localVv += v[k] * v[k];
         localVBv += u[k] * v[k];
      }

      // Merge into the shared totals one thread at a time.
      #pragma omp critical
      {
         vBv += localVBv;
         vv += localVv;
      }
   } // parallel region

   return sqrt(vBv / vv);
}


// Program entry point.
//
// Reads the problem size N from the first command-line argument (2000 when
// no argument is given) and prints spectral_game(N) with nine decimals.
//
// N is validated before use because spectral_game() uses it to size its
// work arrays: a value that is missing digits, zero, negative, or too large
// for an int is rejected with a message on stderr and exit status 1.
// Trailing characters after the number are ignored, as atoi() would.
int main(int argc, char *argv[]) {
   int N = 2000;

   if (argc >= 2) {
      // strtol yields 0 when no digits are present and saturates on
      // overflow, so both cases are caught by the checks below.
      const long requested = strtol(argv[1], NULL, 10);
      const int narrowed = static_cast<int>(requested);
      if (requested <= 0 || narrowed != requested) {
         fprintf(stderr, "%s: N must be a positive integer, got \"%s\"\n", argv[0], argv[1]);
         return 1;
      }
      N = narrowed;
   }

   printf("%.9f\n", spectral_game(N));
   return 0;
}

 make, command-line, and program output logs

Thu, 24 Apr 2014 03:48:19 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse2 -fopenmp -O0 spectralnorm.gpp-2.c++ -o spectralnorm.gpp-2.c++.o &&  \
        /usr/bin/g++ spectralnorm.gpp-2.c++.o -o spectralnorm.gpp-2.gpp_run -fopenmp 
rm spectralnorm.gpp-2.c++
0.21s to complete and log all make actions

COMMAND LINE:
./spectralnorm.gpp-2.gpp_run 5500

PROGRAM OUTPUT:
1.274224153

Revised BSD license

  Home   Conclusions   License   Play