/mobile Handheld Friendly website

 performance measurements

Each table row shows performance measurements for this C++ g++ program with a particular command-line input value N.

 N      CPU secs  Elapsed secs  Memory KB  Code B  ≈ CPU Load
 500    0.07      0.07          ?          1283    0% 0% 0% 100%
 3,000  2.38      2.38          936        1283    0% 0% 1% 100%
 5,500  7.98      7.98          936        1283    1% 0% 0% 100%

Read the ↓ make, command line, and program output logs to see how this program was run.

Read spectral-norm benchmark to see what this program should do.

 notes

gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)

 spectral-norm C++ g++ #7 program source code

// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran
// Additional SSE optimization by Krzysztof Jakubowski

// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \
//     ./spec.c++ -o ./spec.run

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <sched.h>
#include <omp.h>
#include <emmintrin.h>

// Denominator of the matrix entry at (i, j): the triangular number of the
// anti-diagonal i + j, plus an offset along that diagonal, plus one.
// modei selects which coordinate provides the offset, so
// Index<false>(i, j) == Index<true>(j, i) (the transposed entry).
template <bool modei> int Index(int i, int j) {
    const int diag = i + j;
    const int tri = (diag * (diag + 1)) >> 1;
    if (modei)
        return tri + i + 1;
    return tri + j + 1;
}

// For every row r in [begin, end) computes
//     dst[r] = sum over c in [0, length) of src[c] / Index<modei>(r, c).
//
// Rows are processed eight at a time in four SSE2 registers; leftover rows
// fall through to the scalar loop at the bottom. Inside the vector loop the
// denominators are updated incrementally instead of re-evaluating Index:
//     Index<modei>(r, c) - Index<modei>(r, c - 1) == r + c + (modei ? 0 : 1)
// All denominators are integers held in doubles.
template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst,
                int begin, int end, int length) {
    // Extra per-row increment needed when the offset follows the column.
    const int bias = modei ? 0 : 1;
    int row = begin;

    while (row + 7 < end) {
        // Column 0 is handled up front so the inner loop can start at 1.
        const double head = src[0];
        const double d0 = double(Index<modei>(row + 0, 0));
        const double d1 = double(Index<modei>(row + 1, 0));
        const double d2 = double(Index<modei>(row + 2, 0));
        const double d3 = double(Index<modei>(row + 3, 0));
        const double d4 = double(Index<modei>(row + 4, 0));
        const double d5 = double(Index<modei>(row + 5, 0));
        const double d6 = double(Index<modei>(row + 6, 0));
        const double d7 = double(Index<modei>(row + 7, 0));

        // Running sums; the low lane holds the lower row of each pair.
        __m128d accA = _mm_set_pd(head / d1, head / d0);
        __m128d accB = _mm_set_pd(head / d3, head / d2);
        __m128d accC = _mm_set_pd(head / d5, head / d4);
        __m128d accD = _mm_set_pd(head / d7, head / d6);

        // Row-dependent part of the denominator increment.
        const __m128d stepA = _mm_set_pd(row + 1 + bias, row + 0 + bias);
        const __m128d stepB = _mm_set_pd(row + 3 + bias, row + 2 + bias);
        const __m128d stepC = _mm_set_pd(row + 5 + bias, row + 4 + bias);
        const __m128d stepD = _mm_set_pd(row + 7 + bias, row + 6 + bias);

        // Denominators of the previous column (initially column 0).
        __m128d denA = _mm_set_pd(d1, d0);
        __m128d denB = _mm_set_pd(d3, d2);
        __m128d denC = _mm_set_pd(d5, d4);
        __m128d denD = _mm_set_pd(d7, d6);

        for (int col = 1; col < length; ++col) {
            const __m128d colv = _mm_set1_pd(col);
            const __m128d numer = _mm_set1_pd(src[col]);

            // Advance each denominator from column col - 1 to column col.
            denA = _mm_add_pd(_mm_add_pd(denA, stepA), colv);
            denB = _mm_add_pd(_mm_add_pd(denB, stepB), colv);
            denC = _mm_add_pd(_mm_add_pd(denC, stepC), colv);
            denD = _mm_add_pd(_mm_add_pd(denD, stepD), colv);

            accA = _mm_add_pd(accA, _mm_div_pd(numer, denA));
            accB = _mm_add_pd(accB, _mm_div_pd(numer, denB));
            accC = _mm_add_pd(accC, _mm_div_pd(numer, denC));
            accD = _mm_add_pd(accD, _mm_div_pd(numer, denD));
        }

        // Unaligned stores: dst + row carries no alignment guarantee.
        _mm_storeu_pd(dst + row + 0, accA);
        _mm_storeu_pd(dst + row + 2, accB);
        _mm_storeu_pd(dst + row + 4, accC);
        _mm_storeu_pd(dst + row + 6, accD);

        row += 8;
    }

    // Scalar tail for the remaining (fewer than eight) rows.
    while (row < end) {
        double acc = 0;
        for (int col = 0; col < length; ++col)
            acc += src[col] / double(Index<modei>(row, col));
        dst[row] = acc;
        ++row;
    }
}

// Writes rows [begin, end) of the product using Index<true> denominators:
// dst[i] = sum over j in [0, N) of src[j] / Index<true>(i, j).
void EvalATimesU(double *src, double *dst, int begin, int end, int N) {
    EvalPart<true>(src, dst, begin, end, N);
}

// Writes rows [begin, end) of the product using Index<false> denominators
// (the transposed indexing):
// dst[i] = sum over j in [0, N) of src[j] / Index<false>(i, j).
void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) {
    EvalPart<false>(src, dst, begin, end, N);
}

// Computes this thread's slice [begin, end) of dst by applying EvalATimesU
// to src (result in tmp) and then EvalAtTimesU to tmp (result in dst).
//
// src, dst and tmp are arrays of N doubles. Each call writes only indices
// [begin, end) of tmp and dst but reads all N entries of its input, hence
// the barriers. Because the barriers are orphaned OpenMP directives, every
// thread of the enclosing parallel region has to make this call.
void EvalAtATimesU(double *src, double *dst, double *tmp,
                   int begin, int end, int N) {
    EvalATimesU (src, tmp, begin, end, N);
    // Every thread must finish its slice of tmp before any thread reads all of tmp.
    #pragma omp barrier
    EvalAtTimesU(tmp, dst, begin, end, N);
    // dst must be complete (and tmp no longer in use) before the caller continues.
    #pragma omp barrier
}

// Returns the number of CPUs in the calling thread's affinity mask, for use
// as the OpenMP team size.
//
// The result is always at least 1: if the affinity query fails (for example
// when the kernel mask does not fit in a cpu_set_t) or reports an empty set,
// 1 is returned, because num_threads() requires a positive value.
int GetThreadCount() {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
        return 1;

    const int count = CPU_COUNT(&cs);
    return (count > 0) ? count : 1;
}

// Runs ten rounds of the alternating iteration v = EvalAtATimesU(u),
// u = EvalAtATimesU(v), starting from u = all ones, and returns
// sqrt((u . v) / (v . v)).
//
// N is the vector length. For N <= 0 there is nothing to compute and 0.0 is
// returned.
double spectral_game(int N) {
    if (N <= 0)
        return 0.0;

    // Heap buffers instead of stack variable-length arrays: VLAs are not
    // standard C++ and three N-element arrays can overflow the stack for
    // large N. No special alignment is needed because the kernels only use
    // scalar loads and unaligned stores (_mm_storeu_pd).
    std::vector<double> uStore(N), vStore(N), tmpStore(N);
    double *u = &uStore[0];
    double *v = &vStore[0];
    double *tmp = &tmpStore[0];

    double vBv = 0.0;
    double vv = 0.0;

    #pragma omp parallel default(shared) num_threads(GetThreadCount())
    {
        // Executed once per thread of the team; variables declared inside
        // this block are private to each thread.
        int threadid = omp_get_thread_num();
        int threadcount = omp_get_num_threads();
        int chunk = N / threadcount;

        // Static partition: each thread owns [begin, end); the last thread
        // also takes the remainder when N is not a multiple of the team size.
        int begin = threadid * chunk;
        int end = (threadid < (threadcount - 1)) ? (begin + chunk) : N;

        for (int i = begin; i < end; i++)
            u[i] = 1.0;
        // All of u must be initialised before any thread reads it.
        #pragma omp barrier

        for (int ite = 0; ite < 10; ++ite) {
            EvalAtATimesU(u, v, tmp, begin, end, N);
            EvalAtATimesU(v, u, tmp, begin, end, N);
        }

        // Per-thread partial dot products over this thread's slice.
        double sumvb = 0.0, sumvv = 0.0;
        for (int i = begin; i < end; i++) {
            sumvv += v[i] * v[i];
            sumvb += u[i] * v[i];
        }

        // Serialise the accumulation into the shared totals.
        #pragma omp critical
        {
            vBv += sumvb;
            vv += sumvv;
        }
    }

    return sqrt(vBv / vv);
}

// Entry point: the optional first argument is the problem size N
// (default 2000); prints the result with nine decimal places.
int main(int argc, char *argv[]) {
    int N = 2000;
    if (argc >= 2)
        N = atoi(argv[1]);

    const double norm = spectral_game(N);
    printf("%.9f\n", norm);
    return 0;
}

 make, command-line, and program output logs

Mon, 28 Oct 2013 20:20:27 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse2 -fopenmp -O0 spectralnorm.gpp-7.c++ -o spectralnorm.gpp-7.c++.o &&  \
        /usr/bin/g++ spectralnorm.gpp-7.c++.o -o spectralnorm.gpp-7.gpp_run -fopenmp 
rm spectralnorm.gpp-7.c++
0.22s to complete and log all make actions

COMMAND LINE:
./spectralnorm.gpp-7.gpp_run 5500

PROGRAM OUTPUT:
1.274224153

Revised BSD license

  Home   Conclusions   License   Play