/mobile Handheld Friendly website
Ubuntu : Intel® Q6600® one core |
Each table row shows performance measurements for this C++ g++ program with a particular command-line input value N.
| N | CPU secs | Elapsed secs | Memory KB | Code B | ≈ CPU Load |
|---|---|---|---|---|---|
| 500 | 0.12 | 0.12 | ? | 1278 | 0% 0% 0% 100% |
| 3,000 | 4.06 | 4.06 | 644 | 1278 | 0% 0% 0% 100% |
| 5,500 | 13.54 | 13.55 | 644 | 1278 | 0% 0% 0% 100% |
Read the ↓ make, command line, and program output logs to see how this program was run.
Read spectral-norm benchmark to see what this program should do.
gcc (Ubuntu/Linaro 4.7.3-1ubuntu1) 4.7.3
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran

// Fastest with this flag: -Os
// g++ -pipe -Os -fomit-frame-pointer -march=native -fopenmp -mfpmath=sse -msse2 ./spec.c++ -o ./spec.run

#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <sched.h>
#include <omp.h>

// SIMD data type: 2 doubles packed in one 16-byte vector (one XMM register).
typedef double v2dt __attribute__((vector_size(16)));

// Vector constant {1.0, 1.0}; used as the numerator of the reciprocals and
// as the fill value of the start vector.
v2dt const v1 = {1.0, 1.0};

// Per-thread view of the shared work vectors. Each vector is reachable both
// as scalars (double*) and as packed pairs (v2dt*) through a union, so pair
// index k of the xmm_* view covers scalar elements 2k and 2k+1.
struct Param
{
    union
    {
        double* u;          // source
        v2dt*   xmm_u;
    };

    union
    {
        double* tmp;        // temporary
        v2dt*   xmm_tmp;
    };

    union
    {
        double* v;          // destination
        v2dt*   xmm_v;
    };

    int length;             // source/destination vector length
    int half_length;        // length / 2: number of complete pairs

    int r_begin;            // working range of this thread: [r_begin, r_end)
    int r_end;

    double vBv;             // partial sum of u[i]*v[i], written by final_sum
    double vv;              // partial sum of v[i]*v[i], written by final_sum
};

// Return: 1.0 / ((i + j) * (i + j + 1) / 2 + i + 1)
double eval_A(int i, int j)
{
    int d = (((i + j) * (i + j + 1)) >> 1) + i + 1;
    return 1.0 / d;
}

// Return two matrix entries packed as [double1, double2].
// The argument that is NOT selected by inc_i is a pair index and is doubled
// first, so that it addresses the first scalar element of the pair.
//   inc_i == true : i is the pair index -> [A(2i, j), A(2i + 1, j)]
//   inc_i == false: j is the pair index -> [A(i, 2j), A(i, 2j + 1)]
// where A(i, j) = 1.0 / ((i + j) * (i + j + 1) / 2 + i + 1).
template<bool inc_i>
v2dt eval_A_xmm(int i, int j)
{
    if (inc_i)
        i <<= 1;
    else
        j <<= 1;

    int d1 = (((i + j) * (i + j + 1)) >> 1) + i + 1;
    int d2 = (((i + 1 + j) * (i + 1 + j + 1)) >> 1) + i + 1;

    // Stepping i (rather than j) also bumps the trailing "+ i + 1" term.
    if (inc_i)
        d2 += 1;

    // Explicit conversions: int -> double inside a brace initializer is a
    // narrowing conversion, which C++11 and later diagnose.
    v2dt r = { static_cast<double>(d1), static_cast<double>(d2) };
    return v1 / r;
}

// Horizontal add: sum of the two lanes of x.
double hz_add(v2dt x)
{
    double const* val = reinterpret_cast<double const*>(&x);
    return val[0] + val[1];
}

// tmp[i] = sum_j A(i, j) * u[j], for every i in this thread's range.
void eval_A_times_u(Param const &p)
{
    for (int i = p.r_begin, ie = p.r_end; i < ie; ++i)
    {
        v2dt sum = {0, 0};

        // xmm = 2 doubles => pair index in [0 .. length/2)
        int j = 0;
        for (; j < p.half_length; ++j)
            sum += eval_A_xmm<false>(i, j) * p.xmm_u[j];

        p.tmp[i] = hz_add(sum);

        // If the source vector has odd size, one element is left over.
        // This loop body runs at most once.
        for (j = j * 2; j < p.length; ++j)
            p.tmp[i] += eval_A(i, j) * p.u[j];
    }
}

// v[i] = sum_j A(j, i) * tmp[j], for every i in this thread's range.
void eval_At_times_u(Param const &p)
{
    for (int i = p.r_begin, ie = p.r_end; i < ie; ++i)
    {
        v2dt sum = {0, 0};

        int j = 0;
        for (; j < p.half_length; ++j)
            sum += eval_A_xmm<true>(j, i) * p.xmm_tmp[j];

        p.v[i] = hz_add(sum);

        // Left-over element when the vector length is odd.
        for (j = j * 2; j < p.length; ++j)
            p.v[i] += eval_A(j, i) * p.tmp[j];
    }
}

// Each thread modifies its portion in the destination vector
// -> barrier needed to sync access
void eval_AtA_times_u(Param const &p)
{
    eval_A_times_u(p);
    #pragma omp barrier

    eval_At_times_u(p);
    #pragma omp barrier
}

// Compute this thread's partial sums over [r_begin, r_end):
//   p.vBv = sum u[i] * v[i]
//   p.vv  = sum v[i] * v[i]
//
// The range is split into an optional leading scalar element (when r_begin
// is odd), a run of complete pairs processed with SIMD, and an optional
// trailing scalar element. Starting the SIMD loop at pair r_begin/2 instead
// would pull in element r_begin-1 whenever r_begin is odd; that element is
// already summed by the thread owning the preceding range and would be
// counted twice.
void final_sum(Param& p)
{
    v2dt sum_vBv = {0, 0};
    v2dt sum_vv  = {0, 0};
    double head_vBv = 0.0;
    double head_vv  = 0.0;

    int i = p.r_begin;

    // Leading element that does not start on a pair boundary.
    if ((i & 1) != 0 && i < p.r_end)
    {
        head_vBv = p.u[i] * p.v[i];
        head_vv  = p.v[i] * p.v[i];
        ++i;
    }

    // Complete pairs lying entirely inside the range; i is even here.
    for (; i + 1 < p.r_end; i += 2)
    {
        int const k = i >> 1;
        sum_vv  += p.xmm_v[k] * p.xmm_v[k];
        sum_vBv += p.xmm_u[k] * p.xmm_v[k];
    }

    p.vBv = hz_add(sum_vBv) + head_vBv;
    p.vv  = hz_add(sum_vv)  + head_vv;

    // Trailing element when the range ends on an odd index.
    for (; i < p.r_end; ++i)
    {
        p.vBv += p.u[i] * p.v[i];
        p.vv  += p.v[i] * p.v[i];
    }
}

// Set u[i] = 1.0 for every i in this thread's range [r_begin, r_end).
// Only elements inside the range are written, so no two threads store to
// the same element.
void fill_10(Param const& p)
{
    int i = p.r_begin;

    // Leading element that does not start on a pair boundary.
    if ((i & 1) != 0 && i < p.r_end)
        p.u[i++] = 1.0;

    // Complete pairs lying entirely inside the range; i is even here.
    for (; i + 1 < p.r_end; i += 2)
        p.xmm_u[i >> 1] = v1;

    // Trailing element when the range ends on an odd index.
    for (; i < p.r_end; ++i)
        p.u[i] = 1.0;
}

// Search for appropriate number of threads to spawn: the number of CPUs in
// this process's affinity mask. Falls back to 1 if the mask cannot be read
// or is empty, because num_threads() requires a positive value.
int GetThreadCount()
{
    cpu_set_t cs;
    CPU_ZERO(&cs);
    if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
        return 1;

    int count = 0;
    for (int i = 0; i < CPU_SETSIZE; ++i)
    {
        if (CPU_ISSET(i, &cs))
            ++count;
    }
    return (count > 0) ? count : 1;
}

// Run 10 rounds of u -> v = (At A) u, v -> u = (At A) v, starting from
// u = {1, 1, ...}, and return sqrt(sum(u*v) / sum(v*v)).
// N must be positive: it sizes the stack arrays below.
double spectral_game(int N)
{
    // Align to a 64-byte boundary (cache line); this also satisfies the
    // 16-byte alignment needed for the v2dt views in Param.
    __attribute__((aligned(64))) double u[N];
    __attribute__((aligned(64))) double tmp[N];
    __attribute__((aligned(64))) double v[N];

    double vBv = 0.0;
    double vv  = 0.0;

    #pragma omp parallel default(shared) num_threads(GetThreadCount())
    {
        // this block is executed by every thread of the team
        // variables declared in this block are private to each thread
        int threadid    = omp_get_thread_num();
        int threadcount = omp_get_num_threads();
        int chunk       = N / threadcount;

        Param my_param;

        my_param.u   = u;
        my_param.tmp = tmp;
        my_param.v   = v;

        my_param.length      = N;
        my_param.half_length = N / 2;

        // calculate each thread's working range [r1 .. r2) => static schedule
        // the last thread also takes the remainder N % threadcount
        my_param.r_begin = threadid * chunk;
        my_param.r_end   = (threadid < (threadcount - 1)) ? (my_param.r_begin + chunk) : N;

        fill_10(my_param);
        #pragma omp barrier

        // Evaluating
        for (int ite = 0; ite < 10; ++ite)
        {
            my_param.u = u;     // source is u
            my_param.v = v;     // desti is v
            eval_AtA_times_u(my_param);

            my_param.u = v;     // source is v
            my_param.v = u;     // desti is u
            eval_AtA_times_u(my_param);
        }

        my_param.u = u;
        my_param.v = v;
        final_sum(my_param);

        // Merge the per-thread partial sums into the shared totals.
        #pragma omp critical
        {
            vBv += my_param.vBv;
            vv  += my_param.vv;
        }
    } // parallel region

    return sqrt(vBv / vv);
}

// Usage: program [N]   (N defaults to 2000)
int main(int argc, char *argv[])
{
    int N = ((argc >= 2) ? atoi(argv[1]) : 2000);

    // spectral_game sizes stack arrays with N; reject sizes it cannot handle.
    if (N <= 0)
    {
        fprintf(stderr, "N must be a positive integer\n");
        return 1;
    }

    printf("%.9f\n", spectral_game(N));
    return 0;
}
Tue, 30 Apr 2013 05:57:50 GMT
MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse2 -fopenmp -O0 spectralnorm.gpp-8.c++ -o spectralnorm.gpp-8.c++.o && \
/usr/bin/g++ spectralnorm.gpp-8.c++.o -o spectralnorm.gpp-8.gpp_run -fopenmp
rm spectralnorm.gpp-8.c++
0.15s to complete and log all make actions
COMMAND LINE:
./spectralnorm.gpp-8.gpp_run 5500
PROGRAM OUTPUT:
1.274224153