/mobile Handheld Friendly website
Ubuntu : Intel® Q6600® one core
Each table row shows performance measurements for this C++ g++ program with a particular command-line input value N.
| N | CPU secs | Elapsed secs | Memory KB | Code B | ≈ CPU Load |
|---|---|---|---|---|---|
| 500 | 0.07 | 0.07 | ? | 1283 | 0% 13% 0% 86% |
| 3,000 | 2.38 | 2.39 | 924 | 1283 | 1% 0% 0% 100% |
| 5,500 | 8.30 | 8.31 | 924 | 1283 | 0% 0% 0% 100% |
Read the ↓ make, command line, and program output logs to see how this program was run.
Read spectral-norm benchmark to see what this program should do.
gcc (Ubuntu/Linaro 4.7.3-1ubuntu1) 4.7.3
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran
// Additional SSE optimization by Krzysztof Jakubowski
//
// Build:
//   g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 ./spec.c++ -o ./spec.run
//
// Without -fopenmp the OpenMP pragmas are ignored and the program runs
// single-threaded.

#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <vector>
#include <sched.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include <emmintrin.h>

// Denominator used for matrix entry (i, j): the value is
//   (i + j) * (i + j + 1) / 2 + (modei ? i : j) + 1
// and callers divide by it. modei selects which of the two indices is added,
// i.e. which of the two mirrored matrices is addressed.
// The product is evaluated in 64-bit arithmetic so it cannot overflow for
// any pair of non-negative int indices.
template <bool modei>
long long Index(int i, int j)
{
    const long long s = static_cast<long long>(i) + j;
    return ((s * (s + 1)) >> 1) + (modei ? i : j) + 1;
}

// Computes dst[i] = sum over j in [0, length) of src[j] / Index<modei>(i, j)
// for every i in [begin, end).
//
// Rows are processed eight at a time with SSE2 (four __m128d accumulators of
// two rows each); leftover rows use the plain scalar loop at the bottom.
// src and dst must not overlap (both are declared __restrict__).
template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst,
              int begin, int end, int length)
{
    // Index<modei>(i, j) - Index<modei>(i, j - 1) equals i + j when modei is
    // true and i + j + 1 otherwise, so the per-row step base is i or i + 1.
    const int shift = modei ? 0 : 1;

    int i = begin;

    for (; i + 7 < end; i += 8)
    {
        // Denominators for column 0 of the eight rows i .. i + 7.
        double first[8];
        for (int k = 0; k < 8; ++k)
            first[k] = double(Index<modei>(i + k, 0));

        const double s0 = src[0];

        // _mm_set_pd takes (high, low): the low lane holds the lower row.
        __m128d sum1 = _mm_set_pd(s0 / first[1], s0 / first[0]);
        __m128d sum2 = _mm_set_pd(s0 / first[3], s0 / first[2]);
        __m128d sum3 = _mm_set_pd(s0 / first[5], s0 / first[4]);
        __m128d sum4 = _mm_set_pd(s0 / first[7], s0 / first[6]);

        const __m128d i1 = _mm_set_pd(i + 1 + shift, i + 0 + shift);
        const __m128d i2 = _mm_set_pd(i + 3 + shift, i + 2 + shift);
        const __m128d i3 = _mm_set_pd(i + 5 + shift, i + 4 + shift);
        const __m128d i4 = _mm_set_pd(i + 7 + shift, i + 6 + shift);

        // Running denominators, advanced incrementally instead of being
        // recomputed from Index() for every column.
        __m128d last1 = _mm_set_pd(first[1], first[0]);
        __m128d last2 = _mm_set_pd(first[3], first[2]);
        __m128d last3 = _mm_set_pd(first[5], first[4]);
        __m128d last4 = _mm_set_pd(first[7], first[6]);

        for (int j = 1; j < length; j++)
        {
            const __m128d col = _mm_set1_pd(j);
            const __m128d val = _mm_set1_pd(src[j]);

            last1 = _mm_add_pd(_mm_add_pd(last1, i1), col);
            last2 = _mm_add_pd(_mm_add_pd(last2, i2), col);
            last3 = _mm_add_pd(_mm_add_pd(last3, i3), col);
            last4 = _mm_add_pd(_mm_add_pd(last4, i4), col);

            sum1 = _mm_add_pd(sum1, _mm_div_pd(val, last1));
            sum2 = _mm_add_pd(sum2, _mm_div_pd(val, last2));
            sum3 = _mm_add_pd(sum3, _mm_div_pd(val, last3));
            sum4 = _mm_add_pd(sum4, _mm_div_pd(val, last4));
        }

        // Unaligned stores: dst + i has no alignment guarantee because
        // begin is an arbitrary per-thread offset.
        _mm_storeu_pd(dst + i + 0, sum1);
        _mm_storeu_pd(dst + i + 2, sum2);
        _mm_storeu_pd(dst + i + 4, sum3);
        _mm_storeu_pd(dst + i + 6, sum4);
    }

    // Scalar tail for the remaining (fewer than eight) rows.
    for (; i < end; i++)
    {
        double sum = 0;
        for (int j = 0; j < length; j++)
            sum += src[j] / double(Index<modei>(i, j));
        dst[i] = sum;
    }
}

// dst[begin..end) = rows [begin, end) of the product using Index<true>.
void EvalATimesU(double *src, double *dst, int begin, int end, int N)
{
    EvalPart<1>(src, dst, begin, end, N);
}

// dst[begin..end) = rows [begin, end) of the product using Index<false>.
void EvalAtTimesU(double *src, double *dst, int begin, int end, int N)
{
    EvalPart<0>(src, dst, begin, end, N);
}

// Applies EvalATimesU (src -> tmp) and then EvalAtTimesU (tmp -> dst) for the
// calling thread's row range. Each stage reads the whole input vector, so
// every thread of the team must reach the barrier before the next stage
// starts. Must be called by all threads of the enclosing parallel region.
void EvalAtATimesU(double *src, double *dst, double *tmp,
                   int begin, int end, int N)
{
    EvalATimesU(src, tmp, begin, end, N);
    #pragma omp barrier

    EvalAtTimesU(tmp, dst, begin, end, N);
    #pragma omp barrier
}

// Number of CPUs in the calling process's affinity mask. Returns 1 when the
// mask cannot be queried or is empty, so the result is always a valid
// argument for the OpenMP num_threads clause.
int GetThreadCount()
{
    cpu_set_t cs;
    CPU_ZERO(&cs);
    if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
        return 1;

    int count = 0;
    for (int i = 0; i < CPU_SETSIZE; ++i)
        if (CPU_ISSET(i, &cs))
            ++count;

    return count > 0 ? count : 1;
}

// Runs ten rounds of the alternating u -> v -> u update, starting from a
// vector of ones, and returns sqrt((u . v) / (v . v)).
// Returns 0.0 when N is not positive.
double spectral_game(int N)
{
    if (N <= 0)
        return 0.0;

    // Heap storage instead of variable-length stack arrays: standard C++ and
    // no stack overflow for large N.
    const std::size_t len = static_cast<std::size_t>(N);
    std::vector<double> u_store(len), v_store(len), tmp_store(len);
    double *u = u_store.data();
    double *v = v_store.data();
    double *tmp = tmp_store.data();

    double vBv = 0.0;
    double vv = 0.0;

    const int team = GetThreadCount();

    #pragma omp parallel default(shared) num_threads(team)
    {
        // Everything declared inside this block is private to each thread.
#ifdef _OPENMP
        const int threadid = omp_get_thread_num();
        const int threadcount = omp_get_num_threads();
#else
        const int threadid = 0;
        const int threadcount = 1;
#endif

        // Static schedule: each thread owns rows [begin, end); the last
        // thread also takes the remainder of N / threadcount.
        const int chunk = N / threadcount;
        const int begin = threadid * chunk;
        const int end = (threadid < (threadcount - 1)) ? (begin + chunk) : N;

        for (int i = begin; i < end; i++)
            u[i] = 1.0;
        #pragma omp barrier

        for (int ite = 0; ite < 10; ++ite)
        {
            EvalAtATimesU(u, v, tmp, begin, end, N);
            EvalAtATimesU(v, u, tmp, begin, end, N);
        }

        // Per-thread partial dot products, merged under a critical section.
        double sumvb = 0.0, sumvv = 0.0;
        for (int i = begin; i < end; i++)
        {
            sumvv += v[i] * v[i];
            sumvb += u[i] * v[i];
        }

        #pragma omp critical
        {
            vBv += sumvb;
            vv += sumvv;
        }
    }

    return std::sqrt(vBv / vv);
}

// Usage: program [N]   (N defaults to 2000 and must be a positive integer)
int main(int argc, char *argv[])
{
    const int N = (argc >= 2) ? atoi(argv[1]) : 2000;
    if (N <= 0)
    {
        fprintf(stderr, "usage: %s [N]  (N must be a positive integer)\n",
                (argc > 0 && argv[0]) ? argv[0] : "spectralnorm");
        return 1;
    }

    printf("%.9f\n", spectral_game(N));
    return 0;
}
Tue, 30 Apr 2013 05:56:05 GMT
MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse2 -fopenmp -O0 spectralnorm.gpp-7.c++ -o spectralnorm.gpp-7.c++.o && \
/usr/bin/g++ spectralnorm.gpp-7.c++.o -o spectralnorm.gpp-7.gpp_run -fopenmp
rm spectralnorm.gpp-7.c++
0.25s to complete and log all make actions
COMMAND LINE:
./spectralnorm.gpp-7.gpp_run 5500
PROGRAM OUTPUT:
1.274224153