diff options
Diffstat (limited to 'eigen/bench/bench_gemm.cpp')
-rw-r--r-- | eigen/bench/bench_gemm.cpp | 117 |
1 files changed, 93 insertions, 24 deletions
diff --git a/eigen/bench/bench_gemm.cpp b/eigen/bench/bench_gemm.cpp index 41ca8b3..8528c55 100644 --- a/eigen/bench/bench_gemm.cpp +++ b/eigen/bench/bench_gemm.cpp @@ -2,6 +2,14 @@ // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out +// Compilation options: +// +// -DSCALAR=std::complex<double> +// -DSCALARA=double or -DSCALARB=double +// -DHAVE_BLAS +// -DDECOUPLED +// + #include <iostream> #include <Eigen/Core> #include <bench/BenchTimer.h> @@ -14,10 +22,18 @@ using namespace Eigen; #define SCALAR float #endif +#ifndef SCALARA +#define SCALARA SCALAR +#endif + +#ifndef SCALARB +#define SCALARB SCALAR +#endif + typedef SCALAR Scalar; typedef NumTraits<Scalar>::Real RealScalar; -typedef Matrix<RealScalar,Dynamic,Dynamic> A; -typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B; +typedef Matrix<SCALARA,Dynamic,Dynamic> A; +typedef Matrix<SCALARB,Dynamic,Dynamic> B; typedef Matrix<Scalar,Dynamic,Dynamic> C; typedef Matrix<RealScalar,Dynamic,Dynamic> M; @@ -129,35 +145,69 @@ int main(int argc, char ** argv) int tries = 2; // number of tries, we keep the best int s = 2048; - int cache_size = -1; + int m = s; + int n = s; + int p = s; + int cache_size1=-1, cache_size2=l2, cache_size3 = 0; bool need_help = false; - for (int i=1; i<argc; ++i) + for (int i=1; i<argc;) { - if(argv[i][0]=='s') - s = atoi(argv[i]+1); - else if(argv[i][0]=='c') - cache_size = atoi(argv[i]+1); - else if(argv[i][0]=='t') - tries = atoi(argv[i]+1); - else if(argv[i][0]=='p') - rep = atoi(argv[i]+1); + if(argv[i][0]=='-') + { + if(argv[i][1]=='s') + { + ++i; + s = atoi(argv[i++]); + m = n = p = s; + if(argv[i][0]!='-') + { + n = atoi(argv[i++]); + p = atoi(argv[i++]); + } + } + else if(argv[i][1]=='c') + { + ++i; + cache_size1 = atoi(argv[i++]); + if(argv[i][0]!='-') + { + cache_size2 = atoi(argv[i++]); + if(argv[i][0]!='-') + cache_size3 = atoi(argv[i++]); + } + } + else if(argv[i][1]=='t') + { + ++i; + tries = atoi(argv[i++]); + } + else if(argv[i][1]=='p') + { + ++i; + rep = atoi(argv[i++]); + } + } else + { need_help = true; + break; + } } if(need_help) { - std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n"; + std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n"; + std::cout << " <matrix sizes> : size\n"; + std::cout << " <matrix sizes> : rows columns depth\n"; return 1; } - if(cache_size>0) - setCpuCacheSizes(cache_size,96*cache_size); - - int m = s; - int n = s; - int p = s; +#if EIGEN_VERSION_AT_LEAST(3,2,90) + if(cache_size1>0) + setCpuCacheSizes(cache_size1,cache_size2,cache_size3); +#endif + A a(m,p); a.setRandom(); B b(p,n); b.setRandom(); C c(m,n); c.setOnes(); @@ -172,6 +222,7 @@ int main(int argc, char ** argv) // check the parallel product is correct #if defined EIGEN_HAS_OPENMP + Eigen::initParallel(); int procs = omp_get_max_threads(); if(procs>1) { @@ -188,11 +239,20 @@ int main(int argc, char ** argv) #elif defined HAVE_BLAS blas_gemm(a,b,r); c.noalias() += a * b; - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } #else - gemm(a,b,c); - r.noalias() += a.cast<Scalar>() * b.cast<Scalar>(); - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(1.*m*n*p<2000.*2000*2000) + { + gemm(a,b,c); + r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() ); + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } + } #endif #ifdef HAVE_BLAS @@ -214,7 +274,7 @@ int main(int argc, char ** argv) { BenchTimer tmono; omp_set_num_threads(1); - Eigen::internal::setNbThreads(1); + Eigen::setNbThreads(1); c = rc; BENCH(tmono, tries, rep, gemm(a,b,c)); std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; @@ -223,6 +283,15 @@ int main(int argc, char ** argv) } #endif + if(1.*m*n*p<30*30*30) + { + BenchTimer tmt; + c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + } + #ifdef DECOUPLED if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex)) { |