From 35f7829af10c61e33dd2e2a7a015058e11a11ea0 Mon Sep 17 00:00:00 2001 From: Stanislaw Halik Date: Sat, 25 Mar 2017 14:17:07 +0100 Subject: update --- eigen/bench/BenchTimer.h | 10 +- eigen/bench/analyze-blocking-sizes.cpp | 876 +++++++++++++++++++++ eigen/bench/benchCholesky.cpp | 16 +- eigen/bench/bench_gemm.cpp | 117 ++- eigen/bench/bench_norm.cpp | 117 +-- eigen/bench/benchmark-blocking-sizes.cpp | 677 ++++++++++++++++ eigen/bench/btl/CMakeLists.txt | 31 +- eigen/bench/btl/actions/action_axpby.hh | 2 +- eigen/bench/btl/actions/action_axpy.hh | 2 +- eigen/bench/btl/actions/basic_actions.hh | 2 +- eigen/bench/btl/cmake/FindACML.cmake | 2 + eigen/bench/btl/cmake/FindATLAS.cmake | 26 +- eigen/bench/btl/cmake/FindBLAZE.cmake | 31 + eigen/bench/btl/cmake/FindCBLAS.cmake | 1 + eigen/bench/btl/cmake/FindGOTO.cmake | 15 - eigen/bench/btl/cmake/FindGOTO2.cmake | 25 - eigen/bench/btl/cmake/FindOPENBLAS.cmake | 17 + eigen/bench/btl/data/action_settings.txt | 34 +- eigen/bench/btl/data/perlib_plot_settings.txt | 4 +- eigen/bench/btl/generic_bench/bench.hh | 4 +- eigen/bench/btl/generic_bench/bench_parameter.hh | 4 +- eigen/bench/btl/generic_bench/btl.hh | 4 +- .../bench/btl/generic_bench/init/init_function.hh | 8 +- eigen/bench/btl/generic_bench/init/init_matrix.hh | 10 +- eigen/bench/btl/generic_bench/init/init_vector.hh | 2 +- .../generic_bench/timers/portable_perf_analyzer.hh | 2 +- .../btl/generic_bench/timers/portable_timer.hh | 46 +- .../bench/btl/generic_bench/utils/size_lin_log.hh | 2 +- eigen/bench/btl/libs/BLAS/CMakeLists.txt | 29 +- eigen/bench/btl/libs/BLAS/blas_interface_impl.hh | 12 +- eigen/bench/btl/libs/BLAS/c_interface_base.h | 6 +- eigen/bench/btl/libs/BLAS/main.cpp | 12 +- eigen/bench/btl/libs/STL/STL_interface.hh | 28 +- eigen/bench/btl/libs/blaze/CMakeLists.txt | 13 + eigen/bench/btl/libs/blaze/blaze_interface.hh | 141 ++++ eigen/bench/btl/libs/blaze/main.cpp | 40 + eigen/bench/btl/libs/eigen2/eigen2_interface.hh | 2 +- 
eigen/bench/btl/libs/eigen3/eigen3_interface.hh | 64 +- eigen/bench/btl/libs/eigen3/main_adv.cpp | 14 +- eigen/bench/btl/libs/eigen3/main_matmat.cpp | 2 +- eigen/bench/btl/libs/tensors/CMakeLists.txt | 44 ++ eigen/bench/btl/libs/tensors/main_linear.cpp | 23 + eigen/bench/btl/libs/tensors/main_matmat.cpp | 21 + eigen/bench/btl/libs/tensors/main_vecmat.cpp | 21 + eigen/bench/btl/libs/tensors/tensor_interface.hh | 105 +++ eigen/bench/dense_solvers.cpp | 186 +++++ eigen/bench/eig33.cpp | 57 +- eigen/bench/perf_monitoring/changesets.txt | 71 ++ eigen/bench/perf_monitoring/gemm.cpp | 12 + eigen/bench/perf_monitoring/gemm_common.h | 67 ++ eigen/bench/perf_monitoring/gemm_settings.txt | 15 + .../bench/perf_monitoring/gemm_square_settings.txt | 11 + eigen/bench/perf_monitoring/gemv.cpp | 12 + eigen/bench/perf_monitoring/gemv_common.h | 69 ++ eigen/bench/perf_monitoring/gemv_settings.txt | 11 + .../bench/perf_monitoring/gemv_square_settings.txt | 13 + eigen/bench/perf_monitoring/gemvt.cpp | 12 + eigen/bench/perf_monitoring/lazy_gemm.cpp | 101 +++ eigen/bench/perf_monitoring/lazy_gemm_settings.txt | 15 + eigen/bench/perf_monitoring/llt.cpp | 15 + eigen/bench/perf_monitoring/make_plot.sh | 98 +++ .../perf_monitoring/resources/chart_footer.html | 37 + .../perf_monitoring/resources/chart_header.html | 46 ++ eigen/bench/perf_monitoring/resources/footer.html | 3 + eigen/bench/perf_monitoring/resources/header.html | 42 + eigen/bench/perf_monitoring/resources/s1.js | 1 + eigen/bench/perf_monitoring/resources/s2.js | 1 + eigen/bench/perf_monitoring/run.sh | 172 ++++ eigen/bench/perf_monitoring/runall.sh | 63 ++ eigen/bench/perf_monitoring/trmv_lo.cpp | 12 + eigen/bench/perf_monitoring/trmv_lot.cpp | 12 + eigen/bench/perf_monitoring/trmv_up.cpp | 12 + eigen/bench/perf_monitoring/trmv_upt.cpp | 12 + eigen/bench/spbench/CMakeLists.txt | 2 +- eigen/bench/spbench/spbenchstyle.h | 3 +- eigen/bench/tensors/README | 21 + eigen/bench/tensors/benchmark.h | 49 ++ 
eigen/bench/tensors/benchmark_main.cc | 237 ++++++ eigen/bench/tensors/contraction_benchmarks_cpu.cc | 39 + eigen/bench/tensors/tensor_benchmarks.h | 478 +++++++++++ eigen/bench/tensors/tensor_benchmarks_cpu.cc | 168 ++++ eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu | 77 ++ eigen/bench/tensors/tensor_benchmarks_gpu.cu | 75 ++ eigen/bench/tensors/tensor_benchmarks_sycl.cc | 20 + 84 files changed, 4691 insertions(+), 320 deletions(-) create mode 100644 eigen/bench/analyze-blocking-sizes.cpp create mode 100644 eigen/bench/benchmark-blocking-sizes.cpp create mode 100644 eigen/bench/btl/cmake/FindBLAZE.cmake delete mode 100644 eigen/bench/btl/cmake/FindGOTO.cmake delete mode 100644 eigen/bench/btl/cmake/FindGOTO2.cmake create mode 100644 eigen/bench/btl/cmake/FindOPENBLAS.cmake create mode 100644 eigen/bench/btl/libs/blaze/CMakeLists.txt create mode 100644 eigen/bench/btl/libs/blaze/blaze_interface.hh create mode 100644 eigen/bench/btl/libs/blaze/main.cpp create mode 100644 eigen/bench/btl/libs/tensors/CMakeLists.txt create mode 100644 eigen/bench/btl/libs/tensors/main_linear.cpp create mode 100644 eigen/bench/btl/libs/tensors/main_matmat.cpp create mode 100644 eigen/bench/btl/libs/tensors/main_vecmat.cpp create mode 100644 eigen/bench/btl/libs/tensors/tensor_interface.hh create mode 100644 eigen/bench/dense_solvers.cpp create mode 100644 eigen/bench/perf_monitoring/changesets.txt create mode 100644 eigen/bench/perf_monitoring/gemm.cpp create mode 100644 eigen/bench/perf_monitoring/gemm_common.h create mode 100644 eigen/bench/perf_monitoring/gemm_settings.txt create mode 100644 eigen/bench/perf_monitoring/gemm_square_settings.txt create mode 100644 eigen/bench/perf_monitoring/gemv.cpp create mode 100644 eigen/bench/perf_monitoring/gemv_common.h create mode 100644 eigen/bench/perf_monitoring/gemv_settings.txt create mode 100644 eigen/bench/perf_monitoring/gemv_square_settings.txt create mode 100644 eigen/bench/perf_monitoring/gemvt.cpp create mode 100644 
eigen/bench/perf_monitoring/lazy_gemm.cpp create mode 100644 eigen/bench/perf_monitoring/lazy_gemm_settings.txt create mode 100644 eigen/bench/perf_monitoring/llt.cpp create mode 100644 eigen/bench/perf_monitoring/make_plot.sh create mode 100644 eigen/bench/perf_monitoring/resources/chart_footer.html create mode 100644 eigen/bench/perf_monitoring/resources/chart_header.html create mode 100644 eigen/bench/perf_monitoring/resources/footer.html create mode 100644 eigen/bench/perf_monitoring/resources/header.html create mode 100644 eigen/bench/perf_monitoring/resources/s1.js create mode 100644 eigen/bench/perf_monitoring/resources/s2.js create mode 100644 eigen/bench/perf_monitoring/run.sh create mode 100644 eigen/bench/perf_monitoring/runall.sh create mode 100644 eigen/bench/perf_monitoring/trmv_lo.cpp create mode 100644 eigen/bench/perf_monitoring/trmv_lot.cpp create mode 100644 eigen/bench/perf_monitoring/trmv_up.cpp create mode 100644 eigen/bench/perf_monitoring/trmv_upt.cpp create mode 100644 eigen/bench/tensors/README create mode 100644 eigen/bench/tensors/benchmark.h create mode 100644 eigen/bench/tensors/benchmark_main.cc create mode 100644 eigen/bench/tensors/contraction_benchmarks_cpu.cc create mode 100644 eigen/bench/tensors/tensor_benchmarks.h create mode 100644 eigen/bench/tensors/tensor_benchmarks_cpu.cc create mode 100644 eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu create mode 100644 eigen/bench/tensors/tensor_benchmarks_gpu.cu create mode 100644 eigen/bench/tensors/tensor_benchmarks_sycl.cc (limited to 'eigen/bench') diff --git a/eigen/bench/BenchTimer.h b/eigen/bench/BenchTimer.h index 28e2bca..ea28496 100644 --- a/eigen/bench/BenchTimer.h +++ b/eigen/bench/BenchTimer.h @@ -22,12 +22,19 @@ # endif # include #elif defined(__APPLE__) -#include #include #else # include #endif +static void escape(void *p) { + asm volatile("" : : "g"(p) : "memory"); +} + +static void clobber() { + asm volatile("" : : : "memory"); +} + #include namespace Eigen @@ 
-168,6 +175,7 @@ public: CODE; \ } \ TIMER.stop(); \ + clobber(); \ } \ } diff --git a/eigen/bench/analyze-blocking-sizes.cpp b/eigen/bench/analyze-blocking-sizes.cpp new file mode 100644 index 0000000..d563a1d --- /dev/null +++ b/eigen/bench/analyze-blocking-sizes.cpp @@ -0,0 +1,876 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace std; + +const int default_precision = 4; + +// see --only-cubic-sizes +bool only_cubic_sizes = false; + +// see --dump-tables +bool dump_tables = false; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + uint16_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +ostream& operator<<(ostream& s, const size_triple_t& t) +{ + return s << "(" << t.k << ", " << t.m << ", " << t.n << ")"; +} + +struct inputfile_entry_t +{ + uint16_t product_size; + uint16_t pot_block_size; + size_triple_t nonpot_block_size; + float gflops; +}; + +struct inputfile_t +{ + enum class type_t { + unknown, + all_pot_sizes, + 
default_sizes + }; + + string filename; + vector entries; + type_t type; + + inputfile_t(const string& fname) + : filename(fname) + , type(type_t::unknown) + { + ifstream stream(filename); + if (!stream.is_open()) { + cerr << "couldn't open input file: " << filename << endl; + exit(1); + } + string line; + while (getline(stream, line)) { + if (line.empty()) continue; + if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::all_pot_sizes; + continue; + } + if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::default_sizes; + continue; + } + + + if (type == type_t::unknown) { + continue; + } + switch(type) { + case type_t::all_pot_sizes: { + unsigned int product_size, block_size; + float gflops; + int sscanf_result = + sscanf(line.c_str(), "%x %x %f", + &product_size, + &block_size, + &gflops); + if (3 != sscanf_result || + !product_size || + product_size > 0xfff || + !block_size || + block_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = uint16_t(block_size); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + case type_t::default_sizes: { + unsigned int product_size; + float gflops; + int bk, bm, bn; + int sscanf_result = + sscanf(line.c_str(), "%x default(%d, %d, %d) %f", + &product_size, + &bk, &bm, &bn, + &gflops); + if (5 != sscanf_result || + !product_size || + product_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: 
" << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = 0; + entry.nonpot_block_size = size_triple_t(bk, bm, bn); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + + default: + break; + } + } + stream.close(); + if (type == type_t::unknown) { + cerr << "Unrecognized input file " << filename << endl; + exit(1); + } + if (entries.empty()) { + cerr << "didn't find any measurements in input file: " << filename << endl; + exit(1); + } + } +}; + +struct preprocessed_inputfile_entry_t +{ + uint16_t product_size; + uint16_t block_size; + + float efficiency; +}; + +bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2) +{ + return e1.efficiency < e2.efficiency; +} + +struct preprocessed_inputfile_t +{ + string filename; + vector entries; + + preprocessed_inputfile_t(const inputfile_t& inputfile) + : filename(inputfile.filename) + { + if (inputfile.type != inputfile_t::type_t::all_pot_sizes) { + abort(); + } + auto it = inputfile.entries.begin(); + auto it_first_with_given_product_size = it; + while (it != inputfile.entries.end()) { + ++it; + if (it == inputfile.entries.end() || + it->product_size != it_first_with_given_product_size->product_size) + { + import_input_file_range_one_product_size(it_first_with_given_product_size, it); + it_first_with_given_product_size = it; + } + } + } + +private: + void import_input_file_range_one_product_size( + const vector::const_iterator& begin, + const vector::const_iterator& end) + { + uint16_t product_size = begin->product_size; + float max_gflops = 0.0f; + for (auto it = begin; it != end; ++it) { + if (it->product_size != product_size) { + cerr << "Unexpected ordering of entries in " << filename << endl; + cerr << "(Expected all entries for product 
size " << hex << product_size << dec << " to be grouped)" << endl; + exit(1); + } + max_gflops = max(max_gflops, it->gflops); + } + for (auto it = begin; it != end; ++it) { + preprocessed_inputfile_entry_t entry; + entry.product_size = it->product_size; + entry.block_size = it->pot_block_size; + entry.efficiency = it->gflops / max_gflops; + entries.push_back(entry); + } + } +}; + +void check_all_files_in_same_exact_order( + const vector& preprocessed_inputfiles) +{ + if (preprocessed_inputfiles.empty()) { + return; + } + + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0]; + const size_t num_entries = first_file.entries.size(); + + for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) { + if (preprocessed_inputfiles[i].entries.size() != num_entries) { + cerr << "these files have different number of entries: " + << preprocessed_inputfiles[i].filename + << " and " + << first_file.filename + << endl; + exit(1); + } + } + + for (size_t entry_index = 0; entry_index < num_entries; entry_index++) { + const uint16_t entry_product_size = first_file.entries[entry_index].product_size; + const uint16_t entry_block_size = first_file.entries[entry_index].block_size; + for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) { + const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index]; + if (cur_file.entries[entry_index].product_size != entry_product_size || + cur_file.entries[entry_index].block_size != entry_block_size) + { + cerr << "entries not in same order between these files: " + << first_file.filename + << " and " + << cur_file.filename + << endl; + exit(1); + } + } + } +} + +float efficiency_of_subset( + const vector& preprocessed_inputfiles, + const vector& subset) +{ + if (subset.size() <= 1) { + return 1.0f; + } + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + float efficiency = 1.0f; + size_t 
entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float efficiency_this_product_size = 0.0f; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry); + } + efficiency = min(efficiency, efficiency_this_product_size); + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + + return efficiency; +} + +void dump_table_for_subset( + const vector& preprocessed_inputfiles, + const vector& subset) +{ + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + size_t entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + size_t i = 0; + size_triple_t min_product_size(first_file.entries.front().product_size); + size_triple_t max_product_size(first_file.entries.back().product_size); + if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) { + abort(); + } + if (only_cubic_sizes) { + cerr << "Can't generate tables with --only-cubic-sizes." 
<< endl; + abort(); + } + cout << "struct LookupTable {" << endl; + cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl; + const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; + const size_t TableSize = NumSizes * NumSizes * NumSizes; + cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; + cout << " static const unsigned short* Data() {" << endl; + cout << " static const unsigned short data[" << TableSize << "] = {"; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float best_efficiency_this_product_size = 0.0f; + uint16_t best_block_size_this_product_size = 0; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + if (efficiency_this_entry > best_efficiency_this_product_size) { + best_efficiency_this_product_size = efficiency_this_entry; + best_block_size_this_product_size = first_file.entries[e].block_size; + } + } + if ((i++) % NumSizes) { + cout << " "; + } else { + cout << endl << " "; + } + cout << "0x" << hex << best_block_size_this_product_size << dec; + if (entry_index < num_entries) { + cout << ","; + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + if (i != TableSize) { + cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl; + abort(); + } + cout << endl << " };" << endl; + cout << " return data;" << endl; + cout << " }" << endl; + cout << "};" << endl; +} + +float efficiency_of_partition( + const vector& preprocessed_inputfiles, + const vector>& partition) +{ + float efficiency = 1.0f; + for (auto s = partition.begin(); s != 
partition.end(); ++s) { + efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s)); + } + return efficiency; +} + +void make_first_subset(size_t subset_size, vector& out_subset, size_t set_size) +{ + assert(subset_size >= 1 && subset_size <= set_size); + out_subset.resize(subset_size); + for (size_t i = 0; i < subset_size; i++) { + out_subset[i] = i; + } +} + +bool is_last_subset(const vector& subset, size_t set_size) +{ + return subset[0] == set_size - subset.size(); +} + +void next_subset(vector& inout_subset, size_t set_size) +{ + if (is_last_subset(inout_subset, set_size)) { + cerr << "iterating past the last subset" << endl; + abort(); + } + size_t i = 1; + while (inout_subset[inout_subset.size() - i] == set_size - i) { + i++; + assert(i <= inout_subset.size()); + } + size_t first_index_to_change = inout_subset.size() - i; + inout_subset[first_index_to_change]++; + size_t p = inout_subset[first_index_to_change]; + for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) { + inout_subset[j] = ++p; + } +} + +const size_t number_of_subsets_limit = 100; +const size_t always_search_subsets_of_size_at_least = 2; + +bool is_number_of_subsets_feasible(size_t n, size_t p) +{ + assert(n>0 && p>0 && p<=n); + uint64_t numerator = 1, denominator = 1; + for (size_t i = 0; i < p; i++) { + numerator *= n - i; + denominator *= i + 1; + if (numerator > denominator * number_of_subsets_limit) { + return false; + } + } + return true; +} + +size_t max_feasible_subset_size(size_t n) +{ + assert(n > 0); + const size_t minresult = min(n-1, always_search_subsets_of_size_at_least); + for (size_t p = 1; p <= n - 1; p++) { + if (!is_number_of_subsets_feasible(n, p+1)) { + return max(p, minresult); + } + } + return n - 1; +} + +void find_subset_with_efficiency_higher_than( + const vector& preprocessed_inputfiles, + float required_efficiency_to_beat, + vector& inout_remainder, + vector& out_subset) +{ + out_subset.resize(0); + + if 
(required_efficiency_to_beat >= 1.0f) { + cerr << "can't beat efficiency 1." << endl; + abort(); + } + + while (!inout_remainder.empty()) { + + vector candidate_indices(inout_remainder.size()); + for (size_t i = 0; i < candidate_indices.size(); i++) { + candidate_indices[i] = i; + } + + size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size()); + while (candidate_indices_subset_size >= 1) { + vector candidate_indices_subset; + make_first_subset(candidate_indices_subset_size, + candidate_indices_subset, + candidate_indices.size()); + + vector best_candidate_indices_subset; + float best_efficiency = 0.0f; + vector trial_subset = out_subset; + trial_subset.resize(out_subset.size() + candidate_indices_subset_size); + while (true) + { + for (size_t i = 0; i < candidate_indices_subset_size; i++) { + trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]]; + } + + float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset); + if (trial_efficiency > best_efficiency) { + best_efficiency = trial_efficiency; + best_candidate_indices_subset = candidate_indices_subset; + } + if (is_last_subset(candidate_indices_subset, candidate_indices.size())) { + break; + } + next_subset(candidate_indices_subset, candidate_indices.size()); + } + + if (best_efficiency > required_efficiency_to_beat) { + for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) { + candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]]; + } + candidate_indices.resize(best_candidate_indices_subset.size()); + } + candidate_indices_subset_size--; + } + + size_t candidate_index = candidate_indices[0]; + auto candidate_iterator = inout_remainder.begin() + candidate_index; + vector trial_subset = out_subset; + + trial_subset.push_back(*candidate_iterator); + float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset); + if (trial_efficiency > required_efficiency_to_beat) { + 
out_subset.push_back(*candidate_iterator); + inout_remainder.erase(candidate_iterator); + } else { + break; + } + } +} + +void find_partition_with_efficiency_higher_than( + const vector& preprocessed_inputfiles, + float required_efficiency_to_beat, + vector>& out_partition) +{ + out_partition.resize(0); + + vector remainder; + for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) { + remainder.push_back(i); + } + + while (!remainder.empty()) { + vector new_subset; + find_subset_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + remainder, + new_subset); + out_partition.push_back(new_subset); + } +} + +void print_partition( + const vector& preprocessed_inputfiles, + const vector>& partition) +{ + float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency" << endl; + for (auto subset = partition.begin(); subset != partition.end(); ++subset) { + cout << " Subset " << (subset - partition.begin()) + << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:" + << endl; + for (auto file = subset->begin(); file != subset->end(); ++file) { + cout << " " << preprocessed_inputfiles[*file].filename << endl; + } + if (dump_tables) { + cout << " Table:" << endl; + dump_table_for_subset(preprocessed_inputfiles, *subset); + } + } + cout << endl; +} + +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run(const vector&) const { abort(); } + virtual ~action_t() {} +}; + +struct partition_action_t : action_t +{ + virtual const char* invokation_name() const override { return "partition"; } + virtual void run(const vector& input_filenames) const override + { + vector preprocessed_inputfiles; + + if (input_filenames.empty()) { + cerr << "The " << invokation_name() << " action needs a list of input files." 
<< endl; + exit(1); + } + + for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) { + inputfile_t inputfile(*it); + switch (inputfile.type) { + case inputfile_t::type_t::all_pot_sizes: + preprocessed_inputfiles.emplace_back(inputfile); + break; + case inputfile_t::type_t::default_sizes: + cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and " + << "has no use for " << *it << " which contains measurements for default sizes." << endl; + exit(1); + break; + default: + cerr << "Unrecognized input file: " << *it << endl; + exit(1); + } + } + + check_all_files_in_same_exact_order(preprocessed_inputfiles); + + float required_efficiency_to_beat = 0.0f; + vector>> partitions; + cerr << "searching for partitions...\r" << flush; + while (true) + { + vector> partition; + find_partition_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + partition); + float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() + << " subsets for " << 100.0f * actual_efficiency + << " % efficiency" + << " \r" << flush; + partitions.push_back(partition); + if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { + break; + } + required_efficiency_to_beat = actual_efficiency; + } + cerr << " " << endl; + while (true) { + bool repeat = false; + for (size_t i = 0; i < partitions.size() - 1; i++) { + if (partitions[i].size() >= partitions[i+1].size()) { + partitions.erase(partitions.begin() + i); + repeat = true; + break; + } + } + if (!repeat) { + break; + } + } + for (auto it = partitions.begin(); it != partitions.end(); ++it) { + print_partition(preprocessed_inputfiles, *it); + } + } +}; + +struct evaluate_defaults_action_t : action_t +{ + struct results_entry_t { + uint16_t product_size; + size_triple_t default_block_size; + uint16_t 
best_pot_block_size; + float default_gflops; + float best_pot_gflops; + float default_efficiency; + }; + friend ostream& operator<<(ostream& s, const results_entry_t& entry) + { + return s + << "Product size " << size_triple_t(entry.product_size) + << ": default block size " << entry.default_block_size + << " -> " << entry.default_gflops + << " GFlop/s = " << entry.default_efficiency * 100.0f << " %" + << " of best POT block size " << size_triple_t(entry.best_pot_block_size) + << " -> " << entry.best_pot_gflops + << " GFlop/s" << dec; + } + static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { + return e1.default_efficiency < e2.default_efficiency; + } + virtual const char* invokation_name() const override { return "evaluate-defaults"; } + void show_usage_and_exit() const + { + cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; + cerr << "checks how well the performance with default sizes compares to the best " + << "performance measured over all POT sizes." << endl; + exit(1); + } + virtual void run(const vector& input_filenames) const override + { + if (input_filenames.size() != 2) { + show_usage_and_exit(); + } + inputfile_t inputfile_default_sizes(input_filenames[0]); + inputfile_t inputfile_all_pot_sizes(input_filenames[1]); + if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { + cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; + show_usage_and_exit(); + } + if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) { + cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." 
<< endl; + show_usage_and_exit(); + } + vector results; + vector cubic_results; + + uint16_t product_size = 0; + auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin(); + for (auto it_default_sizes = inputfile_default_sizes.entries.begin(); + it_default_sizes != inputfile_default_sizes.entries.end(); + ++it_default_sizes) + { + if (it_default_sizes->product_size == product_size) { + continue; + } + product_size = it_default_sizes->product_size; + while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() && + it_all_pot_sizes->product_size != product_size) + { + ++it_all_pot_sizes; + } + if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) { + break; + } + uint16_t best_pot_block_size = 0; + float best_pot_gflops = 0; + for (auto it = it_all_pot_sizes; + it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size; + ++it) + { + if (it->gflops > best_pot_gflops) { + best_pot_gflops = it->gflops; + best_pot_block_size = it->pot_block_size; + } + } + results_entry_t entry; + entry.product_size = product_size; + entry.default_block_size = it_default_sizes->nonpot_block_size; + entry.best_pot_block_size = best_pot_block_size; + entry.default_gflops = it_default_sizes->gflops; + entry.best_pot_gflops = best_pot_gflops; + entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops; + results.push_back(entry); + + size_triple_t t(product_size); + if (t.k == t.m && t.m == t.n) { + cubic_results.push_back(entry); + } + } + + cout << "All results:" << endl; + for (auto it = results.begin(); it != results.end(); ++it) { + cout << *it << endl; + } + cout << endl; + + sort(results.begin(), results.end(), lower_efficiency); + + const size_t n = min(20, results.size()); + cout << n << " worst results:" << endl; + for (size_t i = 0; i < n; i++) { + cout << results[i] << endl; + } + cout << endl; + + cout << "cubic results:" << endl; + for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { + cout << *it << 
endl; + } + cout << endl; + + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + + cout.precision(2); + vector a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; + for (auto it = a.begin(); it != a.end(); ++it) { + size_t n = min(results.size() - 1, size_t(*it * results.size())); + cout << (100.0f * n / (results.size() - 1)) + << " % of product sizes have default efficiency <= " + << 100.0f * results[n].default_efficiency << " %" << endl; + } + cout.precision(default_precision); + } +}; + + +void show_usage_and_exit(int argc, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " [options...] " << endl; + cerr << "available actions:" << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; + exit(1); +} + +int main(int argc, char* argv[]) +{ + cout.precision(default_precision); + cerr.precision(default_precision); + + vector> available_actions; + available_actions.emplace_back(new partition_action_t); + available_actions.emplace_back(new evaluate_defaults_action_t); + + vector input_filenames; + + action_t* action = nullptr; + + if (argc < 2) { + show_usage_and_exit(argc, argv, available_actions); + } + for (int i = 1; i < argc; i++) { + bool arg_handled = false; + // Step 1. Try to match action invokation names. + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[i], (*it)->invokation_name())) { + if (!action) { + action = it->get(); + arg_handled = true; + break; + } else { + cerr << "can't specify more than one action!" << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + } + if (arg_handled) { + continue; + } + // Step 2. Try to match option names. 
+ if (argv[i][0] == '-') { + if (!strcmp(argv[i], "--only-cubic-sizes")) { + only_cubic_sizes = true; + arg_handled = true; + } + if (!strcmp(argv[i], "--dump-tables")) { + dump_tables = true; + arg_handled = true; + } + if (!arg_handled) { + cerr << "Unrecognized option: " << argv[i] << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + if (arg_handled) { + continue; + } + // Step 3. Default to interpreting args as input filenames. + input_filenames.emplace_back(argv[i]); + } + + if (dump_tables && only_cubic_sizes) { + cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl; + show_usage_and_exit(argc, argv, available_actions); + } + + if (!action) { + show_usage_and_exit(argc, argv, available_actions); + } + + action->run(input_filenames); +} diff --git a/eigen/bench/benchCholesky.cpp b/eigen/bench/benchCholesky.cpp index 42b3e12..9a8e7cf 100644 --- a/eigen/bench/benchCholesky.cpp +++ b/eigen/bench/benchCholesky.cpp @@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m) int rows = m.rows(); int cols = m.cols(); - int cost = 0; + double cost = 0; for (int j=0; j0; ++i) + for (int i=0; dynsizes[i]>0; ++i) benchLLT(Matrix(dynsizes[i],dynsizes[i])); benchLLT(Matrix()); diff --git a/eigen/bench/bench_gemm.cpp b/eigen/bench/bench_gemm.cpp index 41ca8b3..8528c55 100644 --- a/eigen/bench/bench_gemm.cpp +++ b/eigen/bench/bench_gemm.cpp @@ -2,6 +2,14 @@ // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out +// Compilation options: +// +// -DSCALAR=std::complex +// -DSCALARA=double or -DSCALARB=double +// -DHAVE_BLAS +// -DDECOUPLED +// + #include #include #include @@ -14,10 +22,18 @@ using namespace Eigen; #define SCALAR float #endif +#ifndef SCALARA +#define SCALARA SCALAR +#endif + +#ifndef SCALARB +#define SCALARB SCALAR +#endif + typedef SCALAR Scalar; typedef NumTraits::Real RealScalar; -typedef Matrix A; -typedef Matrix B; +typedef Matrix A; +typedef Matrix B; typedef Matrix C; typedef Matrix M; @@ -129,35 +145,69 @@ int main(int argc, char ** argv) int tries = 2; // number of tries, we keep the best int s = 2048; - int cache_size = -1; + int m = s; + int n = s; + int p = s; + int cache_size1=-1, cache_size2=l2, cache_size3 = 0; bool need_help = false; - for (int i=1; i c t p\n"; + std::cout << argv[0] << " -s -c -t -p \n"; + std::cout << " : size\n"; + std::cout << " : rows columns depth\n"; return 1; } - if(cache_size>0) - setCpuCacheSizes(cache_size,96*cache_size); - - int m = s; - int n = s; - int p = s; +#if EIGEN_VERSION_AT_LEAST(3,2,90) + if(cache_size1>0) + setCpuCacheSizes(cache_size1,cache_size2,cache_size3); +#endif + A a(m,p); a.setRandom(); B b(p,n); b.setRandom(); C c(m,n); c.setOnes(); @@ -172,6 +222,7 @@ int main(int argc, char ** argv) // check the parallel product is correct #if defined EIGEN_HAS_OPENMP + Eigen::initParallel(); int procs = omp_get_max_threads(); if(procs>1) { @@ -188,11 +239,20 @@ int main(int argc, char ** argv) #elif defined HAVE_BLAS blas_gemm(a,b,r); c.noalias() += a * b; - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } #else - gemm(a,b,c); - r.noalias() += a.cast() * b.cast(); - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(1.*m*n*p<2000.*2000*2000) + { + gemm(a,b,c); + r.noalias() += a.cast() .lazyProduct( 
b.cast() ); + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } + } #endif #ifdef HAVE_BLAS @@ -214,7 +274,7 @@ int main(int argc, char ** argv) { BenchTimer tmono; omp_set_num_threads(1); - Eigen::internal::setNbThreads(1); + Eigen::setNbThreads(1); c = rc; BENCH(tmono, tries, rep, gemm(a,b,c)); std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; @@ -223,6 +283,15 @@ int main(int argc, char ** argv) } #endif + if(1.*m*n*p<30*30*30) + { + BenchTimer tmt; + c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + } + #ifdef DECOUPLED if((NumTraits::IsComplex) && (NumTraits::IsComplex)) { diff --git a/eigen/bench/bench_norm.cpp b/eigen/bench/bench_norm.cpp index 806db29..129afcf 100644 --- a/eigen/bench/bench_norm.cpp +++ b/eigen/bench/bench_norm.cpp @@ -6,19 +6,25 @@ using namespace Eigen; using namespace std; template -EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v) { return v.norm(); } template -EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v) +{ + return v.stableNorm(); +} + +template +EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v) { return v.hypotNorm(); } template -EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v) { return v.blueNorm(); } @@ -32,25 +38,25 @@ EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v) Scalar ssq = 1; for (int i=0;i= ax) { - 
ssq += internal::abs2(ax/scale); + ssq += numext::abs2(ax/scale); } else { - ssq = Scalar(1) + ssq * internal::abs2(scale/ax); + ssq = Scalar(1) + ssq * numext::abs2(scale/ax); scale = ax; } } - return scale * internal::sqrt(ssq); + return scale * std::sqrt(ssq); } template EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v) { typedef typename T::Scalar Scalar; - Scalar s = v.cwise().abs().maxCoeff(); + Scalar s = v.array().abs().maxCoeff(); return s*(v/s).norm(); } @@ -73,16 +79,20 @@ EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v) v(i) = v(2*i) + v(2*i+1); n = n/2; } - return internal::sqrt(v(0)); + return std::sqrt(v(0)); } +namespace Eigen { +namespace internal { #ifdef EIGEN_VECTORIZE -Packet4f internal::plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); } -Packet2d internal::plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); } +Packet4f plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); } +Packet2d plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); } -Packet4f internal::pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); } -Packet2d internal::pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); } +Packet4f pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); } +Packet2d pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); } #endif +} +} template EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) @@ -126,7 +136,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) overfl = rbig*s2m; // overfow boundary for abig eps = std::pow(ibeta, 1-it); - relerr = internal::sqrt(eps); // tolerance for neglecting asml + relerr = std::sqrt(eps); // tolerance for neglecting asml abig = 1.0/eps - 1.0; if (Scalar(nbig)>abig) nmax = abig; // largest safe n else nmax = nbig; @@ -134,13 +144,13 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) typedef typename internal::packet_traits::type Packet; const int ps = 
internal::packet_traits::size; - Packet pasml = internal::pset1(Scalar(0)); - Packet pamed = internal::pset1(Scalar(0)); - Packet pabig = internal::pset1(Scalar(0)); - Packet ps2m = internal::pset1(s2m); - Packet ps1m = internal::pset1(s1m); - Packet pb2 = internal::pset1(b2); - Packet pb1 = internal::pset1(b1); + Packet pasml = internal::pset1(Scalar(0)); + Packet pamed = internal::pset1(Scalar(0)); + Packet pabig = internal::pset1(Scalar(0)); + Packet ps2m = internal::pset1(s2m); + Packet ps1m = internal::pset1(s1m); + Packet pb2 = internal::pset1(b2); + Packet pb1 = internal::pset1(b1); for(int j=0; j(j)); @@ -170,7 +180,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) Scalar amed = internal::predux(pamed); if(abig > Scalar(0)) { - abig = internal::sqrt(abig); + abig = std::sqrt(abig); if(abig > overfl) { eigen_assert(false && "overflow"); @@ -179,7 +189,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) if(amed > Scalar(0)) { abig = abig/s2m; - amed = internal::sqrt(amed); + amed = std::sqrt(amed); } else { @@ -191,55 +201,56 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) { if (amed > Scalar(0)) { - abig = internal::sqrt(amed); - amed = internal::sqrt(asml) / s1m; + abig = std::sqrt(amed); + amed = std::sqrt(asml) / s1m; } else { - return internal::sqrt(asml)/s1m; + return std::sqrt(asml)/s1m; } } else { - return internal::sqrt(amed); + return std::sqrt(amed); } asml = std::min(abig, amed); abig = std::max(abig, amed); if(asml <= abig*relerr) return abig; else - return abig * internal::sqrt(Scalar(1) + internal::abs2(asml/abig)); + return abig * std::sqrt(Scalar(1) + numext::abs2(asml/abig)); #endif } #define BENCH_PERF(NRM) { \ + float af = 0; double ad = 0; std::complex ac = 0; \ Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\ for (int k=0; k()); - double yd = based * internal::abs(internal::random()); + double yf = basef * std::abs(internal::random()); + double yd = based * 
std::abs(internal::random()); VectorXf vf = VectorXf::Ones(s) * yf; VectorXd vd = VectorXd::Ones(s) * yd; - std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n"; + std::cout << "reference\t" << std::sqrt(double(s))*yf << "\t" << std::sqrt(double(s))*yd << "\n"; std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n"; std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n"; std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n"; @@ -255,8 +266,8 @@ void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s) VectorXd vd(s); for (int i=0; i()) * std::pow(double(10), internal::random(ef0,ef1)); - vd[i] = internal::abs(internal::random()) * std::pow(double(10), internal::random(ed0,ed1)); + vf[i] = std::abs(internal::random()) * std::pow(double(10), internal::random(ef0,ef1)); + vd[i] = std::abs(internal::random()) * std::pow(double(10), internal::random(ed0,ed1)); } //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n"; @@ -312,34 +323,38 @@ int main(int argc, char** argv) std::cout << "\n"; } + y = 1; std::cout.precision(4); - std::cerr << "Performance (out of cache):\n"; + int s1 = 1024*1024*32; + std::cerr << "Performance (out of cache, " << s1 << "):\n"; { int iters = 1; - VectorXf vf = VectorXf::Random(1024*1024*32) * y; - VectorXd vd = VectorXd::Random(1024*1024*32) * y; - VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y; + VectorXf vf = VectorXf::Random(s1) * y; + VectorXd vd = VectorXd::Random(s1) * y; + VectorXcf vcf = VectorXcf::Random(s1) * y; BENCH_PERF(sqsumNorm); + BENCH_PERF(stableNorm); BENCH_PERF(blueNorm); -// BENCH_PERF(pblueNorm); -// BENCH_PERF(lapackNorm); -// BENCH_PERF(hypotNorm); -// BENCH_PERF(twopassNorm); + BENCH_PERF(pblueNorm); + BENCH_PERF(lapackNorm); + BENCH_PERF(hypotNorm); + BENCH_PERF(twopassNorm); BENCH_PERF(bl2passNorm); } - std::cerr << "\nPerformance 
(in cache):\n"; + std::cerr << "\nPerformance (in cache, " << 512 << "):\n"; { int iters = 100000; VectorXf vf = VectorXf::Random(512) * y; VectorXd vd = VectorXd::Random(512) * y; VectorXcf vcf = VectorXcf::Random(512) * y; BENCH_PERF(sqsumNorm); + BENCH_PERF(stableNorm); BENCH_PERF(blueNorm); -// BENCH_PERF(pblueNorm); -// BENCH_PERF(lapackNorm); -// BENCH_PERF(hypotNorm); -// BENCH_PERF(twopassNorm); + BENCH_PERF(pblueNorm); + BENCH_PERF(lapackNorm); + BENCH_PERF(hypotNorm); + BENCH_PERF(twopassNorm); BENCH_PERF(bl2passNorm); } } diff --git a/eigen/bench/benchmark-blocking-sizes.cpp b/eigen/bench/benchmark-blocking-sizes.cpp new file mode 100644 index 0000000..827be28 --- /dev/null +++ b/eigen/bench/benchmark-blocking-sizes.cpp @@ -0,0 +1,677 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include +#include +#include +#include +#include + +bool eigen_use_specific_block_size; +int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n +#include + +#include + +using namespace Eigen; +using namespace std; + +static BenchTimer timer; + +// how many times we repeat each measurement. +// measurements are randomly shuffled - we're not doing +// all N identical measurements in a row. +const int measurement_repetitions = 3; + +// Timings below this value are too short to be accurate, +// we'll repeat measurements with more iterations until +// we get a timing above that threshold. 
+const float min_accurate_time = 1e-2f; + +// See --min-working-set-size command line parameter. +size_t min_working_set_size = 0; + +float max_clock_speed = 0.0f; + +// range of sizes that we will benchmark (in all 3 K,M,N dimensions) +const size_t maxsize = 2048; +const size_t minsize = 16; + +typedef MatrixXf MatrixType; +typedef MatrixType::Scalar Scalar; +typedef internal::packet_traits::type Packet; + +static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two"); +static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two"); +static_assert(maxsize > minsize, "maxsize must be larger than minsize"); +static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)"); + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + size_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } +}; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +// Convert between size tripes and a compact form fitting in 12 bits +// where each size, which must be a POT, is encoded as its log2, on 4 bits +// so the largest representable size is 2^15 == 32k ... big enough. +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +uint16_t compact_size_triple(const size_triple_t& t) +{ + return compact_size_triple(t.k, t.m, t.n); +} + +// A single benchmark. Initially only contains benchmark params. +// Then call run(), which stores the result in the gflops field. 
+struct benchmark_t +{ + uint16_t compact_product_size; + uint16_t compact_block_size; + bool use_default_block_size; + float gflops; + benchmark_t() + : compact_product_size(0) + , compact_block_size(0) + , use_default_block_size(false) + , gflops(0) + { + } + benchmark_t(size_t pk, size_t pm, size_t pn, + size_t bk, size_t bm, size_t bn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(compact_size_triple(bk, bm, bn)) + , use_default_block_size(false) + , gflops(0) + {} + benchmark_t(size_t pk, size_t pm, size_t pn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(0) + , use_default_block_size(true) + , gflops(0) + {} + + void run(); +}; + +ostream& operator<<(ostream& s, const benchmark_t& b) +{ + s << hex << b.compact_product_size << dec; + if (b.use_default_block_size) { + size_triple_t t(b.compact_product_size); + Index k = t.k, m = t.m, n = t.n; + internal::computeProductBlockingSizes(k, m, n); + s << " default(" << k << ", " << m << ", " << n << ")"; + } else { + s << " " << hex << b.compact_block_size << dec; + } + s << " " << b.gflops; + return s; +} + +// We sort first by increasing benchmark parameters, +// then by decreasing performance. 
+bool operator<(const benchmark_t& b1, const benchmark_t& b2) +{ + return b1.compact_product_size < b2.compact_product_size || + (b1.compact_product_size == b2.compact_product_size && ( + (b1.compact_block_size < b2.compact_block_size || ( + b1.compact_block_size == b2.compact_block_size && + b1.gflops > b2.gflops)))); +} + +void benchmark_t::run() +{ + size_triple_t productsizes(compact_product_size); + + if (use_default_block_size) { + eigen_use_specific_block_size = false; + } else { + // feed eigen with our custom blocking params + eigen_use_specific_block_size = true; + size_triple_t blocksizes(compact_block_size); + eigen_block_size_k = blocksizes.k; + eigen_block_size_m = blocksizes.m; + eigen_block_size_n = blocksizes.n; + } + + // set up the matrix pool + + const size_t combined_three_matrices_sizes = + sizeof(Scalar) * + (productsizes.k * productsizes.m + + productsizes.k * productsizes.n + + productsizes.m * productsizes.n); + + // 64 M is large enough that nobody has a cache bigger than that, + // while still being small enough that everybody has this much RAM, + // so conveniently we don't need to special-case platforms here. + const size_t unlikely_large_cache_size = 64 << 20; + + const size_t working_set_size = + min_working_set_size ? 
min_working_set_size : unlikely_large_cache_size; + + const size_t matrix_pool_size = + 1 + working_set_size / combined_three_matrices_sizes; + + MatrixType *lhs = new MatrixType[matrix_pool_size]; + MatrixType *rhs = new MatrixType[matrix_pool_size]; + MatrixType *dst = new MatrixType[matrix_pool_size]; + + for (size_t i = 0; i < matrix_pool_size; i++) { + lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k); + rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n); + dst[i] = MatrixType::Zero(productsizes.m, productsizes.n); + } + + // main benchmark loop + + int iters_at_a_time = 1; + float time_per_iter = 0.0f; + size_t matrix_index = 0; + while (true) { + + double starttime = timer.getCpuTime(); + for (int i = 0; i < iters_at_a_time; i++) { + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; + matrix_index++; + if (matrix_index == matrix_pool_size) { + matrix_index = 0; + } + } + double endtime = timer.getCpuTime(); + + const float timing = float(endtime - starttime); + + if (timing >= min_accurate_time) { + time_per_iter = timing / iters_at_a_time; + break; + } + + iters_at_a_time *= 2; + } + + delete[] lhs; + delete[] rhs; + delete[] dst; + + gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter; +} + +void print_cpuinfo() +{ +#ifdef __linux__ + cout << "contents of /proc/cpuinfo:" << endl; + string line; + ifstream cpuinfo("/proc/cpuinfo"); + if (cpuinfo.is_open()) { + while (getline(cpuinfo, line)) { + cout << line << endl; + } + cpuinfo.close(); + } + cout << endl; +#elif defined __APPLE__ + cout << "output of sysctl hw:" << endl; + system("sysctl hw"); + cout << endl; +#endif +} + +template +string type_name() +{ + return "unknown"; +} + +template<> +string type_name() +{ + return "float"; +} + +template<> +string type_name() +{ + return "double"; +} + +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run() const { abort(); } + virtual 
~action_t() {} +}; + +void show_usage_and_exit(int /*argc*/, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " [options...]" << endl << endl; + cerr << "available actions:" << endl << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << endl; + cerr << "options:" << endl << endl; + cerr << " --min-working-set-size=N:" << endl; + cerr << " Set the minimum working set size to N bytes." << endl; + cerr << " This is rounded up as needed to a multiple of matrix size." << endl; + cerr << " A larger working set lowers the chance of a warm cache." << endl; + cerr << " The default value 0 means use a large enough working" << endl; + cerr << " set to likely outsize caches." << endl; + cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl; + cerr << " avoid warm caches." << endl; + exit(1); +} + +float measure_clock_speed() +{ + cerr << "Measuring clock speed... \r" << flush; + + vector all_gflops; + for (int i = 0; i < 8; i++) { + benchmark_t b(1024, 1024, 1024); + b.run(); + all_gflops.push_back(b.gflops); + } + + sort(all_gflops.begin(), all_gflops.end()); + float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5]; + + // multiply by an arbitrary constant to discourage trying doing anything with the + // returned values besides just comparing them with each other. 
+ float result = stable_estimate * 123.456f; + + return result; +} + +struct human_duration_t +{ + int seconds; + human_duration_t(int s) : seconds(s) {} +}; + +ostream& operator<<(ostream& s, const human_duration_t& d) +{ + int remainder = d.seconds; + if (remainder > 3600) { + int hours = remainder / 3600; + s << hours << " h "; + remainder -= hours * 3600; + } + if (remainder > 60) { + int minutes = remainder / 60; + s << minutes << " min "; + remainder -= minutes * 60; + } + if (d.seconds < 600) { + s << remainder << " s"; + } + return s; +} + +const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"; + +void serialize_benchmarks(const char* filename, const vector& benchmarks, size_t first_benchmark_to_run) +{ + FILE* file = fopen(filename, "w"); + if (!file) { + cerr << "Could not open file " << filename << " for writing." << endl; + cerr << "Do you have write permissions on the current working directory?" << endl; + exit(1); + } + size_t benchmarks_vector_size = benchmarks.size(); + fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file); + fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file); + fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file); + fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file); + fclose(file); +} + +bool deserialize_benchmarks(const char* filename, vector& benchmarks, size_t& first_benchmark_to_run) +{ + FILE* file = fopen(filename, "r"); + if (!file) { + return false; + } + if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) { + return false; + } + size_t benchmarks_vector_size = 0; + if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) { + return false; + } + if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) { + return false; + } + benchmarks.resize(benchmarks_vector_size); + if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), 
file)) { + return false; + } + unlink(filename); + return true; +} + +void try_run_some_benchmarks( + vector& benchmarks, + double time_start, + size_t& first_benchmark_to_run) +{ + if (first_benchmark_to_run == benchmarks.size()) { + return; + } + + double time_last_progress_update = 0; + double time_last_clock_speed_measurement = 0; + double time_now = 0; + + size_t benchmark_index = first_benchmark_to_run; + + while (true) { + float ratio_done = float(benchmark_index) / benchmarks.size(); + time_now = timer.getRealTime(); + + // We check clock speed every minute and at the end. + if (benchmark_index == benchmarks.size() || + time_now > time_last_clock_speed_measurement + 60.0f) + { + time_last_clock_speed_measurement = time_now; + + // Ensure that clock speed is as expected + float current_clock_speed = measure_clock_speed(); + + // The tolerance needs to be smaller than the relative difference between + // clock speeds that a device could operate under. + // It seems unlikely that a device would be throttling clock speeds by + // amounts smaller than 2%. + // With a value of 1%, I was getting within noise on a Sandy Bridge. + const float clock_speed_tolerance = 0.02f; + + if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { + // Clock speed is now higher than we previously measured. + // Either our initial measurement was inaccurate, which won't happen + // too many times as we are keeping the best clock speed value and + // and allowing some tolerance; or something really weird happened, + // which invalidates all benchmark results collected so far. + // Either way, we better restart all over again now. + if (benchmark_index) { + cerr << "Restarting at " << 100.0f * ratio_done + << " % because clock speed increased. 
" << endl; + } + max_clock_speed = current_clock_speed; + first_benchmark_to_run = 0; + return; + } + + bool rerun_last_tests = false; + + if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { + cerr << "Measurements completed so far: " + << 100.0f * ratio_done + << " % " << endl; + cerr << "Clock speed seems to be only " + << current_clock_speed/max_clock_speed + << " times what it used to be." << endl; + + unsigned int seconds_to_sleep_if_lower_clock_speed = 1; + + while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { + if (seconds_to_sleep_if_lower_clock_speed > 32) { + cerr << "Sleeping longer probably won't make a difference." << endl; + cerr << "Serializing benchmarks to " << session_filename << endl; + serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); + cerr << "Now restart this benchmark, and it should pick up where we left." << endl; + exit(2); + } + rerun_last_tests = true; + cerr << "Sleeping " + << seconds_to_sleep_if_lower_clock_speed + << " s... \r" << endl; + sleep(seconds_to_sleep_if_lower_clock_speed); + current_clock_speed = measure_clock_speed(); + seconds_to_sleep_if_lower_clock_speed *= 2; + } + } + + if (rerun_last_tests) { + cerr << "Redoing the last " + << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size() + << " % because clock speed had been low. " << endl; + return; + } + + // nothing wrong with the clock speed so far, so there won't be a need to rerun + // benchmarks run so far in case we later encounter a lower clock speed. + first_benchmark_to_run = benchmark_index; + } + + if (benchmark_index == benchmarks.size()) { + // We're done! + first_benchmark_to_run = benchmarks.size(); + // Erase progress info + cerr << " " << endl; + return; + } + + // Display progress info on stderr + if (time_now > time_last_progress_update + 1.0f) { + time_last_progress_update = time_now; + cerr << "Measurements... 
" << 100.0f * ratio_done + << " %, ETA " + << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done) + << " \r" << flush; + } + + // This is where we actually run a benchmark! + benchmarks[benchmark_index].run(); + benchmark_index++; + } +} + +void run_benchmarks(vector& benchmarks) +{ + size_t first_benchmark_to_run; + vector deserialized_benchmarks; + bool use_deserialized_benchmarks = false; + if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) { + cerr << "Found serialized session with " + << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size() + << " % already done" << endl; + if (deserialized_benchmarks.size() == benchmarks.size() && + first_benchmark_to_run > 0 && + first_benchmark_to_run < benchmarks.size()) + { + use_deserialized_benchmarks = true; + } + } + + if (use_deserialized_benchmarks) { + benchmarks = deserialized_benchmarks; + } else { + // not using deserialized benchmarks, starting from scratch + first_benchmark_to_run = 0; + + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + } + + for (int i = 0; i < 4; i++) { + max_clock_speed = max(max_clock_speed, measure_clock_speed()); + } + + double time_start = 0.0; + while (first_benchmark_to_run < benchmarks.size()) { + if (first_benchmark_to_run == 0) { + time_start = timer.getRealTime(); + } + try_run_some_benchmarks(benchmarks, + time_start, + first_benchmark_to_run); + } + + // Sort timings by increasing benchmark parameters, and decreasing gflops. + // The latter is very important. It means that we can ignore all but the first + // benchmark with given parameters. 
+ sort(benchmarks.begin(), benchmarks.end()); + + // Collect best (i.e. now first) results for each parameter values. + vector best_benchmarks; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + if (best_benchmarks.empty() || + best_benchmarks.back().compact_product_size != it->compact_product_size || + best_benchmarks.back().compact_block_size != it->compact_block_size) + { + best_benchmarks.push_back(*it); + } + } + + // keep and return only the best benchmarks + benchmarks = best_benchmarks; +} + +struct measure_all_pot_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "all-pot-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { + for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { + for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { + benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock); + } + } + } + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +struct measure_default_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "default-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + benchmarks.emplace_back(ksize, msize, nsize); + } + } + } + } + + 
run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +int main(int argc, char* argv[]) +{ + double time_start = timer.getRealTime(); + cout.precision(4); + cerr.precision(4); + + vector> available_actions; + available_actions.emplace_back(new measure_all_pot_sizes_action_t); + available_actions.emplace_back(new measure_default_sizes_action_t); + + auto action = available_actions.end(); + + if (argc <= 1) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + + for (int i = 2; i < argc; i++) { + if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { + const char* equals_sign = strchr(argv[i], '='); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + } else { + cerr << "unrecognized option: " << argv[i] << endl << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + + print_cpuinfo(); + + cout << "benchmark parameters:" << endl; + cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; + cout << "scalar type: " << type_name() << endl; + cout << "packet size: " << internal::packet_traits::size << endl; + cout << "minsize = " << minsize << endl; + cout << "maxsize = " << maxsize << endl; + cout << "measurement_repetitions = " << measurement_repetitions << endl; + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { + cout << " (try to outsize caches)"; + } + cout << endl << endl; + + (*action)->run(); + + double time_end = timer.getRealTime(); + cerr << "Finished in " << human_duration_t(time_end - time_start) << endl; 
+} diff --git a/eigen/bench/btl/CMakeLists.txt b/eigen/bench/btl/CMakeLists.txt index 119b470..38ff9f4 100644 --- a/eigen/bench/btl/CMakeLists.txt +++ b/eigen/bench/btl/CMakeLists.txt @@ -11,29 +11,24 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON) string(REGEX MATCH icpc IS_ICPC ${CMAKE_CXX_COMPILER}) IF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC) - SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG") - SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG") - IF(NOT BTL_NOVEC) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") - SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -msse2") - ELSE(NOT BTL_NOVEC) + SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_CXX_FLAGS}") + SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_Fortran_FLAGS}") + IF(BTL_NOVEC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE") - ENDIF(NOT BTL_NOVEC) + ENDIF(BTL_NOVEC) ENDIF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC) IF(MSVC) SET(CMAKE_CXX_FLAGS " /O2 /Ot /GL /fp:fast -DNDEBUG") # SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG") - IF(NOT BTL_NOVEC) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2") - ELSE(NOT BTL_NOVEC) + IF(BTL_NOVEC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE") - ENDIF(NOT BTL_NOVEC) + ENDIF(BTL_NOVEC) ENDIF(MSVC) if(IS_ICPC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fast") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fast") + set(CMAKE_CXX_FLAGS "-fast ${CMAKE_CXX_FLAGS}") + set(CMAKE_Fortran_FLAGS "-fast ${CMAKE_Fortran_FLAGS}") endif(IS_ICPC) include_directories( @@ -48,6 +43,12 @@ include_directories( # set(DEFAULT_LIBRARIES ${MKL_LIBRARIES}) # endif (MKL_FOUND) +find_library(EIGEN_BTL_RT_LIBRARY rt) +# if we cannot find it easily, then we don't need it! 
+if(NOT EIGEN_BTL_RT_LIBRARY) + set(EIGEN_BTL_RT_LIBRARY "") +endif() + MACRO(BTL_ADD_BENCH targetname) foreach(_current_var ${ARGN}) @@ -70,7 +71,7 @@ MACRO(BTL_ADD_BENCH targetname) IF(BUILD_${targetname}) ADD_EXECUTABLE(${targetname} ${_sources}) ADD_TEST(${targetname} "${targetname}") - target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} rt) + target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} ${EIGEN_BTL_RT_LIBRARY}) ENDIF(BUILD_${targetname}) ENDMACRO(BTL_ADD_BENCH) @@ -91,6 +92,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) @@ -98,6 +100,7 @@ add_subdirectory(libs/mtl4) add_subdirectory(libs/blitz) add_subdirectory(libs/tvmet) add_subdirectory(libs/STL) +add_subdirectory(libs/blaze) add_subdirectory(data) diff --git a/eigen/bench/btl/actions/action_axpby.hh b/eigen/bench/btl/actions/action_axpby.hh index 98511ab..dadd0cc 100644 --- a/eigen/bench/btl/actions/action_axpby.hh +++ b/eigen/bench/btl/actions/action_axpby.hh @@ -33,7 +33,7 @@ class Action_axpby { public : // Ctor - Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95) + Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size) { MESSAGE("Action_axpby Ctor"); diff --git a/eigen/bench/btl/actions/action_axpy.hh b/eigen/bench/btl/actions/action_axpy.hh index e4cb3a5..261be4c 100644 --- a/eigen/bench/btl/actions/action_axpy.hh +++ b/eigen/bench/btl/actions/action_axpy.hh @@ -35,7 +35,7 @@ public : // Ctor - Action_axpy( int size ):_size(size),_coef(1.0) + Action_axpy( int size ):_coef(1.0),_size(size) { MESSAGE("Action_axpy Ctor"); diff --git a/eigen/bench/btl/actions/basic_actions.hh b/eigen/bench/btl/actions/basic_actions.hh index a3333ea..62442f0 100644 --- a/eigen/bench/btl/actions/basic_actions.hh +++ b/eigen/bench/btl/actions/basic_actions.hh @@ -6,7 +6,7 @@ #include "action_atv_product.hh" #include 
"action_matrix_matrix_product.hh" -// #include "action_ata_product.hh" +#include "action_ata_product.hh" #include "action_aat_product.hh" #include "action_trisolve.hh" diff --git a/eigen/bench/btl/cmake/FindACML.cmake b/eigen/bench/btl/cmake/FindACML.cmake index f45ae1b..4989fa2 100644 --- a/eigen/bench/btl/cmake/FindACML.cmake +++ b/eigen/bench/btl/cmake/FindACML.cmake @@ -17,6 +17,7 @@ find_file(ACML_LIBRARIES libacml_mp.so PATHS /usr/lib + /usr/lib64 $ENV{ACMLDIR}/lib ${LIB_INSTALL_DIR} ) @@ -35,6 +36,7 @@ if(NOT ACML_LIBRARIES) libacml.so libacml_mv.so PATHS /usr/lib + /usr/lib64 $ENV{ACMLDIR}/lib ${LIB_INSTALL_DIR} ) diff --git a/eigen/bench/btl/cmake/FindATLAS.cmake b/eigen/bench/btl/cmake/FindATLAS.cmake index 6b90652..4136a98 100644 --- a/eigen/bench/btl/cmake/FindATLAS.cmake +++ b/eigen/bench/btl/cmake/FindATLAS.cmake @@ -3,33 +3,25 @@ if (ATLAS_LIBRARIES) set(ATLAS_FIND_QUIETLY TRUE) endif (ATLAS_LIBRARIES) -find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_LAPACK NAMES liblapack_atlas.so.3 liblapack.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LAPACK NAMES lapack_atlas lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) - -if(NOT ATLAS_LAPACK) - find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) - 
find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -endif(NOT ATLAS_LAPACK) - -find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) - set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB}) + set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB}) # search the default lapack lib link to it find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64) find_library(ATLAS_REFERENCE_LAPACK NAMES lapack) - if(ATLAS_REFERENCE_LAPACK) - set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) - endif() +# if(ATLAS_REFERENCE_LAPACK) +# set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) +# endif() endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) diff --git a/eigen/bench/btl/cmake/FindBLAZE.cmake b/eigen/bench/btl/cmake/FindBLAZE.cmake new file mode 100644 index 0000000..dba4c89 --- /dev/null +++ b/eigen/bench/btl/cmake/FindBLAZE.cmake @@ -0,0 +1,31 @@ +# - Try to find eigen2 headers +# Once done this will define +# +# BLAZE_FOUND - system has blaze lib +# BLAZE_INCLUDE_DIR - the blaze include directory +# +# Copyright (C) 2008 Gael Guennebaud +# Adapted from FindEigen.cmake: +# Copyright (c) 2006, 2007 Montel Laurent, +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+ +if (BLAZE_INCLUDE_DIR) + + # in cache already + set(BLAZE_FOUND TRUE) + +else (BLAZE_INCLUDE_DIR) + +find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h + PATHS + ${INCLUDE_INSTALL_DIR} + ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR) + +mark_as_advanced(BLAZE_INCLUDE_DIR) + +endif(BLAZE_INCLUDE_DIR) + diff --git a/eigen/bench/btl/cmake/FindCBLAS.cmake b/eigen/bench/btl/cmake/FindCBLAS.cmake index 554f029..ce0f2f2 100644 --- a/eigen/bench/btl/cmake/FindCBLAS.cmake +++ b/eigen/bench/btl/cmake/FindCBLAS.cmake @@ -23,6 +23,7 @@ find_file(CBLAS_LIBRARIES libcblas.so.3 PATHS /usr/lib + /usr/lib64 $ENV{CBLASDIR}/lib ${LIB_INSTALL_DIR} ) diff --git a/eigen/bench/btl/cmake/FindGOTO.cmake b/eigen/bench/btl/cmake/FindGOTO.cmake deleted file mode 100644 index 67ea093..0000000 --- a/eigen/bench/btl/cmake/FindGOTO.cmake +++ /dev/null @@ -1,15 +0,0 @@ - -if (GOTO_LIBRARIES) - set(GOTO_FIND_QUIETLY TRUE) -endif (GOTO_LIBRARIES) - -find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR}) - -if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES) - -mark_as_advanced(GOTO_LIBRARIES) diff --git a/eigen/bench/btl/cmake/FindGOTO2.cmake b/eigen/bench/btl/cmake/FindGOTO2.cmake deleted file mode 100644 index baa68d2..0000000 --- a/eigen/bench/btl/cmake/FindGOTO2.cmake +++ /dev/null @@ -1,25 +0,0 @@ - -if (GOTO2_LIBRARIES) - set(GOTO2_FIND_QUIETLY TRUE) -endif (GOTO2_LIBRARIES) -# -# find_path(GOTO_INCLUDES -# NAMES -# cblas.h -# PATHS -# $ENV{GOTODIR}/include -# ${INCLUDE_INSTALL_DIR} -# ) - -find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) -find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) - -if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO2_LIBRARIES 
${GOTO2_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO2 DEFAULT_MSG - GOTO2_LIBRARIES) - -mark_as_advanced(GOTO2_LIBRARIES) diff --git a/eigen/bench/btl/cmake/FindOPENBLAS.cmake b/eigen/bench/btl/cmake/FindOPENBLAS.cmake new file mode 100644 index 0000000..2a09194 --- /dev/null +++ b/eigen/bench/btl/cmake/FindOPENBLAS.cmake @@ -0,0 +1,17 @@ + +if (OPENBLAS_LIBRARIES) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif (OPENBLAS_LIBRARIES) + +find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) +find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) + +if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) + set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG + OPENBLAS_LIBRARIES) + +mark_as_advanced(OPENBLAS_LIBRARIES) diff --git a/eigen/bench/btl/data/action_settings.txt b/eigen/bench/btl/data/action_settings.txt index e32213e..39d2b5d 100644 --- a/eigen/bench/btl/data/action_settings.txt +++ b/eigen/bench/btl/data/action_settings.txt @@ -1,19 +1,19 @@ -aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000 -ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000 -atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000 +aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000 +ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000 +atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000 -matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000 -matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000 -trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000 -trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) 
X)}" ; "size" ; 4:3000 -trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000 -cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000 -complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 -partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000 -tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000 -hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000 -symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000 -syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000 -ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000 -rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000 \ No newline at end of file +matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000 +matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000 +trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000 +trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000 +trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000 +cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000 +complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000 +partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000 +tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000 +hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000 +symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000 +syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000 +ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000 +rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000 diff --git a/eigen/bench/btl/data/perlib_plot_settings.txt 
b/eigen/bench/btl/data/perlib_plot_settings.txt index 6844bab..f023cfe 100644 --- a/eigen/bench/btl/data/perlib_plot_settings.txt +++ b/eigen/bench/btl/data/perlib_plot_settings.txt @@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff" mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847" blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c" -GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600" -GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600" +OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600" C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96" ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c" +blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" diff --git a/eigen/bench/btl/generic_bench/bench.hh b/eigen/bench/btl/generic_bench/bench.hh index 005c363..7b7b951 100644 --- a/eigen/bench/btl/generic_bench/bench.hh +++ b/eigen/bench/btl/generic_bench/bench.hh @@ -102,8 +102,8 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ) // merge the two data std::vector newSizes; std::vector newFlops; - int i=0; - int j=0; + unsigned int i=0; + unsigned int j=0; while (i config = BtlString(_config).split(" \t\n"); - for (int i = 0; i BTL_DONT_INLINE void init_matrix(Vector & A, int size){ A.resize(size); - for (int row=0; row(A[row],size,row); } } @@ -50,11 +50,11 @@ BTL_DONT_INLINE void init_matrix(Vector & A, int size){ template BTL_DONT_INLINE void init_matrix_symm(Matrix& A, int size){ A.resize(size); - for (int row=0; row @@ -87,6 +87,48 @@ }; // Portable_Timer +#elif defined(__APPLE__) +#include +#include + + +class Portable_Timer +{ + public: + + Portable_Timer() + { + } + + void start() + { + m_start_time = double(mach_absolute_time())*1e-9;; + + } + + void stop() + { + m_stop_time = double(mach_absolute_time())*1e-9;; + + } + + double elapsed() + { + return user_time(); + } + + double user_time() + { + return m_stop_time - m_start_time; + } + + +private: + + double m_stop_time, m_start_time; + 
+}; // Portable_Timer (Apple) + #else #include @@ -138,7 +180,7 @@ private: int m_clkid; double m_stop_time, m_start_time; -}; // Portable_Timer +}; // Portable_Timer (Linux) #endif diff --git a/eigen/bench/btl/generic_bench/utils/size_lin_log.hh b/eigen/bench/btl/generic_bench/utils/size_lin_log.hh index bca3932..bbc9f54 100644 --- a/eigen/bench/btl/generic_bench/utils/size_lin_log.hh +++ b/eigen/bench/btl/generic_bench/utils/size_lin_log.hh @@ -23,7 +23,7 @@ #include "size_log.hh" template -void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X) +void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X) { int ten=10; int nine=9; diff --git a/eigen/bench/btl/libs/BLAS/CMakeLists.txt b/eigen/bench/btl/libs/BLAS/CMakeLists.txt index de42fe0..0272cca 100644 --- a/eigen/bench/btl/libs/BLAS/CMakeLists.txt +++ b/eigen/bench/btl/libs/BLAS/CMakeLists.txt @@ -18,27 +18,14 @@ if (MKL_FOUND) endif (MKL_FOUND) -find_package(GOTO2) -if (GOTO2_FOUND) - btl_add_bench(btl_goto2 main.cpp) - if(BUILD_btl_goto2) - target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2") - endif(BUILD_btl_goto2) -endif (GOTO2_FOUND) - -find_package(GOTO) -if (GOTO_FOUND) - if(GOTO2_FOUND) - btl_add_bench(btl_goto main.cpp OFF) - else() - btl_add_bench(btl_goto main.cpp) - endif() - if(BUILD_btl_goto) - target_link_libraries(btl_goto ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO") - endif(BUILD_btl_goto) -endif (GOTO_FOUND) +find_package(OPENBLAS) +if (OPENBLAS_FOUND) + btl_add_bench(btl_openblas main.cpp) + if(BUILD_btl_openblas) + target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} ) + set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS") + endif(BUILD_btl_openblas) +endif (OPENBLAS_FOUND) find_package(ACML) if (ACML_FOUND) diff --git 
a/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh b/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh index 0e84df0..9e0a649 100644 --- a/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh +++ b/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh @@ -46,9 +46,9 @@ public : BLAS_FUNC(gemm)(¬rans,¬rans,&N,&N,&N,&fone,A,&N,B,&N,&fzero,X,&N); } -// static inline void ata_product(gene_matrix & A, gene_matrix & X, int N){ -// ssyrk_(&lower,&trans,&N,&N,&fone,A,&N,&fzero,X,&N); -// } + static inline void ata_product(gene_matrix & A, gene_matrix & X, int N){ + BLAS_FUNC(syrk)(&lower,&trans,&N,&N,&fone,A,&N,&fzero,X,&N); + } static inline void aat_product(gene_matrix & A, gene_matrix & X, int N){ BLAS_FUNC(syrk)(&lower,¬rans,&N,&N,&fone,A,&N,&fzero,X,&N); @@ -75,7 +75,6 @@ public : static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info); @@ -92,7 +91,7 @@ public : BLAS_FUNC(trsm)(&right, &lower, ¬rans, &nonunit, &N, &N, &fone, L, &N, X, &N); } - static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){ + static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){ BLAS_FUNC(trmm)(&left, &lower, ¬rans,&nonunit, &N,&N,&fone,A,&N,B,&N); } @@ -101,7 +100,6 @@ public : static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); int * jpiv = (int*)alloca(sizeof(int)*N); @@ -134,8 +132,6 @@ public : } char uplo = 'U'; int info = 0; - int ilo = 1; - int ihi = N; int bsize = 64; int worksize = N*bsize; SCALAR* d = new SCALAR[3*N+worksize]; diff --git a/eigen/bench/btl/libs/BLAS/c_interface_base.h b/eigen/bench/btl/libs/BLAS/c_interface_base.h index 515d8dc..de61380 100644 --- 
a/eigen/bench/btl/libs/BLAS/c_interface_base.h +++ b/eigen/bench/btl/libs/BLAS/c_interface_base.h @@ -17,12 +17,12 @@ public: typedef real* gene_matrix; typedef real* gene_vector; - static void free_matrix(gene_matrix & A, int N){ - delete A; + static void free_matrix(gene_matrix & A, int /*N*/){ + delete[] A; } static void free_vector(gene_vector & B){ - delete B; + delete[] B; } static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ diff --git a/eigen/bench/btl/libs/BLAS/main.cpp b/eigen/bench/btl/libs/BLAS/main.cpp index 8347c9f..fd99149 100644 --- a/eigen/bench/btl/libs/BLAS/main.cpp +++ b/eigen/bench/btl/libs/BLAS/main.cpp @@ -48,7 +48,7 @@ int main() bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); bench > >(MIN_MM,MAX_MM,NB_POINT); -// bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); bench > >(MIN_MM,MAX_MM,NB_POINT); bench > >(MIN_MM,MAX_MM,NB_POINT); @@ -56,13 +56,13 @@ int main() bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #ifdef HAS_LAPACK - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #endif //bench > >(MIN_LU,MAX_LU,NB_POINT); diff --git a/eigen/bench/btl/libs/STL/STL_interface.hh b/eigen/bench/btl/libs/STL/STL_interface.hh index 93e76bd..16658c4 100644 --- a/eigen/bench/btl/libs/STL/STL_interface.hh +++ b/eigen/bench/btl/libs/STL/STL_interface.hh @@ -44,9 +44,9 @@ public : return "STL"; } - static void free_matrix(gene_matrix & A, int N){} + static void free_matrix(gene_matrix & /*A*/, int /*N*/){} - static void free_vector(gene_vector & B){} + static void free_vector(gene_vector & /*B*/){} static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A = A_stl; @@ -78,18 +78,18 @@ public : 
cible[i][j]=source[i][j]; } -// static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N) -// { -// real somme; -// for (int j=0;j +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +// +#ifndef BLAZE_INTERFACE_HH +#define BLAZE_INTERFACE_HH + +#include +#include +#include +// using namespace blaze; + +#include + +template +class blaze_interface { + +public : + + typedef real real_type ; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef blaze::DynamicMatrix gene_matrix; + typedef blaze::DynamicVector gene_vector; + + static inline std::string name() { return "blaze"; } + + static void free_matrix(gene_matrix & A, int N){ + return ; + } + + static void free_vector(gene_vector & B){ + return ; + } + + static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(A_stl[0].size(), A_stl.size()); + + for (int j=0; j ipvt(N); +// lu_factor(R, ipvt); +// } + +// static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){ +// X = lower_trisolve(L, B); +// } + + static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + cible = source; + } + + static inline void copy_vector(const gene_vector & source, 
gene_vector & cible, int N){ + cible = source; + } + +}; + +#endif diff --git a/eigen/bench/btl/libs/blaze/main.cpp b/eigen/bench/btl/libs/blaze/main.cpp new file mode 100644 index 0000000..ccae0cb --- /dev/null +++ b/eigen/bench/btl/libs/blaze/main.cpp @@ -0,0 +1,40 @@ +//===================================================== +// Copyright (C) 2008 Gael Guennebaud +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +#include "utilities.h" +#include "blaze_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + bench > >(MIN_MV,MAX_MV,NB_POINT); + bench > >(MIN_MV,MAX_MV,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} + + diff --git a/eigen/bench/btl/libs/eigen2/eigen2_interface.hh b/eigen/bench/btl/libs/eigen2/eigen2_interface.hh index 47fe581..1deabda 100644 --- a/eigen/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/eigen/bench/btl/libs/eigen2/eigen2_interface.hh @@ -47,7 +47,7 @@ public : { #if defined(EIGEN_VECTORIZE_SSE) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; - #elif defined(EIGEN_VECTORIZE_ALTIVEC) + #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; #else if (SIZE==Dynamic) return "eigen2_novec"; else return "tiny_eigen2_novec"; diff --git a/eigen/bench/btl/libs/eigen3/eigen3_interface.hh b/eigen/bench/btl/libs/eigen3/eigen3_interface.hh index 31bcc1f..2e302d0 100644 --- a/eigen/bench/btl/libs/eigen3/eigen3_interface.hh +++ b/eigen/bench/btl/libs/eigen3/eigen3_interface.hh @@ -45,15 +45,15 @@ public : return EIGEN_MAKESTRING(BTL_PREFIX); } - static void free_matrix(gene_matrix & A, int N) {} + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} - static void free_vector(gene_vector & B) {} + static void free_vector(gene_vector & /*B*/) {} static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A.resize(A_stl[0].size(), A_stl.size()); - for (int j=0; j().setZero(); + X.template selfadjointView().rankUpdate(A.transpose()); + } - static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ + static inline void aat_product(const gene_matrix & A, gene_matrix & X, int /*N*/){ X.template 
triangularView().setZero(); X.template selfadjointView().rankUpdate(A); } - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = A*B; } - static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.template selfadjointView() * B); // internal::product_selfadjoint_vector(N,A.data(),N, B.data(), 1, X.data(), 1); } @@ -155,54 +157,54 @@ public : } } - static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ + static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ // internal::product_selfadjoint_rank2_update(N,A.data(),N, X.data(), 1, Y.data(), 1, -1); for(int j=0; j(c,s)); } - static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ + static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.transpose()*B); } - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ Y += coef * X; } - static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){ + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ Y = a*X + b*Y; } - static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ cible = source; } - static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){ + static EIGEN_DONT_INLINE void copy_vector(const 
gene_vector & source, gene_vector & cible, int /*N*/){ cible = source; } - static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){ + static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){ X = L.template triangularView().solve(B); } - static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X = L.template triangularView().solve(B); } - static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X.noalias() = L.template triangularView() * B; } - static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){ + static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X; internal::llt_inplace::blocked(C); //C = X.llt().matrixL(); @@ -211,11 +213,11 @@ public : // Cholesky::computeInPlaceBlock(C); } - static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X.fullPivLu().matrixLU(); } - static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ Matrix piv(N); DenseIndex nb; C = X; @@ -223,13 +225,13 @@ public : // C = X.partialPivLu().matrixLU(); } - static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ + static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ typename Tridiagonalization::CoeffVectorType aux(N-1); C = X; internal::tridiagonalization_inplace(C, aux); } - static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){ + static inline 
void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = HessenbergDecomposition(X).packedMatrix(); } diff --git a/eigen/bench/btl/libs/eigen3/main_adv.cpp b/eigen/bench/btl/libs/eigen3/main_adv.cpp index efe5857..9586535 100644 --- a/eigen/bench/btl/libs/eigen3/main_adv.cpp +++ b/eigen/bench/btl/libs/eigen3/main_adv.cpp @@ -29,14 +29,14 @@ BTL_MAIN; int main() { - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); return 0; } diff --git a/eigen/bench/btl/libs/eigen3/main_matmat.cpp b/eigen/bench/btl/libs/eigen3/main_matmat.cpp index 926fa2b..052810a 100644 --- a/eigen/bench/btl/libs/eigen3/main_matmat.cpp +++ b/eigen/bench/btl/libs/eigen3/main_matmat.cpp @@ -25,7 +25,7 @@ BTL_MAIN; int main() { bench > >(MIN_MM,MAX_MM,NB_POINT); -// bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); bench > >(MIN_MM,MAX_MM,NB_POINT); bench > >(MIN_MM,MAX_MM,NB_POINT); diff --git a/eigen/bench/btl/libs/tensors/CMakeLists.txt b/eigen/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 0000000..09d6d8e --- /dev/null +++ b/eigen/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat 
main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/eigen/bench/btl/libs/tensors/main_linear.cpp b/eigen/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 0000000..e257f1e --- /dev/null +++ b/eigen/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file 
is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/eigen/bench/btl/libs/tensors/main_matmat.cpp b/eigen/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 0000000..675fcfc --- /dev/null +++ b/eigen/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/eigen/bench/btl/libs/tensors/main_vecmat.cpp b/eigen/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 0000000..1af00c8 --- /dev/null +++ b/eigen/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/eigen/bench/btl/libs/tensors/tensor_interface.hh b/eigen/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 0000000..97b8e0f --- /dev/null +++ b/eigen/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include +#include +#include "btl.hh" + +using namespace Eigen; + +template +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor::Index Index; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef Eigen::Tensor gene_matrix; + typedef Eigen::Tensor gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + 
X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ + cible = source; + } +}; + +#endif diff --git a/eigen/bench/dense_solvers.cpp b/eigen/bench/dense_solvers.cpp new file mode 100644 index 0000000..24343dc --- /dev/null +++ b/eigen/bench/dense_solvers.cpp @@ -0,0 +1,186 @@ +#include +#include "BenchTimer.h" +#include +#include +#include +#include +#include +using namespace Eigen; + +std::map > results; +std::vector labels; +std::vector sizes; + +template +EIGEN_DONT_INLINE +void compute_norm_equation(Solver &solver, const MatrixType &A) { + if(A.rows()!=A.cols()) + solver.compute(A.transpose()*A); + else + solver.compute(A); +} + +template +EIGEN_DONT_INLINE +void compute(Solver &solver, const MatrixType &A) { + solver.compute(A); +} + +template +void bench(int id, int rows, int size = Size) +{ + typedef Matrix Mat; + typedef Matrix MatDyn; + typedef Matrix MatSquare; + Mat A(rows,size); + A.setRandom(); + if(rows==size) + A = A*A.adjoint(); + BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd; + + int svd_opt = ComputeThinU|ComputeThinV; + + int tries = 5; + int rep = 1000/size; + if(rep==0) rep = 1; +// rep = rep*rep; + + LLT llt(size); + LDLT ldlt(size); + PartialPivLU lu(size); + 
FullPivLU fplu(size,size); + HouseholderQR qr(A.rows(),A.cols()); + ColPivHouseholderQR cpqr(A.rows(),A.cols()); + CompleteOrthogonalDecomposition cod(A.rows(),A.cols()); + FullPivHouseholderQR fpqr(A.rows(),A.cols()); + JacobiSVD jsvd(A.rows(),A.cols()); + BDCSVD bdcsvd(A.rows(),A.cols()); + + BENCH(t_llt, tries, rep, compute_norm_equation(llt,A)); + BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A)); + BENCH(t_lu, tries, rep, compute_norm_equation(lu,A)); + if(size<=1000) + BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A)); + BENCH(t_qr, tries, rep, compute(qr,A)); + BENCH(t_cpqr, tries, rep, compute(cpqr,A)); + BENCH(t_cod, tries, rep, compute(cod,A)); + if(size*rows<=10000000) + BENCH(t_fpqr, tries, rep, compute(fpqr,A)); + if(size<500) // JacobiSVD is really too slow for too large matrices + BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt)); +// if(size*rows<=20000000) + BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt)); + + results["LLT"][id] = t_llt.best(); + results["LDLT"][id] = t_ldlt.best(); + results["PartialPivLU"][id] = t_lu.best(); + results["FullPivLU"][id] = t_fplu.best(); + results["HouseholderQR"][id] = t_qr.best(); + results["ColPivHouseholderQR"][id] = t_cpqr.best(); + results["CompleteOrthogonalDecomposition"][id] = t_cod.best(); + results["FullPivHouseholderQR"][id] = t_fpqr.best(); + results["JacobiSVD"][id] = t_jsvd.best(); + results["BDCSVD"][id] = t_bdcsvd.best(); +} + + +int main() +{ + labels.push_back("LLT"); + labels.push_back("LDLT"); + labels.push_back("PartialPivLU"); + labels.push_back("FullPivLU"); + labels.push_back("HouseholderQR"); + labels.push_back("ColPivHouseholderQR"); + labels.push_back("CompleteOrthogonalDecomposition"); + labels.push_back("FullPivHouseholderQR"); + labels.push_back("JacobiSVD"); + labels.push_back("BDCSVD"); + + for(int i=0; i(k,sizes[k](0),sizes[k](1)); + } + + cout.width(32); + cout << "solver/size"; + cout << " "; + for(int k=0; k=1e6) cout << "-"; + else cout << r(k); + cout << " 
"; + } + cout << endl; + } + + // HTML output + cout << "" << endl; + cout << "" << endl; + for(int k=0; k" << sizes[k](0) << "x" << sizes[k](1) << ""; + cout << "" << endl; + for(int i=0; i"; + ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f; + for(int k=0; k=1e6) cout << ""; + else + { + cout << ""; + } + } + cout << "" << endl; + } + cout << "
solver/size
" << labels[i] << "-" << r(k); + if(i>0) + cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")"; + if(i<4 && sizes[k](0)!=sizes[k](1)) + cout << " *"; + cout << "
" << endl; + +// cout << "LLT (ms) " << (results["LLT"]*1000.).format(fmt) << "\n"; +// cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; +// cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; +// cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; +// cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n"; +// cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; +// cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n"; +} diff --git a/eigen/bench/eig33.cpp b/eigen/bench/eig33.cpp index 1608b99..47947a9 100644 --- a/eigen/bench/eig33.cpp +++ b/eigen/bench/eig33.cpp @@ -50,7 +50,7 @@ inline void computeRoots(const Matrix& m, Roots& roots) { typedef typename Matrix::Scalar Scalar; const Scalar s_inv3 = 1.0/3.0; - const Scalar s_sqrt3 = internal::sqrt(Scalar(3.0)); + const Scalar s_sqrt3 = std::sqrt(Scalar(3.0)); // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The // eigenvalues are the roots to this equation, all guaranteed to be @@ -73,23 +73,13 @@ inline void computeRoots(const Matrix& m, Roots& roots) q = Scalar(0); // Compute the eigenvalues by solving for the roots of the polynomial. 
- Scalar rho = internal::sqrt(-a_over_3); - Scalar theta = std::atan2(internal::sqrt(-q),half_b)*s_inv3; - Scalar cos_theta = internal::cos(theta); - Scalar sin_theta = internal::sin(theta); - roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta; - roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); - roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); - - // Sort in increasing order. - if (roots(0) >= roots(1)) - std::swap(roots(0),roots(1)); - if (roots(1) >= roots(2)) - { - std::swap(roots(1),roots(2)); - if (roots(0) >= roots(1)) - std::swap(roots(0),roots(1)); - } + Scalar rho = std::sqrt(-a_over_3); + Scalar theta = std::atan2(std::sqrt(-q),half_b)*s_inv3; + Scalar cos_theta = std::cos(theta); + Scalar sin_theta = std::sin(theta); + roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta; + roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); + roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); } template @@ -99,9 +89,12 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) // Scale the matrix so its entries are in [-1,1]. The scaling is applied // only when at least one matrix entry has magnitude larger than 1. - Scalar scale = mat.cwiseAbs()/*.template triangularView()*/.maxCoeff(); + Scalar shift = mat.trace()/3; + Matrix scaledMat = mat; + scaledMat.diagonal().array() -= shift; + Scalar scale = scaledMat.cwiseAbs()/*.template triangularView()*/.maxCoeff(); scale = std::max(scale,Scalar(1)); - Matrix scaledMat = mat / scale; + scaledMat/=scale; // Compute the eigenvalues // scaledMat.setZero(); @@ -166,6 +159,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) // Rescale back to the original size. 
evals *= scale; + evals.array()+=shift; } int main() @@ -173,24 +167,29 @@ int main() BenchTimer t; int tries = 10; int rep = 400000; - typedef Matrix3f Mat; - typedef Vector3f Vec; + typedef Matrix3d Mat; + typedef Vector3d Vec; Mat A = Mat::Random(3,3); A = A.adjoint() * A; +// Mat Q = A.householderQr().householderQ(); +// A = Q * Vec(2.2424567,2.2424566,7.454353).asDiagonal() * Q.transpose(); SelfAdjointEigenSolver eig(A); BENCH(t, tries, rep, eig.compute(A)); - std::cout << "Eigen: " << t.best() << "s\n"; + std::cout << "Eigen iterative: " << t.best() << "s\n"; + + BENCH(t, tries, rep, eig.computeDirect(A)); + std::cout << "Eigen direct : " << t.best() << "s\n"; Mat evecs; Vec evals; BENCH(t, tries, rep, eigen33(A,evecs,evals)); std::cout << "Direct: " << t.best() << "s\n\n"; - std::cerr << "Eigenvalue/eigenvector diffs:\n"; - std::cerr << (evals - eig.eigenvalues()).transpose() << "\n"; - for(int k=0;k<3;++k) - if(evecs.col(k).dot(eig.eigenvectors().col(k))<0) - evecs.col(k) = -evecs.col(k); - std::cerr << evecs - eig.eigenvectors() << "\n\n"; +// std::cerr << "Eigenvalue/eigenvector diffs:\n"; +// std::cerr << (evals - eig.eigenvalues()).transpose() << "\n"; +// for(int k=0;k<3;++k) +// if(evecs.col(k).dot(eig.eigenvectors().col(k))<0) +// evecs.col(k) = -evecs.col(k); +// std::cerr << evecs - eig.eigenvectors() << "\n\n"; } diff --git a/eigen/bench/perf_monitoring/changesets.txt b/eigen/bench/perf_monitoring/changesets.txt new file mode 100644 index 0000000..960699c --- /dev/null +++ b/eigen/bench/perf_monitoring/changesets.txt @@ -0,0 +1,71 @@ +#3.0.1 +#3.1.1 +#3.2.0 +3.2.4 +#5745:37f59e65eb6c +5891:d8652709345d # introduce AVX +#5893:24b4dc92c6d3 # merge +5895:997c2ef9fc8b # introduce FMA +#5904:e1eafd14eaa1 # complex and AVX +5908:f8ee3c721251 # improve packing with ptranspose +#5921:ca808bb456b0 # merge +#5927:8b1001f9e3ac +5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +#5949:f3488f4e45b2 # merge 
+#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec +#5992:4a429f5e0483 # merge +before-evaluators +#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products +#6639:c9121c60b5c7 +#6655:06f163b5221f # Properly detect FMA support on ARM +#6677:700e023044e7 # FMA has been wrongly disabled +#6681:11d31dafb0e3 +#6699:5e6e8e10aad1 # merge default to tensors +#6726:ff2d2388e7b9 # merge default to tensors +#6742:0cbd6195e829 # merge default to tensors +#6747:853d2bafeb8f # Generalized the gebp apis +6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation +6781:9cc5a931b2c6 # generalized gemv +6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +#6844:039efd86b75c # merge tensor +6845:7333ed40c6ef # change prefetching in gebp +#6856:b5be5e10eb7f # merge index conversion +6893:c3a64aba7c70 # clean blocking size computation +6899:877facace746 # rotating kernel for ARM only +#6904:c250623ae9fa # result_of +6921:915f1b1fc158 # fix prefetching change for ARM +6923:9ff25f6dacc6 # prefetching +6933:52572e60b5d3 # blocking size strategy +6937:c8c042f286b2 # avoid redundant pack_rhs +6981:7e5d6f78da59 # dynamic loop swapping +6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
+7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) +7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables +7016:a58d253e8c91 # Polish lookup tables generation +7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment +7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. +7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth. +7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code +7591:09a8e2186610 # 3.3-alpha1 +7650:b0f3c8f43025 # help clang inlining +7708:dfc6ab9d9458 # Improve numerical accuracy in LLT and triangular solve by using true scalar divisions (instead of x * (1/y)) +#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) +8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes +8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path +8985:d935df21a082 # Remove the rotating kernel. +8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. 
+9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators +9174:d228bc282ac9 # merge +9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 +9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 +9361:69d418c06999 # 3.3-beta2 +9583:bef509908b9d # 3.3-rc1 +9792:26667be4f70b # 3.3.0 +9942:b1d3eba60130 # Operators += and -= do not resize! +9943:79bb9887afd4 # Ease compiler job to generate clean and efficient code in mat*vec +9946:2213991340ea # Complete rewrite of column-major-matrix * vector product to deliver higher performance of modern CPU. +9955:630471c3298c # Improve performance of row-major-dense-matrix * vector products for recent CPUs. (this is the next changeset fixing a typo) +9975:2eeed9de710c # Revert vec/y to vec*(1/y) in row-major TRSM + + diff --git a/eigen/bench/perf_monitoring/gemm.cpp b/eigen/bench/perf_monitoring/gemm.cpp new file mode 100644 index 0000000..804139d --- /dev/null +++ b/eigen/bench/perf_monitoring/gemm.cpp @@ -0,0 +1,12 @@ +#include "gemm_common.h" + +EIGEN_DONT_INLINE +void gemm(const Mat &A, const Mat &B, Mat &C) +{ + C.noalias() += A * B; +} + +int main(int argc, char **argv) +{ + return main_gemm(argc, argv, gemm); +} diff --git a/eigen/bench/perf_monitoring/gemm_common.h b/eigen/bench/perf_monitoring/gemm_common.h new file mode 100644 index 0000000..30dbc0d --- /dev/null +++ b/eigen/bench/perf_monitoring/gemm_common.h @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include "eigen_src/Eigen/Core" +#include "../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +typedef Matrix Mat; + +template +EIGEN_DONT_INLINE +double bench(long m, long n, long k, const Func& f) +{ + Mat A(m,k); + Mat B(k,n); + Mat C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e8*4/sizeof(Scalar); + double tm0 = 4, tm1 = 10; + 
if(NumTraits::IsComplex) + { + up /= 4; + tm0 = 2; + tm1 = 4; + } + + double flops = 2. * m * n * k; + long rep = std::max(1., std::min(100., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, f(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +int main_gemm(int argc, char **argv, const Func& f) +{ + std::vector results; + + std::string filename = std::string("gemm_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n, k; + while(settings >> m >> n >> k) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench(m, n, k, f) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/eigen/bench/perf_monitoring/gemm_settings.txt b/eigen/bench/perf_monitoring/gemm_settings.txt new file mode 100644 index 0000000..5c43e1c --- /dev/null +++ b/eigen/bench/perf_monitoring/gemm_settings.txt @@ -0,0 +1,15 @@ +8 8 8 +9 9 9 +24 24 24 +239 239 239 +240 240 240 +2400 24 24 +24 2400 24 +24 24 2400 +24 2400 2400 +2400 24 2400 +2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 +2400 2400 2400 diff --git a/eigen/bench/perf_monitoring/gemm_square_settings.txt b/eigen/bench/perf_monitoring/gemm_square_settings.txt new file mode 100644 index 0000000..98474d1 --- /dev/null +++ b/eigen/bench/perf_monitoring/gemm_square_settings.txt @@ -0,0 +1,11 @@ +8 8 8 +9 9 9 +12 12 12 +15 15 15 +16 16 16 +24 24 24 +102 102 102 +239 239 239 +240 240 240 +2400 2400 2400 +2463 2463 2463 diff --git a/eigen/bench/perf_monitoring/gemv.cpp b/eigen/bench/perf_monitoring/gemv.cpp new file mode 100644 index 0000000..82e5ab9 --- /dev/null +++ b/eigen/bench/perf_monitoring/gemv.cpp @@ -0,0 +1,12 @@ +#include "gemv_common.h" + +EIGEN_DONT_INLINE +void gemv(const Mat &A, const Vec &B, Vec &C) +{ + C.noalias() += A * B; +} + +int main(int argc, char **argv) +{ + return main_gemv(argc, argv, gemv); +} diff 
--git a/eigen/bench/perf_monitoring/gemv_common.h b/eigen/bench/perf_monitoring/gemv_common.h new file mode 100644 index 0000000..cc32577 --- /dev/null +++ b/eigen/bench/perf_monitoring/gemv_common.h @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include "eigen_src/Eigen/Core" +#include "../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +typedef Matrix Mat; +typedef Matrix Vec; + +template +EIGEN_DONT_INLINE +double bench(long m, long n, Func &f) +{ + Mat A(m,n); + Vec B(n); + Vec C(m); + A.setRandom(); + B.setRandom(); + C.setRandom(); + + BenchTimer t; + + double up = 1e8/sizeof(Scalar); + double tm0 = 4, tm1 = 10; + if(NumTraits::IsComplex) + { + up /= 4; + tm0 = 2; + tm1 = 4; + } + + double flops = 2. * m * n; + long rep = std::max(1., std::min(100., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, f(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +int main_gemv(int argc, char **argv, Func& f) +{ + std::vector results; + + std::string filename = std::string("gemv_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n; + while(settings >> m >> n) + { + //std::cerr << " Testing " << m << " " << n << std::endl; + results.push_back( bench(m, n, f) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/eigen/bench/perf_monitoring/gemv_settings.txt b/eigen/bench/perf_monitoring/gemv_settings.txt new file mode 100644 index 0000000..21a5ee0 --- /dev/null +++ b/eigen/bench/perf_monitoring/gemv_settings.txt @@ -0,0 +1,11 @@ +8 8 +9 9 +24 24 +239 239 +240 240 +2400 24 +24 2400 +24 240 +2400 2400 +4800 23 +23 4800 diff --git a/eigen/bench/perf_monitoring/gemv_square_settings.txt b/eigen/bench/perf_monitoring/gemv_square_settings.txt new file mode 100644 index 0000000..5165759 --- /dev/null +++ 
b/eigen/bench/perf_monitoring/gemv_square_settings.txt @@ -0,0 +1,13 @@ +8 8 +9 9 +12 12 +15 15 +16 16 +24 24 +53 53 +74 74 +102 102 +239 239 +240 240 +2400 2400 +2463 2463 diff --git a/eigen/bench/perf_monitoring/gemvt.cpp b/eigen/bench/perf_monitoring/gemvt.cpp new file mode 100644 index 0000000..fe94576 --- /dev/null +++ b/eigen/bench/perf_monitoring/gemvt.cpp @@ -0,0 +1,12 @@ +#include "gemv_common.h" + +EIGEN_DONT_INLINE +void gemv(const Mat &A, Vec &B, const Vec &C) +{ + B.noalias() += A.transpose() * C; +} + +int main(int argc, char **argv) +{ + return main_gemv(argc, argv, gemv); +} diff --git a/eigen/bench/perf_monitoring/lazy_gemm.cpp b/eigen/bench/perf_monitoring/lazy_gemm.cpp new file mode 100644 index 0000000..7733060 --- /dev/null +++ b/eigen/bench/perf_monitoring/lazy_gemm.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include "../../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +template +EIGEN_DONT_INLINE +void lazy_gemm(const MatA &A, const MatB &B, MatC &C) +{ +// escape((void*)A.data()); +// escape((void*)B.data()); + C.noalias() += A.lazyProduct(B); +// escape((void*)C.data()); +} + +template +EIGEN_DONT_INLINE +double bench() +{ + typedef Matrix MatA; + typedef Matrix MatB; + typedef Matrix MatC; + + MatA A(m,k); + MatB B(k,n); + MatC C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e7*4/sizeof(Scalar); + double tm0 = 10, tm1 = 20; + + double flops = 2. 
* m * n * k; + long rep = std::max(10., std::min(10000., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, lazy_gemm(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +double bench_t(int t) +{ + if(t) + return bench(); + else + return bench(); +} + +EIGEN_DONT_INLINE +double bench_mnk(int m, int n, int k, int t) +{ + int id = m*10000 + n*100 + k; + switch(id) { + case 10101 : return bench_t< 1, 1, 1>(t); break; + case 20202 : return bench_t< 2, 2, 2>(t); break; + case 30303 : return bench_t< 3, 3, 3>(t); break; + case 40404 : return bench_t< 4, 4, 4>(t); break; + case 50505 : return bench_t< 5, 5, 5>(t); break; + case 60606 : return bench_t< 6, 6, 6>(t); break; + case 70707 : return bench_t< 7, 7, 7>(t); break; + case 80808 : return bench_t< 8, 8, 8>(t); break; + case 90909 : return bench_t< 9, 9, 9>(t); break; + case 101010 : return bench_t<10,10,10>(t); break; + case 111111 : return bench_t<11,11,11>(t); break; + case 121212 : return bench_t<12,12,12>(t); break; + } + return 0; +} + +int main(int argc, char **argv) +{ + std::vector results; + + std::string filename = std::string("lazy_gemm_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n, k, t; + while(settings >> m >> n >> k >> t) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench_mnk(m, n, k, t) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/eigen/bench/perf_monitoring/lazy_gemm_settings.txt b/eigen/bench/perf_monitoring/lazy_gemm_settings.txt new file mode 100644 index 0000000..407d5d4 --- /dev/null +++ b/eigen/bench/perf_monitoring/lazy_gemm_settings.txt @@ -0,0 +1,15 @@ +1 1 1 0 +2 2 2 0 +3 3 3 0 +4 4 4 0 +4 4 4 1 +5 5 5 0 +6 6 6 0 +7 7 7 0 +7 7 7 1 +8 8 8 0 +9 9 9 0 +10 10 10 0 +11 11 11 0 +12 12 12 0 +12 12 12 1 diff --git a/eigen/bench/perf_monitoring/llt.cpp 
b/eigen/bench/perf_monitoring/llt.cpp new file mode 100644 index 0000000..d55b7d8 --- /dev/null +++ b/eigen/bench/perf_monitoring/llt.cpp @@ -0,0 +1,15 @@ +#include "gemm_common.h" +#include + +EIGEN_DONT_INLINE +void llt(const Mat &A, const Mat &B, Mat &C) +{ + C = A; + C.diagonal().array() += 1000; + Eigen::internal::llt_inplace::blocked(C); +} + +int main(int argc, char **argv) +{ + return main_gemm(argc, argv, llt); +} diff --git a/eigen/bench/perf_monitoring/make_plot.sh b/eigen/bench/perf_monitoring/make_plot.sh new file mode 100644 index 0000000..ca9fa96 --- /dev/null +++ b/eigen/bench/perf_monitoring/make_plot.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# $1 is the base name of the bench: the script reads $1.out +# and generates $1.pdf, $1.png and $1.html ($2 is stored but unused) +# $3 is the settings file: each non-blank line names one plotted column +WHAT=$1 +bench=$2 +settings_file=$3 + +header="rev " +while read line +do + if [ ! -z "$line" ]; then + header="$header \"$line\"" + fi +done < $settings_file + +echo $header > $WHAT.out.header +cat $WHAT.out >> $WHAT.out.header + + +echo "set title '$WHAT'" > $WHAT.gnuplot +echo "set key autotitle columnhead outside " >> $WHAT.gnuplot +echo "set xtics rotate 1" >> $WHAT.gnuplot + +echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot +echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot + +col=`cat $settings_file | wc -l` +echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot +echo " " >> $WHAT.gnuplot + +gnuplot -persist < $WHAT.gnuplot + +# generate a png file (thumbnail) +convert -colors 256 -background white -density 300 -resize 300 -quality 0 $WHAT.pdf -background white -flatten $WHAT.png + +# clean +rm $WHAT.out.header $WHAT.gnuplot + + +# generate html/svg graph + +echo " " > $WHAT.html +cat resources/chart_header.html > $WHAT.html +echo 'var customSettings = {"TITLE":"","SUBTITLE":"","XLABEL":"","YLABEL":""};' >> $WHAT.html +# 'data' is an array of datasets (i.e.
curves), each of which is an object of the form +# { +# key: , +# color: , +# values: [{ +# r: , +# v: +# }] +# } +echo 'var data = [' >> $WHAT.html + +col=2 +while read line +do + if [ ! -z "$line" ]; then + header="$header \"$line\"" + echo '{"key":"'$line'","values":[' >> $WHAT.html + i=0 + while read line2 + do + if [ ! -z "$line2" ]; then + echo '{"r":'$i',"v":'`echo $line2 | cut -f $col -d ' '`'},' >> $WHAT.html + ((i++)) + fi + done < $WHAT.out + echo ']},' >> $WHAT.html + ((col++)) + fi +done < $settings_file +echo '];' >> $WHAT.html + +echo 'var changesets = [' >> $WHAT.html +while read line2 +do + if [ ! -z "$line2" ]; then + echo '"'`echo $line2 | cut -f 1 -d ' '`'",' >> $WHAT.html + fi +done < $WHAT.out +echo '];' >> $WHAT.html + +echo 'var changesets_count = [' >> $WHAT.html +i=0 +while read line2 +do + if [ ! -z "$line2" ]; then + echo $i ',' >> $WHAT.html + ((i++)) + fi +done < $WHAT.out +echo '];' >> $WHAT.html + +cat resources/chart_footer.html >> $WHAT.html diff --git a/eigen/bench/perf_monitoring/resources/chart_footer.html b/eigen/bench/perf_monitoring/resources/chart_footer.html new file mode 100644 index 0000000..8acc69f --- /dev/null +++ b/eigen/bench/perf_monitoring/resources/chart_footer.html @@ -0,0 +1,37 @@ + /* setup the chart and its options */ + var chart = nv.models.lineChart() + .color(d3.scale.category10().range()) + .margin({left: 75, bottom: 100}) + .forceX([0]).forceY([0]); + + chart.x(function(datum){ return datum.r; }) + .xAxis.options({ + axisLabel: customSettings.XLABEL || 'Changeset', + tickFormat: d3.format('.0f') + }); + chart.xAxis + .tickValues(changesets_count) + .tickFormat(function(d){return changesets[d]}) + .rotateLabels(-90); + + chart.y(function(datum){ return datum.v; }) + .yAxis.options({ + axisLabel: customSettings.YLABEL || 'GFlops'/*, + tickFormat: function(val){ return d3.format('.0f')(val) + ' GFlops'; }*/ + }); + + //chart.useInteractiveGuideline(true); + d3.select('#chart').datum(data).call(chart); + var
plot = d3.select('#chart > g'); + + /* setup the title */ + plot.append('text') + .style('font-size', '24px') + .attr('text-anchor', 'middle').attr('x', '50%').attr('y', '20px') + .text(customSettings.TITLE || ''); + + /* ensure the chart is responsive */ + nv.utils.windowResize(chart.update); + + + \ No newline at end of file diff --git a/eigen/bench/perf_monitoring/resources/chart_header.html b/eigen/bench/perf_monitoring/resources/chart_header.html new file mode 100644 index 0000000..bb9ddff --- /dev/null +++ b/eigen/bench/perf_monitoring/resources/chart_header.html @@ -0,0 +1,46 @@ + + + + + + + + + + + + +