I implemented a simple 2D array wrapper using a single contiguous std::vector and, benchmarked it against std::vector<std::vector<>>.
Surprisingly, array2d is just slightly faster!
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
#include <cstdlib>
template<class T>
class array2d {
size_t nc{0}, nr{0};
std::vector<T> vec{};
public:
array2d(size_t rows, size_t cols)
: nc(cols), nr(rows), vec(rows * cols) {}
template<class Self>
auto&& operator[](this Self&& self, size_t i, size_t j) {
auto&& a = std::forward<Self>(self);
return a.vec[i * a.nc + j];
}
size_t rows() const { return nr; }
size_t cols() const { return nc; }
};
using Clock = std::chrono::high_resolution_clock;
using Duration = std::chrono::duration<double, std::milli>;
void benchmark_write() {
constexpr size_t ROWS = 5000;
constexpr size_t COLS = 5000;
constexpr int ITERATIONS = 10;
std::cout << "\n=== Write Benchmark (" << ROWS << "x" << COLS << ") ===\n";
// Benchmark array2d
volatile int dummy1 = 0;
auto t1 = Clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
array2d<int> arr(ROWS, COLS);
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
arr[i, j] = i * COLS + j;
}
}
dummy1 += arr[0, 0];
}
auto t2 = Clock::now();
Duration array2d_time = t2 - t1;
// Benchmark vector<vector>
volatile int dummy2 = 0;
auto t3 = Clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
vec[i][j] = i * COLS + j;
}
}
dummy2 += vec[0][0];
}
auto t4 = Clock::now();
Duration vector_time = t4 - t3;
std::cout << std::fixed << std::setprecision(3);
std::cout << "array2d: " << array2d_time.count() << " ms\n";
std::cout << "vector<vector>: " << vector_time.count() << " ms\n";
std::cout << "Speedup: " << vector_time.count() / array2d_time.count() << "x\n";
}
void benchmark_read() {
constexpr size_t ROWS = 5000;
constexpr size_t COLS = 5000;
constexpr int ITERATIONS = 10;
std::cout << "\n=== Read Benchmark (" << ROWS << "x" << COLS << ") ===\n";
// Setup array2d
array2d<int> arr(ROWS, COLS);
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
arr[i, j] = i * COLS + j;
}
}
// Setup vector<vector>
std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
vec[i][j] = i * COLS + j;
}
}
// Benchmark array2d reads
auto t1 = Clock::now();
volatile int sum1 = 0;
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
sum1 += arr[i, j];
}
}
}
auto t2 = Clock::now();
Duration array2d_time = t2 - t1;
// Benchmark vector<vector> reads
auto t3 = Clock::now();
volatile int sum2 = 0;
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
sum2 += vec[i][j];
}
}
}
auto t4 = Clock::now();
Duration vector_time = t4 - t3;
std::cout << std::fixed << std::setprecision(3);
std::cout << "array2d: " << array2d_time.count() << " ms\n";
std::cout << "vector<vector>: " << vector_time.count() << " ms\n";
std::cout << "Speedup: " << vector_time.count() / array2d_time.count() << "x\n";
}
void benchmark_sequential_access() {
constexpr size_t ROWS = 5000;
constexpr size_t COLS = 5000;
constexpr int ITERATIONS = 10;
std::cout << "\n=== Sequential Access Benchmark (" << ROWS << "x" << COLS << ") ===\n";
// Setup array2d
array2d<int> arr(ROWS, COLS);
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
arr[i, j] = i * COLS + j;
}
}
// Setup vector<vector>
std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
vec[i][j] = i * COLS + j;
}
}
// Benchmark array2d sequential reads
auto t1 = Clock::now();
volatile int sum1 = 0;
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
sum1 += arr[i, j];
}
}
}
auto t2 = Clock::now();
Duration array2d_time = t2 - t1;
// Benchmark vector<vector> sequential reads
auto t3 = Clock::now();
volatile int sum2 = 0;
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (size_t i = 0; i < ROWS; ++i) {
for (size_t j = 0; j < COLS; ++j) {
sum2 += vec[i][j];
}
}
}
auto t4 = Clock::now();
Duration vector_time = t4 - t3;
std::cout << std::fixed << std::setprecision(3);
std::cout << "array2d: " << array2d_time.count() << " ms\n";
std::cout << "vector<vector>: " << vector_time.count() << " ms\n";
std::cout << "Speedup: " << vector_time.count() / array2d_time.count() << "x\n";
}
int main() {
std::cout << "array2d vs std::vector<std::vector<T>> Performance Benchmark\n";
std::cout << "=========================================================\n";
benchmark_write();
benchmark_read();
benchmark_sequential_access();
std::cout << "\n" << std::string(50, '=') << "\n";
std::cout << "Benchmarks complete!\n";
return 0;
}
The output:
array2d vs std::vector<std::vector<T>> Performance Benchmark
=== Write Benchmark (5000x5000) ===
array2d: 140.291 ms
vector<vector>: 183.976 ms
Speedup: 1.311x
=== Read Benchmark (5000x5000) ===
array2d: 240.611 ms
vector<vector>: 241.066 ms
Speedup: 1.002x
=== Sequential Access Benchmark (5000x5000) ===
array2d: 240.598 ms
vector<vector>: 242.433 ms
Speedup: 1.008x
==================================================
Benchmarks complete!
My questions:
- Why is array2d not dramatically faster than vector?
- Are there compiler optimizations or memory access patterns that make vector competitive for small to medium arrays?
- Is there a better way to implement a user-defined contiguous 2D array that shows a clear speedup over vector?
I am using:
- clang version 21.1.4
- arm64-apple-darwin24.6.0
- clang++ -O3 -march=native -std=c++23 benchmark.cpp -o benchmark
arr[i, j]is valid since C++23, which the OP is using. Theoperator[]can be overloaded on user-defined types to accept multiple input arguments separated by comma. See P2128R6