5

I implemented a simple 2D array wrapper using a single contiguous std::vector and, benchmarked it against std::vector<std::vector<>>.

Surprisingly, array2d is just slightly faster!

#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
#include <cstdlib>

template<class T>
class array2d {
    size_t nc{0}, nr{0};
    std::vector<T> vec{};

public:
    array2d(size_t rows, size_t cols)
        : nc(cols), nr(rows), vec(rows * cols) {}

    template<class Self>
    auto&& operator[](this Self&& self, size_t i, size_t j) {
        auto&& a = std::forward<Self>(self);
        return a.vec[i * a.nc + j];
    }

    size_t rows() const { return nr; }
    size_t cols() const { return nc; }
};

using Clock = std::chrono::high_resolution_clock;
using Duration = std::chrono::duration<double, std::milli>;

void benchmark_write() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Write Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Benchmark array2d
    volatile int dummy1 = 0;
    auto t1 = Clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        array2d<int> arr(ROWS, COLS);
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                arr[i, j] = i * COLS + j;
            }
        }
        dummy1 += arr[0, 0];
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector>
    volatile int dummy2 = 0;
    auto t3 = Clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                vec[i][j] = i * COLS + j;
            }
        }
        dummy2 += vec[0][0];
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

void benchmark_read() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Read Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Setup array2d
    array2d<int> arr(ROWS, COLS);
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            arr[i, j] = i * COLS + j;
        }
    }

    // Setup vector<vector>
    std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            vec[i][j] = i * COLS + j;
        }
    }

    // Benchmark array2d reads
    auto t1 = Clock::now();
    volatile int sum1 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum1 += arr[i, j];
            }
        }
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector> reads
    auto t3 = Clock::now();
    volatile int sum2 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum2 += vec[i][j];
            }
        }
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

void benchmark_sequential_access() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Sequential Access Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Setup array2d
    array2d<int> arr(ROWS, COLS);
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            arr[i, j] = i * COLS + j;
        }
    }

    // Setup vector<vector>
    std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            vec[i][j] = i * COLS + j;
        }
    }

    // Benchmark array2d sequential reads
    auto t1 = Clock::now();
    volatile int sum1 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum1 += arr[i, j];
            }
        }
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector> sequential reads
    auto t3 = Clock::now();
    volatile int sum2 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum2 += vec[i][j];
            }
        }
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

int main() {
    std::cout << "array2d vs std::vector<std::vector<T>> Performance Benchmark\n";
    std::cout << "=========================================================\n";

    benchmark_write();
    benchmark_read();
    benchmark_sequential_access();

    std::cout << "\n" << std::string(50, '=') << "\n";
    std::cout << "Benchmarks complete!\n";

    return 0;
}

The output:

array2d vs std::vector<std::vector<T>> Performance Benchmark

=== Write Benchmark (5000x5000) ===
array2d:           140.291 ms
vector<vector>:    183.976 ms
Speedup:           1.311x

=== Read Benchmark (5000x5000) ===
array2d:           240.611 ms
vector<vector>:    241.066 ms
Speedup:           1.002x

=== Sequential Access Benchmark (5000x5000) ===
array2d:           240.598 ms
vector<vector>:    242.433 ms
Speedup:           1.008x

==================================================
Benchmarks complete!

My questions:

  • Why is array2d not dramatically faster than vector?
  • Are there compiler optimizations or memory access patterns that make vector competitive for small to medium arrays?
  • Is there a better way to implement a user-defined contiguous 2D array that shows a clear speedup over vector?

I am using:

  • clang version 21.1.4
  • arm64-apple-darwin24.6.0
  • clang++ -O3 -march=native -std=c++23 benchmark.cpp -o benchmark
15
  • No because it is straightforward Commented yesterday
  • 1
    @abcdefg I will definitely investigate that. Thanks for your comment! Commented yesterday
  • 3
    My guess would be that a 5kx5k array is big enough that the small performance loss from changing between memory locations when switching rows isn't significant. If you try smaller sizes you might see more difference Commented yesterday
  • 3
    @JohnBollinger arr[i, j] is valid since C++23, which the OP is using. The operator[] can be overloaded on user-defined types to accept multiple input arguments separated by comma. See P2128R6 Commented yesterday
  • 1
    Quick bench repro gcc and clang - Quick-bench has to old compilers to use "deducing this". Commented yesterday

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.