Why is my user-defined contiguous 2D array not dramatically faster than std::vector<std::vector<>>?

Ask Question

Asked yesterday

Modified yesterday

Viewed 181 times

I implemented a simple 2D array wrapper using a single contiguous std::vector and, benchmarked it against std::vector<std::vector<>>.

Surprisingly, array2d is just slightly faster!

#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
#include <cstdlib>

template<class T>
class array2d {
    size_t nc{0}, nr{0};
    std::vector<T> vec{};

public:
    array2d(size_t rows, size_t cols)
        : nc(cols), nr(rows), vec(rows * cols) {}

    template<class Self>
    auto&& operator[](this Self&& self, size_t i, size_t j) {
        auto&& a = std::forward<Self>(self);
        return a.vec[i * a.nc + j];
    }

    size_t rows() const { return nr; }
    size_t cols() const { return nc; }
};

using Clock = std::chrono::high_resolution_clock;
using Duration = std::chrono::duration<double, std::milli>;

void benchmark_write() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Write Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Benchmark array2d
    volatile int dummy1 = 0;
    auto t1 = Clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        array2d<int> arr(ROWS, COLS);
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                arr[i, j] = i * COLS + j;
            }
        }
        dummy1 += arr[0, 0];
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector>
    volatile int dummy2 = 0;
    auto t3 = Clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                vec[i][j] = i * COLS + j;
            }
        }
        dummy2 += vec[0][0];
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

void benchmark_read() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Read Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Setup array2d
    array2d<int> arr(ROWS, COLS);
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            arr[i, j] = i * COLS + j;
        }
    }

    // Setup vector<vector>
    std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            vec[i][j] = i * COLS + j;
        }
    }

    // Benchmark array2d reads
    auto t1 = Clock::now();
    volatile int sum1 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum1 += arr[i, j];
            }
        }
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector> reads
    auto t3 = Clock::now();
    volatile int sum2 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum2 += vec[i][j];
            }
        }
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

void benchmark_sequential_access() {
    constexpr size_t ROWS = 5000;
    constexpr size_t COLS = 5000;
    constexpr int ITERATIONS = 10;

    std::cout << "\n=== Sequential Access Benchmark (" << ROWS << "x" << COLS << ") ===\n";

    // Setup array2d
    array2d<int> arr(ROWS, COLS);
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            arr[i, j] = i * COLS + j;
        }
    }

    // Setup vector<vector>
    std::vector<std::vector<int>> vec(ROWS, std::vector<int>(COLS));
    for (size_t i = 0; i < ROWS; ++i) {
        for (size_t j = 0; j < COLS; ++j) {
            vec[i][j] = i * COLS + j;
        }
    }

    // Benchmark array2d sequential reads
    auto t1 = Clock::now();
    volatile int sum1 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum1 += arr[i, j];
            }
        }
    }
    auto t2 = Clock::now();
    Duration array2d_time = t2 - t1;

    // Benchmark vector<vector> sequential reads
    auto t3 = Clock::now();
    volatile int sum2 = 0;
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (size_t i = 0; i < ROWS; ++i) {
            for (size_t j = 0; j < COLS; ++j) {
                sum2 += vec[i][j];
            }
        }
    }
    auto t4 = Clock::now();
    Duration vector_time = t4 - t3;

    std::cout << std::fixed << std::setprecision(3);
    std::cout << "array2d:           " << array2d_time.count() << " ms\n";
    std::cout << "vector<vector>:    " << vector_time.count() << " ms\n";
    std::cout << "Speedup:           " << vector_time.count() / array2d_time.count() << "x\n";
}

int main() {
    std::cout << "array2d vs std::vector<std::vector<T>> Performance Benchmark\n";
    std::cout << "=========================================================\n";

    benchmark_write();
    benchmark_read();
    benchmark_sequential_access();

    std::cout << "\n" << std::string(50, '=') << "\n";
    std::cout << "Benchmarks complete!\n";

    return 0;
}

The output:

array2d vs std::vector<std::vector<T>> Performance Benchmark

=== Write Benchmark (5000x5000) ===
array2d:           140.291 ms
vector<vector>:    183.976 ms
Speedup:           1.311x

=== Read Benchmark (5000x5000) ===
array2d:           240.611 ms
vector<vector>:    241.066 ms
Speedup:           1.002x

=== Sequential Access Benchmark (5000x5000) ===
array2d:           240.598 ms
vector<vector>:    242.433 ms
Speedup:           1.008x

==================================================
Benchmarks complete!

My questions:

Why is array2d not dramatically faster than vector?
Are there compiler optimizations or memory access patterns that make vector competitive for small to medium arrays?
Is there a better way to implement a user-defined contiguous 2D array that shows a clear speedup over vector?

I am using:

clang version 21.1.4
arm64-apple-darwin24.6.0
clang++ -O3 -march=native -std=c++23 benchmark.cpp -o benchmark

edited yesterday

asked yesterday

Ali Sedighi

1515 bronze badges

No because it is straightforward

Ali Sedighi
– Ali Sedighi

2025-11-20 22:01:46 +00:00
Commented yesterday
1

@abcdefg I will definitely investigate that. Thanks for your comment!

Ali Sedighi
– Ali Sedighi

2025-11-20 22:18:38 +00:00
Commented yesterday
3

My guess would be that a 5kx5k array is big enough that the small performance loss from changing between memory locations when switching rows isn't significant. If you try smaller sizes you might see more difference

Alan Birtles
– Alan Birtles

2025-11-20 22:23:44 +00:00
Commented yesterday
3

@JohnBollinger arr[i, j] is valid since C++23, which the OP is using. The operator[] can be overloaded on user-defined types to accept multiple input arguments separated by comma. See P2128R6

Remy Lebeau
– Remy Lebeau

2025-11-20 22:57:38 +00:00
Commented yesterday
1

Quick bench repro gcc and clang - Quick-bench has to old compilers to use "deducing this".

Marek R
– Marek R

2025-11-20 23:14:22 +00:00
Commented yesterday

| Show 10 more comments

0 Your Answer

Sign up or log in

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.

Collectives™ on Stack Overflow

Why is my user-defined contiguous 2D array not dramatically faster than std::vector<std::vector<>>?

0

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Linked