(I post this self-answered question to share my own tests.) There are a huge number of ways to join two DataFrames in Python/Pandas. Previous performance analyses indicated that DataFrame.join is faster than DataFrame.merge, and that it is best to have one table indexed on the join column. Neither holds anymore: DataFrame.merge has improved a lot, although I find it is still slower than some alternatives. Is there an updated performance comparison of these methods on the latest versions of Python/Pandas in 2024? In particular, I am interested in the most common case: joining a large left table to a smaller right table, retaining the left table's index, with no missing values (so join gives the same result as left join). The sizes are about 10000000 rows for the left table and 100000 rows for the right table, which may have up to 10 columns. Copy-on-write mode should be turned on, since it is slated to become the default in Pandas.
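For concreteness, a minimal sketch of the operation being benchmarked (toy sizes; the column names x, b, y are illustrative):

import numpy as np
import pandas as pd

pd.set_option("mode.copy_on_write", True)

rng = np.random.default_rng()
# large left table; "x" is the join key, "b" is a payload column
df1 = pd.DataFrame({"x": rng.integers(0, 100, 10000),
                    "b": rng.integers(0, 100, 10000)})
# small right table with unique keys covering every value in df1["x"]
df2 = pd.DataFrame({"x": np.arange(100), "y": np.arange(100)})

# the join in question: a left join that keeps df1's index
result = df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left")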
1 Answer
We start with a DataFrame df1 of m = 10000000 rows and two columns: x, the column to be joined on, and b, a payload column; both hold random integers in range(0, n) with n = 100000. For the second table df2 we draw n distinct keys r2_index at random from range(0, 2 * n), use them as df2's x column, and remap df1's x through them, so that every key in df1 occurs in df2. Our goal is to assess alternatives to df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left") that retain df1's index where possible. It turns out that the fastest alternative is the relatively unknown pd.concat([df1, df2.set_index("x").reindex(df1["x"]).set_index(df1.index)], axis=1), which takes about 55-70% less time than merge (roughly 2.3-3x faster), depending on the number of columns.
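Here is a minimal sketch of that pattern on toy data (names are illustrative), showing that it produces the same result as merge: reindex looks up each df1["x"] in df2's index, and set_index(df1.index) restores df1's index so that concat aligns rows positionally.

import pandas as pd

df1 = pd.DataFrame({"x": [2, 0, 2, 1], "b": [10, 11, 12, 13]})
df2 = pd.DataFrame({"x": [0, 1, 2], "y0": [5, 6, 7]})

joined = pd.concat(
    [df1, df2.set_index("x").reindex(df1["x"]).set_index(df1.index)],
    axis=1)
merged = df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left")
assert joined.equals(merged)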
Here are the timings for each method on Python 3.12.1 and Pandas 2.2.3, where k is the number of payload columns in df2. Methods marked with * do not build a proper DataFrame: *reindex_stack returns a bare NumPy array and *reindex_build_dict a dict of column arrays.
---- m=10000000 n=100000 k=1
merge 0.7543465448585006
merge_index 0.849070595834443
join 0.7793994406921019
*reindex_stack 0.31846871529001697
*reindex_build_dict 0.25757221154861093
reindex_build_df 0.28695021016145833
reindex_set 0.29311707158077266
reindex_assign 0.24651527803225984
reindex_concat 0.24365811587138583
loc_concat 0.45010350695583445
numba_by_rows 0.5910008519405143
multi_map 0.44640477491307334
set 0.22621148322603754
map + numpy_indexing 0.34636545836644167
unique + map + numpy_indexing 2.0781196703988827
numba_map + numpy_indexing 0.30156832467717676
---- m=10000000 n=100000 k=5
merge 0.8237963316921826
merge_index 0.8338317163324973
join 0.8705977065837942
*reindex_stack 0.455108648129997
*reindex_build_dict 0.3370581329334527
reindex_build_df 0.47327432681711135
reindex_set 0.3697347569634855
reindex_assign 0.43470075648103373
reindex_concat 0.3544610048963383
loc_concat 0.5667034523899525
numba_by_rows 0.8590000060842916
multi_map 3.523335634250543
set 3.0202132660015195
map + numpy_indexing 1.1369471510002365
unique + map + numpy_indexing 4.243288217335551
numba_map + numpy_indexing 0.9269536811810791
---- m=10000000 n=100000 k=10
merge 1.2057412993340728
merge_index 1.234110462778416
join 1.0439695833993028
*reindex_stack 0.6809621169338546
*reindex_build_dict 0.4961718555236335
reindex_build_df 0.673938729132836
reindex_set 0.49392068914312404
reindex_assign 0.6563574546235031
reindex_concat 0.48132824199897833
loc_concat 0.7141473570663948
numba_by_rows 1.2562420190006378
multi_map 4.51269712166201
set 2.241050166799687
map + numpy_indexing 1.0398264798990566
unique + map + numpy_indexing 2.7061847894947277
numba_map + numpy_indexing 0.9649961699099301
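The full benchmark script follows. Each method's result is validated against the first method's via check() before timing, and auto_timer repeats each function until roughly max_time = 1 second (or max_iters = 30 iterations) has been spent, reporting the mean time per run.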
#!/usr/bin/env python3
import timeit
import numba
import numba.typed.typeddict
import numpy as np
import pandas as pd
def main():
    pd.set_option("mode.copy_on_write", True)
    max_time = 1
    max_iters = 30
    for m in [10000000]:
        for n in [100000]:
            # for m in (1000, 10000, 100000, 1000000, 10000000):
            #     for n in (10, 1000, 100000):
            if n > m:
                continue
            for k in (1, 5, 10):
                test(m, n, k, max_time, max_iters)
def test(m, n, k, max_time, max_iters):
    print(f'---- {m=} {n=} {k=}')
    np_rand = np.random.default_rng()
    # df1's columns: "x" is the join key, "b" is a payload column
    r1_x = np_rand.integers(0, n, m)
    r1_b = np_rand.integers(0, n, m)
    # df2 gets n distinct keys drawn from range(0, 2 * n); remap df1's keys
    # onto them so every key in df1 is present in df2
    r2_index = np_rand.choice(np.arange(0, 2 * n), n, replace=False)
    r1_x = r2_index[r1_x]
    # k payload columns y0..y{k-1} for df2, each a shuffled permutation
    r2_dict = {}
    for i in range(k):
        t = np.arange(0, n)
        np_rand.shuffle(t)
        r2_dict["y%d" % i] = t
    check = get_check()
method = "merge"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.merge(df2, left_on="x", right_on="x", how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "merge_index"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.merge(
df2.set_index("x"), left_on="x", right_index=True, how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "join"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.join(df2.set_index("x"), on="x", how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "*reindex_stack"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
t = df2.set_index("x").reindex(df1["x"])
return np.column_stack([df1.values, t.values])
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "*reindex_build_dict"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
return {
**{
x: y.values
for x, y in df1.items()
},
**{
x: t[x].values
for i, x in enumerate(df2_cols)
}
}
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_build_df"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
res = {
**{
x: y.values
for x, y in df1.items()
},
**{
x: t[x].values
for i, x in enumerate(df2_cols)
}
}
return pd.DataFrame(res)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_set"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df1c = df1.copy()
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
df1c[df2_cols] = df2c.reindex(df1["x"]).reset_index(drop=True)
return df1c
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_assign"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
return df1.assign(**{x: t[x].values for x in df2_cols})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
# method = "reindex_assign2"
# df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
# df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
# def func():
# df2c = df2.set_index("x")
# df2_cols = list(df2c.columns)
# t = df2c.reindex(df1["x"]).values
# return df1.assign(**{x: t[:, i] for i, x in enumerate(df2_cols)})
# check(func())
# iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
# per_run = timing / iterations
# print(method, per_run)
method = "reindex_concat"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
return pd.concat(
[df1,
df2.set_index("x").reindex(df1["x"]).set_index(df1.index)],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "loc_concat"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
return pd.concat(
[df1, df2.set_index("x").loc[df1["x"]].set_index(df1.index)],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "numba_by_rows"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
@numba.njit
def helper(r1_list, r2_list):
r2_index = r2_list[:, 0]
mapping = {x: i for i, x in enumerate(r2_index)}
mapped = np.empty((r1_list.shape[0], r2_list.shape[1] - 1), "int64")
for i in range(r1_list.shape[0]):
x = r1_list[i, 0]
mapped_i = mapping[x]
mapped[i, :] = r2_list[mapped_i, 1:]
return mapped
def func():
r1_list = df1.values
r2_list = df2.values
return pd.concat([
df1,
pd.DataFrame(helper(r1_list, r2_list),
index=df1.index,
columns=df2.columns[1:])
],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
# method = "single_map_concat"
# df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
# df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
# def func():
# df2_keys, df2_values = zip(*df2.items())
# df2_index = df2_values[0]
# if len(df2_values) >= 3:
# mapping = {
# df2_index[i]: np.array([y[i] for y in df2_values[1:]])
# for i in range(df2.shape[0])
# }
# mapped = pd.DataFrame(df1["x"].map(mapping).to_list(),
# index=df1.index,
# columns=list(df2_keys[1:]))
# else:
# mapping = {
# df2_index[i]: df2_values[1][i]
# for i in range(df2.shape[0])
# }
# mapped = df1["x"].map(mapping).to_frame(df2_keys[1])
# return pd.concat([df1, mapped], axis=1)
# check(func())
# iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
# per_run = timing / iterations
# print(method, per_run)
method = "multi_map"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2_keys, df2_values = zip(*df2.items())
df2_index = df2_values[0]
mapping = [{
x: y[i]
for i, x in enumerate(df2_index)
} for y in df2_values[1:]]
return df1.assign(**{
k: df1["x"].map(mapping[i])
for i, k in enumerate(df2_keys[1:])
})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "set"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df1c = df1.set_index("x")
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
df1c[df2_cols] = df2c[df2_cols]
return df1c.reset_index()
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
r2_mapping = {x: i for i, x in enumerate(df2c.index)}
mapping = df1["x"].map(r2_mapping).values
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "unique + map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
u, inv = np.unique(df1["x"], return_inverse=True)
r2_mapping = {x: i for i, x in enumerate(df2c.index)}
mapping = pd.Series(u).map(r2_mapping).values[inv]
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "numba_map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
@numba.njit
def array_map(r, r2_index):
mapping = {x: i for i, x in enumerate(r2_index)}
ret = np.empty_like(r)
for i in range(r.shape[0]):
ret[i] = mapping[r[i]]
return ret
def func():
df2c = df2.set_index("x")
mapping = array_map(df1["x"].values, df2c.index.values)
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
def get_df_dict(df):
    return {k: x.values for k, x in df.items()}

def get_check():
    # returns a closure that asserts every method produces the same values
    first = None

    def check(res):
        if isinstance(res, pd.DataFrame):
            vals = res.values
        elif isinstance(res, dict):
            vals = pd.DataFrame(res).values
        else:
            vals = res
        nonlocal first
        if first is not None:
            assert np.all(vals == first)
        else:
            first = vals

    return check
def auto_timer(timeit_timer, max_time, max_iters, callback=None):
    """
    Calls the `timeit` method of `timeit_timer` with exponentially increasing
    iteration counts, capped by the estimated number of iterations remaining
    until total_time >= max_time, and returns the total number of iterations
    and the total time.

    :param timeit_timer: timer to run timeit on
    :param max_time: approximate maximum time to spend
    :param max_iters: maximum total number of iterations
    :param callback: `callback(total_iters, total_time)` will be called for
        every exponential update (if given)
    :return: total_iters, total_time
    """
    i = 1
    total_iters = 0
    total_time = 0
    while total_time < max_time and total_iters < max_iters:
        if total_iters > 0:
            # estimate how many more iterations fit in the remaining budget
            new_i = int(
                (max_time - total_time) / (total_time / total_iters)) + 1
            i = min(new_i, i * 2)
        total_iters += i
        total_time += timeit_timer.timeit(i)
        if callback:
            callback(total_iters, total_time)
    return total_iters, total_time
if __name__ == "__main__":
main()