(I post this self-answered question to share my own tests.) There are a huge number of ways to join two DataFrames in Python/Pandas. Previous performance analyses indicated that DataFrame.join is faster than DataFrame.merge, and that it is best to have one table indexed on the join column. Neither holds anymore: DataFrame.merge has improved a lot, although I find it is still slower than some alternatives. Is there an updated performance comparison of these methods on the latest versions of Python/Pandas in 2024? In particular, I am interested in the most common case: joining a large left table to a smaller right table, retaining the left table's index, with no missing values (so join gives the same result as left join). The sizes are about 10000000 rows for the left table and 100000 rows for the right table, which may have up to 10 columns. Copy-on-write mode should be turned on, since it is slated to become the default in Pandas.
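For concreteness, a minimal sketch of the operation being benchmarked (toy sizes; the column names x, b, y are illustrative):

import numpy as np
import pandas as pd

pd.set_option("mode.copy_on_write", True)

rng = np.random.default_rng()
# large left table; "x" is the join key, "b" is a payload column
df1 = pd.DataFrame({"x": rng.integers(0, 100, 10000),
                    "b": rng.integers(0, 100, 10000)})
# small right table with unique keys covering every value in df1["x"]
df2 = pd.DataFrame({"x": np.arange(100), "y": np.arange(100)})

# the join in question: a left join that keeps df1's index
result = df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left")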
1 Answer
We start with a DataFrame df1 of m = 10000000 rows and two columns: x, the column to be joined on, and b, a payload column; both hold random integers in range(0, n) with n = 100000. For the second table df2 we draw n distinct keys r2_index at random from range(0, 2 * n), use them as df2's x column, and remap df1's x through them, so that every key in df1 occurs in df2. Our goal is to assess alternatives to df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left") that retain df1's index where possible. It turns out that the fastest alternative is the relatively unknown pd.concat([df1, df2.set_index("x").reindex(df1["x"]).set_index(df1.index)], axis=1), which takes about 55-70% less time than merge (roughly 2.3-3x faster), depending on the number of columns.
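Here is a minimal sketch of that pattern on toy data (names are illustrative), showing that it produces the same result as merge: reindex looks up each df1["x"] in df2's index, and set_index(df1.index) restores df1's index so that concat aligns rows positionally.

import pandas as pd

df1 = pd.DataFrame({"x": [2, 0, 2, 1], "b": [10, 11, 12, 13]})
df2 = pd.DataFrame({"x": [0, 1, 2], "y0": [5, 6, 7]})

joined = pd.concat(
    [df1, df2.set_index("x").reindex(df1["x"]).set_index(df1.index)],
    axis=1)
merged = df1.merge(df2.set_index("x"), left_on="x", right_index=True, how="left")
assert joined.equals(merged)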
Here are the timings for each method on Python 3.12.1 and Pandas 2.2.3, where k is the number of payload columns in df2. Methods marked with * do not build a proper DataFrame: *reindex_stack returns a bare NumPy array and *reindex_build_dict a dict of column arrays.
---- m=10000000 n=100000 k=1
merge 0.7543465448585006
merge_index 0.849070595834443
join 0.7793994406921019
*reindex_stack 0.31846871529001697
*reindex_build_dict 0.25757221154861093
reindex_build_df 0.28695021016145833
reindex_set 0.29311707158077266
reindex_assign 0.24651527803225984
reindex_concat 0.24365811587138583
loc_concat 0.45010350695583445
numba_by_rows 0.5910008519405143
multi_map 0.44640477491307334
set 0.22621148322603754
map + numpy_indexing 0.34636545836644167
unique + map + numpy_indexing 2.0781196703988827
numba_map + numpy_indexing 0.30156832467717676
---- m=10000000 n=100000 k=5
merge 0.8237963316921826
merge_index 0.8338317163324973
join 0.8705977065837942
*reindex_stack 0.455108648129997
*reindex_build_dict 0.3370581329334527
reindex_build_df 0.47327432681711135
reindex_set 0.3697347569634855
reindex_assign 0.43470075648103373
reindex_concat 0.3544610048963383
loc_concat 0.5667034523899525
numba_by_rows 0.8590000060842916
multi_map 3.523335634250543
set 3.0202132660015195
map + numpy_indexing 1.1369471510002365
unique + map + numpy_indexing 4.243288217335551
numba_map + numpy_indexing 0.9269536811810791
---- m=10000000 n=100000 k=10
merge 1.2057412993340728
merge_index 1.234110462778416
join 1.0439695833993028
*reindex_stack 0.6809621169338546
*reindex_build_dict 0.4961718555236335
reindex_build_df 0.673938729132836
reindex_set 0.49392068914312404
reindex_assign 0.6563574546235031
reindex_concat 0.48132824199897833
loc_concat 0.7141473570663948
numba_by_rows 1.2562420190006378
multi_map 4.51269712166201
set 2.241050166799687
map + numpy_indexing 1.0398264798990566
unique + map + numpy_indexing 2.7061847894947277
numba_map + numpy_indexing 0.9649961699099301
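The full benchmark script follows. Each method's result is validated against the first method's via check() before timing, and auto_timer repeats each function until roughly max_time = 1 second (or max_iters = 30 iterations) has been spent, reporting the mean time per run.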
#!/usr/bin/env python3
import timeit
import numba
import numba.typed.typeddict
import numpy as np
import pandas as pd
def main():
    pd.set_option("mode.copy_on_write", True)
    max_time = 1
    max_iters = 30
    for m in [10000000]:
        for n in [100000]:
            # for m in (1000, 10000, 100000, 1000000, 10000000):
            #     for n in (10, 1000, 100000):
            if n > m:
                continue
            for k in (1, 5, 10):
                test(m, n, k, max_time, max_iters)
def test(m, n, k, max_time, max_iters):
    print(f'---- {m=} {n=} {k=}')
    np_rand = np.random.default_rng()
    # df1's columns: "x" is the join key, "b" is a payload column
    r1_x = np_rand.integers(0, n, m)
    r1_b = np_rand.integers(0, n, m)
    # df2 gets n distinct keys drawn from range(0, 2 * n); remap df1's keys
    # onto them so every key in df1 is present in df2
    r2_index = np_rand.choice(np.arange(0, 2 * n), n, replace=False)
    r1_x = r2_index[r1_x]
    # k payload columns y0..y{k-1} for df2, each a shuffled permutation
    r2_dict = {}
    for i in range(k):
        t = np.arange(0, n)
        np_rand.shuffle(t)
        r2_dict["y%d" % i] = t
    check = get_check()
method = "merge"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.merge(df2, left_on="x", right_on="x", how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "merge_index"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.merge(
df2.set_index("x"), left_on="x", right_index=True, how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "join"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
func = lambda: df1.join(df2.set_index("x"), on="x", how="left")
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "*reindex_stack"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
t = df2.set_index("x").reindex(df1["x"])
return np.column_stack([df1.values, t.values])
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "*reindex_build_dict"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
return {
**{
x: y.values
for x, y in df1.items()
},
**{
x: t[x].values
for i, x in enumerate(df2_cols)
}
}
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_build_df"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
res = {
**{
x: y.values
for x, y in df1.items()
},
**{
x: t[x].values
for i, x in enumerate(df2_cols)
}
}
return pd.DataFrame(res)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_set"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df1c = df1.copy()
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
df1c[df2_cols] = df2c.reindex(df1["x"]).reset_index(drop=True)
return df1c
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "reindex_assign"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
t = df2c.reindex(df1["x"])
return df1.assign(**{x: t[x].values for x in df2_cols})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
# method = "reindex_assign2"
# df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
# df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
# def func():
# df2c = df2.set_index("x")
# df2_cols = list(df2c.columns)
# t = df2c.reindex(df1["x"]).values
# return df1.assign(**{x: t[:, i] for i, x in enumerate(df2_cols)})
# check(func())
# iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
# per_run = timing / iterations
# print(method, per_run)
method = "reindex_concat"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
return pd.concat(
[df1,
df2.set_index("x").reindex(df1["x"]).set_index(df1.index)],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "loc_concat"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
return pd.concat(
[df1, df2.set_index("x").loc[df1["x"]].set_index(df1.index)],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "numba_by_rows"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
@numba.njit
def helper(r1_list, r2_list):
r2_index = r2_list[:, 0]
mapping = {x: i for i, x in enumerate(r2_index)}
mapped = np.empty((r1_list.shape[0], r2_list.shape[1] - 1), "int64")
for i in range(r1_list.shape[0]):
x = r1_list[i, 0]
mapped_i = mapping[x]
mapped[i, :] = r2_list[mapped_i, 1:]
return mapped
def func():
r1_list = df1.values
r2_list = df2.values
return pd.concat([
df1,
pd.DataFrame(helper(r1_list, r2_list),
index=df1.index,
columns=df2.columns[1:])
],
axis=1)
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
# method = "single_map_concat"
# df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
# df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
# def func():
# df2_keys, df2_values = zip(*df2.items())
# df2_index = df2_values[0]
# if len(df2_values) >= 3:
# mapping = {
# df2_index[i]: np.array([y[i] for y in df2_values[1:]])
# for i in range(df2.shape[0])
# }
# mapped = pd.DataFrame(df1["x"].map(mapping).to_list(),
# index=df1.index,
# columns=list(df2_keys[1:]))
# else:
# mapping = {
# df2_index[i]: df2_values[1][i]
# for i in range(df2.shape[0])
# }
# mapped = df1["x"].map(mapping).to_frame(df2_keys[1])
# return pd.concat([df1, mapped], axis=1)
# check(func())
# iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
# per_run = timing / iterations
# print(method, per_run)
method = "multi_map"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2_keys, df2_values = zip(*df2.items())
df2_index = df2_values[0]
mapping = [{
x: y[i]
for i, x in enumerate(df2_index)
} for y in df2_values[1:]]
return df1.assign(**{
k: df1["x"].map(mapping[i])
for i, k in enumerate(df2_keys[1:])
})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "set"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df1c = df1.set_index("x")
df2c = df2.set_index("x")
df2_cols = list(df2c.columns)
df1c[df2_cols] = df2c[df2_cols]
return df1c.reset_index()
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
r2_mapping = {x: i for i, x in enumerate(df2c.index)}
mapping = df1["x"].map(r2_mapping).values
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "unique + map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
def func():
df2c = df2.set_index("x")
u, inv = np.unique(df1["x"], return_inverse=True)
r2_mapping = {x: i for i, x in enumerate(df2c.index)}
mapping = pd.Series(u).map(r2_mapping).values[inv]
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
method = "numba_map + numpy_indexing"
df1 = pd.DataFrame({"x": r1_x, "b": r1_b})
df2 = pd.DataFrame(r2_dict, index=r2_index).reset_index(names="x")
@numba.njit
def array_map(r, r2_index):
mapping = {x: i for i, x in enumerate(r2_index)}
ret = np.empty_like(r)
for i in range(r.shape[0]):
ret[i] = mapping[r[i]]
return ret
def func():
df2c = df2.set_index("x")
mapping = array_map(df1["x"].values, df2c.index.values)
if df2c.shape[1] > 1:
return pd.concat([
df1,
pd.DataFrame(df2c.values[mapping],
index=df1.index,
columns=df2c.columns)
],
axis=1)
return df1.assign(**{df2c.columns[0]: df2c.values[mapping]})
check(func())
iterations, timing = auto_timer(timeit.Timer(func), max_time, max_iters)
per_run = timing / iterations
print(method, per_run)
def get_df_dict(df):
    return {k: x.values for k, x in df.items()}

def get_check():
    # returns a closure that asserts every method produces the same values
    first = None

    def check(res):
        if isinstance(res, pd.DataFrame):
            vals = res.values
        elif isinstance(res, dict):
            vals = pd.DataFrame(res).values
        else:
            vals = res
        nonlocal first
        if first is not None:
            assert np.all(vals == first)
        else:
            first = vals

    return check
def auto_timer(timeit_timer, max_time, max_iters, callback=None):
    """
    Calls the `timeit` method of `timeit_timer` with exponentially increasing
    iteration counts, capped by the estimated number of iterations remaining
    until total_time >= max_time, and returns the total number of iterations
    and the total time.

    :param timeit_timer: timer to run timeit on
    :param max_time: approximate maximum time to spend
    :param max_iters: maximum total number of iterations
    :param callback: `callback(total_iters, total_time)` will be called for
        every exponential update (if given)
    :return: total_iters, total_time
    """
    i = 1
    total_iters = 0
    total_time = 0
    while total_time < max_time and total_iters < max_iters:
        if total_iters > 0:
            # estimate how many more iterations fit in the remaining budget
            new_i = int(
                (max_time - total_time) / (total_time / total_iters)) + 1
            i = min(new_i, i * 2)
        total_iters += i
        total_time += timeit_timer.timeit(i)
        if callback:
            callback(total_iters, total_time)
    return total_iters, total_time
if __name__ == "__main__":
main()