My code slices large tensors on the CPU by index and then asynchronously transfers the result back to the GPU. Using the profiler, I found that this step, which is executed many times, seriously slows down the program: it occupies CPU time and leaves the GPU starved.
This is a partial screenshot from the Profiler. It can be seen that the slicing operation takes up a lot of CPU time:

Here is my demo code. For ease of testing, I extracted it from my original code, kept the tensor shapes the same, and timed the slicing both before and after sorting the indices:
import torch
import time
import numpy as np
import random
def set_seed(seed_value):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator, current CUDA device and all CUDA devices), then sets the
    cuDNN ``deterministic`` flag and clears the cuDNN ``benchmark`` flag.

    Args:
        seed_value: Integer seed passed unchanged to each generator.
    """
    # Host-side generators.
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch generators: CPU first, then the CUDA ones.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed_value)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def _gather_to_pinned_host(cache, head_index, token_index):
    """Gather ``cache[:, head_index, token_index, :]`` into pinned host memory."""
    return cache[:, head_index, token_index, :].cpu().pin_memory()


def _timed_gather(key_cache, value_cache, head_index, token_index):
    """Gather both caches to pinned host memory and time the operation.

    Returns:
        A ``(key_host, value_host, seconds)`` tuple.
    """
    # Drain any pending GPU work so it is not charged to this measurement.
    torch.cuda.synchronize()
    start = time.perf_counter()
    key_host = _gather_to_pinned_host(key_cache, head_index, token_index)
    value_host = _gather_to_pinned_host(value_cache, head_index, token_index)
    # The blocking ``.cpu()`` copy has already waited for the GPU, so the
    # wall-clock timer can be read without another synchronize.
    elapsed = time.perf_counter() - start
    return key_host, value_host, elapsed


def _upload_on_side_stream(key_host, value_host):
    """Copy two pinned host tensors to the GPU on a dedicated CUDA stream.

    The copies are issued with ``non_blocking=True`` and the stream is
    synchronized before returning, so the results are ready to use.
    """
    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        key_gpu = key_host.cuda(non_blocking=True)
        value_gpu = value_host.cuda(non_blocking=True)
    stream.synchronize()
    return key_gpu, value_gpu


def upload_from_cpu_and_compare_slicing_time():
    """Compare gather time for unsorted versus sorted indices.

    Builds float16 key/value caches of shape ``(1, 32, 2048, 128)`` on the
    GPU and int32 indices of shape ``(32, 2048)`` on the CPU. For both the
    raw and the per-row sorted indices it gathers the selected rows, copies
    them to pinned host memory (timed), and uploads them back to the GPU on
    a side stream (not timed).

    Note:
        The caches live on the GPU, so the timed region covers the indexing,
        the device-to-host copy and the pinning, not a pure CPU slice. The
        printed labels are kept as they were.

    Returns:
        ``(key_gpu_unsorted, value_gpu_unsorted, key_gpu_sorted,
        value_gpu_sorted)``, all CUDA tensors.
    """
    set_seed(42)
    key_cache_shape = (1, 32, 2048, 128)
    indices_shape = (32, 2048)

    # Already created on the CPU, so no extra ``.cpu()`` round trip is needed.
    indices = torch.randint(
        0, key_cache_shape[2], indices_shape, dtype=torch.int32, device="cpu"
    )
    key_cache = torch.randn(key_cache_shape, dtype=torch.float16, device="cuda")
    value_cache = torch.randn(key_cache_shape, dtype=torch.float16, device="cuda")

    # Column vector of head ids; broadcasts against the (32, 2048) indices.
    head_index = torch.arange(key_cache_shape[1]).unsqueeze(1)
    indices_sorted, _ = torch.sort(indices)

    # Untimed warm-up: pays one-time costs (kernel loading, pinned-memory
    # allocator set-up) that would otherwise be charged to whichever variant
    # happens to be measured first.
    _gather_to_pinned_host(key_cache, head_index, indices)

    print("--- Unsorted indices slicing time ---")
    key_cpu_unsorted, value_cpu_unsorted, unsorted_seconds = _timed_gather(
        key_cache, value_cache, head_index, indices
    )
    print(f"CPU slicing time (unsorted indices): {unsorted_seconds:.6f} seconds")
    key_gpu_unsorted, value_gpu_unsorted = _upload_on_side_stream(
        key_cpu_unsorted, value_cpu_unsorted
    )

    print("\n--- Sorted indices slicing time ---")
    key_cpu_sorted, value_cpu_sorted, sorted_seconds = _timed_gather(
        key_cache, value_cache, head_index, indices_sorted
    )
    print(f"CPU slicing time (sorted indices): {sorted_seconds:.6f} seconds")
    key_gpu_sorted, value_gpu_sorted = _upload_on_side_stream(
        key_cpu_sorted, value_cpu_sorted
    )

    return key_gpu_unsorted, value_gpu_unsorted, key_gpu_sorted, value_gpu_sorted
# Run the benchmark only when executed as a script on a machine with CUDA;
# otherwise do nothing.
if __name__ == "__main__" and torch.cuda.is_available():
    (
        key_unsorted,
        value_unsorted,
        key_sorted,
        value_sorted,
    ) = upload_from_cpu_and_compare_slicing_time()
I tried to reduce the slicing time, for example by sorting `_indices` first. This sped up the slicing to some extent, but are there more effective improvements? The demo prints:
--- Unsorted indices slicing time ---
CPU slicing time (unsorted indices): 0.075565 seconds
--- Sorted indices slicing time ---
CPU slicing time (sorted indices): 0.028489 seconds