I have a corpus.jsonl file that is 6.5 GB in size. I am using a single H100 GPU to embed the corpus into ChromaDB, but it is very slow. I want to find out how I can accelerate the process (GPU, CPU, I/O). Here is my original code:
import os
import torch
import chromadb
from tqdm import tqdm
import json
from sentence_transformers import SentenceTransformer
# ===================== Configuration =====================
# Input: use the preprocessed file generated in stage one directly
PREPROCESSED_FILE = "./preprocessed_chunks.jsonl"
EMBED_MODEL_PATH = "./minicpm" # path to your MiniCPM model
CHROMA_DIR = "./chroma" # persistence path
COLLECTION_NAME = "my_rag"
# Number of chunks accumulated before each encode + collection.add call
BATCH_SIZE = 2048
# Passed as normalize_embeddings to SentenceTransformer.encode
NORMALIZE = True
# ===================== Main program: build the index =====================
def _embed_and_store(embedder, collection, texts, ids, encode_batch_size):
    """Embed one accumulated batch of texts and add it to the Chroma collection."""
    vectors = embedder.encode(
        texts,
        normalize_embeddings=NORMALIZE,
        batch_size=encode_batch_size,  # mini-batch size used inside encode()
        show_progress_bar=True,
    ).tolist()
    collection.add(documents=texts, embeddings=vectors, ids=ids)


def build_chroma_index(*, encode_batch_size: int = 512) -> None:
    """Embed every chunk in PREPROCESSED_FILE and store it in the Chroma collection.

    The file is read line by line as JSON Lines; each non-blank line must be
    a JSON object with a ``"text"`` key. Chunks are accumulated until
    BATCH_SIZE of them are pending, then embedded and added to the collection
    in one call. Chunk IDs are ``chunk-0``, ``chunk-1``, ... in file order.

    Args:
        encode_batch_size: Mini-batch size forwarded to
            ``SentenceTransformer.encode``. Defaults to 512.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed line has no ``"text"`` key.
    """
    embedder = SentenceTransformer(EMBED_MODEL_PATH, trust_remote_code=True)
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    collection = client.get_or_create_collection(COLLECTION_NAME)

    # --- Single-GPU notice ---
    if torch.cuda.is_available():
        print("检测到可用的 GPU。将使用默认的单个 GPU (cuda:0) 进行嵌入。")
    else:
        print("警告:未检测到可用的 GPU。将使用 CPU 进行嵌入,速度会较慢。")

    pending_texts = []
    pending_ids = []
    total = 0
    print(f"📖 从预处理文件 {PREPROCESSED_FILE} 读取文本块...")

    # NOTE(review): IDs restart at chunk-0 on every run while the collection
    # is obtained with get_or_create_collection; confirm how re-runs against
    # an existing collection should behave.
    with open(PREPROCESSED_FILE, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="构建向量索引"):
            if not line.strip():
                # A blank line (e.g. a trailing newline) would make
                # json.loads raise and abort the whole run; skip it.
                continue
            pending_texts.append(json.loads(line)["text"])
            pending_ids.append(f"chunk-{total}")
            total += 1

            # Flush once BATCH_SIZE chunks have accumulated.
            if len(pending_texts) >= BATCH_SIZE:
                _embed_and_store(
                    embedder, collection, pending_texts, pending_ids, encode_batch_size
                )
                pending_texts = []
                pending_ids = []

    # Flush the final partial batch (fewer than BATCH_SIZE chunks).
    if pending_texts:
        _embed_and_store(
            embedder, collection, pending_texts, pending_ids, encode_batch_size
        )

    print(f"✅ 索引建立完成。总共处理了 {total} 个文本块。")
# (The query and CLI sections are exactly the same as before; no changes needed.)
# ===================== Query =====================
def search(query: str, k: int = 5):
    """Print previews of the ``k`` stored chunks returned for ``query``.

    Loads the embedding model, opens the persistent Chroma collection,
    embeds ``query`` and queries the collection for ``k`` results. Each
    result is printed as a one-line preview: the first 100 characters of
    the document, with newlines replaced by spaces and carriage returns
    removed.

    Args:
        query: The text to search for.
        k: Number of results to request from the collection.
    """
    model = SentenceTransformer(EMBED_MODEL_PATH, trust_remote_code=True)
    store = chromadb.PersistentClient(path=CHROMA_DIR).get_or_create_collection(
        COLLECTION_NAME
    )

    query_vectors = model.encode([query], normalize_embeddings=NORMALIZE).tolist()
    hits = store.query(query_embeddings=query_vectors, n_results=k)

    print("🔍 Top-{} results for: {}".format(k, query))

    # One translation pass: "\n" -> " ", "\r" -> removed.
    flatten = str.maketrans({"\n": " ", "\r": None})
    for rank, text in enumerate(hits["documents"][0], start=1):
        preview = text[:100].translate(flatten)
        print("[{}] {}...".format(rank, preview))
# ===================== CLI =====================
if __name__ == "__main__":
    import argparse

    # Two sub-commands: "build" takes no options; "search" takes
    # --query (required) and --topk (int, default 5).
    cli = argparse.ArgumentParser("MiniCPM + ChromaDB RAG Pipeline (Python 3.10 Compatible)")
    commands = cli.add_subparsers(dest="cmd", required=True)
    commands.add_parser("build", help="构建 Chroma 向量库")
    search_cmd = commands.add_parser("search", help="搜索")
    search_cmd.add_argument("--query", required=True)
    search_cmd.add_argument("--topk", type=int, default=5)
    options = cli.parse_args()

    # Dispatch on the chosen sub-command.
    if options.cmd == "search":
        search(options.query, options.topk)
    elif options.cmd == "build":
        build_chroma_index()
Below is my GPU, CPU, and memory usage:

This is my analysis of the raw corpus:
What is the bottleneck of the whole process? Should I use more GPUs, is ChromaDB writing too slowly, or is the process reading the data too slowly, and so on?