How can I speed up embedding my corpus into ChromaDB?

Ask Question

Asked 2 months ago

Modified 2 months ago

Viewed 75 times

I have a corpus.jsonl file that is 6.5 GB in size. I am using a single H100 GPU to embed the corpus into ChromaDB, but it is very slow. I want to find out how I can speed up the process (GPU, CPU, I/O). Here is my code:

import os
import torch
import chromadb
from tqdm import tqdm
import json
from sentence_transformers import SentenceTransformer

# ===================== Configuration =====================
# Input: the preprocessed file produced by the first stage, used as-is.
PREPROCESSED_FILE = "./preprocessed_chunks.jsonl"
EMBED_MODEL_PATH = "./minicpm"  # path to your MiniCPM model
CHROMA_DIR = "./chroma"  # persistence directory
COLLECTION_NAME = "my_rag"
# Number of chunks accumulated before one encode + collection.add round trip.
BATCH_SIZE = 2048
# Passed as normalize_embeddings to every encode() call (indexing and querying).
NORMALIZE = True


# ===================== 主程序：建立索引 =====================
def _embed_and_store(embedder, collection, chunks, ids, encode_batch_size=512):
    """Embed ``chunks`` and add them to ``collection`` under ``ids``.

    Args:
        embedder: Model object exposing ``encode`` (a SentenceTransformer here).
        collection: Chroma collection that receives the documents.
        chunks: Texts to embed; stored as the documents.
        ids: One id per entry in ``chunks``, in the same order.
        encode_batch_size: Batch size the model uses internally while encoding.
            This is independent of how many chunks are passed per call.
    """
    vecs = embedder.encode(
        chunks,
        normalize_embeddings=NORMALIZE,
        batch_size=encode_batch_size,
        show_progress_bar=True,
    ).tolist()
    collection.add(documents=chunks, embeddings=vecs, ids=ids)


def build_chroma_index():
    """Embed every chunk in PREPROCESSED_FILE and store it in the Chroma collection.

    The file is read line by line as JSONL; the ``"text"`` field of each record
    is buffered, and every BATCH_SIZE chunks the buffer is embedded and written
    to the collection. Ids are ``chunk-<n>`` where ``n`` counts the chunks read
    so far, starting at 0. Blank lines are skipped instead of aborting the
    build with a JSON decode error.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` field.
    """
    embedder = SentenceTransformer(EMBED_MODEL_PATH, trust_remote_code=True)
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    collection = client.get_or_create_collection(COLLECTION_NAME)

    # Informational only: this message does not select the device.
    if torch.cuda.is_available():
        print("检测到可用的 GPU。将使用默认的单个 GPU (cuda:0) 进行嵌入。")
    else:
        print("警告：未检测到可用的 GPU。将使用 CPU 进行嵌入，速度会较慢。")

    pending_chunks = []
    pending_ids = []
    total = 0

    print(f"📖 从预处理文件 {PREPROCESSED_FILE} 读取文本块...")

    with open(PREPROCESSED_FILE, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="构建向量索引"):
            # A stray empty line (e.g. a doubled trailing newline) is not a
            # record; skipping it keeps one blank line from killing a long run.
            if not line.strip():
                continue

            pending_chunks.append(json.loads(line)["text"])
            pending_ids.append(f"chunk-{total}")
            total += 1

            # Flush once a full batch has accumulated.
            if len(pending_chunks) >= BATCH_SIZE:
                _embed_and_store(embedder, collection, pending_chunks, pending_ids)
                pending_chunks.clear()
                pending_ids.clear()

    # Flush the final partial batch, if any.
    if pending_chunks:
        _embed_and_store(embedder, collection, pending_chunks, pending_ids)

    print(f"✅ 索引建立完成。总共处理了 {total} 个文本块。")


# (查询和命令行接口部分与之前完全相同，无需改动)
# ===================== 查询 =====================
def search(query: str, k: int = 5):
    """Print a short preview of the ``k`` stored documents nearest to ``query``.

    The embedding model is loaded and the persistent Chroma collection is
    opened on every call. The query is embedded with the module-level
    NORMALIZE setting, and each hit is printed as its first 100 characters
    with newlines replaced by spaces and carriage returns removed.

    Args:
        query: Text to search for.
        k: Number of results requested from the collection.
    """
    model = SentenceTransformer(EMBED_MODEL_PATH, trust_remote_code=True)
    db = chromadb.PersistentClient(path=CHROMA_DIR)
    store = db.get_or_create_collection(COLLECTION_NAME)

    # encode() is given a one-element list, so the result is a list holding
    # a single query vector.
    query_vectors = model.encode([query], normalize_embeddings=NORMALIZE)
    hits = store.query(query_embeddings=query_vectors.tolist(), n_results=k)

    print("🔍 Top-{} results for: {}".format(k, query))
    # Index 0 selects the documents returned for the first (only) query.
    top_docs = hits['documents'][0]
    for rank, text in enumerate(top_docs, start=1):
        preview = text[:100]
        preview = preview.replace('\n', ' ').replace('\r', '')
        print("[{}] {}...".format(rank, preview))


# ===================== CLI =====================
if __name__ == "__main__":
    import argparse

    # Command-line entry point with two sub-commands:
    #   build  -> build_chroma_index()
    #   search -> search(--query, --topk)
    cli = argparse.ArgumentParser("MiniCPM + ChromaDB RAG Pipeline (Python 3.10 Compatible)")
    commands = cli.add_subparsers(dest="cmd", required=True)

    # "build" takes no options of its own.
    commands.add_parser("build", help="构建 Chroma 向量库")

    search_cmd = commands.add_parser("search", help="搜索")
    search_cmd.add_argument("--query", required=True)
    search_cmd.add_argument("--topk", type=int, default=5)

    opts = cli.parse_args()
    if opts.cmd == "search":
        search(opts.query, opts.topk)
    elif opts.cmd == "build":
        build_chroma_index()

Below is my GPU, CPU, and memory usage:

This is my raw corpus analysis: What is the bottleneck in the whole process? Should I use more GPUs, is ChromaDB writing too slowly, or is the process reading the data too slowly?

edited Aug 27 at 5:24

asked Aug 27 at 1:52

YiJun Sachs

234 bronze badges

Add a comment |

0 Your Answer

Sign up or log in

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.

Collectives™ on Stack Overflow

How to accelerate my corpus embedding to the chromadb

0

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest