使用 tcvdb-text 混合检索 SDK Demo

最近更新时间:2024-09-29 10:49:11

我的收藏
本章节提供在腾讯云向量数据库中,使用 tcvdb-text 写入稀疏向量并进行混合检索的 Demo。
Python
# Demo script: write sparse vectors produced by tcvdb-text into a collection,
# then run hybrid (dense + sparse) searches with two different rerank methods.
import tcvectordb
from tcvectordb.model.document import (
    AnnSearch,
    WeightedRerank,
    RRFRerank,
    KeywordSearch,
)
from tcvectordb.model.enum import (
    FieldType,
    IndexType,
    MetricType,
    ReadConsistency,
)
from tcvectordb.model.index import (
    Index,
    VectorIndex,
    FilterIndex,
    HNSWParams,
    SparseVector,
)
from tcvdb_text.encoder import BM25Encoder
from typing import List


def _print_result_sets(result_sets):
    """Print each result set's position, then each of its documents on its own line."""
    for position, hits in enumerate(result_sets):
        print(position)
        for hit in hits:
            print(hit)


# Connection settings: replace the placeholders with real values before running.
vdb_url = 'YOUR CONNECTION URL'
vdb_key = 'YOUR CONNECTION KEY'

client = tcvectordb.RPCVectorDBClient(
    url=vdb_url,
    key=vdb_key,
    username='root',
    read_consistency=ReadConsistency.EVENTUAL_CONSISTENCY,
    timeout=30,
)

# Start from a clean state: drop the demo database, then create it again.
db_name = 'db-test-sparse-vec'
client.drop_database(db_name)
db = client.create_database(db_name)

# Index layout of the collection: a primary key, a dense vector index and a
# sparse vector index.
index = Index()
primary_key_index = FilterIndex('id', FieldType.String, IndexType.PRIMARY_KEY)
dense_vector_index = VectorIndex(
    name='vector',
    dimension=3,
    index_type=IndexType.HNSW,
    metric_type=MetricType.IP,
    params=HNSWParams(m=16, efconstruction=200),
)
sparse_vector_index = VectorIndex(
    name='sparse_vector',
    field_type=FieldType.SparseVector,
    index_type=IndexType.SPARSE_INVERTED,
    metric_type=MetricType.IP,
)
for field_index in (primary_key_index, dense_vector_index, sparse_vector_index):
    index.add(field_index)

# Create the collection.
coll_name = 'coll-sparse-vec'
res = db.create_collection(
    name=coll_name,
    shard=1,
    replicas=1,
    description='test collection',
    index=index,
)

# Initialise the sparse-vector encoder.
bm25 = BM25Encoder.default('zh')

# Generate one sparse vector per input text.
texts = [
    '腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务',
    '腾讯云向量数据库可以和大语言模型 LLM 配合使用',
]
sparse_vectors: List[SparseVector] = bm25.encode_texts(texts)

# Write documents that carry both a dense vector and the matching sparse vector.
dense_rows = [
    ("0000", [0.1273, 0.0871, -0.6573]),
    ("0001", [0.9172, 0.7612, 0.5523]),
]
documents = [
    {"id": doc_id, "vector": dense, "sparse_vector": sparse_vectors[pos]}
    for pos, (doc_id, dense) in enumerate(dense_rows)
]
client.upsert(
    database_name=db_name,
    collection_name=coll_name,
    documents=documents,
)

# Hybrid search using the Weighted rerank method with explicit weights.
dense_query = AnnSearch(
    field_name="vector",
    data=[0.3123, 0.43, 0.213],
)
keyword_query = KeywordSearch(
    field_name="sparse_vector",
    data=bm25.encode_queries('向量数据库'),
)
weighted_rerank = WeightedRerank(
    field_list=['vector', 'sparse_vector'],
    weight=[0.9, 0.1],
)
doc_lists = client.hybrid_search(
    database_name=db_name,
    collection_name=coll_name,
    ann=[dense_query],
    match=[keyword_query],
    rerank=weighted_rerank,
    retrieve_vector=False,
    limit=1,
)
_print_result_sets(doc_lists)

# Hybrid search using the RRF rerank method. The query objects are rebuilt
# (and the query text re-encoded) so this search is independent of the first.
dense_query = AnnSearch(
    field_name="vector",
    data=[0.3123, 0.43, 0.213],
)
keyword_query = KeywordSearch(
    field_name="sparse_vector",
    data=bm25.encode_queries('向量数据库'),
)
rrf_rerank = RRFRerank(k=60)
doc_lists = client.hybrid_search(
    database_name=db_name,
    collection_name=coll_name,
    ann=[dense_query],
    match=[keyword_query],
    rerank=rrf_rerank,
    retrieve_vector=False,
    limit=1,
)
_print_result_sets(doc_lists)