本文最后编辑于较早之前,其中的内容可能需要更新。
Milvus 向量嵌入
Milvus 是一个向量数据库,可用于 AI 的文本检索。本文使用官方文档中的例子,示例语言为 Python。
安装依赖
pip3 install pymilvus FlagEmbedding
后续示例默认已执行以下导入:
from pymilvus import MilvusClient, DataType, model
定义向量生成模型
此处使用默认的轻量级模型,也可以通过 milvus_model.hybrid.BGEM3EmbeddingFunction 加载指定模型。
ef = model.DefaultEmbeddingFunction()
准备嵌入数据
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
embeddings = ef.encode_documents(docs)
query_embedding = ef.encode_documents(["where was Turing born?"])
entities = [
    {"id": index_id, "vector": embeddings[index_id], "source": docs[index_id]}
    for index_id in range(len(docs))
]
初始化 client
初始化客户端也很简单:
client = MilvusClient(
    uri="http://HOST:PORT"
)
创建集合
需要先定义 schema,然后直接创建即可:
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=768)
schema.add_field(field_name="source", datatype=DataType.VARCHAR, max_length=500)
client.create_collection(
    collection_name="customized_setup",
    schema=schema,
)
创建索引
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="vector",
    metric_type="COSINE",
    index_type="IVF_FLAT",
    index_name="vector_index",
    nlist=1024
)
client.create_index(
    collection_name="customized_setup",
    index_params=index_params
)
查看索引信息
res = client.describe_index(
    collection_name="customized_setup",
    index_name="vector_index"
)
print(res)
向集合插入数据
res = client.insert(
    "customized_setup",
    entities
)
print(res)
查看集合数据
client.load_collection("customized_setup")
client.load_collection("customized_setup")
res = client.get(
    collection_name="customized_setup",
    ids=[0, 1, 2],
    output_fields=["source", "vector"]
)
for i in res:
    print(i)
搜索数据
res = client.search(
    collection_name="customized_setup",
    data=query_embedding,
    limit=1,
    output_fields=["source"]
)
for result in res:
    print(result)
删除数据
res = client.delete(
    collection_name='customized_setup',
    ids=[0, 1, 2],
)
print("Entities deleted from partitionA: ", res['delete_count'])