# LlamaIndex
# Introduction
LlamaIndex (GPT Index) is a project that provides a central interface to connect your LLMs with external data. For more information about LlamaIndex, see the LlamaIndex documentation website.
# Prerequisites
Before we begin, we need to install LlamaIndex and the clickhouse python client.

```bash
pip install -U llama-index clickhouse-connect
```
# Environment Setup
To use the OpenAI embedding model, we need to sign up for an OpenAI API key at OpenAI.

```bash
export OPENAI_API_KEY="YOUR_OPENAI_API_KEY"
```
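If you prefer to configure the key from inside a Python script rather than the shell, a minimal equivalent sketch (the placeholder value is yours to replace):

```python
import os

# Make the key available to the OpenAI client for this process only,
# equivalent to the shell export above.
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
```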
# Load Data and Build the Index
We will load local text files into LlamaIndex documents:

```python
from llama_index import SimpleDirectoryReader

# Load documents from a local directory
documents = SimpleDirectoryReader("YOUR_PATH_TO_FILE").load_data()
```
Next, we will upload the data to the MyScale cluster. If the index does not exist, it will be created; if it already exists, it will be reused. For more information on configuring a MyScale index, see MyScaleVectorStore.
```python
import clickhouse_connect

# Initialize the client that connects to your MyScale cluster
client = clickhouse_connect.get_client(
    host='YOUR_CLUSTER_HOST',
    port=443,
    username='YOUR_USERNAME',
    password='YOUR_CLUSTER_PASSWORD'
)
```
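Before building the index, it can be useful to verify that the client can actually reach the cluster. A minimal sanity check, assuming the connection details above are valid:

```python
# Ask the cluster for its version; this raises an exception
# if the host, credentials, or network settings are wrong.
print(client.command("SELECT version()"))
```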
```python
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import MyScaleVectorStore
from llama_index.storage.storage_context import StorageContext

# Load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

# Attach metadata to each document so it can be filtered at query time
for document in documents:
    document.metadata = {"user_id": "123", "favorite_color": "blue"}

# Initialize the vector store and build the index on top of it
vector_store = MyScaleVectorStore(myscale_client=client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
```
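As a quick smoke test, you can run an unfiltered query against the freshly built index before adding metadata filters; the question below is only an illustrative example:

```python
# A plain similarity query with default settings, useful to
# verify that the documents were indexed correctly.
query_engine = index.as_query_engine()
print(query_engine.query("What did the author do growing up?"))
```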
# Query MyScale
We can run a similarity-search query, optionally restricted by a metadata filter:
```python
import textwrap

from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# Build a query engine that only considers documents with user_id "123",
# returns the top 2 matches, and uses the hybrid query mode
query_engine = index.as_query_engine(
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(key="user_id", value="123"),
        ]
    ),
    similarity_top_k=2,
    vector_store_query_mode="hybrid",
)
response = query_engine.query("What did the author learn?")
print(textwrap.fill(str(response), 100))
```
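For more detailed output, you can raise the log level to DEBUG with Python's standard logging module before issuing the query. A minimal sketch:

```python
import logging
import sys

# Configure the root logger so that DEBUG-level messages
# from the libraries are printed to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
```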
# Clear All Indexes
You can also delete documents by their document IDs:

```python
# Remove every inserted document from the index by its reference doc ID
for document in documents:
    index.delete_ref_doc(document.doc_id)
```