# Python 客户端

MyScale 兼容 ClickHouse,因此您可以使用官方的 ClickHouse 客户端 (opens new window) 来从您的 Python 应用程序访问 MyScale。

要安装所需的依赖项,请使用以下命令:

pip install -U clickhouse-connect

# 创建连接

要了解如何与集群建立连接,请参阅 连接详情 部分。

# 创建表

使用 Python SDK 创建一个包含 iddatadatelabel 列的 Table。 需要注意的是,不同的向量类型创建表格的 SQL 语句有所区别,目前 MyScale 支持的向量类型有浮点数向量和二进制向量:

# 浮点数向量

如果你的向量数据是浮点数(Float32)数组,可遵循下述 SQL 创建 Table,我们假设该 Table 后续导入的向量长度均为 128。

# 创建一个包含 128 维浮点数向量的表。
client.command("""
CREATE TABLE default.myscale_categorical_vector_search
(
    id    UInt32,
    data  Array(Float32),
    CONSTRAINT check_length CHECK length(data) = 128,
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# 获取并打印当前数据库中所有表的名称。
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

示例代码执行结果:

['myscale_categorical_vector_search']

# 二进制向量

如果你的向量数据是一个二进制数据, 那么可以按照下面的方式创建一个名为 myscale_categorical_binary_search 的表,它包含 iddatadatelabel 列。 需要注意的是,导入到 Table 中的二进制数据长度应该保持一致,假设我们要导入的二进制数据长度为 128,那么就需要将 FixedString 长度设置为 128/8=16

# 创建一个 Table, 用于存储二进制数据
client.command("""
CREATE TABLE default.myscale_categorical_binary_search
(
    id    UInt32,
    data  FixedString(16),
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# 获取并打印当前数据库中所有表的名称。
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

示例代码执行结果:

['myscale_categorical_binary_search']

# 导入数据

# 浮点数向量

浮点数向量的导入相对直接,假设我们有一个 Pandas DataFrame,其中的值如下所示:

import pandas as pd
# 创建 dict, dict 包含了需要导入的数据
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,0,1,8,7,3,2,5,0,0,3,5,7,11,31,13,0,0,0,0,29,106,107,13,0,0,0,1,61,70,42,0,0,0,0,1,23,28,16,63,4,0,0,0,6,83,81,117,86,25,15,17,50,84,117,31,23,18,35,97,117,49,24,68,27,0,0,0,4,29,71,81,47,13,10,32,87,117,117,45,76,40,22,60,70,41,9,7,21,29,39,53,21,4,1,55,72,3,0,0,0,0,9,65,117,73,37,28,23,17,34,11,11,27,61,64,25,4,0,42,13,1,1,1,14,10,6],
        [65,35,8,0,0,0,1,63,48,27,31,19,16,34,96,114,3,1,8,21,27,43,57,21,11,8,37,8,0,0,1,23,101,104,11,0,0,0,0,29,83,114,114,77,23,14,18,52,28,8,46,75,39,24,59,60,2,0,18,10,20,52,52,16,12,28,4,0,0,3,5,8,102,79,58,3,0,0,0,11,114,112,78,50,17,14,45,104,19,31,53,114,73,44,34,26,3,2,0,0,0,1,8,9,34,20,0,0,0,0,1,23,30,75,87,36,0,0,0,2,0,17,66,73,3,0,0,0],
        [0,0,0,0,0,0,4,1,15,0,0,0,0,0,10,49,27,0,0,0,0,29,113,114,9,0,0,0,3,69,71,42,14,0,0,0,0,1,56,79,63,2,0,0,0,38,118,77,118,60,8,8,18,48,59,104,27,16,7,13,80,118,34,21,118,47,4,0,0,1,32,99,61,40,31,57,46,118,118,61,80,64,16,21,20,33,23,27,6,22,16,14,51,33,0,0,76,40,8,0,2,14,42,94,19,42,57,67,23,34,22,10,9,52,15,21,5,1,3,3,1,38,12,5,18,1,0,0],
        [3,9,45,22,28,11,4,3,77,10,4,1,1,4,3,11,23,0,0,0,26,49,6,7,5,3,3,1,11,50,8,9,11,7,15,21,12,17,21,25,121,12,4,7,4,7,4,41,28,2,0,1,10,42,22,20,1,1,4,9,31,79,16,3,23,4,6,26,31,121,87,40,121,82,16,12,15,41,6,10,76,48,5,3,21,42,41,50,5,17,18,64,86,54,17,6,43,62,56,84,116,108,38,26,58,63,20,87,105,37,2,2,121,121,38,25,44,33,24,46,3,16,27,74,121,55,9,4],
        [6,4,3,7,80,122,62,19,2,0,0,0,32,60,10,19,4,0,0,0,0,10,69,66,0,0,0,0,8,58,49,5,5,31,59,67,122,37,1,2,50,1,0,16,99,48,3,27,122,38,6,7,11,31,87,122,9,8,6,23,122,122,69,21,0,11,31,55,28,0,0,0,61,4,0,37,43,2,0,15,122,122,55,32,6,1,0,12,5,22,52,122,122,9,2,0,2,0,0,5,28,20,2,2,19,3,0,2,12,12,3,16,25,18,34,35,5,4,1,13,21,2,22,51,9,20,57,59],
        [6,2,19,22,22,81,31,12,72,15,12,10,3,6,1,37,30,17,4,2,9,4,2,21,1,0,1,3,11,9,5,2,7,11,17,61,127,127,28,13,49,36,26,45,28,17,4,16,111,46,11,2,7,25,40,89,2,0,8,31,63,60,28,12,0,18,82,127,50,1,0,0,94,28,11,88,15,0,0,4,127,127,34,23,25,18,18,69,6,16,26,90,127,42,12,8,0,3,46,29,0,0,0,0,22,35,15,12,0,0,0,0,46,127,83,17,1,0,0,0,0,14,67,115,45,0,0,0],
        [19,35,5,6,40,23,18,4,21,109,120,23,5,12,24,5,0,5,87,108,47,14,32,8,0,0,0,27,36,30,43,0,29,12,10,15,6,7,17,12,34,9,14,65,20,23,28,14,120,34,14,14,9,34,120,120,7,6,7,27,56,120,120,23,9,5,4,7,2,6,46,13,29,5,5,32,12,20,99,19,120,120,107,38,13,7,24,36,6,24,120,120,55,26,4,3,5,1,0,0,1,5,19,18,2,2,0,1,18,12,30,7,0,5,33,29,66,50,26,2,0,0,49,45,12,28,10,0],
        [28,28,28,27,13,5,4,12,4,8,29,118,69,19,21,7,3,0,0,14,14,10,105,60,0,0,0,0,11,69,76,9,5,2,18,59,17,6,1,5,42,9,16,75,31,21,17,13,118,44,18,16,17,30,78,118,4,4,8,61,118,110,54,25,10,6,21,54,5,5,6,5,38,17,11,31,6,24,64,15,115,118,117,61,13,13,22,25,2,11,66,118,87,25,10,2,10,11,3,2,9,28,4,5,21,18,35,17,6,10,4,30,20,2,13,13,7,30,71,118,0,0,3,12,50,103,44,5],
        [41,38,21,17,42,71,60,50,11,1,2,11,109,115,8,4,27,8,5,22,11,9,8,14,20,10,4,33,12,7,4,1,18,115,95,42,17,1,0,0,19,6,46,115,91,16,0,7,66,7,4,15,12,32,91,109,12,3,1,8,21,115,96,17,1,51,78,14,0,0,0,0,50,40,62,53,0,0,0,3,115,115,40,12,6,13,25,65,7,30,51,65,110,92,25,9,0,1,13,0,0,0,0,0,4,22,11,1,0,0,0,0,13,115,48,1,0,0,0,0,0,36,102,63,11,0,0,0],
        [0,0,0,0,0,2,6,4,0,0,0,0,0,1,44,57,0,0,0,0,0,15,125,52,0,0,0,0,6,57,44,2,23,1,0,0,0,6,20,23,125,30,5,2,1,3,73,125,16,10,11,46,61,97,125,93,0,0,0,31,111,96,21,0,20,6,0,0,9,114,63,5,125,125,83,8,2,26,5,23,14,56,125,125,37,10,7,10,11,2,17,87,42,5,8,19,0,0,7,32,56,91,8,0,1,17,17,3,14,71,15,5,7,9,35,10,2,5,24,39,14,16,4,9,22,6,13,11]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# 创建 DataFrame
df = pd.DataFrame(data)

我们可以使用 client.insert 来插入数据:

# 查询 'default.myscale_categorical_vector_search' 表中的行数
db_count_sql="SELECT count(*) FROM default.myscale_categorical_vector_search"
# 在插入任何数据之前,获取并打印 'default.myscale_categorical_vector_search' 表中的行数
print(f"插入前,db_count 是 {client.command(db_count_sql)}")
# 将数据插入到 'myscale_categorical_vector_search' 表中
df_records = df.to_records(index=False)
df_records['date'] = pd.to_datetime(df_records['date'])
client.insert("default.myscale_categorical_vector_search", df_records.tolist(),
              column_names=df.columns.tolist())
# 插入数据后,获取并打印 'default.myscale_categorical_vector_search' 表中的行数
print(f"插入后,db_count 是 {client.command(db_count_sql)}")

示例代码执行结果:

插入前,db_count 是 0
插入后,db_count 是 10

# 二进制向量

二进制向量的导入相对复杂,假设我们有一个 Pandas DataFrame,需要入库 10 条二进制数据,其中的值如下所示:

import pandas as pd
# 创建 dict, dict 包含了需要导入的数据
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1],
        [1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1],
        [0,1,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1],
        [1,1,1,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,1,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1],
        [1,0,1,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0],
        [0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1],
        [1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,1,1,0,1,1,1],
        [1,0,1,1,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0],
        [1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,1],
        [1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# 创建 DataFrame
df = pd.DataFrame(data)

在使用 client.insert 插入数据之前,我们需要将 binary 数组转换为 bytearray 格式。

# 查询 'default.myscale_categorical_binary_search' 表中的行数
db_count_sql="SELECT count(*) FROM default.myscale_categorical_binary_search"
# 在插入任何数据之前,获取并打印 'default.myscale_categorical_binary_search' 表中的行数
print(f"插入前,db_count 是 {client.command(db_count_sql)}")
# 将数据插入到 'myscale_categorical_binary_search' 表中
df_records = df.to_records(index=False)
# 日期数据格式化
df_records['date'] = pd.to_datetime(df_records['date'])
# binary 数据转换为 bytearray 格式
for idx, vector in enumerate(df_records['data']):
    byte_array = bytearray()
    vector_str = "".join(str(it) for it in vector)
    for i in range(0, len(vector_str), 8):
        byte_value = int(vector_str[i:i + 8], 2)
        byte_array.append(byte_value)
    df_records['data'][idx] = byte_array
# 导入数据至数据库
client.insert("default.myscale_categorical_binary_search", df_records.tolist(),
              column_names=df.columns.tolist())
# 插入数据后,获取并打印 'default.myscale_categorical_binary_search' 表中的行数
print(f"插入后,db_count 是 {client.command(db_count_sql)}")

示例代码执行结果:

插入前,db_count 是 0
插入后,db_count 是 10

# 创建向量索引

MyScale 异步执行创建索引命令,这意味着在创建索引时不会阻塞数据库。但是,如果表非常大,创建索引仍然可能需要相当长的时间。因此,在代码中检查索引是否成功创建是重要的。 这里是一个示例代码,展示如何检查是否已创建索引:

# 浮点数向量

# 1. 对 vector<float32> 创建索引
client.command("""
ALTER TABLE default.myscale_categorical_vector_search
    ADD VECTOR INDEX categorical_vector_idx data
    TYPE MSTG
""")
# 2. 查询 'vector_indices' 系统表来检查索引创建的状态
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_vector_search'"
# 3. 打印索引创建的状态,如果索引成功创建,状态将为 'Built'
print(f"索引建立状态是 {client.command(get_index_status)}")

# 二进制向量

# 1. 对 binary 类型的数据创建索引
client.command("""
ALTER TABLE default.myscale_categorical_binary_search
    ADD VECTOR INDEX categorical_binary_idx data
    TYPE BinaryMSTG('metric_type=Hamming')
""")
# 2. 查询 'vector_indices' 系统表来检查索引创建的状态
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_binary_search'"
# 3. 打印索引创建的状态,如果索引成功创建,状态将为 'Built'
print(f"索引建立状态是 {client.command(get_index_status)}")

# 向量搜索

在这个示例中,我们执行一个 SQL 查询来选择 iddatelabel 和使用 distance 函数计算的 data 和一个样本向量数据之间的距离。LIMIT 10 子句指定函数应返回最接近的 10 个向量。

# 浮点数向量

 # 从表中随机选择一行作为目标
random_row = client.query("SELECT * FROM default.myscale_categorical_vector_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
print("当前选择的项目 id={}, label={}, date={}".format(target_row_id, target_row_label, target_row_date))
# 获取查询结果
result = client.query(f"""
SELECT id, date, label, 
    distance(data, {target_row_data}) as dist FROM default.myscale_categorical_vector_search ORDER BY dist LIMIT 10
""")
# 3. 遍历查询结果的行,并打印每行的 'id'、'date'、'label' 和距离分数 distance
print("前 10 个候选者:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

示例代码执行结果:

当前选择的项目 id=3, label=animal, date=2024-08-11
前 10 个候选者:
3   2024-08-11  animal      0.0
5   2025-04-02  building    211995.0
9   1971-02-02  building    214219.0
2   1975-10-07  animal      247505.0
0   2030-09-26  person      252941.0
1   1996-06-22  building    255835.0
7   1970-09-10  building    266691.0
4   1970-01-31  animal      276685.0
8   2007-10-26  person      284773.0
6   2007-06-29  animal      298423.0

# 二进制向量

 # 1. 从表中随机选择一行作为目标
random_row = client.query("SELECT * FROM default.myscale_categorical_binary_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
vector_str = ""
for byte in target_row_data:
    binary_str = bin(byte)[2:].zfill(8)
    vector_str += binary_str
print("当前选择的项目 id={}, label={}, data={}, date={}".format(target_row_id, target_row_label, vector_str, target_row_date))
# 2. 获取查询结果
result = client.query(f"""
SELECT id, date, label, 
    distance(data, unbin('{vector_str}')) as dist FROM default.myscale_categorical_binary_search ORDER BY dist LIMIT 10
""")
# 3. 遍历查询结果的行,并打印每行的 'id'、'date'、'label' 和距离分数 distance
print("前 10 个候选者:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

示例代码执行结果:

当前选择的项目 id=1, label=building, data=11100101100101110111001100100100000100000110111011101110110111010101001111000001010010111101000111100000001111110101010100101101, date=1996-06-22
前 10 个候选者:
1 1996-06-22 building 0.0
3 2024-08-11 animal 50.0
8 2007-10-26 person 50.0
9 1971-02-02 building 52.0
4 1970-01-31 animal 54.0
5 2025-04-02 building 54.0
6 2007-06-29 animal 54.0
2 1975-10-07 animal 56.0
7 1970-09-10 building 58.0
0 2030-09-26 person 65.0
Last Updated: Tue Sep 10 2024 03:19:52 GMT+0000