Skip to content

feat: support bigquery.vector_search() #736

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@
from __future__ import annotations

import typing
from typing import Literal, Optional, Union

import bigframes.constants as constants
import bigframes.core.groupby as groupby
import bigframes.core.sql
import bigframes.ml.utils as utils
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.series

if typing.TYPE_CHECKING:
import bigframes.dataframe as dataframe
Expand Down Expand Up @@ -148,3 +152,153 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:

"""
return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))


def vector_search(
    base_table: str,
    column_to_search: str,
    query: Union[dataframe.DataFrame, series.Series],
    *,
    query_column_to_search: Optional[str] = None,
    top_k: Optional[int] = 10,
    distance_type: Literal["euclidean", "cosine"] = "euclidean",
    fraction_lists_to_search: Optional[float] = None,
    use_brute_force: bool = False,
) -> dataframe.DataFrame:
    """
    Conduct vector search which searches embeddings to find semantically similar entities.

    **Examples:**


        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

    DataFrame embeddings for which to find nearest neighbors. The ``ARRAY<FLOAT64>`` column
    is used as the search query:

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             top_k=2)
        query_id embedding id my_embedding distance
        1 cat [3. 5.2] 5 [5. 5.4] 2.009975
        0 dog [1. 2.] 1 [1. 2.] 0.0
        0 dog [1. 2.] 4 [1. 3.2] 1.2
        1 cat [3. 5.2] 2 [2. 4.] 1.56205
        <BLANKLINE>
        [4 rows x 5 columns]

    Series embeddings for which to find nearest neighbors:

        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
        ...                            index=["dog", "cat"],
        ...                            name="embedding")
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             top_k=2)
        embedding id my_embedding distance
        dog [1. 2.] 1 [1. 2.] 0.0
        cat [3. 5.2] 5 [5. 5.4] 2.009975
        dog [1. 2.] 4 [1. 3.2] 1.2
        cat [3. 5.2] 2 [2. 4.] 1.56205
        <BLANKLINE>
        [4 rows x 4 columns]

    You can specify the name of the column in the query DataFrame embeddings and distance type.
    If you specify query_column_to_search, it will use the provided column which contains
    the embeddings for which to find nearest neighbors. Otherwise, it uses the column_to_search value.

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             distance_type="cosine",
        ...             query_column_to_search="another_embedding",
        ...             top_k=2)
        query_id embedding another_embedding id my_embedding distance
        1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181
        0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013
        1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181
        0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697
        <BLANKLINE>
        [4 rows x 6 columns]

    Args:
        base_table (str):
            The table to search for nearest neighbor embeddings.
        column_to_search (str):
            The name of the base table column to search for nearest neighbor embeddings.
            The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
        query_column_to_search (str):
            Specifies the name of the column in the query that contains the embeddings for which to
            find nearest neighbors. The column must have a type of ``ARRAY<FLOAT64>``. All elements in
            the array must be non-NULL and all values in the column must have the same array dimensions
            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
        top_k (int, default 10):
            Specifies the number of nearest neighbors to return. Default to 10.
        distance_type (str, default "euclidean"):
            Specifies the type of metric to use to compute the distance between two vectors.
            Possible values are "euclidean" and "cosine". Default to "euclidean".
        fraction_lists_to_search (float, range in [0.0, 1.0]):
            Specifies the percentage of lists to search. Specifying a higher percentage leads to
            higher recall and slower performance, and the converse is true when specifying a lower
            percentage. It is only used when a vector index is also used. You can only specify
            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
        use_brute_force (bool, default False):
            Determines whether to use brute force search by skipping the vector index if one is available.
            Default to False.

    Returns:
        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.

    Raises:
        ValueError:
            If ``fraction_lists_to_search`` is given together with ``use_brute_force=True``,
            or if ``query_column_to_search`` is given while ``query`` is a Series.
        NotImplementedError:
            If ``fraction_lists_to_search`` or ``use_brute_force`` is set; neither option
            is supported yet.
    """
    # The two options are mutually exclusive. Compare against None (rather
    # than truthiness) so that an explicit 0.0 still counts as "specified".
    if fraction_lists_to_search is not None and use_brute_force:
        raise ValueError(
            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
        )
    if (
        isinstance(query, bigframes.series.Series)
        and query_column_to_search is not None
    ):
        raise ValueError(
            "You can't specify query_column_to_search when query is a Series."
        )
    # TODO(ashleyxu): Support options in vector search. b/344019989
    if fraction_lists_to_search is not None or use_brute_force:
        raise NotImplementedError(
            f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
        )
    options = {
        "base_table": base_table,
        "column_to_search": column_to_search,
        "query_column_to_search": query_column_to_search,
        "distance_type": distance_type,
        "top_k": top_k,
        "fraction_lists_to_search": fraction_lists_to_search,
        "use_brute_force": use_brute_force,
    }

    # Normalize a Series query to a DataFrame so both inputs share one path.
    (query,) = utils.convert_to_dataframe(query)
    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)

    sql = bigframes.core.sql.create_vector_search_sql(
        sql_string=sql_string, options=options  # type: ignore
    )
    if index_col_ids is not None:
        df = query._session.read_gbq(sql, index_col=index_col_ids)
    else:
        df = query._session.read_gbq(sql)
    # Put the query's original index labels back on the result.
    df.index.names = index_labels

    return df
46 changes: 45 additions & 1 deletion bigframes/core/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import datetime
import math
import textwrap
from typing import Iterable, TYPE_CHECKING
from typing import Iterable, Mapping, TYPE_CHECKING, Union

# Literals and identifiers matching this pattern can be unquoted
unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$"
Expand Down Expand Up @@ -169,3 +169,47 @@ def ordering_clause(
part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"


def create_vector_search_sql(
    sql_string: str,
    options: Mapping[str, Union[str, int, bool, float]] = {},
) -> str:
    """Encode the VECTOR SEARCH statement for BigQuery Vector Search.

    Args:
        sql_string (str):
            SQL of the query table. It is embedded as a parenthesized subquery.
        options (Mapping):
            Search options. ``base_table``, ``column_to_search``,
            ``distance_type`` and ``top_k`` are required keys.
            ``query_column_to_search`` is optional; when it is present and not
            None it is passed as an extra positional argument after the
            subquery. The mapping is only read, never mutated.

    Returns:
        str: A ``SELECT ... FROM VECTOR_SEARCH(...)`` statement.

    Raises:
        KeyError: If one of the required options is missing.
    """

    base_table = options["base_table"]
    column_to_search = options["column_to_search"]
    distance_type = options["distance_type"]
    top_k = options["top_k"]
    query_column_to_search = options.get("query_column_to_search", None)

    # Build the VECTOR_SEARCH argument list once; the query column is the
    # only argument that is conditionally present.
    arguments = [
        f"TABLE `{base_table}`",
        f"{simple_literal(column_to_search)}",
        f"({sql_string})",
    ]
    if query_column_to_search is not None:
        arguments.append(f"{simple_literal(query_column_to_search)}")
    arguments.append(f"distance_type => {simple_literal(distance_type)}")
    arguments.append(f"top_k => {simple_literal(top_k)}")
    joined_arguments = ",\n        ".join(arguments)

    query_str = f"""
    SELECT
        query.*,
        base.*,
        distance,
    FROM VECTOR_SEARCH(
        {joined_arguments}
    )
    """
    return query_str
136 changes: 136 additions & 0 deletions tests/system/small/bigquery/test_vector_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd

import bigframes.bigquery as bbq
import bigframes.pandas as bpd


def test_vector_search_basic_params_with_df():
    """DataFrame query with only ``top_k`` set returns the expected neighbors."""
    query_frame = bpd.DataFrame(
        {"query_id": ["dog", "cat"], "embedding": [[1.0, 2.0], [3.0, 5.2]]}
    )

    search_result = bbq.vector_search(
        base_table="bigframes-dev.bigframes_tests_sys.base_table",
        column_to_search="my_embedding",
        query=query_frame,
        top_k=2,
    )
    actual = search_result.to_pandas()  # type:ignore

    # Expected rows, in the order the result is compared.
    query_embeddings = [
        np.array(values)
        for values in ([3.0, 5.2], [1.0, 2.0], [1.0, 2.0], [3.0, 5.2])
    ]
    base_embeddings = [
        np.array(values)
        for values in ([5.0, 5.4], [1.0, 2.0], [1.0, 3.2], [2.0, 4.0])
    ]
    expected_index = pd.Index([1, 0, 0, 1], dtype="Int64")
    expected = pd.DataFrame(
        {
            "query_id": ["cat", "dog", "dog", "cat"],
            "embedding": query_embeddings,
            "id": [5, 1, 4, 2],
            "my_embedding": base_embeddings,
            "distance": [2.009975, 0.0, 1.2, 1.56205],
        },
        index=expected_index,
    )

    pd.testing.assert_frame_equal(actual, expected, check_dtype=False, rtol=0.1)


def test_vector_search_different_params_with_query():
    """Unnamed Series query with cosine distance returns the expected neighbors."""
    query_series = bpd.Series([[1.0, 2.0], [3.0, 5.2]])

    search_result = bbq.vector_search(
        base_table="bigframes-dev.bigframes_tests_sys.base_table",
        column_to_search="my_embedding",
        query=query_series,
        distance_type="cosine",
        top_k=2,
    )
    actual = search_result.to_pandas()  # type:ignore

    # Expected rows, in the order the result is compared.
    query_embeddings = [
        np.array(values)
        for values in ([1.0, 2.0], [1.0, 2.0], [3.0, 5.2], [3.0, 5.2])
    ]
    base_embeddings = [
        np.array(values)
        for values in ([2.0, 4.0], [1.0, 2.0], [1.0, 2.0], [2.0, 4.0])
    ]
    expected_index = pd.Index([0, 0, 1, 1], dtype="Int64")
    expected = pd.DataFrame(
        {
            "0": query_embeddings,
            "id": [2, 1, 1, 2],
            "my_embedding": base_embeddings,
            "distance": [0.0, 0.0, 0.001777, 0.001777],
        },
        index=expected_index,
    )

    pd.testing.assert_frame_equal(actual, expected, check_dtype=False, rtol=0.1)


def test_vector_search_df_with_query_column_to_search():
    """DataFrame query searched through an explicitly named embedding column."""
    query_frame = bpd.DataFrame(
        {
            "query_id": ["dog", "cat"],
            "embedding": [[1.0, 2.0], [3.0, 5.2]],
            "another_embedding": [[1.0, 2.5], [3.3, 5.2]],
        }
    )

    search_result = bbq.vector_search(
        base_table="bigframes-dev.bigframes_tests_sys.base_table",
        column_to_search="my_embedding",
        query=query_frame,
        query_column_to_search="another_embedding",
        top_k=2,
    )
    actual = search_result.to_pandas()  # type:ignore

    # Expected rows, in the order the result is compared.
    embeddings = [
        np.array(values)
        for values in ([1.0, 2.0], [1.0, 2.0], [3.0, 5.2], [3.0, 5.2])
    ]
    other_embeddings = [
        np.array(values)
        for values in ([1.0, 2.5], [1.0, 2.5], [3.3, 5.2], [3.3, 5.2])
    ]
    base_embeddings = [
        np.array(values)
        for values in ([1.0, 2.0], [1.0, 3.2], [2.0, 4.0], [5.0, 5.4])
    ]
    expected_index = pd.Index([0, 0, 1, 1], dtype="Int64")
    expected = pd.DataFrame(
        {
            "query_id": ["dog", "dog", "cat", "cat"],
            "embedding": embeddings,
            "another_embedding": other_embeddings,
            "id": [1, 4, 2, 5],
            "my_embedding": base_embeddings,
            "distance": [0.5, 0.7, 1.769181, 1.711724],
        },
        index=expected_index,
    )

    pd.testing.assert_frame_equal(actual, expected, check_dtype=False, rtol=0.1)
Loading