Skip to content

Add synthetic vectors support for sparse_vector #130756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,380 @@
setup:
  # Only run when GET /_search advertises the `synthetic_vectors_setting`
  # capability.
  - requires:
      reason: 'synthetic vectors are required'
      test_runner_features:
        - capabilities
      capabilities:
        - method: GET
          path: /_search
          capabilities:
            - synthetic_vectors_setting
  # Several steps in this suite send an explicit Content-Type header.
  - skip:
      features: headers

  # Index with synthetic vectors enabled: one top-level sparse_vector (`emb`)
  # and one sparse_vector inside a nested object (`nested.emb`).
  - do:
      indices.create:
        index: test
        body:
          settings:
            index.mapping.synthetic_vectors: true
          mappings:
            properties:
              name:
                type: keyword
              emb:
                type: sparse_vector
              nested:
                type: nested
                properties:
                  paragraph_id:
                    type: keyword
                  emb:
                    type: sparse_vector

  # Doc 1: top-level vector only.
  - do:
      index:
        index: test
        id: "1"
        body:
          name: cow.jpg
          emb:
            token_1: 2.0
            token_2: 3.0

  # Doc 2: no top-level vector, three nested entries that each carry a vector.
  - do:
      index:
        index: test
        id: "2"
        body:
          name: moose.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_1: 2.0
                token_2: 3.0
            - paragraph_id: 2
              emb:
                token_3: 2.0
                token_2: 3.0
            - paragraph_id: 3
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0

  # Doc 3: top-level vector only.
  - do:
      index:
        index: test
        id: "3"
        body:
          name: rabbit.jpg
          emb:
            token_3: 2.0
            token_9: 3.0
            token_2: 4.0

  # Doc 4: three nested entries; the middle one (paragraph_id 1) has no vector.
  - do:
      index:
        index: test
        id: "4"
        body:
          name: zoolander.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0
            - paragraph_id: 1
            - paragraph_id: 2
              emb:
                token_8: 2.0

  # Make all four documents visible to search.
  - do:
      indices.refresh: {}

---
# A plain search (no `_source` options) is expected to return every document
# without its sparse vectors, both the top-level `emb` and each `nested.*.emb`,
# while leaving the remaining fields intact.
"exclude synthetic vectors":
  - do:
      search:
        index: test
        body:
          sort: ["name"]

  # Hit 0 -> doc 1 (top-level vector only).
  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - not_exists: hits.hits.0._source.emb

  # Hit 1 -> doc 2 (three nested entries, all with vectors).
  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - not_exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  # Hit 2 -> doc 3 (top-level vector only).
  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - not_exists: hits.hits.2._source.emb

  # Hit 3 -> doc 4 (three nested entries; the middle one never had a vector).
  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  # The vector-less entry must stay vector-less; checked for parity with the
  # other nested entries.
  - not_exists: hits.hits.3._source.nested.1.emb
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - not_exists: hits.hits.3._source.nested.2.emb
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

---
# Exercises the three ways a request can ask for vectors:
#   1. `_source.exclude_vectors: false`            -> vectors appear in _source
#   2. the same plus `_source.includes: nested.emb` -> only nested vectors remain
#   3. `_source.exclude_vectors: true` + `fields`   -> vectors only under `fields`
"include synthetic vectors":
  # 1. Opt back in to vectors in _source.
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          sort: ["name"]

  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }

  # 2. Source filtering restricted to the nested vectors. Documents without a
  #    `nested` field are expected to come back with an empty _source.
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
            includes: nested.emb
          sort: ["name"]

  - match: { hits.hits.0._id: "1" }
  - length: { hits.hits.0._source: 0 }

  - match: { hits.hits.1._id: "2" }
  # Assert on hit 1 itself (this previously pointed at hit 3 by mistake).
  - length: { hits.hits.1._source: 1 }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - not_exists: hits.hits.1._source.nested.0.paragraph_id
  - exists: hits.hits.1._source.nested.1.emb
  - not_exists: hits.hits.1._source.nested.1.paragraph_id
  - exists: hits.hits.1._source.nested.2.emb
  - not_exists: hits.hits.1._source.nested.2.paragraph_id

  - match: { hits.hits.2._id: "3" }
  - length: { hits.hits.2._source: 0 }

  # Doc 4 was indexed with three nested entries, but only two of them carry a
  # vector, so only two entries are expected after filtering.
  - match: { hits.hits.3._id: "4" }
  - length: { hits.hits.3._source: 1 }
  - length: { hits.hits.3._source.nested: 2 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - not_exists: hits.hits.3._source.nested.0.paragraph_id
  - exists: hits.hits.3._source.nested.1.emb
  - length: { hits.hits.3._source.nested.1.emb: 1 }
  - not_exists: hits.hits.3._source.nested.1.paragraph_id

  # 3. Vectors explicitly excluded from _source but requested through `fields`.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: true
          sort: ["name"]
          fields: ["emb"]

  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - not_exists: hits.hits.0._source.emb
  - length: { hits.hits.0.fields.emb: 1 }
  - length: { hits.hits.0.fields.emb.0: 2 }
  - match: { hits.hits.0.fields.emb.0.token_1: 2.0 }
  - match: { hits.hits.0.fields.emb.0.token_2: 3.0 }

  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - length: { hits.hits.2.fields.emb: 1 }
  - length: { hits.hits.2.fields.emb.0: 3 }
  - match: { hits.hits.2.fields.emb.0.token_2: 4.0 }
  - match: { hits.hits.2.fields.emb.0.token_3: 2.0 }
  - match: { hits.hits.2.fields.emb.0.token_9: 3.0 }

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb


---
# Partially updates doc 4 through the bulk API (new `name`, new top-level
# `emb`) and checks that the nested entries, vectors included, come back
# unchanged from the update response, a subsequent GET, and a search after
# refresh.
"Bulk partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      bulk:
        index: test
        _source: true
        body:
          - '{"update": {"_id": "4"}}'
          - >
            {
              "doc": {
                "name": "zoolander2.jpg",
                "emb": {
                  "token_12": 2.0,
                  "token_13": 1.0
                }
              }
            }

  # The update response carries the merged document, vectors included.
  - length: { items.0.update.get._source.emb: 2 }
  - match: { items.0.update.get._source.emb.token_12: 2.0 }
  - match: { items.0.update.get._source.emb.token_13: 1.0 }
  - exists: items.0.update.get._source.nested
  - length: { items.0.update.get._source.nested: 3 }
  - exists: items.0.update.get._source.nested.0.emb
  - match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
  - length: { items.0.update.get._source.nested.0.emb: 3 }
  - not_exists: items.0.update.get._source.nested.1.emb
  - match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
  - exists: items.0.update.get._source.nested.2.emb
  - length: { items.0.update.get._source.nested.2.emb: 1 }
  - match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
  # Stash the nested array so later reads can be compared against it.
  - set: { items.0.update.get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - match: { _source.name: zoolander2.jpg }
  - length: { _source.emb: 2 }
  - match: { _source.emb.token_12: 2.0 }
  - match: { _source.emb.token_13: 1.0 }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          query:
            term:
              # Quoted so the id is sent as a string, like every other step.
              _id: "4"

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander2.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }

---
# Same scenario as the bulk variant, but through the single-document update
# API: replace `name` and the top-level `emb` of doc 4, then confirm the nested
# entries are unchanged in the update response, a GET, and a search after
# refresh.
"Partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      update:
        index: test
        id: "4"
        body:
          _source: true
          doc:
            name: zoolander3.jpg
            emb:
              token_3: 2.0
              token_9: 2.5

  # The update response carries the merged document, vectors included.
  - length: { get._source.emb: 2 }
  - match: { get._source.emb.token_3: 2.0 }
  - match: { get._source.emb.token_9: 2.5 }
  - exists: get._source.nested
  - length: { get._source.nested: 3 }
  - exists: get._source.nested.0.emb
  - match: { get._source.nested.0.paragraph_id: 0 }
  - length: { get._source.nested.0.emb: 3 }
  - not_exists: get._source.nested.1.emb
  - match: { get._source.nested.1.paragraph_id: 1 }
  - exists: get._source.nested.2.emb
  - length: { get._source.nested.2.emb: 1 }
  - match: { get._source.nested.2.paragraph_id: 2 }
  # Stash the nested array so later reads can be compared against it.
  - set: { get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - length: { _source.emb: 2 }
  - match: { _source.emb.token_3: 2.0 }
  - match: { _source.emb.token_9: 2.5 }
  - match: { _source.name: zoolander3.jpg }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          query:
            term:
              # Quoted so the id is sent as a string, like every other step.
              _id: "4"

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander3.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }
Loading