elastic · jimczi · Jul 7, 2025 · Jul 7, 2025 · Jul 7, 2025 · Jul 7, 2025
diff --git a/....vectors/240_source_synthetic_vectors.yml → ...rs/240_source_synthetic_dense_vectors.yml b/....vectors/240_source_synthetic_vectors.yml → ...rs/240_source_synthetic_dense_vectors.yml
diff --git a/...tTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml b/...tTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml
@@ -0,0 +1,380 @@
+setup:
+  # Run only where GET /_search advertises the synthetic_vectors_setting capability.
+  - requires:
+      reason: 'synthetic vectors are required'
+      test_runner_features: [capabilities]
+      capabilities:
+        - method: GET
+          path: /_search
+          capabilities: [synthetic_vectors_setting]
+  # Several requests in this file set a Content-Type header, hence the `headers` feature.
+  - skip:
+      features: "headers"
+
+  # Index with index.mapping.synthetic_vectors turned on. It maps one top-level
+  # sparse_vector field (`emb`) and one sparse_vector field inside a nested object.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            index.mapping.synthetic_vectors: true
+          mappings:
+            properties:
+              name:
+                type: keyword
+              emb:
+                type: sparse_vector
+
+              nested:
+                type: nested
+                properties:
+                  paragraph_id:
+                    type: keyword
+                  emb:
+                    type: sparse_vector
+
+  # Doc 1: top-level vector only.
+  - do:
+      index:
+        index: test
+        id: "1"
+        body:
+          name: cow.jpg
+          emb:
+            token_1: 2.0
+            token_2: 3.0
+
+  # Doc 2: no top-level vector; three nested entries, each with its own vector.
+  - do:
+      index:
+        index: test
+        id: "2"
+        body:
+          name: moose.jpg
+          nested:
+            - paragraph_id: 0
+              emb:
+                token_1: 2.0
+                token_2: 3.0
+            - paragraph_id: 2
+              emb:
+                token_3: 2.0
+                token_2: 3.0
+            - paragraph_id: 3
+              emb:
+                token_3: 2.0
+                token_7: 3.0
+                token_1: 4.0
+
+  # Doc 3: top-level vector only.
+  - do:
+      index:
+        index: test
+        id: "3"
+        body:
+          name: rabbit.jpg
+          emb:
+            token_3: 2.0
+            token_9: 3.0
+            token_2: 4.0
+
+  # Doc 4: three nested entries; the middle one (paragraph_id 1) carries no vector.
+  - do:
+      index:
+        index: test
+        id: "4"
+        body:
+          name: zoolander.jpg
+          nested:
+            - paragraph_id: 0
+              emb:
+                token_3: 2.0
+                token_7: 3.0
+                token_1: 4.0
+            - paragraph_id: 1
+            - paragraph_id: 2
+              emb:
+                token_8: 2.0
+
+  # Make the four documents visible to the searches in the tests below.
+  - do:
+      indices.refresh: {}
+
+---
+"exclude synthetic vectors":
+  # Plain search with no _source options, sorted by name so hit order is
+  # doc 1, 2, 3, 4. Every `emb` field is asserted to be absent from _source.
+  - do:
+      search:
+        index: test
+        body:
+          sort:
+            - name
+
+  # Hit 0 -> doc 1 (top-level vector only).
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0._source.name: "cow.jpg" }
+  - not_exists: hits.hits.0._source.emb
+
+  # Hit 1 -> doc 2 (three nested entries): paragraph ids stay, vectors do not.
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1._source.name: "moose.jpg" }
+  - length: { hits.hits.1._source.nested: 3 }
+  - not_exists: hits.hits.1._source.nested.0.emb
+  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
+  - not_exists: hits.hits.1._source.nested.1.emb
+  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
+  - not_exists: hits.hits.1._source.nested.2.emb
+  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
+
+  # Hit 2 -> doc 3 (top-level vector only).
+  - match: { hits.hits.2._id: "3" }
+  - match: { hits.hits.2._source.name: "rabbit.jpg" }
+  - not_exists: hits.hits.2._source.emb
+
+  # Hit 3 -> doc 4: all three nested entries are still returned.
+  - match: { hits.hits.3._id: "4" }
+  - match: { hits.hits.3._source.name: "zoolander.jpg" }
+  - length: { hits.hits.3._source.nested: 3 }
+  - not_exists: hits.hits.3._source.nested.0.emb
+  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
+  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
+  - not_exists: hits.hits.3._source.nested.2.emb
+  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
+
+---
+"include synthetic vectors":
+  # Search 1: exclude_vectors=false, so `emb` fields are expected back in _source.
+  # Hits are sorted by name: doc 1, 2, 3, 4.
+  - do:
+      search:
+        index: test
+        body:
+          _source:
+            exclude_vectors: false
+          sort: ["name"]
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0._source.name: "cow.jpg" }
+  - exists: hits.hits.0._source.emb
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1._source.name: "moose.jpg" }
+  - length: { hits.hits.1._source.nested: 3 }
+  - exists: hits.hits.1._source.nested.0.emb
+  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
+  - exists: hits.hits.1._source.nested.1.emb
+  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
+  - exists: hits.hits.1._source.nested.2.emb
+  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
+
+  - match: { hits.hits.2._id: "3" }
+  - match: { hits.hits.2._source.name: "rabbit.jpg" }
+  - exists: hits.hits.2._source.emb
+
+  - match: { hits.hits.3._id: "4" }
+  - match: { hits.hits.3._source.name: "zoolander.jpg" }
+  - length: { hits.hits.3._source.nested: 3 }
+  - exists: hits.hits.3._source.nested.0.emb
+  - length: { hits.hits.3._source.nested.0.emb: 3 }
+  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
+
+  # Search 2: exclude_vectors=false combined with a source filter on nested.emb.
+  # Docs without nested vectors (1 and 3) are asserted to have an empty _source.
+  - do:
+      search:
+        index: test
+        body:
+          _source:
+            exclude_vectors: false
+            includes: nested.emb
+          sort: ["name"]
+
+  - match: { hits.hits.0._id: "1" }
+  - length: { hits.hits.0._source: 0 }
+
+  # Hit 1 is doc 2: _source holds only `nested`. This assertion previously
+  # pointed at hits.hits.3, which left doc 2's _source size unchecked.
+  - match: { hits.hits.1._id: "2" }
+  - length: { hits.hits.1._source: 1 }
+  - length: { hits.hits.1._source.nested: 3 }
+  - exists: hits.hits.1._source.nested.0.emb
+  - not_exists: hits.hits.1._source.nested.0.paragraph_id
+  - exists: hits.hits.1._source.nested.1.emb
+  - not_exists: hits.hits.1._source.nested.1.paragraph_id
+  - exists: hits.hits.1._source.nested.2.emb
+  - not_exists: hits.hits.1._source.nested.2.paragraph_id
+
+  - match: { hits.hits.2._id: "3" }
+  - length: { hits.hits.2._source: 0 }
+
+  # Hit 3 is doc 4: only two nested entries are expected here, because the
+  # entry indexed without a vector (paragraph_id 1) has nothing left to return.
+  - match: { hits.hits.3._id: "4" }
+  - length: { hits.hits.3._source: 1 }
+  - length: { hits.hits.3._source.nested: 2 }
+  - exists: hits.hits.3._source.nested.0.emb
+  - length: { hits.hits.3._source.nested.0.emb: 3 }
+  - not_exists: hits.hits.3._source.nested.0.paragraph_id
+  - exists: hits.hits.3._source.nested.1.emb
+  - length: { hits.hits.3._source.nested.1.emb: 1 }
+  - not_exists: hits.hits.3._source.nested.1.paragraph_id
+
+  # Search 3: vectors excluded from _source but the top-level `emb` is requested
+  # through `fields`; token weights are asserted on the `fields` response.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test
+        body:
+          _source:
+            exclude_vectors: true
+          sort: ["name"]
+          fields: ["emb"]
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0._source.name: "cow.jpg" }
+  - not_exists: hits.hits.0._source.emb
+  - length: { hits.hits.0.fields.emb: 1 }
+  - length: { hits.hits.0.fields.emb.0: 2 }
+  - match: { hits.hits.0.fields.emb.0.token_1: 2.0 }
+  - match: { hits.hits.0.fields.emb.0.token_2: 3.0 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1._source.name: "moose.jpg" }
+  - length: { hits.hits.1._source.nested: 3 }
+  - not_exists: hits.hits.1._source.nested.0.emb
+
+  - match: { hits.hits.2._id: "3" }
+  - match: { hits.hits.2._source.name: "rabbit.jpg" }
+  - length: { hits.hits.2.fields.emb: 1 }
+  - length: { hits.hits.2.fields.emb.0: 3 }
+  - match: { hits.hits.2.fields.emb.0.token_2: 4.0 }
+  - match: { hits.hits.2.fields.emb.0.token_3: 2.0 }
+  - match: { hits.hits.2.fields.emb.0.token_9: 3.0 }
+
+  - match: { hits.hits.3._id: "4" }
+  - match: { hits.hits.3._source.name: "zoolander.jpg" }
+  - length: { hits.hits.3._source.nested: 3 }
+  - not_exists: hits.hits.3._source.nested.0.emb
+
+
+---
+"Bulk partial update with synthetic vectors":
+  # Partially update doc 4 through the bulk API: rename it and add a top-level
+  # vector. `_source: true` makes the bulk item return the updated source.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      bulk:
+        index: test
+        _source: true
+        body:
+          - '{"update": {"_id": "4"}}'
+          - >
+            {
+              "doc": {
+                "name": "zoolander2.jpg",
+                "emb": {
+                  "token_12": 2.0,
+                  "token_13": 1.0
+                }
+              }
+            }
+
+  # The returned source carries the new top-level vector and the nested entries
+  # exactly as indexed in setup (vectors on entries 0 and 2, none on entry 1).
+  - length: { items.0.update.get._source.emb: 2 }
+  - match: { items.0.update.get._source.emb.token_12: 2.0 }
+  - match: { items.0.update.get._source.emb.token_13: 1.0 }
+  - exists: items.0.update.get._source.nested
+  - length: { items.0.update.get._source.nested: 3 }
+  - exists: items.0.update.get._source.nested.0.emb
+  - match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
+  - length: { items.0.update.get._source.nested.0.emb: 3 }
+  - not_exists: items.0.update.get._source.nested.1.emb
+  - match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
+  - exists: items.0.update.get._source.nested.2.emb
+  - length: { items.0.update.get._source.nested.2.emb: 1 }
+  - match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
+  # Stash the nested array so the later get and search can be compared against it.
+  - set: { items.0.update.get._source.nested: original_nested }
+
+  # Read the document back by id with vectors included.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      get:
+        _source_exclude_vectors: false
+        index: test
+        id: "4"
+
+  - match: { _source.name: zoolander2.jpg }
+  - length: { _source.emb: 2 }
+  - match: { _source.emb.token_12: 2.0 }
+  - match: { _source.emb.token_13: 1.0 }
+  - match: { _source.nested: $original_nested }
+
+  - do:
+      indices.refresh: {}
+
+  # After the refresh, the search path must return the same nested content.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test
+        body:
+          _source:
+            exclude_vectors: false
+          query:
+            term:
+              # Quoted so the id is sent as a string, like every other reference to doc "4".
+              _id: "4"
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+  - match: { hits.hits.0._source.name: zoolander2.jpg }
+  - match: { hits.hits.0._source.nested: $original_nested }
+
+---
+"Partial update with synthetic vectors":
+  # Partially update doc 4 through the update API: rename it and add a top-level
+  # vector. `_source: true` makes the response return the updated source.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the vectors as doubles
+        Content-Type: application/json
+      update:
+        index: test
+        id: "4"
+        body:
+          _source: true
+          doc:
+            name: "zoolander3.jpg"
+            emb:
+              token_3: 2.0
+              token_9: 2.5
+
+  # The returned source carries the new top-level vector and the nested entries
+  # exactly as indexed in setup (vectors on entries 0 and 2, none on entry 1).
+  - length: { get._source.emb: 2 }
+  - match: { get._source.emb.token_3: 2.0 }
+  - match: { get._source.emb.token_9: 2.5 }
+  - exists: get._source.nested
+  - length: { get._source.nested: 3 }
+  - exists: get._source.nested.0.emb
+  - match: { get._source.nested.0.paragraph_id: 0 }
+  - length: { get._source.nested.0.emb: 3 }
+  - not_exists: get._source.nested.1.emb
+  - match: { get._source.nested.1.paragraph_id: 1 }
+  - exists: get._source.nested.2.emb
+  - length: { get._source.nested.2.emb: 1 }
+  - match: { get._source.nested.2.paragraph_id: 2 }
+  # Stash the nested array so the later get and search can be compared against it.
+  - set: { get._source.nested: original_nested }
+
+  # Read the document back by id with vectors included.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the vectors as doubles
+        Content-Type: application/json
+      get:
+        _source_exclude_vectors: false
+        index: test
+        id: "4"
+
+  - length: { _source.emb: 2 }
+  - match: { _source.emb.token_3: 2.0 }
+  - match: { _source.emb.token_9: 2.5 }
+  - match: { _source.name: zoolander3.jpg }
+  - match: { _source.nested: $original_nested }
+
+  - do:
+      indices.refresh: {}
+
+  # After the refresh, the search path must return the same nested content.
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the vectors as doubles
+        Content-Type: application/json
+      search:
+        index: test
+        body:
+          _source:
+            exclude_vectors: false
+          query:
+            term:
+              # Quoted so the id is sent as a string, like every other reference to doc "4".
+              _id: "4"
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+  - match: { hits.hits.0._source.name: zoolander3.jpg }
+  - match: { hits.hits.0._source.nested: $original_nested }