support arbitrary wildcard #5606

Merged (9 commits) on Jan 10, 2025
Changes from 1 commit
add tests for new wildcard queries
trinity-1686a committed Dec 18, 2024
commit 9d83c5f84f2e6e4744afff802e66e27d3683407b
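
For orientation before the file-by-file diff: the new tests pin down the wildcard-to-regex translation this PR relies on, where regex metacharacters in the pattern are escaped, `*` becomes `.*`, and `?` becomes `.`. Below is a minimal standalone sketch of that rule, not Quickwit's actual `to_regex` implementation (which additionally runs the pattern through the field's tokenizer/normalizer, hence the lowercased expectations in some of the tests shown later):

```rust
/// Hypothetical helper illustrating the wildcard-to-regex rule asserted by the
/// new tests: escape regex metacharacters, map `*` to `.*` and `?` to `.`.
fn wildcard_to_regex(pattern: &str) -> String {
    let mut regex = String::with_capacity(pattern.len() + 8);
    for c in pattern.chars() {
        match c {
            '*' => regex.push_str(".*"),
            '?' => regex.push('.'),
            // characters that are special inside a regex get escaped verbatim
            c if "\\.+()[]{}^$|".contains(c) => {
                regex.push('\\');
                regex.push(c);
            }
            c => regex.push(c),
        }
    }
    regex
}

fn main() {
    // mirrors the expectation of the `raw`/`whitespace` tokenizer cases in the
    // tests below; lowercasing tokenizers would additionally normalize the case
    assert_eq!(
        wildcard_to_regex("MyString Wh1ch?a.nOrMal Tokenizer would*cut"),
        "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"
    );
}
```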
47 changes: 43 additions & 4 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
@@ -589,6 +589,13 @@ mod tests {
elements.iter().map(|elem| elem.to_string()).collect()
}

fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
elements
.iter()
.map(|elem| Automaton::Regex(elem.to_string()))
.collect()
}

fn hashset_field(elements: &[u32]) -> HashSet<Field> {
elements
.iter()
@@ -638,7 +645,12 @@
(2, "term1", false),
(2, "term2", false),
]),
automatons_grouped_by_field: HashMap::new(), // TODO complete tests
automatons_grouped_by_field: [(
Field::from_field_id(1),
automaton_hashset(&["my_reg.*ex"]),
)]
.into_iter()
.collect(),
};

// merging with default has no impact
@@ -656,7 +668,12 @@
(3, "term1", false),
(2, "term2", true),
]),
automatons_grouped_by_field: HashMap::new(), // TODO complete tests
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])),
]
.into_iter()
.collect(),
};
wi_base.merge(wi_2.clone());

@@ -704,6 +721,17 @@
);
}

let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")];
for (field, regex) in expected_automatons {
let field = Field::from_field_id(field);
let automaton = Automaton::Regex(regex.to_string());
assert!(wi_base
.automatons_grouped_by_field
.get(&field)
.unwrap()
.contains(&automaton));
}

// merge is idempotent
let mut wi_cloned = wi_base.clone();
wi_cloned.merge(wi_2);
@@ -726,7 +754,13 @@
(1, "term2", true),
(2, "term3", false),
]),
automatons_grouped_by_field: HashMap::new(), // TODO complete tests
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
]
.into_iter()
.collect(),
};
let expected = WarmupInfo {
term_dict_fields: hashset_field(&[1]),
@@ -737,7 +771,12 @@
(1, "term2", true),
(2, "term3", false),
]),
automatons_grouped_by_field: HashMap::new(), // TODO complete tests
automatons_grouped_by_field: [
(Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
(Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
]
.into_iter()
.collect(),
};

warmup_info.simplify();
12 changes: 6 additions & 6 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -102,7 +102,7 @@ pub(crate) fn build_query(

let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?;
let (term_ranges_grouped_by_field, automatons_grouped_by_field) =
extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?;
extract_prefix_term_ranges_and_automaton(query_ast, &schema, tokenizer_manager)?;

let mut terms_grouped_by_field: HashMap<Field, HashMap<_, bool>> = Default::default();
query.query_terms(&mut |term, need_position| {
@@ -274,14 +274,14 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
}
}

fn extract_prefix_term_ranges(
type TermRangeWarmupInfo = HashMap<Field, HashMap<TermRange, PositionNeeded>>;
type AutomatonWarmupInfo = HashMap<Field, HashSet<Automaton>>;

fn extract_prefix_term_ranges_and_automaton(
query_ast: &QueryAst,
schema: &Schema,
tokenizer_manager: &TokenizerManager,
) -> anyhow::Result<(
HashMap<Field, HashMap<TermRange, PositionNeeded>>,
HashMap<Field, HashSet<Automaton>>,
)> {
) -> anyhow::Result<(TermRangeWarmupInfo, AutomatonWarmupInfo)> {
let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager);
visitor.visit(query_ast)?;
Ok((
172 changes: 92 additions & 80 deletions quickwit/quickwit-query/src/query_ast/wildcard_query.rs
@@ -99,7 +99,6 @@ impl WildcardQuery {
let field_type = field_entry.field_type();

let sub_query_parts = parse_wildcard_query(&self.value);
// TODO handle json_path

match field_type {
FieldType::Str(ref text_options) => {
@@ -191,76 +190,6 @@ impl WildcardQuery {
)),
}
}

/*
pub fn extract_prefix_term(
&self,
schema: &TantivySchema,
tokenizer_manager: &TokenizerManager,
) -> Result<(Field, Term), InvalidQuery> {
let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?;
let field_type = field_entry.field_type();

let prefix = unescape_with_final_wildcard(&self.value)?;

match field_type {
FieldType::Str(ref text_options) => {
let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let tokenizer_name = text_field_indexing.tokenizer();
let mut normalizer = tokenizer_manager
.get_normalizer(tokenizer_name)
.with_context(|| {
format!("no tokenizer named `{}` is registered", tokenizer_name)
})?;
let mut token_stream = normalizer.token_stream(&prefix);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
let term: Term = Term::from_field_text(field, &token.text);
tokens.push(term);
});
let term = extract_unique_token(tokens)?;
Ok((field, term))
}
FieldType::JsonObject(json_options) => {
let text_field_indexing =
json_options.get_text_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let tokenizer_name = text_field_indexing.tokenizer();
let mut normalizer = tokenizer_manager
.get_normalizer(tokenizer_name)
.with_context(|| {
format!("no tokenizer named `{}` is registered", tokenizer_name)
})?;
let mut token_stream = normalizer.token_stream(&prefix);
let mut tokens = Vec::new();

token_stream.process(&mut |token| {
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
term.append_type_and_str(&token.text);
tokens.push(term);
});
let term = extract_unique_token(tokens)?;
Ok((field, term))
}
_ => Err(InvalidQuery::SchemaError(
"trying to run a Wildcard query on a non-text field".to_string(),
)),
}
}
*/
}

impl BuildTantivyAst for WildcardQuery {
@@ -271,14 +200,6 @@ impl BuildTantivyAst for WildcardQuery {
_search_fields: &[String],
_with_validation: bool,
) -> Result<TantivyQueryAst, InvalidQuery> {
/*
let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?;

let mut phrase_prefix_query =
tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]);
phrase_prefix_query.set_max_expansions(u32::MAX);
Ok(phrase_prefix_query.into())
*/
let (field, regex) = self.to_regex(schema, tokenizer_manager)?;
let regex_query = tantivy::query::RegexQuery::from_pattern(&regex, field)
.context("failed to build regex from wildcard")?;
@@ -288,5 +209,96 @@

#[cfg(test)]
mod tests {
// TODO add test
use tantivy::schema::{TextFieldIndexing, TextOptions};

use super::*;
use crate::create_default_quickwit_tokenizer_manager;

#[test]
fn test_wildcard_query_to_regex_on_text() {
let query = WildcardQuery {
field: "text_field".to_string(),
value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
};

let tokenizer_manager = create_default_quickwit_tokenizer_manager();
for tokenizer in ["raw", "whitespace"] {
let mut schema_builder = TantivySchema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
schema_builder.add_text_field("text_field", text_options);
let schema = schema_builder.build();

let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
}

for tokenizer in [
"raw_lowercase",
"lowercase",
"default",
"en_stem",
"chinese_compatible",
"source_code_default",
"source_code_with_hex",
] {
let mut schema_builder = TantivySchema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
schema_builder.add_text_field("text_field", text_options);
let schema = schema_builder.build();

let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();

assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut");
}
}

#[test]
fn test_wildcard_query_to_regex_on_json() {
let query = WildcardQuery {
            // this voluntarily contains uppercase and regex-unsafe chars to make sure we properly
// keep the case, but sanitize special chars
field: "json_field.Inner.Fie*ld".to_string(),
value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
};

let tokenizer_manager = create_default_quickwit_tokenizer_manager();
for tokenizer in ["raw", "whitespace"] {
let mut schema_builder = TantivySchema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
schema_builder.add_json_field("json_field", text_options);
let schema = schema_builder.build();

let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
assert_eq!(
regex,
"Inner\u{1}Fie\\*ld\0sMyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"
);
}

for tokenizer in [
"raw_lowercase",
"lowercase",
"default",
"en_stem",
"chinese_compatible",
"source_code_default",
"source_code_with_hex",
] {
let mut schema_builder = TantivySchema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
schema_builder.add_json_field("json_field", text_options);
let schema = schema_builder.build();

let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();

assert_eq!(
regex,
"Inner\u{1}Fie\\*ld\0smystring wh1ch.a\\.normal tokenizer would.*cut"
);
}
}
}
@@ -158,7 +158,7 @@ json:
query_string:
default_field: payload.description
lenient: true
query: "Jour* AND unix"
query: "Jou*al AND unix"
expected:
hits:
total:
@@ -170,7 +170,7 @@
query_string:
default_field: payload.description
lenient: true
query: "jour* AND unix"
query: "jou*al AND unix"
expected:
hits:
total: