Skip to content

Commit fe22a42

Browse files
authored
feat: support array_contains in LabelList scalar index (lance-format#5681)
closes lance-format#3687 Changes: - Make array_contains(list_col, value) use the existing LABEL_LIST scalar index. - DataFusion treats array_contains as an alias of array_has (often wrapped in an alias expr), so we unwrap that and map it to the LabelList index query. - Add a Python test and update the LabelList docs to mention array_has / array_contains.
1 parent 5454242 commit fe22a42

File tree

5 files changed

+64
-8
lines changed

5 files changed

+64
-8
lines changed

docs/src/format/table/index/scalar/label_list.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ The label list index uses a bitmap index internally and stores its data in:
2626

2727
The label list index provides exact results for the following query types:
2828

29-
| Query Type | Description | Operation | Result Type |
30-
|----------------------|----------------------------------------|---------------------------------------------|-------------|
31-
| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact |
32-
| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact |
29+
| Query Type | Description | Operation | Result Type |
30+
|-------------------------------------|----------------------------------------|---------------------------------------------|-------------|
31+
| **array_has / array_contains** | Array contains the specified value | Bitmap lookup for a single label | Exact |
32+
| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact |
33+
| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact |

python/python/lance/dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2452,8 +2452,9 @@ def create_scalar_index(
24522452
* ``LABEL_LIST``. A special index that is used to index list
24532453
columns whose values have small cardinality. For example, a column that
24542454
contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed
2455-
with a ``LABEL_LIST`` index. This index can only speedup queries with
2456-
``array_has_any`` or ``array_has_all`` filters.
2455+
with a ``LABEL_LIST`` index. This index can speed up list membership
2456+
filters such as ``array_has_any``, ``array_has_all``, and
2457+
``array_has`` / ``array_contains``.
24572458
* ``NGRAM``. A special index that is used to index string columns. This index
24582459
creates a bitmap for each ngram in the string. By default we use trigrams.
24592460
This index can currently speed up queries using the ``contains`` function

python/python/tests/test_scalar_index.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1977,6 +1977,39 @@ def test_label_list_index(tmp_path: Path):
19771977
assert indices[0]["type"] == "LabelList"
19781978

19791979

1980+
def test_label_list_index_array_contains(tmp_path: Path):
1981+
# Include lists with NULL items to ensure NULL needle behavior matches
1982+
# non-index execution.
1983+
tbl = pa.table(
1984+
{"labels": [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], [], None]}
1985+
)
1986+
dataset = lance.write_dataset(tbl, tmp_path / "dataset")
1987+
expected_null_rows = dataset.to_table(
1988+
filter="array_contains(labels, NULL)"
1989+
).num_rows
1990+
1991+
dataset.create_scalar_index("labels", index_type="LABEL_LIST")
1992+
1993+
result = dataset.to_table(filter="array_contains(labels, 'foo')")
1994+
assert result.num_rows == 1
1995+
1996+
result = dataset.to_table(filter="array_contains(labels, 'bar')")
1997+
assert result.num_rows == 2
1998+
1999+
result = dataset.to_table(filter="array_contains(labels, 'oof')")
2000+
assert result.num_rows == 0
2001+
2002+
explain = dataset.scanner(filter="array_contains(labels, 'foo')").explain_plan()
2003+
assert "ScalarIndexQuery" in explain
2004+
2005+
# NULL needle: preserve semantics (must match pre-index execution) and avoid
2006+
# using the LABEL_LIST index.
2007+
actual_null_rows = dataset.to_table(filter="array_contains(labels, NULL)").num_rows
2008+
assert actual_null_rows == expected_null_rows
2009+
explain = dataset.scanner(filter="array_contains(labels, NULL)").explain_plan()
2010+
assert "ScalarIndexQuery" not in explain
2011+
2012+
19802013
def test_create_index_empty_dataset(tmp_path: Path):
19812014
# Creating an index on an empty dataset is (currently) not terribly useful but
19822015
# we shouldn't return strange errors.

rust/lance-index/src/scalar/expression.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,26 @@ impl ScalarQueryParser for LabelListQueryParser {
490490
if args.len() != 2 {
491491
return None;
492492
}
493+
// DataFusion normalizes array_contains to array_has
494+
if func.name() == "array_has" {
495+
let inner_type = match data_type {
496+
DataType::List(field) | DataType::LargeList(field) => field.data_type(),
497+
_ => return None,
498+
};
499+
let scalar = maybe_scalar(&args[1], inner_type)?;
500+
// array_has(..., NULL) returns no matches in datafusion, but the index would
501+
// match rows containing NULL. Fallback to match datafusion behavior.
502+
if scalar.is_null() {
503+
return None;
504+
}
505+
let query = LabelListQuery::HasAnyLabel(vec![scalar]);
506+
return Some(IndexedExpression::index_query(
507+
column.to_string(),
508+
self.index_name.clone(),
509+
Arc::new(query),
510+
));
511+
}
512+
493513
let label_list = maybe_scalar(&args[1], data_type)?;
494514
if let ScalarValue::List(list_arr) = label_list {
495515
let list_values = list_arr.values();
@@ -1651,6 +1671,7 @@ fn visit_node(
16511671
}
16521672
match expr {
16531673
Expr::Between(between) => Ok(visit_between(between, index_info)),
1674+
Expr::Alias(alias) => visit_node(alias.expr.as_ref(), index_info, depth),
16541675
Expr::Column(_) => Ok(visit_column(expr, index_info)),
16551676
Expr::InList(in_list) => Ok(visit_in_list(in_list, index_info)),
16561677
Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)),

rust/lance-index/src/scalar/label_list.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf {
5757
impl<T: ScalarIndex + DeepSizeOf> LabelListSubIndex for T {}
5858

5959
/// A scalar index that can be used on `List<T>` columns to
60-
/// support queries with array_contains_all and array_contains_any
61-
/// using an underlying bitmap index.
60+
/// accelerate list membership filters such as `array_has_all`, `array_has_any`,
61+
/// and `array_has` / `array_contains`, using an underlying bitmap index.
6262
#[derive(Clone, Debug, DeepSizeOf)]
6363
pub struct LabelListIndex {
6464
values_index: Arc<dyn LabelListSubIndex>,

0 commit comments

Comments
 (0)