Files
pikl/crates/pikl-core/src/query/pipeline.rs
J. Champagne 6a4cc85285 feat(core): Expand filtering into pipeline supporting multiple text search modes.
Modes include: exact match, smart-case, and regular expressions.
2026-03-13 22:55:47 -04:00

618 lines
20 KiB
Rust

//! Filter pipeline with `|` chaining. Splits a query into
//! segments, applies the appropriate filter strategy to each,
//! and chains results through stages. Supports incremental
//! caching: unchanged stages keep their results.
use super::filter::{Filter, FuzzyFilter};
use super::strategy::{self, FilterKind};
/// A multi-stage filter pipeline. Each `|` in the query
/// creates a new stage that filters the previous stage's
/// output. Implements [`Filter`] so it can be used as a
/// drop-in replacement for a single filter.
///
/// Stage results are cached and only recomputed for stages
/// flagged dirty, so editing a late segment does not re-run
/// the earlier ones.
pub struct FilterPipeline {
/// Master item list: (original index, label), in the order
/// the items were pushed.
// NOTE(review): the non-fuzzy stages look labels up with
// `items[idx]`, which presumes an item's original index equals
// its position in this vector. Confirm callers push 0, 1, 2, ...
items: Vec<(usize, String)>,
/// Pipeline stages, one per `|`-separated segment.
/// Empty when the query has no non-empty segments, in which
/// case every item is reported as matching.
stages: Vec<PipelineStage>,
/// The last raw query string, used for diffing.
// Written by `set_query`; nothing in this file reads it back.
last_raw_query: String,
}
/// One `|`-separated segment of the query together with its
/// cached result.
struct PipelineStage {
/// The raw segment text (including prefix chars).
/// Compared against the incoming segment to decide whether the
/// cached result can be kept.
raw_segment: String,
/// Which matching strategy this stage applies.
kind: FilterKind,
/// When true the stage keeps the items that do NOT match.
inverse: bool,
/// The query text after prefix stripping.
query_text: String,
/// The strategy-specific filter (only used for fuzzy stages).
/// It is fed every item pushed to the pipeline, not just the
/// previous stage's output; the result is narrowed afterwards.
fuzzy: Option<FuzzyFilter>,
/// Items passing this stage, stored as the original indices
/// that were supplied when the items were pushed.
cached_indices: Vec<usize>,
/// True when `cached_indices` is stale and must be recomputed
/// on the next evaluation.
dirty: bool,
}
/// Split a raw query into pipeline segments on unescaped `|` characters.
///
/// Rules, applied character by character:
///
/// * Outside a regex, `\|` is unescaped to a literal `|` and does not split.
/// * A `/` at the start of a segment (optionally preceded by `!` and/or
///   whitespace) opens a regex; the next unescaped `/` closes it. Pipes
///   between the delimiters belong to the pattern and never split.
/// * Inside a regex, backslash escapes are copied through verbatim, so
///   `\|` stays a literal-pipe escape for the regex engine (rather than
///   turning into alternation) and `\/` does not close the pattern.
///
/// Every segment is trimmed; segments that end up empty are dropped, so
/// `"a | | b"` yields `["a", "b"]` and an empty query yields no segments.
fn split_pipeline(query: &str) -> Vec<String> {
    let mut segments: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut in_regex = false;
    let mut chars = query.chars().peekable();

    while let Some(c) = chars.next() {
        if c == '\\' {
            if in_regex {
                // Keep the escape pair intact for the regex engine and make
                // sure the escaped character is not seen as a delimiter.
                current.push(c);
                if let Some(escaped) = chars.next() {
                    current.push(escaped);
                }
                continue;
            }
            if chars.peek() == Some(&'|') {
                // Escaped pipe in plain text: emit a literal `|`.
                chars.next();
                current.push('|');
                continue;
            }
        }

        if c == '/' {
            if in_regex {
                // Closing delimiter.
                in_regex = false;
                current.push(c);
                continue;
            }
            // Opening delimiter: only recognised at the start of a segment,
            // either bare (`/pat/`) or after the inverse marker (`!/pat/`).
            let lead = current.trim();
            if lead.is_empty() || lead == "!" {
                in_regex = true;
                current.push(c);
                continue;
            }
        }

        if c == '|' && !in_regex {
            let segment = current.trim();
            if !segment.is_empty() {
                segments.push(segment.to_string());
            }
            current.clear();
            continue;
        }

        current.push(c);
    }

    let tail = current.trim();
    if !tail.is_empty() {
        segments.push(tail.to_string());
    }
    segments
}
impl Default for FilterPipeline {
fn default() -> Self {
Self::new()
}
}
impl FilterPipeline {
    /// Create an empty pipeline with no items, no stages and no query.
    pub fn new() -> Self {
        Self {
            items: Vec::new(),
            stages: Vec::new(),
            last_raw_query: String::new(),
        }
    }

    /// Re-run every dirty stage, in order.
    ///
    /// Stage 0 starts from all pushed items; each later stage starts from
    /// the previous stage's `cached_indices`. Clean stages are skipped and
    /// keep their cached result.
    fn evaluate(&mut self) {
        let items = &self.items;
        for stage_idx in 0..self.stages.len() {
            if !self.stages[stage_idx].dirty {
                continue;
            }

            // Borrow the upstream stage read-only and this stage mutably, so
            // the upstream result does not have to be cloned.
            let (upstream, rest) = self.stages.split_at_mut(stage_idx);
            let stage = &mut rest[0];

            let all_items: Vec<usize>;
            let input_indices: &[usize] = match upstream.last() {
                Some(previous) => previous.cached_indices.as_slice(),
                None => {
                    all_items = items.iter().map(|(idx, _)| *idx).collect();
                    all_items.as_slice()
                }
            };

            let result = match stage.kind {
                FilterKind::Fuzzy => Self::eval_fuzzy(stage, input_indices, stage_idx),
                FilterKind::Exact => {
                    // Case-insensitive substring match; lower-case the query
                    // once rather than once per item.
                    let needle = stage.query_text.to_lowercase();
                    Self::eval_simple(stage, input_indices, items, |label, _query| {
                        label.to_lowercase().contains(&needle)
                    })
                }
                FilterKind::Regex => match fancy_regex::Regex::new(&stage.query_text) {
                    // A runtime matching error counts as "no match".
                    Ok(re) => Self::eval_simple(stage, input_indices, items, |label, _query| {
                        re.is_match(label).unwrap_or(false)
                    }),
                    // A pattern that does not compile (typically one that is
                    // still being typed) makes the stage a pass-through. This
                    // holds for inverse stages too, which would otherwise
                    // exclude every item.
                    Err(_) => input_indices.to_vec(),
                },
            };

            stage.cached_indices = result;
            stage.dirty = false;
        }
    }

    /// Evaluate a fuzzy stage.
    ///
    /// The stage's `FuzzyFilter` is queried against everything it has been
    /// fed, and the outcome is then reconciled with `input_indices`:
    ///
    /// * inverse: keep the inputs the fuzzy filter did not match, in input
    ///   order;
    /// * first stage: use the fuzzy results as returned;
    /// * later stages: keep the fuzzy results that are also inputs, in the
    ///   fuzzy filter's order.
    ///
    /// Returns an empty list if the stage has no fuzzy filter attached.
    fn eval_fuzzy(
        stage: &mut PipelineStage,
        input_indices: &[usize],
        stage_idx: usize,
    ) -> Vec<usize> {
        let Some(fuzzy) = stage.fuzzy.as_mut() else {
            return Vec::new();
        };
        fuzzy.set_query(&stage.query_text);

        // An inverse stage with nothing to exclude yet is a pass-through,
        // the same way `eval_simple` treats an empty query.
        if stage.inverse && stage.query_text.is_empty() {
            return input_indices.to_vec();
        }

        let fuzzy_results: Vec<usize> = (0..fuzzy.matched_count())
            .filter_map(|i| fuzzy.matched_index(i))
            .collect();

        if stage.inverse {
            let excluded: std::collections::HashSet<usize> = fuzzy_results.into_iter().collect();
            input_indices
                .iter()
                .copied()
                .filter(|idx| !excluded.contains(idx))
                .collect()
        } else if stage_idx == 0 {
            fuzzy_results
        } else {
            let allowed: std::collections::HashSet<usize> =
                input_indices.iter().copied().collect();
            fuzzy_results
                .into_iter()
                .filter(|idx| allowed.contains(idx))
                .collect()
        }
    }

    /// Evaluate a stage whose match test is a plain predicate.
    ///
    /// `matcher` receives `(label, query_text)`. Items are kept when the
    /// predicate is true, or when it is false for an inverse stage. An empty
    /// query keeps every input regardless of `inverse`. Input order is
    /// preserved.
    // NOTE(review): labels are read with `items[idx]`, which presumes each
    // item's original index equals its position in `items`; an index past
    // the end would panic here. Confirm against the callers of `push`.
    fn eval_simple(
        stage: &PipelineStage,
        input_indices: &[usize],
        items: &[(usize, String)],
        matcher: impl Fn(&str, &str) -> bool,
    ) -> Vec<usize> {
        if stage.query_text.is_empty() {
            return input_indices.to_vec();
        }
        let keep_on_match = !stage.inverse;
        input_indices
            .iter()
            .copied()
            .filter(|&idx| matcher(&items[idx].1, &stage.query_text) == keep_on_match)
            .collect()
    }
}
impl Filter for FilterPipeline {
fn push(&mut self, index: usize, label: &str) {
self.items.push((index, label.to_string()));
// Push to any existing fuzzy filters in stages
for stage in &mut self.stages {
if let Some(ref mut fuzzy) = stage.fuzzy {
fuzzy.push(index, label);
}
stage.dirty = true;
}
}
fn set_query(&mut self, query: &str) {
self.last_raw_query = query.to_string();
let segments = split_pipeline(query);
// Reconcile stages with new segments
let mut new_len = segments.len();
// If query is empty, clear everything
if segments.is_empty() {
self.stages.clear();
new_len = 0;
}
// Compare position-by-position
for (i, seg) in segments.iter().enumerate() {
if i < self.stages.len() {
if self.stages[i].raw_segment == *seg {
// Unchanged: keep cache
continue;
}
// Changed: update this stage, mark dirty
let parsed = strategy::parse_segment(seg);
self.stages[i].raw_segment = seg.clone();
self.stages[i].kind = parsed.kind;
self.stages[i].inverse = parsed.inverse;
self.stages[i].query_text = parsed.query.to_string();
self.stages[i].dirty = true;
// Mark all downstream stages dirty too
for j in (i + 1)..self.stages.len() {
self.stages[j].dirty = true;
}
} else {
// New stage
let parsed = strategy::parse_segment(seg);
let fuzzy = if parsed.kind == FilterKind::Fuzzy {
let mut f = FuzzyFilter::new();
for (idx, label) in &self.items {
f.push(*idx, label);
}
Some(f)
} else {
None
};
self.stages.push(PipelineStage {
raw_segment: seg.clone(),
kind: parsed.kind,
inverse: parsed.inverse,
query_text: parsed.query.to_string(),
fuzzy,
cached_indices: Vec::new(),
dirty: true,
});
}
}
// Truncate extra stages
self.stages.truncate(new_len);
// Evaluate dirty stages
self.evaluate();
}
fn matched_count(&self) -> usize {
match self.stages.last() {
Some(stage) => stage.cached_indices.len(),
None => self.items.len(),
}
}
fn matched_index(&self, match_position: usize) -> Option<usize> {
match self.stages.last() {
Some(stage) => stage.cached_indices.get(match_position).copied(),
None => self.items.get(match_position).map(|(idx, _)| *idx),
}
}
}
// Unit tests for the pipeline. They cover three areas: end-to-end
// filtering through `FilterPipeline` (single and chained stages),
// incremental re-evaluation when the query is edited, and the
// `split_pipeline` tokenizer on its own.
#[cfg(test)]
mod tests {
use super::*;
// Push each label with its slice position as the item index.
fn push_items(p: &mut FilterPipeline, labels: &[&str]) {
for (i, label) in labels.iter().enumerate() {
p.push(i, label);
}
}
// Collect the currently matched labels, in match order.
fn matched_labels<'a>(p: &FilterPipeline, labels: &'a [&str]) -> Vec<&'a str> {
(0..p.matched_count())
.filter_map(|i| p.matched_index(i))
.map(|idx| labels[idx])
.collect()
}
// An empty query creates no stages, so every item matches.
#[test]
fn empty_query_returns_all() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("");
assert_eq!(p.matched_count(), 3);
}
// A segment with no prefix is a fuzzy stage.
#[test]
fn single_fuzzy_stage() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("ban");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["banana"]);
}
// A leading `'` selects exact (substring) matching.
#[test]
fn single_exact_stage() {
let mut p = FilterPipeline::new();
let labels = &["apple", "pineapple", "cherry"];
push_items(&mut p, labels);
p.set_query("'apple");
let result = matched_labels(&p, labels);
assert!(result.contains(&"apple"));
assert!(result.contains(&"pineapple"));
assert!(!result.contains(&"cherry"));
}
// Exact stage followed by an inverse fuzzy stage.
#[test]
fn two_stage_pipeline() {
let mut p = FilterPipeline::new();
let labels = &["error_log", "warning_temp", "info_log", "debug_temp"];
push_items(&mut p, labels);
p.set_query("'log | !temp");
let result = matched_labels(&p, labels);
assert!(result.contains(&"error_log"));
assert!(result.contains(&"info_log"));
assert!(!result.contains(&"warning_temp"));
assert!(!result.contains(&"debug_temp"));
}
// Exact, inverse fuzzy and regex stages chained together.
#[test]
fn three_stage_pipeline() {
let mut p = FilterPipeline::new();
let labels = &[
"error_log_123",
"warning_temp_456",
"info_log_789",
"debug_temp_012",
];
push_items(&mut p, labels);
p.set_query("'log | !temp | /[0-9]+/");
let result = matched_labels(&p, labels);
assert!(result.contains(&"error_log_123"));
assert!(result.contains(&"info_log_789"));
assert_eq!(result.len(), 2);
}
// Editing only the second segment must still give correct results.
#[test]
fn incremental_stage_1_preserved() {
let mut p = FilterPipeline::new();
let labels = &["error_log", "warning_temp", "info_log", "debug_temp"];
push_items(&mut p, labels);
// First query
p.set_query("'log | !error");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["info_log"]);
// Edit stage 2 only: stage 1 cache should be preserved
p.set_query("'log | !info");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["error_log"]);
}
// Removing a trailing segment drops its stage.
#[test]
fn pop_stage_on_backspace() {
let mut p = FilterPipeline::new();
let labels = &["error_log", "warning_temp", "info_log"];
push_items(&mut p, labels);
p.set_query("'log | !error");
assert_eq!(matched_labels(&p, labels), vec!["info_log"]);
// Backspace over the pipe: now just "'log"
p.set_query("'log");
let result = matched_labels(&p, labels);
assert!(result.contains(&"error_log"));
assert!(result.contains(&"info_log"));
assert_eq!(result.len(), 2);
}
#[test]
fn empty_segments_skipped() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana"];
push_items(&mut p, labels);
p.set_query("apple | | banana");
// Middle empty segment should be ignored
// This should be equivalent to "apple | banana"
// which is fuzzy "apple" then fuzzy "banana".
// "apple" matches apple, "banana" matches banana.
// Pipeline: first stage matches apple, second stage filters that for banana.
// Neither "apple" nor "banana" matches both, so 0 results.
assert_eq!(p.matched_count(), 0);
}
// `\|` in plain text is a literal pipe, not a stage separator.
#[test]
fn escaped_pipe() {
let mut p = FilterPipeline::new();
let labels = &["foo|bar", "foobar", "baz"];
push_items(&mut p, labels);
p.set_query("'foo\\|bar");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["foo|bar"]);
}
// A pipe between regex delimiters is alternation, not a separator.
#[test]
fn pipe_inside_regex_not_split() {
let mut p = FilterPipeline::new();
let labels = &["foo", "bar", "baz"];
push_items(&mut p, labels);
p.set_query("/foo|bar/");
let result = matched_labels(&p, labels);
assert!(result.contains(&"foo"));
assert!(result.contains(&"bar"));
assert!(!result.contains(&"baz"));
}
// `!'text` keeps the items that do not contain the text.
#[test]
fn inverse_exact() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("!'banana");
let result = matched_labels(&p, labels);
assert!(result.contains(&"apple"));
assert!(result.contains(&"cherry"));
assert!(!result.contains(&"banana"));
}
// `!/pattern/` keeps the items the regex does not match.
#[test]
fn inverse_regex() {
let mut p = FilterPipeline::new();
let labels = &["item-001", "item-abc", "item-123"];
push_items(&mut p, labels);
p.set_query("!/[0-9]+/");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["item-abc"]);
}
// Items pushed after a query are seen once the query is set again.
#[test]
fn add_items_picked_up() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana"];
push_items(&mut p, labels);
p.set_query("'cherry");
assert_eq!(p.matched_count(), 0);
// Add new item
p.push(2, "cherry");
// Re-evaluate with same query
p.set_query("'cherry");
assert_eq!(p.matched_count(), 1);
assert_eq!(p.matched_index(0), Some(2));
}
// -- split_pipeline tokenizer tests --
#[test]
fn split_pipeline_basic() {
let segs = split_pipeline("foo | bar");
assert_eq!(segs, vec!["foo", "bar"]);
}
#[test]
fn split_pipeline_escaped() {
let segs = split_pipeline("foo\\|bar");
assert_eq!(segs, vec!["foo|bar"]);
}
#[test]
fn split_pipeline_regex() {
let segs = split_pipeline("/foo|bar/ | baz");
assert_eq!(segs, vec!["/foo|bar/", "baz"]);
}
#[test]
fn split_pipeline_empty_segments() {
let segs = split_pipeline("foo | | bar");
assert_eq!(segs, vec!["foo", "bar"]);
}
#[test]
fn split_pipeline_inverse_regex() {
let segs = split_pipeline("!/foo|bar/ | baz");
assert_eq!(segs, vec!["!/foo|bar/", "baz"]);
}
// -- Pipeline edge case tests --
#[test]
fn fuzzy_as_second_stage() {
let mut p = FilterPipeline::new();
let labels = &["error_log", "warning_temp", "info_log", "debug_log"];
push_items(&mut p, labels);
// Exact first, then fuzzy second
p.set_query("'log | debug");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["debug_log"]);
}
// Changing the first segment must invalidate every later stage.
#[test]
fn three_stage_edit_stage_one() {
let mut p = FilterPipeline::new();
let labels = &[
"error_log_123",
"warning_temp_456",
"info_log_789",
"debug_temp_012",
];
push_items(&mut p, labels);
p.set_query("'log | !error | /[0-9]+/");
assert_eq!(matched_labels(&p, labels), vec!["info_log_789"]);
// Edit stage 1: now match "temp" instead of "log"
p.set_query("'temp | !error | /[0-9]+/");
let result = matched_labels(&p, labels);
assert!(result.contains(&"warning_temp_456"));
assert!(result.contains(&"debug_temp_012"));
assert!(!result.contains(&"error_log_123"));
}
#[test]
fn invalid_regex_in_pipeline() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
// Invalid regex: unclosed bracket. Should match everything (graceful degradation).
p.set_query("/[invalid/");
assert_eq!(p.matched_count(), 3);
}
// Re-submitting an identical query must not change the result.
#[test]
fn same_query_twice_stable() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("ban");
let first = matched_labels(&p, labels);
p.set_query("ban");
let second = matched_labels(&p, labels);
assert_eq!(first, second);
}
#[test]
fn query_shrink_to_single() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("'ban | !x");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["banana"]);
// Shrink back to single stage
p.set_query("'ban");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["banana"]);
}
// A query nothing matches yields an empty result, not a fallback.
#[test]
fn all_items_excluded() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana"];
push_items(&mut p, labels);
p.set_query("xyz");
assert_eq!(p.matched_count(), 0);
}
// Regex stages keep matches in item order.
#[test]
fn single_regex_stage() {
let mut p = FilterPipeline::new();
let labels = &["item-001", "item-abc", "item-123"];
push_items(&mut p, labels);
p.set_query("/[0-9]+/");
let result = matched_labels(&p, labels);
assert_eq!(result, vec!["item-001", "item-123"]);
}
// `!text` keeps the items the fuzzy matcher rejects.
#[test]
fn inverse_fuzzy_stage() {
let mut p = FilterPipeline::new();
let labels = &["apple", "banana", "cherry"];
push_items(&mut p, labels);
p.set_query("!ban");
let result = matched_labels(&p, labels);
assert!(result.contains(&"apple"));
assert!(result.contains(&"cherry"));
assert!(!result.contains(&"banana"));
}
}