1,324 changes: 769 additions & 555 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions helix-cli/src/args.rs
@@ -69,13 +69,13 @@ pub enum CommandType {
#[derive(Debug, Args)]
#[clap(name = "deploy", about = "Deploy a Helix project")]
pub struct DeployCommand {
#[clap(short, long, help = "The path to the project")]
#[clap(short = 'P', long, help = "The path to the project")]
pub path: Option<String>,

#[clap(short, long, help = "The output path")]
pub output: Option<String>,

#[clap(short, long, help = "Port to run the instance on")]
#[clap(short = 'p', long, help = "Port to run the instance on")]
pub port: Option<u16>,
}
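
One note on this change: with a bare short, clap derives the short flag from the first letter of the field name, so path and port would both claim -p, and clap rejects duplicate short flags when the command is assembled. Spelling them out as -P (path) and -p (port) removes the clash. A minimal standalone sketch, not part of the diff and assuming clap with the derive feature; the Parser wrapper and main below are illustrative only, since the real struct derives Args and is nested under the CLI's subcommands:

use clap::Parser;

// Hypothetical standalone mirror of DeployCommand after this change.
#[derive(Debug, Parser)]
#[clap(name = "deploy", about = "Deploy a Helix project")]
struct Deploy {
    #[clap(short = 'P', long, help = "The path to the project")]
    path: Option<String>,

    #[clap(short, long, help = "The output path")]
    output: Option<String>,

    #[clap(short = 'p', long, help = "Port to run the instance on")]
    port: Option<u16>,
}

fn main() {
    // e.g. `deploy -P ./my-project -p 8080 -o ./build`
    let cmd = Deploy::parse();
    println!("{cmd:?}");
}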

8 changes: 7 additions & 1 deletion helixdb/Cargo.toml
@@ -12,7 +12,7 @@ tokio = { version = "1.44.2", features = ["full"] }
serde = { version = "1.0.217", features = ["derive"] }
serde_json = "1.0.110"
bincode = "1.3.3" # TODO: Figure out bincode 2 impl with current serde impl
sonic-rs = "0.5.0"
sonic-rs = { version = "0.3.1" }
inventory = "0.3.16"
twox-hash = "2.1.0"
heed3 = "0.22.0"
@@ -47,6 +47,9 @@ tokio-postgres = { version = "0.7", features = [

tempfile = { version = "3.2", optional = true }

bytes = "1.4.0"
thiserror = "1.0"

[dev-dependencies]
rand = "0.9.0"
lazy_static = "1.4.0"
@@ -74,3 +77,6 @@ default = ["full"]
strip = "debuginfo"
lto = true
opt-level = "z"

[build-dependencies]
lalrpop = "0.20.0"
123 changes: 51 additions & 72 deletions helixdb/src/helix_engine/bm25/bm25.rs
@@ -4,7 +4,7 @@ use std::{borrow::Cow, collections::HashMap, sync::Arc};

use crate::{
helix_engine::{
- storage_core::{storage_core::HelixGraphStorage, storage_methods::StorageMethods},
+ // storage_core::{storage_core::HelixGraphStorage, storage_methods::StorageMethods},
types::GraphError,
vector_core::{hnsw::HNSW, vector::HVector},
},
@@ -319,31 +319,10 @@ impl BM25 for HBM25Config {
let k1 = 1.2;
let b = 0.75;

- // Ensure we don't have division by zero
- let df = df.max(1);
- let total_docs = total_docs.max(1);
-
- // Calculate IDF: log((N - df + 0.5) / (df + 0.5))
- // This can be negative when df is high relative to N, which is mathematically correct
- let idf = ((total_docs as f64 - df as f64 + 0.5) / (df as f64 + 0.5)).ln();
-
- // Ensure avgdl is not zero
- let avgdl = if avgdl > 0.0 {
- avgdl
- } else {
- doc_length as f64
- };
-
- // Calculate BM25 score
- let tf_component = (tf as f64 * (k1 as f64 + 1.0))
- / (tf as f64 + k1 as f64 * (1.0 - b as f64 + b as f64 * (doc_length as f64 / avgdl)));
-
- let score = (idf * tf_component) as f32;
-
- // The score can be negative when IDF is negative (term appears in most documents)
- // This is mathematically correct - such terms have low discriminative power
- // But documents with higher tf should still score higher than those with lower tf
- score
+ let idf = ((total_docs as f64 - df as f64 + 0.5) / (df as f64 + 0.5)).ln_1p();
+ let numerator = tf as f64 * (k1 as f64 + 1.0);
+ let denominator = tf as f64 + k1 as f64 * (1.0 - b as f64 + b as f64 * (doc_length as f64 / avgdl));
+ (idf * numerator / denominator) as f32
}
}
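
Besides being shorter, the rewritten scorer changes the IDF variant: switching from ln to ln_1p turns the classic ln((N - df + 0.5) / (df + 0.5)), which goes negative once a term appears in more than half the documents, into ln(1 + (N - df + 0.5) / (df + 0.5)) = ln((N + 1) / (df + 0.5)), the non-negative Lucene-style form. A standalone sketch of the difference for a common term, not part of the diff and using the same N/df notation as the comments above:

fn main() {
    // A term that appears in 80 of 100 documents (df > N / 2).
    let (total_docs, df) = (100.0_f64, 80.0_f64);
    let ratio = (total_docs - df + 0.5) / (df + 0.5);
    let classic_idf = ratio.ln(); // about -1.37: negative, can flip the score's sign
    let ln_1p_idf = ratio.ln_1p(); // about 0.23: ln(1 + ratio), always non-negative
    println!("classic = {classic_idf:.3}, ln_1p = {ln_1p_idf:.3}");
}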

@@ -359,52 +338,52 @@ pub trait HybridSearch {
) -> Result<Vec<(u128, f32)>, GraphError>;
}

- impl HybridSearch for HelixGraphStorage {
- fn hybrid_search(
- &self,
- txn: &RoTxn,
- query: &str,
- query_vector: &[f64],
- vector_query: Option<&[f32]>,
- alpha: f32,
- limit: usize,
- ) -> Result<Vec<(u128, f32)>, GraphError> {
- // Get BM25 scores
- let bm25_results = self.bm25.search(txn, query, limit * 2)?; // Get more results for better fusion
- let mut combined_scores: HashMap<u128, f32> = HashMap::new();
-
- // Add BM25 scores (weighted by alpha)
- for (doc_id, score) in bm25_results {
- combined_scores.insert(doc_id, alpha * score);
- }
-
- // Add vector similarity scores if provided (weighted by 1-alpha)
- if let Some(_query_vector) = vector_query {
- // This would integrate with your existing vector search
- // For now, we'll just use BM25 scores
- // You would call your vector similarity search here and combine scores
- let vector_results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
- txn,
- query_vector,
- limit * 2,
- None,
- false,
- )?;
- for doc in vector_results {
- let doc_id = doc.id;
- let score = doc.distance.unwrap_or(0.0);
- combined_scores.insert(doc_id, (1.0 - alpha) * score as f32);
- }
- }
-
- // Sort by combined score and return top results
- let mut results: Vec<(u128, f32)> = combined_scores.into_iter().collect();
- results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
- results.truncate(limit);
-
- Ok(results)
- }
- }
+ // impl HybridSearch for HelixGraphStorage {
+ // fn hybrid_search(
+ // &self,
+ // txn: &RoTxn,
+ // query: &str,
+ // query_vector: &[f64],
+ // vector_query: Option<&[f32]>,
+ // alpha: f32,
+ // limit: usize,
+ // ) -> Result<Vec<(u128, f32)>, GraphError> {
+ // // Get BM25 scores
+ // let bm25_results = self.bm25.search(txn, query, limit * 2)?; // Get more results for better fusion
+ // let mut combined_scores: HashMap<u128, f32> = HashMap::new();
+
+ // // Add BM25 scores (weighted by alpha)
+ // for (doc_id, score) in bm25_results {
+ // combined_scores.insert(doc_id, alpha * score);
+ // }
+
+ // // Add vector similarity scores if provided (weighted by 1-alpha)
+ // if let Some(_query_vector) = vector_query {
+ // // This would integrate with your existing vector search
+ // // For now, we'll just use BM25 scores
+ // // You would call your vector similarity search here and combine scores
+ // let vector_results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
+ // txn,
+ // query_vector,
+ // limit * 2,
+ // None,
+ // false,
+ // )?;
+ // for doc in vector_results {
+ // let doc_id = doc.id;
+ // let score = doc.distance.unwrap_or(0.0);
+ // combined_scores.insert(doc_id, (1.0 - alpha) * score as f32);
+ // }
+ // }
+
+ // // Sort by combined score and return top results
+ // let mut results: Vec<(u128, f32)> = combined_scores.into_iter().collect();
+ // results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+ // results.truncate(limit);
+
+ // Ok(results)
+ // }
+ // }

pub trait BM25Flatten {
fn flatten_bm25(&self) -> String;
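
For reference, the fusion step that the commented-out hybrid_search performs is a linear combination: BM25 scores weighted by alpha, vector-similarity scores by 1 - alpha, merged per document id, sorted descending, and truncated to the limit. Below is a standalone sketch of just that step with the storage types stripped out; it is not part of the diff, and unlike the commented code (whose second insert overwrites the BM25 contribution for documents returned by both searches) it sums the two contributions:

use std::collections::HashMap;

// Illustrative fusion of two score lists keyed by document id.
fn fuse_scores(
    bm25: &[(u128, f32)],
    vector: &[(u128, f32)],
    alpha: f32,
    limit: usize,
) -> Vec<(u128, f32)> {
    let mut combined: HashMap<u128, f32> = HashMap::new();
    for &(id, score) in bm25 {
        *combined.entry(id).or_insert(0.0) += alpha * score;
    }
    for &(id, score) in vector {
        *combined.entry(id).or_insert(0.0) += (1.0 - alpha) * score;
    }
    // Sort by combined score, highest first, and keep the top `limit` entries.
    let mut results: Vec<(u128, f32)> = combined.into_iter().collect();
    results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    results.truncate(limit);
    results
}

fn main() {
    let bm25 = vec![(1_u128, 2.0_f32), (2, 1.0)];
    let vector = vec![(2_u128, 0.9_f32), (3, 0.8)];
    println!("{:?}", fuse_scores(&bm25, &vector, 0.5, 2));
}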