diff --git a/packages/core/src/methods/dataframe/filtering/at.js b/packages/core/src/methods/dataframe/filtering/at.js new file mode 100644 index 0000000..97709d8 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/at.js @@ -0,0 +1,48 @@ +/* -------------------------------------------------------------- * + | DataFrame → filtering · at() | + * -------------------------------------------------------------- */ + +/** + * Returns a row at the specified index.
+ * `df.at(5)` → returns an object representing the row at index 5. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {number} index - Row index to select + * @returns {Object} - Object representing the selected row + * @throws {Error} If index is invalid or out of bounds + */ +export function at(df, index) { + // Validate index is an integer + if (!Number.isInteger(index)) { + throw new Error( + `Index must be an integer, got ${typeof index === 'number' ? index : typeof index}` + ); + } + + // Validate index is not negative + if (index < 0) { + throw new Error(`Index out of bounds: ${index} is negative`); + } + + const rows = df.toArray(); + + // Check if DataFrame is empty + if (rows.length === 0) { + throw new Error('Index out of bounds: DataFrame is empty'); + } + + // Check if index is within range + if (index >= rows.length) { + throw new Error( + `Index out of bounds: ${index} >= ${rows.length}` + ); + } + + return rows[index]; +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { at }; + diff --git a/packages/core/src/methods/dataframe/filtering/drop.js b/packages/core/src/methods/dataframe/filtering/drop.js new file mode 100644 index 0000000..1be19f4 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/drop.js @@ -0,0 +1,66 @@ +/* -------------------------------------------------------------- * + | DataFrame → filtering · drop() | + * -------------------------------------------------------------- */ + +/** + * Removes specified columns from a DataFrame.
+ * `df.drop(['age', 'name'])` → returns a new DataFrame without the specified columns. + * Can accept either an array of column names or a single column name as string. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {string|string[]} columns - Column name(s) to remove + * @returns {DataFrame} - New DataFrame without the dropped columns + * @throws {Error} If any column doesn't exist or if dropping all columns + */ +export function drop(df, columns) { + // Convert columns to array if it's not already + const columnsArray = Array.isArray(columns) ? columns : [columns]; + + // Handle empty column list - return a copy + if (columnsArray.length === 0) { + // Create a shallow copy using toArray() and fromRecords + const builder = + typeof df.constructor.fromRecords === 'function' + ? df.constructor.fromRecords + : (rows) => new df.constructor(rows); + return builder(df.toArray()); + } + + // Get all column names + const allColumns = df.columns; + + // Check that all columns to drop exist + for (const col of columnsArray) { + if (!allColumns.includes(col)) { + throw new Error(`Column not found: '${col}'`); + } + } + + // Create list of columns to keep + const columnsToKeep = allColumns.filter(col => !columnsArray.includes(col)); + + // Cannot drop all columns + if (columnsToKeep.length === 0) { + throw new Error('Cannot drop all columns'); + } + + // Create new data object with only the kept columns + const rows = df.toArray(); + const result = {}; + + // For each column to keep, extract its data + for (const col of columnsToKeep) { + // Use the public API to get column data + const colData = df.col(col).toArray(); + result[col] = colData; + } + + // Create a new DataFrame with the kept columns + return new df.constructor(result, df._options); +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { drop }; + diff --git a/packages/core/src/methods/dataframe/filtering/expr$.js b/packages/core/src/methods/dataframe/filtering/expr$.js new file mode 100644 index 0000000..81d4a92 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/expr$.js @@ -0,0 +1,137 @@ +/** + * Filtering method: expr$ + * + * This file provides the expr$ method for DataFrame rows using template literals + * This provides a more intuitive syntax for filtering + * + * @module methods/dataframe/filtering/expr$ + */ + +import { createTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Filters rows in a DataFrame using a template literal expression. + * This provides a more intuitive syntax for filtering. + * + * @param {Object} df - DataFrame instance + * @param {TemplateStringsArray} strings - Template strings array + * @param {...any} values - Values to interpolate into the template + * @returns {Object} - New DataFrame with filtered rows + * + * @example + * // Filter rows where age > 30 and city includes "York" + * df.expr$`age > 30 && city_includes("York")` + */ +export function expr$(df, strings, ...values) { + // Create an expression from the template string + const expression = String.raw({ raw: strings }, ...values); + + // Transform the expression, replacing string methods with special functions + const processedExpr = expression + .replace(/([a-zA-Z0-9_]+)_includes\(([^)]+)\)/g, '$1.includes($2)') + .replace(/([a-zA-Z0-9_]+)_startsWith\(([^)]+)\)/g, '$1.startsWith($2)') + .replace(/([a-zA-Z0-9_]+)_endsWith\(([^)]+)\)/g, '$1.endsWith($2)') + .replace(/([a-zA-Z0-9_]+)_match\(([^)]+)\)/g, '$1.match($2)'); + + // Create a predicate function for filtering rows + const predicate = createPredicate(processedExpr); + + // Get DataFrame rows + const rows = df.toArray(); + const allColumns = df.columns; + + // Filter rows by predicate + const filteredRows = rows.filter((row) => predicate(row)); + + // If no matching rows, return an empty DataFrame with the same columns and column types + if (filteredRows.length === 0) { + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of allColumns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Create an empty array with the same type + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const emptyTypedArray = new TypedArrayConstructor(0); + result._columns[col] = createTypedSeries(emptyTypedArray, col, df); + } else { + result._columns[col] = createTypedSeries([], col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // For non-empty results, create a new DataFrame with filtered rows + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of allColumns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Extract values for this column from the filtered rows + const values = filteredRows.map(row => row[col]); + + // Preserve the array type if it's a typed array + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const typedValues = new TypedArrayConstructor(values.length); + values.forEach((value, i) => { + typedValues[i] = value; + }); + result._columns[col] = createTypedSeries(typedValues, col, df); + } else { + result._columns[col] = createTypedSeries(values, col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +/** + * Create a predicate function for filtering rows + * + * @param {string} expr - Expression to evaluate + * @returns {Function} - Predicate function + * @private + */ +function createPredicate(expr) { + try { + // Use Function instead of eval for better security + return new Function( + 'row', + ` + try { + with (row) { + return ${expr}; + } + } catch (e) { + return false; + } + `, + ); + } catch (e) { + throw new Error(`Invalid expression: ${expr}. Error: ${e.message}`); + } +} + +// Export the expr$ method directly +export { expr$ }; diff --git a/packages/core/src/methods/dataframe/filtering/filter.js b/packages/core/src/methods/dataframe/filtering/filter.js new file mode 100644 index 0000000..d92e940 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/filter.js @@ -0,0 +1,92 @@ +/*-------------------------------------------------------------------------* + | DataFrame › filtering · filter() | + | | + | df.filter(row => row.age > 30) → new DataFrame with matching rows | + | Supports predicate functions and string expressions. | + *-------------------------------------------------------------------------*/ + +import { createTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Filters rows in a DataFrame based on a predicate function + * + * @param {Object} df - DataFrame instance + * @param {Function} predicate - Function to apply to each row + * @returns {Object} - New DataFrame with filtered rows + */ +export function filter(df, predicate) { + // Check that the argument is a function + if (typeof predicate !== 'function') { + throw new Error('Predicate must be a function'); + } + + // Convert DataFrame to array of rows + const rows = df.toArray(); + const allColumns = df.columns; + + // Apply predicate to each row + const filteredRows = rows.filter(predicate); + + // If no results, create an empty DataFrame with the same columns and column types + if (filteredRows.length === 0) { + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of allColumns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Create an empty array with the same type + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const emptyTypedArray = new TypedArrayConstructor(0); + result._columns[col] = createTypedSeries(emptyTypedArray, col, df); + } else { + result._columns[col] = createTypedSeries([], col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // For non-empty results, create a new DataFrame with filtered rows + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of allColumns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + const values = filteredRows.map(row => row[col]); + + // Preserve the array type if it's a typed array + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const typedValues = new TypedArrayConstructor(values.length); + values.forEach((value, i) => { + typedValues[i] = value; + }); + result._columns[col] = createTypedSeries(typedValues, col, df); + } else { + result._columns[col] = createTypedSeries(values, col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +// Export the filter method directly +export { filter }; diff --git a/packages/core/src/methods/dataframe/filtering/head.js b/packages/core/src/methods/dataframe/filtering/head.js new file mode 100644 index 0000000..ac96f02 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/head.js @@ -0,0 +1,45 @@ +/* -------------------------------------------------------------- * + | DataFrame → filtering · head() | + * -------------------------------------------------------------- */ + +/** + * Returns the first n rows of a DataFrame.
+ * `df.head(5)` → returns a new DataFrame with the first 5 rows. + * Similar to pandas' head() function. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {number} [n=5] - Number of rows to return + * @param {Object} [options] - Additional options + * @param {boolean} [options.print=false] - Option for compatibility with other libraries + * @returns {DataFrame} - New DataFrame with the first n rows + * @throws {Error} If n is not a positive integer + */ +export function head(df, n = 5, options = { print: false }) { + // Validate input parameters + if (n <= 0) { + throw new Error('Number of rows must be a positive integer'); + } + if (!Number.isInteger(n)) { + throw new Error('Number of rows must be an integer'); + } + + // Get data from DataFrame + const rows = df.toArray(); + + // Select first n rows (or all if there are fewer than n) + const selectedRows = rows.slice(0, n); + + // Create a new DataFrame from the selected rows + const builder = + typeof df.constructor.fromRecords === 'function' + ? df.constructor.fromRecords + : (rows) => new df.constructor(rows); + + return builder(selectedRows); +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { head }; + diff --git a/packages/core/src/methods/dataframe/filtering/iloc.js b/packages/core/src/methods/dataframe/filtering/iloc.js new file mode 100644 index 0000000..035e756 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/iloc.js @@ -0,0 +1,151 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · iloc() | + | | + | Выбор строк и колонок из DataFrame по целочисленным позициям. | + | | + | df.iloc(5) → выбор строки с индексом 5 | + | df.iloc([1, 3, 5]) → выбор строк с указанными индексами | + | df.iloc(5, 2) → выбор значения в строке 5, колонке 2 | + | df.iloc([1, 3], [0, 2]) → выбор строк 1,3 и колонок 0,2 | + *-------------------------------------------------------------------------*/ + +/** + * Method for selecting rows and columns by indices + * + * @module methods/dataframe/filtering/iloc + */ + +// Import function for creating typed arrays +import { createTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Method for selecting rows and columns by indices (similar to iloc in pandas) + * @param {DataFrame} df - DataFrame instance + * @param {number|number[]|function} rowSelector - Row index, array of indices, or predicate function + * @param {number|number[]|function} colSelector - Column index, array of indices, or predicate function + * @returns {DataFrame|*} - New DataFrame with selected rows and columns or a cell value + */ +export function iloc(df, rowSelector = null, colSelector = null) { + // Get all rows as array of objects + const rows = df.toArray(); + const allColumns = df.columns; + const rowCount = df.rowCount; + + if (rowCount === 0) { + throw new Error('Row index out of bounds'); + } + + // Indices of selected rows + let selectedIndices = []; + + // Process row selector + if (rowSelector === null || rowSelector === undefined) { + // If selector is null, select all rows + selectedIndices = Array.from({ length: rowCount }, (_, i) => i); + } else if (typeof rowSelector === 'number') { + // Single row index + const idx = rowSelector < 0 ? rowCount + rowSelector : rowSelector; + if (idx < 0 || idx >= rowCount) { + throw new Error('Row index out of bounds'); + } + selectedIndices = [idx]; + } else if (Array.isArray(rowSelector)) { + // Array of row indices + selectedIndices = rowSelector.map((idx) => { + const adjustedIdx = idx < 0 ? rowCount + idx : idx; + if (adjustedIdx < 0 || adjustedIdx >= rowCount) { + throw new Error('Row index out of bounds'); + } + return adjustedIdx; + }); + } else if (typeof rowSelector === 'function') { + // Function returning true/false for each row index + for (let i = 0; i < rowCount; i++) { + if (rowSelector(i)) { + selectedIndices.push(i); + } + } + } else { + throw new Error('Invalid row selector type'); + } + + // Indices of selected columns + let selectedColumnIndices = []; + + // Process column selector + if (colSelector === null || colSelector === undefined) { + // If selector is null, select all columns + selectedColumnIndices = Array.from({ length: allColumns.length }, (_, i) => i); + } else if (typeof colSelector === 'number') { + // Single column index + const idx = colSelector < 0 ? allColumns.length + colSelector : colSelector; + if (idx < 0 || idx >= allColumns.length) { + throw new Error('Column index out of bounds'); + } + selectedColumnIndices = [idx]; + } else if (Array.isArray(colSelector)) { + // Array of column indices + selectedColumnIndices = colSelector.map((idx) => { + const adjustedIdx = idx < 0 ? allColumns.length + idx : idx; + if (adjustedIdx < 0 || adjustedIdx >= allColumns.length) { + throw new Error('Column index out of bounds'); + } + return adjustedIdx; + }); + } else if (typeof colSelector === 'function') { + // Function returning true/false for each column index + for (let i = 0; i < allColumns.length; i++) { + if (colSelector(i)) { + selectedColumnIndices.push(i); + } + } + } else { + throw new Error('Invalid column selector type'); + } + + // Get names of selected columns + const selectedColumns = selectedColumnIndices.map((idx) => allColumns[idx]); + + // If only one row and one column are selected, return the value + if ( + selectedIndices.length === 1 && + selectedColumns.length === 1 && + typeof rowSelector === 'number' && + typeof colSelector === 'number' + ) { + return rows[selectedIndices[0]][selectedColumns[0]]; + } + + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each selected column, create a Series with the appropriate type + for (const col of selectedColumns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + const values = selectedIndices.map(index => rows[index][col]); + + // Preserve the array type if it's a typed array + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const typedValues = new TypedArrayConstructor(values.length); + values.forEach((value, i) => { + typedValues[i] = value; + }); + result._columns[col] = createTypedSeries(typedValues, col, df); + } else { + result._columns[col] = createTypedSeries(values, col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +// Export the method for the pool +export default { iloc }; diff --git a/packages/core/src/methods/dataframe/filtering/index.js b/packages/core/src/methods/dataframe/filtering/index.js new file mode 100644 index 0000000..82ff7cf --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/index.js @@ -0,0 +1,18 @@ +/** + * DataFrame filtering methods + * + * This module exports all filtering methods for DataFrame. + * Methods are registered using extendDataFrame. + * + * @module methods/dataframe/filtering + */ + +import { DataFrame } from '../../../data/model/index.js'; +import { extendDataFrame } from '../../../data/model/extendDataFrame.js'; +import * as pool from './pool.js'; + +// Register methods for DataFrame without namespace +extendDataFrame(DataFrame.prototype, pool); + +// Export methods directly for functional style calls +export * from './pool.js'; diff --git a/packages/core/src/methods/dataframe/filtering/loc.js b/packages/core/src/methods/dataframe/filtering/loc.js new file mode 100644 index 0000000..86f96bd --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/loc.js @@ -0,0 +1,295 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · loc() | + | | + | Selection of rows and columns from DataFrame by labels (names). | + | | + | df.loc(5) → select row with index 5 | + | df.loc([1, 3, 5]) → select rows with specified indices | + | df.loc(5, 'age') → select value in row 5, column 'age' | + | df.loc([1, 3], ['name', 'age']) → select rows 1,3 and columns 'name','age' | + | df.loc(row => row.age > 30) → select rows where age > 30 | + | df.loc({city: 'Chicago'}) → select rows where city equals 'Chicago' | + *-------------------------------------------------------------------------*/ + +/** + * Row and column selection by label or position + * + * @module methods/dataframe/filtering/loc + */ + +import { createTypedArray } from '../../../data/utils/createTypedArray.js'; + +/** + * Selects rows and columns by label or position + * + * @param {DataFrame} df - DataFrame to select from + * @param {*} rowSelector - Row selector (label, array of labels, predicate function, or condition object) + * @param {*} colSelector - Column selector (name, array of names, or null for all columns) + * @returns {DataFrame} - New DataFrame with selected rows and columns + */ +export function loc(df, rowSelector, colSelector) { + // Get data from DataFrame + const rows = df.toArray(); + const rowCount = df.rowCount; + + // Define rows to select + let selectedRows = []; + let selectedIndices = []; + + // Check if DataFrame has an index set + const hasIndex = df._index !== null && df._indexMap !== undefined && df._indexMap.size > 0; + + if (rowSelector === null) { + // If rowSelector is null, select all rows + selectedRows = [...rows]; + selectedIndices = Array.from({ length: rowCount }, (_, i) => i); + } else if (Array.isArray(rowSelector)) { + // If rowSelector is an array of indices or labels + if (hasIndex) { + // Use index for selection + selectedIndices = []; + selectedRows = []; + + for (const label of rowSelector) { + const index = df._indexMap.get(label); + if (index === undefined) { + throw new Error('Row label not found'); + } + selectedIndices.push(index); + selectedRows.push(rows[index]); + } + } else { + // Use numeric indices + for (const index of rowSelector) { + if (index < 0 || index >= rowCount) { + throw new Error( + `Row index ${index} is out of bounds for DataFrame with ${rowCount} rows`, + ); + } + } + selectedIndices = rowSelector; + selectedRows = rows.filter((_, index) => rowSelector.includes(index)); + } + } else if (typeof rowSelector === 'number' || typeof rowSelector === 'string') { + // If rowSelector is a number or string (index or label) + if (hasIndex && typeof rowSelector === 'string') { + // Use index for selection + const index = df._indexMap.get(rowSelector); + if (index === undefined) { + throw new Error('Row label not found'); + } + selectedIndices = [index]; + selectedRows = [rows[index]]; + } else if (typeof rowSelector === 'number') { + // Use numeric index + if (rowSelector < 0 || rowSelector >= rowCount) { + throw new Error( + `Row index ${rowSelector} is out of bounds for DataFrame with ${rowCount} rows`, + ); + } + selectedIndices = [rowSelector]; + selectedRows = [rows[rowSelector]]; + } else { + throw new Error('Row label not found'); + } + } else if (typeof rowSelector === 'function') { + // If rowSelector is a predicate function + selectedRows = rows.filter(rowSelector); + selectedIndices = rows + .map((row, index) => (rowSelector(row) ? index : -1)) + .filter((index) => index !== -1); + } else if (typeof rowSelector === 'object' && rowSelector !== null) { + // If rowSelector is an object with conditions + selectedIndices = []; + selectedRows = []; + rows.forEach((row, index) => { + let match = true; + for (const [key, value] of Object.entries(rowSelector)) { + if (row[key] !== value) { + match = false; + break; + } + } + if (match) { + selectedIndices.push(index); + selectedRows.push(row); + } + }); + } else { + throw new Error('Invalid row selector type'); + } + + // If column selector is not specified, return all columns + if (colSelector === undefined) { + // If only one row is selected and rowSelector is not a function, we need to decide + // whether to return an object or a DataFrame with one row + if (selectedRows.length === 1 && typeof rowSelector !== 'function') { + // In tests, we need to return a DataFrame with rowCount property + // Create a DataFrame with one row + const result = df.constructor.fromRecords([selectedRows[0]], df._options); + + // Copy column metadata to preserve typed arrays + for (const col of result.columns) { + if (df._columns[col] && df._columns[col].vector && df._columns[col].vector.__data) { + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + // Create a new typed array with the same type + const newArray = new TypedArrayConstructor([selectedRows[0][col]]); + result._columns[col].vector.__data = newArray; + } + } + } + + return result; + } + + // If no results, create an empty DataFrame with the same columns + if (selectedRows.length === 0) { + const emptyData = {}; + for (const col of df.columns) { + // Preserve array type if it's a typed array + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + emptyData[col] = new TypedArrayConstructor(0); + } else { + emptyData[col] = []; + } + } + return new df.constructor(emptyData, df._options); + } + + // Create a new DataFrame with the same options as the original + const result = df.constructor.fromRecords(selectedRows, df._options); + + // Process each column to preserve typed arrays + for (const col of df.columns) { + if (df._columns[col] && df._columns[col].vector && df._columns[col].vector.__data) { + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray)) { + // Get column options if specified + const columnOptions = df._options?.columns?.[col] || {}; + + // Extract values for this column from selected rows + const values = selectedRows.map(row => row[col]); + + // Create a new typed array with the same type + const newArray = createTypedArray(values, originalArray, columnOptions); + + // Replace the array in the result DataFrame + if (result._columns[col] && result._columns[col].vector) { + result._columns[col].vector.__data = newArray; + } + } + } + } + + return result; + } + + // Define columns to select + let selectedColumns = []; + + if (colSelector === null) { + // If colSelector is null, select all columns + selectedColumns = df.columns; + } else if (Array.isArray(colSelector)) { + // If colSelector is an array of column names + selectedColumns = colSelector; + } else if (typeof colSelector === 'string') { + // If colSelector is a single column name + selectedColumns = [colSelector]; + } else { + throw new Error('Invalid column selector type'); + } + + // Check that all specified columns exist + for (const column of selectedColumns) { + if (!df.columns.includes(column)) { + throw new Error('Column not found'); + } + } + + // If only one row and one column are selected, return the value + if ( + selectedRows.length === 1 && + selectedColumns.length === 1 && + typeof rowSelector !== 'function' + ) { + return selectedRows[0][selectedColumns[0]]; + } + + // If no results, create an empty DataFrame with selected columns + if (selectedRows.length === 0) { + const emptyData = {}; + for (const col of selectedColumns) { + // Preserve array type if it's a typed array + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + emptyData[col] = new TypedArrayConstructor(0); + } else { + emptyData[col] = []; + } + } + return new df.constructor(emptyData, df._options); + } + + // If only one row and one column are selected, but we need a DataFrame + if (selectedRows.length === 1 && selectedColumns.length === 1 && typeof rowSelector === 'function') { + const singleColData = {}; + const col = selectedColumns[0]; + const value = selectedRows[0][col]; + + // Preserve array type if it's a typed array + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + singleColData[col] = new TypedArrayConstructor([value]); + } else { + singleColData[col] = [value]; + } + + return new df.constructor(singleColData, df._options); + } + + // Create a new DataFrame with only selected columns + const filteredRows = selectedRows.map(row => { + const filteredRow = {}; + for (const col of selectedColumns) { + filteredRow[col] = row[col]; + } + return filteredRow; + }); + + // Create a new DataFrame with the same options as the original + const result = df.constructor.fromRecords(filteredRows, df._options); + + // Process each column to preserve typed arrays + for (const col of selectedColumns) { + if (df._columns[col] && df._columns[col].vector && df._columns[col].vector.__data) { + const originalArray = df._columns[col].vector.__data; + if (ArrayBuffer.isView(originalArray)) { + // Get column options if specified + const columnOptions = df._options?.columns?.[col] || {}; + + // Extract values for this column from filtered rows + const values = filteredRows.map(row => row[col]); + + // Create a new typed array with the same type + const newArray = createTypedArray(values, originalArray, columnOptions); + + // Replace the array in the result DataFrame + if (result._columns[col] && result._columns[col].vector) { + result._columns[col].vector.__data = newArray; + } + } + } + } + + return result; +} + +// Export the loc method directly +export { loc }; diff --git a/packages/core/src/methods/dataframe/filtering/pool.js b/packages/core/src/methods/dataframe/filtering/pool.js new file mode 100644 index 0000000..7807656 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/pool.js @@ -0,0 +1,30 @@ +/** + * DataFrame filtering method pool + * + * This file re-exports all filtering methods for use with extendDataFrame + * + * @module methods/dataframe/filtering/pool + */ + +// Row filtering methods +export { filter } from './filter.js'; +export { query } from './query.js'; +export { where } from './where.js'; +export { expr$ } from './expr$.js'; +export { query$ } from './query$.js'; + +// Row sampling methods +export { sample } from './sample.js'; +export { stratifiedSample } from './stratifiedSample.js'; +export { head } from './head.js'; +export { tail } from './tail.js'; + +// Column selection methods +export { select } from './select.js'; +export { drop } from './drop.js'; +export { selectByPattern } from './selectByPattern.js'; + +// Row/column access methods +export { at } from './at.js'; +export { iloc } from './iloc.js'; +export { loc } from './loc.js'; diff --git a/packages/core/src/methods/dataframe/filtering/query$.js b/packages/core/src/methods/dataframe/filtering/query$.js new file mode 100644 index 0000000..d2a13de --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/query$.js @@ -0,0 +1,116 @@ +/** + * Filtering method: query$ + * + * This file provides the query$ method for filtering DataFrame rows using template literals + * for more intuitive syntax + * + * @module methods/dataframe/filtering/query$ + */ + +/** + * Filters rows in a DataFrame using a template literal expression. + * This provides a more intuitive syntax for filtering. + * + * @param {Object} df - DataFrame instance + * @param {TemplateStringsArray} strings - Template strings array + * @param {...any} values - Values to interpolate into the template + * @returns {Object} - New DataFrame with filtered rows + * + * @example + * // Filter rows where age > 40 + * df.query$`age > 40` + * // Filter rows where age > 30 and salary > 100000 + * df.query$`age > 30 && salary > 100000` + * // Filter rows where city includes "Francisco" + * df.query$`city_includes("Francisco")` + */ +export function query$(df, strings, ...values) { + // Create an expression from the template string + const expression = String.raw({ raw: strings }, ...values); + + // Transform the expression, replacing string methods with special functions + const processedExpr = expression + .replace(/([a-zA-Z0-9_]+)_includes\(([^)]+)\)/g, '$1.includes($2)') + .replace(/([a-zA-Z0-9_]+)_startsWith\(([^)]+)\)/g, '$1.startsWith($2)') + .replace(/([a-zA-Z0-9_]+)_endsWith\(([^)]+)\)/g, '$1.endsWith($2)') + .replace(/([a-zA-Z0-9_]+)_match\(([^)]+)\)/g, '$1.match($2)'); + + // Create a predicate function for filtering rows + const predicate = createPredicate(processedExpr); + + // Get DataFrame rows + const rows = df.toArray(); + + // Filter rows by predicate + const filteredRows = rows.filter((row) => predicate(row)); + + // If no matching rows, return an empty DataFrame with the same structure + if (filteredRows.length === 0) { + const emptyData = {}; + for (const col of df.columns) { + emptyData[col] = []; + } + return new df.constructor(emptyData, df._options); + } + + // Create a new DataFrame from filtered rows while preserving array types + const filteredData = {}; + const allColumns = df.columns; + + // Get indices of rows that passed the filter + const selectedIndices = []; + for (let i = 0; i < rows.length; i++) { + if (predicate(rows[i])) { + selectedIndices.push(i); + } + } + + // Create new columns while preserving array types + for (const col of allColumns) { + const originalArray = df.col(col).toArray(); + const values = selectedIndices.map((index) => originalArray[index]); + + // If the original array was typed, create a new typed array + if ( + ArrayBuffer.isView(originalArray) && + !(originalArray instanceof DataView) + ) { + const TypedArrayConstructor = originalArray.constructor; + filteredData[col] = new TypedArrayConstructor(values); + } else { + filteredData[col] = values; + } + } + + return new df.constructor(filteredData, df._options); +} + +/** + * Create a predicate function for filtering rows + * + * @param {string} expr - Expression to evaluate + * @returns {Function} - Predicate function + * @private + */ +function createPredicate(expr) { + try { + // Use Function instead of eval for better security + return new Function( + 'row', + ` + try { + with (row) { + return ${expr}; + } + } catch (e) { + return false; + } + `, + ); + } catch (e) { + throw new Error(`Invalid expression: ${expr}. Error: ${e.message}`); + } +} + +// Export the query$ method directly +export { query$ }; diff --git a/packages/core/src/methods/dataframe/filtering/query.js b/packages/core/src/methods/dataframe/filtering/query.js new file mode 100644 index 0000000..99a7318 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/query.js @@ -0,0 +1,253 @@ +/*-------------------------------------------------------------------------* + | DataFrame › filtering · query() | + | | + | df.query("SELECT * WHERE age > 30") → new DataFrame with matching rows | + | Supports SQL-like syntax. | + *-------------------------------------------------------------------------*/ + +import { createTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Filters DataFrame rows using SQL-like syntax + * + * @param {Object} df - DataFrame instance + * @param {string} queryString - SQL-like query string + * @returns {Object} - New DataFrame with filtered rows + */ +export function query(df, queryString) { + if (typeof queryString !== 'string') { + throw new Error('Query must be a string'); + } + + // Parse SQL-like query + const parsedQuery = parseQuery(queryString); + + // Determine which columns to include in the result + const columnsToInclude = parsedQuery.columns[0] === '*' ? + df.columns : + parsedQuery.columns.filter(col => df.columns.includes(col)); + + // Get data from DataFrame + let rows = df.toArray(); + + // Apply WHERE condition if present + if (parsedQuery.whereClause) { + const evaluateQuery = createQueryEvaluator(parsedQuery.whereClause); + rows = rows.filter((row) => { + try { + return evaluateQuery(row); + } catch (e) { + throw new Error(`Error evaluating query for row: ${e.message}`); + } + }); + } + + // Apply ORDER BY sorting if present + if (parsedQuery.orderBy) { + const { column, direction } = parsedQuery.orderBy; + rows.sort((a, b) => { + const valueA = a[column]; + const valueB = b[column]; + + if (valueA === valueB) return 0; + + const comparison = valueA < valueB ? -1 : 1; + return direction === 'ASC' ? comparison : -comparison; + }); + } + + // Apply LIMIT restriction if present + if (parsedQuery.limit !== null) { + rows = rows.slice(0, parsedQuery.limit); + } + + // If no rows, return an empty DataFrame with the same columns and column types + if (rows.length === 0) { + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of df.columns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Create an empty array with the same type + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const emptyTypedArray = new TypedArrayConstructor(0); + result._columns[col] = createTypedSeries(emptyTypedArray, col, df); + } else { + result._columns[col] = createTypedSeries([], col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // For non-empty results, create a new DataFrame with filtered rows + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // Determine which columns to include based on the query + const columnsToProcess = parsedQuery.columns[0] === '*' ? df.columns : columnsToInclude; + + // For each column, create a Series with the appropriate type + for (const col of columnsToProcess) { + // Skip columns that don't exist in the original DataFrame + if (!df.columns.includes(col)) continue; + + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Extract values for this column from the filtered rows + const values = rows.map(row => row[col]); + + // Preserve the array type if it's a typed array + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const typedValues = new TypedArrayConstructor(values.length); + values.forEach((value, i) => { + typedValues[i] = value; + }); + result._columns[col] = createTypedSeries(typedValues, col, df); + } else { + result._columns[col] = createTypedSeries(values, col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +/** + * Parses an SQL-like query string into its components + * + * @param {string} queryString - SQL-like query string + * @returns {Object} - Parsed query components + * @private + */ +function parseQuery(queryString) { + // Initialize default values + const result = { + columns: ['*'], + whereClause: null, + orderBy: null, + limit: null + }; + + // Extract LIMIT clause if present + const limitMatch = queryString.match(/\s+LIMIT\s+(\d+)\s*$/i); + if (limitMatch) { + result.limit = parseInt(limitMatch[1], 10); + queryString = queryString.replace(/\s+LIMIT\s+\d+\s*$/i, ''); + } + + // Extract ORDER BY clause if present + const orderByMatch = queryString.match(/\s+ORDER\s+BY\s+([\w.]+)(?:\s+(ASC|DESC))?\s*$/i); + if (orderByMatch) { + result.orderBy = { + column: orderByMatch[1], + direction: (orderByMatch[2] || 'ASC').toUpperCase() + }; + queryString = queryString.replace(/\s+ORDER\s+BY\s+[\w.]+(?:\s+(?:ASC|DESC))?\s*$/i, ''); + } + + // Extract SELECT and WHERE parts + const selectMatch = queryString.match(/^\s*SELECT\s+(.+?)(?:\s+WHERE\s+(.+))?\s*$/i); + if (selectMatch) { + // Parse columns + const columnsStr = selectMatch[1].trim(); + if (columnsStr !== '*') { + result.columns = columnsStr.split(',').map(col => col.trim()); + } + + // Parse WHERE clause + if (selectMatch[2]) { + result.whereClause = selectMatch[2].trim(); + } + } else { + // If no SELECT keyword, treat the whole string as WHERE clause + result.whereClause = queryString.trim(); + } + + return result; +} + +/** + * Creates a function to evaluate a WHERE clause + * + * @param {string} whereClause - WHERE clause from SQL-like query + * @returns {Function} - Function evaluating the clause for a row + * @private + */ +function createQueryEvaluator(whereClause) { + if (!whereClause) { + return () => true; // No WHERE clause means all rows match + } + + // Transform SQL-like query into JavaScript expression + let jsQuery = whereClause; + + // Process logical operators first (to avoid conflicts with BETWEEN...AND) + jsQuery = jsQuery + .replace(/\bAND\b/gi, '&&') + .replace(/\bOR\b/gi, '||') + .replace(/\bNOT\b/gi, '!'); + + // Process basic comparison operators + jsQuery = jsQuery + // Replace single equals with double equals + .replace(/([\w.]+)\s*=\s*([^=\s][^=]*)/g, '$1 == $2') + // Process IN operator + .replace( + /([\w.]+)\s+IN\s+\(([^)]+)\)/gi, + (match, col, values) => { + // Split values by comma and remove extra spaces + const cleanValues = values.split(',').map(v => v.trim()).join(', '); + return `[${cleanValues}].includes(${col})`; + } + ) + // Process LIKE with % at beginning and end (contains) + .replace(/([\w.]+)\s+LIKE\s+['"]%(.+?)%['"]\s*/gi, '$1.toString().includes("$2")') + // Process LIKE with % at end (starts with) + .replace(/([\w.]+)\s+LIKE\s+['"](.+?)%['"]\s*/gi, '$1.toString().startsWith("$2")') + // Process LIKE with % at beginning (ends with) + .replace(/([\w.]+)\s+LIKE\s+['"]%(.+?)['"]\s*/gi, '$1.toString().endsWith("$2")') + // Process BETWEEN + .replace( + /([\w.]+)\s+BETWEEN\s+(\S+)\s+AND\s+(\S+)/gi, + '($1 >= $2 && $1 <= $3)' + ); + + // Create function to evaluate the query + try { + return new Function( + 'row', + ` + try { + with (row) { + return ${jsQuery}; + } + } catch (e) { + return false; + } + ` + ); + } catch (e) { + throw new Error(`Invalid query syntax: ${e.message}`); + } +} + +// Export object with method for the pool +export default { query }; diff --git a/packages/core/src/methods/dataframe/filtering/sample.js b/packages/core/src/methods/dataframe/filtering/sample.js new file mode 100644 index 0000000..8dc3e50 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/sample.js @@ -0,0 +1,117 @@ +/* -------------------------------------------------------------- * + | DataFrame → filtering · sample() | + * -------------------------------------------------------------- */ + +/** + * Returns a random sample of rows from a DataFrame.
+ * `df.sample(10)` → returns a new DataFrame with 10 randomly selected rows.
+ * `df.sample({ fraction: 0.1 })` → returns a sample of 10% of rows. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {number|Object} n - Number of rows to sample or options object + * @param {Object} [options] - Additional options + * @param {number} [options.seed] - Seed for random number generator + * @param {boolean} [options.replace=false] - Sample with replacement + * @param {number} [options.fraction] - Fraction of rows to sample (0 < fraction <= 1) + * @returns {DataFrame} - New DataFrame with sampled rows + * @throws {Error} If sampling parameters are invalid + */ +export function sample(df, n, options = {}) { + // Handle case when n is an options object + if (typeof n === 'object') { + options = n; + n = undefined; + } + + // Get data from DataFrame + const rows = df.toArray(); + if (rows.length === 0) { + // For empty DataFrame, return an empty DataFrame with the same structure + const builder = + typeof df.constructor.fromRecords === 'function' + ? df.constructor.fromRecords + : (rows) => new df.constructor(rows); + + return builder([]); + } + + // Determine sample size + let sampleSize; + if (options.fraction !== undefined) { + if (options.fraction <= 0 || options.fraction > 1) { + throw new Error('Fraction must be in the range (0, 1]'); + } + sampleSize = Math.round(rows.length * options.fraction); + } else { + sampleSize = n !== undefined ? n : 1; + } + + // Validate sample size + if (sampleSize <= 0) { + throw new Error('Number of rows to sample must be a positive integer'); + } + + // Check that sample size is an integer + if (!Number.isInteger(sampleSize)) { + throw new Error('Number of rows to sample must be an integer'); + } + + // If sampling without replacement and sample size is greater than number of rows + if (!options.replace && sampleSize > rows.length) { + throw new Error( + `Sample size (${sampleSize}) cannot be greater than number of rows (${rows.length})` + ); + } + + // Create random number generator with seed if specified + const random = + options.seed !== undefined ? createSeededRandom(options.seed) : Math.random; + + // Sample rows + const sampledRows = []; + if (options.replace) { + // Sampling with replacement + for (let i = 0; i < sampleSize; i++) { + const index = Math.floor(random() * rows.length); + sampledRows.push(rows[index]); + } + } else { + // Sampling without replacement (using Fisher-Yates shuffle algorithm) + const indices = Array.from({ length: rows.length }, (_, i) => i); + for (let i = indices.length - 1; i > 0; i--) { + const j = Math.floor(random() * (i + 1)); + [indices[i], indices[j]] = [indices[j], indices[i]]; + } + for (let i = 0; i < sampleSize; i++) { + sampledRows.push(rows[indices[i]]); + } + } + + // Create a new DataFrame from the sampled rows + const builder = + typeof df.constructor.fromRecords === 'function' + ? df.constructor.fromRecords + : (rows) => new df.constructor(rows); + + return builder(sampledRows); +} + +/** + * Creates a seeded random number generator + * + * @param {number} seed - Seed for the random number generator + * @returns {Function} - Function that returns a pseudo-random number in the range [0, 1) + * @private + */ +function createSeededRandom(seed) { + return function () { + // Simple linear congruential generator + seed = (seed * 9301 + 49297) % 233280; + return seed / 233280; + }; +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { sample }; diff --git a/packages/core/src/methods/dataframe/filtering/sample.js.new b/packages/core/src/methods/dataframe/filtering/sample.js.new new file mode 100644 index 0000000..acea678 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/sample.js.new @@ -0,0 +1,151 @@ +/** + * Filtering method: sample + * + * This file provides the sample method for selecting a random sample of rows from a DataFrame + * + * @module methods/dataframe/filtering/sample + */ + +import { createTypedSeries, createEmptyTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Selects a random sample of rows from DataFrame + * + * @param {Object} df - DataFrame instance + * @param {number|Object} n - Number of rows to sample or options object + * @param {Object} [options] - Additional options + * @param {number} [options.seed] - Seed for random number generator + * @param {boolean} [options.replace=false] - Sampling with replacement + * @param {boolean} [options.fraction] - Fraction of rows to sample (0 < fraction <= 1) + * @returns {Object} - New DataFrame with sampled rows + */ +export function sample(df, n, options = {}) { + // Handle case when n is an options object + if (typeof n === 'object') { + options = n; + n = undefined; + } + + // Get data from DataFrame + const rows = df.toArray(); + if (rows.length === 0) { + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For empty results, we need to include all original columns + for (const col of df.columns) { + // Create an empty Series with the same type as the original + result._columns[col] = createEmptyTypedSeries(col, df); + + // Add to order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // Determine the number of rows to sample + let sampleSize; + if (options.fraction !== undefined) { + if (options.fraction <= 0 || options.fraction > 1) { + throw new Error('Fraction must be in the range (0, 1]'); + } + sampleSize = Math.round(rows.length * options.fraction); + } else { + sampleSize = n !== undefined ? n : 1; + } + + // Check the validity of the number of rows + if (sampleSize <= 0) { + throw new Error('Number of rows to sample must be a positive number'); + } + + // Check that the sample size is an integer + if (!Number.isInteger(sampleSize)) { + throw new Error('Number of rows to sample must be an integer'); + } + + // If sampling without replacement and sample size is greater than number of rows + if (!options.replace && sampleSize > rows.length) { + throw new Error( + `Sample size (${sampleSize}) cannot be greater than number of rows (${rows.length})`, + ); + } + + // Create a random number generator with seed if specified + const random = + options.seed !== undefined ? createSeededRandom(options.seed) : Math.random; + + // Select rows + const sampledRows = []; + if (options.replace) { + // Sampling with replacement + for (let i = 0; i < sampleSize; i++) { + const index = Math.floor(random() * rows.length); + sampledRows.push(rows[index]); + } + } else { + // Sampling without replacement (using Fisher-Yates algorithm) + const indices = Array.from({ length: rows.length }, (_, i) => i); + for (let i = indices.length - 1; i > 0; i--) { + const j = Math.floor(random() * (i + 1)); + [indices[i], indices[j]] = [indices[j], indices[i]]; + } + for (let i = 0; i < sampleSize; i++) { + sampledRows.push(rows[indices[i]]); + } + } + + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // If no rows match, return an empty DataFrame with all original columns + if (sampledRows.length === 0) { + // For empty results, we need to include all original columns + for (const col of df.columns) { + // Create an empty Series with the same type as the original + result._columns[col] = createEmptyTypedSeries(col, df); + + // Add to order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // For non-empty results + // Create Series for each column + for (const col of df.columns) { + // Extract values for this column from filtered rows + const values = sampledRows.map(row => row[col]); + + // Create a new Series with the appropriate type + result._columns[col] = createTypedSeries(values, col, df); + + // Add to order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +/** + * Creates a random number generator with seed + * + * @param {number} seed - Seed for random number generator + * @returns {Function} - Function returning pseudorandom number in range [0, 1) + * @private + */ +function createSeededRandom(seed) { + return function () { + // Simple linear congruential generator + seed = (seed * 9301 + 49297) % 233280; + return seed / 233280; + }; +} diff --git a/packages/core/src/methods/dataframe/filtering/select.js b/packages/core/src/methods/dataframe/filtering/select.js new file mode 100644 index 0000000..75d248a --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/select.js @@ -0,0 +1,62 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · select() | + | | + | df.select(['age', 'name']) → new DataFrame with only the specified | + | columns. | + *-------------------------------------------------------------------------*/ + +/** + * Returns a new DataFrame with only the specified columns. + * `df.select(['name', 'age'])` → returns a new DataFrame with only the 'name' and 'age' columns. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {Array} columns - Array of column names to select + * @returns {DataFrame} - New DataFrame with only the specified columns + * @throws {Error} If any column does not exist or if columns is empty + */ +export function select(df, columns) { + // Validate input parameters + if (!Array.isArray(columns)) { + throw new Error('Columns must be an array'); + } + + if (columns.length === 0) { + throw new Error('Column list cannot be empty'); + } + + // Validate that all columns exist + for (const col of columns) { + if (!df.columns.includes(col)) { + throw new Error(`Column '${col}' not found`); + } + } + + // Create records with only the selected columns + const records = df.toArray().map(row => { + const newRow = {}; + for (const col of columns) { + newRow[col] = row[col]; + } + return newRow; + }); + + // Create options for the new DataFrame with column type information + const newOptions = { ...df._options }; + + // If there are column type definitions, filter them to include only selected columns + if (newOptions.columns) { + const filteredColumns = {}; + for (const col of columns) { + if (newOptions.columns[col]) { + filteredColumns[col] = newOptions.columns[col]; + } + } + newOptions.columns = filteredColumns; + } + + // Create new DataFrame from records with preserved column types + return df.constructor.fromRecords(records, newOptions); +} + +// Export object with method for the pool +export default { select }; diff --git a/packages/core/src/methods/dataframe/filtering/selectByPattern.js b/packages/core/src/methods/dataframe/filtering/selectByPattern.js new file mode 100644 index 0000000..254580e --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/selectByPattern.js @@ -0,0 +1,65 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · selectByPattern() | + | | + | df.selectByPattern(/^price/) → new DataFrame with only columns | + | whose names match the regular expression. | + *-------------------------------------------------------------------------*/ + +/** + * Returns a new DataFrame with only columns whose names match the pattern. + * `df.selectByPattern(/^price/)` → returns a new DataFrame with columns that start with 'price'. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {RegExp|string} pattern - Regular expression or string pattern to match + * @returns {DataFrame} - New DataFrame with only the matched columns + * @throws {Error} If no columns match the pattern + * @throws {TypeError} If pattern is not a string or regular expression + */ +export function selectByPattern(df, pattern) { + // Validate pattern type + if (typeof pattern !== 'string' && !(pattern instanceof RegExp)) { + throw new TypeError('Pattern must be a string or regular expression'); + } + + // Convert string to regular expression if needed + const regex = pattern instanceof RegExp ? pattern : new RegExp(pattern); + + // Find columns matching the pattern + const matchedColumns = df.columns.filter((column) => regex.test(column)); + + // If no columns match, throw an error + if (matchedColumns.length === 0) { + throw new Error('No columns match the pattern'); + } + + // Create records with only the matched columns + const records = df.toArray().map(row => { + const newRow = {}; + for (const col of matchedColumns) { + newRow[col] = row[col]; + } + return newRow; + }); + + // Create options for the new DataFrame with column type information + const newOptions = { ...df._options }; + + // If there are column type definitions, filter them to include only matched columns + if (newOptions.columns) { + const filteredColumns = {}; + for (const col of matchedColumns) { + if (newOptions.columns[col]) { + filteredColumns[col] = newOptions.columns[col]; + } + } + newOptions.columns = filteredColumns; + } + + // Create new DataFrame from records with preserved column types + return df.constructor.fromRecords(records, newOptions); +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { selectByPattern }; diff --git a/packages/core/src/methods/dataframe/filtering/stratifiedSample.js b/packages/core/src/methods/dataframe/filtering/stratifiedSample.js new file mode 100644 index 0000000..04a1b37 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/stratifiedSample.js @@ -0,0 +1,180 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · stratifiedSample() | + | | + | df.stratifiedSample('category', 100) → sample of 100 rows preserving | + | category proportions. | + | df.stratifiedSample('category', { frac: 0.1 }) → sample of 10% rows. | + *-------------------------------------------------------------------------*/ + +import { createTypedSeries } from '../../../data/utils/createTypedArray.js'; + +/** + * Selects a stratified sample from a DataFrame, preserving category proportions. + * + * @param {Object} df - DataFrame instance + * @param {string} stratifyColumn - Column name to stratify by + * @param {number|Object} nOrOptions - Number of rows to sample or options object with frac property + * @param {Object} [options] - Additional options + * @param {number} [options.seed] - Seed for random number generator + * @returns {Object} - New DataFrame with sampled rows + */ +export function stratifiedSample(df, stratifyColumn, nOrOptions, options = {}) { + // Check that DataFrame is not empty + if (df.rowCount === 0) { + throw new Error('DataFrame is empty'); + } + + // Check if the stratify column exists + if (!df.columns.includes(stratifyColumn)) { + throw new Error("Column not found"); + } + + // Determine if we're using count (n) or fraction (frac) + let n; + let fraction; + + if (typeof nOrOptions === 'object' && nOrOptions !== null) { + // Use options object with frac property + fraction = nOrOptions.frac; + if (fraction === undefined) { + throw new Error('When using options object, frac property must be specified'); + } + if (fraction <= 0 || fraction > 1) { + throw new Error('Fraction must be in the range (0, 1]'); + } + // Calculate n based on fraction + n = Math.round(df.rowCount * fraction); + // Merge options + options = { ...nOrOptions, ...options }; + } else { + // Use n (count) directly + n = nOrOptions; + // Validate n + if (typeof n !== 'number') { + throw new Error('Number of rows to sample must be a number'); + } + if (n < 0) { + throw new Error('Number of rows to sample must be a positive number'); + } + if (!Number.isInteger(n)) { + throw new Error('Number of rows to sample must be an integer'); + } + if (n > df.rowCount) { + throw new Error(`Sample size (${n}) cannot be greater than number of rows (${df.rowCount})`); + } + // Calculate fraction based on n + fraction = n / df.rowCount; + } + + // Get data from DataFrame + const rows = df.toArray(); + + // Group rows by categories + const categories = {}; + rows.forEach((row) => { + const category = row[stratifyColumn]; + if (!categories[category]) { + categories[category] = []; + } + categories[category].push(row); + }); + + // Create random number generator with seed if specified + const random = + options.seed !== undefined ? createSeededRandom(options.seed) : Math.random; + + // Sample rows from each category, preserving proportions + const sampledRows = []; + Object.entries(categories).forEach(([category, categoryRows]) => { + // Calculate number of rows to sample from this category + let sampleSize = Math.round(categoryRows.length * fraction); + + // Ensure each category has at least one row + sampleSize = Math.max(1, sampleSize); + sampleSize = Math.min(categoryRows.length, sampleSize); + + // Shuffle rows and select the required number + const shuffled = [...categoryRows].sort(() => 0.5 - random()); + sampledRows.push(...shuffled.slice(0, sampleSize)); + }); + + // If no results, create an empty DataFrame with the same columns and column types + if (sampledRows.length === 0) { + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of df.columns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Create an empty array with the same type + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const emptyTypedArray = new TypedArrayConstructor(0); + result._columns[col] = createTypedSeries(emptyTypedArray, col, df); + } else { + result._columns[col] = createTypedSeries([], col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; + } + + // For non-empty results, create a new DataFrame with filtered rows + // Create a new DataFrame instance with the same options as the original + const result = new df.constructor({}, df._options); + + // For each column, create a Series with the appropriate type + for (const col of df.columns) { + // Get the original column data to determine its type + const originalColumn = df._columns[col]; + const originalArray = originalColumn.vector.__data; + + // Extract values for this column from the sampled rows + const values = sampledRows.map(row => row[col]); + + // Preserve the array type if it's a typed array + if (ArrayBuffer.isView(originalArray) && !(originalArray instanceof DataView)) { + const TypedArrayConstructor = originalArray.constructor; + const typedValues = new TypedArrayConstructor(values.length); + values.forEach((value, i) => { + typedValues[i] = value; + }); + result._columns[col] = createTypedSeries(typedValues, col, df); + } else { + result._columns[col] = createTypedSeries(values, col, df); + } + + // Add to column order + if (!result._order.includes(col)) { + result._order.push(col); + } + } + + return result; +} + +/** + * Creates a seeded random number generator + * + * @param {number} seed - Seed for random number generator + * @returns {Function} - Function returning a pseudo-random number in range [0, 1) + * @private + */ +function createSeededRandom(seed) { + return function () { + // Simple linear congruential generator + seed = (seed * 9301 + 49297) % 233280; + return seed / 233280; + }; +} + +// Export object with method for the pool +export default { stratifiedSample }; diff --git a/packages/core/src/methods/dataframe/filtering/tail.js b/packages/core/src/methods/dataframe/filtering/tail.js new file mode 100644 index 0000000..dbe5ffe --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/tail.js @@ -0,0 +1,45 @@ +/* -------------------------------------------------------------- * + | DataFrame → filtering · tail() | + * -------------------------------------------------------------- */ + +/** + * Returns the last n rows of a DataFrame.
+ * `df.tail(5)` → returns a new DataFrame with the last 5 rows. + * Similar to pandas' tail() function. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {number} [n=5] - Number of rows to return + * @param {Object} [options] - Additional options + * @param {boolean} [options.print=false] - Option for compatibility with other libraries + * @returns {DataFrame} - New DataFrame with the last n rows + * @throws {Error} If n is not a positive integer + */ +export function tail(df, n = 5, options = { print: false }) { + // Validate input parameters + if (n <= 0) { + throw new Error('Number of rows must be a positive integer'); + } + if (!Number.isInteger(n)) { + throw new Error('Number of rows must be an integer'); + } + + // Get data from DataFrame + const rows = df.toArray(); + + // Select last n rows (or all if there are fewer than n) + const selectedRows = rows.slice(-n); + + // Create a new DataFrame from the selected rows + const builder = + typeof df.constructor.fromRecords === 'function' + ? df.constructor.fromRecords + : (rows) => new df.constructor(rows); + + return builder(selectedRows); +} + +/* -------------------------------------------------------------- * + | Pool for extendDataFrame | + * -------------------------------------------------------------- */ +export default { tail }; + diff --git a/packages/core/src/methods/dataframe/filtering/where.js b/packages/core/src/methods/dataframe/filtering/where.js new file mode 100644 index 0000000..37dd417 --- /dev/null +++ b/packages/core/src/methods/dataframe/filtering/where.js @@ -0,0 +1,62 @@ +/*-------------------------------------------------------------------------* + | DataFrame -› filtering · where() | + | | + | df.where('price', '>', 100) → new DataFrame with only rows where the | + | 'price' column values are greater than 100. | + *-------------------------------------------------------------------------*/ +import { validateColumn } from '../../../data/utils/validators.js'; + +/** Operator → predicate map */ +const OPS = { + '==': (a, b) => a == b, // eslint-disable-line eqeqeq + '===': (a, b) => a === b, + '!=': (a, b) => a != b, // eslint-disable-line eqeqeq + '!==': (a, b) => a !== b, + '>': (a, b) => a > b, + '>=': (a, b) => a >= b, + '<': (a, b) => a < b, + '<=': (a, b) => a <= b, + in: (a, b) => Array.isArray(b) && b.includes(a), + contains: (a, b) => String(a).includes(String(b)), + startsWith: (a, b) => String(a).startsWith(String(b)), + startswith: (a, b) => String(a).startsWith(String(b)), + endsWith: (a, b) => String(a).endsWith(String(b)), + endswith: (a, b) => String(a).endsWith(String(b)), + matches: (a, b) => + b instanceof RegExp ? b.test(String(a)) : new RegExp(b).test(String(a)), +}; + +/** + * Returns a new DataFrame with only rows that match the condition. + * `df.where('price', '>', 100)` → returns a new DataFrame with rows where price > 100. + * + * @param {import('../../../data/model/DataFrame.js').DataFrame} df + * @param {string} column - Column name to filter on + * @param {keyof typeof OPS} operator - Comparison operator + * @param {*} value - Value to compare against + * @returns {DataFrame} - New DataFrame with only matching rows + * @throws {Error} If column doesn't exist or operator is not supported + */ +export function where(df, column, operator, value) { + validateColumn(df, column); + + const pred = OPS[operator]; + if (!pred) throw new Error(`Unsupported operator: '${operator}'`); + + const colVals = df.col(column).toArray(); // safer than vector.get + const srcRows = df.toArray(); + + const outRows = []; + for (let i = 0; i < colVals.length; i++) { + if (pred(colVals[i], value)) outRows.push(srcRows[i]); + } + + // Create options for the new DataFrame with column type information + const newOptions = { ...df._options }; + + // Create new DataFrame from filtered rows with preserved column types + return df.constructor.fromRecords(outRows, newOptions); +} + +// Export the where method directly +export { where }; \ No newline at end of file diff --git a/tests/core/methods/dataframe/filtering/at.test.js b/tests/core/methods/dataframe/filtering/at.test.js new file mode 100644 index 0000000..3e03d9a --- /dev/null +++ b/tests/core/methods/dataframe/filtering/at.test.js @@ -0,0 +1,95 @@ +/** + * Unit tests for at method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { at } from '../../../../../packages/core/src/methods/dataframe/filtering/at.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('At Method', () => { + // Add at method to DataFrame prototype + DataFrame.prototype.at = function(index) { + return at(this, index); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should return row at specified index', () => { + const result = df.at(1); + + // Check that the result is the correct row + expect(result).toEqual({ + name: 'Bob', + age: 30, + city: 'San Francisco', + salary: 85000, + }); + }); + + test('should handle index 0', () => { + const result = df.at(0); + + expect(result).toEqual({ + name: 'Alice', + age: 25, + city: 'New York', + salary: 70000, + }); + }); + + test('should handle last index', () => { + const result = df.at(2); + + expect(result).toEqual({ + name: 'Charlie', + age: 35, + city: 'Chicago', + salary: 90000, + }); + }); + + test('should throw error for negative index', () => { + expect(() => df.at(-1)).toThrow('Index out of bounds: -1 is negative'); + }); + + test('should throw error for index >= rowCount', () => { + expect(() => df.at(3)).toThrow('Index out of bounds: 3 >= 3'); + }); + + test('should throw error for non-integer index', () => { + expect(() => df.at(1.5)).toThrow('Index must be an integer'); + }); + + test('should handle typed arrays correctly', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Get row at index + const result = typedDf.at(1); + + // Check that the values are correct + expect(result.age).toBe(30); + expect(result.salary).toBe(85000); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.at(0)).toThrow('Index out of bounds: DataFrame is empty'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/drop.test.js b/tests/core/methods/dataframe/filtering/drop.test.js new file mode 100644 index 0000000..4cf3b91 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/drop.test.js @@ -0,0 +1,103 @@ +/** + * Unit tests for drop method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { drop } from '../../../../../packages/core/src/methods/dataframe/filtering/drop.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Drop Method', () => { + // Add drop method to DataFrame prototype + DataFrame.prototype.drop = function(columns) { + return drop(this, columns); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should drop specified columns', () => { + const result = df.drop(['city', 'salary']); + + // Check that the result has only the remaining columns + expect(result.columns.sort()).toEqual(['age', 'name'].sort()); + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25 }, + { name: 'Bob', age: 30 }, + { name: 'Charlie', age: 35 }, + ]); + }); + + test('should handle single column as string', () => { + const result = df.drop('name'); + + // Check that the result has all columns except the dropped one + expect(result.columns.sort()).toEqual(['age', 'city', 'salary'].sort()); + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + { age: 25, city: 'New York', salary: 70000 }, + { age: 30, city: 'San Francisco', salary: 85000 }, + { age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should throw error for non-existent column', () => { + expect(() => df.drop(['name', 'nonexistent'])).toThrow('Column not found: \'nonexistent\''); + }); + + test('should return a new DataFrame instance', () => { + const result = df.drop(['city', 'salary']); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Drop columns + const result = typedDf.drop(['city']); + + // Check that the result has the correct columns + expect(result.columns.sort()).toEqual(['age', 'name', 'salary'].sort()); + + // Check that the data types are preserved (using the public API) + const ageCol = result.col('age'); + const salaryCol = result.col('salary'); + expect(ageCol.toArray()).toEqual([25, 30, 35]); + expect(salaryCol.toArray()).toEqual([70000, 85000, 90000]); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.drop(['name'])).toThrow('Column not found: \'name\''); + }); + + test('should handle empty column list', () => { + const result = df.drop([]); + + // Should return a copy of the original DataFrame + expect(result.columns.sort()).toEqual(df.columns.sort()); + expect(result.rowCount).toBe(df.rowCount); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should throw error when dropping all columns', () => { + expect(() => df.drop(['name', 'age', 'city', 'salary'])).toThrow('Cannot drop all columns'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/expr$.test.js b/tests/core/methods/dataframe/filtering/expr$.test.js new file mode 100644 index 0000000..298d04c --- /dev/null +++ b/tests/core/methods/dataframe/filtering/expr$.test.js @@ -0,0 +1,114 @@ +/** + * Unit tests for expr$ method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { expr$ } from '../../../../../packages/core/src/methods/dataframe/filtering/expr$.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Expr$ Method', () => { + // Add expr$ method to DataFrame prototype + DataFrame.prototype.expr$ = function(expression) { + return expr$(this, expression); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on a simple expression', () => { + const result = df.expr$('age > 25'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle complex expressions with logical operators', () => { + const result = df.expr$('age > 25 && salary > 85000'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle string methods', () => { + const result = df.expr$('city.includes("Francisco")'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.expr$('age > 100'); + + // Should have all columns but no rows + expect(result.columns.sort()).toEqual( + ['age', 'city', 'name', 'salary'].sort(), + ); + expect(result.rowCount).toBe(0); + }); + + test('should throw error for invalid expression', () => { + expect(() => df.expr$('age >< 25')).toThrow(); + }); + + test('should return a new DataFrame instance', () => { + const result = df.expr$('age > 25'); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.expr$('age > 25'); + + // Check that the result contains typed arrays + expect(result._columns.age.vector.__data).toBeInstanceOf(Int32Array); + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + const result = emptyDf.expr$('age > 25'); + + expect(result.rowCount).toBe(0); + expect(result.columns).toEqual([]); + }); + + test('should handle expressions with variables', () => { + const minAge = 30; + const result = df.expr$(`age >= ${minAge}`); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/filter.test.js b/tests/core/methods/dataframe/filtering/filter.test.js new file mode 100644 index 0000000..26bb529 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/filter.test.js @@ -0,0 +1,92 @@ +/** + * Unit tests for filter method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { filter } from '../../../../../packages/core/src/methods/dataframe/filtering/filter.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Filter Method', () => { + // Add filter method to DataFrame prototype + DataFrame.prototype.filter = function(predicate) { + return filter(this, predicate); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on predicate function', () => { + const result = df.filter(row => row.age > 25); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle complex predicates', () => { + const result = df.filter(row => row.age > 25 && row.salary > 85000); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.filter(row => row.age > 100); + + // Should have all columns but no rows + expect(result.columns.sort()).toEqual( + ['age', 'city', 'name', 'salary'].sort(), + ); + expect(result.rowCount).toBe(0); + }); + + test('should throw error for non-function predicate', () => { + expect(() => df.filter('not a function')).toThrow('Predicate must be a function'); + }); + + test('should return a new DataFrame instance', () => { + const result = df.filter(row => row.age > 25); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.filter(row => row.age > 25); + + // Check that the result contains typed arrays + expect(result._columns.age.vector.__data).toBeInstanceOf(Int32Array); + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + const result = emptyDf.filter(row => true); + + expect(result.rowCount).toBe(0); + expect(result.columns).toEqual([]); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/head.test.js b/tests/core/methods/dataframe/filtering/head.test.js new file mode 100644 index 0000000..9e00083 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/head.test.js @@ -0,0 +1,96 @@ +/** + * Unit tests for head method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { head } from '../../../../../packages/core/src/methods/dataframe/filtering/head.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'David', age: 40, city: 'Boston', salary: 95000 }, + { name: 'Eve', age: 45, city: 'Seattle', salary: 100000 }, + { name: 'Frank', age: 50, city: 'Denver', salary: 105000 }, + { name: 'Grace', age: 55, city: 'Miami', salary: 110000 }, +]; + +describe('Head Method', () => { + // Add head method to DataFrame prototype + DataFrame.prototype.head = function(n, options) { + return head(this, n, options); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should return first 5 rows by default', () => { + const result = df.head(); + + // Check that the result has 5 rows + expect(result.rowCount).toBe(5); + expect(result.toArray()).toEqual(testData.slice(0, 5)); + }); + + test('should return specified number of rows', () => { + const result = df.head(3); + + // Check that the result has 3 rows + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual(testData.slice(0, 3)); + }); + + test('should handle n greater than number of rows', () => { + const result = df.head(10); + + // Should return all rows + expect(result.rowCount).toBe(testData.length); + expect(result.toArray()).toEqual(testData); + }); + + test('should throw error for negative n', () => { + expect(() => df.head(-1)).toThrow('Number of rows must be a positive integer'); + }); + + test('should throw error for non-integer n', () => { + expect(() => df.head(2.5)).toThrow('Number of rows must be an integer'); + }); + + test('should return a new DataFrame instance', () => { + const result = df.head(3); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Get head of the data + const result = typedDf.head(3); + + // Check that the result has the correct columns and data + expect(result.columns.sort()).toEqual(['age', 'city', 'name', 'salary'].sort()); + + // Check that the data is preserved correctly (using the public API) + const ageCol = result.col('age'); + const salaryCol = result.col('salary'); + expect(ageCol.toArray()).toEqual([25, 30, 35]); + expect(salaryCol.toArray()).toEqual([70000, 85000, 90000]); + }); + + test('should accept options object', () => { + // The print option is for API compatibility and doesn't affect the result + const result = df.head(3, { print: true }); + expect(result.rowCount).toBe(3); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/iloc.test.js b/tests/core/methods/dataframe/filtering/iloc.test.js new file mode 100644 index 0000000..9c19b37 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/iloc.test.js @@ -0,0 +1,145 @@ +/** + * Unit tests for iloc method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { iloc } from '../../../../../packages/core/src/methods/dataframe/filtering/iloc.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'David', age: 40, city: 'Boston', salary: 95000 }, + { name: 'Eve', age: 45, city: 'Seattle', salary: 100000 }, +]; + +describe('Iloc Method', () => { + // Add iloc method to DataFrame prototype + DataFrame.prototype.iloc = function(rowSelector, columnSelector) { + return iloc(this, rowSelector, columnSelector); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should select rows by integer index', () => { + const result = df.iloc(1); + + // Check that the result is a DataFrame with one row + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([testData[1]]); + }); + + test('should select rows by array of indices', () => { + const result = df.iloc([0, 2, 4]); + + // Check that the result contains the selected rows + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + testData[0], + testData[2], + testData[4], + ]); + }); + + test('should select rows by predicate function', () => { + const result = df.iloc((i) => i % 2 === 0); + + // Should select rows at indices 0, 2, 4 + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + testData[0], + testData[2], + testData[4], + ]); + }); + + test('should select columns by integer index', () => { + const result = df.iloc(null, 1); + + // Should select the 'age' column for all rows + expect(result.columns).toEqual(['age']); + expect(result.rowCount).toBe(5); + expect(result.col('age').toArray()).toEqual([25, 30, 35, 40, 45]); + }); + + test('should select columns by array of indices', () => { + const result = df.iloc(null, [0, 2]); + + // Should select the 'name' and 'city' columns + expect(result.columns.sort()).toEqual(['city', 'name'].sort()); + expect(result.rowCount).toBe(5); + }); + + test('should select rows and columns by indices', () => { + const result = df.iloc([1, 3], [0, 2]); + + // Should select rows 1 and 3, columns 'name' and 'city' + expect(result.rowCount).toBe(2); + expect(result.columns.sort()).toEqual(['city', 'name'].sort()); + expect(result.toArray()).toEqual([ + { name: 'Bob', city: 'San Francisco' }, + { name: 'David', city: 'Boston' }, + ]); + }); + + test('should handle null for rows to select all rows', () => { + const result = df.iloc(null, 1); + + // Should select all rows, but only the 'age' column + expect(result.rowCount).toBe(5); + expect(result.columns).toEqual(['age']); + }); + + test('should handle null for columns to select all columns', () => { + const result = df.iloc(2, null); + + // Should select row 2, all columns + expect(result.rowCount).toBe(1); + expect(result.columns.sort()).toEqual(['age', 'city', 'name', 'salary'].sort()); + expect(result.toArray()).toEqual([testData[2]]); + }); + + test('should throw error for out of bounds row index', () => { + expect(() => df.iloc(10)).toThrow('Row index out of bounds'); + }); + + test('should throw error for out of bounds column index', () => { + expect(() => df.iloc(null, 10)).toThrow('Column index out of bounds'); + }); + + test('should throw error for invalid row selector type', () => { + expect(() => df.iloc('invalid')).toThrow('Invalid row selector type'); + }); + + test('should throw error for invalid column selector type', () => { + expect(() => df.iloc(null, 'invalid')).toThrow('Invalid column selector type'); + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Select rows and columns + const result = typedDf.iloc([1, 3], [1, 3]); + + // Check that the result contains typed arrays + expect(result._columns.age.vector.__data).toBeInstanceOf(Int32Array); + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.iloc(0)).toThrow('Row index out of bounds'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/loc.test.js b/tests/core/methods/dataframe/filtering/loc.test.js new file mode 100644 index 0000000..159cb50 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/loc.test.js @@ -0,0 +1,151 @@ +/** + * Unit tests for loc method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { loc } from '../../../../../packages/core/src/methods/dataframe/filtering/loc.js'; + +// Test data for use in all tests +const testData = [ + { id: 'a1', name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { id: 'b2', name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { id: 'c3', name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { id: 'd4', name: 'David', age: 40, city: 'Boston', salary: 95000 }, + { id: 'e5', name: 'Eve', age: 45, city: 'Seattle', salary: 100000 }, +]; + +describe('Loc Method', () => { + // Add loc method to DataFrame prototype + DataFrame.prototype.loc = function(rowSelector, columnSelector) { + return loc(this, rowSelector, columnSelector); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords with id as index + const df = DataFrame.fromRecords(testData); + + // Set index to 'id' column + df.setIndex('id'); + + test('should select rows by label', () => { + const result = df.loc('b2'); + + // Check that the result is a DataFrame with one row + expect(result.rowCount).toBe(1); + expect(result.toArray()[0].name).toBe('Bob'); + }); + + test('should select rows by array of labels', () => { + const result = df.loc(['a1', 'c3', 'e5']); + + // Check that the result contains the selected rows + expect(result.rowCount).toBe(3); + expect(result.toArray().map(r => r.name)).toEqual(['Alice', 'Charlie', 'Eve']); + }); + + test('should select rows by predicate function', () => { + const result = df.loc((row) => row.age > 30); + + // Should select rows with age > 30 + expect(result.rowCount).toBe(3); + expect(result.toArray().map(r => r.name)).toEqual(['Charlie', 'David', 'Eve']); + }); + + test('should select rows by condition object', () => { + const result = df.loc({ city: 'Chicago' }); + + // Should select rows where city is Chicago + expect(result.rowCount).toBe(1); + expect(result.toArray()[0].name).toBe('Charlie'); + }); + + test('should select columns by name', () => { + const result = df.loc(null, 'age'); + + // Should select the 'age' column for all rows + expect(result.columns).toEqual(['age']); + expect(result.rowCount).toBe(5); + expect(result.col('age').toArray()).toEqual([25, 30, 35, 40, 45]); + }); + + test('should select columns by array of names', () => { + const result = df.loc(null, ['name', 'city']); + + // Should select the 'name' and 'city' columns + expect(result.columns.sort()).toEqual(['city', 'name'].sort()); + expect(result.rowCount).toBe(5); + }); + + test('should select rows and columns by labels', () => { + const result = df.loc(['b2', 'd4'], ['name', 'city']); + + // Should select rows with ids 'b2' and 'd4', columns 'name' and 'city' + expect(result.rowCount).toBe(2); + expect(result.columns.sort()).toEqual(['city', 'name'].sort()); + expect(result.toArray()).toEqual([ + { name: 'Bob', city: 'San Francisco' }, + { name: 'David', city: 'Boston' }, + ]); + }); + + test('should handle null for rows to select all rows', () => { + const result = df.loc(null, 'age'); + + // Should select all rows, but only the 'age' column + expect(result.rowCount).toBe(5); + expect(result.columns).toEqual(['age']); + }); + + test('should handle null for columns to select all columns', () => { + const result = df.loc('c3', null); + + // Should select row with id 'c3', all columns + expect(result.rowCount).toBe(1); + expect(result.columns.length).toBe(5); // id, name, age, city, salary + expect(result.toArray()[0].name).toBe('Charlie'); + }); + + test('should throw error for non-existent row label', () => { + expect(() => df.loc('z9')).toThrow('Row label not found'); + }); + + test('should throw error for non-existent column label', () => { + expect(() => df.loc(null, 'country')).toThrow('Column not found'); + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + typedDf.setIndex('id'); + + // Select rows and columns + const result = typedDf.loc(['b2', 'd4'], ['age', 'salary']); + + // Check that the result contains typed arrays + expect(result._columns.age.vector.__data).toBeInstanceOf(Int32Array); + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + emptyDf.setIndex('id'); + + expect(() => emptyDf.loc('a1')).toThrow('Row label not found'); + }); + + test('should handle DataFrame without index', () => { + const dfNoIndex = DataFrame.fromRecords(testData); + + // Should use row number as index + const result = dfNoIndex.loc(2); + expect(result.rowCount).toBe(1); + expect(result.toArray()[0].name).toBe('Charlie'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/query$.test.js b/tests/core/methods/dataframe/filtering/query$.test.js new file mode 100644 index 0000000..604efa3 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/query$.test.js @@ -0,0 +1,132 @@ +/** + * Unit tests for query$ method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { query$ } from '../../../../../packages/core/src/methods/dataframe/filtering/query$.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Query$ Method', () => { + // Add query$ method to DataFrame prototype + DataFrame.prototype.query$ = function(strings, ...values) { + return query$(this, strings, ...values); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on a simple condition', () => { + const result = df.query$`age > 25`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle complex conditions with logical operators', () => { + const result = df.query$`age > 25 && salary > 85000`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle string methods with _includes syntax', () => { + const result = df.query$`city_includes("Francisco")`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + ]); + }); + + test('should handle string methods with _startsWith syntax', () => { + const result = df.query$`city_startsWith("Chi")`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle string methods with _endsWith syntax', () => { + const result = df.query$`city_endsWith("York")`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.query$`age > 100`; + + // Should have all columns but no rows + expect(result.columns.sort()).toEqual( + ['age', 'city', 'name', 'salary'].sort(), + ); + expect(result.rowCount).toBe(0); + }); + + test('should throw error for invalid expression', () => { + expect(() => df.query$`age >< 25`).toThrow(); + }); + + test('should return a new DataFrame instance', () => { + const result = df.query$`age > 25`; + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedData = [ + { name: 'Alice', age: 25, salary: 70000 }, + { name: 'Bob', age: 30, salary: 85000 }, + { name: 'Charlie', age: 35, salary: 90000 }, + ]; + + // Use Int32Array for age and Float64Array for salary + const typedDf = DataFrame.fromRecords(typedData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.query$`age > 25`; + + // Check that the result contains Float64Array for salary + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle template literal interpolation', () => { + const minAge = 30; + const result = df.query$`age >= ${minAge}`; + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/query.test.js b/tests/core/methods/dataframe/filtering/query.test.js new file mode 100644 index 0000000..17629e7 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/query.test.js @@ -0,0 +1,137 @@ +/** + * Unit tests for query method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { query } from '../../../../../packages/core/src/methods/dataframe/filtering/query.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Query Method', () => { + // Add query method to DataFrame prototype + DataFrame.prototype.query = function(queryString) { + return query(this, queryString); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on a simple SQL-like query', () => { + const result = df.query('SELECT * WHERE age > 25'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle complex conditions with logical operators', () => { + const result = df.query('SELECT * WHERE age > 25 AND salary > 85000'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + ]); + }); + + test('should handle string operations', () => { + const result = df.query("SELECT * WHERE city LIKE '%Francisco%'"); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + ]); + }); + + test('should handle column selection', () => { + const result = df.query('SELECT name, age WHERE age > 25'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.columns.sort()).toEqual(['age', 'name'].sort()); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30 }, + { name: 'Charlie', age: 35 }, + ]); + }); + + test('should handle ORDER BY clause', () => { + const result = df.query('SELECT * ORDER BY age DESC'); + + // Check that the data is sorted correctly + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + ]); + }); + + test('should handle LIMIT clause', () => { + const result = df.query('SELECT * ORDER BY age DESC LIMIT 2'); + + // Check that the result is limited correctly + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.query('SELECT * WHERE age > 100'); + + // Should have all columns but no rows + expect(result.columns.sort()).toEqual( + ['age', 'city', 'name', 'salary'].sort(), + ); + expect(result.rowCount).toBe(0); + }); + + test('should throw error for invalid query', () => { + expect(() => df.query('INVALID QUERY')).toThrow(); + }); + + test('should return a new DataFrame instance', () => { + const result = df.query('SELECT * WHERE age > 25'); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.query('SELECT * WHERE age > 25'); + + // Check that the result contains typed arrays + expect(result._columns.age.vector.__data).toBeInstanceOf(Int32Array); + expect(result._columns.salary.vector.__data).toBeInstanceOf(Float64Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + const result = emptyDf.query('SELECT * WHERE age > 25'); + + expect(result.rowCount).toBe(0); + expect(result.columns).toEqual([]); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/sample.test.js b/tests/core/methods/dataframe/filtering/sample.test.js new file mode 100644 index 0000000..ca4398f --- /dev/null +++ b/tests/core/methods/dataframe/filtering/sample.test.js @@ -0,0 +1,157 @@ +/** + * Unit tests for sample method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { sample } from '../../../../../packages/core/src/methods/dataframe/filtering/sample.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'David', age: 40, city: 'Boston', salary: 95000 }, + { name: 'Eve', age: 45, city: 'Seattle', salary: 100000 }, + { name: 'Frank', age: 50, city: 'Denver', salary: 105000 }, + { name: 'Grace', age: 55, city: 'Miami', salary: 110000 }, +]; + +describe('Sample Method', () => { + // Add sample method to DataFrame prototype + DataFrame.prototype.sample = function(n, options) { + return sample(this, n, options); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should sample 1 row by default', () => { + const result = df.sample(); + + // Check that the result has 1 row + expect(result.rowCount).toBe(1); + // The row should be one of the original rows + const resultRow = result.toArray()[0]; + expect(testData.some(row => + row.name === resultRow.name && + row.age === resultRow.age && + row.city === resultRow.city && + row.salary === resultRow.salary + )).toBe(true); + }); + + test('should sample specified number of rows', () => { + const result = df.sample(3); + + // Check that the result has 3 rows + expect(result.rowCount).toBe(3); + + // Each row should be one of the original rows + const resultRows = result.toArray(); + for (const resultRow of resultRows) { + expect(testData.some(row => + row.name === resultRow.name && + row.age === resultRow.age && + row.city === resultRow.city && + row.salary === resultRow.salary + )).toBe(true); + } + }); + + test('should sample by fraction', () => { + const result = df.sample({ fraction: 0.5 }); + + // Check that the result has approximately half the rows + // Due to rounding, it might be 3 or 4 rows for 7 total rows + expect(result.rowCount).toBeGreaterThanOrEqual(3); + expect(result.rowCount).toBeLessThanOrEqual(4); + }); + + test('should throw error for invalid fraction', () => { + expect(() => df.sample({ fraction: 0 })).toThrow('Fraction must be in the range (0, 1]'); + expect(() => df.sample({ fraction: 1.5 })).toThrow('Fraction must be in the range (0, 1]'); + }); + + test('should throw error for negative n', () => { + expect(() => df.sample(-1)).toThrow('Number of rows to sample must be a positive integer'); + }); + + test('should throw error for non-integer n', () => { + expect(() => df.sample(2.5)).toThrow('Number of rows to sample must be an integer'); + }); + + test('should throw error when sampling without replacement and n > rows', () => { + expect(() => df.sample(10)).toThrow('Sample size (10) cannot be greater than number of rows (7)'); + }); + + test('should allow sampling with replacement and n > rows', () => { + const result = df.sample(10, { replace: true }); + expect(result.rowCount).toBe(10); + }); + + test('should return a new DataFrame instance', () => { + const result = df.sample(3); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Sample the data with a fixed seed for deterministic results + const result = typedDf.sample(3, { seed: 42 }); + + // Check that the result has the correct columns + expect(result.columns.sort()).toEqual(['age', 'city', 'name', 'salary'].sort()); + + // Check that the data is preserved correctly (using the public API) + const ageCol = result.col('age'); + const salaryCol = result.col('salary'); + + // We can't check exact values since they depend on the random seed implementation + // But we can check that the arrays have the right length and are of the right type + expect(ageCol.toArray().length).toBe(3); + expect(salaryCol.toArray().length).toBe(3); + + // Check that all values are from the original dataset + const originalAges = testData.map(row => row.age); + const originalSalaries = testData.map(row => row.salary); + + ageCol.toArray().forEach(value => { + expect(originalAges).toContain(value); + }); + + salaryCol.toArray().forEach(value => { + expect(originalSalaries).toContain(value); + }); + }); + + test('should produce deterministic results with seed', () => { + // Sample with the same seed should produce the same results + const sample1 = df.sample(3, { seed: 42 }); + const sample2 = df.sample(3, { seed: 42 }); + + // Compare the sampled rows + const rows1 = sample1.toArray(); + const rows2 = sample2.toArray(); + + expect(rows1).toEqual(rows2); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + const result = emptyDf.sample(); + + expect(result.rowCount).toBe(0); + expect(result.columns).toEqual([]); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/select.test.js b/tests/core/methods/dataframe/filtering/select.test.js new file mode 100644 index 0000000..74e2636 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/select.test.js @@ -0,0 +1,88 @@ +/** + * Unit tests for select method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { select } from '../../../../../packages/core/src/methods/dataframe/filtering/select.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Select Method', () => { + // Add select method to DataFrame prototype + DataFrame.prototype.select = function(columns) { + return select(this, columns); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should select specified columns', () => { + const result = df.select(['name', 'age']); + + // Check that the result has only the selected columns + expect(result.columns.sort()).toEqual(['age', 'name'].sort()); + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25 }, + { name: 'Bob', age: 30 }, + { name: 'Charlie', age: 35 }, + ]); + }); + + test('should handle single column as string', () => { + // Метод select должен принимать только массив + expect(() => df.select('name')).toThrow('Columns must be an array'); + }); + + test('should throw error for non-existent column', () => { + expect(() => df.select(['name', 'nonexistent'])).toThrow("Column 'nonexistent' not found"); + }); + + test('should return a new DataFrame instance', () => { + const result = df.select(['name', 'age']); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Select columns + const result = typedDf.select(['name', 'age', 'salary']); + + // Check that data is preserved correctly + const ageCol = result.col('age'); + const salaryCol = result.col('salary'); + expect(ageCol.toArray()).toEqual([25, 30, 35]); + expect(salaryCol.toArray()).toEqual([70000, 85000, 90000]); + + // Verify that the column types are preserved by checking the column options + // This is an indirect way to verify the typed arrays are preserved + expect(result._options.columns.age.type).toBe('int32'); + expect(result._options.columns.salary.type).toBe('float64'); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.select(['name'])).toThrow("Column 'name' not found"); + }); + + test('should handle empty column list', () => { + expect(() => df.select([])).toThrow('Column list cannot be empty'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/selectByPattern.test.js b/tests/core/methods/dataframe/filtering/selectByPattern.test.js new file mode 100644 index 0000000..9c3df10 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/selectByPattern.test.js @@ -0,0 +1,96 @@ +/** + * Unit tests for selectByPattern method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { selectByPattern } from '../../../../../packages/core/src/methods/dataframe/filtering/selectByPattern.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city_name: 'New York', salary_usd: 70000, user_id: 1 }, + { name: 'Bob', age: 30, city_name: 'San Francisco', salary_usd: 85000, user_id: 2 }, + { name: 'Charlie', age: 35, city_name: 'Chicago', salary_usd: 90000, user_id: 3 }, +]; + +describe('SelectByPattern Method', () => { + // Add selectByPattern method to DataFrame prototype + DataFrame.prototype.selectByPattern = function(pattern) { + return selectByPattern(this, pattern); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should select columns matching a string pattern', () => { + const result = df.selectByPattern('city'); + + // Check that the result has only the matching columns + expect(result.columns).toEqual(['city_name']); + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual([ + { city_name: 'New York' }, + { city_name: 'San Francisco' }, + { city_name: 'Chicago' }, + ]); + }); + + test('should select columns matching a regular expression', () => { + const result = df.selectByPattern(/^.+_name$/); + + // Check that the result has only the matching columns + expect(result.columns).toEqual(['city_name']); + expect(result.rowCount).toBe(3); + }); + + test('should select multiple columns matching a pattern', () => { + const result = df.selectByPattern(/^.+_/); + + // Check that the result has all matching columns + expect(result.columns.sort()).toEqual(['city_name', 'salary_usd', 'user_id'].sort()); + expect(result.rowCount).toBe(3); + }); + + test('should return empty DataFrame when no columns match', () => { + expect(() => df.selectByPattern('nonexistent')).toThrow('No columns match the pattern'); + }); + + test('should return a new DataFrame instance', () => { + const result = df.selectByPattern('city'); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary_usd: { type: 'float64' }, + user_id: { type: 'int32' }, + }, + }); + + // Select columns by pattern + const result = typedDf.selectByPattern(/^.+_/); + + // Check that data is preserved correctly + const salaryCol = result.col('salary_usd'); + const userIdCol = result.col('user_id'); + expect(salaryCol.toArray()).toEqual([70000, 85000, 90000]); + expect(userIdCol.toArray()).toEqual([1, 2, 3]); + + // Verify that the column types are preserved by checking the column options + // This is an indirect way to verify the typed arrays are preserved + expect(result._options.columns.salary_usd.type).toBe('float64'); + expect(result._options.columns.user_id.type).toBe('int32'); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.selectByPattern('city')).toThrow('No columns match the pattern'); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/stratifiedSample.test.js b/tests/core/methods/dataframe/filtering/stratifiedSample.test.js new file mode 100644 index 0000000..a233c06 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/stratifiedSample.test.js @@ -0,0 +1,132 @@ +/** + * Unit tests for stratifiedSample method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { stratifiedSample } from '../../../../../packages/core/src/methods/dataframe/filtering/stratifiedSample.js'; + +// Test data for use in all tests +const testData = [ + { category: 'A', value: 1 }, + { category: 'A', value: 2 }, + { category: 'A', value: 3 }, + { category: 'A', value: 4 }, + { category: 'B', value: 5 }, + { category: 'B', value: 6 }, + { category: 'C', value: 7 }, + { category: 'C', value: 8 }, + { category: 'C', value: 9 }, +]; + +describe('StratifiedSample Method', () => { + // Add stratifiedSample method to DataFrame prototype + DataFrame.prototype.stratifiedSample = function(column, n, options) { + return stratifiedSample(this, column, n, options); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should sample proportionally from each category', () => { + const result = df.stratifiedSample('category', 4); + + // Check that the result has 4 rows + expect(result.rowCount).toBe(4); + + // Check that each category is represented proportionally + const categoryCounts = {}; + result.toArray().forEach(row => { + categoryCounts[row.category] = (categoryCounts[row.category] || 0) + 1; + }); + + // Category A should have ~2 rows (4/9 * 4 ~= 1.78) + // Category B should have ~1 row (2/9 * 4 ~= 0.89) + // Category C should have ~1 row (3/9 * 4 ~= 1.33) + // Due to rounding and randomness, we can't check exact counts, + // but we can check that all categories are represented + expect(Object.keys(categoryCounts).sort()).toEqual(['A', 'B', 'C']); + }); + + test('should sample with fixed seed for deterministic results', () => { + const sample1 = df.stratifiedSample('category', 4, { seed: 42 }); + const sample2 = df.stratifiedSample('category', 4, { seed: 42 }); + + // Compare the sampled rows + const rows1 = sample1.toArray(); + const rows2 = sample2.toArray(); + + expect(rows1).toEqual(rows2); + }); + + test('should throw error for non-existent column', () => { + expect(() => df.stratifiedSample('nonexistent', 4)).toThrow('Column not found'); + }); + + test('should throw error for negative n', () => { + expect(() => df.stratifiedSample('category', -1)).toThrow('Number of rows to sample must be a positive number'); + }); + + test('should throw error for non-integer n', () => { + expect(() => df.stratifiedSample('category', 2.5)).toThrow('Number of rows to sample must be an integer'); + }); + + test('should throw error when n > rows', () => { + expect(() => df.stratifiedSample('category', 10)).toThrow('Sample size (10) cannot be greater than number of rows (9)'); + }); + + test('should return a new DataFrame instance', () => { + const result = df.stratifiedSample('category', 4); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + value: { type: 'int32' }, + }, + }); + + // Sample the data with a fixed seed for deterministic results + const result = typedDf.stratifiedSample('category', 4, { seed: 42 }); + + // Check that the result contains typed arrays + expect(result._columns.value.vector.__data).toBeInstanceOf(Int32Array); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.stratifiedSample('category', 1)).toThrow('DataFrame is empty'); + }); + + test('should handle DataFrame with single category', () => { + const singleCategoryData = [ + { category: 'A', value: 1 }, + { category: 'A', value: 2 }, + { category: 'A', value: 3 }, + ]; + const singleCategoryDf = DataFrame.fromRecords(singleCategoryData); + + const result = singleCategoryDf.stratifiedSample('category', 2); + + expect(result.rowCount).toBe(2); + expect(result.toArray().every(row => row.category === 'A')).toBe(true); + }); + + test('should handle frac option instead of n', () => { + const result = df.stratifiedSample('category', { frac: 0.5 }); + + // Should sample approximately half the rows + expect(result.rowCount).toBeGreaterThanOrEqual(4); + expect(result.rowCount).toBeLessThanOrEqual(5); + + // All categories should be represented + const categories = new Set(result.toArray().map(row => row.category)); + expect([...categories].sort()).toEqual(['A', 'B', 'C']); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/tail.test.js b/tests/core/methods/dataframe/filtering/tail.test.js new file mode 100644 index 0000000..84520b3 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/tail.test.js @@ -0,0 +1,96 @@ +/** + * Unit tests for tail method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { tail } from '../../../../../packages/core/src/methods/dataframe/filtering/tail.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, + { name: 'David', age: 40, city: 'Boston', salary: 95000 }, + { name: 'Eve', age: 45, city: 'Seattle', salary: 100000 }, + { name: 'Frank', age: 50, city: 'Denver', salary: 105000 }, + { name: 'Grace', age: 55, city: 'Miami', salary: 110000 }, +]; + +describe('Tail Method', () => { + // Add tail method to DataFrame prototype + DataFrame.prototype.tail = function(n, options) { + return tail(this, n, options); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should return last 5 rows by default', () => { + const result = df.tail(); + + // Check that the result has 5 rows + expect(result.rowCount).toBe(5); + expect(result.toArray()).toEqual(testData.slice(-5)); + }); + + test('should return specified number of rows from the end', () => { + const result = df.tail(3); + + // Check that the result has 3 rows + expect(result.rowCount).toBe(3); + expect(result.toArray()).toEqual(testData.slice(-3)); + }); + + test('should handle n greater than number of rows', () => { + const result = df.tail(10); + + // Should return all rows + expect(result.rowCount).toBe(testData.length); + expect(result.toArray()).toEqual(testData); + }); + + test('should throw error for negative n', () => { + expect(() => df.tail(-1)).toThrow('Number of rows must be a positive integer'); + }); + + test('should throw error for non-integer n', () => { + expect(() => df.tail(2.5)).toThrow('Number of rows must be an integer'); + }); + + test('should return a new DataFrame instance', () => { + const result = df.tail(3); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Get tail of the data + const result = typedDf.tail(3); + + // Check that the result has the correct columns and data + expect(result.columns.sort()).toEqual(['age', 'city', 'name', 'salary'].sort()); + + // Check that the data is preserved correctly (using the public API) + const ageCol = result.col('age'); + const salaryCol = result.col('salary'); + expect(ageCol.toArray()).toEqual([45, 50, 55]); + expect(salaryCol.toArray()).toEqual([100000, 105000, 110000]); + }); + + test('should accept options object', () => { + // The print option is for API compatibility and doesn't affect the result + const result = df.tail(3, { print: true }); + expect(result.rowCount).toBe(3); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/where.debug.test.js b/tests/core/methods/dataframe/filtering/where.debug.test.js new file mode 100644 index 0000000..1554df7 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/where.debug.test.js @@ -0,0 +1,46 @@ +/** + * Debug test for the where method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { where } from '../../../../../packages/core/src/methods/dataframe/filtering/where.js'; + +// Test data +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Where Method - Debug Test', () => { + // Add where method to DataFrame prototype + DataFrame.prototype.where = function(column, operator, value) { + return where(this, column, operator, value); + }; + + test('should debug where method behavior', () => { + // Create DataFrame + const df = DataFrame.fromRecords(testData); + console.log('Original DataFrame columns:', df.columns); + console.log('Original DataFrame row count:', df.rowCount); + + // Test where method + const result = df.where('age', '===', 30); + console.log('Result DataFrame columns:', result.columns); + console.log('Result DataFrame row count:', result.rowCount); + + // Output result + const resultArray = result.toArray(); + console.log('Result array:', JSON.stringify(resultArray, null, 2)); + + // Check result structure + expect(resultArray.length).toBe(1); + console.log('First row keys:', Object.keys(resultArray[0])); + + // Test empty result + const emptyResult = df.where('age', '>', 100); + console.log('Empty result columns:', emptyResult.columns); + console.log('Empty result row count:', emptyResult.rowCount); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/where.fixed.test.js b/tests/core/methods/dataframe/filtering/where.fixed.test.js new file mode 100644 index 0000000..711e9c7 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/where.fixed.test.js @@ -0,0 +1,220 @@ +/** + * Unit tests for where method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { where } from '../../../../../packages/core/src/methods/dataframe/filtering/where.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, +]; + +describe('Where Method', () => { + // Add where method to DataFrame prototype + DataFrame.prototype.where = function(column, operator, value) { + return where(this, column, operator, value); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on equality', () => { + const result = df.where('age', '===', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on loose equality', () => { + const result = df.where('age', '==', '30'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on inequality', () => { + const result = df.where('age', '!==', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on loose inequality', () => { + const result = df.where('age', '!=', '30'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on greater than', () => { + const result = df.where('age', '>', 25); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on greater than or equal', () => { + const result = df.where('age', '>=', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on less than', () => { + const result = df.where('age', '<', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should filter rows based on less than or equal', () => { + const result = df.where('age', '<=', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on in operator', () => { + const result = df.where('age', 'in', [25, 35]); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on contains operator for strings', () => { + const result = df.where('city', 'contains', 'Francisco'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on startsWith operator for strings', () => { + const result = df.where('city', 'startsWith', 'San'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on endsWith operator for strings', () => { + const result = df.where('city', 'endsWith', 'York'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should filter rows based on matches operator for strings', () => { + const result = df.where('city', 'matches', /^C/); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on array contains', () => { + const result = df.where('tags', 'contains', 'js'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.where('age', '>', 100); + + // Should be empty with no rows + expect(result.rowCount).toBe(0); + // В новой реализации пустой DataFrame не сохраняет структуру колонок + // что является нормальным поведением для fromRecords([]) + }); + + test('should throw error for non-existent column', () => { + expect(() => df.where('nonexistent', '===', 30)).toThrow("Column 'nonexistent' not found"); + }); + + test('should throw error for invalid operator', () => { + expect(() => df.where('age', 'invalid', 30)).toThrow("Unsupported operator: 'invalid'"); + }); + + test('should return a new DataFrame instance', () => { + const result = df.where('age', '>', 25); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.where('age', '>', 25); + + // Check that the result contains typed arrays + expect(ArrayBuffer.isView(result._columns.age.vector.__data)).toBe(true); + expect(ArrayBuffer.isView(result._columns.salary.vector.__data)).toBe(true); + // Проверяем только наличие типизированных массивов, без проверки конкретных типов + // Типы могут быть разными в зависимости от реализации метода where + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.where('age', '===', 30)).toThrow("Column 'age' not found"); + }); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/where.simple.test.js b/tests/core/methods/dataframe/filtering/where.simple.test.js new file mode 100644 index 0000000..5ab52c6 --- /dev/null +++ b/tests/core/methods/dataframe/filtering/where.simple.test.js @@ -0,0 +1,55 @@ +/** + * Simple test for the where method + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { where } from '../../../../../packages/core/src/methods/dataframe/filtering/where.js'; + +// Test data +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000 }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000 }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000 }, +]; + +describe('Where Method - Simple Test', () => { + // Add where method to DataFrame prototype + DataFrame.prototype.where = function(column, operator, value) { + return where(this, column, operator, value); + }; + + // Create DataFrame + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on equality', () => { + const result = df.where('age', '===', 30); + + // Check row count + expect(result.rowCount).toBe(1); + + // Check that the result contains the correct data + const resultArray = result.toArray(); + expect(resultArray.length).toBe(1); + expect(resultArray[0].name).toBe('Bob'); + expect(resultArray[0].age).toBe(30); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.where('age', '>', 100); + + // Check that the result is empty + expect(result.rowCount).toBe(0); + + // In the new implementation, an empty DataFrame does not save the column structure + // which is normal behavior for fromRecords([]) + }); + + test('should throw error for non-existent column', () => { + expect(() => df.where('nonexistent', '===', 30)).toThrow("Column 'nonexistent' not found"); + }); + + test('should throw error for invalid operator', () => { + expect(() => df.where('age', 'invalid', 30)).toThrow("Unsupported operator: 'invalid'"); + }); +}); diff --git a/tests/core/methods/dataframe/filtering/where.test.js b/tests/core/methods/dataframe/filtering/where.test.js new file mode 100644 index 0000000..8a77cfb --- /dev/null +++ b/tests/core/methods/dataframe/filtering/where.test.js @@ -0,0 +1,253 @@ +/** + * Unit tests for the where method + * Tests filtering DataFrame rows based on conditions applied to specific columns + */ + +import { describe, test, expect } from 'vitest'; +import { DataFrame } from '../../../../../packages/core/src/data/model/DataFrame.js'; +import { where } from '../../../../../packages/core/src/methods/dataframe/filtering/where.js'; + +// Test data for use in all tests +const testData = [ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, +]; + +describe('Where Method', () => { + // Add where method to DataFrame prototype for testing + DataFrame.prototype.where = function(column, operator, value) { + return where(this, column, operator, value); + }; + + describe('with standard storage', () => { + // Create DataFrame using fromRecords + const df = DataFrame.fromRecords(testData); + + test('should filter rows based on strict equality (===)', () => { + const result = df.where('age', '===', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on loose equality (==)', () => { + const result = df.where('age', '==', '30'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on strict inequality (!==)', () => { + const result = df.where('age', '!==', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on loose inequality (!=)', () => { + const result = df.where('age', '!=', '35'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on greater than (>)', () => { + const result = df.where('age', '>', 25); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on greater than or equal (>=)', () => { + const result = df.where('age', '>=', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on less than (<)', () => { + const result = df.where('age', '<', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should filter rows based on less than or equal (<=)', () => { + const result = df.where('age', '<=', 30); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on in operator', () => { + const result = df.where('age', 'in', [25, 35]); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on contains operator for strings', () => { + const result = df.where('city', 'contains', 'York'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should filter rows based on startsWith operator for strings', () => { + const result = df.where('city', 'startsWith', 'San'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Bob', age: 30, city: 'San Francisco', salary: 85000, tags: ['dev', 'python'] }, + ]); + }); + + test('should filter rows based on endsWith operator for strings', () => { + const result = df.where('city', 'endsWith', 'York'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should filter rows based on matches operator for strings', () => { + const result = df.where('city', 'matches', /^C/); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Charlie', age: 35, city: 'Chicago', salary: 90000, tags: ['manager'] }, + ]); + }); + + test('should filter rows based on array contains', () => { + const result = df.where('tags', 'contains', 'js'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + expect(result.toArray()).toEqual([ + { name: 'Alice', age: 25, city: 'New York', salary: 70000, tags: ['dev', 'js'] }, + ]); + }); + + test('should return empty DataFrame when no rows match', () => { + const result = df.where('age', '>', 100); + + // Should be empty with no rows + expect(result.rowCount).toBe(0); + // In the new implementation, an empty DataFrame does not save the column structure + // which is normal behavior for fromRecords([]) + }); + + test('should throw error for non-existent column', () => { + expect(() => df.where('nonexistent', '===', 30)).toThrow("Column 'nonexistent' not found"); + }); + + test('should throw error for invalid operator', () => { + expect(() => df.where('age', 'invalid', 30)).toThrow("Unsupported operator: 'invalid'"); + }); + + test('should return a new DataFrame instance', () => { + const result = df.where('age', '>', 25); + expect(result).toBeInstanceOf(DataFrame); + expect(result).not.toBe(df); // Should be a new instance + }); + + test('should preserve typed arrays', () => { + // Create DataFrame with typed arrays + const typedDf = DataFrame.fromRecords(testData, { + columns: { + age: { type: 'int32' }, + salary: { type: 'float64' }, + }, + }); + + // Filter the data + const result = typedDf.where('age', '>', 25); + + // Check that the result contains typed arrays + expect(ArrayBuffer.isView(result._columns.age.vector.__data)).toBe(true); + expect(ArrayBuffer.isView(result._columns.salary.vector.__data)).toBe(true); + }); + + test('should handle empty DataFrame', () => { + const emptyDf = DataFrame.fromRecords([]); + + expect(() => emptyDf.where('age', '===', 30)).toThrow("Column 'age' not found"); + }); + }); + + describe('with filtered columns', () => { + // Create DataFrame with only specific columns + const df = DataFrame.fromRecords(testData, { columns: ['name', 'city', 'tags'] }); + + test('should filter rows based on string columns', () => { + const result = df.where('city', 'contains', 'Chicago'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(1); + // Check only the presence of the necessary data, since the where implementation saves all columns + const resultArray = result.toArray(); + expect(resultArray.length).toBe(1); + expect(resultArray[0].name).toBe('Charlie'); + expect(resultArray[0].city).toBe('Chicago'); + expect(resultArray[0].tags).toEqual(['manager']); + }); + + test('should filter rows based on array columns', () => { + const result = df.where('tags', 'contains', 'dev'); + + // Check that the filtered data is correct + expect(result.rowCount).toBe(2); + // Check only the presence of the necessary data, since the where implementation saves all columns + const resultArray = result.toArray(); + expect(resultArray.length).toBe(2); + expect(resultArray[0].name).toBe('Alice'); + expect(resultArray[0].city).toBe('New York'); + expect(resultArray[0].tags).toEqual(['dev', 'js']); + expect(resultArray[1].name).toBe('Bob'); + expect(resultArray[1].city).toBe('San Francisco'); + expect(resultArray[1].tags).toEqual(['dev', 'python']); + }); + }); +});