Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,7 @@ Steve Pitt
* Showing only distinct data for a column
* Sorting by the specified columns
* Data generation

## Planned Features
* Reduce functions
* Diffing two tables by the specified columns

# Installation

Expand Down Expand Up @@ -119,6 +117,21 @@ table file1.csv --join file2.csv --on 'id=product_id'
table ./test-data/table-format.out --sort "!available,id"
```

* Diff two tables by the 'id' column:

```bash
table file1.csv --diff file2.csv --on 'id=id'
```
Produces:

```
╭─────────┬────┬────────────┬───────────╮
│ _source │ id │ first_name │ last_name │
├─────────┼────┼────────────┼───────────┤
│ left    │ 2  │ Mary       │ McAdams   │
╰─────────┴────┴────────────┴───────────╯
```

* Generate table with test data

```bash
Expand Down Expand Up @@ -150,6 +163,12 @@ INSERT INTO orders (id, amound, status) VALUES ('64FC986A-93A1-4579-B7F5-896CD77
INSERT INTO orders (id, amound, status) VALUES ('74CB99C8-D23F-4081-901B-8634187E4269', 529, 'ok');
```

* For working with JSONL files (one JSON object per line) it can be combined with the `jq` tool (no nesting yet):

```bash
cat objects.jsonl | jq --slurp -r '(map(keys) | add | unique) as $cols | map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv' | table
```

## Building from source

```bash
Expand Down
103 changes: 103 additions & 0 deletions Sources/table/Diff.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import Foundation

/// Selects which side(s) of a two-table diff are reported.
enum DiffMode: String {
    case left = "left", right = "right", both = "both"

    /// Converts a mode name into a `DiffMode`, ignoring letter case.
    ///
    /// - Parameter str: The mode name to parse.
    /// - Returns: The mode whose raw value equals the lowercased `str`.
    /// - Throws: `RuntimeError` when `str` does not name a supported mode.
    static func fromString(_ str: String) throws -> DiffMode {
        if let parsed = DiffMode(rawValue: str.lowercased()) {
            return parsed
        }
        throw RuntimeError("Invalid diff mode: \(str). Supported modes: left, right, both. Default is both.")
    }
}

/// Holds the data of the second table that is needed to diff it against a first table.
///
/// The values of `secondColumn` are collected into `rowsCache`; the complete rows are
/// additionally retained in `secondTableRows` when the mode is `.right` or `.both`.
class Diff {
    let firstColumn: String
    let secondColumn: String
    let secondColIndex: Int
    let matchTable: ParsedTable
    let mode: DiffMode

    var loaded: Bool = false
    var rowsCache: Set<String> = []
    // Rows of the second table, kept only for right/both modes
    var secondTableRows: [Row] = []

    /// Creates a diff definition without reading any rows yet.
    ///
    /// - Throws: `RuntimeError` when `secondColumn` is not present in the header of `matchTable`.
    init(firstColumn: String, secondColumn: String, matchTable: ParsedTable, mode: DiffMode) throws {
        self.firstColumn = firstColumn
        self.secondColumn = secondColumn
        self.matchTable = matchTable
        self.mode = mode
        self.secondColIndex = try matchTable.header
            .index(ofColumn: secondColumn)
            .orThrow(RuntimeError("Column \(secondColumn) is not found in second table"))

        debug("Diffing tables on columns \(firstColumn)=\(secondColumn) with mode: \(mode.rawValue)")
    }

    /// Tells whether the value of `firstColumn` in `row` was seen in the second table.
    ///
    /// Returns `false` when the row has no value for `firstColumn`.
    func exists(row: Row) -> Bool {
        if let key = row[firstColumn] {
            return rowsCache.contains(key)
        }
        return false
    }

    /// Reads the second table until it is exhausted, filling `rowsCache`
    /// (and `secondTableRows` for right/both modes), then returns `self`.
    func load() throws -> Diff {
        let keepRows = mode == .right || mode == .both

        while let secondRow = try matchTable.next() {
            rowsCache.insert(secondRow[secondColIndex])
            if keepRows {
                secondTableRows.append(secondRow)
            }
        }

        loaded = true
        debug("Loaded \(rowsCache.count) rows from second table for diff")

        return self
    }

    /// Parses the second table from `file` and builds a loaded `Diff` for it.
    static func parse(_ file: String, diffOn: String?, noInHeader: Bool, firstTable: any Table, mode: String?) throws -> Diff {
        let secondTable = try ParsedTable.parse(
            path: file,
            hasHeader: !noInHeader,
            headerOverride: nil,
            delimeter: nil,
            userTypes: nil
        )
        return try parse(secondTable, diffOn: diffOn, firstTable: firstTable, mode: mode)
    }

    /// Builds a loaded `Diff` for an already parsed second table.
    ///
    /// - Parameters:
    ///   - matchTable: The second table.
    ///   - diffOn: Expression of the form `table1_column=table2_column`; when `nil`,
    ///     the first header entry of each table is used.
    ///   - firstTable: The first table; only its header is consulted here.
    ///   - mode: Name of the diff mode; `.both` is used when `nil`.
    /// - Throws: `RuntimeError` for a malformed expression, an unknown column or an unknown mode.
    static func parse(_ matchTable: ParsedTable, diffOn: String?, firstTable: any Table, mode: String?) throws -> Diff {
        let first: String
        let second: String

        if let diffExpr = diffOn {
            let parts = diffExpr.components(separatedBy: "=")

            guard parts.count == 2 else {
                throw RuntimeError("Diff expression should have format: table1_column=table2_column")
            }

            let leftName = parts[0].trimmingCharacters(in: .whitespacesAndNewlines)
            let rightName = parts[1].trimmingCharacters(in: .whitespacesAndNewlines)

            // Both columns have to exist in their respective tables
            guard firstTable.header.index(ofColumn: leftName) != nil else {
                throw RuntimeError("Column \(leftName) is not found in first table")
            }

            guard matchTable.header.index(ofColumn: rightName) != nil else {
                throw RuntimeError("Column \(rightName) is not found in second table")
            }

            first = leftName
            second = rightName
        } else {
            first = firstTable.header[0]
            second = matchTable.header[0]
        }

        let diffMode: DiffMode
        if let requestedMode = mode {
            diffMode = try DiffMode.fromString(requestedMode)
        } else {
            diffMode = .both
        }

        let diff = try Diff(
            firstColumn: first,
            secondColumn: second,
            matchTable: matchTable,
            mode: diffMode
        )
        return try diff.load()
    }
}

15 changes: 14 additions & 1 deletion Sources/table/MainApp.swift
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,15 @@ struct MainApp: AsyncParsableCommand {
// Path of the second table for --join; the help text is shown to the user by the argument parser.
@Option(name: .customLong("join"), help: "Specifies a second file path to join with the current one. Joining column is the first one for both tables or can be specified by the --on option.")
var joinFile: String?

@Option(name: .customLong("on"), help: "Speficies column names to join on. Requires --join option. Syntax {table1 column}={table 2 column}. Example: --on city_id=id.")
// Column pair used by both --join and --diff, written as {table1 column}={table 2 column}.
@Option(name: .customLong("on"), help: "Specifies column names to join or diff on. Requires --join or --diff option. Syntax {table1 column}={table 2 column}. Example: --on city_id=id.")
var joinCriteria: String?

// Path of the second table for --diff. The default mode is "both" (see --diff-mode), so the
// help text must not claim that only first-table rows are shown by default.
@Option(name: .customLong("diff"), help: "Specifies a second file path to diff with the current one. By default shows rows that exist in only one of the two tables, marked with a source column. Use --diff-mode to control behavior.")
var diffFile: String?

@Option(name: .customLong("diff-mode"), help: "Controls diff behavior. Options: left - show rows only in first table, right - show rows only in second table, both (default) - show rows from both tables with a marker column.")
var diffMode: String?

@Option(name: .customLong("sort"), help: "Sorts output by the specified columns. Example: --sort column1,column2. Use '!' prefix to sort in descending order.")
var sortColumns: String?

Expand Down Expand Up @@ -200,6 +206,13 @@ struct MainApp: AsyncParsableCommand {
table = JoinTableView(table: table, join: try Join.parse(joinFile, joinOn: joinCriteria, firstTable: table))
}

if let diffFile {
if joinFile != nil {
throw RuntimeError("--join and --diff options cannot be used together")
}
table = DiffTableView(table: table, diff: try Diff.parse(diffFile, diffOn: joinCriteria, noInHeader: noInHeader, firstTable: table, mode: diffMode))
}

if !distinctColumns.isEmpty {
try distinctColumns.forEach { if table.header.index(ofColumn: $0) == nil { throw RuntimeError("Column \($0) in distinct clause is not found in the table") } }
table = DistinctTableView(table: table, distinctColumns: distinctColumns)
Expand Down
121 changes: 121 additions & 0 deletions Sources/table/TableView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,127 @@ class SampledTableView: Table {
}
}

/**
 Table view for diffing two tables.

 Behaviour depends on `diff.mode`:
 - `.left`:  streams rows of the first table whose key is absent from the second table.
 - `.right`: yields rows of the second table whose key is absent from the first table.
 - `.both`:  yields the left-only rows followed by the right-only rows, each prefixed
             with a `_source` cell holding "left" or "right".

 For `.right` and `.both` the first table is loaded into memory in the initializer.
 A failure of that load is remembered and rethrown by `next()`, so that an unreadable
 first table is reported instead of silently producing a wrong diff.
 */
class DiffTableView: Table {
    var table: any Table
    let diff: Diff
    let header: Header

    private var memoizedTable: InMemoryTableView?
    private var firstTableCache: Set<String>?
    private var filteredSecondTableRows: [Row] = []
    private var secondTableCursor: Int = 0
    private var firstTableRows: [Row] = []
    private var firstTableCursor: Int = 0
    private var firstTableExhausted: Bool = false
    // Error raised while loading the first table in init; rethrown from next()
    private var loadError: Error?

    init(table: any Table, diff: Diff) {
        self.table = table
        self.diff = diff

        if diff.mode == .both {
            // The marker column comes first, matching the cell order built in withMarker(_:source:)
            self.header = Header(components: ["_source"], types: [.string]) + table.header
        } else {
            self.header = table.header
        }

        if diff.mode == .right || diff.mode == .both {
            memoizedTable = table.memoized()
            do {
                _ = try memoizedTable?.load()
                buildFirstTableCache()
                filterSecondTableRows()
            } catch {
                // The initializer is not throwing; keep the error and surface it on the first next()
                loadError = error
            }
        }
    }

    /// Returns the next row of the diff, or `nil` when there are no more rows.
    ///
    /// - Throws: The error captured while loading the first table (right/both modes),
    ///   or any error thrown by the underlying table (left mode).
    func next() throws -> Row? {
        if let loadError {
            throw loadError
        }

        switch diff.mode {
        case .left:
            // Skip rows whose key is present in the second table
            while let curRow = try table.next() {
                if !diff.exists(row: curRow) {
                    return curRow
                }
            }
            return nil

        case .right:
            return nextFromSecondTable()

        case .both:
            if !firstTableExhausted {
                while firstTableCursor < firstTableRows.count {
                    let curRow = firstTableRows[firstTableCursor]
                    firstTableCursor += 1

                    if !diff.exists(row: curRow) {
                        return withMarker(curRow, source: "left")
                    }
                }
                firstTableExhausted = true
            }

            // Left-only rows are done; continue with the right-only rows
            return nextFromSecondTable()
        }
    }

    /// Collects the keys of the first table (and, in `.both` mode, its rows) from the memoized table.
    private func buildFirstTableCache() {
        guard let memoized = memoizedTable else { return }

        firstTableCache = Set<String>()
        memoized.rewind()

        while let row = memoized.next() {
            if let key = row[diff.firstColumn] {
                firstTableCache?.insert(key)
            }
            if diff.mode == .both {
                firstTableRows.append(row)
            }
        }

        debug("DiffTableView: Loaded \(firstTableCache?.count ?? 0) rows from first table for diff")
    }

    /// Keeps only the second-table rows whose key does not occur in the first table.
    private func filterSecondTableRows() {
        guard let firstCache = firstTableCache else { return }

        filteredSecondTableRows = diff.secondTableRows.filter { row in
            let key = row[diff.secondColIndex]
            return !firstCache.contains(key)
        }

        debug("DiffTableView: Found \(filteredSecondTableRows.count) rows in the second table absent in the first")
    }

    /// Returns the next right-only row, prefixed with the "right" marker in `.both` mode.
    private func nextFromSecondTable() -> Row? {
        guard secondTableCursor < filteredSecondTableRows.count else {
            return nil
        }

        let row = filteredSecondTableRows[secondTableCursor]
        secondTableCursor += 1

        if diff.mode == .both {
            return withMarker(row, source: "right")
        } else {
            return row
        }
    }

    /// Builds a copy of `row` under this view's header with a leading `_source` cell set to `source`.
    private func withMarker(_ row: Row, source: String) -> Row {
        let markerCell = Cell(value: source, type: .string)
        return Row(
            header: header,
            index: row.index,
            cells: [markerCell] + row.components
        )
    }
}

/** Table view fully loaded into memory */
class InMemoryTableView: InMemoryTable {
var table: any Table
Expand Down
Loading