diff --git a/sources/Utils.swift b/sources/Utils.swift
index a1cfdb7..c18d26d 100644
--- a/sources/Utils.swift
+++ b/sources/Utils.swift
@@ -76,3 +76,13 @@ extension Array where Element:Hashable {
     return lookup
   }
 }
+
+extension Sequence where Element: Hashable {
+  /// Creates a dictionary mapping elements of the sequence to the number of times they occur in the sequence.
+  /// - returns: The dictionary of occurrence counts.
+  func occurrenceCounts() -> [Element: Int] {
+    reduce(into: [:]) { partialResult, element in
+      partialResult[element, default: 0] += 1
+    }
+  }
+}
diff --git a/sources/imperative/reader/internal/DialectDetector.swift b/sources/imperative/reader/internal/DialectDetector.swift
new file mode 100644
index 0000000..1f9e4bc
--- /dev/null
+++ b/sources/imperative/reader/internal/DialectDetector.swift
@@ -0,0 +1,249 @@
+// Parts of the code in this file are adapted from the CleverCSV Python library.
+// See: https://github.com/alan-turing-institute/CleverCSV
+
+/*
+ Copyright (c) 2018 The Alan Turing Institute
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+/// Provides the means for detecting a CSV file's dialect.
+enum DialectDetector {
+  private static let fieldDelimiters: [Unicode.Scalar] = [",", ";", "\t", "|"]
+
+  /// Detects the dialect used in the provided CSV file.
+  ///
+  /// A dialect describes the way in which a CSV file is formatted, i.e. which field
+  /// delimiter, row delimiter, and escape character are used.
+  ///
+  /// - Parameter stringScalars: The raw CSV data.
+  /// - Returns: The detected dialect.
+  static func detectDialect(stringScalars: [UnicodeScalar]) -> Dialect {
+    let dialects = Self.fieldDelimiters.map { Dialect(fieldDelimiter: $0) }
+
+    var maxConsistency = -Double.infinity
+    var scores: [Dialect: Double] = [:]
+
+    // TODO: Sort dialects from most to least probable?
+    for dialect in dialects {
+      let patternScore = Self.calculatePatternScore(stringScalars: stringScalars, dialect: dialect)
+
+      if patternScore < maxConsistency {
+        // Skip the computation of the type score for dialects with a low pattern score.
+        continue
+      }
+      // TODO: Calculate type score?
+      let typeScore = 1.0
+      let consistencyScore = patternScore * typeScore
+      maxConsistency = max(maxConsistency, consistencyScore)
+      scores[dialect] = consistencyScore
+    }
+
+    let best = scores.max { a, b in a.value < b.value }
+
+    return best?.key ?? Dialect(fieldDelimiter: ",")
+  }
+
+  private static let eps = 0.001
+
+  /// Calculates a score for the given dialect by analyzing the row patterns that result when interpreting the CSV data using that dialect.
+  ///
+  /// The correct dialect is expected to produce many rows with the same pattern.
+  /// The pattern score favors row patterns that occur often and contain many fields, and it penalizes dialects that produce many distinct row patterns.
+  ///
+  /// - parameter stringScalars: The raw CSV data.
+  /// - parameter dialect: A dialect for which to calculate the score.
+  /// - returns: The calculated pattern score for the given dialect.
+  static func calculatePatternScore(stringScalars: [UnicodeScalar], dialect: Dialect) -> Double {
+    let (abstractions, _) = Self.makeAbstraction(stringScalars: stringScalars, dialect: dialect)
+
+#warning("TODO: Break ties based on generated errors")
+
+    let rowPatternCounts: [ArraySlice<Abstraction>: Int] = abstractions
+      .split(separator: .rowDelimiter)
+      .occurrenceCounts()
+
+    var score = 0.0
+    for (rowPattern, count) in rowPatternCounts {
+      let fieldCount = Double(rowPattern.split(separator: .fieldDelimiter).count)
+      score += Double(count) * max(Self.eps, fieldCount - 1.0) / fieldCount
+    }
+    score /= Double(rowPatternCounts.count)
+
+    return score
+  }
+
+  /// Describes a CSV file's formatting.
+  struct Dialect: Hashable {
+    let fieldDelimiter: Unicode.Scalar
+    let rowDelimiter: Unicode.Scalar = "\n"
+    let escapeCharacter: Unicode.Scalar = "\""
+  }
+}
+
+// MARK: -
+
+extension DialectDetector {
+  /// An abstracted piece of CSV data.
+  enum Abstraction: Character, Hashable {
+    case cell = "C", fieldDelimiter = "D", rowDelimiter = "R"
+
+    /// The type of error reported by `makeAbstraction`.
+    enum Error: Swift.Error {
+      /// An escape character, e.g. a quote, occurred in an invalid place.
+      ///
+      /// Example:
+      /// ```
+      /// foo,bar"wrong",baz
+      /// ```
+      case invalidEscapeCharacterPosition
+
+      /// The last escaped field was not closed due to an uneven number of escape characters.
+      ///
+      /// Example:
+      /// ```
+      /// foo,bar,"baz
+      /// ```
+      case unbalancedEscapeCharacters
+    }
+  }
+
+  /// Builds an abstraction of the CSV data by parsing it with the provided dialect.
+  ///
+  /// For example, consider the following CSV data:
+  /// ```
+  /// one,two,three
+  /// foo,funny ;),bar
+  /// ```
+  /// Assuming a field delimiter of `,` this produces the following abstraction:
+  /// ```
+  /// CDCDC
+  /// CDCDC
+  /// ```
+  /// Here, `C` represents a cell (field), `D` stands for a field delimiter, and row delimiters are shown as line breaks.
+  ///
+  /// However, when we instead consider `;` as the field delimiter, the following abstraction is produced:
+  /// ```
+  /// C
+  /// CDC
+  /// ```
+  /// This abstraction can then be used to guess the delimiter, because the correct
+  /// delimiter will produce an abstraction with many identical row patterns.
+  ///
+  /// - parameter stringScalars: The raw CSV data.
+  /// - parameter dialect: The dialect to use for speculatively interpreting the CSV data.
+  /// - returns: An array of cells and delimiters, along with any `Abstraction.Error`s
+  ///   encountered while building the abstraction.
+  /// - todo: Currently assuming that delimiters can only be made up of a single Unicode scalar.
+  static func makeAbstraction(stringScalars: [Unicode.Scalar], dialect: Dialect) -> ([Abstraction], [Abstraction.Error]) {
+    var abstraction: [Abstraction] = []
+    var errors: [Abstraction.Error] = []
+    var escaped = false
+
+    var iter = stringScalars.makeIterator()
+    var queuedNextScalar: Unicode.Scalar? = nil
+    while true {
+      guard let scalar = queuedNextScalar ?? iter.next() else { break }
+      queuedNextScalar = nil
+
+      switch scalar {
+      case dialect.fieldDelimiter:
+        if escaped { continue }
+
+        switch abstraction.last {
+        // - two consecutive field delimiters OR
+        // - field delimiter after row delimiter, i.e. at start of line OR
+        // - field delimiter at the very beginning, i.e. at start of first line
+        // all imply an empty cell
+        case .fieldDelimiter, .rowDelimiter, nil:
+          abstraction.append(.cell)
+          fallthrough
+        case .cell:
+          abstraction.append(.fieldDelimiter)
+        }
+
+      case dialect.rowDelimiter:
+        if escaped { continue }
+
+        switch abstraction.last {
+        // - two consecutive row delimiters
+        // - row delimiter after field delimiter
+        // - row delimiter at the very beginning, i.e. at start of first line
+        // all imply an empty cell
+        case .rowDelimiter, .fieldDelimiter, nil:
+          abstraction.append(.cell)
+          fallthrough
+        case .cell:
+          abstraction.append(.rowDelimiter)
+        }
+
+      case dialect.escapeCharacter:
+        if !escaped {
+          if abstraction.last == .cell {
+            // encountered an escape character after the beginning of a field
+            errors.append(.invalidEscapeCharacterPosition)
+          }
+          escaped = true
+          continue
+        }
+
+        // we are in an escaped context, so the encountered escape character
+        // is either the end of the field or must be followed by another escape character
+        let nextScalar = iter.next()
+
+        switch nextScalar {
+        case dialect.escapeCharacter:
+          // the escape character was escaped
+          continue
+        case nil:
+          // end of file
+          escaped = false
+        case dialect.fieldDelimiter, dialect.rowDelimiter:
+          // end of field
+          escaped = false
+          queuedNextScalar = nextScalar
+        default:
+          // encountered a non-delimiter character after the field ended
+          errors.append(.invalidEscapeCharacterPosition)
+          escaped = false
+          queuedNextScalar = nextScalar
+        }
+
+      default:
+        switch abstraction.last {
+        case .cell:
+          continue
+        case .fieldDelimiter, .rowDelimiter, nil:
+          abstraction.append(.cell)
+        }
+      }
+    }
+
+    if abstraction.last == .fieldDelimiter {
+      abstraction.append(.cell)
+    }
+
+    if escaped {
+      // reached EOF without closing the last escaped field
+      errors.append(.unbalancedEscapeCharacters)
+    }
+
+    return (abstraction, errors)
+  }
+}
diff --git a/sources/imperative/reader/internal/ReaderInference.swift b/sources/imperative/reader/internal/ReaderInference.swift
index 2abe5c7..bb4765b 100644
--- a/sources/imperative/reader/internal/ReaderInference.swift
+++ b/sources/imperative/reader/internal/ReaderInference.swift
@@ -146,14 +146,45 @@ extension CSVReader {
   /// - throws: `CSVError` exclusively.
   /// - todo: Implement the field and row inferences.
   static func inferDelimiters(field: Delimiter.Field, row: Delimiter.Row, decoder: ScalarDecoder, buffer: ScalarBuffer) throws -> Delimiter.Scalars {
+    let fieldDelimiter: Delimiter.Field
+    let rowDelimiter: Delimiter.Row
+
     switch (field.isKnown, row.isKnown) {
     case (true, true):
-      guard let delimiters = Delimiter.Scalars(field: field.scalars, row: row.scalars) else {
-        throw Error._invalidDelimiters(field: field, row: row)
-      }
-      return delimiters
+      fieldDelimiter = field
+      rowDelimiter = row
+
+    case (false, true):
+      fieldDelimiter = try Self.inferFieldDelimiter(decoder: decoder, buffer: buffer)
+      rowDelimiter = row
+
     default: throw Error._unsupportedInference()
     }
+
+    guard let delimiters = Delimiter.Scalars(field: fieldDelimiter.scalars, row: rowDelimiter.scalars) else {
+      throw Error._invalidDelimiters(field: fieldDelimiter, row: rowDelimiter)
+    }
+
+    return delimiters
+  }
+
+  /// Tries to infer the field delimiter from the raw data.
+  /// - parameter decoder: The instance providing the input `Unicode.Scalar`s.
+  /// - parameter buffer: Small buffer used to store `Unicode.Scalar` values that have been read from the input, but haven't yet been processed.
+  /// - returns: The inferred `Delimiter.Field`.
+  static func inferFieldDelimiter(decoder: ScalarDecoder, buffer: ScalarBuffer) rethrows -> Delimiter.Field {
+    let sampleLength = 50
+    var tmp: [UnicodeScalar] = []
+    tmp.reserveCapacity(sampleLength)
+    while tmp.count < sampleLength {
+      guard let scalar = try buffer.next() ?? decoder() else { break }
+      tmp.append(scalar)
+    }
+
+    let detectedDialect = DialectDetector.detectDialect(stringScalars: tmp)
+    buffer.preppend(scalars: tmp)
+
+    return Delimiter.Field(unicodeScalarLiteral: detectedDialect.fieldDelimiter)
+  }
 }
@@ -166,10 +197,10 @@ fileprivate extension CSVReader.Error {
              help: "Set different delimiters for fields and rows.",
              userInfo: ["Field delimiter": field.scalars, "Row delimiters": row.scalars])
   }
-  /// Delimiter inference is not yet implemented.
+  /// Row delimiter inference is not yet implemented.
   static func _unsupportedInference() -> CSVError {
     CSVError(.invalidConfiguration,
-             reason: "Delimiter inference is not yet supported by this library",
+             reason: "Row delimiter inference is not yet supported by this library",
              help: "Specify a concrete delimiter or get in contact with the maintainer")
   }
 }
diff --git a/tests/imperative/DialectDetectorTests.swift b/tests/imperative/DialectDetectorTests.swift
new file mode 100644
index 0000000..77b13d2
--- /dev/null
+++ b/tests/imperative/DialectDetectorTests.swift
@@ -0,0 +1,147 @@
+@testable import CodableCSV
+import XCTest
+
+final class DialectDetectorTests: XCTestCase {}
+
+// MARK: - Tests for detectDialect
+
+extension DialectDetectorTests {
+  func test_detectDialect() throws {
+    // Adapted from CPython
+    // See: https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/test/test_csv.py#L1039
+    let dialects = [
+      (
+        """
+        Harry's, Arlington Heights, IL, 2/1/03, Kimi Hayes
+        Shark City, Glendale Heights, IL, 12/28/02, Prezence
+        Tommy's Place, Blue Island, IL, 12/28/02, Blue Sunday/White Crow
+        Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
+        """,
+        DialectDetector.Dialect(fieldDelimiter: ",")
+      ),
+//      (
+//        """
+//        'Harry''s':'Arlington Heights':'IL':'2/1/03':'Kimi Hayes'
+//        'Shark City':'Glendale Heights':'IL':'12/28/02':'Prezence'
+//        'Tommy''s Place':'Blue Island':'IL':'12/28/02':'Blue Sunday/White Crow'
+//        'Stonecutters ''Seafood'' and Chop House':'Lemont':'IL':'12/19/02':'Week Back'
+//        """,
+//        DialectDetector.Dialect(fieldDelimiter: ":")
+//      ),
+    ]
+
+    for (csv, expectedDialect) in dialects {
+      let dialect = DialectDetector.detectDialect(stringScalars: Array(csv.unicodeScalars))
+      XCTAssertEqual(dialect, expectedDialect, csv)
+    }
+  }
+}
+
+// MARK: - Tests for calculatePatternScore
+
+extension DialectDetectorTests {
+  // Adapted from CleverCSV
+  // See: https://github.com/alan-turing-institute/CleverCSV/blob/master/tests/test_unit/test_detect_pattern.py#L160-L195
+  func test_calculatePatternScore() throws {
+    let dialectScores: [(DialectDetector.Dialect, Double)] = [
+      (.init(fieldDelimiter: ","), 7 / 4),
+      (.init(fieldDelimiter: ";"), 10 / 3),
+    ]
+    let csv = #"""
+    7,5; Mon, Jan 12;6,40
+    100; Fri, Mar 21;8,23
+    8,2; Thu, Sep 17;2,71
+    538,0;;7,26
+    "NA"; Wed, Oct 4;6,93
+    """#
+
+    for (dialect, expectedScore) in dialectScores {
+      let score = DialectDetector.calculatePatternScore(stringScalars: Array(csv.unicodeScalars), dialect: dialect)
+      XCTAssertEqual(score, expectedScore, "Delimiter: \(dialect.fieldDelimiter)")
+    }
+  }
+
+  /// Demonstrates that it is useful to check for the correctness of the CSV
+  /// that results from a particular dialect because there may be instances where
+  /// two field delimiters both get a score of 1.0 despite one of them leading to
+  /// a valid CSV and the other leading to a malformed CSV.
+  func test_calculatePatternScore_TieBreaking() {
+    let csv = """
+    foo;,bar
+    baz;,"boo"
+    """
+
+    let dialectErrors: [(DialectDetector.Dialect, [DialectDetector.Abstraction.Error])] = [
+      (.init(fieldDelimiter: ","), []),
+      (.init(fieldDelimiter: ";"), [.invalidEscapeCharacterPosition]),
+    ]
+
+    for (dialect, expectedErrors) in dialectErrors {
+      let msg = "Delimiter: \(dialect.fieldDelimiter)"
+      let scalars = Array(csv.unicodeScalars)
+      let score = DialectDetector.calculatePatternScore(stringScalars: scalars, dialect: dialect)
+      XCTAssertEqual(score, 1.0, msg)
+      let (abstraction, errors) = DialectDetector.makeAbstraction(stringScalars: scalars, dialect: dialect)
+      XCTAssertEqual(abstraction, [.cell, .fieldDelimiter, .cell, .rowDelimiter, .cell, .fieldDelimiter, .cell])
+      XCTAssertEqual(errors, expectedErrors, msg)
+    }
+  }
+}
+
+// MARK: - Tests for makeAbstraction
+
+extension DialectDetectorTests {
+  func test_makeAbstraction() throws {
+    let abstractions: [(String, [DialectDetector.Abstraction])] = [
+      ("", []),
+      ("foo", [.cell]),
+
+      (",", [.cell, .fieldDelimiter, .cell]),
+      (",,", [.cell, .fieldDelimiter, .cell, .fieldDelimiter, .cell]),
+
+      ("\n", [.cell, .rowDelimiter]),
+      ("\n\n", [.cell, .rowDelimiter, .cell, .rowDelimiter]),
+
+      (",\n,", [.cell, .fieldDelimiter, .cell, .rowDelimiter, .cell, .fieldDelimiter, .cell]),
+      (",foo\n,bar", [.cell, .fieldDelimiter, .cell, .rowDelimiter, .cell, .fieldDelimiter, .cell]),
+    ]
+    let dialect = DialectDetector.Dialect(fieldDelimiter: ",")
+
+    for (csv, expected) in abstractions {
+      let (abstraction, _) = DialectDetector.makeAbstraction(stringScalars: Array(csv.unicodeScalars), dialect: dialect)
+      XCTAssertEqual(abstraction, expected, csv)
+    }
+  }
+
+  func test_makeAbstraction_HandlesEscaping() throws {
+    let escapingAbstractions: [(String, [DialectDetector.Abstraction])] = [
+      (#" "foo",bar "#, [.cell, .fieldDelimiter, .cell]),
+      (#" "foo ""quoted"" \n ,bar",baz "#, [.cell, .fieldDelimiter, .cell]),
+      (#" a,"bc""d""e""f""a",\n "#, [.cell, .fieldDelimiter, .cell, .fieldDelimiter, .cell]),
+    ]
+    let dialect = DialectDetector.Dialect(fieldDelimiter: ",")
+    for (csv, expected) in escapingAbstractions {
+      let strippedCSV = csv.trimmingCharacters(in: .whitespaces)
+      let (abstraction, _) = DialectDetector.makeAbstraction(stringScalars: Array(strippedCSV.unicodeScalars), dialect: dialect)
+      XCTAssertEqual(abstraction, expected, csv)
+    }
+  }
+
+  func test_makeAbstraction_HandlesInvalidEscaping() throws {
+    let dialect = DialectDetector.Dialect(fieldDelimiter: ",")
+    let malformedCSVs: [(String, [DialectDetector.Abstraction])] = [
+      // escaping
+      (#" foo,x"bar" "#, [.cell, .fieldDelimiter, .cell]),
+      (#" foo,"bar"x "#, [.cell, .fieldDelimiter, .cell]),
+      (#" foo,"bar "#, [.cell, .fieldDelimiter, .cell]),
+      // different number of fields per row
+      ("foo,bar\n\n", [.cell, .fieldDelimiter, .cell, .rowDelimiter, .cell, .rowDelimiter]),
+    ]
+
+    for (csv, expected) in malformedCSVs {
+      let strippedCSV = csv.trimmingCharacters(in: .whitespaces)
+      let (abstraction, _) = DialectDetector.makeAbstraction(stringScalars: Array(strippedCSV.unicodeScalars), dialect: dialect)
+      XCTAssertEqual(abstraction, expected, strippedCSV)
+    }
+  }
+}
diff --git a/tests/imperative/ReaderInferenceTests.swift b/tests/imperative/ReaderInferenceTests.swift
new file mode 100644
index 0000000..a1c12a4
--- /dev/null
+++ b/tests/imperative/ReaderInferenceTests.swift
@@ -0,0 +1,57 @@
+@testable import CodableCSV
+import XCTest
+
+final class ReaderInferenceTests: XCTestCase {
+  private enum _TestData {
+    /// A CSV row representing a header row (4 fields).
+    static let headers = ["seq", "Name", "Country", "Number Pair"]
+    /// A small number of regular CSV rows (4 fields per row).
+    static let content = [["1", "Marcos", "Spain", "99"],
+                          ["2", "Kina", "Papua New Guinea", "88"],
+                          ["3", "Alex", "Germany", "77"],
+                          ["4", "Marine-Anaïs", "France", "66"]]
+
+    /// Some longer CSV rows.
+    static let longContent = [
+      ["ff60766c-08e7-4db4-bfd3-dcc60c15251f", "foofoofoo", "barbarbar", "bazbazbaz"],
+      ["f9165d00-03fc-4d8d-838c-1fba1d26d92d", "foofoofoo", "barbarbar", "bazbazbaz"],
+    ]
+
+    /// Encodes the test data into a Swift `String`.
+    /// - parameter sample: The rows of CSV content to encode.
+    /// - parameter delimiters: Unicode scalars to use to mark fields and rows.
+    /// - returns: Swift `String` representing the CSV file.
+    static func toCSV(_ sample: [[String]], delimiters: Delimiter.Pair) -> String {
+      let (f, r) = (delimiters.field.description, delimiters.row.description)
+      return sample.map { $0.joined(separator: f) }.joined(separator: r).appending(r)
+    }
+  }
+}
+
+extension ReaderInferenceTests {
+  func testInference() throws {
+    let fieldDelimiters: [Delimiter.Field] = [",", ";", "|", "\t"]
+
+    var configuration = CSVReader.Configuration()
+    configuration.delimiters = (field: nil, row: "\n")
+
+    for fieldDelimiter in fieldDelimiters {
+      let testString = _TestData.toCSV(_TestData.content, delimiters: (fieldDelimiter, "\n"))
+      let result = try CSVReader.decode(input: testString, configuration: configuration)
+      XCTAssertEqual(result.rows, _TestData.content, "Delimiter: \(fieldDelimiter)")
+    }
+  }
+
+  func testInference_longRows() throws {
+    let fieldDelimiters: [Delimiter.Field] = [",", ";", "|", "\t"]
+
+    var configuration = CSVReader.Configuration()
+    configuration.delimiters = (field: nil, row: "\n")
+
+    for fieldDelimiter in fieldDelimiters {
+      let testString = _TestData.toCSV(_TestData.longContent, delimiters: (fieldDelimiter, "\n"))
+      let result = try CSVReader.decode(input: testString, configuration: configuration)
+      XCTAssertEqual(result.rows, _TestData.longContent)
+    }
+  }
+}
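
Taken together, these changes let `CSVReader` infer the field delimiter when the configuration leaves it unspecified, while row-delimiter inference still fails with `_unsupportedInference`. The snippet below is a minimal usage sketch assembled from the public calls exercised in `ReaderInferenceTests.swift`; the sample CSV and the `readWithInferredDelimiter` helper are illustrative only and not part of the library.

```swift
import CodableCSV

/// Illustrative helper: decodes CSV without naming a field delimiter,
/// so the reader falls back to the DialectDetector-based inference.
func readWithInferredDelimiter(_ csv: String) throws -> [[String]] {
  var configuration = CSVReader.Configuration()
  // A `nil` field delimiter triggers inference; the row delimiter must stay explicit.
  configuration.delimiters = (field: nil, row: "\n")
  let result = try CSVReader.decode(input: csv, configuration: configuration)
  return result.rows
}

// With this sample, ";" should score highest: every row abstracts to the same
// "CDCDC" pattern, whereas "," leaves each row as a single cell.
let sample = """
seq;Name;Country
1;Marcos;Spain
2;Kina;Papua New Guinea
"""
// e.g. in a test or playground:
// let rows = try readWithInferredDelimiter(sample)
// rows == [["seq", "Name", "Country"], ["1", "Marcos", "Spain"], ["2", "Kina", "Papua New Guinea"]]
```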