From b0ac551b54b9dd655e3c7454beac99fac3287aff Mon Sep 17 00:00:00 2001 From: Shoaib Samim Date: Sun, 18 Jan 2026 03:30:54 +0530 Subject: [PATCH 1/4] Add readParquetFiles for partitioned parquet datasets --- dataframe.cabal | 12 +++++----- src/DataFrame.hs | 2 +- src/DataFrame/IO/Parquet.hs | 45 ++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/dataframe.cabal b/dataframe.cabal index 615720c..593ab06 100644 --- a/dataframe.cabal +++ b/dataframe.cabal @@ -77,10 +77,10 @@ library DataFrame.IO.JSON, DataFrame.IO.Unstable.CSV, DataFrame.IO.Parquet, - DataFrame.IO.Parquet.Binary - DataFrame.IO.Parquet.Dictionary - DataFrame.IO.Parquet.Levels - DataFrame.IO.Parquet.Thrift + DataFrame.IO.Parquet.Binary, + DataFrame.IO.Parquet.Dictionary, + DataFrame.IO.Parquet.Levels, + DataFrame.IO.Parquet.Thrift, DataFrame.IO.Parquet.ColumnStatistics, DataFrame.IO.Parquet.Compression, DataFrame.IO.Parquet.Encoding, @@ -117,7 +117,9 @@ library zlib >= 0.5 && < 1, zstd >= 0.1.2.0 && < 0.2, mmap >= 0.5.8 && < 0.6, - parallel >= 3.2.2.0 && < 5 + parallel >= 3.2.2.0 && < 5, + filepath >= 1.4 && < 2, + hs-source-dirs: src c-sources: cbits/process_csv.c include-dirs: cbits diff --git a/src/DataFrame.hs b/src/DataFrame.hs index d11b760..c50c43b 100644 --- a/src/DataFrame.hs +++ b/src/DataFrame.hs @@ -251,7 +251,7 @@ import DataFrame.IO.CSV as CSV ( writeCsv, writeSeparated, ) -import DataFrame.IO.Parquet as Parquet (readParquet) +import DataFrame.IO.Parquet as Parquet (readParquet, readParquetFiles) import DataFrame.IO.Unstable.CSV as UnstableCSV ( fastReadCsvUnstable, fastReadTsvUnstable, diff --git a/src/DataFrame/IO/Parquet.hs b/src/DataFrame/IO/Parquet.hs index dca9b47..7e466b8 100644 --- a/src/DataFrame/IO/Parquet.hs +++ b/src/DataFrame/IO/Parquet.hs @@ -2,7 +2,10 @@ {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TypeApplications #-} -module DataFrame.IO.Parquet where +module DataFrame.IO.Parquet + ( readParquet + , readParquetFiles + ) where import Control.Monad import Data.Bits @@ -18,12 +21,23 @@ import Data.Word import qualified DataFrame.Internal.Column as DI import DataFrame.Internal.DataFrame (DataFrame) import qualified DataFrame.Operations.Core as DI +import DataFrame.Operations.Merge () + import DataFrame.IO.Parquet.Dictionary import DataFrame.IO.Parquet.Levels import DataFrame.IO.Parquet.Page import DataFrame.IO.Parquet.Thrift import DataFrame.IO.Parquet.Types +import System.Directory + ( doesDirectoryExist + , listDirectory + ) + +import System.FilePath + ( () + , takeExtension + ) {- | Read a parquet file from path and load it into a dataframe. @@ -107,6 +121,35 @@ readParquet path = do pure $ DI.fromNamedColumns orderedColumns +readParquetFiles :: FilePath -> IO DataFrame +readParquetFiles path = do + isDir <- doesDirectoryExist path + + allFiles <- if isDir + then getRecursiveContents path + else pure [path] + + let parquetFiles = filter (\f -> takeExtension f == ".parquet") allFiles + + case parquetFiles of + [] -> error $ "readParquetFiles: no parquet files found in " ++ path + _ -> do + dfs <- mapM readParquet parquetFiles + pure (mconcat dfs) + +getRecursiveContents :: FilePath -> IO [FilePath] +getRecursiveContents topPath = do + names <- listDirectory topPath + paths <- forM names $ \name -> do + let path = topPath name + isDirectory <- doesDirectoryExist path + if isDirectory + then getRecursiveContents path + else return [path] + return (concat paths) + + + readMetadataFromPath :: FilePath -> IO FileMetadata readMetadataFromPath path = do contents <- BSO.readFile path From 0448c3e1ad214d33473026fa3c2f26d63576b162 Mon Sep 17 00:00:00 2001 From: Shoaib Samim Date: Tue, 20 Jan 2026 00:59:04 +0530 Subject: [PATCH 2/4] Add readParquetFiles using glob for partitioned parquet datasets --- dataframe.cabal | 1 + src/DataFrame/IO/Parquet.hs | 44 ++++++++++++++----------------------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/dataframe.cabal b/dataframe.cabal index 593ab06..5cb8921 100644 --- a/dataframe.cabal +++ b/dataframe.cabal @@ -119,6 +119,7 @@ library mmap >= 0.5.8 && < 0.6, parallel >= 3.2.2.0 && < 5, filepath >= 1.4 && < 2, + Glob >= 0.10 && < 1, hs-source-dirs: src c-sources: cbits/process_csv.c diff --git a/src/DataFrame/IO/Parquet.hs b/src/DataFrame/IO/Parquet.hs index 7e466b8..fa41489 100644 --- a/src/DataFrame/IO/Parquet.hs +++ b/src/DataFrame/IO/Parquet.hs @@ -22,6 +22,7 @@ import qualified DataFrame.Internal.Column as DI import DataFrame.Internal.DataFrame (DataFrame) import qualified DataFrame.Operations.Core as DI import DataFrame.Operations.Merge () +import System.FilePath.Glob (glob) import DataFrame.IO.Parquet.Dictionary @@ -29,15 +30,9 @@ import DataFrame.IO.Parquet.Levels import DataFrame.IO.Parquet.Page import DataFrame.IO.Parquet.Thrift import DataFrame.IO.Parquet.Types -import System.Directory - ( doesDirectoryExist - , listDirectory - ) +import System.Directory (doesDirectoryExist) -import System.FilePath - ( () - , takeExtension - ) +import System.FilePath (()) {- | Read a parquet file from path and load it into a dataframe. @@ -124,31 +119,24 @@ readParquet path = do readParquetFiles :: FilePath -> IO DataFrame readParquetFiles path = do isDir <- doesDirectoryExist path - - allFiles <- if isDir - then getRecursiveContents path - else pure [path] - let parquetFiles = filter (\f -> takeExtension f == ".parquet") allFiles + let pattern = + if isDir + then path "*" + else path - case parquetFiles of - [] -> error $ "readParquetFiles: no parquet files found in " ++ path - _ -> do - dfs <- mapM readParquet parquetFiles - pure (mconcat dfs) + matches <- glob pattern -getRecursiveContents :: FilePath -> IO [FilePath] -getRecursiveContents topPath = do - names <- listDirectory topPath - paths <- forM names $ \name -> do - let path = topPath name - isDirectory <- doesDirectoryExist path - if isDirectory - then getRecursiveContents path - else return [path] - return (concat paths) + files <- filterM (fmap not . doesDirectoryExist) matches + case files of + [] -> + error $ + "readParquetFiles: no parquet files found for " ++ path + _ -> do + dfs <- mapM readParquet files + pure (mconcat dfs) readMetadataFromPath :: FilePath -> IO FileMetadata readMetadataFromPath path = do From daacbd997a460e07379f8bb94876463959ae36da Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Mon, 19 Jan 2026 12:27:27 -0800 Subject: [PATCH 3/4] Refactor variable naming for pattern matching --- src/DataFrame/IO/Parquet.hs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/DataFrame/IO/Parquet.hs b/src/DataFrame/IO/Parquet.hs index fa41489..a2ba6d9 100644 --- a/src/DataFrame/IO/Parquet.hs +++ b/src/DataFrame/IO/Parquet.hs @@ -120,12 +120,9 @@ readParquetFiles :: FilePath -> IO DataFrame readParquetFiles path = do isDir <- doesDirectoryExist path - let pattern = - if isDir - then path "*" - else path + let pat = if isDir then path "*" else path - matches <- glob pattern + matches <- glob pat files <- filterM (fmap not . doesDirectoryExist) matches From 266df879364fbd0759e90615a14cf0595baee226 Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Mon, 19 Jan 2026 12:41:45 -0800 Subject: [PATCH 4/4] Fix formatting of module exports in Parquet.hs --- src/DataFrame/IO/Parquet.hs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/DataFrame/IO/Parquet.hs b/src/DataFrame/IO/Parquet.hs index a2ba6d9..7079ece 100644 --- a/src/DataFrame/IO/Parquet.hs +++ b/src/DataFrame/IO/Parquet.hs @@ -2,10 +2,10 @@ {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TypeApplications #-} -module DataFrame.IO.Parquet - ( readParquet - , readParquetFiles - ) where +module DataFrame.IO.Parquet ( + readParquet, + readParquetFiles, +) where import Control.Monad import Data.Bits @@ -130,7 +130,6 @@ readParquetFiles path = do [] -> error $ "readParquetFiles: no parquet files found for " ++ path - _ -> do dfs <- mapM readParquet files pure (mconcat dfs)