178 changes: 99 additions & 79 deletions dataset-generation/Create_Linux_DB.py
@@ -37,11 +37,12 @@

from typing_extensions import Self

from dapper_python.databases.database import Database
from dapper_python.normalize import NormalizedFileName, normalize_file_name


@dataclass
class PackageInfo:
class PackageDetails:
full_package_name: str
file_path: PurePosixPath

@@ -59,8 +60,8 @@ def __post_init__(self):

@classmethod
def from_linux_package_file(cls, line:str) -> Self:
"""Creates a PackageInfo object out of a single line from the linux contents file
Uses simple parsing to split the line into package_name and file_path and then construct the PackageInfo object
"""Creates a PackageDetails object out of a single line from the linux contents file
Uses simple parsing to split the line into package_name and file_path and then construct the PackageDetails object

:param line: A line of text from the linux contents file
:return: The package info for that line
@@ -72,7 +73,91 @@ def from_linux_package_file(cls, line:str) -> Self:
)
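The parsing itself sits in the collapsed portion of this hunk. Assuming the usual Debian/Ubuntu Contents layout (a file path, whitespace, then a section/package qualifier), a minimal sketch of the classmethod in use, with a made-up input line:

line = "usr/bin/vim.basic        editors/vim"   #hypothetical Contents line
details = PackageDetails.from_linux_package_file(line)
#full_package_name and file_path come from the parsed line; the collapsed
#__post_init__ above is assumed to derive the remaining fields
print(details.full_package_name, details.file_path)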


def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
class LinuxDatabase(Database):

def __init__(self, db_path:Path) -> None:
super().__init__(db_path, mode='rwc')
self._init_database()

def _init_database(self) -> None:
with self.cursor() as cursor:
#Would there be any benefit to having a separate package table,
#which the files table references as a foreign key, vs directly saving the package into the files table?
create_table_cmd = """
CREATE TABLE
IF NOT EXISTS package_files(
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
full_package_name TEXT
)
"""
cursor.execute(create_table_cmd)

#Index the filename column for fast lookups
#The package name is not indexed: the use case does not require fast lookups on it, and skipping the index keeps the file size down
index_cmd = """
CREATE INDEX IF NOT EXISTS idx_file_name
ON package_files(file_name);
"""
cursor.execute(index_cmd)
index_cmd = """
CREATE INDEX IF NOT EXISTS idx_normalized_file_name
ON package_files(normalized_file_name);
"""
cursor.execute(index_cmd)

#Metadata information about dataset
create_table_cmd = """
CREATE TABLE
IF NOT EXISTS dataset_version(
version INTEGER PRIMARY KEY,
format TEXT,
timestamp INTEGER
)
"""
cursor.execute(create_table_cmd)
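The two indexes above exist solely to serve filename lookups against package_files. A sketch of the kind of query they are meant to speed up, using plain sqlite3 with a hypothetical database path and lookup key (not part of this PR):

import sqlite3

con = sqlite3.connect("linux.db")   #hypothetical path
rows = con.execute(
    "SELECT package_name, file_path FROM package_files WHERE normalized_file_name = ?",
    ("libssl.so.3",),               #hypothetical, already-normalized and lowercased key
).fetchall()
con.close()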

def set_version(self, version:int) -> None:
with self.cursor() as cursor:
metadata_remove_cmd = """
DELETE FROM dataset_version
"""
cursor.execute(metadata_remove_cmd)

metadata_add_cmd = """
INSERT INTO dataset_version(version, format, timestamp)
VALUES (?, "Linux", ?)
"""
cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp())))
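Consumers of the finished database can read the single-row dataset_version table back with plain sqlite3; a minimal sketch with a hypothetical database path (not part of this PR):

import sqlite3

con = sqlite3.connect("linux.db")   #hypothetical path
version, fmt, timestamp = con.execute(
    "SELECT version, format, timestamp FROM dataset_version"
).fetchone()
con.close()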

def add_package(self, package_details:PackageDetails) -> None:
#Plain str.lower() is enough here (no full case folding needed): as far as the OS is concerned, ß.json is not the same file as ss.json
normalized_file = normalize_file_name(package_details.file_name)
match normalized_file:
case str(name):
normalized_file_name = name.lower()
case NormalizedFileName():
normalized_file_name = normalized_file.name.lower()
case _:
raise TypeError(f"Failed to normalize file: {package_details.file_name}")

cursor = self.cursor()
insert_cmd = """
INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name)
VALUES (?, ?, ?, ?, ?)
"""

cursor.execute(
insert_cmd,
(package_details.file_name, normalized_file_name, str(package_details.file_path),
package_details.package_name, package_details.full_package_name,)
)
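A hedged usage sketch for add_package: the two constructor fields are the ones declared on the dataclass above, the collapsed __post_init__ is assumed to derive file_name and package_name from them, and all values and paths are made up:

from pathlib import Path, PurePosixPath

details = PackageDetails(
    full_package_name="editors/vim",               #hypothetical value
    file_path=PurePosixPath("usr/bin/vim.basic"),  #hypothetical value
)
with LinuxDatabase(Path("linux.db")) as db:        #hypothetical output path
    db.add_package(details)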


def read_package_data(uri: str | Path, *, encoding='utf-8') -> TextIOWrapper:
"""Reads a file either from disk or by downloading it from the provided URL
Will attempt to read the provided file as a text file

@@ -83,7 +168,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
if isinstance(uri, Path):
if not uri.exists():
raise FileNotFoundError(f"File {uri} does not exist")

return TextIOWrapper(FileIO(uri, mode='rb'), encoding=encoding)

elif isinstance(uri, str):
@@ -99,11 +183,11 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:

content = BytesIO()
progress_bar = tqdm(
total=file_size,
desc='Downloading package file', colour='blue',
unit='B', unit_divisor=1024, unit_scale=True,
position=None, leave=None,
)
total=file_size,
desc='Downloading package file', colour='blue',
unit='B', unit_divisor=1024, unit_scale=True,
position=None, leave=None,
)
with progress_bar:
for chunk in web_request.iter_content(chunk_size=8*1024):
content.write(chunk)
@@ -125,7 +209,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
else:
raise TypeError(f"Invalid input: {uri}")
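A short usage sketch for read_package_data covering both accepted input types; the path and URL are placeholders rather than values taken from this PR:

from pathlib import Path

#Local Contents file on disk
local = read_package_data(Path("Contents-amd64"))   #hypothetical path

#Remote Contents file, streamed with the tqdm progress bar shown above
remote = read_package_data("https://example.org/Contents-amd64")   #placeholder URL
first_line = remote.readline()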


def main():
parser = argparse.ArgumentParser(
description="Create Linux DB by parsing the Linux Contents file"
@@ -156,85 +239,22 @@ def main():
if args.output.exists():
raise FileExistsError(f"File {args.output} already exists")

file = read_data(args.input)
file = read_package_data(args.input)
line_count = sum(1 for _ in file)
file.seek(0)

with sqlite3.connect(args.output) as db:
cursor = db.cursor()

#Would there be any benefit to having a separate package table
#Which the files table references as a foreign key vs directly saving the package into the files table?
create_table_cmd = """
CREATE TABLE package_files(
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
full_package_name TEXT
)
"""
cursor.execute(create_table_cmd)

insert_cmd = """
INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name)
VALUES (?, ?, ?, ?, ?)
"""
with LinuxDatabase(args.output) as db:
progress_iter = tqdm(
file,
total=line_count,
desc='Processing Data', colour='green',
unit='Entry',
)
for line in progress_iter:
package = PackageInfo.from_linux_package_file(line)

#Lower seems like it should work? As far as the OS is concerned ß.json is not the same file as ss.json
normalized_file = normalize_file_name(package.file_name)
match normalized_file:
case str(name):
normalized_name = name.lower()
case NormalizedFileName():
normalized_name = normalized_file.name.lower()
case _:
raise TypeError(f"Failed to normalize file: {package.file_name}")

cursor.execute(
insert_cmd,
(
package.file_name, normalized_name, str(package.file_path),
package.package_name, package.full_package_name,
)
)
package = PackageDetails.from_linux_package_file(line)
db.add_package(package)

#Index the filename colum for fast lookups
#Currently does not index package name as use case does not require fast lookups on package name and reduces filesize
index_cmd = """
CREATE INDEX idx_file_name
ON package_files(file_name);
"""
cursor.execute(index_cmd)
index_cmd = """
CREATE INDEX idx_normalized_file_name
ON package_files(normalized_file_name);
"""
cursor.execute(index_cmd)

#Metadata information about table
create_table_cmd = """
CREATE TABLE dataset_version(
version INTEGER PRIMARY KEY,
format TEXT,
timestamp INTEGER
)
"""
cursor.execute(create_table_cmd)
metadata_add_cmd = """
INSERT INTO dataset_version(version, format, timestamp)
VALUES (?, "Linux", ?)
"""
cursor.execute(metadata_add_cmd, (args.version, int(datetime.now().timestamp())))
db.set_version(args.version)

if __name__ == "__main__":
main()
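Condensed, main() boils down to the flow below; this sketch drops the collapsed argparse setup and the line-count/progress-bar bookkeeping and uses hypothetical paths and a hypothetical version number:

from pathlib import Path

contents = read_package_data(Path("Contents-amd64"))   #hypothetical input
with LinuxDatabase(Path("linux.db")) as db:            #hypothetical output
    for line in contents:
        db.add_package(PackageDetails.from_linux_package_file(line))
    db.set_version(1)                                   #hypothetical version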