178 changes: 99 additions & 79 deletions dataset-generation/Create_Linux_DB.py
@@ -37,11 +37,12 @@

from typing_extensions import Self

from dapper_python.databases.database import Database
from dapper_python.normalize import NormalizedFileName, normalize_file_name


@dataclass
class PackageInfo:
class PackageDetails:
full_package_name: str
file_path: PurePosixPath

@@ -59,8 +60,8 @@ def __post_init__(self):

@classmethod
def from_linux_package_file(cls, line:str) -> Self:
"""Creates a PackageInfo object out of a single line from the linux contents file
Uses simple parsing to split the line into package_name and file_path and then construct the PackageInfo object
"""Creates a PackageDetails object out of a single line from the linux contents file
Uses simple parsing to split the line into package_name and file_path and then construct the PackageDetails object

:param line: A line of text from the linux contents file
:return: The package info for that line
@@ -72,7 +73,91 @@ def from_linux_package_file(cls, line:str) -> Self:
)
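The parsing itself sits in the collapsed portion of this hunk. Assuming the usual Debian/Ubuntu Contents layout (a file path, whitespace, then a section/package qualifier), a minimal sketch of the classmethod in use, with a made-up input line:

line = "usr/bin/vim.basic        editors/vim"   #hypothetical Contents line
details = PackageDetails.from_linux_package_file(line)
#full_package_name and file_path come from the parsed line; the collapsed
#__post_init__ above is assumed to derive the remaining fields
print(details.full_package_name, details.file_path)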


def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
class LinuxDatabase(Database):

def __init__(self, db_path:Path) -> None:
super().__init__(db_path, mode='rwc')
self._init_database()

def _init_database(self) -> None:
with self.cursor() as cursor:
#Would there be any benefit to having a separate package table,
#which the files table references as a foreign key, vs directly saving the package into the files table?
create_table_cmd = """
CREATE TABLE
IF NOT EXISTS package_files(
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
full_package_name TEXT
)
"""
cursor.execute(create_table_cmd)

#Index the filename column for fast lookups
#The package name is not indexed: the use case does not require fast lookups on it, and skipping the index keeps the file size down
index_cmd = """
CREATE INDEX IF NOT EXISTS idx_file_name
ON package_files(file_name);
"""
cursor.execute(index_cmd)
index_cmd = """
CREATE INDEX IF NOT EXISTS idx_normalized_file_name
ON package_files(normalized_file_name);
"""
cursor.execute(index_cmd)

#Metadata information about dataset
create_table_cmd = """
CREATE TABLE
IF NOT EXISTS dataset_version(
version INTEGER PRIMARY KEY,
format TEXT,
timestamp INTEGER
)
"""
cursor.execute(create_table_cmd)
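The two indexes above exist solely to serve filename lookups against package_files. A sketch of the kind of query they are meant to speed up, using plain sqlite3 with a hypothetical database path and lookup key (not part of this PR):

import sqlite3

con = sqlite3.connect("linux.db")   #hypothetical path
rows = con.execute(
    "SELECT package_name, file_path FROM package_files WHERE normalized_file_name = ?",
    ("libssl.so.3",),               #hypothetical, already-normalized and lowercased key
).fetchall()
con.close()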

def set_version(self, version:int) -> None:
with self.cursor() as cursor:
metadata_remove_cmd = """
DELETE FROM dataset_version
"""
cursor.execute(metadata_remove_cmd)

metadata_add_cmd = """
INSERT INTO dataset_version(version, format, timestamp)
VALUES (?, "Linux", ?)
"""
cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp())))
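Consumers of the finished database can read the single-row dataset_version table back with plain sqlite3; a minimal sketch with a hypothetical database path (not part of this PR):

import sqlite3

con = sqlite3.connect("linux.db")   #hypothetical path
version, fmt, timestamp = con.execute(
    "SELECT version, format, timestamp FROM dataset_version"
).fetchone()
con.close()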

def add_package(self, package_details:PackageDetails) -> None:
#Plain str.lower() is enough here (no full case folding needed): as far as the OS is concerned, ß.json is not the same file as ss.json
normalized_file = normalize_file_name(package_details.file_name)
match normalized_file:
case str(name):
normalized_file_name = name.lower()
case NormalizedFileName():
normalized_file_name = normalized_file.name.lower()
case _:
raise TypeError(f"Failed to normalize file: {package_details.file_name}")

cursor = self.cursor()
insert_cmd = """
INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name)
VALUES (?, ?, ?, ?, ?)
"""

cursor.execute(
insert_cmd,
(package_details.file_name, normalized_file_name, str(package_details.file_path),
package_details.package_name, package_details.full_package_name,)
)
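A hedged usage sketch for add_package: the two constructor fields are the ones declared on the dataclass above, the collapsed __post_init__ is assumed to derive file_name and package_name from them, and all values and paths are made up:

from pathlib import Path, PurePosixPath

details = PackageDetails(
    full_package_name="editors/vim",               #hypothetical value
    file_path=PurePosixPath("usr/bin/vim.basic"),  #hypothetical value
)
with LinuxDatabase(Path("linux.db")) as db:        #hypothetical output path
    db.add_package(details)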


def read_package_data(uri: str | Path, *, encoding='utf-8') -> TextIOWrapper:
"""Reads a file either from disk or by downloading it from the provided URL
Will attempt to read the provided file as a text file

@@ -83,7 +168,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
if isinstance(uri, Path):
if not uri.exists():
raise FileNotFoundError(f"File {uri} does not exist")

return TextIOWrapper(FileIO(uri, mode='rb'), encoding=encoding)

elif isinstance(uri, str):
@@ -99,11 +183,11 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:

content = BytesIO()
progress_bar = tqdm(
total=file_size,
desc='Downloading package file', colour='blue',
unit='B', unit_divisor=1024, unit_scale=True,
position=None, leave=None,
)
total=file_size,
desc='Downloading package file', colour='blue',
unit='B', unit_divisor=1024, unit_scale=True,
position=None, leave=None,
)
with progress_bar:
for chunk in web_request.iter_content(chunk_size=8*1024):
content.write(chunk)
@@ -125,7 +209,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper:
else:
raise TypeError(f"Invalid input: {uri}")
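A short usage sketch for read_package_data covering both accepted input types; the path and URL are placeholders rather than values taken from this PR:

from pathlib import Path

#Local Contents file on disk
local = read_package_data(Path("Contents-amd64"))   #hypothetical path

#Remote Contents file, streamed with the tqdm progress bar shown above
remote = read_package_data("https://example.org/Contents-amd64")   #placeholder URL
first_line = remote.readline()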


def main():
parser = argparse.ArgumentParser(
description="Create Linux DB by parsing the Linux Contents file"
@@ -156,85 +239,22 @@ def main():
if args.output.exists():
raise FileExistsError(f"File {args.output} already exists")

file = read_data(args.input)
file = read_package_data(args.input)
line_count = sum(1 for _ in file)
file.seek(0)

with sqlite3.connect(args.output) as db:
cursor = db.cursor()

#Would there be any benefit to having a separate package table
#Which the files table references as a foreign key vs directly saving the package into the files table?
create_table_cmd = """
CREATE TABLE package_files(
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
full_package_name TEXT
)
"""
cursor.execute(create_table_cmd)

insert_cmd = """
INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name)
VALUES (?, ?, ?, ?, ?)
"""
with LinuxDatabase(args.output) as db:
progress_iter = tqdm(
file,
total=line_count,
desc='Processing Data', colour='green',
unit='Entry',
)
for line in progress_iter:
package = PackageInfo.from_linux_package_file(line)

#Lower seems like it should work? As far as the OS is concerned ß.json is not the same file as ss.json
normalized_file = normalize_file_name(package.file_name)
match normalized_file:
case str(name):
normalized_name = name.lower()
case NormalizedFileName():
normalized_name = normalized_file.name.lower()
case _:
raise TypeError(f"Failed to normalize file: {package.file_name}")

cursor.execute(
insert_cmd,
(
package.file_name, normalized_name, str(package.file_path),
package.package_name, package.full_package_name,
)
)
package = PackageDetails.from_linux_package_file(line)
db.add_package(package)

#Index the filename colum for fast lookups
#Currently does not index package name as use case does not require fast lookups on package name and reduces filesize
index_cmd = """
CREATE INDEX idx_file_name
ON package_files(file_name);
"""
cursor.execute(index_cmd)
index_cmd = """
CREATE INDEX idx_normalized_file_name
ON package_files(normalized_file_name);
"""
cursor.execute(index_cmd)

#Metadata information about table
create_table_cmd = """
CREATE TABLE dataset_version(
version INTEGER PRIMARY KEY,
format TEXT,
timestamp INTEGER
)
"""
cursor.execute(create_table_cmd)
metadata_add_cmd = """
INSERT INTO dataset_version(version, format, timestamp)
VALUES (?, "Linux", ?)
"""
cursor.execute(metadata_add_cmd, (args.version, int(datetime.now().timestamp())))
db.set_version(args.version)

if __name__ == "__main__":
main()
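Condensed, main() boils down to the flow below; this sketch drops the collapsed argparse setup and the line-count/progress-bar bookkeeping and uses hypothetical paths and a hypothetical version number:

from pathlib import Path

contents = read_package_data(Path("Contents-amd64"))   #hypothetical input
with LinuxDatabase(Path("linux.db")) as db:            #hypothetical output
    for line in contents:
        db.add_package(PackageDetails.from_linux_package_file(line))
    db.set_version(1)                                   #hypothetical version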