From 53a5018c5d102b2cc043e9131c4a7ee8e065f286 Mon Sep 17 00:00:00 2001 From: Zoynels <5841616+Zoynels@users.noreply.github.com> Date: Fri, 22 Nov 2019 20:53:16 +0600 Subject: [PATCH 1/2] Add support to read dbf file from zip-archive Add support to read dbf file from zip-archive. Prefer a one-file archive; only the last file in the archive is read. --- dbfread/dbf.py | 106 +++++++++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/dbfread/dbf.py b/dbfread/dbf.py index 5793530..0fbfb1a 100644 --- a/dbfread/dbf.py +++ b/dbfread/dbf.py @@ -3,6 +3,7 @@ """ import os import sys +import io import datetime import collections @@ -93,48 +94,65 @@ def __init__(self, filename, encoding=None, ignorecase=True, self.ignore_missing_memofile = ignore_missing_memofile self.char_decode_errors = char_decode_errors - if recfactory is None: - self.recfactory = lambda items: items - else: - self.recfactory = recfactory - - # Name part before .dbf is the table name - self.name = os.path.basename(filename) - self.name = os.path.splitext(self.name)[0].lower() - self._records = None - self._deleted = None - - if ignorecase: - self.filename = ifind(filename) - if not self.filename: - raise DBFNotFound('could not find file {!r}'.format(filename)) - else: - self.filename = filename - - # Filled in by self._read_headers() - self.memofilename = None - self.header = None - self.fields = [] # namedtuples - self.field_names = [] # strings - - with open(self.filename, mode='rb') as infile: - self._read_header(infile) - self._read_field_headers(infile) - self._check_headers() - - try: - self.date = datetime.date(expand_year(self.header.year), - self.header.month, - self.header.day) - except ValueError: - # Invalid date or '\x00\x00\x00'. 
- self.date = None - - self.memofilename = self._get_memofilename() - - if load: - self.load() - + + try: + zfile = None + if filename.endswith(".zip"): + from zipfile import ZipFile + zfile = ZipFile(filename) + self.io = zfile + self.fname = zfile.namelist()[-1] + self.mode = "r" + else: + self.io = io + self.fname = filename + self.mode = "rb" + + if recfactory is None: + self.recfactory = lambda items: items + else: + self.recfactory = recfactory + + # Name part before .dbf is the table name + self.name = os.path.basename(filename) + self.name = os.path.splitext(self.name)[0].lower() + self._records = None + self._deleted = None + + if ignorecase: + self.filename = ifind(filename) + if not self.filename: + raise DBFNotFound('could not find file {!r}'.format(filename)) + else: + self.filename = filename + + # Filled in by self._read_headers() + self.memofilename = None + self.header = None + self.fields = [] # namedtuples + self.field_names = [] # strings + + with self.io.open(self.fname, mode = self.mode) as infile: + self._read_header(infile) + self._read_field_headers(infile) + self._check_headers() + + try: + self.date = datetime.date(expand_year(self.header.year), + self.header.month, + self.header.day) + except ValueError: + # Invalid date or '\x00\x00\x00'. + self.date = None + + self.memofilename = self._get_memofilename() + + if load: + self.load() + finally: + if zfile is not None: + zfile.close() + @property def dbversion(self): return get_dbversion_string(self.header.dbversion) @@ -271,7 +289,7 @@ def _skip_record(self, infile): def _count_records(self, record_type=b' '): count = 0 - with open(self.filename, 'rb') as infile: + with self.io.open(self.fname, mode = self.mode) as infile: # Skip to first record. 
infile.seek(self.header.headerlen, 0) @@ -289,7 +307,7 @@ def _count_records(self, record_type=b' '): return count def _iter_records(self, record_type=b' '): - with open(self.filename, 'rb') as infile, \ + with self.io.open(self.fname, mode = self.mode) as infile, \ self._open_memofile() as memofile: # Skip to first record. From 71f2a2a23fe7b1927b2a76f655a50f810dbc7038 Mon Sep 17 00:00:00 2001 From: Zoynels <5841616+Zoynels@users.noreply.github.com> Date: Fri, 22 Nov 2019 20:55:50 +0600 Subject: [PATCH 2/2] Add built-in export function to pandas dataframe Add built-in export function to pandas dataframe --- dbfread/dbf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dbfread/dbf.py b/dbfread/dbf.py index 0fbfb1a..8c7f409 100644 --- a/dbfread/dbf.py +++ b/dbfread/dbf.py @@ -341,6 +341,11 @@ def _iter_records(self, record_type=b' '): else: skip_record(infile) + def DataFrame(self): + import pandas as pd + df = pd.DataFrame() + return df.from_records(self.records) + def __iter__(self): if self.loaded: return list.__iter__(self._records)