Skip to content

Types coerced to string #34

@GPHemsley-RELX

Description

@GPHemsley-RELX

utils.parse_type() coerces certain types to strings, so callers lose the ability to manipulate or reformat the values as native Python types.

def parse_type(data_type, buffer, length=None, version=3, props=None):
    """Decode one raw column value from ``buffer`` according to ``data_type``.

    Args:
        data_type: One of the module's ``TYPE_*`` constants; selects the decoder.
        buffer: Bytes holding the raw value, starting at offset 0.
        length: Number of leading bytes to keep for ``TYPE_BINARY`` values.
            Not used by any other type.
        version: Format version. Text is decoded differently when this is
            greater than 3.
        props: Optional mapping of column properties. Only its ``"Format"``
            entry is consulted, and only for ``TYPE_MONEY``.

    Returns:
        The decoded value. Integer and float types yield numbers; text and
        GUID values yield ``str``; datetime yields whatever
        ``mdb_date_to_readable`` returns; binary, OLE and 17-byte values
        yield raw bytes. Money yields the raw 64-bit integer unless a
        ``"Format"`` property is present, in which case it yields the matching
        ``FORMAT_TO_DEFAULT_VALUE`` entry (for zero) or the result of
        ``parse_money_type``. An unsupported ``data_type`` yields ``""``.
    """
    parsed = ""
    # NOTE: the struct codes below deliberately stay in native byte order;
    # mdb_date_to_readable() re-packs the datetime integer in native order, so
    # the two must agree for the round trip to reproduce the original bytes.
    # Bool or int8
    if data_type == TYPE_INT8:
        parsed = struct.unpack_from("b", buffer)[0]
    elif data_type == TYPE_INT16:
        parsed = struct.unpack_from("h", buffer)[0]
    elif data_type in (TYPE_INT32, TYPE_COMPLEX):
        parsed = struct.unpack_from("i", buffer)[0]
    elif data_type == TYPE_MONEY:
        parsed = struct.unpack_from("q", buffer)[0]
        if props and "Format" in props:
            prop_format = props['Format']
            if parsed == 0:
                # A zero value is replaced by the default registered for the
                # first format prefix that matches.
                defaults = [
                    default
                    for prefix, default in FORMAT_TO_DEFAULT_VALUE.items()
                    if prop_format.startswith(prefix)
                ]
                if defaults:
                    parsed = defaults[0]
                else:
                    # Unknown format: keep the raw zero instead of leaking the
                    # empty list of candidates to the caller.
                    LOGGER.warning(f"parse_type got unknown format while parsing money field {prop_format}")
            else:
                parsed = parse_money_type(parsed, prop_format)
    elif data_type == TYPE_FLOAT32:
        parsed = struct.unpack_from("f", buffer)[0]
    elif data_type == TYPE_FLOAT64:
        parsed = struct.unpack_from("d", buffer)[0]
    elif data_type == TYPE_DATETIME:
        # The eight bytes are read as an integer here; the helper
        # reinterprets those bits as a double.
        double_datetime = struct.unpack_from("q", buffer)[0]
        parsed = mdb_date_to_readable(double_datetime)
    elif data_type == TYPE_BINARY:
        parsed = buffer[:length]
    elif data_type == TYPE_OLE:
        parsed = buffer
    elif data_type == TYPE_GUID:
        parsed = str(uuid.UUID(buffer[:16].hex()))
    elif data_type == TYPE_96_bit_17_BYTES:
        parsed = buffer[:17]
    elif data_type == TYPE_TEXT:
        if version > 3:
            # Looks like if BOM is present text is already decoded
            if buffer.startswith((b"\xfe\xff", b"\xff\xfe")):
                parsed = get_decoded_text(buffer[2:])
            else:
                parsed = buffer.decode("utf-16", errors='ignore')
        else:
            parsed = get_decoded_text(buffer)

        # Strip embedded NULs from the decoded text, logging when any occur.
        if "\x00" in parsed:
            LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}")
            parsed = parsed.replace("\x00", "")
    else:
        LOGGER.debug(f"parse_type - unsupported data type: {data_type}")
    return parsed

This especially affects datetime, which is further processed by utils.mdb_date_to_readable():

# https://stackoverflow.com/questions/45560782
def mdb_date_to_readable(double_time):
    """Render a raw date/time value as a human-readable string.

    Args:
        double_time: Integer carrying the 64 bits of a double. The double is
            a day count relative to ``ACCESS_EPOCH``: the whole part is days,
            the fractional part is the fraction of a day. Both signed and
            unsigned integer views of the same bits are accepted.

    Returns:
        ``str(datetime)`` for a valid value, ``"(Empty Date)"`` when the value
        lands exactly on ``ACCESS_EPOCH``, or ``"(Invalid Date)"`` when the
        bits cannot be packed or do not describe a representable date
        (out of range, infinity, NaN).
    """
    try:
        try:
            dtime_bytes = struct.pack("Q", double_time)
        except struct.error:
            # "Q" rejects negative integers. parse_type() reads the field with
            # the signed "q" code, so any double with its sign bit set (every
            # date before the epoch) arrives here negative; retry as signed.
            # Anything that is still unpackable raises struct.error again.
            dtime_bytes = struct.pack("q", double_time)
        dtime_double = struct.unpack('<d', dtime_bytes)[0]
        dtime_frac, dtime_whole = math.modf(dtime_double)
        # OLE Automation date convention: for negative values the whole part
        # counts days back from the epoch, but the fractional part is still a
        # forward time-of-day offset (-1.25 is 06:00 on the day before).
        dtime = (ACCESS_EPOCH + timedelta(days=dtime_whole) + timedelta(days=abs(dtime_frac)))
    except (OverflowError, ValueError, struct.error):
        # OverflowError: out-of-range or infinite day count.
        # ValueError: NaN day count. struct.error: value cannot be packed.
        return "(Invalid Date)"
    if dtime == ACCESS_EPOCH:
        return "(Empty Date)"
    return str(dtime)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions