diff --git a/src/common_file_operations.rs b/src/common_file_operations.rs index 80d7d3d..12cfd64 100644 --- a/src/common_file_operations.rs +++ b/src/common_file_operations.rs @@ -4,7 +4,7 @@ use binrw::{BinReaderExt, BinResult, binread}; use half::f16; use std::ffi::CString; -use std::io::SeekFrom; +use std::io::{Read, SeekFrom}; pub(crate) fn read_bool_from + std::cmp::PartialEq>(x: T) -> bool { x == T::from(1u8) @@ -14,6 +14,28 @@ pub(crate) fn write_bool_as>(x: &bool) -> T { if *x { T::from(1u8) } else { T::from(0u8) } } +/// Read a null-terminated UTF-8 string from a reader at its current position. +pub(crate) fn read_null_terminated_utf8(reader: &mut R) -> String { + let mut bytes = Vec::new(); + let mut buf = [0u8; 1]; + while reader.read_exact(&mut buf).is_ok() && buf[0] != 0 { + bytes.push(buf[0]); + } + String::from_utf8(bytes).unwrap_or_default() +} + +/// Read a null-terminated UTF-8 string from a byte slice starting at `offset`. +/// Returns the decoded string and the offset immediately after the null terminator. +pub(crate) fn null_terminated_utf8(data: &[u8], offset: usize) -> (String, usize) { + let end = data[offset..] + .iter() + .position(|&b| b == 0) + .map(|p| p + offset) + .unwrap_or(data.len()); + let s = String::from_utf8(data[offset..end].to_vec()).unwrap_or_default(); + (s, end + 1) +} + pub(crate) fn read_string(byte_stream: Vec) -> String { let str = String::from_utf8(byte_stream).unwrap_or_default(); str.trim_matches(char::from(0)).to_string() // trim \0 from the end of strings @@ -37,18 +59,8 @@ pub(crate) fn strings_parser( let mut strings: Vec = vec![]; for offset in strings_offset { - let string_offset = base_offset + *offset as u64; - - let mut string = String::new(); - - reader.seek(SeekFrom::Start(string_offset))?; - let mut next_char = reader.read_le::().unwrap() as char; - while next_char != '\0' { - string.push(next_char); - next_char = reader.read_le::().unwrap() as char; - } - - strings.push(string); + reader.seek(SeekFrom::Start(base_offset + *offset as u64))?; + strings.push(read_null_terminated_utf8(reader)); } Ok(strings) @@ -56,14 +68,7 @@ pub(crate) fn strings_parser( #[binrw::parser(reader)] pub(crate) fn read_string_until_null() -> BinResult { - let mut string = String::new(); - - let mut next_char = reader.read_le::().unwrap() as char; - while next_char != '\0' { - string.push(next_char); - next_char = reader.read_le::().unwrap() as char; - } - Ok(string) + Ok(read_null_terminated_utf8(reader)) } fn read_half1(data: [u16; 1]) -> Half1 { @@ -184,4 +189,73 @@ mod tests { 4 ); } + + #[test] + fn read_null_terminated_utf8_ascii() { + let data = b"hello\0rest"; + let mut cursor = std::io::Cursor::new(&data[..]); + assert_eq!(read_null_terminated_utf8(&mut cursor), "hello"); + // cursor should be positioned right after the null byte + assert_eq!(cursor.position(), 6); + } + + #[test] + fn read_null_terminated_utf8_chinese() { + // "你好" in UTF-8: [0xE4,0xBD,0xA0, 0xE5,0xA5,0xBD] + null + let data = b"\xe4\xbd\xa0\xe5\xa5\xbd\0"; + let mut cursor = std::io::Cursor::new(&data[..]); + assert_eq!(read_null_terminated_utf8(&mut cursor), "你好"); + } + + #[test] + fn read_null_terminated_utf8_empty() { + let data = b"\0trailing"; + let mut cursor = std::io::Cursor::new(&data[..]); + assert_eq!(read_null_terminated_utf8(&mut cursor), ""); + } + + #[test] + fn read_null_terminated_utf8_invalid_fallback() { + // Invalid UTF-8 sequence: 0xFF is never valid in UTF-8 + let data: &[u8] = &[0xFF, 0xFE, 0x00]; + let mut cursor = std::io::Cursor::new(data); + assert_eq!(read_null_terminated_utf8(&mut cursor), ""); + } + + #[test] + fn null_terminated_utf8_ascii() { + let data = b"foo\0bar\0"; + let (s, next) = null_terminated_utf8(data, 0); + assert_eq!(s, "foo"); + assert_eq!(next, 4); + let (s2, next2) = null_terminated_utf8(data, next); + assert_eq!(s2, "bar"); + assert_eq!(next2, 8); + } + + #[test] + fn null_terminated_utf8_chinese() { + // "装备" in UTF-8: [0xE8,0xA3,0x85, 0xE5,0xA4,0x87] + null + let data = b"\xe8\xa3\x85\xe5\xa4\x87\0"; + let (s, _) = null_terminated_utf8(data, 0); + assert_eq!(s, "装备"); + } + + #[test] + fn null_terminated_utf8_at_offset() { + let data = b"\0hello\0world\0"; + let (s, next) = null_terminated_utf8(data, 1); + assert_eq!(s, "hello"); + assert_eq!(next, 7); + let (s2, _) = null_terminated_utf8(data, next); + assert_eq!(s2, "world"); + } + + #[test] + fn null_terminated_utf8_empty_at_offset() { + let data = b"a\0\0b\0"; + let (s, next) = null_terminated_utf8(data, 2); + assert_eq!(s, ""); + assert_eq!(next, 3); + } } diff --git a/src/dic.rs b/src/dic.rs index a18e28c..a4e69f8 100644 --- a/src/dic.rs +++ b/src/dic.rs @@ -136,7 +136,9 @@ impl Dictionary { } let chara = Dictionary::index_to_rune(&lut, id as u32); - self.dump_dict_node(&mut result, *v as i32, String::from(chara as u8 as char)) + if let Some(c) = char::from_u32(chara as u32) { + self.dump_dict_node(&mut result, *v as i32, String::from(c)) + } } Some(result) diff --git a/src/exd_file_operations.rs b/src/exd_file_operations.rs index f7fd6a8..2192637 100644 --- a/src/exd_file_operations.rs +++ b/src/exd_file_operations.rs @@ -8,6 +8,8 @@ use std::{ use binrw::{BinRead, BinWrite, Endian}; +use crate::common_file_operations::read_null_terminated_utf8; + use crate::{ excel::{Field, Row}, exd::EXD, @@ -125,15 +127,7 @@ impl EXD { )) .ok()?; - let mut string = String::new(); - - let mut byte: u8 = Self::read_data_raw(cursor).unwrap(); - while byte != 0 { - string.push(byte as char); - byte = Self::read_data_raw(cursor).unwrap(); - } - - Some(Field::String(string)) + Some(Field::String(read_null_terminated_utf8(cursor))) } ColumnDataType::Bool => { let bool_data: i8 = Self::read_data_raw(cursor).unwrap(); diff --git a/src/model/mod.rs b/src/model/mod.rs index a5beab3..3acb27a 100755 --- a/src/model/mod.rs +++ b/src/model/mod.rs @@ -16,7 +16,7 @@ use binrw::{BinWrite, BinWriterExt, binrw}; use bitflags::bitflags; use crate::common::Platform; -use crate::common_file_operations::{read_bool_from, write_bool_as}; +use crate::common_file_operations::{null_terminated_utf8, read_bool_from, write_bool_as}; use crate::{ByteBuffer, ByteSpan, ReadableFile, WritableFile}; use vertex_declarations::{ VERTEX_ELEMENT_SIZE, VertexDeclaration, VertexType, VertexUsage, vertex_element_parser, @@ -770,33 +770,15 @@ impl ReadableFile for MDL { let mut affected_bone_names = vec![]; for offset in &model.bone_name_offsets { - let mut offset = *offset; - let mut string = String::new(); - - let mut next_char = model.header.strings[offset as usize] as char; - while next_char != '\0' { - string.push(next_char); - offset += 1; - next_char = model.header.strings[offset as usize] as char; - } - - affected_bone_names.push(string); + let (name, _) = null_terminated_utf8(&model.header.strings, *offset as usize); + affected_bone_names.push(name); } let mut material_names = vec![]; for offset in &model.material_name_offsets { - let mut offset = *offset; - let mut string = String::new(); - - let mut next_char = model.header.strings[offset as usize] as char; - while next_char != '\0' { - string.push(next_char); - offset += 1; - next_char = model.header.strings[offset as usize] as char; - } - - material_names.push(string); + let (name, _) = null_terminated_utf8(&model.header.strings, *offset as usize); + material_names.push(name); } let mut lods = vec![]; @@ -1075,18 +1057,10 @@ impl ReadableFile for MDL { vertex.position[2] = new_vertex.position[2] - old_vertex.position[2]; } - let mut offset = shape.string_offset; - let mut string = String::new(); - - let mut next_char = model.header.strings[offset as usize] as char; - while next_char != '\0' { - string.push(next_char); - offset += 1; - next_char = model.header.strings[offset as usize] as char; - } + let (name, _) = null_terminated_utf8(&model.header.strings, shape.string_offset as usize); shapes.push(Shape { - name: string, + name, morphed_vertices, }); } diff --git a/src/mtrl.rs b/src/mtrl.rs index da61b9d..2c00861 100644 --- a/src/mtrl.rs +++ b/src/mtrl.rs @@ -6,7 +6,7 @@ use std::io::Cursor; use crate::common::Platform; -use crate::common_file_operations::{Half1, Half2, Half3}; +use crate::common_file_operations::{Half1, Half2, Half3, null_terminated_utf8}; use crate::mtrl::ColorDyeTable::{ DawntrailColorDyeTable, LegacyColorDyeTable, OpaqueColorDyeTable, }; @@ -471,31 +471,15 @@ impl ReadableFile for Material { let mut offset = 0; for _ in 0..mat_data.file_header.texture_count { - let mut string = String::new(); - - let mut next_char = mat_data.strings[offset] as char; - while next_char != '\0' { - string.push(next_char); - offset += 1; - next_char = mat_data.strings[offset] as char; - } - - texture_paths.push(string); - - offset += 1; + let (s, next) = null_terminated_utf8(&mat_data.strings, offset); + texture_paths.push(s); + offset = next; } - // TODO: move to reusable function - let mut shader_package_name = String::new(); - - offset = mat_data.file_header.shader_package_name_offset as usize; - - let mut next_char = mat_data.strings[offset] as char; - while next_char != '\0' { - shader_package_name.push(next_char); - offset += 1; - next_char = mat_data.strings[offset] as char; - } + let (shader_package_name, _) = null_terminated_utf8( + &mat_data.strings, + mat_data.file_header.shader_package_name_offset as usize, + ); // bg/ffxiv/wil_w1/evt/w1eb/material/w1eb_f1_vfog1a.mtrl has a shader value list of 9, which doesn't make sense in this system // eventually we need to un-hardcode it from vec4 or whatever diff --git a/src/scn.rs b/src/scn.rs index 6449afd..b77740d 100644 --- a/src/scn.rs +++ b/src/scn.rs @@ -3,10 +3,10 @@ use std::io::SeekFrom; -use binrw::{BinReaderExt, BinResult, BinWrite, binrw}; +use binrw::{BinResult, BinWrite, binrw}; use crate::{ - common_file_operations::{read_bool_from, write_bool_as}, + common_file_operations::{read_bool_from, read_null_terminated_utf8, write_bool_as}, layer::Layer, string_heap::{HeapPointer, HeapStringFromPointer, StringHeap}, tmb::Tmb, @@ -438,16 +438,8 @@ fn strings_from_offsets(offsets: &Vec) -> BinResult> { for offset in offsets { let string_offset = *offset as u64; - let mut string = String::new(); - reader.seek(SeekFrom::Start(base_offset + string_offset))?; - let mut next_char = reader.read_le::().unwrap() as char; - while next_char != '\0' { - string.push(next_char); - next_char = reader.read_le::().unwrap() as char; - } - - strings.push(string); + strings.push(read_null_terminated_utf8(reader)); } Ok(strings) diff --git a/src/sqpack/mod.rs b/src/sqpack/mod.rs index 02d26ca..c0dab06 100644 --- a/src/sqpack/mod.rs +++ b/src/sqpack/mod.rs @@ -6,7 +6,7 @@ use std::io::{Read, Seek, SeekFrom, Write}; use binrw::{BinRead, BinWrite, Endian, binrw}; use data::{BlockHeader, CompressionMode}; -use crate::common::{Platform, Region}; +use crate::common::Platform; use crate::compression::no_header_decompress; mod data; diff --git a/src/string_heap.rs b/src/string_heap.rs index 9a2b940..3fef258 100644 --- a/src/string_heap.rs +++ b/src/string_heap.rs @@ -5,7 +5,7 @@ use std::io::{Cursor, Read, Seek, SeekFrom, Write}; use binrw::{BinRead, BinReaderExt, BinResult, BinWrite, Endian, Error, binrw}; -use crate::{ByteBuffer, common_file_operations::write_string}; +use crate::{ByteBuffer, common_file_operations::{read_null_terminated_utf8, write_string}}; /// A string that exists in a different location in the file, usually a heap with a bunch of other strings. #[binrw] @@ -166,18 +166,11 @@ impl StringHeap { { let offset = self.pos + offset as u64; - let mut string = String::new(); - let old_pos = reader.stream_position().unwrap(); - reader.seek(SeekFrom::Start(offset)).unwrap(); - let mut next_char = reader.read_le::().unwrap() as char; - while next_char != '\0' { - string.push(next_char); - next_char = reader.read_le::().unwrap() as char; - } + let s = read_null_terminated_utf8(reader); reader.seek(SeekFrom::Start(old_pos)).unwrap(); - string + s } }