// Patch summary (descriptive preamble only: it sits before the first file header, and the
// patch text below it is unchanged).
//
// Purpose: put an upper bound on the number of bytes the parsers in `turtle/` accumulate, and
// report a dedicated error when the bound is exceeded.
//
// What the patch adds, file by file:
//   * turtle/src/lib.rs: a crate-private constant `MAX_BUFFER_SIZE` set to 10_000_000.
//   * turtle/src/error.rs: a new `TurtleErrorKind::BufferOverflow` variant and its `Display`
//     arm, whose message interpolates `MAX_BUFFER_SIZE` and contains the words "out of memory".
//   * turtle/src/utils.rs: a method `check_buffer_size(&self, buffer: &str)` that returns
//     `Err(self.parse_error(TurtleErrorKind::BufferOverflow))` when `buffer.len()` (a byte count)
//     is strictly greater than `MAX_BUFFER_SIZE`, and `Ok(())` otherwise; plus an inline check of
//     `self.buffer.len()` against the same constant inside an existing read loop.
//   * turtle/src/gnquads.rs, shared.rs, turtle.rs: one `read.check_buffer_size(buffer)?;` call
//     per hunk, placed just before a closing brace. The hunk headers name `parse_variable`,
//     `parse_iriref`, `parse_blank_node_label`, `parse_langtag`,
//     `parse_string_literal_quote_inner`, `parse_numeric_literal` (two hunks),
//     `parse_prefixed_name`, `parse_exponent`, `parse_string_literal_long_quote_inner` and
//     `parse_pn_prefix`. Presumably each call ends one iteration of the loop that grows
//     `buffer`; the loops themselves are mostly outside the hunks, so confirm in the full files.
//   * turtle/tests/recovery.rs: a test `very_big_literal` that builds a quoted run of 11_000_000
//     '0' characters, feeds it to `NTriplesParser`, asserts that the first `parse_step` call is
//     `Ok`, and asserts that the second call returns an error whose text contains
//     "out of memory".
//
// NOTE(review): in utils.rs the new `self.buffer.len()` check sits after the `match` on the read
// result. The only arms visible in the hunk are the `Interrupted` no-op and an early error
// return. Confirm that the successful-read arm (not shown) falls through to this check; if that
// arm returns, the check is reached only after an interrupted read.
// NOTE(review): as stored here, the hunks are collapsed onto a few long lines, and the string
// pushed before the zeros in the test is only a space and a quote. The line structure, and
// possibly some angle-bracketed text, looks lost from this copy; verify against the repository
// before applying.
diff --git a/turtle/src/error.rs b/turtle/src/error.rs index 8d420e1..b0bbe57 100644 --- a/turtle/src/error.rs +++ b/turtle/src/error.rs @@ -1,4 +1,4 @@ -use crate::MAX_STACK_SIZE; +use crate::{MAX_BUFFER_SIZE, MAX_STACK_SIZE}; use oxilangtag::LanguageTagParseError; use oxiri::IriParseError; use rio_api::parser::{LineBytePosition, ParseError}; @@ -32,6 +32,7 @@ pub enum TurtleErrorKind { error: LanguageTagParseError, }, StackOverflow, + BufferOverflow, } impl fmt::Display for TurtleError { @@ -56,6 +57,9 @@ impl fmt::Display for TurtleError { TurtleErrorKind::StackOverflow => { write!(f, "The parser encountered more than {} nested constructions. This number is limited in order to avoid stack overflow OS errors.", MAX_STACK_SIZE) } + TurtleErrorKind::BufferOverflow => { + write!(f, "The parser encountered a term with more than {} bytes. The size is limited in order to avoid out of memory error on invalid files.", MAX_BUFFER_SIZE) + } }?; if let Some(position) = self.position { write!( diff --git a/turtle/src/gnquads.rs b/turtle/src/gnquads.rs index b98c0f3..0eb6f6b 100644 --- a/turtle/src/gnquads.rs +++ b/turtle/src/gnquads.rs @@ -219,6 +219,7 @@ pub(crate) fn parse_variable<'a>( } else { return Ok(Variable { name: buffer }); } + read.check_buffer_size(buffer)?; } } diff --git a/turtle/src/lib.rs b/turtle/src/lib.rs index 25c7df6..2b9618c 100644 --- a/turtle/src/lib.rs +++ b/turtle/src/lib.rs @@ -70,3 +70,6 @@ pub use gtrig::GTriGParser; /// This limit is set in order to avoid stack overflow error when parsing such structures due to too many recursive calls. /// The actual limit value is a wet finger compromise between not failing to parse valid files and avoiding to trigger stack overflow errors. const MAX_STACK_SIZE: usize = 128; + +/// Maximal size of a buffer (useful to limit memory consumption). 
+const MAX_BUFFER_SIZE: usize = 10_000_000; diff --git a/turtle/src/shared.rs b/turtle/src/shared.rs index 5dec6f3..2fe6390 100644 --- a/turtle/src/shared.rs +++ b/turtle/src/shared.rs @@ -73,6 +73,7 @@ pub fn parse_iriref( read_utf8_char(read)? }), } + read.check_buffer_size(buffer)?; } } @@ -117,6 +118,7 @@ pub fn parse_blank_node_label<'a>( } } } + read.check_buffer_size(buffer)?; } } @@ -138,6 +140,7 @@ pub fn parse_langtag( break; } } + read.check_buffer_size(buffer)?; } LanguageTag::parse(buffer.as_str()).map_err(|error| { read.parse_error(TurtleErrorKind::InvalidLanguageTag { @@ -177,6 +180,7 @@ pub fn parse_string_literal_quote_inner( read_utf8_char(read)? }), } + read.check_buffer_size(buffer)?; } } diff --git a/turtle/src/turtle.rs b/turtle/src/turtle.rs index 471358b..d601bb2 100644 --- a/turtle/src/turtle.rs +++ b/turtle/src/turtle.rs @@ -1010,6 +1010,7 @@ pub(crate) fn parse_numeric_literal<'a>( } _ => break, } + read.check_buffer_size(buffer)?; } // We read the digits after . @@ -1044,6 +1045,7 @@ pub(crate) fn parse_numeric_literal<'a>( } _ => break, } + read.check_buffer_size(buffer)?; } Some(count_after) } else { @@ -1231,6 +1233,7 @@ pub(crate) fn parse_prefixed_name<'a>( } } } + read.check_buffer_size(buffer)?; } Ok(NamedNode { iri: buffer }) } @@ -1320,6 +1323,7 @@ fn parse_exponent( } else { return Ok(()); } + read.check_buffer_size(buffer)?; } } @@ -1368,6 +1372,7 @@ fn parse_string_literal_long_quote_inner( read_utf8_char(read)? 
}), } + read.check_buffer_size(buffer)?; } } @@ -1427,6 +1432,7 @@ fn parse_pn_prefix( } } } + read.check_buffer_size(buffer)?; } } diff --git a/turtle/src/utils.rs b/turtle/src/utils.rs index f24d01a..5cf66a0 100644 --- a/turtle/src/utils.rs +++ b/turtle/src/utils.rs @@ -1,5 +1,5 @@ use crate::error::*; -use crate::MAX_STACK_SIZE; +use crate::{MAX_BUFFER_SIZE, MAX_STACK_SIZE}; use rio_api::parser::LineBytePosition; use std::collections::VecDeque; use std::io::{BufRead, ErrorKind, Read}; @@ -175,6 +175,9 @@ impl LookAheadByteReader { Err(e) if e.kind() == ErrorKind::Interrupted => {} Err(e) => return Err(e.into()), } + if self.buffer.len() > MAX_BUFFER_SIZE { + return Err(self.parse_error(TurtleErrorKind::BufferOverflow)); + } } } @@ -208,6 +211,14 @@ impl LookAheadByteReader { pub fn decrement_stack_size(&mut self) { self.stack_size -= 1; } + + pub fn check_buffer_size(&self, buffer: &str) -> Result<(), TurtleError> { + if buffer.len() > MAX_BUFFER_SIZE { + Err(self.parse_error(TurtleErrorKind::BufferOverflow)) + } else { + Ok(()) + } + } } #[derive(Default)] diff --git a/turtle/tests/recovery.rs b/turtle/tests/recovery.rs index 3ab9fd9..9862c05 100644 --- a/turtle/tests/recovery.rs +++ b/turtle/tests/recovery.rs @@ -43,3 +43,22 @@ fn nquads_error_recovery() { assert_eq!(count, 3); assert_eq!(count_err, 2); } + +#[test] +fn very_big_literal() { + let mut data = String::with_capacity(12_000_000); + data.push_str(" \""); + for _ in 0..11_000_000 { + data.push('0'); + } + data.push_str("\" ."); + let mut parser = NTriplesParser::new(Cursor::new(&data)); + assert!(parser + .parse_step(&mut |_| Ok(()) as Result<(), TurtleError>) + .is_ok()); + assert!(parser + .parse_step(&mut |_| Ok(()) as Result<(), TurtleError>) + .unwrap_err() + .to_string() + .contains("out of memory")); +}