use std::borrow::Cow; use std::ops::{Deref, Range, RangeFrom}; /// Lexing an input file, in the sense of breaking it up into substrings based on delimiters and /// whitespace. use std::str::FromStr; use crate::error::*; use crate::primitive::Name; mod str; pub use self::str::{HexStringLexer, StringLexer}; /// `Lexer` has functionality to jump around and traverse the PDF lexemes of a string in any direction. #[derive(Copy, Clone)] #[allow(dead_code)] pub struct Lexer<'a> { pos: usize, buf: &'a [u8], file_offset: usize, } // find the position where condition(data[pos-1]) == false and condition(data[pos]) == true #[inline] fn boundary_rev(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize { match data[..pos].iter().rposition(|&b| !condition(b)) { Some(start) => start + 1, None => 0, } } // find the position where condition(data[pos-1]) == true and condition(data[pos]) == false #[inline] fn boundary(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize { match data[pos..].iter().position(|&b| !condition(b)) { Some(start) => pos + start, None => data.len(), } } #[inline] fn is_whitespace(b: u8) -> bool { matches!(b, 0 | b' ' | b'\r' | b'\n' | b'\t') } #[inline] fn not(f: impl Fn(T) -> bool) -> impl Fn(T) -> bool { move |t| !f(t) } impl<'a> Lexer<'a> { pub fn new(buf: &'a [u8]) -> Lexer<'a> { Lexer { pos: 0, buf, file_offset: 0, } } pub fn with_offset(buf: &'a [u8], file_offset: usize) -> Lexer<'a> { Lexer { pos: 0, buf, file_offset, } } /// Returns next lexeme. Lexer moves to the next byte after the lexeme. (needs to be tested) #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result> { let (lexeme, pos) = self.next_word()?; self.pos = pos; Ok(lexeme) } /// consume the whitespace sequence following the stream start pub fn next_stream(&mut self) -> Result<()> { let pos = self.skip_whitespace(self.pos)?; if !self.buf[pos..].starts_with(b"stream") { // bail!("next token isn't 'stream'"); } let &b0 = self.buf.get(pos + 6).ok_or(PdfError::EOF)?; if b0 == b'\n' { self.pos = pos + 7; } else if b0 == b'\r' { let &b1 = self.buf.get(pos + 7).ok_or(PdfError::EOF)?; if b1 != b'\n' { bail!("invalid whitespace following 'stream'"); // bail!("invalid whitespace following 'stream'"); } self.pos = pos + 8; } else { bail!("invalid whitespace"); } Ok(()) } /// Gives previous lexeme. Lexer moves to the first byte of this lexeme. (needs to be tested) pub fn back(&mut self) -> Result> { //println!("back: {:?}", String::from_utf8_lossy(&self.buf[self.pos.saturating_sub(20) .. self.pos])); // first reverse until we find non-whitespace let end_pos = boundary_rev(self.buf, self.pos, is_whitespace); let start_pos = boundary_rev(self.buf, end_pos, not(is_whitespace)); self.pos = start_pos; Ok(self.new_substr(start_pos..end_pos)) } /// Look at the next lexeme. Will return empty substr if the next character is EOF. pub fn peek(&self) -> Result> { match self.next_word() { Ok((substr, _)) => Ok(substr), Err(PdfError::EOF) => Ok(self.new_substr(self.pos..self.pos)), Err(e) => Err(e), } } /// Returns `Ok` if the next lexeme matches `expected` - else `Err`. pub fn next_expect(&mut self, expected: &'static str) -> Result<()> { let word = self.next()?; if word.equals(expected.as_bytes()) { Ok(()) } else { Err(PdfError::UnexpectedLexeme { pos: self.pos, lexeme: word.to_string(), expected, }) } } /// skip whitespaces and return the position of the first non-whitespace character #[inline] fn skip_whitespace(&self, pos: usize) -> Result { // Move away from eventual whitespace let pos = boundary(self.buf, pos, is_whitespace); if pos >= self.buf.len() { Err(PdfError::EOF) } else { Ok(pos) } } /// Used by next, peek and back - returns substring and new position /// If forward, places pointer at the next non-whitespace character. /// If backward, places pointer at the start of the current word. // TODO ^ backward case is actually not tested or.. thought about that well. fn next_word(&self) -> Result<(Substr<'a>, usize)> { if self.pos == self.buf.len() { return Err(PdfError::EOF); } let mut pos = self.skip_whitespace(self.pos)?; while self.buf.get(pos) == Some(&b'%') { pos += 1; if let Some(off) = self.buf[pos..].iter().position(|&b| b == b'\n') { pos += off + 1; } // Move away from eventual whitespace pos = self.skip_whitespace(pos)?; } let start_pos = pos; // If first character is delimiter, this lexeme only contains that character. // - except << and >> which go together, and / which marks the start of a // name token. if self.is_delimiter(pos) { if self.buf[pos] == b'/' { pos = self.advance_pos(pos)?; while !self.is_whitespace(pos) && !self.is_delimiter(pos) { match self.advance_pos(pos) { Ok(p) => pos = p, Err(_) => break, } } return Ok((self.new_substr(start_pos..pos), pos)); } if let Some(slice) = self.buf.get(pos..=pos + 1) { if slice == b"<<" || slice == b">>" { pos = self.advance_pos(pos)?; } } pos = self.advance_pos(pos)?; return Ok((self.new_substr(start_pos..pos), pos)); } // Read to past the end of lexeme while !self.is_whitespace(pos) && !self.is_delimiter(pos) { match self.advance_pos(pos) { Ok(p) => pos = p, Err(_) => break, } } let result = self.new_substr(start_pos..pos); // Move away from whitespace again //pos = self.skip_whitespace(pos)?; Ok((result, pos)) } /// Just a helper for next_word. #[inline] fn advance_pos(&self, pos: usize) -> Result { if pos < self.buf.len() { Ok(pos + 1) } else { Err(PdfError::EOF) } } #[inline] pub fn next_as(&mut self) -> Result where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static, { self.next().and_then(|word| word.to::()) } #[inline] pub fn get_pos(&self) -> usize { self.pos } #[inline] pub fn new_substr(&self, mut range: Range) -> Substr<'a> { // if the range is backward, fix it // start is inclusive, end is exclusive. keep that in mind if range.start > range.end { let new_end = range.start + 1; range.start = range.end + 1; range.end = new_end; } Substr { file_offset: self.file_offset + range.start, slice: &self.buf[range], } } /// Just a helper function for set_pos, set_pos_from_end and offset_pos. #[inline] pub fn set_pos(&mut self, wanted_pos: usize) -> Substr<'a> { let new_pos = wanted_pos.min(self.buf.len()); let range = if self.pos < new_pos { self.pos..new_pos } else { new_pos..self.pos }; self.pos = new_pos; self.new_substr(range) } /// Returns the substr between the old and new positions #[inline] pub fn set_pos_from_end(&mut self, new_pos: usize) -> Substr<'a> { self.set_pos(self.buf.len().saturating_sub(new_pos).saturating_sub(1)) } /// Returns the substr between the old and new positions #[inline] pub fn offset_pos(&mut self, offset: usize) -> Substr<'a> { self.set_pos(self.pos.wrapping_add(offset)) } /// Moves pos to start of next line. Returns the skipped-over substring. #[allow(dead_code)] pub fn seek_newline(&mut self) -> Substr { let start = self.pos; while self.buf[self.pos] != b'\n' && self.incr_pos() {} self.incr_pos(); self.new_substr(start..self.pos) } // TODO: seek_substr and seek_substr_back should use next() or back()? /// Moves pos to after the found `substr`. Returns Substr with traversed text if `substr` is found. #[allow(dead_code)] pub fn seek_substr(&mut self, substr: impl AsRef<[u8]>) -> Option> { // let substr = substr.as_ref(); let start = self.pos; let mut matched = 0; loop { if self.pos >= self.buf.len() { return None; } if self.buf[self.pos] == substr[matched] { matched += 1; } else { matched = 0; } if matched == substr.len() { break; } self.pos += 1; } self.pos += 1; Some(self.new_substr(start..(self.pos - substr.len()))) } //TODO perhaps seek_substr_back should, like back(), move to the first letter of the substr. /// Searches for string backward. Moves to after the found `substr`, returns the traversed /// Substr if found. pub fn seek_substr_back(&mut self, substr: &[u8]) -> Result> { let end = self.pos; match self.buf[..end] .windows(substr.len()) .rposition(|w| w == substr) { Some(start) => { self.pos = start + substr.len(); Ok(self.new_substr(self.pos..end)) } None => Err(PdfError::NotFound { word: String::from_utf8_lossy(substr).into(), }), } } /// Read and return slice of at most n bytes. #[allow(dead_code)] pub fn read_n(&mut self, n: usize) -> Substr<'a> { let start_pos = self.pos; self.pos += n; if self.pos >= self.buf.len() { self.pos = self.buf.len() - 1; } if start_pos < self.buf.len() { self.new_substr(start_pos..self.pos) } else { self.new_substr(0..0) } } /// Returns slice from current position to end. #[inline] pub fn get_remaining_slice(&self) -> &'a [u8] { &self.buf[self.pos..] } /// for debugging pub fn ctx(&self) -> Cow { String::from_utf8_lossy( &self.buf[self.pos.saturating_sub(40)..self.buf.len().min(self.pos + 40)], ) } #[inline] fn incr_pos(&mut self) -> bool { if self.pos >= self.buf.len() - 1 { false } else { self.pos += 1; true } } #[inline] fn is_whitespace(&self, pos: usize) -> bool { self.buf .get(pos) .map(|&b| is_whitespace(b)) .unwrap_or(false) } #[inline] fn is_delimiter(&self, pos: usize) -> bool { self.buf .get(pos) .map(|b| b"()<>[]{}/%".contains(b)) .unwrap_or(false) } } /// A slice from some original string - a lexeme. #[derive(Copy, Clone, Debug)] pub struct Substr<'a> { slice: &'a [u8], file_offset: usize, } impl<'a> Substr<'a> { pub fn new + ?Sized>(data: &'a T, file_offset: usize) -> Self { Substr { slice: data.as_ref(), file_offset, } } // to: &S -> U. Possibly expensive conversion. // as: &S -> &U. Cheap borrow conversion // into: S -> U. Cheap ownership transfer conversion. #[allow(clippy::inherent_to_string)] pub fn to_string(&self) -> String { String::from_utf8_lossy(self.as_slice()).into() } pub fn to_name(&self) -> Result { Ok(Name(std::str::from_utf8(self.as_slice())?.into())) } pub fn to_vec(&self) -> Vec { self.slice.to_vec() } pub fn to(&self) -> Result where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static, { std::str::from_utf8(self.slice)? .parse::() .map_err(|e| PdfError::Parse { source: e.into() }) } pub fn is_integer(&self) -> bool { if self.slice.len() == 0 { return false; } let mut slice = self.slice; if slice[0] == b'-' { if slice.len() < 2 { return false; } slice = &slice[1..]; } is_int(slice) } pub fn is_real_number(&self) -> bool { self.real_number().is_some() } pub fn real_number(&self) -> Option { if self.slice.len() == 0 { return None; } let mut slice = self.slice; if slice[0] == b'-' { if slice.len() < 2 { return None; } slice = &slice[1..]; } if let Some(i) = slice.iter().position(|&b| b == b'.') { if !is_int(&slice[..i]) { return None; } slice = &slice[i + 1..]; } if let Some(len) = slice.iter().position(|&b| !b.is_ascii_digit()) { if len == 0 { return None; } let end = self.slice.len() - slice.len() + len; Some(Substr { file_offset: self.file_offset, slice: &self.slice[..end], }) } else { Some(*self) } } pub fn as_slice(&self) -> &'a [u8] { self.slice } pub fn as_str(&self) -> Result<&str> { std::str::from_utf8(self.slice).map_err(|e| PdfError::Parse { source: e.into() }) } pub fn equals(&self, other: impl AsRef<[u8]>) -> bool { self.slice == other.as_ref() } pub fn reslice(&self, range: RangeFrom) -> Substr<'a> { Substr { file_offset: self.file_offset + range.start, slice: &self.slice[range], } } pub fn file_range(&self) -> Range { self.file_offset..self.file_offset + self.slice.len() } } #[inline] fn is_int(b: &[u8]) -> bool { b.iter().all(|&b| b.is_ascii_digit()) } impl<'a> Deref for Substr<'a> { type Target = [u8]; fn deref(&self) -> &[u8] { self.as_slice() } } impl<'a> PartialEq<&[u8]> for Substr<'a> { fn eq(&self, rhs: &&[u8]) -> bool { self.equals(rhs) } } impl<'a> PartialEq<&str> for Substr<'a> { fn eq(&self, rhs: &&str) -> bool { self.equals(rhs.as_bytes()) } } #[cfg(test)] mod tests { use super::*; use std::fs; use std::fs::File; use std::io::{BufWriter, Write}; use std::u32::MAX; #[test] fn test_boundary_rev() { assert_eq!(boundary_rev(b" hello", 3, not(is_whitespace)), 1); assert_eq!(boundary_rev(b" hello", 3, is_whitespace), 3); } #[test] fn test_boundary() { assert_eq!(boundary(b" hello ", 3, not(is_whitespace)), 6); assert_eq!(boundary(b" hello ", 3, is_whitespace), 3); assert_eq!(boundary(b"01234 7orld", 5, is_whitespace), 7); assert_eq!(boundary(b"01234 7orld", 7, is_whitespace), 7); assert_eq!(boundary(b"q\n", 1, is_whitespace), 2); } #[test] fn test_substr() { assert!(Substr::new("123", 0).is_real_number()); assert!(Substr::new("123.", 0).is_real_number()); assert!(Substr::new("123.45", 0).is_real_number()); assert!(Substr::new(".45", 0).is_real_number()); assert!(Substr::new("-.45", 0).is_real_number()); assert!(!Substr::new("123.45", 0).is_integer()); assert!(Substr::new("123", 0).is_integer()); } #[test] fn test_lexed() { let file_data = fs::read("/home/kschuettler/Dokumente/TestFiles/18 - EVIDIS - Corrosao Irritacao ocular aguda.pdf").expect("File not found!"); println!("{}", file_data.len()); let mut lexer = Lexer::new(&file_data[0..]); let file = File::create("/tmp/pdf.txt").unwrap(); let mut writer = BufWriter::new(file); let mut depth = false; let mut stream = false; let mut dict = 0u32; let lex_count = MAX; let mut lex_count_left = lex_count; while let Ok(s) = lexer.next() { if lex_count_left == 0 { break; } if stream && s.to_string().as_str() == "endstream" { stream = false; writer .write("endstream\n".as_ref()) .expect("Could not write to buffer"); continue; } else if stream { continue; } lex_count_left -= 1; match s.to_string().as_str() { "obj" => depth = true, "endobj" => depth = false, "stream" => { stream = true; writer .write("stream ... ".as_ref()) .expect("Could not write to buffer"); continue; } "<<" => dict += 1, ">>" => dict -= 1, _ => (), } writer.write(s.as_ref()).expect("Could not write to buffer"); if dict == 0 { writer .write("\n".as_ref()) .expect("Could not write to buffer"); } else { writer .write(" ".as_ref()) .expect("Could not write to buffer"); } match s.to_string().as_str() { _ => (), } } writer.flush().expect("Could not flush buffer"); } }