129 lines
4.0 KiB
Rust
129 lines
4.0 KiB
Rust
extern crate pdf;
|
||
|
||
use std::collections::HashMap;
|
||
use std::env::args;
|
||
use std::fs;
|
||
use std::time::SystemTime;
|
||
|
||
use pdf::enc::StreamFilter;
|
||
use pdf::error::PdfError;
|
||
use pdf::file::{FileOptions, Log};
|
||
use pdf::object::*;
|
||
use pdf::primitive::Primitive;
|
||
|
||
struct VerboseLog;
|
||
impl Log for VerboseLog {
|
||
fn load_object(&self, r: PlainRef) {
|
||
println!("load {r:?}");
|
||
}
|
||
fn log_get(&self, r: PlainRef) {
|
||
println!("get {r:?}");
|
||
}
|
||
}
|
||
|
||
/// Reads a PDF and dumps what it finds.
///
/// The path of the PDF is taken from the first command-line argument. The
/// program then:
///
/// 1. prints the title/author from the trailer's info dictionary (if any),
/// 2. collects the fonts and image XObjects referenced by each page's resources,
/// 3. writes each image with a recognised filter to `extracted_image_<i>.<ext>`,
/// 4. writes each font that has embedded data to `font_<name>`,
/// 5. prints the name and value of every form field,
/// 6. prints the elapsed time.
///
/// # Errors
///
/// Returns the [`PdfError`] produced when the file cannot be opened, a page
/// or its resources cannot be loaded, an XObject reference cannot be
/// resolved, or an image's raw data cannot be read.
///
/// # Panics
///
/// Panics if no file argument is given or if writing an output file fails.
#[cfg(feature = "cache")]
fn main() -> Result<(), PdfError> {
    // Take the input from the command line; a hard-coded absolute path only
    // works on the machine it was written on.
    let path = args().nth(1).expect("no file given");
    println!("read: {}", path);
    let now = SystemTime::now();

    let file = FileOptions::cached().log(VerboseLog).open(&path)?;
    let resolver = file.resolver();

    if let Some(ref info) = file.trailer.info_dict {
        let title = info.title.as_ref().map(|p| p.to_string_lossy());
        let author = info.author.as_ref().map(|p| p.to_string_lossy());

        let descr = match (title, author) {
            (Some(title), None) => title,
            (None, Some(author)) => format!("[no title] – {}", author),
            (Some(title), Some(author)) => format!("{} – {}", title, author),
            _ => "PDF".into(),
        };
        println!("{}", descr);
    }

    let mut images: Vec<_> = vec![];
    let mut fonts = HashMap::new();

    for page in file.pages() {
        // Propagate load failures instead of panicking: `main` already
        // returns `Result<(), PdfError>`.
        let page = page?;
        let resources = page.resources()?;

        // Fonts that fail to load are skipped on purpose (best effort).
        for (i, font) in resources
            .fonts
            .values()
            .map(|lazy_font| lazy_font.load(&resolver))
            .filter_map(|f| f.ok())
            .enumerate()
        {
            // NOTE(review): the fallback key is the per-page index, so unnamed
            // fonts on different pages can overwrite each other in the map.
            let name = match &font.name {
                Some(name) => name.as_str().into(),
                None => i.to_string(),
            };
            fonts.insert(name, font.clone());
        }

        // Keep only the XObjects that are images.
        for (_name, &r) in resources.xobjects.iter() {
            let xobject = resolver.get(r)?;
            if matches!(*xobject, XObject::Image(_)) {
                images.push(xobject);
            }
        }
    }

    for (i, o) in images.iter().enumerate() {
        let img = match **o {
            XObject::Image(ref im) => im,
            _ => continue,
        };
        let (mut data, filter) = img.raw_image_data(&resolver)?;
        // The file extension is chosen from the stream filter; images with
        // any other filter (or none) are not written.
        let ext = match filter {
            Some(StreamFilter::DCTDecode(_)) => "jpeg",
            Some(StreamFilter::JBIG2Decode(_)) => "jbig2",
            Some(StreamFilter::JPXDecode) => "jp2k",
            Some(StreamFilter::FlateDecode(_)) => "png",
            Some(StreamFilter::CCITTFaxDecode(_)) => {
                // CCITT fax data is wrapped by `fax::tiff::wrap` before it is
                // written out with a `.tiff` extension.
                data = fax::tiff::wrap(&data, img.width, img.height).into();
                "tiff"
            }
            _ => continue,
        };

        let fname = format!("extracted_image_{}.{}", i, ext);

        fs::write(fname.as_str(), data).unwrap();
        println!("Wrote file {}", fname);
    }
    // Counts every image XObject found, including those not written above.
    println!("Found {} image(s).", images.len());

    for (name, font) in fonts.iter() {
        let fname = format!("font_{}", name);
        // Only fonts whose embedded data is present and readable are written.
        if let Some(Ok(data)) = font.embedded_data(&resolver) {
            fs::write(fname.as_str(), data).unwrap();
            println!("Wrote file {}", fname);
        }
    }
    println!("Found {} font(s).", fonts.len());

    if let Some(ref forms) = file.get_root().forms {
        println!("Forms:");
        for field in forms.fields.iter() {
            print!("  {:?} = ", field.name);
            match field.value {
                Primitive::String(ref s) => println!("{}", s.to_string_lossy()),
                Primitive::Integer(i) => println!("{}", i),
                Primitive::Name(ref s) => println!("{}", s),
                ref p => println!("{:?}", p),
            }
        }
    }

    // `SystemTime::elapsed` fails if the clock moved backwards; in that case
    // the timing line is simply omitted.
    if let Ok(elapsed) = now.elapsed() {
        println!("Time: {}s", elapsed.as_secs_f64());
    }
    Ok(())
}
|