Before running text extraction, check whether the PDF was digitally created or is a scan of a physical document.
use pdfluent::PdfDocument;
/// Opens `document.pdf` and prints one summary line per page: the 1-based page
/// number, whether the page has selectable text, whether it has raster images,
/// and the label `classify` assigns to that pair of flags.
///
/// An error from opening the document is returned to the caller through `?`.
fn main() -> pdfluent::Result<()> {
    let document = PdfDocument::open("document.pdf")?;

    for (index, page) in document.pages().enumerate() {
        // Query both flags once so the printed values and the classification
        // are guaranteed to agree.
        let text_present = page.has_selectable_text();
        let images_present = page.has_raster_images();
        let label = classify(text_present, images_present);

        // `enumerate` counts from zero; page numbers are reported from one.
        let page_number = index + 1;
        println!(
            "Page {}: text={} images={} => {}",
            page_number, text_present, images_present, label
        );
    }

    Ok(())
}
/// Maps a page's two content flags to a human-readable label.
///
/// * `has_text == true` yields `"digital"`, regardless of `has_images`.
/// * No text but images yields `"scanned"`.
/// * Neither text nor images yields `"blank or vector-only"`.
///
/// NOTE(review): a scanned page that carries an OCR text layer presumably
/// reports text as well and would therefore be labelled `"digital"` here;
/// confirm whether callers should check for invisible text first.
fn classify(has_text: bool, has_images: bool) -> &'static str {
    match (has_text, has_images) {
        // Text wins: the image flag is irrelevant once text is present.
        (true, _) => "digital",
        (false, true) => "scanned",
        (false, false) => "blank or vector-only",
    }
}

// No additional features are required. Page inspection is part of the base crate.
# Cargo.toml
[dependencies]
pdfluent = "0.9"

Use `doc.pages()` to get an iterator over all pages. Each `Page` gives you access to content stream analysis.
use pdfluent::PdfDocument;
// Open the document; `?` passes an open failure up to the enclosing function.
let doc = PdfDocument::open("document.pdf")?;

// Print every page's content type next to its 1-based page number.
for (index, page) in doc.pages().enumerate() {
    let page_number = index + 1;
    println!("Page {}: {:?}", page_number, page.content_type());
}

has_selectable_text() returns true if the page content stream contains any text operators. has_raster_images() returns true if the page contains XObject images.
// Flag every page that has raster images but no selectable text.
for page in doc.pages() {
    let text_present = page.has_selectable_text();
    let images_present = page.has_raster_images();

    match (text_present, images_present) {
        // Images without any text: report the page as a likely scan.
        (false, true) => println!("This page appears to be a scan."),
        // Every other combination is left unreported.
        _ => {}
    }
}

Count pages without text. A score above 80% is a strong indicator that the document is a scan or a mix.
// Share of pages without selectable text, used as a rough "is this a scan?" score.
let total = doc.page_count() as f32;
let no_text = doc
    .pages()
    .filter(|p| !p.has_selectable_text())
    .count() as f32;

// A document with zero pages would otherwise compute 0.0 / 0.0 = NaN and print
// "Scan ratio: NaN%"; report 0% for that case instead.
let scan_ratio = if total > 0.0 { no_text / total } else { 0.0 };
println!("Scan ratio: {:.0}%", scan_ratio * 100.0);

// More than 80% of pages lacking text is treated as a likely scan.
if scan_ratio > 0.8 {
    println!("Likely a scanned document. Consider running OCR.");
}

Some scanned PDFs have a hidden text layer added by OCR software. Use has_invisible_text() to detect this.
// Report each page for which `has_invisible_text()` is true.
for (index, page) in doc.pages().enumerate() {
    // Pages without invisible text are skipped silently.
    if !page.has_invisible_text() {
        continue;
    }

    // `enumerate` counts from zero; page numbers are reported from one.
    println!(
        "Page {} has an OCR text layer (invisible text).",
        index + 1
    );
}

No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.
Rust's ownership model prevents buffer overflows and use-after-free. No segfaults in PDF parsing.
Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.