Get each word or character with its x, y, width, and height on the page. Useful for building search, redaction, or document analysis tools.
use pdfluent::PdfDocument;
/// Prints every word on the first page of `invoice.pdf` with its bounding box.
///
/// Each output line contains the word's text (debug-formatted, so it appears
/// quoted) followed by its `x`, `y`, `width` and `height`, each rounded to one
/// decimal place.
///
/// # Errors
///
/// Propagates any error returned while opening the document, fetching the
/// page, or extracting its words.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let doc = PdfDocument::open("invoice.pdf")?;
    // Pages are 0-indexed, so index 0 is the first page.
    let page = doc.page(0)?;
    for word in page.extract_words()? {
        println!(
            "{:?} x={:.1} y={:.1} w={:.1} h={:.1}",
            word.text, word.x, word.y, word.width, word.height
        );
    }
    Ok(())
}
Open the PDF and get the page you want to analyse. Pages are 0-indexed.
use pdfluent::PdfDocument;
let doc = PdfDocument::open("invoice.pdf")?;
let page = doc.page(0)?;
extract_words() returns a Vec<TextWord>. Each TextWord has a text field and a bounding box (x, y, width, height) in PDF points, measured from the bottom-left of the page.
for word in page.extract_words()? {
println!(
"{} at ({:.1}, {:.1})",
word.text, word.x, word.y
);
}
For finer granularity, use extract_chars(). Each TextChar includes the Unicode codepoint and its individual bounding box.
// Print every character on the page with its codepoint and bounding box.
// Width and height are rounded to one decimal place, like the x/y position,
// so the output stays compact and consistently formatted.
for ch in page.extract_chars()? {
    println!(
        "U+{:04X} '{}' at ({:.1},{:.1}) {:.1}x{:.1}",
        ch.codepoint as u32, ch.text,
        ch.x, ch.y, ch.width, ch.height
    );
}
Combine extract_words() with a simple search to locate a term on the page. The bounding box can then be used to draw a highlight annotation.
let words = page.extract_words()?;
let hits: Vec<_> = words.iter()
.filter(|w| w.text.eq_ignore_ascii_case("total"))
.collect();
for hit in hits {
println!("Found 'total' at ({:.1}, {:.1})", hit.x, hit.y);
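}
PDF coordinates are in points (1/72 inch). Multiply by your DPI and divide by 72 to convert to pixels for rendering. Because PDF coordinates are measured from the bottom-left of the page, flip the y-axis if your rendering target places its origin at the top-left.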
let dpi = 150.0_f64;
let scale = dpi / 72.0;
for word in page.extract_words()? {
let px_x = word.x * scale;
let px_y = word.y * scale;
println!("{} at ({:.0}px, {:.0}px)", word.text, px_x, px_y);
}
No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.
Rust's ownership model prevents buffer overflows and use-after-free. No segfaults in PDF parsing.
Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.