Read all text content from a PDF document. PDFluent preserves reading order and handles multi-column layouts, right-to-left scripts, and CID fonts.
use pdfluent::PdfDocument;
/// Opens `document.pdf` and prints the extracted text of every page to
/// standard output, each preceded by a `--- Page N ---` banner (N is 1-based).
///
/// Any error from opening the document or from extracting a page's text is
/// propagated to the caller with `?`, which stops the loop at the first failure.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let document = PdfDocument::open("document.pdf")?;
    // Pair each page with its human-friendly (1-based) number up front.
    for (page_number, page) in (1usize..).zip(document.pages()) {
        let content = page.extract_text()?;
        println!("--- Page {} ---", page_number);
        println!("{}", content);
    }
    Ok(())
}
Load the PDF. Text extraction works page by page, so memory usage stays low even for large documents.
use pdfluent::PdfDocument;
let doc = PdfDocument::open("contract.pdf")?;
Access a page by its 0-based index and call extract_text(). The method returns a plain String with words separated by spaces and paragraphs separated by newlines.
let page = doc.page(0)?;
let text = page.extract_text()?;
println!("{}", text);
Iterate over doc.pages() to process every page. Each call to extract_text() is independent.
let full_text: String = doc
.pages()
.map(|p| p.extract_text().unwrap_or_default())
.collect::<Vec<_>>()
.join("\n");
The default extraction follows the PDF content stream order. For multi-column documents, use TextExtractionOptions to enable layout analysis.
use pdfluent::{TextExtractionOptions, ReadingOrder};
let opts = TextExtractionOptions::default()
.reading_order(ReadingOrder::LayoutAnalysis);
let text = doc.page(0)?.extract_text_with_options(&opts)?;
Use extract_lines() to get a Vec<TextLine> where each entry contains the string and its approximate vertical position.
for line in doc.page(0)?.extract_lines()? {
println!("[y={:.1}] {}", line.y, line.text);
}
No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.
Rust's ownership model prevents buffer overflows and use-after-free. No segfaults in PDF parsing.
Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.