Extract each page as an image with PDFluent, send it to AWS Textract for OCR, then write back an invisible text layer. Works for printed text, tables, and forms.
use aws_sdk_textract::Client;
use pdfluent::{Sdk, ocr::{OcrLayerOptions, OcrWord}};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let sdk = Sdk::new()?;
let doc = sdk.open("scanned_invoice.pdf")?;
let config = aws_config::load_from_env().await;
let textract = Client::new(&config);
let mut builder = doc.add_ocr_layer();
for page in doc.pages().filter(|p| p.is_image_only()) {
// Render page to PNG bytes at 300 DPI
let png_bytes = doc.render_page_to_bytes(page.index(), 300)?;
// Call Textract synchronous DetectDocumentText
let resp = textract
.detect_document_text()
.document(
aws_sdk_textract::types::Document::builder()
.bytes(aws_sdk_textract::primitives::Blob::new(png_bytes.clone()))
.build(),
)
.send()
.await?;
// Convert Textract blocks to PDFluent OcrWord list
let words: Vec<OcrWord> = resp
.blocks()
.iter()
.filter(|b| b.block_type() == Some(&aws_sdk_textract::types::BlockType::Word))
.filter_map(|b| {
let bbox = b.geometry()?.bounding_box()?;
let text = b.text()?.to_string();
Some(OcrWord {
text,
// Textract returns fractions of page width/height
left: bbox.left() as f64,
top: bbox.top() as f64,
width: bbox.width() as f64,
height: bbox.height() as f64,
confidence: b.confidence().map(|c| c as f64),
})
})
.collect();
builder.add_page_words(page.index(), words);
}
let opts = OcrLayerOptions::builder()
.text_rendering_mode(pdfluent::ocr::TextRenderingMode::Invisible)
.build();
let searchable = builder.finish(opts)?;
searchable.save("invoice_searchable.pdf")?;
println!("Done.");
Ok(())
}You need PDFluent, the AWS SDK for Rust, tokio for async, and anyhow for error handling.
# Cargo.toml
[dependencies]
# PDF rendering and the OCR text-layer API used throughout this guide.
pdfluent = "0.9"
# AWS credential/region resolution plus the Textract client.
aws-config = { version = "1", features = ["behavior-version-latest"] }
aws-sdk-textract = "1"
# Async runtime required by the AWS SDK for Rust.
tokio = { version = "1", features = ["full"] }
anyhow = "1"

Textract uses standard AWS credential resolution. Set environment variables or use an IAM role if running on EC2 or Lambda.
# Static credentials for local development; on EC2/Lambda prefer the IAM role.
export AWS_ACCESS_KEY_ID=your_key_id
export AWS_SECRET_ACCESS_KEY=your_secret
export AWS_REGION=us-east-1

PDFluent detects pages that have no text layer. Only those pages need OCR — pages with existing text are passed through unchanged.
let sdk = Sdk::new()?;
let doc = sdk.open("scanned_invoice.pdf")?;
// Gather the indices of pages that contain only a raster image —
// pages that already carry a text layer are left out.
let scanned: Vec<u32> = doc
    .pages()
    .filter_map(|p| p.is_image_only().then(|| p.index()))
    .collect();
println!("{} pages need OCR", scanned.len());

Render the page to PNG bytes at 300 DPI and send them directly to Textract. The synchronous DetectDocumentText call works for pages up to 10 MB; use StartDocumentTextDetection for larger documents.
// Rasterize the page at 300 DPI; the PNG bytes go straight to Textract.
let png_bytes = doc.render_page_to_bytes(page_index, 300)?;
// Build the inline-bytes document payload, then issue the synchronous
// DetectDocumentText call (accepts payloads up to 10 MB).
let document = aws_sdk_textract::types::Document::builder()
    .bytes(aws_sdk_textract::primitives::Blob::new(png_bytes))
    .build();
let resp = textract
    .detect_document_text()
    .document(document)
    .send()
    .await?;

For multi-page PDFs over 10 MB, or when you want table and form field detection, use StartDocumentAnalysis. It processes the document asynchronously and returns a JobId you poll until the status is SUCCEEDED.
// Start async job (supports TABLES and FORMS feature types). Async Textract
// reads the input from S3 — it does not accept inline bytes.
let start_resp = textract
    .start_document_analysis()
    .document_location(
        aws_sdk_textract::types::DocumentLocation::builder()
            .s3_object(
                aws_sdk_textract::types::S3Object::builder()
                    .bucket("my-bucket")
                    .name("scanned_invoice.pdf")
                    .build(),
            )
            .build(),
    )
    .feature_types(aws_sdk_textract::types::FeatureType::Tables)
    .feature_types(aws_sdk_textract::types::FeatureType::Forms)
    .send()
    .await?;
// Propagate a missing JobId as an error instead of panicking on unwrap().
let job_id = start_resp
    .job_id()
    .ok_or_else(|| anyhow::anyhow!("Textract did not return a JobId"))?;
// Poll every 2 s until the job reaches a terminal status. PARTIAL_SUCCESS is
// terminal too — treating it as "still running" would poll forever.
// NOTE(review): consider a retry cap / overall timeout in production.
loop {
    let status_resp = textract
        .get_document_analysis()
        .job_id(job_id)
        .send()
        .await?;
    match status_resp.job_status() {
        Some(aws_sdk_textract::types::JobStatus::Succeeded)
        | Some(aws_sdk_textract::types::JobStatus::PartialSuccess) => break,
        Some(aws_sdk_textract::types::JobStatus::Failed) => anyhow::bail!("Textract job failed"),
        _ => tokio::time::sleep(std::time::Duration::from_secs(2)).await,
    }
}

Textract returns bounding boxes as fractions of the page (0.0–1.0). PDFluent accepts this format directly in OcrWord. Filter for BlockType::Word to get word-level entries.
// Keep only WORD-level blocks and translate each one into PDFluent's
// OcrWord. Textract coordinates are already fractions of the page, which
// is exactly what OcrWord expects, so no rescaling is needed.
let mut words: Vec<OcrWord> = Vec::new();
for block in resp.blocks() {
    if block.block_type() != Some(&aws_sdk_textract::types::BlockType::Word) {
        continue;
    }
    // Blocks without geometry or text are skipped, mirroring filter_map.
    let Some(bbox) = block.geometry().and_then(|g| g.bounding_box()) else {
        continue;
    };
    let Some(text) = block.text() else {
        continue;
    };
    words.push(OcrWord {
        text: text.to_string(),
        left: bbox.left() as f64,
        top: bbox.top() as f64,
        width: bbox.width() as f64,
        height: bbox.height() as f64,
        confidence: block.confidence().map(|c| c as f64),
    });
}
builder.add_page_words(page_index, words);

Call builder.finish() with the layer options to produce a new PDF with the invisible text overlay applied to all processed pages.
// Invisible rendering keeps the page's appearance unchanged while making
// the OCR text selectable and searchable.
let opts = OcrLayerOptions::builder()
    .text_rendering_mode(pdfluent::ocr::TextRenderingMode::Invisible)
    // Optional: emit archive-safe PDF/A-2b output.
    .conform_to_pdfa2b(true)
    .build();
let searchable = builder.finish(opts)?;
searchable.save("invoice_searchable.pdf")?;

No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.
Rust's ownership model rules out buffer overflows and use-after-free in safe code, so malformed PDFs surface as parse errors rather than segfaults.
Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.