How-to guides/Batch Processing

Process thousands of PDFs in parallel with Rust

Use rayon and PDFluent together to process a folder of PDFs across all CPU cores. No thread pool setup required.

rust
use rayon::prelude::*;
use std::fs;
use pdfluent::PdfDocument;

/// Processes every PDF found directly inside `./input`, in parallel.
///
/// One status line is printed per file: `OK` on stdout, `ERR` on stderr.
///
/// # Errors
///
/// Returns an error only if `./input` itself cannot be read. A failure on an
/// individual file is reported and does not stop the rest of the batch.
fn main() -> anyhow::Result<()> {
    let files: Vec<_> = fs::read_dir("./input")?
        // Directory entries that cannot be read are skipped, not fatal.
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        // Compare case-insensitively so files such as `REPORT.PDF` are not
        // silently left out of the batch.
        .filter(|p| p.extension().map_or(false, |e| e.eq_ignore_ascii_case("pdf")))
        .collect();

    files.par_iter().for_each(|path| {
        // Look the name up once; fall back to the whole path rather than
        // panicking if a path ever has no final component.
        let name = path.file_name().unwrap_or_else(|| path.as_os_str());
        match process_file(path) {
            Ok(_) => println!("OK  {:?}", name),
            Err(e) => eprintln!("ERR {:?}: {}", name, e),
        }
    });

    Ok(())
}

/// Extracts the text of the PDF at `path` and writes it to a sibling file
/// with the same name and a `.txt` extension.
///
/// # Errors
///
/// Propagates any failure from opening the document, extracting its text,
/// or writing the output file.
fn process_file(path: &std::path::Path) -> pdfluent::Result<()> {
    let document = PdfDocument::open(path)?;
    let contents = document.extract_text()?;
    // `with_extension` swaps `.pdf` for `.txt`, so the output lands next to the input.
    fs::write(path.with_extension("txt"), contents)?;
    Ok(())
}
Install: cargo add pdfluent · Download SDK →

Step by step

1

Add PDFluent and rayon to Cargo.toml

rayon provides a parallel iterator that distributes work across all available CPU cores automatically.

toml
# Cargo.toml
[dependencies]
# Provides PdfDocument, used to open and process each file.
pdfluent = "0.9"
# Provides par_iter(), which spreads the files across worker threads.
rayon = "1.10"
# Provides the anyhow::Result returned by main().
anyhow = "1"
2

Collect the list of PDF files

Use std::fs::read_dir to list the entries of the input directory (it does not descend into subdirectories). Filter by extension to skip non-PDF files.

rust
use std::fs;
use std::path::PathBuf;

// Collect the paths of all PDFs directly inside ./input.
let files: Vec<PathBuf> = fs::read_dir("./input")?
    // Directory entries that cannot be read are skipped, not fatal.
    .filter_map(|entry| entry.ok())
    .map(|entry| entry.path())
    // Compare case-insensitively so files such as `REPORT.PDF` are not
    // silently left out of the batch.
    .filter(|p| p.extension().map_or(false, |ext| ext.eq_ignore_ascii_case("pdf")))
    .collect();

println!("Found {} PDF files", files.len());
3

Process files in parallel with rayon

Replace .iter() with .par_iter() to distribute the files across rayon's worker threads. PdfDocument is Send, and each document is opened, used, and dropped inside a single closure call, so it works safely across rayon threads.

rust
use rayon::prelude::*;

// The closure runs once per file; rayon decides which worker thread runs it.
files.par_iter().for_each(|path| {
    if let Err(e) = process_file(path) {
        eprintln!("ERR {}: {}", path.display(), e);
    } else {
        println!("OK  {}", path.display());
    }
});
4

Write the per-file processing function

Keep the function focused on one document. Open, process, save. Errors are returned and logged by the caller.

rust
/// Stamps "CONFIDENTIAL" on every page of the PDF at `path` and saves the
/// result under `./output`, keeping the original file name.
///
/// # Errors
///
/// Propagates any failure from opening the document, adding a watermark,
/// or saving the output.
///
/// # Panics
///
/// Panics if `path` has no final file-name component.
fn process_file(path: &std::path::Path) -> pdfluent::Result<()> {
    let mut document = PdfDocument::open(path)?;

    // Example: stamp a watermark on every page
    for page in document.pages_mut() {
        page.add_watermark("CONFIDENTIAL")?;
    }

    // Same file name as the input, but inside the output directory.
    let file_name = path.file_name().unwrap();
    let destination = std::path::Path::new("./output").join(file_name);
    document.save(destination)?;
    Ok(())
}
5

Collect results for a summary report

Use par_iter().map() instead of par_iter().for_each() when you need to collect success and error counts.

rust
// Turn each file into an (ok, err) pair and let rayon add the pairs together.
// No shared counters, Arc, or per-file clones are needed.
let (ok_count, err_count) = files
    .par_iter()
    .map(|path| match process_file(path) {
        Ok(_) => (1usize, 0usize),
        Err(e) => {
            eprintln!("ERR {}: {}", path.display(), e);
            (0, 1)
        }
    })
    // (0, 0) is the identity, so an empty input reports zero for both counts.
    .reduce(|| (0, 0), |a, b| (a.0 + b.0, a.1 + b.1));

println!("Done. {} ok, {} errors", ok_count, err_count);

Notes and tips

  • rayon uses one thread per logical CPU by default. Set RAYON_NUM_THREADS=4 to cap concurrency, which is useful on memory-constrained servers.
  • PdfDocument::open reads the full document into memory. For very large files (>500 MB), set RAYON_NUM_THREADS no higher than the number of files you can fit in RAM simultaneously.
  • Output directory must exist before you start. Create it with fs::create_dir_all("./output")? near the top of main().
  • For recursive directory traversal use the walkdir crate alongside rayon::iter::ParallelBridge.

Why PDFluent for this

Pure Rust

No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.

Memory safe

Rust's ownership model prevents buffer overflows and use-after-free. No segfaults in PDF parsing.

Runs anywhere

Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.

Frequently asked questions