Code for the "How to Extract Images from PDF in Python" Tutorial

pdf_image_extractor_cli.py

import os
import fitz  # PyMuPDF
import io
from PIL import Image
import argparse

# Command-line tool: extract every embedded image from a PDF, skip the ones
# smaller than the requested minimum size, and save the rest to a directory
# in the requested image format.
parser = argparse.ArgumentParser(description="Extract images from a PDF file.")
parser.add_argument("file", help="PDF file to extract images from.")
parser.add_argument("-o", "--output", help="Output directory for the extracted images.", default="extracted_images")
parser.add_argument("-f", "--format", help="Desired output image format, default is PNG.", default="png")
parser.add_argument("-w", "--width", help="Minimum width for extracted images, default is 100.", default=100, type=int)
parser.add_argument("-he", "--height", help="Minimum height for extracted images, default is 100.", default=100, type=int)
# Parse the arguments
args = parser.parse_args()

# Output directory for the extracted images
output_dir = args.output
# Desired output image format (also used as the file extension)
output_format = args.format
# Pillow registers the JPEG writer under the name "JPEG" only, so a plain
# upper-cased "JPG" would be rejected by Image.save().
pil_format = "JPEG" if output_format.lower() == "jpg" else output_format.upper()
# Minimum width and height for extracted images
min_width = args.width
min_height = args.height
# Create the output directory if it does not exist (no exists()/makedirs() race)
os.makedirs(output_dir, exist_ok=True)
# File path you want to extract images from
file = args.file
# Open the file; the context manager closes the document when we are done
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages
    for page_index in range(len(pdf_file)):
        # Get the page itself
        page = pdf_file[page_index]
        # Human-readable (1-based) page number, matching the output file names
        page_number = page_index + 1
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print(f"[!] No images found on page {page_number}")
        # Iterate over the images on the page
        for image_index, img in enumerate(image_list, start=1):
            # The first item of each image entry is its XREF
            xref = img[0]
            # Extract the raw image bytes
            image_bytes = pdf_file.extract_image(xref)["image"]
            # Load it to PIL; the context manager releases the image afterwards
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Skip images that do not meet the minimum dimensions
                if image.width < min_width or image.height < min_height:
                    print(f"[-] Skipping image {image_index} on page {page_number} due to its small size.")
                    continue
                output_path = os.path.join(output_dir, f"image{page_number}_{image_index}.{output_format}")
                # Pass the path (not an open file object) so Pillow opens and
                # closes the output file itself -- no leaked file handles.
                image.save(output_path, format=pil_format)

pdf_image_extractor.py

import os
import fitz  # PyMuPDF
import io
from PIL import Image

# Script: extract every embedded image from a hard-coded PDF, skip the ones
# smaller than the minimum size, and save the rest to the output directory.

# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format (also used as the file extension)
output_format = "png"
# Pillow registers the JPEG writer under the name "JPEG" only, so a plain
# upper-cased "JPG" would be rejected by Image.save().
pil_format = "JPEG" if output_format.lower() == "jpg" else output_format.upper()
# Minimum width and height for extracted images
min_width = 100
min_height = 100
# Create the output directory if it does not exist (no exists()/makedirs() race)
os.makedirs(output_dir, exist_ok=True)
# File path you want to extract images from
file = "1710.05006.pdf"
# Open the file; the context manager closes the document when we are done
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages
    for page_index in range(len(pdf_file)):
        # Get the page itself
        page = pdf_file[page_index]
        # Human-readable (1-based) page number, matching the output file names
        page_number = page_index + 1
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print(f"[!] No images found on page {page_number}")
        # Iterate over the images on the page
        for image_index, img in enumerate(image_list, start=1):
            # The first item of each image entry is its XREF
            xref = img[0]
            # Extract the raw image bytes
            image_bytes = pdf_file.extract_image(xref)["image"]
            # Load it to PIL; the context manager releases the image afterwards
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Skip images that do not meet the minimum dimensions
                if image.width < min_width or image.height < min_height:
                    print(f"[-] Skipping image {image_index} on page {page_number} due to its small size.")
                    continue
                output_path = os.path.join(output_dir, f"image{page_number}_{image_index}.{output_format}")
                # Pass the path (not an open file object) so Pillow opens and
                # closes the output file itself -- no leaked file handles.
                image.save(output_path, format=pil_format)

Ethical Hacking with Python EBook - Resources - Top

New Tutorials

How to Track Your AI Visibility with Python

How to Automate Excel Reports in Python using Openpyxl

How to Build a Website Blocker in Python

How to Analyze the Most In-Demand Tech Skills from Job Postings in Python

How to Generate and Visualize Text Embeddings in Python

Code for How to Extract Images from PDF in Python Tutorial

Tags

New Tutorials

Popular Tutorials