Code for the "How to Extract All PDF Links in Python" Tutorial


View on GitHub

pdf_link_extractor.py

import pikepdf # pip3 install pikepdf

# Sample PDF to scan; swap in the alternative below to try another document.
file = "1810.04805.pdf"
# file = "1710.05006.pdf"


def extract_page_urls(pages):
    """Collect the URI targets of link annotations found on *pages*.

    Each page and annotation only needs a dict-like ``get`` method, which is
    the part of the pikepdf object interface used here.

    Args:
        pages: iterable of page objects (e.g. ``pdf.pages``).

    Returns:
        A list of URLs as plain ``str`` values, in document order.
    """
    found = []
    for page in pages:
        annotations = page.get("/Annots")
        # Pages without any annotation have no "/Annots" entry at all.
        if annotations is None:
            continue
        for annotation in annotations:
            action = annotation.get("/A")
            # Not every annotation carries an action (e.g. highlights, notes).
            if action is None:
                continue
            uri = action.get("/URI")
            if uri is not None:
                # Store a plain str so the result does not depend on the
                # PDF object staying open.
                found.append(str(uri))
    return found


if __name__ == "__main__":
    # The context manager closes the PDF even if extraction fails.
    with pikepdf.Pdf.open(file) as pdf_file:
        urls = extract_page_urls(pdf_file.pages)
    for uri in urls:
        print("[+] URL Found:", uri)

    print("[*] Total URLs extracted:", len(urls))

pdf_link_extractor_regex.py

import fitz # pip install PyMuPDF
import re

# Regular expression matching http(s) URLs.
# NOTE: the first character class after the scheme must start with
# "-a-zA-Z0-9@"; an e-mail obfuscation filter had replaced that part with
# placeholder text, which made the pattern miss almost every URL.
url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
# Sample PDF to scan; swap in the alternative below to try another document.
# file = "1710.05006.pdf"
file = "1810.04805.pdf"


def _page_text(page):
    """Return the plain text of a single PyMuPDF page.

    Prefers ``get_text()`` and falls back to the legacy camelCase
    ``getText()`` so the script works with both old and new PyMuPDF.
    """
    extract = getattr(page, "get_text", None) or page.getText
    return extract()


def extract_text(path):
    """Return the concatenated raw text of every page of the PDF at *path*."""
    with fitz.open(path) as pdf:
        # join() avoids repeated string reallocation on large documents.
        return "".join(_page_text(page) for page in pdf)


def find_urls(text):
    """Return every substring of *text* matched by ``url_regex``, in order.

    Args:
        text: the raw text to scan.

    Returns:
        A list of matched URL strings; empty when nothing matches.
    """
    return [match.group() for match in re.finditer(url_regex, text)]


if __name__ == "__main__":
    text = extract_text(file)
    urls = find_urls(text)
    for url in urls:
        print("[+] URL Found:", url)
    print("[*] Total URLs extracted:", len(urls))