PDF to John
#!/usr/bin/env python3
# This software is Copyright (c) 2023 Benjamin Dornel <benjamindornel@gmail.com>
# and it is hereby released to the general public under the following terms:
# Redistribution and use in source and binary forms, with or without
# modification, are permitted.
import argparse
import logging
try:
from pyhanko.pdf_utils.misc import PdfReadError
from pyhanko.pdf_utils.reader import PdfFileReader
except ImportError:
print("pyhanko is missing, run 'pip install --user pyhanko==0.20.1' to install it!")
exit(1)
logger = logging.getLogger(__name__)
class SecurityRevision:
"""Represents Standard Security Handler Revisions
and the corresponding key length for the /O and /U entries
In Revision 5, the /O and /U entries were extended to 48 bytes,
with three logical parts -- a 32 byte verification hash,
an 8 byte validation salt, and an 8 byte key salt."""
revisions = {
2: 32, # RC4_BASIC
3: 32, # RC4_EXTENDED
4: 32, # RC4_OR_AES128
5: 48, # AES_R5_256
6: 48, # AES_256
}
@classmethod
def get_key_length(cls, revision):
"""
Get the key length for a given revision,
defaults to 48 if no revision is specified.
"""
return cls.revisions.get(revision, 48)
class PdfHashExtractor:
"""
Extracts hash and encryption information from a PDF file
Attributes:
- `file_name`: PDF file path.
- `strict`: Boolean that controls whether an error is raised, if a PDF
has problems e.g. Multiple definitions in encryption dictionary
for a specific key. Defaults to `False`.
- `algorithm`: Encryption algorithm used by the standard security handler
- `length`: The length of the encryption key, in bits. Defaults to 40.
- `permissions`: User access permissions
- `revision`: Revision of the standard security handler
"""
def __init__(self, file_name: str, strict: bool = False):
self.file_name = file_name
with open(file_name, "rb") as doc:
self.pdf = PdfFileReader(doc, strict=strict)
self.encrypt_dict = self.pdf._get_encryption_params()
if not self.encrypt_dict:
raise RuntimeError("File not encrypted")
self.algorithm: int = self.encrypt_dict.get("/V")
self.length: int = self.encrypt_dict.get("/Length", 40)
self.permissions: int = self.encrypt_dict["/P"]
self.revision: int = self.encrypt_dict["/R"]
@property
def document_id(self) -> bytes:
return self.pdf.document_id[0]
@property
def encrypt_metadata(self) -> str:
"""
Get a string representation of whether metadata is encrypted.
Returns "1" if metadata is encrypted, "0" otherwise.
"""
return str(int(self.pdf.security_handler.encrypt_metadata))
def parse(self) -> str:
"""
Parse PDF encryption information into a formatted string for John
"""
passwords = self.get_passwords()
fields = [
f"$pdf${self.algorithm}",
self.revision,
self.length,
self.permissions,
self.encrypt_metadata,
len(self.document_id),
self.document_id.hex(),
passwords,
]
return "*".join(map(str, fields))
def get_passwords(self) -> str:
"""
Creates a string consisting of the hexidecimal string of the
/U, /O, /UE and /OE entries and their corresponding byte string length
"""
passwords = []
keys = ("udata", "odata", "oeseed", "ueseed")
max_key_length = SecurityRevision.get_key_length(self.revision)
for key in keys:
if data := getattr(self.pdf.security_handler, key):
data: bytes = data[:max_key_length]
passwords.extend([str(len(data)), data.hex()])
return "*".join(passwords)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PDF Hash Extractor")
parser.add_argument(
"pdf_files", nargs="+", help="PDF file(s) to extract information from"
)
parser.add_argument(
"-d", "--debug", action="store_true", help="Print the encryption dictionary"
)
args = parser.parse_args()
for filename in args.pdf_files:
try:
extractor = PdfHashExtractor(filename)
pdf_hash = extractor.parse()
print(pdf_hash)
if args.debug:
if extractor.encrypt_dict:
print("Encryption Dictionary:")
for key, value in extractor.encrypt_dict.items():
print(f"{key}: {value}")
else:
print("No encryption dictionary found in the PDF.")
except PdfReadError as error:
logger.error("%s : %s", filename, error, exc_info=True)