I have code that detects a table in a PDF that appears after a specific section, and parses the information in the table and copies it into a pandas dataframe.
Now, I want to indicate whether a box is checked (or not blank) next to the information parsed from the table.
Here is a link to the PDF
Here is my code so far, which can't quite seem to identify whether a box is marked or not.
import pandas as pd
import re
import fitz
from math import sqrt
from io import BytesIO
# Path of the PDF to parse; passed to parse_pdf_for_table_with_checkboxes at the bottom of the script.
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text that is searched for (case-insensitively) to locate the section containing the table.
SECTION_HEADER = "Section 3: Table"
# --- Helper Functions (Re-using the reliable text extraction) ---
def clean_item_text(text):
    """Strip checkbox markers from an item label and tidy its whitespace.

    Box/tick glyphs (U+2611, U+2610, U+25A1, U+25A0, U+2713) are removed
    wherever they occur. A letter "X"/"x" is treated as a checkbox mark only
    when it is a leading, standalone token (followed by whitespace or the end
    of the string), so ordinary words that contain the letter ("Box",
    "Xenon", "Tax form") are left intact.

    Args:
        text: The raw cell/word text. NaN, None and "" are accepted.

    Returns:
        The cleaned label, or "" when the input is missing or empty.
    """
    if pd.isna(text) or text == "":
        return ""
    # Glyphs are never part of an item name, so drop them anywhere.
    without_glyphs = re.sub(
        r"[\u2611\u2610\u25A1\u25A0\u2713]\s*", "", str(text).strip()
    ).strip()
    # Only a leading standalone X/x is a mark; the lookahead keeps "Xenon" whole.
    without_marks = re.sub(r"^(?:[Xx](?=\s|$)\s*)+", "", without_glyphs)
    return without_marks.strip()
def extract_table_text(pdf_path, section_header):
    """Build a three-column DataFrame from the text that follows a section heading.

    The plain text of every page is concatenated, the heading is located
    case-insensitively, and everything up to the next "Section <n>:" marker is
    treated as the table. Non-blank lines 3-5 of that span become the column
    headers and the following lines are split positionally into columns of
    3, 6 and 6 items (this layout is hard-coded for the sample document).

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        section_header: Heading text that precedes the table.

    Returns:
        A ``(df, headers)`` tuple: the DataFrame of cleaned item names and the
        list of header strings used as its column names.

    Raises:
        ValueError: If the heading is not found, or fewer than six non-blank
            lines follow it.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [pg.get_text("text") for pg in pdf]
    # "Sec$on" is how the word "Section" comes out of the text extraction here.
    document_text = "".join(page_texts).replace("Sec$on", "Section")

    header_hit = re.search(re.escape(section_header), document_text, re.IGNORECASE)
    if header_hit is None:
        raise ValueError(f"Section '{section_header}' not found.")

    remainder = document_text[header_hit.end():].strip()
    # Keep only the text before the next section heading, if there is one.
    table_block = re.split(r"Section\s*\d+\s*:", remainder, maxsplit=1)[0]
    rows = [row for row in map(str.strip, table_block.split("\n")) if row]
    if len(rows) < 6:
        raise ValueError("Insufficient lines found for table structure.")

    # The first two non-blank lines are skipped; presumably a caption/intro - verify against the PDF.
    headers = [row.strip('"').strip() for row in rows[2:5]]
    body = rows[5:]

    # Positional column boundaries based on the provided data structure.
    boundaries = ((0, 3), (3, 9), (9, 15))
    columns = [
        [clean_item_text(cell) for cell in body[start:stop]]
        for start, stop in boundaries
    ]

    # Pad the shorter columns with "" so all columns have equal length.
    depth = max(map(len, columns))
    for column in columns:
        column.extend([""] * (depth - len(column)))

    df = pd.DataFrame(dict(zip(headers, columns)))
    return df, headers
# --- OCR/Image Analysis Logic ---
def scan_checkbox_roi(pdf_path, df, all_headers):
    """Render the area left of each item name and classify it as checked or not.

    For every page, each extracted word is cleaned with ``clean_item_text``
    and compared against the item names in ``df``. For the first matching
    word per page, a rectangle from 25 to 5 points left of the word (padded 5
    points above and below) is rendered at 3x zoom, and the share of pixels
    whose luminance is below 90% of white is measured. More than 5% dark
    pixels means "checked".

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        df: DataFrame whose columns named in ``all_headers`` hold item names.
        all_headers: Column names of ``df`` to collect item names from.

    Returns:
        A dict mapping item name to "checked", "unchecked", or "" when no
        usable region could be rendered. If an item is matched on several
        pages, the result from the last such page wins.
    """
    mapping = {}
    # A set both de-duplicates the names and makes the per-word lookup O(1).
    all_items = {
        item
        for col in all_headers
        for item in df[col].dropna().tolist()
        if item != ""
    }
    print("="*60)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print("="*60)

    dark_pixel_threshold = 0.9  # luminance below 90% of white counts as dark
    zoom = fitz.Matrix(3, 3)    # render the ROI at 3x for finer sampling

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Map each item name to the bbox of its first occurrence on this page.
            item_coords = {}
            for word in page.get_text("words"):
                text = clean_item_text(word[4])
                if text in all_items and text not in item_coords:
                    item_coords[text] = word[:4]  # (x0, y0, x1, y1)

            for item_text, item_bbox in item_coords.items():
                # ROI: a small rectangle immediately to the left of the item text.
                roi_rect = fitz.Rect(item_bbox[0] - 25, item_bbox[1] - 5,
                                     item_bbox[0] - 5, item_bbox[3] + 5)

                pix = None
                if not roi_rect.is_empty:
                    pix = page.get_pixmap(matrix=zoom, clip=roi_rect)
                total_pixels = pix.width * pix.height if pix is not None else 0

                # A zero-area pixmap (e.g. ROI clipped away at the page edge)
                # would otherwise divide by zero below; report it as invalid.
                if total_pixels == 0:
                    mapping[item_text] = ""
                    print(f" ✗ '{item_text}' - Invalid ROI")
                    continue

                # Count dark pixels. Assumes the pixmap is RGB (at least three
                # components per pixel) - TODO confirm for non-default colorspaces.
                samples = pix.samples
                dark_pixel_count = 0
                for i in range(0, len(samples), pix.n):
                    r, g, b = samples[i:i+3]
                    # Convert RGB to grayscale (luminance), normalized to 0..1.
                    luminance = (0.2126 * r + 0.7152 * g + 0.0722 * b) / 255.0
                    if luminance < dark_pixel_threshold:
                        dark_pixel_count += 1

                mark_ratio = dark_pixel_count / total_pixels
                # More than 5% dark pixels is treated as a mark.
                status = "checked" if mark_ratio > 0.05 else "unchecked"
                mapping[item_text] = status
                print(f" ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")
    return mapping
# --- Main Logic ---
def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """Parse the table after a section heading and add a status column per data column.

    Args:
        pdf_file_path: Path of the PDF to parse.
        section_header: Heading text that precedes the table.

    Returns:
        A DataFrame in which every original column is immediately followed by
        a "<column> Status" column holding the value found for that item in
        the checkbox scan ("" for blank cells or items without a result).
    """
    # 1. Clean item names plus the original header list.
    df, original_data_cols = extract_table_text(pdf_file_path, section_header)
    # 2. Item name -> status, from the image scan.
    checkbox_map = scan_checkbox_roi(pdf_file_path, df, original_data_cols)

    def status_for(cell):
        """Look up the scan result for one cell; blank/missing cells map to ""."""
        if pd.isna(cell) or cell == "":
            return ""
        return checkbox_map.get(str(cell).strip(), "")

    # 3. Add the status columns and record the interleaved column order.
    #    Only the original headers are iterated, never the columns added here.
    ordered = []
    for col in original_data_cols:
        status_col = f"{col} Status"
        df[status_col] = df[col].map(status_for)
        ordered += [col, status_col]
    return df[ordered]
# Run
# Executed at module level: parses PDF_FILE_NAME for the table under SECTION_HEADER
# and keeps the returned DataFrame in `result`.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)
The final dataframe should look like this:
Col1   Col1_Status  Col2   Col2_Status  Col3    Col3_Status
Item1  Checked      Item4  Checked      Item10  Checked
Item2               Item5               Item11
Item3               Item6               Item12
                    Item7  Checked      Item13  Checked
                    Item8               Item14
                    Item9               Item15  Checked
But the columns are a little misaligned and none of the Xs in the boxes are being detected.
How do I solve this problem?
`pymupdf.Page.find_tables`, which returns a `TableFinder` object which has method ... `to_pandas()`! Define the `get_status` function before the loop, not inside.