Using Lift to Turn Research PDFs into Structured JSON with Controlled, Schema-Guided Field-Level Evaluation

def render_pdf(d, path):
    """Draw a realistic 3-page report and write it to ``path`` as a PDF.

    Page breaks are forced so the headline metric on page 1 (abstract) is
    physically separated from the results table on page 3.

    Layout:
        * Page 1 - title, authors, abstract, keywords.
        * Page 2 - method/training details (Table 1) and datasets.
        * Page 3 - results (Table 2), limitations, funding note.

    Args:
        d: Mapping describing one report. Keys read here: ``title``,
            ``authors``, ``method``, ``task``, ``primary_benchmark``,
            ``test_acc``, ``val_acc``, ``metric_name``, ``prior_best``,
            ``beats_sota``, ``params_m``, ``datasets``, ``optimizer``, ``lr``,
            ``batch``, ``epochs``, ``baseline_name``, ``baseline_val``,
            ``baseline_test``, ``limitations``, ``funding_note``.
            ``authors`` is iterated as 2-item pairs rendered as
            ``"first (second)"`` (presumably name and affiliation - confirm
            against the data definition). ``datasets`` and ``limitations``
            are iterated as sequences of strings.
        path: Output file path handed to ``SimpleDocTemplate``.

    Returns:
        None. The PDF is written as a side effect.
    """
    # reportlab is imported lazily so it is only required when a PDF is drawn.
    from reportlab.lib.pagesizes import LETTER
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib import colors
    from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer,
                                    Table, TableStyle, PageBreak)

    ss = getSampleStyleSheet()
    H1 = ParagraphStyle("H1", parent=ss["Title"], fontSize=16, leading=20, spaceAfter=6)
    AUTH = ParagraphStyle("AUTH", parent=ss["Normal"], fontSize=9.5,
                          textColor=colors.grey, spaceAfter=10)
    H2 = ParagraphStyle("H2", parent=ss["Heading2"], fontSize=12, spaceBefore=8, spaceAfter=4)
    BODY = ParagraphStyle("BODY", parent=ss["Normal"], fontSize=10, leading=14, spaceAfter=6)

    def _grid_style(header_hex, stripe_hex, extra=()):
        """Build the table style shared by both tables; ``extra`` adds per-table commands."""
        return TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(header_hex)),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            ("FONTSIZE", (0, 0), (-1, -1), 9.5),
            ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
            *extra,
            ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor(stripe_hex)]),
            ("LEFTPADDING", (0, 0), (-1, -1), 8), ("TOPPADDING", (0, 0), (-1, -1), 4),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 4)])

    # The abstract states the comparison to prior work in words, depending on
    # the ``beats_sota`` flag.
    sota_phrase = (f"surpassing the previous best of {d['prior_best']}"
                   if d["beats_sota"] else
                   f"approaching but not exceeding the previous best of {d['prior_best']}")
    authors_line = ", ".join(f"{n} ({a})" for (n, a) in d["authors"])

    # ---- Page 1: title, authors, abstract, keywords ----
    story = []
    story += [Paragraph(d["title"], H1), Paragraph(authors_line, AUTH), Paragraph("Abstract", H2)]
    story += [Paragraph(
        f"We introduce {d['method']}, a model for {d['task']}. On the {d['primary_benchmark']} "
        f"benchmark, {d['method']} attains {d['test_acc']} {d['metric_name']} on the held-out "
        f"test set, {sota_phrase}. Our {d['params_m']}M-parameter model is evaluated across "
        f"{len(d['datasets'])} datasets ({', '.join(d['datasets'])}). "
        f"Extensive ablations confirm the contribution of each component.", BODY)]
    story += [Paragraph("Keywords", H2),
              Paragraph(f"{d['task']}; representation learning; {d['primary_benchmark']}", BODY),
              PageBreak()]

    # ---- Page 2: method, hyperparameter table, datasets ----
    story += [Paragraph("1 Method and Training Details", H2)]
    story += [Paragraph(
        f"{d['method']} is trained end-to-end with the {d['optimizer']} optimizer. "
        f"We tune on a validation split and report final numbers on the test split. "
        f"The full training configuration is summarized in Table 1.", BODY)]
    hp = [["Hyperparameter", "Value"],
          ["Optimizer", d["optimizer"]],
          ["Learning rate", str(d["lr"])],
          ["Batch size", str(d["batch"])],
          ["Epochs", str(d["epochs"])],
          ["Parameters", f"{d['params_m']}M"]]
    t1 = Table(hp, colWidths=[2.4 * inch, 2.0 * inch])
    t1.setStyle(_grid_style("#2b3a67", "#eef1f8"))
    story += [Spacer(1, 4), t1, Spacer(1, 6),
              Paragraph("Table 1. Training configuration.", BODY),
              Paragraph("2 Datasets", H2),
              Paragraph(
                  f"We evaluate on {', '.join(d['datasets'])}. {d['primary_benchmark']} is our "
                  f"primary benchmark; the remaining datasets are used for generalization "
                  f"studies.", BODY),
              PageBreak()]

    # ---- Page 3: results table, limitations, funding ----
    story += [Paragraph("3 Results", H2)]
    res = [["Method", f"Val. {d['metric_name']}", f"Test {d['metric_name']}"],
           [f"{d['baseline_name']} (baseline)", str(d["baseline_val"]), str(d["baseline_test"])],
           [f"{d['method']} (ours)", str(d["val_acc"]), str(d["test_acc"])]]
    t2 = Table(res, colWidths=[2.6 * inch, 1.7 * inch, 1.7 * inch])
    # Row index 2 is the "(ours)" row; it is always the one set in bold.
    t2.setStyle(_grid_style("#7a2e2e", "#f7eeee",
                            extra=[("FONTNAME", (0, 2), (-1, 2), "Helvetica-Bold")]))
    story += [Spacer(1, 4), t2, Spacer(1, 6),
              Paragraph(f"Table 2. Results on {d['primary_benchmark']}. "
                        f"Best test result in bold.", BODY),
              Paragraph("4 Limitations", H2)]
    for lim in d["limitations"]:
        story += [Paragraph("• " + lim, BODY)]
    story += [Paragraph("5 Funding and Code Availability", H2),
              Paragraph(d["funding_note"], BODY)]

    SimpleDocTemplate(path, pagesize=LETTER,
                      topMargin=0.8 * inch, bottomMargin=0.8 * inch,
                      leftMargin=0.9 * inch, rightMargin=0.9 * inch).build(story)
print("STEP 3/7 · Generating synthetic report PDFs…")

# Each CORPUS entry is a (source record, ground_truth(record), pdf path) triple.
CORPUS = []
# Write under /content when that directory exists (presumably a hosted
# notebook working directory - confirm), otherwise into the current directory.
# The check does not depend on the loop variable, so it is done once.
out_prefix = "/content/" if os.path.isdir("/content") else ""
for i, d in enumerate(DOCS):
    path = f"{out_prefix}report_{i}.pdf"
    render_pdf(d, path)
    CORPUS.append((d, ground_truth(d), path))
    print(f" ✓ {os.path.basename(path)} — {d['method']}")
print()

if SHOW_FIRST_PAGE:
    # The preview is optional: any failure (missing pypdfium2/matplotlib, no
    # display backend, ...) is reported and skipped rather than aborting the run.
    try:
        import pypdfium2 as pdfium
        import matplotlib.pyplot as plt

        pg = pdfium.PdfDocument(CORPUS[0][2])[0]
        img = pg.render(scale=2.0).to_pil()
        plt.figure(figsize=(6.4, 8.3))
        plt.imshow(img)
        plt.axis("off")
        plt.title("What lift reads — page 1 of report_0.pdf", fontsize=10)
        plt.show()
    except Exception as e:
        print(" (page preview skipped:", e, ")\n")

What's Hot

Using Lift to Turn Research PDFs into Structured JSON with Controlled, Schema-Guided Field-Level Evaluation

Why Powerful ML Is Deceptively Easy — Part 2

Deploying retail AI to scale personalisation and customer insight

Why Powerful ML Is Deceptively Easy — Part 2

Deploying retail AI to scale personalisation and customer insight

CUP (Common Useful Python): Building Reliable Python Workflows with Baidu’s Utility Toolkit

What Can We Do When Memory Becomes the New Bottleneck in Data Engineering?

Japan Bets on AI Robots to Solve a Worker Shortage

Google AI Introduces TabFM: A Hybrid-Attention Tabular Foundation Model for Zero-Shot Classification and Regression