diff --git a/.gitignore b/.gitignore
index 9f73a9f61..d0e485598 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,5 @@ cert.txt
 
 # lets each user define their own vscode settings
 .vscode/settings.json
+
+.serena/
\ No newline at end of file
diff --git a/docs/bill-pdf-text-extraction.md b/docs/bill-pdf-text-extraction.md
new file mode 100644
index 000000000..4b2b3dfc5
--- /dev/null
+++ b/docs/bill-pdf-text-extraction.md
@@ -0,0 +1,63 @@
+# Bill PDF Text Extraction
+
+Some Massachusetts Legislature bill records have `content.DocumentText` set to
+null in the Document API even though the bill PDF contains embedded text. Maple
+now falls back to the official PDF at `/Bills/{court}/{billId}.pdf` when the API
+text is missing.
+
+## Extraction Scope
+
+The current extractor handles PDFs with embedded text. It does not perform OCR,
+so scanned or image-only PDFs are reported but not repaired.
+
+Known 194th General Court examples:
+
+- `H1`: large embedded-text PDF.
+- `H4787`: short embedded-text PDF.
+- `H5008`: ballot initiative embedded-text PDF.
+- `S2539`: regulatory/report-style embedded-text PDF.
+- `H18`: image-only/scanned PDF; no OCR support in this implementation.
+
+## Runtime Scraper Behavior
+
+The bill scraper first calls the MA Legislature Document API. If
+`DocumentText` is present, it stores the API response as before. If
+`DocumentText` is null or absent, the scraper downloads the PDF and tries to
+extract text with `pdf-parse`.
+
+Successful PDF extraction stores the result in the existing
+`content.DocumentText` field. Failed extraction leaves `DocumentText` absent and
+logs the extraction status.
+
+## Backfill Existing Bills
+
+Run the PDF text backfill in dry-run mode first:
+
+```sh
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --bills "H1 H18 H4787 H5008 S2539" --output ./bill-pdf-text-dry-run.csv
+```
+
+After reviewing the CSV, commit writes:
+
+```sh
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --commit true --output ./bill-pdf-text-dev.csv
+```
+
+The script only writes `content.DocumentText` and `fetchedAt` for bills that are
+missing `content.DocumentText`. Bills that already have text are skipped.
+
+## Summary And Topic Backfill
+
+Updating existing bill documents does not trigger the Python LLM function,
+because that function currently runs on document creation only. After committing
+PDF text, run the LLM backfill for the repaired bills:
+
+```sh
+python llm/backfill_summaries_runner.py --court 194 --bill-ids "H1 H4787 H5008 S2539" --output ./summaries-and-topics.csv
+```
+
+Use `--dry-run` to preview the rows without updating Firestore. The LLM calls
+for summaries and topics still run; only the document updates are skipped.
+
+`backfill_summaries.py` is the legacy immediate-run wrapper.
+`backfill_summaries_runner.py` is the import-safe CLI and test target.
diff --git a/functions/package.json b/functions/package.json
index d459dda2d..a726ba9bb 100644
--- a/functions/package.json
+++ b/functions/package.json
@@ -29,6 +29,7 @@
     "luxon": "^2.3.1",
     "nanoid": "^3.3.2",
     "object-hash": "^3.0.0",
+    "pdf-parse": "1.1.1",
     "runtypes": "6.6.0",
     "ssl-root-cas": "^1.3.1",
     "typesense": "^1.2.2",
@@ -40,6 +41,7 @@
     "@types/jsdom": "^21.1.7",
     "@types/luxon": "^2.0.9",
     "@types/object-hash": "^2.2.1",
+    "@types/pdf-parse": "1.1.5",
     "copyfiles": "^2.4.1",
     "firebase-functions-test": "^0.3.3",
     "firebase-tools": "^13.18.0",
diff --git a/functions/src/bills/bills.test.ts b/functions/src/bills/bills.test.ts
new file mode 100644
index 000000000..1d99b8712
--- /dev/null
+++ b/functions/src/bills/bills.test.ts
@@ -0,0 +1,82 @@
+jest.mock("../malegislature", () => ({
+  getDocument: jest.fn(),
+  getDocumentPdf: jest.fn()
+}))
+jest.mock("./pdfText", () => ({
+  extractBillTextFromPdf: jest.fn()
+}))
+
+import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
+import { extractBillTextFromPdf } from "./pdfText"
+
+const mockedApi = jest.requireMock("../malegislature") as {
+  getDocument: jest.Mock
+  getDocumentPdf: jest.Mock
+}
+const mockedExtractBillTextFromPdf =
+  extractBillTextFromPdf as jest.MockedFunction<typeof extractBillTextFromPdf>
+
+describe("getDocumentWithPdfTextFallback", () => {
+  beforeEach(() => {
+    jest.resetAllMocks()
+  })
+
+  it("does not fetch a PDF when API text is present", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: "API text" })
+
+    await expect(
+      getDocumentWithPdfTextFallback(194, "H1")
+    ).resolves.toMatchObject({
+      content: { DocumentText: "API text" },
+      documentTextSource: "api"
+    })
+    expect(mockedApi.getDocumentPdf).not.toHaveBeenCalled()
+  })
+
+  it("sets DocumentText when PDF extraction succeeds", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf"))
+    mockedExtractBillTextFromPdf.mockResolvedValue({
+      status: "extracted",
+      text: "PDF text",
+      pageCount: 1,
+      charCount: 7
+    })
+
+    await expect(
+      getDocumentWithPdfTextFallback(194, "H1")
+    ).resolves.toMatchObject({
+      content: { DocumentText: "PDF text" },
+      documentTextSource: "pdf",
+      pdfTextExtraction: { status: "extracted" }
+    })
+  })
+
+  it("leaves DocumentText absent when PDF has no text", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf"))
+    mockedExtractBillTextFromPdf.mockResolvedValue({
+      status: "no-text",
+      pageCount: 1,
+      charCount: 0
+    })
+
+    const result = await getDocumentWithPdfTextFallback(194, "H18")
+
+    expect(result.content).not.toHaveProperty("DocumentText")
+    expect(result.pdfTextExtraction).toMatchObject({ status: "no-text" })
+  })
+
+  it("leaves DocumentText absent when PDF fetch fails", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockRejectedValue(new Error("not found"))
+
+    const result = await getDocumentWithPdfTextFallback(194, "H18")
+
+    expect(result.content).not.toHaveProperty("DocumentText")
+    expect(result.pdfTextExtraction).toMatchObject({
+      status: "fetch-error",
+      error: "not found"
+    })
+  })
+})
diff --git a/functions/src/bills/bills.ts b/functions/src/bills/bills.ts
index 3602c4f07..58aeb8a71 100644
--- a/functions/src/bills/bills.ts
+++ b/functions/src/bills/bills.ts
@@ -1,9 +1,13 @@
 import { isString } from "lodash"
+import { logger } from "firebase-functions"
 import { logFetchError } from "../common"
 import * as api from "../malegislature"
 import { createScraper } from "../scraper"
+import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
 import { Bill, MISSING_TIMESTAMP } from "./types"
 
+export { getDocumentWithPdfTextFallback } from "./documentTextFallback"
+
 /**
  * There are around 8000 documents. With 8 batches per day, 20 parallel
  * scrapers, and 50 documents per batch, we will process all documents once per
@@ -18,7 +22,8 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
     fetchBatchTimeout: 240,
     startBatchTimeout: 240,
     fetchResource: async (court: number, id: string, current) => {
-      const content = await api.getDocument({ id, court })
+      const { content, pdfTextExtraction } =
+        await getDocumentWithPdfTextFallback(court, id)
       const history = await api
         .getBillHistory(court, id)
         .catch(logFetchError("bill history", id))
@@ -28,8 +33,11 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
         .getSimilarBills(court, id)
         .catch(logFetchError("similar bills", id))
         .then(bills => bills?.map(b => b.BillNumber).filter(isString) ?? [])
-      if (content.DocumentText == null) {
-        delete content.DocumentText
+
+      if (content.DocumentText == null && pdfTextExtraction) {
+        logger.info(
+          `No bill text extracted from PDF for ${court}/${id}: ${pdfTextExtraction.status}`
+        )
       }
 
       const resource: Partial<Bill> = {
diff --git a/functions/src/bills/documentTextFallback.ts b/functions/src/bills/documentTextFallback.ts
new file mode 100644
index 000000000..ec203ff29
--- /dev/null
+++ b/functions/src/bills/documentTextFallback.ts
@@ -0,0 +1,60 @@
+import * as api from "../malegislature"
+import { extractBillTextFromPdf, PdfTextExtractionResult } from "./pdfText"
+
+export type DocumentTextFallbackResult = {
+  content: any // Document API payload; a null DocumentText is deleted from it
+  documentTextSource?: "api" | "pdf" // where the text came from; unset if none
+  pdfTextExtraction?: PdfTextExtractionResult | PdfFetchFailure // PDF attempt
+}
+
+type PdfFetchFailure = {
+  status: "fetch-error" // downloading the PDF threw
+  charCount: 0
+  pageCount?: undefined // never populated for a failed download
+  error: string // message of the thrown error, or its string form
+}
+
+/**
+ * Loads a bill from the Document API and, when it carries no `DocumentText`,
+ * tries to recover the text from the bill's PDF.
+ *
+ * @param court General Court number.
+ * @param id Bill id.
+ * @returns The API content (gaining `DocumentText` only when the PDF yields
+ *   usable text), the text's source if any, and the PDF attempt's outcome.
+ * @throws Whatever `api.getDocument` throws; PDF download errors are reported
+ *   in the result as a `fetch-error` instead.
+ */
+export async function getDocumentWithPdfTextFallback(
+  court: number,
+  id: string
+): Promise<DocumentTextFallbackResult> {
+  const content = await api.getDocument({ id, court })
+  const apiHasText = content.DocumentText != null
+  if (apiHasText) {
+    return { content, documentTextSource: "api" }
+  }
+
+  // Strip the null placeholder so the key is absent unless the PDF fills it.
+  delete content.DocumentText
+
+  let pdfBytes: Buffer
+  try {
+    pdfBytes = await api.getDocumentPdf({ id, court })
+  } catch (cause) {
+    const fetchFailure: PdfFetchFailure = {
+      status: "fetch-error",
+      charCount: 0,
+      error: cause instanceof Error ? cause.message : String(cause)
+    }
+    return { content, pdfTextExtraction: fetchFailure }
+  }
+
+  const pdfTextExtraction = await extractBillTextFromPdf(pdfBytes)
+  if (pdfTextExtraction.status !== "extracted") {
+    return { content, pdfTextExtraction }
+  }
+
+  content.DocumentText = pdfTextExtraction.text
+  return { content, documentTextSource: "pdf", pdfTextExtraction }
+}
diff --git a/functions/src/bills/pdfText.test.ts b/functions/src/bills/pdfText.test.ts
new file mode 100644
index 000000000..6a9ba58d1
--- /dev/null
+++ b/functions/src/bills/pdfText.test.ts
@@ -0,0 +1,98 @@
+const mockedPdfParse = jest.fn()
+
+jest.mock("pdf-parse/lib/pdf-parse", () => mockedPdfParse)
+
+import { extractBillTextFromPdf, normalizeExtractedBillText } from "./pdfText"
+
+describe("normalizeExtractedBillText", () => {
+  it("trims and collapses noisy whitespace", () => {
+    expect(
+      normalizeExtractedBillText(" \r\n  Section   1.   Text\t\t here.  \n\n\n")
+    ).toBe("Section 1. Text here.")
+  })
+
+  it("removes standalone page counters", () => {
+    expect(
+      normalizeExtractedBillText("1 of 3\nHOUSE No. 1\n-- 2 of 3 --\nBill text")
+    ).toBe("HOUSE No. 1\nBill text")
+  })
+
+  it("preserves substantive bill text", () => {
+    const text =
+      "The General Laws are hereby amended.\nSection 2. This act shall take effect."
+
+    expect(normalizeExtractedBillText(text)).toBe(text)
+  })
+})
+
+describe("extractBillTextFromPdf", () => {
+  beforeEach(() => {
+    mockedPdfParse.mockReset()
+  })
+
+  it("returns extracted when text is long enough", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: "An Act " + "with enough extracted text. ".repeat(10),
+      numpages: 2,
+      numrender: 2,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    const result = await extractBillTextFromPdf(Buffer.from("pdf"))
+
+    expect(result.status).toBe("extracted")
+    expect(result.pageCount).toBe(2)
+    expect(result.text).toContain("An Act")
+  })
+
+  it("returns no-text for empty extraction", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: " \n\t ",
+      numpages: 1,
+      numrender: 1,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "no-text",
+      charCount: 0,
+      pageCount: 1
+    })
+  })
+
+  it("returns too-short for tiny extraction", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: "short text",
+      numpages: 1,
+      numrender: 1,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "too-short",
+      text: "short text",
+      pageCount: 1
+    })
+  })
+
+  it("returns parse-error when parser throws", async () => {
+    mockedPdfParse.mockRejectedValue(new Error("bad pdf"))
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "parse-error",
+      error: "bad pdf"
+    })
+  })
+})
diff --git a/functions/src/bills/pdfText.ts b/functions/src/bills/pdfText.ts
new file mode 100644
index 000000000..ddfe1a03a
--- /dev/null
+++ b/functions/src/bills/pdfText.ts
@@ -0,0 +1,64 @@
+const pdfParse =
+  require("pdf-parse/lib/pdf-parse") as typeof import("pdf-parse")
+
+export const MIN_EXTRACTED_TEXT_CHARS = 10 // non-whitespace chars; fewer is "too-short"
+
+export type PdfTextExtractionStatus =
+  | "extracted" // normalized text meets MIN_EXTRACTED_TEXT_CHARS
+  | "no-text" // parsing worked but the normalized text is empty
+  | "too-short" // some text, but below MIN_EXTRACTED_TEXT_CHARS
+  | "parse-error" // the parser (or normalization) threw
+
+export type PdfTextExtractionResult = {
+  status: PdfTextExtractionStatus
+  text?: string // normalized text; set for "extracted" and "too-short"
+  pageCount?: number // parser's page count; absent on "parse-error"
+  charCount: number // non-whitespace characters in `text`; 0 on "parse-error"
+  error?: string // failure message; only set on "parse-error"
+}
+
+/**
+ * Parses a bill PDF and classifies the embedded text that was recovered.
+ * Parser failures are returned as `status: "parse-error"`, not thrown.
+ */
+export async function extractBillTextFromPdf(
+  pdf: Buffer
+): Promise<PdfTextExtractionResult> {
+  try {
+    const parsed = await pdfParse(pdf)
+    const text = normalizeExtractedBillText(parsed.text)
+    // Size is measured in non-whitespace characters only.
+    const charCount = text.replace(/\s/g, "").length
+    const pageCount = parsed.numpages
+
+    let status: PdfTextExtractionStatus = "extracted"
+    if (text === "") status = "no-text"
+    else if (charCount < MIN_EXTRACTED_TEXT_CHARS) status = "too-short"
+
+    return status === "no-text"
+      ? { status, pageCount, charCount }
+      : { status, text, pageCount, charCount }
+  } catch (e) {
+    const error = e instanceof Error ? e.message : String(e)
+    return { status: "parse-error", charCount: 0, error }
+  }
+}
+
+/** Trims every line, drops blanks and page counters, collapses spaces. */
+export function normalizeExtractedBillText(raw: string): string {
+  // Blank lines are removed, so the joined text can hold neither runs of
+  // newlines nor leading/trailing whitespace; no further cleanup is needed.
+  return raw
+    .split(/\r\n?|\n/)
+    .map(line => line.trim())
+    .filter(line => line !== "" && !isPageCounter(line))
+    .map(line => line.replace(/[ \t]+/g, " "))
+    .join("\n")
+}
+
+/** Whether `line` is only a page counter: "2 of 3" or "-- 2 of 3 --". */
+function isPageCounter(line: string) {
+  // Case-insensitive in both forms so "OF"/"Of" variants are treated alike.
+  const bare = /^\d+\s+of\s+\d+$/i
+  return bare.test(line) || /^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)
+}
diff --git a/functions/src/malegislature.ts b/functions/src/malegislature.ts
index 3771830d8..df0d93f01 100644
--- a/functions/src/malegislature.ts
+++ b/functions/src/malegislature.ts
@@ -106,14 +106,17 @@ export async function getDocumentPdf({
 }: {
   id: string
   court: number
-}) {
+}): Promise<Buffer> {
   const response = await request({
     baseURL: "https://malegislature.gov",
     url: `/Bills/${court}/${id}.pdf`,
     method: "GET",
+    responseType: "arraybuffer",
     timeout: 30_000
   })
-  return response as any
+  return Buffer.isBuffer(response)
+    ? response
+    : Buffer.from(response as ArrayBuffer)
 }
 
 export async function listMembers({
diff --git a/functions/yarn.lock b/functions/yarn.lock
index 6289a53f7..2e1522d04 100644
--- a/functions/yarn.lock
+++ b/functions/yarn.lock
@@ -1518,6 +1518,13 @@
   resolved "https://registry.npmjs.org/@types/object-hash/-/object-hash-2.2.1.tgz"
   integrity sha512-i/rtaJFCsPljrZvP/akBqEwUP2y5cZLOmvO+JaYnz01aPknrQ+hB5MRcO7iqCUsFaYfTG8kGfKUyboA07xeDHQ==
 
+"@types/pdf-parse@1.1.5":
+  version "1.1.5"
+  resolved "https://registry.yarnpkg.com/@types/pdf-parse/-/pdf-parse-1.1.5.tgz#a0959022604457169177622b512ed03b975f10e2"
+  integrity sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA==
+  dependencies:
+    "@types/node" "*"
+
 "@types/qs@*":
   version "6.9.10"
   resolved "https://registry.npmjs.org/@types/qs/-/qs-6.9.10.tgz"
@@ -2689,7 +2696,7 @@ debug@4.3.1:
   dependencies:
     ms "2.1.2"
 
-debug@^3.2.7:
+debug@^3.1.0, debug@^3.2.7:
   version "3.2.7"
   resolved "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz"
   integrity sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==
@@ -5632,6 +5639,11 @@ node-emoji@^2.2.0:
     emojilib "^2.4.0"
     skin-tone "^2.0.0"
 
+node-ensure@^0.0.0:
+  version "0.0.0"
+  resolved "https://registry.yarnpkg.com/node-ensure/-/node-ensure-0.0.0.tgz#ecae764150de99861ec5c810fd5d096b183932a7"
+  integrity sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==
+
 node-fetch@^2.6.1, node-fetch@^2.6.7, node-fetch@^2.6.9, node-fetch@^2.7.0:
   version "2.7.0"
   resolved "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz"
@@ -5971,6 +5983,14 @@ path-to-regexp@^8.0.0:
   resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-8.2.0.tgz#73990cc29e57a3ff2a0d914095156df5db79e8b4"
   integrity sha512-TdrF7fW9Rphjq4RjrW0Kp2AW0Ahwu9sRGTkS6bvDi0SCwZlEZYmcfDbEsTz8RVk0EHIS/Vd1bv3JhG+1xZuAyQ==
 
+pdf-parse@1.1.1:
+  version "1.1.1"
+  resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7"
+  integrity sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==
+  dependencies:
+    debug "^3.1.0"
+    node-ensure "^0.0.0"
+
 pg-cloudflare@^1.2.7:
   version "1.2.7"
   resolved "https://registry.yarnpkg.com/pg-cloudflare/-/pg-cloudflare-1.2.7.tgz#a1f3d226bab2c45ae75ea54d65ec05ac6cfafbef"
diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 774f9c7c0..da89edb24 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -1,101 +1,19 @@
-"""This script fills any missing 'summary' or 'topics' fields on the data model.
+"""Legacy import-time entrypoint for backfilling bill summaries and topics.
 
-The document must have a 'Title' and 'DocumentText' field to generate them. The
-script queries only the general court 194 bills, modifies the firebase database
-in-place, and generates a CSV with a description of what happened. The header for
-the CSV is `bill_id,status,summary,topics`. The possible statuses are,
-
-- `skipped` - the bill doesn't have either a title or text, skip it
-- `previous_summary` - the bill previously had a summary, skip it
-- `failed_summary` - something went wrong when trying to summarize, skip it
-- `previous_topics` - the bill previously had topics, skip it
-- `failed_topics` - something went wrong when trying to generate topics, skip it
-- `generated_summary` - both the summary and topics were generated successfully
-
-Developer notes:
-- you'll need to set the 'OPENAI_API_KEY' environment variable
+Importing this module immediately runs the backfill for General Court 194 and
+writes `./summaries-and-topics.csv`, matching the original script behavior.
+Use `backfill_summaries_runner.py` for an import-safe CLI and test target.
 """
 
 import firebase_admin
-from llm_functions import get_summary_api_function, get_tags_api_function_v2
+from backfill_summaries_runner import (
+    CSV_SUMMARY_OUTPUT,
+    backfill_summaries,
+    write_rows,
+)
 from firebase_admin import firestore
-from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC
-import csv
 
-# Module constants
-FIREBASE_COLLECTION_PATH = "generalCourts/194/bills"
-CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv"
-
-# Application Default credentials are automatically created.
-app = firebase_admin.initialize_app()
+firebase_admin.initialize_app()
 db = firestore.client()
-
-
-def make_bill_summary(bill_id, status, summary, topics):
-    """Generate a row for csv.writerow
-
-    The goal with this function is to not forget all the arguments to subsequent
-    csv.writerow calls.
-    """
-    return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"]
-
-
-bills_ref = db.collection(FIREBASE_COLLECTION_PATH)
-bills = bills_ref.get()
-with open(CSV_SUMMARY_OUTPUT, "w") as csvfile:
-    csv_writer = csv.writer(csvfile)
-    csv_writer.writerow(["bill_id", "status", "summary", "topics"])
-    for bill in bills:
-        document = bill.to_dict()
-        bill_id = document["id"]
-        document_text = document.get("content", {}).get("DocumentText")
-        document_title = document.get("content", {}).get("Title")
-        summary = document.get("summary")
-
-        # No document text or title, skip it because we can't summarize it
-        if document_text is None or document_title is None:
-            csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
-            continue
-
-        # If the summary is already populated move on
-        if summary is not None:
-            csv_writer.writerow(
-                make_bill_summary(bill_id, "previous_summary", None, None)
-            )
-            continue
-
-        summary = get_summary_api_function(bill_id, document_title, document_text)
-        if summary["status"] in [-1, -2]:
-            csv_writer.writerow(
-                make_bill_summary(bill_id, "failed_summary", None, None)
-            )
-            continue
-        # Note: `normalize_summary` does some post-processing to clean up the summaries
-        # As of 2025-10-21 this was necessary due to the LLM prompt
-        summary = summary["summary"]
-        bill.reference.update({"summary": summary})
-
-        # If the topics are already populated, just make a note of it
-        topics = document.get("topics")
-        if topics is not None:
-            csv_writer.writerow(
-                make_bill_summary(bill_id, "previous_topics", None, None)
-            )
-
-        tags = get_tags_api_function_v2(bill_id, document_title, summary)
-        # If the tags fail, make a note and at least write the summary for debugging
-        if tags["status"] != 1:
-            csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None))
-            csv_writer.writerow(
-                make_bill_summary(bill_id, "generated_summary", summary, None)
-            )
-            continue
-        topics_and_categories = get_categories_from_topics(
-            tags["tags"], CATEGORY_BY_TOPIC
-        )
-        bill.reference.update({"topics": topics_and_categories})
-        csv_writer.writerow(
-            make_bill_summary(
-                bill_id, "generated_summary_and_topics", summary, topics_and_categories
-            )
-        )
+rows = backfill_summaries(db, court=194)
+write_rows(CSV_SUMMARY_OUTPUT, rows)
diff --git a/llm/backfill_summaries_runner.py b/llm/backfill_summaries_runner.py
new file mode 100644
index 000000000..22b49893e
--- /dev/null
+++ b/llm/backfill_summaries_runner.py
@@ -0,0 +1,146 @@
+"""Import-safe runner for backfilling missing bill summaries and topics.
+
+The document must have a `content.Title` and `content.DocumentText` field to
+generate a summary. Topics can be generated from either an existing summary or a
+newly generated one.
+
+The CSV header is `bill_id,status,summary,topics`. The possible statuses are:
+
+- `skipped` - the bill doesn't have either a title or text, skip it
+- `previous_summary` - the bill previously had a summary
+- `failed_summary` - something went wrong when trying to summarize, skip it
+- `previous_topics` - the bill previously had topics, skip it
+- `failed_topics` - something went wrong when trying to generate topics, skip it
+- `generated_summary_and_topics` - topics were generated successfully, from
+  either a newly generated or a previously stored summary
+
+Developer notes:
+- you'll need to set the `OPENAI_API_KEY` environment variable
+"""
+
+import argparse
+import csv
+from typing import Any
+
+import firebase_admin
+from bill_on_document_created import CATEGORY_BY_TOPIC, get_categories_from_topics
+from firebase_admin import firestore
+from llm_functions import get_summary_api_function, get_tags_api_function_v2
+
+CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv"
+
+
+def make_bill_summary(bill_id, status, summary, topics) -> list[str]:
+    """Generate a row for csv.writerow."""
+    return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"]
+
+
+def parse_bill_ids(raw_bill_ids: str | None) -> set[str] | None:
+    if raw_bill_ids is None:
+        return None
+
+    bill_ids = {
+        bill_id.strip()
+        for chunk in raw_bill_ids.split(",")
+        for bill_id in chunk.split()
+        if bill_id.strip()
+    }
+    return bill_ids or None
+
+
+def backfill_summaries(
+    db: Any,
+    court: int = 194,
+    bill_ids: set[str] | None = None,
+    dry_run: bool = False,
+) -> list[list[str]]:
+    rows = [["bill_id", "status", "summary", "topics"]]
+    bills_ref = db.collection(f"generalCourts/{court}/bills")
+    bills = bills_ref.get()
+
+    for bill in bills:
+        document = bill.to_dict()
+        bill_id = document["id"]
+        if bill_ids is not None and bill_id not in bill_ids:
+            continue
+
+        document_text = document.get("content", {}).get("DocumentText")
+        document_title = document.get("content", {}).get("Title")
+        summary = document.get("summary")
+        topics = document.get("topics")
+
+        if document_text is None or document_title is None:
+            rows.append(make_bill_summary(bill_id, "skipped", None, None))
+            continue
+
+        if summary is None:
+            summary_response = get_summary_api_function(
+                bill_id, document_title, document_text
+            )
+            if summary_response["status"] in [-1, -2]:
+                rows.append(make_bill_summary(bill_id, "failed_summary", None, None))
+                continue
+
+            summary = summary_response["summary"]
+            if not dry_run:
+                bill.reference.update({"summary": summary})
+        else:
+            rows.append(make_bill_summary(bill_id, "previous_summary", None, None))
+
+        if topics is not None:
+            rows.append(make_bill_summary(bill_id, "previous_topics", None, None))
+            continue
+
+        tags = get_tags_api_function_v2(bill_id, document_title, summary)
+        if tags["status"] != 1:
+            rows.append(make_bill_summary(bill_id, "failed_topics", summary, None))
+            continue
+
+        topics_and_categories = get_categories_from_topics(
+            tags["tags"], CATEGORY_BY_TOPIC
+        )
+        if not dry_run:
+            bill.reference.update({"topics": topics_and_categories})
+
+        rows.append(
+            make_bill_summary(
+                bill_id,
+                "generated_summary_and_topics",
+                summary,
+                topics_and_categories,
+            )
+        )
+
+    return rows
+
+
+def write_rows(output: str, rows: list[list[str]]) -> None:
+    with open(output, "w", newline="") as csvfile:
+        csv.writer(csvfile).writerows(rows)
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--court", type=int, default=194)  # General Court number
+    parser.add_argument("--bill-ids")  # comma/space separated ids; omit for all
+    parser.add_argument("--dry-run", action="store_true")  # skip Firestore writes
+    parser.add_argument("--output", default=CSV_SUMMARY_OUTPUT)  # CSV report path
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> None:
+    args = parse_args(argv)
+
+    firebase_admin.initialize_app()
+    db = firestore.client()
+    rows = backfill_summaries(
+        db,
+        court=args.court,
+        bill_ids=parse_bill_ids(args.bill_ids),
+        dry_run=args.dry_run,
+    )
+    write_rows(args.output, rows)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/llm/test_backfill_summaries.py b/llm/test_backfill_summaries.py
new file mode 100644
index 000000000..1824f27ec
--- /dev/null
+++ b/llm/test_backfill_summaries.py
@@ -0,0 +1,107 @@
+import backfill_summaries_runner
+
+
+class FakeReference:
+    def __init__(self):
+        self.updates = []
+
+    def update(self, payload):
+        self.updates.append(payload)
+
+
+class FakeBill:
+    def __init__(self, data):
+        self._data = data
+        self.reference = FakeReference()
+
+    def to_dict(self):
+        return self._data
+
+
+class FakeCollection:
+    def __init__(self, bills):
+        self._bills = bills
+
+    def get(self):
+        return self._bills
+
+
+class FakeDb:
+    def __init__(self, bills):
+        self.bills = bills
+        self.path = None
+
+    def collection(self, path):
+        self.path = path
+        return FakeCollection(self.bills)
+
+
+def test_backfill_summaries_skips_bills_without_text():
+    bill = FakeBill({"id": "H18", "content": {"Title": "No text"}})
+    rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill]))
+
+    assert rows == [
+        ["bill_id", "status", "summary", "topics"],
+        ["H18", "skipped", "None", "None"],
+    ]
+    assert bill.reference.updates == []
+
+
+def test_backfill_summaries_generates_topics_when_summary_exists(monkeypatch):
+    bill = FakeBill(
+        {
+            "id": "H1",
+            "content": {"Title": "Title", "DocumentText": "Text"},
+            "summary": "Existing summary",
+        }
+    )
+    monkeypatch.setattr(
+        backfill_summaries_runner,
+        "get_tags_api_function_v2",
+        lambda bill_id, title, summary: {
+            "status": 1,
+            "tags": ["Consumer protection"],
+        },
+    )
+
+    rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill]))
+
+    assert rows[1] == ["H1", "previous_summary", "None", "None"]
+    assert rows[2][0:3] == [
+        "H1",
+        "generated_summary_and_topics",
+        "Existing summary",
+    ]
+    assert bill.reference.updates == [
+        {"topics": [{"topic": "Consumer protection", "category": "Commerce"}]}
+    ]
+
+
+def test_backfill_summaries_dry_run_avoids_firestore_updates(monkeypatch):
+    bill = FakeBill({"id": "H1", "content": {"Title": "Title", "DocumentText": "Text"}})
+    monkeypatch.setattr(
+        backfill_summaries_runner,
+        "get_summary_api_function",
+        lambda bill_id, title, text: {"status": 1, "summary": "New summary"},
+    )
+    monkeypatch.setattr(
+        backfill_summaries_runner,
+        "get_tags_api_function_v2",
+        lambda bill_id, title, summary: {
+            "status": 1,
+            "tags": ["Consumer protection"],
+        },
+    )
+
+    rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill]), dry_run=True)
+
+    assert rows[1][0:3] == ["H1", "generated_summary_and_topics", "New summary"]
+    assert bill.reference.updates == []
+
+
+def test_parse_bill_ids_accepts_spaces_and_commas():
+    assert backfill_summaries_runner.parse_bill_ids("H1 H18,S2539") == {
+        "H1",
+        "H18",
+        "S2539",
+    }
diff --git a/scripts/README.md b/scripts/README.md
index ef835e4b2..2d0536d2c 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -84,6 +84,21 @@ yarn firebase-admin run-script backfillBallotQuestionTestimonyCounts --env prod
 
 <!-- TODO: document -->
 
+#### `backfillBillPdfText`
+
+Fills missing `content.DocumentText` on bill documents by checking the MA
+Legislature Document API and then falling back to embedded text in the official
+bill PDF. The script is dry-run by default; pass `--commit true` to write
+updates.
+
+```sh
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --bills "H1 H18 H4787 H5008 S2539" --output ./bill-pdf-text-dry-run.csv
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --commit true --output ./bill-pdf-text-dev.csv
+```
+
+See `docs/bill-pdf-text-extraction.md` for extraction categories and the LLM
+summary/topic follow-up.
+
 #### `backfillBillNotificationEvents`
 
 <!-- TODO: document -->
diff --git a/scripts/firebase-admin/backfillBillPdfText.ts b/scripts/firebase-admin/backfillBillPdfText.ts
new file mode 100644
index 000000000..53f90b88f
--- /dev/null
+++ b/scripts/firebase-admin/backfillBillPdfText.ts
@@ -0,0 +1,162 @@
+import fs from "fs"
+import { z } from "zod"
+import { getDocumentWithPdfTextFallback } from "../../functions/src/bills/documentTextFallback"
+import { Timestamp } from "../../functions/src/firebase"
+import { Script } from "./types"
+
+type BackfillStatus =
+  | "skipped_has_document_text" // bill already had content.DocumentText
+  | "updated_api_text" // committed run; documentTextSource was "api"
+  | "updated_pdf_text" // committed run; documentTextSource was not "api"
+  | "pdf_no_text" // extraction status "no-text"
+  | "pdf_too_short" // extraction status "too-short"
+  | "pdf_fetch_failed" // extraction status "fetch-error", unknown, or missing
+  | "pdf_parse_failed" // extraction status "parse-error"
+  | "dry_run_would_update" // text was found but commit was off
+
+type BackfillRow = {
+  bill_id: string
+  status: BackfillStatus
+  source: "api" | "pdf" | "" // "" for skipped rows or when no source is known
+  page_count: number | "" // "" when no page count is available
+  char_count: number | "" // "" when no character count is available
+  error: string // "" when there is nothing to report
+}
+
+const Args = z.object({
+  court: z.coerce.number(), // selects /generalCourts/{court}/bills
+  bills: z.string().optional(), // optional allowlist, split by parseBillIds
+  commit: z
+    .union([z.boolean(), z.string()])
+    .optional()
+    .transform(value => value === true || value === "true"), // else dry run
+  output: z.string().optional() // CSV path; CSV goes to stdout when omitted
+})
+
+// Backfills missing `content.DocumentText` for the bills of one general court
+// and emits one CSV row per bill. Documents are updated only when commit is set.
+export const script: Script = async ({ db, args }) => {
+  const { court, bills, commit = false, output } = Args.parse(args),
+    allowlist = parseBillIds(bills),
+    snapshot = await db.collection(`/generalCourts/${court}/bills`).get(),
+    rows: BackfillRow[] = []
+
+  let processed = 0 // bills that lacked text and were looked up
+  let updated = 0 // bills actually written; stays 0 in a dry run
+  for (const bill of snapshot.docs) {
+    const id = bill.id,
+      data = bill.data()
+
+    if (allowlist && !allowlist.has(id)) {
+      continue
+    }
+
+    if (data.content?.DocumentText != null) {
+      rows.push({
+        bill_id: id,
+        status: "skipped_has_document_text",
+        source: "",
+        page_count: "",
+        char_count: "",
+        error: ""
+      })
+      continue
+    }
+
+    processed++
+    const result = await getDocumentWithPdfTextFallback(court, id),
+      documentText = result.content.DocumentText
+
+    if (documentText != null) {
+      if (commit) {
+        await bill.ref.update({
+          "content.DocumentText": documentText,
+          fetchedAt: Timestamp.now()
+        })
+        updated++
+      }
+      rows.push({
+        bill_id: id,
+        status: commit
+          ? result.documentTextSource === "api"
+            ? "updated_api_text"
+            : "updated_pdf_text"
+          : "dry_run_would_update",
+        source: result.documentTextSource ?? "",
+        page_count: result.pdfTextExtraction?.pageCount ?? "",
+        char_count:
+          result.pdfTextExtraction?.charCount ??
+          documentText.replace(/\s/g, "").length, // non-whitespace characters
+        error: ""
+      })
+      continue
+    }
+
+    const extraction = result.pdfTextExtraction
+    rows.push({
+      bill_id: id,
+      status: mapFailureStatus(extraction?.status),
+      source: "pdf",
+      page_count: extraction?.pageCount ?? "",
+      char_count: extraction?.charCount ?? "",
+      error: extraction?.error ?? ""
+    })
+  }
+
+  const csv = toCsv(rows)
+  if (output) {
+    fs.writeFileSync(output, csv)
+    console.log(`Wrote ${rows.length} rows to ${output}`)
+  } else {
+    console.log(csv)
+  }
+
+  const summary = commit ? `Updated ${updated} of` : "Dry run checked"
+  console.log(`${summary} ${processed} missing-text bills`)
+}
+
+// Parses a comma/whitespace separated id list; undefined means "no allowlist".
+function parseBillIds(bills?: string) {
+  if (!bills) return undefined
+  const ids = new Set<string>()
+  for (const token of bills.split(/[,\s]+/)) {
+    const id = token.trim()
+    if (id) ids.add(id)
+  }
+  return ids
+}
+
+/**
+ * Converts a PDF extraction status into the status reported in the CSV.
+ *
+ * @param status extraction status string, if an extraction result exists
+ * @returns the matching failure status; "fetch-error", unrecognized, or
+ *   missing statuses all map to "pdf_fetch_failed"
+ */
+function mapFailureStatus(status?: string): BackfillStatus {
+  if (status === "no-text") return "pdf_no_text"
+  if (status === "too-short") return "pdf_too_short"
+  if (status === "parse-error") return "pdf_parse_failed"
+  return "pdf_fetch_failed"
+}
+
+// Serializes the report as CSV text: a header line, then one line per row.
+// Lines are joined with "\n"; no trailing newline is appended.
+function toCsv(rows: BackfillRow[]) {
+  const columns = [
+    "bill_id",
+    "status",
+    "source",
+    "page_count",
+    "char_count",
+    "error"
+  ] as const
+  const table = [[...columns], ...rows.map(row => columns.map(c => row[c]))]
+
+  return table.map(cells => cells.map(formatCsvCell).join(",")).join("\n")
+}
+
+function formatCsvCell(value: string | number) {
+  const text = String(value) // RFC 4180: quote cells with `"`, `,`, CR or LF
+  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
+}