diff --git a/.gitignore b/.gitignore index 9f73a9f61..d0e485598 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,5 @@ cert.txt # lets each user define their own vscode settings .vscode/settings.json + +.serena/ \ No newline at end of file diff --git a/docs/bill-pdf-text-extraction.md b/docs/bill-pdf-text-extraction.md new file mode 100644 index 000000000..4b2b3dfc5 --- /dev/null +++ b/docs/bill-pdf-text-extraction.md @@ -0,0 +1,63 @@ +# Bill PDF Text Extraction + +Some Massachusetts Legislature bill records have `content.DocumentText` set to +null in the Document API even though the bill PDF contains embedded text. Maple +now falls back to the official PDF at `/Bills/{court}/{billId}.pdf` when the API +text is missing. + +## Extraction Scope + +The current extractor handles PDFs with embedded text. It does not perform OCR, +so scanned or image-only PDFs are reported but not repaired. + +Known 194th General Court examples: + +- `H1`: large embedded-text PDF. +- `H4787`: short embedded-text PDF. +- `H5008`: ballot initiative embedded-text PDF. +- `S2539`: regulatory/report-style embedded-text PDF. +- `H18`: image-only/scanned PDF; no OCR support in this implementation. + +## Runtime Scraper Behavior + +The bill scraper first calls the MA Legislature Document API. If +`DocumentText` is present, it stores the API response as before. If +`DocumentText` is null or absent, the scraper downloads the PDF and tries to +extract text with `pdf-parse`. + +Successful PDF extraction stores the result in the existing +`content.DocumentText` field. Failed extraction leaves `DocumentText` absent and +logs the extraction status. + +## Backfill Existing Bills + +Run the PDF text backfill in dry-run mode first: + +```sh +yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --bills "H1 H18 H4787 H5008 S2539" --output ./bill-pdf-text-dry-run.csv +``` + +After reviewing the CSV, commit writes: + +```sh +yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --commit true --output ./bill-pdf-text-dev.csv +``` + +The script only writes `content.DocumentText` and `fetchedAt` for bills that are +missing `content.DocumentText`. Bills that already have text are skipped. + +## Summary And Topic Backfill + +Updating existing bill documents does not trigger the Python LLM function, +because that function currently runs on document creation only. After committing +PDF text, run the LLM backfill for the repaired bills: + +```sh +python llm/backfill_summaries_runner.py --court 194 --bill-ids "H1 H4787 H5008 S2539" --output ./summaries-and-topics.csv +``` + +Use `--dry-run` to verify which rows would be processed without updating +Firestore. + +`backfill_summaries.py` is the legacy immediate-run wrapper. +`backfill_summaries_runner.py` is the import-safe CLI and test target. diff --git a/functions/package.json b/functions/package.json index d459dda2d..a726ba9bb 100644 --- a/functions/package.json +++ b/functions/package.json @@ -29,6 +29,7 @@ "luxon": "^2.3.1", "nanoid": "^3.3.2", "object-hash": "^3.0.0", + "pdf-parse": "1.1.1", "runtypes": "6.6.0", "ssl-root-cas": "^1.3.1", "typesense": "^1.2.2", @@ -40,6 +41,7 @@ "@types/jsdom": "^21.1.7", "@types/luxon": "^2.0.9", "@types/object-hash": "^2.2.1", + "@types/pdf-parse": "1.1.5", "copyfiles": "^2.4.1", "firebase-functions-test": "^0.3.3", "firebase-tools": "^13.18.0", diff --git a/functions/src/bills/bills.test.ts b/functions/src/bills/bills.test.ts new file mode 100644 index 000000000..1d99b8712 --- /dev/null +++ b/functions/src/bills/bills.test.ts @@ -0,0 +1,82 @@ +jest.mock("../malegislature", () => ({ + getDocument: jest.fn(), + getDocumentPdf: jest.fn() +})) +jest.mock("./pdfText", () => ({ + extractBillTextFromPdf: jest.fn() +})) + +import { getDocumentWithPdfTextFallback } from "./documentTextFallback" +import { extractBillTextFromPdf } from "./pdfText" + +const mockedApi = jest.requireMock("../malegislature") as { + getDocument: jest.Mock + getDocumentPdf: jest.Mock +} +const mockedExtractBillTextFromPdf = + extractBillTextFromPdf as jest.MockedFunction + +describe("getDocumentWithPdfTextFallback", () => { + beforeEach(() => { + jest.resetAllMocks() + }) + + it("does not fetch a PDF when API text is present", async () => { + mockedApi.getDocument.mockResolvedValue({ DocumentText: "API text" }) + + await expect( + getDocumentWithPdfTextFallback(194, "H1") + ).resolves.toMatchObject({ + content: { DocumentText: "API text" }, + documentTextSource: "api" + }) + expect(mockedApi.getDocumentPdf).not.toHaveBeenCalled() + }) + + it("sets DocumentText when PDF extraction succeeds", async () => { + mockedApi.getDocument.mockResolvedValue({ DocumentText: null }) + mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf")) + mockedExtractBillTextFromPdf.mockResolvedValue({ + status: "extracted", + text: "PDF text", + pageCount: 1, + charCount: 7 + }) + + await expect( + getDocumentWithPdfTextFallback(194, "H1") + ).resolves.toMatchObject({ + content: { DocumentText: "PDF text" }, + documentTextSource: "pdf", + pdfTextExtraction: { status: "extracted" } + }) + }) + + it("leaves DocumentText absent when PDF has no text", async () => { + mockedApi.getDocument.mockResolvedValue({ DocumentText: null }) + mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf")) + mockedExtractBillTextFromPdf.mockResolvedValue({ + status: "no-text", + pageCount: 1, + charCount: 0 + }) + + const result = await getDocumentWithPdfTextFallback(194, "H18") + + expect(result.content).not.toHaveProperty("DocumentText") + expect(result.pdfTextExtraction).toMatchObject({ status: "no-text" }) + }) + + it("leaves DocumentText absent when PDF fetch fails", async () => { + mockedApi.getDocument.mockResolvedValue({ DocumentText: null }) + mockedApi.getDocumentPdf.mockRejectedValue(new Error("not found")) + + const result = await getDocumentWithPdfTextFallback(194, "H18") + + expect(result.content).not.toHaveProperty("DocumentText") + expect(result.pdfTextExtraction).toMatchObject({ + status: "fetch-error", + error: "not found" + }) + }) +}) diff --git a/functions/src/bills/bills.ts b/functions/src/bills/bills.ts index 3602c4f07..58aeb8a71 100644 --- a/functions/src/bills/bills.ts +++ b/functions/src/bills/bills.ts @@ -1,9 +1,13 @@ import { isString } from "lodash" +import { logger } from "firebase-functions" import { logFetchError } from "../common" import * as api from "../malegislature" import { createScraper } from "../scraper" +import { getDocumentWithPdfTextFallback } from "./documentTextFallback" import { Bill, MISSING_TIMESTAMP } from "./types" +export { getDocumentWithPdfTextFallback } from "./documentTextFallback" + /** * There are around 8000 documents. With 8 batches per day, 20 parallel * scrapers, and 50 documents per batch, we will process all documents once per @@ -18,7 +22,8 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } = fetchBatchTimeout: 240, startBatchTimeout: 240, fetchResource: async (court: number, id: string, current) => { - const content = await api.getDocument({ id, court }) + const { content, pdfTextExtraction } = + await getDocumentWithPdfTextFallback(court, id) const history = await api .getBillHistory(court, id) .catch(logFetchError("bill history", id)) @@ -28,8 +33,11 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } = .getSimilarBills(court, id) .catch(logFetchError("similar bills", id)) .then(bills => bills?.map(b => b.BillNumber).filter(isString) ?? []) - if (content.DocumentText == null) { - delete content.DocumentText + + if (content.DocumentText == null && pdfTextExtraction) { + logger.info( + `No bill text extracted from PDF for ${court}/${id}: ${pdfTextExtraction.status}` + ) } const resource: Partial = { diff --git a/functions/src/bills/documentTextFallback.ts b/functions/src/bills/documentTextFallback.ts new file mode 100644 index 000000000..ec203ff29 --- /dev/null +++ b/functions/src/bills/documentTextFallback.ts @@ -0,0 +1,60 @@ +import * as api from "../malegislature" +import { extractBillTextFromPdf, PdfTextExtractionResult } from "./pdfText" + +export type DocumentTextFallbackResult = { + content: any + documentTextSource?: "api" | "pdf" + pdfTextExtraction?: PdfTextExtractionResult | PdfFetchFailure +} + +type PdfFetchFailure = { + status: "fetch-error" + charCount: 0 + pageCount?: undefined + error: string +} + +export async function getDocumentWithPdfTextFallback( + court: number, + id: string +): Promise { + const content = await api.getDocument({ id, court }) + + if (content.DocumentText != null) { + return { + content, + documentTextSource: "api" + } + } + + delete content.DocumentText + + let pdf: Buffer + try { + pdf = await api.getDocumentPdf({ id, court }) + } catch (e) { + return { + content, + pdfTextExtraction: { + status: "fetch-error", + charCount: 0, + error: e instanceof Error ? e.message : String(e) + } + } + } + + const pdfTextExtraction = await extractBillTextFromPdf(pdf) + if (pdfTextExtraction.status === "extracted") { + content.DocumentText = pdfTextExtraction.text + return { + content, + documentTextSource: "pdf", + pdfTextExtraction + } + } + + return { + content, + pdfTextExtraction + } +} diff --git a/functions/src/bills/pdfText.test.ts b/functions/src/bills/pdfText.test.ts new file mode 100644 index 000000000..6a9ba58d1 --- /dev/null +++ b/functions/src/bills/pdfText.test.ts @@ -0,0 +1,98 @@ +const mockedPdfParse = jest.fn() + +jest.mock("pdf-parse/lib/pdf-parse", () => mockedPdfParse) + +import { extractBillTextFromPdf, normalizeExtractedBillText } from "./pdfText" + +describe("normalizeExtractedBillText", () => { + it("trims and collapses noisy whitespace", () => { + expect( + normalizeExtractedBillText(" \r\n Section 1. Text\t\t here. \n\n\n") + ).toBe("Section 1. Text here.") + }) + + it("removes standalone page counters", () => { + expect( + normalizeExtractedBillText("1 of 3\nHOUSE No. 1\n-- 2 of 3 --\nBill text") + ).toBe("HOUSE No. 1\nBill text") + }) + + it("preserves substantive bill text", () => { + const text = + "The General Laws are hereby amended.\nSection 2. This act shall take effect." + + expect(normalizeExtractedBillText(text)).toBe(text) + }) +}) + +describe("extractBillTextFromPdf", () => { + beforeEach(() => { + mockedPdfParse.mockReset() + }) + + it("returns extracted when text is long enough", async () => { + mockedPdfParse.mockResolvedValue({ + text: "An Act " + "with enough extracted text. ".repeat(10), + numpages: 2, + numrender: 2, + info: {}, + metadata: {}, + version: "default" + }) + + const result = await extractBillTextFromPdf(Buffer.from("pdf")) + + expect(result.status).toBe("extracted") + expect(result.pageCount).toBe(2) + expect(result.text).toContain("An Act") + }) + + it("returns no-text for empty extraction", async () => { + mockedPdfParse.mockResolvedValue({ + text: " \n\t ", + numpages: 1, + numrender: 1, + info: {}, + metadata: {}, + version: "default" + }) + + await expect( + extractBillTextFromPdf(Buffer.from("pdf")) + ).resolves.toMatchObject({ + status: "no-text", + charCount: 0, + pageCount: 1 + }) + }) + + it("returns too-short for tiny extraction", async () => { + mockedPdfParse.mockResolvedValue({ + text: "short text", + numpages: 1, + numrender: 1, + info: {}, + metadata: {}, + version: "default" + }) + + await expect( + extractBillTextFromPdf(Buffer.from("pdf")) + ).resolves.toMatchObject({ + status: "too-short", + text: "short text", + pageCount: 1 + }) + }) + + it("returns parse-error when parser throws", async () => { + mockedPdfParse.mockRejectedValue(new Error("bad pdf")) + + await expect( + extractBillTextFromPdf(Buffer.from("pdf")) + ).resolves.toMatchObject({ + status: "parse-error", + error: "bad pdf" + }) + }) +}) diff --git a/functions/src/bills/pdfText.ts b/functions/src/bills/pdfText.ts new file mode 100644 index 000000000..ddfe1a03a --- /dev/null +++ b/functions/src/bills/pdfText.ts @@ -0,0 +1,64 @@ +const pdfParse = + require("pdf-parse/lib/pdf-parse") as typeof import("pdf-parse") + +export const MIN_EXTRACTED_TEXT_CHARS = 10 + +export type PdfTextExtractionStatus = + | "extracted" + | "no-text" + | "too-short" + | "parse-error" + +export type PdfTextExtractionResult = { + status: PdfTextExtractionStatus + text?: string + pageCount?: number + charCount: number + error?: string +} + +export async function extractBillTextFromPdf( + pdf: Buffer +): Promise { + try { + const result = await pdfParse(pdf), + text = normalizeExtractedBillText(result.text), + charCount = text.replace(/\s/g, "").length, + pageCount = result.numpages + + if (!text) { + return { status: "no-text", pageCount, charCount } + } + + if (charCount < MIN_EXTRACTED_TEXT_CHARS) { + return { status: "too-short", text, pageCount, charCount } + } + + return { status: "extracted", text, pageCount, charCount } + } catch (e) { + return { + status: "parse-error", + charCount: 0, + error: e instanceof Error ? e.message : String(e) + } + } +} + +export function normalizeExtractedBillText(raw: string): string { + return raw + .replace(/\r\n?/g, "\n") + .split("\n") + .map(line => line.trim()) + .filter(line => !!line && !isPageCounter(line)) + .join("\n") + .replace(/[ \t]+/g, " ") + .replace(/\n{3,}/g, "\n\n") + .trim() +} + +function isPageCounter(line: string) { + return ( + /^\d+\s+(of|OF)\s+\d+$/.test(line) || + /^--\s*\d+\s+of\s+\d+\s*--$/.test(line) + ) +} diff --git a/functions/src/malegislature.ts b/functions/src/malegislature.ts index 3771830d8..df0d93f01 100644 --- a/functions/src/malegislature.ts +++ b/functions/src/malegislature.ts @@ -106,14 +106,17 @@ export async function getDocumentPdf({ }: { id: string court: number -}) { +}): Promise { const response = await request({ baseURL: "https://malegislature.gov", url: `/Bills/${court}/${id}.pdf`, method: "GET", + responseType: "arraybuffer", timeout: 30_000 }) - return response as any + return Buffer.isBuffer(response) + ? response + : Buffer.from(response as ArrayBuffer) } export async function listMembers({ diff --git a/functions/yarn.lock b/functions/yarn.lock index 6289a53f7..2e1522d04 100644 --- a/functions/yarn.lock +++ b/functions/yarn.lock @@ -1518,6 +1518,13 @@ resolved "https://registry.npmjs.org/@types/object-hash/-/object-hash-2.2.1.tgz" integrity sha512-i/rtaJFCsPljrZvP/akBqEwUP2y5cZLOmvO+JaYnz01aPknrQ+hB5MRcO7iqCUsFaYfTG8kGfKUyboA07xeDHQ== +"@types/pdf-parse@1.1.5": + version "1.1.5" + resolved "https://registry.yarnpkg.com/@types/pdf-parse/-/pdf-parse-1.1.5.tgz#a0959022604457169177622b512ed03b975f10e2" + integrity sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA== + dependencies: + "@types/node" "*" + "@types/qs@*": version "6.9.10" resolved "https://registry.npmjs.org/@types/qs/-/qs-6.9.10.tgz" @@ -2689,7 +2696,7 @@ debug@4.3.1: dependencies: ms "2.1.2" -debug@^3.2.7: +debug@^3.1.0, debug@^3.2.7: version "3.2.7" resolved "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz" integrity sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ== @@ -5632,6 +5639,11 @@ node-emoji@^2.2.0: emojilib "^2.4.0" skin-tone "^2.0.0" +node-ensure@^0.0.0: + version "0.0.0" + resolved "https://registry.yarnpkg.com/node-ensure/-/node-ensure-0.0.0.tgz#ecae764150de99861ec5c810fd5d096b183932a7" + integrity sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw== + node-fetch@^2.6.1, node-fetch@^2.6.7, node-fetch@^2.6.9, node-fetch@^2.7.0: version "2.7.0" resolved "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz" @@ -5971,6 +5983,14 @@ path-to-regexp@^8.0.0: resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-8.2.0.tgz#73990cc29e57a3ff2a0d914095156df5db79e8b4" integrity sha512-TdrF7fW9Rphjq4RjrW0Kp2AW0Ahwu9sRGTkS6bvDi0SCwZlEZYmcfDbEsTz8RVk0EHIS/Vd1bv3JhG+1xZuAyQ== +pdf-parse@1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7" + integrity sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A== + dependencies: + debug "^3.1.0" + node-ensure "^0.0.0" + pg-cloudflare@^1.2.7: version "1.2.7" resolved "https://registry.yarnpkg.com/pg-cloudflare/-/pg-cloudflare-1.2.7.tgz#a1f3d226bab2c45ae75ea54d65ec05ac6cfafbef" diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 774f9c7c0..da89edb24 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -1,101 +1,19 @@ -"""This script fills any missing 'summary' or 'topics' fields on the data model. +"""Legacy import-time entrypoint for backfilling bill summaries and topics. -The document must have a 'Title' and 'DocumentText' field to generate them. The -script queries only the general court 194 bills, modifies the firebase database -in-place, and generates a CSV with a description of what happened. The header for -the CSV is `bill_id,status,summary,topics`. The possible statuses are, - -- `skipped` - the bill doesn't have either a title or text, skip it -- `previous_summary` - the bill previously had a summary, skip it -- `failed_summary` - something went wrong when trying to summarize, skip it -- `previous_topics` - the bill previously had topics, skip it -- `failed_topics` - something went wrong when trying to generate topics, skip it -- `generated_summary` - both the summary and topics were generated successfully - -Developer notes: -- you'll need to set the 'OPENAI_API_KEY' environment variable +Importing this module immediately runs the backfill for General Court 194 and +writes `./summaries-and-topics.csv`, matching the original script behavior. +Use `backfill_summaries_runner.py` for an import-safe CLI and test target. """ import firebase_admin -from llm_functions import get_summary_api_function, get_tags_api_function_v2 +from backfill_summaries_runner import ( + CSV_SUMMARY_OUTPUT, + backfill_summaries, + write_rows, +) from firebase_admin import firestore -from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC -import csv -# Module constants -FIREBASE_COLLECTION_PATH = "generalCourts/194/bills" -CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv" - -# Application Default credentials are automatically created. -app = firebase_admin.initialize_app() +firebase_admin.initialize_app() db = firestore.client() - - -def make_bill_summary(bill_id, status, summary, topics): - """Generate a row for csv.writerow - - The goal with this function is to not forget all the arguments to subsequent - csv.writerow calls. - """ - return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"] - - -bills_ref = db.collection(FIREBASE_COLLECTION_PATH) -bills = bills_ref.get() -with open(CSV_SUMMARY_OUTPUT, "w") as csvfile: - csv_writer = csv.writer(csvfile) - csv_writer.writerow(["bill_id", "status", "summary", "topics"]) - for bill in bills: - document = bill.to_dict() - bill_id = document["id"] - document_text = document.get("content", {}).get("DocumentText") - document_title = document.get("content", {}).get("Title") - summary = document.get("summary") - - # No document text or title, skip it because we can't summarize it - if document_text is None or document_title is None: - csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None)) - continue - - # If the summary is already populated move on - if summary is not None: - csv_writer.writerow( - make_bill_summary(bill_id, "previous_summary", None, None) - ) - continue - - summary = get_summary_api_function(bill_id, document_title, document_text) - if summary["status"] in [-1, -2]: - csv_writer.writerow( - make_bill_summary(bill_id, "failed_summary", None, None) - ) - continue - # Note: `normalize_summary` does some post-processing to clean up the summaries - # As of 2025-10-21 this was necessary due to the LLM prompt - summary = summary["summary"] - bill.reference.update({"summary": summary}) - - # If the topics are already populated, just make a note of it - topics = document.get("topics") - if topics is not None: - csv_writer.writerow( - make_bill_summary(bill_id, "previous_topics", None, None) - ) - - tags = get_tags_api_function_v2(bill_id, document_title, summary) - # If the tags fail, make a note and at least write the summary for debugging - if tags["status"] != 1: - csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None)) - csv_writer.writerow( - make_bill_summary(bill_id, "generated_summary", summary, None) - ) - continue - topics_and_categories = get_categories_from_topics( - tags["tags"], CATEGORY_BY_TOPIC - ) - bill.reference.update({"topics": topics_and_categories}) - csv_writer.writerow( - make_bill_summary( - bill_id, "generated_summary_and_topics", summary, topics_and_categories - ) - ) +rows = backfill_summaries(db, court=194) +write_rows(CSV_SUMMARY_OUTPUT, rows) diff --git a/llm/backfill_summaries_runner.py b/llm/backfill_summaries_runner.py new file mode 100644 index 000000000..22b49893e --- /dev/null +++ b/llm/backfill_summaries_runner.py @@ -0,0 +1,146 @@ +"""Import-safe runner for backfilling missing bill summaries and topics. + +The document must have a `content.Title` and `content.DocumentText` field to +generate a summary. Topics can be generated from either an existing summary or a +newly generated one. + +The CSV header is `bill_id,status,summary,topics`. The possible statuses are, + +- `skipped` - the bill doesn't have either a title or text, skip it +- `previous_summary` - the bill previously had a summary +- `failed_summary` - something went wrong when trying to summarize, skip it +- `previous_topics` - the bill previously had topics, skip it +- `failed_topics` - something went wrong when trying to generate topics, skip it +- `generated_summary_and_topics` - the summary and topics were generated + successfully + +Developer notes: +- you'll need to set the `OPENAI_API_KEY` environment variable +""" + +import argparse +import csv +from typing import Any + +import firebase_admin +from bill_on_document_created import CATEGORY_BY_TOPIC, get_categories_from_topics +from firebase_admin import firestore +from llm_functions import get_summary_api_function, get_tags_api_function_v2 + +CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv" + + +def make_bill_summary(bill_id, status, summary, topics) -> list[str]: + """Generate a row for csv.writerow.""" + return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"] + + +def parse_bill_ids(raw_bill_ids: str | None) -> set[str] | None: + if raw_bill_ids is None: + return None + + bill_ids = { + bill_id.strip() + for chunk in raw_bill_ids.split(",") + for bill_id in chunk.split() + if bill_id.strip() + } + return bill_ids or None + + +def backfill_summaries( + db: Any, + court: int = 194, + bill_ids: set[str] | None = None, + dry_run: bool = False, +) -> list[list[str]]: + rows = [["bill_id", "status", "summary", "topics"]] + bills_ref = db.collection(f"generalCourts/{court}/bills") + bills = bills_ref.get() + + for bill in bills: + document = bill.to_dict() + bill_id = document["id"] + if bill_ids is not None and bill_id not in bill_ids: + continue + + document_text = document.get("content", {}).get("DocumentText") + document_title = document.get("content", {}).get("Title") + summary = document.get("summary") + topics = document.get("topics") + + if document_text is None or document_title is None: + rows.append(make_bill_summary(bill_id, "skipped", None, None)) + continue + + if summary is None: + summary_response = get_summary_api_function( + bill_id, document_title, document_text + ) + if summary_response["status"] in [-1, -2]: + rows.append(make_bill_summary(bill_id, "failed_summary", None, None)) + continue + + summary = summary_response["summary"] + if not dry_run: + bill.reference.update({"summary": summary}) + else: + rows.append(make_bill_summary(bill_id, "previous_summary", None, None)) + + if topics is not None: + rows.append(make_bill_summary(bill_id, "previous_topics", None, None)) + continue + + tags = get_tags_api_function_v2(bill_id, document_title, summary) + if tags["status"] != 1: + rows.append(make_bill_summary(bill_id, "failed_topics", summary, None)) + continue + + topics_and_categories = get_categories_from_topics( + tags["tags"], CATEGORY_BY_TOPIC + ) + if not dry_run: + bill.reference.update({"topics": topics_and_categories}) + + rows.append( + make_bill_summary( + bill_id, + "generated_summary_and_topics", + summary, + topics_and_categories, + ) + ) + + return rows + + +def write_rows(output: str, rows: list[list[str]]) -> None: + with open(output, "w", newline="") as csvfile: + csv.writer(csvfile).writerows(rows) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--court", type=int, default=194) + parser.add_argument("--bill-ids") + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--output", default=CSV_SUMMARY_OUTPUT) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> None: + args = parse_args(argv) + + firebase_admin.initialize_app() + db = firestore.client() + rows = backfill_summaries( + db, + court=args.court, + bill_ids=parse_bill_ids(args.bill_ids), + dry_run=args.dry_run, + ) + write_rows(args.output, rows) + + +if __name__ == "__main__": + main() diff --git a/llm/test_backfill_summaries.py b/llm/test_backfill_summaries.py new file mode 100644 index 000000000..1824f27ec --- /dev/null +++ b/llm/test_backfill_summaries.py @@ -0,0 +1,107 @@ +import backfill_summaries_runner + + +class FakeReference: + def __init__(self): + self.updates = [] + + def update(self, payload): + self.updates.append(payload) + + +class FakeBill: + def __init__(self, data): + self._data = data + self.reference = FakeReference() + + def to_dict(self): + return self._data + + +class FakeCollection: + def __init__(self, bills): + self._bills = bills + + def get(self): + return self._bills + + +class FakeDb: + def __init__(self, bills): + self.bills = bills + self.path = None + + def collection(self, path): + self.path = path + return FakeCollection(self.bills) + + +def test_backfill_summaries_skips_bills_without_text(): + bill = FakeBill({"id": "H18", "content": {"Title": "No text"}}) + rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill])) + + assert rows == [ + ["bill_id", "status", "summary", "topics"], + ["H18", "skipped", "None", "None"], + ] + assert bill.reference.updates == [] + + +def test_backfill_summaries_generates_topics_when_summary_exists(monkeypatch): + bill = FakeBill( + { + "id": "H1", + "content": {"Title": "Title", "DocumentText": "Text"}, + "summary": "Existing summary", + } + ) + monkeypatch.setattr( + backfill_summaries_runner, + "get_tags_api_function_v2", + lambda bill_id, title, summary: { + "status": 1, + "tags": ["Consumer protection"], + }, + ) + + rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill])) + + assert rows[1] == ["H1", "previous_summary", "None", "None"] + assert rows[2][0:3] == [ + "H1", + "generated_summary_and_topics", + "Existing summary", + ] + assert bill.reference.updates == [ + {"topics": [{"topic": "Consumer protection", "category": "Commerce"}]} + ] + + +def test_backfill_summaries_dry_run_avoids_firestore_updates(monkeypatch): + bill = FakeBill({"id": "H1", "content": {"Title": "Title", "DocumentText": "Text"}}) + monkeypatch.setattr( + backfill_summaries_runner, + "get_summary_api_function", + lambda bill_id, title, text: {"status": 1, "summary": "New summary"}, + ) + monkeypatch.setattr( + backfill_summaries_runner, + "get_tags_api_function_v2", + lambda bill_id, title, summary: { + "status": 1, + "tags": ["Consumer protection"], + }, + ) + + rows = backfill_summaries_runner.backfill_summaries(FakeDb([bill]), dry_run=True) + + assert rows[1][0:3] == ["H1", "generated_summary_and_topics", "New summary"] + assert bill.reference.updates == [] + + +def test_parse_bill_ids_accepts_spaces_and_commas(): + assert backfill_summaries_runner.parse_bill_ids("H1 H18,S2539") == { + "H1", + "H18", + "S2539", + } diff --git a/scripts/README.md b/scripts/README.md index ef835e4b2..2d0536d2c 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -84,6 +84,21 @@ yarn firebase-admin run-script backfillBallotQuestionTestimonyCounts --env prod +#### `backfillBillPdfText` + +Fills missing `content.DocumentText` on bill documents by checking the MA +Legislature Document API and then falling back to embedded text in the official +bill PDF. The script is dry-run by default; pass `--commit true` to write +updates. + +```sh +yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --bills "H1 H18 H4787 H5008 S2539" --output ./bill-pdf-text-dry-run.csv +yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --commit true --output ./bill-pdf-text-dev.csv +``` + +See `docs/bill-pdf-text-extraction.md` for extraction categories and the LLM +summary/topic follow-up. + #### `backfillBillNotificationEvents` diff --git a/scripts/firebase-admin/backfillBillPdfText.ts b/scripts/firebase-admin/backfillBillPdfText.ts new file mode 100644 index 000000000..53f90b88f --- /dev/null +++ b/scripts/firebase-admin/backfillBillPdfText.ts @@ -0,0 +1,162 @@ +import fs from "fs" +import { z } from "zod" +import { getDocumentWithPdfTextFallback } from "../../functions/src/bills/documentTextFallback" +import { Timestamp } from "../../functions/src/firebase" +import { Script } from "./types" + +type BackfillStatus = + | "skipped_has_document_text" + | "updated_api_text" + | "updated_pdf_text" + | "pdf_no_text" + | "pdf_too_short" + | "pdf_fetch_failed" + | "pdf_parse_failed" + | "dry_run_would_update" + +type BackfillRow = { + bill_id: string + status: BackfillStatus + source: "api" | "pdf" | "" + page_count: number | "" + char_count: number | "" + error: string +} + +const Args = z.object({ + court: z.coerce.number(), + bills: z.string().optional(), + commit: z + .union([z.boolean(), z.string()]) + .optional() + .transform(value => value === true || value === "true"), + output: z.string().optional() +}) + +export const script: Script = async ({ db, args }) => { + const { court, bills, commit = false, output } = Args.parse(args), + allowlist = parseBillIds(bills), + snapshot = await db.collection(`/generalCourts/${court}/bills`).get(), + rows: BackfillRow[] = [] + + let processed = 0 + + for (const bill of snapshot.docs) { + const id = bill.id, + data = bill.data() + + if (allowlist && !allowlist.has(id)) { + continue + } + + if (data.content?.DocumentText != null) { + rows.push({ + bill_id: id, + status: "skipped_has_document_text", + source: "", + page_count: "", + char_count: "", + error: "" + }) + continue + } + + processed++ + + const result = await getDocumentWithPdfTextFallback(court, id), + documentText = result.content.DocumentText + + if (documentText != null) { + if (commit) { + await bill.ref.update({ + "content.DocumentText": documentText, + fetchedAt: Timestamp.now() + }) + } + + rows.push({ + bill_id: id, + status: commit + ? result.documentTextSource === "api" + ? "updated_api_text" + : "updated_pdf_text" + : "dry_run_would_update", + source: result.documentTextSource ?? "", + page_count: result.pdfTextExtraction?.pageCount ?? "", + char_count: + result.pdfTextExtraction?.charCount ?? + documentText.replace(/\s/g, "").length, + error: "" + }) + continue + } + + const extraction = result.pdfTextExtraction + rows.push({ + bill_id: id, + status: mapFailureStatus(extraction?.status), + source: "pdf", + page_count: extraction?.pageCount ?? "", + char_count: extraction?.charCount ?? "", + error: extraction?.error ?? "" + }) + } + + const csv = toCsv(rows) + if (output) { + fs.writeFileSync(output, csv) + console.log(`Wrote ${rows.length} rows to ${output}`) + } else { + console.log(csv) + } + + console.log( + `${commit ? "Updated" : "Dry run checked"} ${processed} missing-text bills` + ) +} + +function parseBillIds(bills?: string) { + if (!bills) return undefined + + return new Set( + bills + .split(/[,\s]+/) + .map(id => id.trim()) + .filter(Boolean) + ) +} + +function mapFailureStatus(status?: string): BackfillStatus { + switch (status) { + case "no-text": + return "pdf_no_text" + case "too-short": + return "pdf_too_short" + case "parse-error": + return "pdf_parse_failed" + case "fetch-error": + default: + return "pdf_fetch_failed" + } +} + +function toCsv(rows: BackfillRow[]) { + return [ + ["bill_id", "status", "source", "page_count", "char_count", "error"], + ...rows.map(row => [ + row.bill_id, + row.status, + row.source, + row.page_count, + row.char_count, + row.error + ]) + ] + .map(row => row.map(formatCsvCell).join(",")) + .join("\n") +} + +function formatCsvCell(value: string | number) { + const text = String(value) + return /[",\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text +}