# scripts/extract_paper_text.py

"""Extract text from benchmark PDF and paper DOCX.

This script exists to work around environments where the chat tooling
cannot directly read PDF/DOCX. It generates plain text files that are
easy for an agent to read and quote.

Usage:
  python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path


def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page in a PDF, with a banner before each page.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        One string in which each page is introduced by a
        ``===== PAGE <n> =====`` banner (pages are numbered from 1). A page
        whose ``extract_text()`` yields nothing contributes an empty body.
        Leading/trailing whitespace of the whole result is removed and a
        single trailing newline is appended.
    """
    # Imported lazily so the module can be imported without pypdf installed.
    from pypdf import PdfReader

    pages = PdfReader(str(pdf_path)).pages
    chunks = [
        f"\n\n===== PAGE {number} =====\n\n{page.extract_text() or ''}"
        for number, page in enumerate(pages, start=1)
    ]
    return "".join(chunks).strip() + "\n"


def extract_docx_text(docx_path: Path) -> str:
    """Return the paragraph and table text of a DOCX file as plain lines.

    Args:
        docx_path: Location of the DOCX file to read.

    Returns:
        Newline-separated text: first every non-blank paragraph (stripped),
        then every table row that has at least one non-blank cell, with the
        stripped cell texts joined by tabs. Tables are therefore emitted
        after all paragraphs rather than at their position in the document.
        The whole result is stripped (which can drop trailing tabs of the
        final row) and terminated with a single newline.
    """
    # Imported lazily so the module can be imported without python-docx.
    import docx  # python-docx

    document = docx.Document(str(docx_path))

    # Body paragraphs, skipping those that are empty after stripping.
    stripped_paragraphs = ((para.text or "").strip() for para in document.paragraphs)
    paragraph_lines = [text for text in stripped_paragraphs if text]

    # Tables (best-effort): one tab-separated line per non-empty row.
    rows_as_cells = (
        [cell.text.strip() for cell in row.cells]
        for table in document.tables
        for row in table.rows
    )
    table_lines = ["\t".join(cells) for cells in rows_as_cells if any(cells)]

    return "\n".join(paragraph_lines + table_lines).strip() + "\n"


def main() -> None:
    """Command-line entry point: extract both documents to text files.

    Reads ``--benchmark`` (PDF), ``--paper`` (DOCX) and ``--out`` (output
    directory) from the command line, writes ``benchmark.txt`` and
    ``paper.txt`` (UTF-8) into the output directory, and prints ``OK``
    followed by the two output paths.

    Raises:
        SystemExit: If the benchmark or the paper path does not exist. In
            that case nothing is created on disk.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
    ap.add_argument("--paper", required=True, help="Path to target paper DOCX")
    ap.add_argument("--out", required=True, help="Output directory")
    args = ap.parse_args()

    benchmark_path = Path(args.benchmark)
    paper_path = Path(args.paper)
    out_dir = Path(args.out)

    # Validate the inputs before touching the filesystem, so a mistyped
    # path does not leave an empty output directory tree behind.
    if not benchmark_path.exists():
        raise SystemExit(f"Benchmark not found: {benchmark_path}")
    if not paper_path.exists():
        raise SystemExit(f"Paper not found: {paper_path}")

    out_dir.mkdir(parents=True, exist_ok=True)
    benchmark_out = out_dir / "benchmark.txt"
    paper_out = out_dir / "paper.txt"

    # Extract both documents before writing either, so a failure in the
    # second extraction does not leave a partial result on disk.
    benchmark_txt = extract_pdf_text(benchmark_path)
    paper_txt = extract_docx_text(paper_path)

    benchmark_out.write_text(benchmark_txt, encoding="utf-8")
    paper_out.write_text(paper_txt, encoding="utf-8")

    print("OK")
    print(f"- {benchmark_out}")
    print(f"- {paper_out}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
评审专家 agent 2026-02-10 10:54:09 +00:00			`"""Extract text from benchmark PDF and paper DOCX.`

			`This script exists to work around environments where the chat tooling`
			`cannot directly read PDF/DOCX. It generates plain text files that are`
			`easy for an agent to read and quote.`

			`Usage:`
			`python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"`
			`"""`

			`from __future__ import annotations`

			`import argparse`
			`import os`
			`from pathlib import Path`


			`def extract_pdf_text(pdf_path: Path) -> str:`
			`from pypdf import PdfReader`

			`reader = PdfReader(str(pdf_path))`
			`parts = []`
			`for i, page in enumerate(reader.pages, start=1):`
			`text = page.extract_text() or ""`
			`parts.append(f"\n\n===== PAGE {i} =====\n\n{text}")`
			`return "".join(parts).strip() + "\n"`


			`def extract_docx_text(docx_path: Path) -> str:`
			`import docx # python-docx`

			`d = docx.Document(str(docx_path))`
			`lines = []`

			`# paragraphs`
			`for p in d.paragraphs:`
			`t = (p.text or "").strip()`
			`if t:`
			`lines.append(t)`

			`# tables (best-effort)`
			`for table in d.tables:`
			`for row in table.rows:`
			`cells = [c.text.strip() for c in row.cells]`
			`if any(cells):`
			`lines.append("\t".join(cells))`

			`return "\n".join(lines).strip() + "\n"`


			`def main() -> None:`
			`ap = argparse.ArgumentParser()`
			`ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")`
			`ap.add_argument("--paper", required=True, help="Path to target paper DOCX")`
			`ap.add_argument("--out", required=True, help="Output directory")`
			`args = ap.parse_args()`

			`benchmark_path = Path(args.benchmark)`
			`paper_path = Path(args.paper)`
			`out_dir = Path(args.out)`
			`out_dir.mkdir(parents=True, exist_ok=True)`

			`if not benchmark_path.exists():`
			`raise SystemExit(f"Benchmark not found: {benchmark_path}")`
			`if not paper_path.exists():`
			`raise SystemExit(f"Paper not found: {paper_path}")`

			`benchmark_txt = extract_pdf_text(benchmark_path)`
			`paper_txt = extract_docx_text(paper_path)`

			`(out_dir / "benchmark.txt").write_text(benchmark_txt, encoding="utf-8")`
			`(out_dir / "paper.txt").write_text(paper_txt, encoding="utf-8")`

			`print("OK")`
			`print(f"- {out_dir / 'benchmark.txt'}")`
			`print(f"- {out_dir / 'paper.txt'}")`


			`if __name__ == "__main__":`
			`main()`