"""Extract text from benchmark PDF and paper DOCX. This script exists to work around environments where the chat tooling cannot directly read PDF/DOCX. It generates plain text files that are easy for an agent to read and quote. Usage: python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw" """ from __future__ import annotations import argparse import os from pathlib import Path def extract_pdf_text(pdf_path: Path) -> str: from pypdf import PdfReader reader = PdfReader(str(pdf_path)) parts = [] for i, page in enumerate(reader.pages, start=1): text = page.extract_text() or "" parts.append(f"\n\n===== PAGE {i} =====\n\n{text}") return "".join(parts).strip() + "\n" def extract_docx_text(docx_path: Path) -> str: import docx # python-docx d = docx.Document(str(docx_path)) lines = [] # paragraphs for p in d.paragraphs: t = (p.text or "").strip() if t: lines.append(t) # tables (best-effort) for table in d.tables: for row in table.rows: cells = [c.text.strip() for c in row.cells] if any(cells): lines.append("\t".join(cells)) return "\n".join(lines).strip() + "\n" def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF") ap.add_argument("--paper", required=True, help="Path to target paper DOCX") ap.add_argument("--out", required=True, help="Output directory") args = ap.parse_args() benchmark_path = Path(args.benchmark) paper_path = Path(args.paper) out_dir = Path(args.out) out_dir.mkdir(parents=True, exist_ok=True) if not benchmark_path.exists(): raise SystemExit(f"Benchmark not found: {benchmark_path}") if not paper_path.exists(): raise SystemExit(f"Paper not found: {paper_path}") benchmark_txt = extract_pdf_text(benchmark_path) paper_txt = extract_docx_text(paper_path) (out_dir / "benchmark.txt").write_text(benchmark_txt, encoding="utf-8") (out_dir / "paper.txt").write_text(paper_txt, encoding="utf-8") print("OK") print(f"- {out_dir / 'benchmark.txt'}") print(f"- {out_dir / 'paper.txt'}") if __name__ == "__main__": main()