Files
essay/90_scripts/extract_paper_text.py

81 lines
2.3 KiB
Python
Raw Permalink Normal View History

2026-02-10 10:54:09 +00:00
"""Extract text from benchmark PDF and paper DOCX.
This script exists to work around environments where the chat tooling
cannot directly read PDF/DOCX. It generates plain text files that are
easy for an agent to read and quote.
Usage:
python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
"""
from __future__ import annotations
import argparse
import os
from pathlib import Path
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page of a PDF as one plain-text string.

    Each page is introduced by a ``===== PAGE n =====`` banner, with ``n``
    counting from 1 in the order the reader yields the pages. A page whose
    extraction yields nothing (``None`` or an empty string) still gets its
    banner, followed by an empty body.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The concatenated page sections with surrounding whitespace stripped
        and a single trailing newline appended.
    """
    # Imported lazily so the module can be loaded without pypdf installed.
    from pypdf import PdfReader

    pdf = PdfReader(str(pdf_path))
    # Normalise "no text" results to an empty string before formatting.
    page_texts = (page.extract_text() or "" for page in pdf.pages)
    sections = [
        f"\n\n===== PAGE {number} =====\n\n{body}"
        for number, body in enumerate(page_texts, start=1)
    ]
    combined = "".join(sections)
    return combined.strip() + "\n"
def extract_docx_text(docx_path: Path) -> str:
    """Return the text of a DOCX file as newline-separated plain text.

    Non-empty paragraphs are emitted first, in the order
    ``Document.paragraphs`` yields them. Table rows follow afterwards (they
    are not interleaved at their position in the document), one line per
    row with the stripped cell texts joined by tab characters. Rows whose
    cells are all empty after stripping are skipped.

    Args:
        docx_path: Location of the ``.docx`` file to read.

    Returns:
        The collected lines joined by newlines, stripped of surrounding
        whitespace, with a single trailing newline appended.
    """
    # Imported lazily; the ``docx`` module is provided by python-docx.
    import docx

    document = docx.Document(str(docx_path))

    # Body paragraphs: keep only those with visible text.
    stripped_paragraphs = (
        (para.text or "").strip() for para in document.paragraphs
    )
    paragraph_lines = [text for text in stripped_paragraphs if text]

    # Tables (best-effort): one tab-separated line per non-empty row.
    # NOTE(review): merged cells may be reported more than once by
    # ``row.cells``, which would repeat their text here - confirm against
    # the python-docx version in use.
    row_texts = (
        [cell.text.strip() for cell in row.cells]
        for tbl in document.tables
        for row in tbl.rows
    )
    table_lines = ["\t".join(cells) for cells in row_texts if any(cells)]

    return "\n".join(paragraph_lines + table_lines).strip() + "\n"
def main(argv: list[str] | None = None) -> None:
    """Extract the benchmark PDF and the paper DOCX into plain-text files.

    Parses ``--benchmark``, ``--paper`` and ``--out``, extracts both
    documents, then writes ``benchmark.txt`` and ``paper.txt`` (UTF-8) into
    the output directory and prints ``OK`` followed by the two paths.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point
            relies on; passing a list allows programmatic invocation.

    Raises:
        SystemExit: If either input path is missing or is not a regular
            file, or if argparse rejects the command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
    parser.add_argument("--paper", required=True, help="Path to target paper DOCX")
    parser.add_argument("--out", required=True, help="Output directory")
    args = parser.parse_args(argv)

    benchmark_path = Path(args.benchmark)
    paper_path = Path(args.paper)
    out_dir = Path(args.out)

    # Validate inputs before touching the filesystem, so a mistyped path
    # does not leave an empty output directory behind. ``is_file`` also
    # rejects directories, which would otherwise fail later inside the
    # extractors with a less helpful error.
    if not benchmark_path.is_file():
        raise SystemExit(f"Benchmark not found: {benchmark_path}")
    if not paper_path.is_file():
        raise SystemExit(f"Paper not found: {paper_path}")

    benchmark_txt = extract_pdf_text(benchmark_path)
    paper_txt = extract_docx_text(paper_path)

    # Only create the output directory once both extractions succeeded.
    out_dir.mkdir(parents=True, exist_ok=True)
    benchmark_out = out_dir / "benchmark.txt"
    paper_out = out_dir / "paper.txt"
    benchmark_out.write_text(benchmark_txt, encoding="utf-8")
    paper_out.write_text(paper_txt, encoding="utf-8")

    print("OK")
    print(f"- {benchmark_out}")
    print(f"- {paper_out}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()