81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
|
|
"""Extract text from benchmark PDF and paper DOCX.
|
||
|
|
|
||
|
|
This script exists to work around environments where the chat tooling
|
||
|
|
cannot directly read PDF/DOCX. It generates plain text files that are
|
||
|
|
easy for an agent to read and quote.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import os
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
|
||
|
|
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page of a PDF as one string.

    Each page is introduced by a ``===== PAGE <n> =====`` banner, numbered
    from 1. A page for which the reader yields no text still gets its banner,
    followed by an empty body.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The concatenated page sections with surrounding whitespace removed
        and exactly one trailing newline appended.
    """
    # Third-party dependency; imported only when this function is called.
    from pypdf import PdfReader

    document = PdfReader(str(pdf_path))
    sections = [
        f"\n\n===== PAGE {number} =====\n\n{page.extract_text() or ''}"
        for number, page in enumerate(document.pages, start=1)
    ]
    combined = "".join(sections)
    return combined.strip() + "\n"
|
||
|
|
|
||
|
|
|
||
|
|
def extract_docx_text(docx_path: Path) -> str:
    """Return the paragraph and table text of a DOCX file as plain text.

    Non-empty paragraphs come first, one per line. Table rows follow, one
    line per row with cell texts joined by tabs; rows whose cells are all
    empty are skipped. Because tables are appended after all paragraphs,
    their position relative to the surrounding text is not preserved.

    Args:
        docx_path: Location of the DOCX file to read.

    Returns:
        The collected lines joined by newlines, with surrounding whitespace
        removed and exactly one trailing newline appended.
    """
    import docx  # provided by the python-docx package

    document = docx.Document(str(docx_path))

    # Paragraphs: trim each one and keep only those with content left.
    trimmed = ((para.text or "").strip() for para in document.paragraphs)
    output = [text for text in trimmed if text]

    # Tables (best-effort): flatten every row of every table into one line.
    row_values = (
        [cell.text.strip() for cell in row.cells]
        for table in document.tables
        for row in table.rows
    )
    output.extend("\t".join(values) for values in row_values if any(values))

    return "\n".join(output).strip() + "\n"
|
||
|
|
|
||
|
|
|
||
|
|
def main(argv: list[str] | None = None) -> None:
    """Extract the benchmark PDF and the paper DOCX into plain-text files.

    Writes ``benchmark.txt`` and ``paper.txt`` (UTF-8) into the output
    directory and prints their paths.

    Args:
        argv: Argument list to parse instead of the process command line.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``,
            which is the behavior of a plain ``main()`` call.

    Raises:
        SystemExit: If the benchmark or the paper path is not an existing
            regular file, or if argparse rejects the arguments.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
    ap.add_argument("--paper", required=True, help="Path to target paper DOCX")
    ap.add_argument("--out", required=True, help="Output directory")
    args = ap.parse_args(argv)

    benchmark_path = Path(args.benchmark)
    paper_path = Path(args.paper)
    out_dir = Path(args.out)

    # Validate the inputs before touching the filesystem so that a wrong path
    # does not leave an empty output directory behind. is_file() also rejects
    # directories, which would otherwise fail later inside the parsers.
    if not benchmark_path.is_file():
        raise SystemExit(f"Benchmark not found: {benchmark_path}")
    if not paper_path.is_file():
        raise SystemExit(f"Paper not found: {paper_path}")

    benchmark_txt = extract_pdf_text(benchmark_path)
    paper_txt = extract_docx_text(paper_path)

    # Create the output directory only once both extractions have succeeded.
    out_dir.mkdir(parents=True, exist_ok=True)
    benchmark_out = out_dir / "benchmark.txt"
    paper_out = out_dir / "paper.txt"
    benchmark_out.write_text(benchmark_txt, encoding="utf-8")
    paper_out.write_text(paper_txt, encoding="utf-8")

    print("OK")
    print(f"- {benchmark_out}")
    print(f"- {paper_out}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|