整理
This commit is contained in:
80
90_scripts/extract_paper_text.py
Normal file
80
90_scripts/extract_paper_text.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Extract text from benchmark PDF and paper DOCX.
|
||||
|
||||
This script exists to work around environments where the chat tooling
|
||||
cannot directly read PDF/DOCX. It generates plain text files that are
|
||||
easy for an agent to read and quote.
|
||||
|
||||
Usage:
|
||||
python 90_scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page of a PDF, each preceded by a page banner.

    Pages are numbered from 1. A page for which ``extract_text()`` returns
    nothing contributes only its banner. Leading/trailing whitespace of the
    combined result is stripped and a single trailing newline is appended.
    """
    from pypdf import PdfReader  # imported inside the function, as in the original

    document = PdfReader(str(pdf_path))

    # Normalise "no text" (None / empty) to an empty string, page by page.
    page_texts = (page.extract_text() or "" for page in document.pages)
    sections = [
        f"\n\n===== PAGE {i} =====\n\n{text}"
        for i, text in enumerate(page_texts, start=1)
    ]
    return "".join(sections).strip() + "\n"
|
||||
|
||||
|
||||
def extract_docx_text(docx_path: Path) -> str:
    """Return the paragraph and table text of a DOCX file as plain text.

    Non-empty paragraphs come first, one per line. Table rows follow after
    all paragraphs (not at their position in the document), one line per
    row with cell texts joined by tabs; rows whose cells are all empty are
    skipped. The result is stripped and ends with a single newline.
    """
    import docx  # python-docx

    document = docx.Document(str(docx_path))

    # Paragraphs: trim whitespace and drop the blank ones.
    trimmed = ((para.text or "").strip() for para in document.paragraphs)
    output = [text for text in trimmed if text]

    # Tables (best-effort): flatten each row into a tab-separated line.
    for table in document.tables:
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            if not any(values):
                continue
            output.append("\t".join(values))

    return "\n".join(output).strip() + "\n"
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: extract both inputs to plain-text files.

    Parses ``--benchmark`` (PDF), ``--paper`` (DOCX) and ``--out``
    (directory), writes ``benchmark.txt`` and ``paper.txt`` (UTF-8) into the
    output directory, then prints ``OK`` followed by the two output paths.

    Raises:
        SystemExit: with a "... not found" message when either input path
            does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
    ap.add_argument("--paper", required=True, help="Path to target paper DOCX")
    ap.add_argument("--out", required=True, help="Output directory")
    args = ap.parse_args()

    benchmark_path = Path(args.benchmark)
    paper_path = Path(args.paper)
    out_dir = Path(args.out)

    # Validate inputs before touching the filesystem, so a mistyped path
    # does not leave an empty output directory behind.
    if not benchmark_path.exists():
        raise SystemExit(f"Benchmark not found: {benchmark_path}")
    if not paper_path.exists():
        raise SystemExit(f"Paper not found: {paper_path}")

    # Extract both documents first; the output directory is only created
    # once there is something to write.
    benchmark_txt = extract_pdf_text(benchmark_path)
    paper_txt = extract_docx_text(paper_path)

    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "benchmark.txt").write_text(benchmark_txt, encoding="utf-8")
    (out_dir / "paper.txt").write_text(paper_txt, encoding="utf-8")

    print("OK")
    print(f"- {out_dir / 'benchmark.txt'}")
    print(f"- {out_dir / 'paper.txt'}")


if __name__ == "__main__":
    main()
|
||||
258
90_scripts/test_docx_upload.py
Normal file
258
90_scripts/test_docx_upload.py
Normal file
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""test_docx_upload.py
|
||||
|
||||
Test whether an OpenAI-compatible /v1/chat/completions endpoint supports
|
||||
direct docx upload + content analysis.
|
||||
|
||||
Flow:
|
||||
1) ping chat/completions
|
||||
2) probe /v1/files (many gateways return 404)
|
||||
3) try two "direct docx" variants:
|
||||
A) messages[].content as an array with {type: input_file, mime_type, data}
|
||||
B) top-level files: [{filename, mime_type, data}]
|
||||
4) fallback: extract docx text locally and send as plain text
|
||||
|
||||
Security:
|
||||
- API key is read from an environment variable only; never written to disk.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import datetime as dt
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
try:
|
||||
from docx import Document
|
||||
except Exception:
|
||||
Document = None
|
||||
|
||||
|
||||
# MIME type declared for the .docx payload in the direct-upload request variants.
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
|
||||
|
||||
def now_stamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    current = dt.datetime.now()
    return format(current, "%Y%m%d_%H%M%S")
|
||||
|
||||
|
||||
def safe_write(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating missing parent directories.

    An existing file at *path* is overwritten.
    """
    folder = path.parent
    folder.mkdir(exist_ok=True, parents=True)
    path.write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
def dump_json(path: Path, obj) -> None:
    """Serialize *obj* as 2-space-indented JSON and write it to *path*.

    Non-ASCII characters are written as-is rather than ``\\u`` escapes.
    """
    serialized = json.dumps(obj, indent=2, ensure_ascii=False)
    safe_write(path, serialized)
|
||||
|
||||
|
||||
def http_head(url: str, headers: dict, timeout: int = 30) -> requests.Response:
    """Send a HEAD request to *url* without following redirects.

    Args:
        url: Target URL.
        headers: Request headers to send.
        timeout: Timeout in seconds passed to ``requests.head``.

    Returns:
        The ``requests`` response object.
    """
    options = {"headers": headers, "timeout": timeout, "allow_redirects": False}
    return requests.head(url, **options)
|
||||
|
||||
|
||||
def http_post_json(url: str, headers: dict, payload: dict, timeout: int = 120) -> requests.Response:
    """POST *payload* to *url* as a JSON body and return the response.

    Args:
        url: Target URL.
        headers: Request headers to send.
        payload: Object passed to ``requests.post`` as its ``json`` argument.
        timeout: Timeout in seconds passed to ``requests.post``.
    """
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    return response
|
||||
|
||||
|
||||
def read_docx_text(docx_path: Path) -> str:
    """Return the non-empty paragraphs of a DOCX file joined by newlines.

    Only ``doc.paragraphs`` is read; each paragraph is whitespace-trimmed
    and blank paragraphs are dropped.

    Raises:
        RuntimeError: if python-docx could not be imported (``Document`` is
            ``None``).
    """
    if Document is None:
        raise RuntimeError("python-docx not installed: pip install python-docx")

    document = Document(str(docx_path))
    kept = []
    for para in document.paragraphs:
        raw = para.text
        if not raw:
            continue
        trimmed = raw.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def truncate_text(s: str, max_chars: int) -> str:
    """Cut *s* down to at most *max_chars* characters and append a marker.

    Text that already fits is returned unchanged. Otherwise the first
    *max_chars* characters are kept and a ``[TRUNCATED]`` line recording the
    original and truncated lengths is appended (so the returned string is
    longer than *max_chars* by the length of that marker).

    A negative *max_chars* is treated as 0. Without the clamp, ``s[:-n]``
    would slice from the end and keep almost the whole text while reporting
    a negative truncated length.
    """
    limit = max(max_chars, 0)
    if len(s) <= limit:
        return s
    return s[:limit] + "\n\n[TRUNCATED] original_len=%d truncated_len=%d" % (len(s), limit)
|
||||
|
||||
|
||||
def summarize_response(resp: requests.Response) -> str:
    """Return a one-line summary of *resp*: status, Content-Type, body size.

    A missing ``Content-Type`` header is rendered as an empty string; the
    size is the length of ``resp.content`` in bytes.
    """
    template = "HTTP %s Content-Type=%s len=%d"
    content_type = resp.headers.get("Content-Type", "")
    fields = (resp.status_code, content_type, len(resp.content))
    return template % fields
|
||||
|
||||
|
||||
def _post_and_record(
    step: str,
    url: str,
    headers: dict,
    payload: dict,
    timeout: int,
    response_path: Path,
) -> requests.Response | None:
    """POST *payload*, save the outcome to *response_path*, print a summary.

    Returns the response, or ``None`` when the request itself failed
    (timeout, connection error, ...). In that case the exception is written
    to *response_path* so the caller can move on to the next strategy
    instead of aborting with a traceback.
    """
    try:
        resp = http_post_json(url, headers=headers, payload=payload, timeout=timeout)
    except requests.RequestException as e:
        safe_write(response_path, "EXCEPTION: %r\n" % (e,))
        print("[%s] exception: %r" % (step, e))
        return None
    safe_write(response_path, resp.text)
    print("[%s] %s" % (step, summarize_response(resp)))
    return resp


def main() -> int:
    """Probe a chat/completions gateway for direct .docx upload support.

    Steps, each leaving its request/response files in the output directory:

    1. Ping ``/v1/chat/completions`` with a trivial message.
    2. HEAD ``/v1/files`` (best-effort probe; its outcome is only recorded).
    3. Try variant A (``input_file`` part inside ``messages[].content``),
       then variant B (top-level ``files`` field).
    4. Fall back to extracting the docx text locally and sending it as
       plain text, truncated to ``--max-text-chars``.

    The API key is read from the environment variable named by
    ``--api-key-env`` and is only placed in request headers, which are not
    written to the output files. The saved request JSON for variants A/B
    does contain the full base64-encoded document.

    Returns:
        0 if variant A, variant B or the text fallback got HTTP 200;
        1 if the ping failed or every strategy failed;
        2 if the API key variable is empty or the docx path does not exist.
    """
    ap = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Test docx upload support for chat/completions gateways",
        # The shell line continuations are written as "\\": in a non-raw
        # string a bare backslash-newline is swallowed by Python, which
        # would collapse the example onto a single line in --help output.
        epilog=textwrap.dedent(
            """\
            Example:
              export API_KEY='sk-***'
              python test_docx_upload.py \\
                --api-base 'http://120.24.249.39:18317' \\
                --model 'gemini-3-pro-preview' \\
                --docx './我的论文/飞机稿_20260130.docx' \\
                --out './_upload_test_out'
            """
        ),
    )
    ap.add_argument("--api-base", required=True, help="e.g. http://120.24.249.39:18317")
    ap.add_argument("--model", required=True, help="e.g. gemini-3-pro-preview")
    ap.add_argument("--docx", required=True, help="Path to .docx")
    ap.add_argument(
        "--prompt",
        default="请用中文提炼该文档的四级大纲,并说明论证逻辑与文风特点。",
        help="Prompt to run against the document",
    )
    ap.add_argument(
        "--api-key-env",
        default="API_KEY",
        help="Environment variable name containing API key (default: API_KEY)",
    )
    ap.add_argument("--timeout", type=int, default=180, help="POST timeout seconds (default: 180)")
    ap.add_argument(
        "--max-text-chars",
        type=int,
        default=60000,
        help="Fallback mode: max extracted text characters to send (default: 60000)",
    )
    ap.add_argument(
        "--out",
        default="./_upload_test_out_%s" % now_stamp(),
        help="Output directory (default: timestamped)",
    )
    args = ap.parse_args()

    api_key = os.getenv(args.api_key_env)
    if not api_key:
        print("[ERROR] env %s is empty; export %s='...'" % (args.api_key_env, args.api_key_env), file=sys.stderr)
        return 2

    api_base = args.api_base.rstrip("/")
    chat_url = api_base + "/v1/chat/completions"
    files_url = api_base + "/v1/files"

    outdir = Path(args.out).resolve()
    outdir.mkdir(parents=True, exist_ok=True)

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer %s" % api_key,
    }
    head_headers = {"Authorization": "Bearer %s" % api_key}

    docx_path = Path(args.docx)
    if not docx_path.exists():
        print("[ERROR] docx not found: %s" % docx_path, file=sys.stderr)
        return 2

    docx_bytes = docx_path.read_bytes()
    docx_b64 = base64.b64encode(docx_bytes).decode("ascii")
    size_mb = len(docx_bytes) / (1024.0 * 1024.0)
    print("[INFO] docx=%s size=%.2fMB out=%s" % (docx_path, size_mb, outdir))

    meta = {
        "api_base": api_base,
        "chat_url": chat_url,
        "files_url": files_url,
        "model": args.model,
        "docx": str(docx_path),
        "docx_size_bytes": len(docx_bytes),
        "prompt": args.prompt,
        "note": "API key is read from env only; never written to output files.",
    }
    dump_json(outdir / "00_meta.json", meta)

    # 1) ping
    ping_payload = {"model": args.model, "messages": [{"role": "user", "content": "ping"}]}
    dump_json(outdir / "01_ping_request.json", ping_payload)
    print("[STEP 1] POST %s ping ..." % chat_url)
    ping_resp = _post_and_record(
        "STEP 1", chat_url, headers, ping_payload, args.timeout, outdir / "01_ping_response.txt"
    )
    if ping_resp is None or ping_resp.status_code != 200:
        print("[ERROR] ping failed; see %s" % (outdir / "01_ping_response.txt"), file=sys.stderr)
        return 1

    # 2) probe /v1/files
    # Best-effort: any failure here is recorded and the run continues, so the
    # broad ``except Exception`` is deliberate.
    print("[STEP 2] HEAD %s ..." % files_url)
    try:
        files_head = http_head(files_url, headers=head_headers, timeout=30)
        safe_write(outdir / "02_files_head_status.txt", "%s\n%s\n" % (files_head.status_code, dict(files_head.headers)))
        print("[STEP 2] HTTP %s" % files_head.status_code)
    except Exception as e:
        safe_write(outdir / "02_files_head_status.txt", "EXCEPTION: %r\n" % (e,))
        print("[STEP 2] exception: %r" % (e,))

    # 3A) input_file in content array
    # NOTE(review): HTTP 200 only shows the gateway accepted the request; a
    # gateway might ignore fields it does not know, so inspect the saved
    # response to confirm the document was actually read.
    print("[STEP 3A] try messages[].content input_file ...")
    payload_a = {
        "model": args.model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": args.prompt},
                    {"type": "input_file", "mime_type": DOCX_MIME, "data": docx_b64},
                ],
            }
        ],
    }
    dump_json(outdir / "03A_input_file_request.json", payload_a)
    resp_a = _post_and_record(
        "STEP 3A", chat_url, headers, payload_a, args.timeout, outdir / "03A_input_file_response.txt"
    )
    if resp_a is not None and resp_a.status_code == 200:
        print("[RESULT] supports variant A (input_file). See 03A_input_file_response.txt")
        return 0

    # 3B) top-level files field (the same HTTP 200 caveat as 3A applies)
    print("[STEP 3B] try top-level files field ...")
    payload_b = {
        "model": args.model,
        "messages": [{"role": "user", "content": args.prompt}],
        "files": [{"filename": docx_path.name, "mime_type": DOCX_MIME, "data": docx_b64}],
    }
    dump_json(outdir / "03B_files_field_request.json", payload_b)
    resp_b = _post_and_record(
        "STEP 3B", chat_url, headers, payload_b, args.timeout, outdir / "03B_files_field_response.txt"
    )
    if resp_b is not None and resp_b.status_code == 200:
        print("[RESULT] supports variant B (top-level files). See 03B_files_field_response.txt")
        return 0

    # 4) fallback: extract text and send as plain content
    print("[STEP 4] fallback: extract docx text and send as plain text ...")
    try:
        extracted = read_docx_text(docx_path)
    except Exception as e:
        safe_write(outdir / "04_fallback_extract_error.txt", "%r\n" % (e,))
        print("[RESULT] direct upload A/B failed; fallback extraction failed: %r" % (e,), file=sys.stderr)
        return 1

    extracted_trunc = truncate_text(extracted, args.max_text_chars)
    safe_write(outdir / "04_fallback_extracted_text.txt", extracted_trunc)

    payload_c = {
        "model": args.model,
        "messages": [
            {
                "role": "user",
                "content": "%s\n\n=== 文档正文(从 docx 提取)===\n%s" % (args.prompt, extracted_trunc),
            }
        ],
    }
    dump_json(outdir / "04_fallback_text_request.json", payload_c)
    resp_c = _post_and_record(
        "STEP 4", chat_url, headers, payload_c, args.timeout, outdir / "04_fallback_text_response.txt"
    )
    if resp_c is not None and resp_c.status_code == 200:
        print("[RESULT] direct upload A/B failed; fallback text mode succeeded.")
        return 0

    print("[RESULT] direct upload A/B failed and fallback failed; inspect output dir: %s" % outdir, file=sys.stderr)
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user