from __future__ import annotations import base64 import json from dataclasses import dataclass from typing import Any, Iterable, Optional import requests @dataclass(frozen=True) class OcrBox: # 4 个顶点坐标(相对截图图片坐标系) points: tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]] def center(self) -> tuple[int, int]: xs = [p[0] for p in self.points] ys = [p[1] for p in self.points] return (int(sum(xs) / 4), int(sum(ys) / 4)) @dataclass(frozen=True) class OcrItem: text: str box: OcrBox class UmiClient: """ 调用 Umi-OCR 的 HTTP API,并将 data_format=dict 的返回解析为 text+box。 """ def __init__(self, url: str = "http://127.0.0.1:1224/api/ocr", timeout_s: float = 15.0) -> None: self.url = url self.timeout_s = timeout_s def check_service(self) -> None: """ Umi-OCR 没有稳定的 healthz 文档接口,这里用一次轻量请求做连通性检测。 只要能建立连接并返回 JSON(即使是业务错误),就认为服务已启动。 """ try: r = requests.post(self.url, json={"base64": "", "options": {"data_format": "dict"}}, timeout=3) _ = r.text # 触发实际请求 except requests.RequestException as e: raise RuntimeError(f"无法连接 Umi-OCR 服务:{self.url}。请先在 Umi-OCR 中开启 HTTP 服务。") from e def ocr_bytes(self, image_bytes: bytes) -> list[OcrItem]: img64 = base64.b64encode(image_bytes).decode("utf-8") payload = {"base64": img64, "options": {"data_format": "dict"}} resp = requests.post(self.url, json=payload, timeout=self.timeout_s) resp.raise_for_status() data = resp.json() return self._parse_umi_dict(data) def _parse_umi_dict(self, data: dict[str, Any]) -> list[OcrItem]: # 兼容:当返回不是 dict 时直接报错,方便定位 if not isinstance(data, dict): raise ValueError(f"Umi-OCR 返回非 JSON 对象:{type(data)}") items: list[OcrItem] = [] data_list = data.get("data", []) if not isinstance(data_list, list): raise ValueError(f"Umi-OCR 返回 data 字段不是 list:{json.dumps(data, ensure_ascii=False)[:500]}") for it in data_list: if not isinstance(it, dict): continue text = str(it.get("text", "")).strip() box = it.get("box") pts = _coerce_box_points(box) if not text or pts is None: continue items.append(OcrItem(text=text, box=OcrBox(points=pts))) return items def find_text( self, target_name: str, items: Iterable[OcrItem], *, exact: bool = True, case_sensitive: bool = False, ) -> Optional[tuple[int, int]]: """ 在给定 OCR items 中查找目标文字,返回其在截图坐标系下的中心点 (x, y)。 """ t = target_name if case_sensitive else target_name.lower() for item in items: s = item.text if case_sensitive else item.text.lower() ok = (s == t) if exact else (t in s) if ok: return item.box.center() return None def _coerce_box_points(box: Any) -> Optional[tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]]: """ Umi-OCR 的 box 在 data_format=dict 下通常是 4 个点: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] """ if not (isinstance(box, list) and len(box) == 4): return None pts: list[tuple[int, int]] = [] for p in box: if not (isinstance(p, (list, tuple)) and len(p) == 2): return None pts.append((int(p[0]), int(p[1]))) return (pts[0], pts[1], pts[2], pts[3])