Files
yidaima_tools/core/ocr_client.py
王鹏 a2f5875d1b init
2026-04-09 14:55:54 +08:00

111 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import base64
import json
from dataclasses import dataclass
from typing import Any, Iterable, Optional
import requests
@dataclass(frozen=True)
class OcrBox:
# 4 个顶点坐标(相对截图图片坐标系)
points: tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]
def center(self) -> tuple[int, int]:
xs = [p[0] for p in self.points]
ys = [p[1] for p in self.points]
return (int(sum(xs) / 4), int(sum(ys) / 4))
@dataclass(frozen=True)
class OcrItem:
text: str
box: OcrBox
class UmiClient:
"""
调用 Umi-OCR 的 HTTP API并将 data_format=dict 的返回解析为 text+box。
"""
def __init__(self, url: str = "http://127.0.0.1:1224/api/ocr", timeout_s: float = 15.0) -> None:
self.url = url
self.timeout_s = timeout_s
def check_service(self) -> None:
"""
Umi-OCR 没有稳定的 healthz 文档接口,这里用一次轻量请求做连通性检测。
只要能建立连接并返回 JSON即使是业务错误就认为服务已启动。
"""
try:
r = requests.post(self.url, json={"base64": "", "options": {"data_format": "dict"}}, timeout=3)
_ = r.text # 触发实际请求
except requests.RequestException as e:
raise RuntimeError(f"无法连接 Umi-OCR 服务:{self.url}。请先在 Umi-OCR 中开启 HTTP 服务。") from e
def ocr_bytes(self, image_bytes: bytes) -> list[OcrItem]:
img64 = base64.b64encode(image_bytes).decode("utf-8")
payload = {"base64": img64, "options": {"data_format": "dict"}}
resp = requests.post(self.url, json=payload, timeout=self.timeout_s)
resp.raise_for_status()
data = resp.json()
return self._parse_umi_dict(data)
def _parse_umi_dict(self, data: dict[str, Any]) -> list[OcrItem]:
# 兼容:当返回不是 dict 时直接报错,方便定位
if not isinstance(data, dict):
raise ValueError(f"Umi-OCR 返回非 JSON 对象:{type(data)}")
items: list[OcrItem] = []
data_list = data.get("data", [])
if not isinstance(data_list, list):
raise ValueError(f"Umi-OCR 返回 data 字段不是 list{json.dumps(data, ensure_ascii=False)[:500]}")
for it in data_list:
if not isinstance(it, dict):
continue
text = str(it.get("text", "")).strip()
box = it.get("box")
pts = _coerce_box_points(box)
if not text or pts is None:
continue
items.append(OcrItem(text=text, box=OcrBox(points=pts)))
return items
def find_text(
self,
target_name: str,
items: Iterable[OcrItem],
*,
exact: bool = True,
case_sensitive: bool = False,
) -> Optional[tuple[int, int]]:
"""
在给定 OCR items 中查找目标文字,返回其在截图坐标系下的中心点 (x, y)。
"""
t = target_name if case_sensitive else target_name.lower()
for item in items:
s = item.text if case_sensitive else item.text.lower()
ok = (s == t) if exact else (t in s)
if ok:
return item.box.center()
return None
def _coerce_box_points(box: Any) -> Optional[tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]]:
"""
Umi-OCR 的 box 在 data_format=dict 下通常是 4 个点:
[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
"""
if not (isinstance(box, list) and len(box) == 4):
return None
pts: list[tuple[int, int]] = []
for p in box:
if not (isinstance(p, (list, tuple)) and len(p) == 2):
return None
pts.append((int(p[0]), int(p[1])))
return (pts[0], pts[1], pts[2], pts[3])