This commit is contained in:
王鹏
2026-04-09 14:55:54 +08:00
commit a2f5875d1b
60 changed files with 5210 additions and 0 deletions

1
core/__init__.py Normal file
View File

@@ -0,0 +1 @@

Binary file not shown.

Binary file not shown.

Binary file not shown.

87
core/actions.py Normal file
View File

@@ -0,0 +1,87 @@
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import Optional
import pyautogui
from core.ocr_client import UmiClient
from utils.screenshot import capture_screen
@dataclass(frozen=True)
class MatchResult:
    """Immutable result of locating a piece of text on screen."""

    # Screen coordinates of the hit; can be passed straight to a click call.
    x: int
    y: int

    # The matched text. The current implementation stores the requested
    # target text itself; a future fuzzy matcher could return the string
    # that was actually matched instead.
    text: str
class ActionRunner:
    """Find text on screen through OCR and act on it with the mouse.

    Every lookup takes a screenshot via ``capture_screen``, sends the image
    bytes to the OCR client, and converts the hit from image pixel
    coordinates into screen coordinates.
    """

    def __init__(
        self,
        ocr: UmiClient,
        *,
        prefer_mss: bool = True,
        default_region: Optional[tuple[int, int, int, int]] = None,
    ):
        """Store the OCR client and the screenshot defaults.

        Args:
            ocr: Client used for recognition and for text matching.
            prefer_mss: Forwarded unchanged to ``capture_screen``.
            default_region: Region used whenever a call passes no region.
                The tuple layout is whatever ``capture_screen`` expects
                (not visible here -- TODO confirm against utils.screenshot).
        """
        self.ocr = ocr
        self.prefer_mss = prefer_mss
        self.default_region = default_region

    def locate_text(
        self,
        text: str,
        *,
        region: Optional[tuple[int, int, int, int]] = None,
        exact: bool = True,
        case_sensitive: bool = False,
    ) -> Optional[MatchResult]:
        """Take one screenshot and look for ``text`` in its OCR result.

        Args:
            text: Text to search for.
            region: Capture region; a falsy value falls back to
                ``self.default_region``.
            exact: Passed to ``find_text``; whole-string match when True,
                substring match when False.
            case_sensitive: Passed to ``find_text``.

        Returns:
            A ``MatchResult`` in screen coordinates, or ``None`` when the
            text is not present in this capture.
        """
        cap = capture_screen(region or self.default_region, prefer_mss=self.prefer_mss)
        items = self.ocr.ocr_bytes(cap.image_bytes)
        pt = self.ocr.find_text(text, items, exact=exact, case_sensitive=case_sensitive)
        if pt is None:
            return None

        img_x, img_y = pt
        left, top, _, _ = cap.region
        # OCR coordinates are pixels of the captured image: scale them to
        # screen units first, then add the offset of the captured region.
        scr_x = int(left + img_x * cap.scale_x)
        scr_y = int(top + img_y * cap.scale_y)
        return MatchResult(x=scr_x, y=scr_y, text=text)

    def click_text(
        self,
        text: str,
        *,
        region: Optional[tuple[int, int, int, int]] = None,
        exact: bool = True,
        case_sensitive: bool = False,
        clicks: int = 1,
        interval: float = 0.05,
        button: str = "left",
        move_duration: float = 0.0,
        pause: float = 0.05,
    ) -> MatchResult:
        """Locate ``text`` once and click it.

        Args:
            text: Text to search for.
            region: Capture region, see ``locate_text``.
            exact: See ``locate_text``.
            case_sensitive: See ``locate_text``.
            clicks: Number of clicks passed to ``pyautogui.click``.
            interval: Delay between clicks passed to ``pyautogui.click``.
            button: Mouse button passed to ``pyautogui.click``.
            move_duration: Duration passed to ``pyautogui.moveTo``.
            pause: Value assigned to ``pyautogui.PAUSE``.

        Returns:
            The ``MatchResult`` that was clicked.

        Raises:
            TimeoutError: The text was not found. No waiting is involved;
                the exception type is kept for existing callers.
        """
        # NOTE: this assigns a module-level pyautogui setting, so the value
        # stays in effect for later pyautogui calls as well.
        pyautogui.PAUSE = pause
        m = self.locate_text(text, region=region, exact=exact, case_sensitive=case_sensitive)
        if m is None:
            raise TimeoutError(f"未找到文字:{text}")
        pyautogui.moveTo(m.x, m.y, duration=move_duration)
        pyautogui.click(x=m.x, y=m.y, clicks=clicks, interval=interval, button=button)
        return m

    def wait_for_text(
        self,
        text: str,
        *,
        timeout: float = 20.0,
        interval: float = 0.5,
        region: Optional[tuple[int, int, int, int]] = None,
        exact: bool = True,
        case_sensitive: bool = False,
    ) -> MatchResult:
        """Poll ``locate_text`` until ``text`` shows up or ``timeout`` elapses.

        At least one lookup is always performed, even when ``timeout`` is
        zero or negative.

        Args:
            text: Text to search for.
            timeout: Maximum number of seconds to keep polling.
            interval: Seconds to sleep between two lookups.
            region: Capture region, see ``locate_text``.
            exact: See ``locate_text``.
            case_sensitive: See ``locate_text``.

        Returns:
            The first ``MatchResult`` found.

        Raises:
            TimeoutError: The text did not appear before the deadline.
        """
        # A monotonic clock keeps the deadline stable when the wall clock
        # is adjusted (NTP sync, manual change) while we are waiting.
        deadline = time.monotonic() + timeout
        while True:
            m = self.locate_text(text, region=region, exact=exact, case_sensitive=case_sensitive)
            if m is not None:
                return m
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(interval, remaining))
        raise TimeoutError(f"等待超时:{timeout}s 内未出现文字:{text}")

28
core/browser.py Normal file
View File

@@ -0,0 +1,28 @@
from __future__ import annotations
"""
本项目的核心是“桌面 OCR + 点击”。若后续需要操作 Web 端(如 DMS/ERP),可以在此处接入 Playwright。
当前先提供一个最小骨架,避免影响主流程。
"""
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class BrowserConfig:
    """Immutable settings carried by ``Browser``.

    Nothing in this file reads these fields yet; they presumably mirror
    browser launch options (headless mode, slow-motion delay in
    milliseconds, profile directory) -- confirm once a real browser backend
    is wired in.
    """

    headless: bool = False
    slow_mo_ms: int = 0
    user_data_dir: Optional[str] = None
class Browser:
    """Minimal browser skeleton usable as a context manager.

    It only stores its configuration; entering yields the instance itself
    and leaving does nothing.
    """

    def __init__(self, config: BrowserConfig = BrowserConfig()) -> None:
        """Keep ``config`` on the instance."""
        self.config = config

    def __enter__(self) -> "Browser":
        """Return this instance as the ``with`` target."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Release nothing and let any in-flight exception propagate."""
        # Falling off the end returns None, so exceptions are not suppressed.

110
core/ocr_client.py Normal file
View File

@@ -0,0 +1,110 @@
from __future__ import annotations
import base64
import json
from dataclasses import dataclass
from typing import Any, Iterable, Optional
import requests
@dataclass(frozen=True)
class OcrBox:
    """Four-cornered box around one recognised text item."""

    # The 4 corner points, expressed in the screenshot image's coordinates.
    points: tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]

    def center(self) -> tuple[int, int]:
        """Return the average of the four corners, truncated to integers."""
        x_sum = sum(corner[0] for corner in self.points)
        y_sum = sum(corner[1] for corner in self.points)
        return int(x_sum / 4), int(y_sum / 4)
@dataclass(frozen=True)
class OcrItem:
    """One OCR hit: the recognised string together with its bounding box."""

    # Recognised text.
    text: str

    # Box enclosing the text.
    box: OcrBox
class UmiClient:
    """Client for the Umi-OCR HTTP API.

    Requests are sent with ``data_format=dict`` and the response is parsed
    into ``OcrItem`` objects (text plus bounding box).
    """

    def __init__(self, url: str = "http://127.0.0.1:1224/api/ocr", timeout_s: float = 15.0) -> None:
        """Remember the endpoint URL and the per-request timeout in seconds."""
        self.url = url
        self.timeout_s = timeout_s

    def check_service(self) -> None:
        """Verify that the Umi-OCR HTTP service can be reached.

        A lightweight request with an empty image is posted. Any HTTP
        response, including an application-level error, counts as "service
        is up"; only transport failures are reported.

        Raises:
            RuntimeError: The request failed with a ``requests`` exception.
        """
        try:
            # requests.post performs the whole exchange before returning,
            # so the response does not need to be inspected.
            requests.post(self.url, json={"base64": "", "options": {"data_format": "dict"}}, timeout=3)
        except requests.RequestException as e:
            raise RuntimeError(f"无法连接 Umi-OCR 服务:{self.url}。请先在 Umi-OCR 中开启 HTTP 服务。") from e

    def ocr_bytes(self, image_bytes: bytes) -> list[OcrItem]:
        """Run OCR on raw image bytes.

        Args:
            image_bytes: Encoded image content; it is sent base64-encoded.

        Returns:
            The parsed OCR items; empty when no text was recognised.

        Raises:
            requests.HTTPError: The server answered with an error status.
            ValueError: The response body has an unexpected structure.
        """
        img64 = base64.b64encode(image_bytes).decode("utf-8")
        payload = {"base64": img64, "options": {"data_format": "dict"}}
        resp = requests.post(self.url, json=payload, timeout=self.timeout_s)
        resp.raise_for_status()
        data = resp.json()
        return self._parse_umi_dict(data)

    def _parse_umi_dict(self, data: dict[str, Any]) -> list[OcrItem]:
        """Convert a decoded Umi-OCR response into a list of ``OcrItem``.

        Entries that are not dicts, have empty text, or carry a malformed
        box are skipped.

        Raises:
            ValueError: ``data`` is not a dict, or its ``data`` field is not
                a list (for anything other than the "no text" status).
        """
        # Fail loudly on a non-object response so the cause is easy to find.
        if not isinstance(data, dict):
            raise ValueError(f"Umi-OCR 返回非 JSON 对象:{type(data)}")

        # Umi-OCR reports "no text in the image" as code 101 with a string
        # in "data". That is a normal empty result, not a protocol error.
        if data.get("code") == 101:
            return []

        items: list[OcrItem] = []
        data_list = data.get("data", [])
        if not isinstance(data_list, list):
            raise ValueError(f"Umi-OCR 返回 data 字段不是 list{json.dumps(data, ensure_ascii=False)[:500]}")

        for it in data_list:
            if not isinstance(it, dict):
                continue
            text = str(it.get("text", "")).strip()
            box = it.get("box")
            pts = _coerce_box_points(box)
            if not text or pts is None:
                continue
            items.append(OcrItem(text=text, box=OcrBox(points=pts)))
        return items

    def find_text(
        self,
        target_name: str,
        items: Iterable[OcrItem],
        *,
        exact: bool = True,
        case_sensitive: bool = False,
    ) -> Optional[tuple[int, int]]:
        """Find the first item whose text matches ``target_name``.

        Args:
            target_name: Text to look for.
            items: OCR items to search, in order.
            exact: Require equality when True; otherwise accept any item
                whose text contains ``target_name``.
            case_sensitive: Compare lower-cased strings when False.

        Returns:
            The centre ``(x, y)`` of the first matching item in screenshot
            image coordinates, or ``None`` when nothing matches.
        """
        t = target_name if case_sensitive else target_name.lower()
        for item in items:
            s = item.text if case_sensitive else item.text.lower()
            ok = (s == t) if exact else (t in s)
            if ok:
                return item.box.center()
        return None
def _coerce_box_points(box: Any) -> Optional[tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]]:
"""
Umi-OCR 的 box 在 data_format=dict 下通常是 4 个点:
[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
"""
if not (isinstance(box, list) and len(box) == 4):
return None
pts: list[tuple[int, int]] = []
for p in box:
if not (isinstance(p, (list, tuple)) and len(p) == 2):
return None
pts.append((int(p[0]), int(p[1])))
return (pts[0], pts[1], pts[2], pts[3])