init

2026-04-09 14:55:54 +08:00
commit a2f5875d1b
60 changed files with 5210 additions and 0 deletions
--- a/core/init.py
+++ b/core/init.py
@@ -0,0 +1 @@
+
--- a/core/pycache/init.cpython-311.pyc
+++ b/core/pycache/init.cpython-311.pyc
--- a/core/pycache/actions.cpython-311.pyc
+++ b/core/pycache/actions.cpython-311.pyc
--- a/core/pycache/ocr_client.cpython-311.pyc
+++ b/core/pycache/ocr_client.cpython-311.pyc
--- a/core/actions.py
+++ b/core/actions.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+import pyautogui
+
+from core.ocr_client import UmiClient
+from utils.screenshot import capture_screen
+
+
+@dataclass(frozen=True)
+class MatchResult:
+    """Immutable result of locating a piece of text on screen."""
+
+    # Screen coordinates of the match (can be clicked directly).
+    x: int
+    y: int
+    # The matched text. The current implementation stores the target text
+    # itself; if fuzzy matching is added later, the actually matched string
+    # could be returned here instead.
+    text: str
+
+
+class ActionRunner:
+    def __init__(self, ocr: UmiClient, *, prefer_mss: bool = True, default_region: Optional[tuple[int, int, int, int]] = None):
+        self.ocr = ocr
+        self.prefer_mss = prefer_mss
+        self.default_region = default_region
+
+    def locate_text(
+        self,
+        text: str,
+        *,
+        region: Optional[tuple[int, int, int, int]] = None,
+        exact: bool = True,
+        case_sensitive: bool = False,
+    ) -> Optional[MatchResult]:
+        cap = capture_screen(region or self.default_region, prefer_mss=self.prefer_mss)
+        items = self.ocr.ocr_bytes(cap.image_bytes)
+        pt = self.ocr.find_text(text, items, exact=exact, case_sensitive=case_sensitive)
+        if pt is None:
+            return None
+
+        img_x, img_y = pt
+        left, top, _, _ = cap.region
+        # OCR 坐标是“截图图片像素坐标”，需先缩放到屏幕坐标，再加上截图区域偏移
+        scr_x = int(left + img_x * cap.scale_x)
+        scr_y = int(top + img_y * cap.scale_y)
+        return MatchResult(x=scr_x, y=scr_y, text=text)
+
+    def click_text(
+        self,
+        text: str,
+        *,
+        region: Optional[tuple[int, int, int, int]] = None,
+        exact: bool = True,
+        case_sensitive: bool = False,
+        clicks: int = 1,
+        interval: float = 0.05,
+        button: str = "left",
+        move_duration: float = 0.0,
+        pause: float = 0.05,
+    ) -> MatchResult:
+        pyautogui.PAUSE = pause
+        m = self.locate_text(text, region=region, exact=exact, case_sensitive=case_sensitive)
+        if m is None:
+            raise TimeoutError(f"未找到文字：{text}")
+        pyautogui.moveTo(m.x, m.y, duration=move_duration)
+        pyautogui.click(x=m.x, y=m.y, clicks=clicks, interval=interval, button=button)
+        return m
+
+    def wait_for_text(
+        self,
+        text: str,
+        *,
+        timeout: float = 20.0,
+        interval: float = 0.5,
+        region: Optional[tuple[int, int, int, int]] = None,
+        exact: bool = True,
+        case_sensitive: bool = False,
+    ) -> MatchResult:
+        end = time.time() + timeout
+        while time.time() < end:
+            m = self.locate_text(text, region=region, exact=exact, case_sensitive=case_sensitive)
+            if m is not None:
+                return m
+            time.sleep(interval)
+        raise TimeoutError(f"等待超时：{timeout}s 内未出现文字：{text}")
+
--- a/core/browser.py
+++ b/core/browser.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+"""
+本项目的核心是“桌面 OCR + 点击”。若后续需要操作 Web 端（如 DMS/ERP），可以在此处接入 Playwright。
+当前先提供一个最小骨架，避免影响主流程。
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class BrowserConfig:
+    """Immutable settings holder for the placeholder ``Browser`` class.
+
+    Nothing in this module reads these fields yet; ``Browser`` only stores
+    the config object.
+    """
+
+    # NOTE(review): presumably "run the browser without a visible window"
+    # once a real backend is wired in - confirm against that integration.
+    headless: bool = False
+    # NOTE(review): presumably a per-operation slow-down in milliseconds -
+    # confirm against the future backend.
+    slow_mo_ms: int = 0
+    # NOTE(review): presumably a browser profile directory; None by default.
+    user_data_dir: Optional[str] = None
+
+
+class Browser:
+    def __init__(self, config: BrowserConfig = BrowserConfig()) -> None:
+        self.config = config
+
+    def __enter__(self) -> "Browser":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        return None
+
--- a/core/ocr_client.py
+++ b/core/ocr_client.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import base64
+import json
+from dataclasses import dataclass
+from typing import Any, Iterable, Optional
+
+import requests
+
+
+@dataclass(frozen=True)
+class OcrBox:
+    # 4 个顶点坐标（相对截图图片坐标系）
+    points: tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]
+
+    def center(self) -> tuple[int, int]:
+        xs = [p[0] for p in self.points]
+        ys = [p[1] for p in self.points]
+        return (int(sum(xs) / 4), int(sum(ys) / 4))
+
+
+@dataclass(frozen=True)
+class OcrItem:
+    """One recognized text fragment together with its bounding box."""
+
+    # Recognized text (the parser in this module stores it stripped).
+    text: str
+    # Bounding box of the text in captured-image coordinates.
+    box: OcrBox
+
+
+class UmiClient:
+    """
+    调用 Umi-OCR 的 HTTP API，并将 data_format=dict 的返回解析为 text+box。
+    """
+
+    def __init__(self, url: str = "http://127.0.0.1:1224/api/ocr", timeout_s: float = 15.0) -> None:
+        self.url = url
+        self.timeout_s = timeout_s
+
+    def check_service(self) -> None:
+        """
+        Umi-OCR 没有稳定的 healthz 文档接口，这里用一次轻量请求做连通性检测。
+        只要能建立连接并返回 JSON（即使是业务错误），就认为服务已启动。
+        """
+        try:
+            r = requests.post(self.url, json={"base64": "", "options": {"data_format": "dict"}}, timeout=3)
+            _ = r.text  # 触发实际请求
+        except requests.RequestException as e:
+            raise RuntimeError(f"无法连接 Umi-OCR 服务：{self.url}。请先在 Umi-OCR 中开启 HTTP 服务。") from e
+
+    def ocr_bytes(self, image_bytes: bytes) -> list[OcrItem]:
+        img64 = base64.b64encode(image_bytes).decode("utf-8")
+        payload = {"base64": img64, "options": {"data_format": "dict"}}
+        resp = requests.post(self.url, json=payload, timeout=self.timeout_s)
+        resp.raise_for_status()
+        data = resp.json()
+        return self._parse_umi_dict(data)
+
+    def _parse_umi_dict(self, data: dict[str, Any]) -> list[OcrItem]:
+        # 兼容：当返回不是 dict 时直接报错，方便定位
+        if not isinstance(data, dict):
+            raise ValueError(f"Umi-OCR 返回非 JSON 对象：{type(data)}")
+
+        items: list[OcrItem] = []
+        data_list = data.get("data", [])
+        if not isinstance(data_list, list):
+            raise ValueError(f"Umi-OCR 返回 data 字段不是 list：{json.dumps(data, ensure_ascii=False)[:500]}")
+
+        for it in data_list:
+            if not isinstance(it, dict):
+                continue
+            text = str(it.get("text", "")).strip()
+            box = it.get("box")
+            pts = _coerce_box_points(box)
+            if not text or pts is None:
+                continue
+            items.append(OcrItem(text=text, box=OcrBox(points=pts)))
+        return items
+
+    def find_text(
+        self,
+        target_name: str,
+        items: Iterable[OcrItem],
+        *,
+        exact: bool = True,
+        case_sensitive: bool = False,
+    ) -> Optional[tuple[int, int]]:
+        """
+        在给定 OCR items 中查找目标文字，返回其在截图坐标系下的中心点 (x, y)。
+        """
+        t = target_name if case_sensitive else target_name.lower()
+        for item in items:
+            s = item.text if case_sensitive else item.text.lower()
+            ok = (s == t) if exact else (t in s)
+            if ok:
+                return item.box.center()
+        return None
+
+
+def _coerce_box_points(box: Any) -> Optional[tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]]:
+    """
+    Umi-OCR 的 box 在 data_format=dict 下通常是 4 个点：
+    [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
+    """
+    if not (isinstance(box, list) and len(box) == 4):
+        return None
+    pts: list[tuple[int, int]] = []
+    for p in box:
+        if not (isinstance(p, (list, tuple)) and len(p) == 2):
+            return None
+        pts.append((int(p[0]), int(p[1])))
+    return (pts[0], pts[1], pts[2], pts[3])
+