Skip to content

伪 manga-image-translator 纯旋转检测 #972

@lhj5426

Description

@lhj5426

老哥 我发现

Image

如图,manga-image-translator 自带的旋转检测是"包含旋转文字的检测"(非旋转的文字也会一并检出),

而不是只针对旋转文字的纯旋转检测。

于是让 Claude Code 修改了一下,

达到了"伪·只检测旋转矩形"的目的。

bandicam.2025-09-25.19-19-56-952.mp4

不过这个检测结果看得有点胃疼

置信度可在 \manga-image-translator\manga_translator.py 第 21 行修改:

def __init__(self, use_cuda=False,img_detect_size=1536,unclip_ratio=0,box_threshold=0.5,text_threshold=0.5

检测相关参数:

  • box_threshold=0.7 - 文字框置信度阈值
    • 只有置信度 ≥ 0.7 的文字区域才会被保留
    • 值越高越严格,漏检增加;值越低检测更全面,误检增加
  • text_threshold=0.5 - 文字像素阈值
    • 用于二值化文字区域,区分文字和背景
    • 影响文字区域边界的精确度
  • unclip_ratio=2.2 - 文字框扩展比例
    • 检测到的文字框会向外扩展 2.2 倍
    • 确保完整包含文字内容

检测结果不是很精准。下面是独立的旋转检测服务器启动脚本:

#!/usr/bin/env python3

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import torch

import os
import time
import datetime
from bottle import route, run, template, request, static_file
import json
import sys
import base64
from manga_translator import Translator
import cv2
import numpy as np
from utils import Quadrilateral
import math

# Build the shared Translator instance used by every route below.
# GPU mode is opted into by creating a file or directory named "use_cuda"
# in the current working directory; otherwise the default constructor is used.
if not os.path.exists("use_cuda"):
    t = Translator()
    print("✓ 纯旋转文字检测服务器 - CPU模式")
else:
    t = Translator(use_cuda=True)
    print("✓ 纯旋转文字检测服务器 - GPU模式已启用")
    print(f"CUDA可用: {torch.cuda.is_available()}")

# NOTE(review): presumably switches off a CTC-based recognition model that
# detection-only use does not need -- confirm against Translator.
t.use_ctc_model = False

def calculate_text_angle(quadrilateral):
    """Return the direction of the region's first edge, in degrees.

    The edge runs from ``quadrilateral.pts[0]`` to ``quadrilateral.pts[1]``.
    Its direction is measured with ``atan2(dy, dx)`` and folded into the
    range starting at 0 with a modulo-360.
    """
    corners = quadrilateral.pts
    start, end = corners[0], corners[1]
    delta_x = end[0] - start[0]
    delta_y = end[1] - start[1]
    # atan2 yields (-180, 180]; the modulo maps negative angles upward.
    return math.degrees(math.atan2(delta_y, delta_x)) % 360

def is_rotated_text(quadrilateral, tolerance=0):
    """Classify a detected text region as rotated or axis-aligned.

    A region counts as axis-aligned when the angle of its first edge lies
    within ``tolerance`` degrees of 0, 90, 180 or 270; anything else is
    reported as rotated.

    Args:
        quadrilateral: Region object accepted by ``calculate_text_angle``
            (it must expose ``pts``).
        tolerance: Largest deviation, in degrees, from an axis-aligned angle
            that is still treated as not rotated. The default of 0 keeps the
            strict behaviour where only exact multiples of 90 are "normal".
            Negative values are treated as 0.

    Returns:
        A ``(is_rotated, angle)`` tuple, where ``angle`` is the value returned
        by ``calculate_text_angle``.
    """
    angle = calculate_text_angle(quadrilateral)

    # Distance to the nearest multiple of 90 degrees. Folding with the modulo
    # also covers angles just below 360, which are close to 0.
    offset = angle % 90
    deviation = min(offset, 90 - offset)

    return deviation > max(tolerance, 0), angle

# 将原来的接口路径映射到旋转检测,保持兼容性
# The legacy endpoint paths are all mapped onto rotation-only detection so
# existing clients keep working.
@route('/ocr_and_mask', method='POST')
@route('/detect', method='POST')
@route('/ocr', method='POST')
def rotation_only_detect():
    """Detect text regions in an uploaded image and return only rotated ones.

    Reads the ``upload`` file field and the optional ``generate_mask`` form
    field from the current request. The image is saved under ``./uploaded/``,
    run through ``t.detect``, and every detected region is classified with
    ``is_rotated_text``.

    Returns:
        A dict with ``text_lines`` and ``boxes`` (the same list, kept for
        compatibility) describing the rotated regions. When ``generate_mask``
        equals ``"true"`` and at least one rotated region was found, it also
        carries ``mask``: a base64-encoded PNG built from ``t.gen_mask`` and
        ``convert_mask``. A plain message string is returned instead when the
        upload is missing, has a disallowed extension, or cannot be decoded.

    The temporary image and mask files are removed before returning, including
    when detection raises.
    """
    save_path = "./uploaded/"
    os.makedirs(save_path, exist_ok=True)
    timestamp = str(int(time.time() * 1000))
    generate_mask = request.forms.get('generate_mask')
    tolerance = 1  # degrees of deviation from 0/90/180/270 still treated as normal

    upload = request.files.get('upload')
    if upload is None:
        # Without this guard a request lacking the file field raised
        # AttributeError on ``upload.filename``.
        return "No file uploaded."

    _, ext = os.path.splitext(upload.filename)
    if ext.lower() not in ('.png', '.jpg', '.jpeg'):
        return "File extension not allowed."

    savedName = timestamp + ext
    file_path = "{path}/{file}".format(path=save_path, file=savedName)
    mask_path = "{path}/{file}".format(path=save_path, file=savedName + "-mask.png")

    if os.path.exists(file_path):
        os.remove(file_path)
    upload.save(file_path)

    try:
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable or corrupt image by returning None.
            return "Failed to decode image."
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = {}

        # Run the regular detection first; filtering happens afterwards.
        all_textlines, mask = t.detect(img_rgb)

        print(f"\n=== 文字区域详细分析 ===")

        # Split the detections into axis-aligned and rotated groups.
        normal_regions = []
        rotated_regions = []
        for textline in all_textlines:
            is_rotated, angle = is_rotated_text(textline, tolerance)
            bucket = rotated_regions if is_rotated else normal_regions
            bucket.append((textline, angle))

        # Log the filtered-out (axis-aligned) regions.
        for index, (_, angle) in enumerate(normal_regions, start=1):
            print(f"水平区域{index}: 角度{angle:.1f}° [已过滤]")

        if normal_regions:
            print("=========")

        # Log the rotated regions that are kept.
        for index, (_, angle) in enumerate(rotated_regions, start=1):
            print(f"旋转区域{index}: 角度{angle:.1f}°")
        rotated_textlines = [textline for textline, _ in rotated_regions]

        print(f"=== 总结: 原始{len(all_textlines)}个,水平{len(normal_regions)}个,旋转{len(rotated_regions)}个 ===\n")

        if generate_mask == "true" and rotated_textlines:
            print("生成旋转文字掩码")
            # Build the mask from the rotated regions only.
            rotated_mask = t.gen_mask(img_rgb, mask, rotated_textlines)
            cv2.imwrite(mask_path, rotated_mask)
            mask_converted = convert_mask(mask_path)
            png = cv2.imencode('.png', mask_converted)[1]
            result["mask"] = base64.b64encode(png).decode('ascii')

        text_lines = [textline_as_map(line) for line in rotated_textlines]
        result["text_lines"] = text_lines
        # Same payload under the older key, for compatibility.
        result["boxes"] = text_lines
        return result
    finally:
        # Always clean up the temporary files, even when detection fails.
        for leftover in (mask_path, file_path):
            if os.path.exists(leftover):
                os.remove(leftover)

@route('/getmask', method='POST')
def getmask():
    """Build and serve a mask image covering only the rotated text regions.

    Reads the ``upload`` file field from the current request, saves it under
    ``./uploaded/``, detects text with ``t.detect``, keeps the regions that
    ``is_rotated_text`` reports as rotated (called with a tolerance of 15),
    and writes the converted mask next to the upload.

    Returns:
        The ``static_file`` response for the generated ``*-mask.png``, or a
        plain message string when the upload is missing, has a disallowed
        extension, or cannot be decoded.

    The uploaded source image is deleted once processing ends. The mask file
    is left on disk because ``static_file`` serves it from there.
    """
    upload = request.files.get('upload')
    if upload is None:
        # Without this guard a request lacking the file field raised
        # AttributeError on ``upload.filename``.
        return "No file uploaded."

    _, ext = os.path.splitext(upload.filename)
    print(ext.lower())
    if ext.lower() not in ('.png', '.jpg', '.jpeg'):
        return "File extension not allowed."

    timestamp = str(int(time.time() * 1000))
    savedName = timestamp + ext
    save_path = "./uploaded/"
    os.makedirs(save_path, exist_ok=True)
    file_path = "{path}/{file}".format(path=save_path, file=savedName)
    mask_path = "{path}/{file}".format(path=save_path, file=savedName + "-mask.png")
    if os.path.exists(file_path):
        os.remove(file_path)
    upload.save(file_path)

    try:
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable or corrupt image by returning None.
            return "Failed to decode image."
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detect every text region, then keep the rotated ones.
        all_textlines, mask = t.detect(img_rgb)
        rotated_textlines = [
            textline
            for textline in all_textlines
            if is_rotated_text(textline, 15)[0]
        ]

        print(f"掩码生成:{len(all_textlines)}个文字区域,{len(rotated_textlines)}个旋转文字")

        # Build the mask from the rotated regions only.
        mask = t.gen_mask(img_rgb, mask, rotated_textlines)
        cv2.imwrite(mask_path, mask)
        convert_mask(mask_path)
    finally:
        # The source image is no longer needed once the mask has been written.
        if os.path.exists(file_path):
            os.remove(file_path)

    return static_file(savedName + "-mask.png", root='uploaded')

def textline_as_map(textline):
    """Flatten a text line into a plain dict.

    The result holds ``text``, the four corner points as integer
    ``x0``/``y0`` .. ``x3``/``y3`` taken from ``textline.pts``, and the
    foreground/background colour components copied from the ``fg_*`` and
    ``bg_*`` attributes.
    """
    mapped = {"text": textline.text}

    corners = textline.pts
    for idx in range(4):
        mapped[f"x{idx}"] = int(corners[idx][0])
        mapped[f"y{idx}"] = int(corners[idx][1])

    for channel in ("fg_r", "fg_g", "fg_b", "bg_r", "bg_g", "bg_b"):
        mapped[channel] = getattr(textline, channel)

    return mapped

def convert_mask(mask_path):
    """Rewrite the mask image at ``mask_path`` as a red-on-transparent BGRA image.

    Pixels whose three colour channels are all zero become fully transparent
    (alpha 0). Every other pixel becomes opaque pure red (B=0, G=0, R=255,
    alpha 255). The file is overwritten in place.

    Args:
        mask_path: Path of the mask image to read and overwrite.

    Returns:
        The converted image as a 4-channel array.

    Raises:
        ValueError: If the image at ``mask_path`` cannot be read.
    """
    img = cv2.imread(mask_path)
    if img is None:
        raise ValueError("Cannot read mask image: {}".format(mask_path))

    # True wherever at least one colour channel is non-zero. This replaces the
    # per-pixel Python loop with one vectorised pass that gives the same result.
    marked = img.any(axis=2)

    img_BGRA = np.zeros(img.shape[:2] + (4,), dtype=img.dtype)
    img_BGRA[marked, 2] = 255  # red channel
    img_BGRA[marked, 3] = 255  # alpha channel

    cv2.imwrite(mask_path, img_BGRA)
    return img_BGRA

@route('/<filepath:path>')
def server_static(filepath):
    """Serve any other requested path as a static file from the ``www`` directory."""
    static_root = 'www'
    return static_file(filepath, root=static_root)

if __name__ == '__main__':
    # Use the port given as the single command-line argument; otherwise 8080.
    service_port = int(sys.argv[1]) if len(sys.argv) == 2 else 8080

    for banner_line in (
        f"纯旋转文字检测服务器启动在端口 {service_port}",
        "功能:只检测和返回旋转角度大于15度的文字区域",
        "兼容原有接口:/ocr, /detect, /ocr_and_mask, /getmask",
    ):
        print(banner_line)

    # Listen on the loopback interface only.
    run(host='127.0.0.1', port=service_port)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions