-
Notifications
You must be signed in to change notification settings - Fork 9
Open
Description
老哥,我发现一个问题:
如图,这里使用 manga-image-translator 做的“旋转检测”,
实际上是“包含旋转文字的检测”,而不是只检测旋转文字的纯旋转检测。
然后让 Claude Code 帮忙修改了一下,
达到了“伪·只检测旋转矩形”的目的:
bandicam.2025-09-25.19-19-56-952.mp4
不过这个检测结果看得有点胃疼
置信度等参数可以在 \manga-image-translator\manga_translator.py 第21行修改:
def __init__(self, use_cuda=False,img_detect_size=1536,unclip_ratio=0,box_threshold=0.5,text_threshold=0.5
检测相关参数:
- box_threshold=0.7 - 文字框置信度阈值
- 只有置信度 ≥ 0.7 的文字区域才会被保留
- 值越高越严格,漏检增加;值越低检测更全面,误检增加
- text_threshold=0.5 - 文字像素阈值
- 用于二值化文字区域,区分文字和背景
- 影响文字区域边界的精确度
- unclip_ratio=2.2 - 文字框扩展比例
- 检测到的文字框会向外扩展 2.2 倍
- 确保完整包含文字内容
下面是独立的旋转检测服务器启动脚本(检测结果还不是很精准):
#!/usr/bin/env python3
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import torch
import os
import time
import datetime
from bottle import route, run, template, request, static_file
import json
import sys
import base64
from manga_translator import Translator
import cv2
import numpy as np
from utils import Quadrilateral
import math
# Build the shared Translator once at import time. The presence of a marker
# file named "use_cuda" in the working directory selects GPU mode.
if not os.path.exists("use_cuda"):
    t = Translator()
    print("✓ 纯旋转文字检测服务器 - CPU模式")
else:
    t = Translator(use_cuda=True)
    print("✓ 纯旋转文字检测服务器 - GPU模式已启用")
    print(f"CUDA可用: {torch.cuda.is_available()}")
# NOTE(review): indentation was lost in the pasted script; this assignment is
# assumed to apply in both modes rather than only in the CPU branch — confirm.
t.use_ctc_model = False
def calculate_text_angle(quadrilateral):
    """Return the direction of a text region's first edge, in degrees.

    The angle is measured from corner 0 to corner 1 of ``quadrilateral.pts``
    using ``atan2`` and is then wrapped into the half-open range [0, 360).
    """
    corners = quadrilateral.pts
    start, end = corners[0], corners[1]
    heading = math.atan2(end[1] - start[1], end[0] - start[0])
    # Python's modulo takes the sign of the divisor, so negative headings wrap
    # around to the equivalent positive angle.
    return math.degrees(heading) % 360
def is_rotated_text(quadrilateral, tolerance=0):
    """Decide whether a text region is rotated away from the image axes.

    A region counts as "normal" (not rotated) when the angle of its first
    edge lies within ``tolerance`` degrees of 0, 90, 180 or 270 degrees;
    every other region is reported as rotated.

    Args:
        quadrilateral: Object accepted by ``calculate_text_angle``.
        tolerance: Allowed deviation, in degrees, from an axis-aligned angle.
            The default of 0 keeps the strict behaviour in which only exact
            multiples of 90 degrees are treated as normal.

    Returns:
        A ``(is_rotated, angle)`` tuple; ``angle`` is the value returned by
        ``calculate_text_angle``.
    """
    angle = calculate_text_angle(quadrilateral)
    # Distance to the nearest multiple of 90 degrees. Folding through the
    # modulo also covers the wrap-around, so 359.5 is 0.5 degrees away from 0.
    offset = angle % 90
    deviation = min(offset, 90 - offset)
    return deviation > tolerance, angle
# Map the original endpoint paths onto rotation-only detection so existing
# clients keep working.
@route('/ocr_and_mask', method='POST')
@route('/detect', method='POST')
@route('/ocr', method='POST')
def rotation_only_detect():
    """Unified endpoint: run text detection and return only rotated regions.

    Reads the uploaded image from the ``upload`` form file, runs the shared
    translator's detector on it and keeps the regions that
    ``is_rotated_text`` reports as rotated. When the ``generate_mask`` form
    field equals ``"true"`` and at least one rotated region exists, a
    base64-encoded PNG mask for those regions is included as well.

    Returns:
        A dict with ``text_lines`` and ``boxes`` (the same list, kept for
        compatibility) and optionally ``mask``; or a plain error string when
        the upload is missing, has a disallowed extension, or cannot be
        decoded as an image.
    """
    save_path = "./uploaded/"
    os.makedirs(save_path, exist_ok=True)
    generate_mask = request.forms.get('generate_mask')
    tolerance = 1  # degrees of deviation from an axis still treated as normal

    upload = request.files.get('upload')
    if upload is None:
        return "No file uploaded."
    _, ext = os.path.splitext(upload.filename)
    if ext.lower() not in ('.png', '.jpg', '.jpeg'):
        return "File extension not allowed."

    # The stored name is derived from a millisecond timestamp, never from the
    # client-supplied file name.
    savedName = str(int(time.time() * 1000)) + ext
    file_path = "{path}/{file}".format(path=save_path, file=savedName)
    mask_path = "{path}/{file}".format(path=save_path, file=savedName+"-mask.png")
    if os.path.exists(file_path):
        os.remove(file_path)

    try:
        upload.save(file_path)
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            return "Unable to read uploaded image."
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = {}

        # Run the regular detection first, then split the regions by angle.
        all_textlines, mask = t.detect(img_rgb)
        print(f"\n=== 文字区域详细分析 ===")
        normal_regions = []
        rotated_regions = []
        for textline in all_textlines:
            is_rotated, angle = is_rotated_text(textline, tolerance)
            if is_rotated:
                rotated_regions.append((textline, angle))
            else:
                normal_regions.append((textline, angle))

        # Log the filtered-out (axis-aligned) regions.
        for i, (_, angle) in enumerate(normal_regions):
            print(f"水平区域{i + 1}: 角度{angle:.1f}° [已过滤]")
        if normal_regions:
            print("=========")
        # Log the rotated regions that will be returned.
        for i, (_, angle) in enumerate(rotated_regions):
            print(f"旋转区域{i + 1}: 角度{angle:.1f}°")
        rotated_textlines = [textline for textline, _ in rotated_regions]
        print(f"=== 总结: 原始{len(all_textlines)}个,水平{len(normal_regions)}个,旋转{len(rotated_regions)}个 ===\n")

        if generate_mask == "true" and rotated_textlines:
            print("生成旋转文字掩码")
            # Build the mask from the rotated regions only.
            rotated_mask = t.gen_mask(img_rgb, mask, rotated_textlines)
            cv2.imwrite(mask_path, rotated_mask)
            mask_converted = convert_mask(mask_path)
            png = cv2.imencode('.png', mask_converted)[1]
            result["mask"] = base64.b64encode(png).decode("ascii")

        text_lines = [textline_as_map(line) for line in rotated_textlines]
        result["text_lines"] = text_lines
        # Also exposed as "boxes" for compatibility with older clients.
        result["boxes"] = text_lines
        return result
    finally:
        # Always remove the temporary files, even when detection fails.
        for leftover in (file_path, mask_path):
            if os.path.exists(leftover):
                os.remove(leftover)
@route('/getmask', method='POST')
def getmask():
    """Return a PNG mask covering only the rotated text of an uploaded image.

    The image is taken from the ``upload`` form file. Every detected region
    is passed to ``is_rotated_text`` with a tolerance argument of 15, and the
    mask is generated from the regions reported as rotated.

    Returns:
        The mask file served through ``static_file``; or a plain error string
        when the upload is missing, has a disallowed extension, or cannot be
        decoded as an image.
    """
    upload = request.files.get('upload')
    if upload is None:
        return "No file uploaded."
    _, ext = os.path.splitext(upload.filename)
    print(ext.lower())
    if ext.lower() not in ('.png', '.jpg', '.jpeg'):
        return "File extension not allowed."

    savedName = str(int(time.time() * 1000)) + ext
    save_path = "./uploaded/"
    os.makedirs(save_path, exist_ok=True)
    file_path = "{path}/{file}".format(path=save_path, file=savedName)
    mask_path = "{path}/{file}".format(path=save_path, file=savedName+"-mask.png")
    if os.path.exists(file_path):
        os.remove(file_path)

    try:
        upload.save(file_path)
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            return "Unable to read uploaded image."
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detect every text region, then keep only the rotated ones.
        all_textlines, mask = t.detect(img_rgb)
        rotated_textlines = [
            textline
            for textline in all_textlines
            if is_rotated_text(textline, 15)[0]
        ]
        print(f"掩码生成:{len(all_textlines)}个文字区域,{len(rotated_textlines)}个旋转文字")

        # Build the mask from the rotated regions only.
        mask = t.gen_mask(img_rgb, mask, rotated_textlines)
        cv2.imwrite(mask_path, mask)
        convert_mask(mask_path)
    finally:
        # The uploaded source image is no longer needed once the mask exists.
        if os.path.exists(file_path):
            os.remove(file_path)
    # The mask file itself is left in place because the response is served
    # from it.
    return static_file(savedName+"-mask.png", root='uploaded')
def textline_as_map(textline):
    """Flatten a detected text line into a plain dict for the response.

    The dict holds the recognised ``text``, the four corner coordinates as
    integer ``x0``/``y0`` .. ``x3``/``y3`` entries, and the foreground and
    background colour components copied from the text line.
    """
    entry = {"text": textline.text}
    corners = textline.pts
    for idx in range(4):
        entry[f"x{idx}"] = int(corners[idx][0])
        entry[f"y{idx}"] = int(corners[idx][1])
    for channel in ("fg_r", "fg_g", "fg_b", "bg_r", "bg_g", "bg_b"):
        entry[channel] = getattr(textline, channel)
    return entry
def convert_mask(mask_path):
    """Rewrite a mask image on disk as a red-on-transparent BGRA overlay.

    The file at ``mask_path`` is loaded with ``cv2.imread``. Every pure-black
    pixel becomes fully transparent black; every other pixel becomes opaque
    red (B=0, G=0, R=255, A=255). The converted image overwrites
    ``mask_path`` and is also returned.

    Args:
        mask_path: Path of the mask image to convert in place.

    Returns:
        The converted four-channel (B, G, R, A) image array.

    Raises:
        OSError: If the file cannot be read as an image.
    """
    img = cv2.imread(mask_path)
    if img is None:
        raise OSError(f"Cannot read mask image: {mask_path}")
    # True wherever at least one colour channel is non-zero, i.e. the pixel
    # is not pure black. Done as one array operation instead of a per-pixel
    # Python loop.
    marked = img.any(axis=2)
    img_BGRA = np.zeros(img.shape[:2] + (4,), dtype=img.dtype)
    img_BGRA[marked] = (0, 0, 255, 255)
    cv2.imwrite(mask_path, img_BGRA)
    return img_BGRA
@route('/<filepath:path>')
def server_static(filepath):
    """Serve ``filepath`` as a static file from the ``www`` directory."""
    response = static_file(filepath, root='www')
    return response
if __name__ == '__main__':
    # Exactly one command-line argument overrides the default port 8080.
    service_port = int(sys.argv[1]) if len(sys.argv) == 2 else 8080
    # NOTE(review): the banner advertises a 15 degree threshold, but only the
    # /getmask route passes 15 to is_rotated_text — confirm the message
    # matches what the other routes actually do.
    for banner in (
        f"纯旋转文字检测服务器启动在端口 {service_port}",
        "功能:只检测和返回旋转角度大于15度的文字区域",
        "兼容原有接口:/ocr, /detect, /ocr_and_mask, /getmask",
    ):
        print(banner)
    run(host='127.0.0.1', port=service_port)
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels