Python使用PaddleOCR實現PDF/圖片文字識別與版面還原

更新時間：2025年11月28日 09:10:29 作者：weixin_46244623

本文介紹了如何利用 PaddleOCR 實現對 PDF 文件或圖片的文字識別,并在識別后將文本內容按照原始版面位置進行還原重建,感興趣的小伙伴可以了解下

摘要
1. 初始化 PaddleOCR（使用高精度服務器模型）
2. 執行 OCR 識別
3. 讀取原圖獲取尺寸
4. 創建 SVG 根節點（透明背景）
5. 內嵌仿宋字體（Base64 編碼）
6. 智能繪制文字（支持旋轉 + 豎排）
7. 保存 SVG 文件
8.效果展示
9.完整代碼

摘要

本文介紹了如何利用 PaddleOCR 實現對 PDF 文件或圖片 的文字識別，并在識別后將文本內容按照 原始版面位置 進行還原重建。文章詳細講解了實現流程，包括 圖像預處理、OCR 識別、版面坐標提取與重排、以及最終生成 可編輯的 PDF 或可視化輸出 的過程。

本文將帶你使用 PaddleOCR 實現一個完整流程：

# Upgrade pip
python -m pip install --upgrade pip
 
# Point pip at the Tsinghua mirror to speed up downloads (optional)
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 
# Install the CPU build of PaddlePaddle
python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
 
# Install the helper libraries
python -m pip install PyMuPDF Pillow reportlab tqdm beautifulsoup4
 
# Install a pinned PaddleOCR version
python -m pip install paddleocr==3.2.0

1. 初始化 PaddleOCR（使用高精度服務器模型）

import os
import cv2
import json
import numpy as np
import base64
from xml.etree.ElementTree import Element, SubElement, tostring
from paddleocr import PaddleOCR

# Options for the OCR pipeline: PP-OCRv5 server models for detection and
# recognition, with the three `use_*` flags all set to False.
_ocr_options = {
    "text_detection_model_name": "PP-OCRv5_server_det",
    "text_recognition_model_name": "PP-OCRv5_server_rec",
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr = PaddleOCR(**_ocr_options)

2. 執行 OCR 識別

# Run OCR on the input image; the return value of `predict` is indexed below,
# so it is treated as a sequence of results.
INPUT_IMG = "./page_4.png"
result = ocr.predict(INPUT_IMG)
data = result[0]  # take the first page's result

返回結構包含：

rec_texts: 識別的文本列表
rec_polys: 每個文本的多邊形坐標（4個點）
input_path: 原圖路徑

3. 讀取原圖獲取尺寸

# Read the source image to obtain its pixel dimensions.
orig_img = cv2.imread(data["input_path"])
# cv2.imread reports an unreadable or missing file by returning None rather
# than raising, so check explicitly instead of failing later on `.shape`.
if orig_img is None:
    raise ValueError(f"無法讀取原圖: {data['input_path']}")
# shape is (height, width[, channels]); only the first two entries are needed.
img_h, img_w = orig_img.shape[:2]
print(f"原圖尺寸: {img_w} x {img_h}")

4. 創建 SVG 根節點（透明背景）

# Root <svg> element: canvas and viewBox both use the source image's pixel
# size, and the inline style declares no background.
svg = Element("svg")
svg.set("xmlns", "http://www.w3.org/2000/svg")
svg.set("width", str(img_w))
svg.set("height", str(img_h))
svg.set("viewBox", f"0 0 {img_w} {img_h}")
svg.set("style", "background:none")

5. 內嵌仿宋字體（Base64 編碼）

FONT_PATH = os.path.expanduser("~/.paddlex/fonts/simfang.ttf")

# Read the font file and Base64-encode it so the @font-face rule below can
# carry it inline as a data URI.
with open(FONT_PATH, "rb") as f:
    font_data = base64.b64encode(f.read()).decode("utf-8")

# A <style> child of the root holds the embedded font plus the defaults that
# apply to every <text> element (centered anchor, middle baseline, black fill).
style_el = SubElement(svg, "style")
style_el.text = f"""
@font-face {{
  font-family: 'SimFang';
  src: url(data:font/truetype;charset=utf-8;base64,{font_data}) format('truetype');
}}
text {{
  font-family: 'SimFang';
  fill: rgb(0,0,0);
  dominant-baseline: middle;
  text-anchor: middle;
  white-space: pre;
}}
"""

6. 智能繪制文字（支持旋轉 + 豎排）

# Put every recognized string back at the location of its detection polygon.
for text, poly in zip(data["rec_texts"], data["rec_polys"]):
    # Whitespace-only recognitions have nothing to draw.
    if not text.strip():
        continue

    # Axis-aligned bounding box of the four polygon vertices.
    box = np.array(poly, dtype=np.float32).reshape(4, 2)
    (x0, y0), (x1, y1) = box.min(axis=0), box.max(axis=0)
    w_box = x1 - x0
    h_box = y1 - y0

    # Rotation, in degrees, of the edge from the first to the second vertex.
    dx = box[1][0] - box[0][0]
    dy = box[1][1] - box[0][1]
    angle = np.degrees(np.arctan2(dy, dx))

    # Font size follows the short side of the box, with a floor of 8.
    font_size = max(8, int(min(w_box, h_box) * 0.8))

    # Horizontal center is shared by both layouts.
    cx = (x0 + x1) / 2

    # Boxes much taller than wide (and taller than 60 px) count as vertical.
    vertical = h_box > 2.5 * w_box and h_box > 60

    if not vertical:
        # Horizontal text: a single <text> rotated about the box center.
        cy = (y0 + y1) / 2
        text_el = SubElement(svg, "text", {
            "x": str(cx),
            "y": str(cy),
            "font-size": str(font_size),
            "transform": f"rotate({angle},{cx},{cy})"
        })
        text_el.text = text
        continue

    # Vertical text: one <text> per character, each centered in an equal
    # slice of the box height, walking downward from the top edge.
    gap = h_box / max(len(text), 1)
    y = y0
    for ch in text:
        y_mid = y + gap / 2
        text_el = SubElement(svg, "text", {
            "x": str(cx),
            "y": str(y_mid),
            "font-size": str(font_size),
            "transform": f"rotate({angle},{cx},{y_mid})"
        })
        text_el.text = ch
        y += gap

7. 保存 SVG 文件

OUTPUT_SVG = "page_1_transparent.svg"

# Serialize the tree (UTF-8, with XML declaration) before touching the file,
# then write the bytes out in one go.
svg_bytes = tostring(svg, encoding="utf-8", xml_declaration=True)
with open(OUTPUT_SVG, "wb") as f:
    f.write(svg_bytes)

print(f"已生成透明可復(fù)制文字 SVG: {OUTPUT_SVG}")

8.效果展示

9.完整代碼

import os
import cv2
import json
import numpy as np
import base64
from xml.etree.ElementTree import Element, SubElement, tostring
from paddleocr import PaddleOCR
# ================== Configuration ==================
# OCR pipeline options: PP-OCRv5 server models for detection and recognition,
# with the three `use_*` flags all set to False.
_ocr_options = {
    "text_detection_model_name": "PP-OCRv5_server_det",
    "text_recognition_model_name": "PP-OCRv5_server_rec",
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr = PaddleOCR(**_ocr_options)

# Image handed to ocr.predict.
INPUT_IMG = "./page_4.png"
# Not referenced anywhere in the visible script.
OUTPUT_PDF = "page_1_restored.pdf"
# The OCR result is saved to this path and reloaded from it afterwards.
INPUT_JSON = "./page_4_res.json"
# Font file that gets embedded into the SVG.
FONT_PATH = os.path.expanduser("~/.paddlex/fonts/simfang.ttf")
# Not referenced anywhere in the visible script.
SCALE = 3


# ================== 1. Run OCR via predict ==================
print("正在執(zhí)行 OCR: result = ocr.predict(INPUT_IMG)")
try:
    results = ocr.predict(INPUT_IMG)
except Exception as e:
    # This code is not inside a loop, so there is no next item to skip to,
    # and without an OCR result nothing below can run: report the failure
    # and let the exception propagate.
    print(f"\nOCR 失敗頁(yè): {e}")
    raise

# ---------- Save the JSON result and the image with boxes drawn ----------
# Every result is written to the same two paths, so if `results` holds more
# than one entry the last one wins.
for res in results:
    res.save_to_img("page_boxed.png")
    res.save_to_json(INPUT_JSON)

# ================== Configuration ==================
FONT_PATH  = os.path.expanduser("~/.paddlex/fonts/simfang.ttf")
OUTPUT_SVG = "page_1_transparent.svg"
TEXT_COLOR = (0, 0, 0)  # black text
# Stop immediately if the font to embed is missing.
if not os.path.exists(FONT_PATH):
    raise FileNotFoundError(f"字體未找到: {FONT_PATH}")

# ================== 1. Load the OCR JSON ==================
if not os.path.exists(INPUT_JSON):
    raise FileNotFoundError(f"OCR 結(jié)果未找到: {INPUT_JSON}")
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Recognized strings and their polygons; the two lists are walked in
# lockstep when the text is drawn.
texts, polys = data["rec_texts"], data["rec_polys"]

# The JSON also records which image was recognized; it must still exist on
# disk because its size is read next.
input_path = data.get("input_path")
if not (input_path and os.path.exists(input_path)):
    raise FileNotFoundError(f"原圖未找到: {input_path}")

# ================== 2. Get the source image size ==================
orig_img = cv2.imread(input_path)
# A None result is treated as "image could not be read".
if orig_img is None:
    raise ValueError(f"無法讀取原圖: {input_path}")
# Only the first two shape entries (height, width) are used.
img_h, img_w = orig_img.shape[:2]
print(f"原圖尺寸: {img_w} x {img_h}")

# ================== 3. Create the SVG root element ==================
# Canvas and viewBox both use the source image's pixel size; the inline
# style declares no background.
svg = Element("svg")
svg.set("xmlns", "http://www.w3.org/2000/svg")
svg.set("width", str(img_w))
svg.set("height", str(img_h))
svg.set("viewBox", f"0 0 {img_w} {img_h}")
svg.set("style", "background:none")

# ================== 4. Embed the font (SimFang) ==================
if not os.path.exists(FONT_PATH):
    raise FileNotFoundError(f"字體未找到: {FONT_PATH}")

# Read the font and Base64-encode it so the @font-face rule can carry it
# inline as a data URI.
with open(FONT_PATH, "rb") as f:
    font_bytes = f.read()
font_data = base64.b64encode(font_bytes).decode("utf-8")

# The <style> element holds the embedded font and the defaults applied to
# every <text> element; the fill colour comes from TEXT_COLOR.
red, green, blue = TEXT_COLOR
style_el = SubElement(svg, "style")
style_el.text = f"""
@font-face {{
  font-family: 'SimFang';
  src: url(data:font/truetype;charset=utf-8;base64,{font_data}) format('truetype');
}}
text {{
  font-family: 'SimFang';
  fill: rgb({red}, {green}, {blue});
  dominant-baseline: middle;
  text-anchor: middle;
  white-space: pre;
}}
"""

# ================== 5. Draw the text (transparent background) ==================
for text, poly in zip(texts, polys):
    # Whitespace-only recognitions have nothing to draw.
    if not text.strip():
        continue

    # Axis-aligned bounding box of the four polygon vertices.
    box = np.array(poly, dtype=np.float32).reshape(4, 2)
    (x0, y0), (x1, y1) = box.min(axis=0), box.max(axis=0)
    w_box = x1 - x0
    h_box = y1 - y0

    # Rotation, in degrees, of the edge from the first to the second vertex.
    dx = box[1][0] - box[0][0]
    dy = box[1][1] - box[0][1]
    angle = np.degrees(np.arctan2(dy, dx))

    # Font size follows the short side of the box, with a floor of 8.
    font_size = max(8, int(min(w_box, h_box) * 0.8))

    # Horizontal center is shared by both layouts.
    cx = (x0 + x1) / 2

    # Boxes much taller than wide (and taller than 60 px) count as vertical.
    vertical = h_box > 2.5 * w_box and h_box > 60

    if not vertical:
        # Horizontal text: a single <text> rotated about the box center.
        cy = (y0 + y1) / 2
        text_el = SubElement(svg, "text", {
            "x": str(cx),
            "y": str(cy),
            "font-size": str(font_size),
            "transform": f"rotate({angle},{cx},{cy})"
        })
        text_el.text = text
        continue

    # Vertical text: one <text> per character, each centered in an equal
    # slice of the box height, walking downward from the top edge.
    gap = h_box / max(len(text), 1)
    y = y0
    for ch in text:
        y_mid = y + gap / 2
        text_el = SubElement(svg, "text", {
            "x": str(cx),
            "y": str(y_mid),
            "font-size": str(font_size),
            "transform": f"rotate({angle},{cx},{y_mid})"
        })
        text_el.text = ch
        y += gap

# ================== 6. Save the transparent, copyable SVG ==================
# Serialize the tree (UTF-8, with XML declaration) before touching the file,
# then write the bytes out in one go.
svg_bytes = tostring(svg, encoding="utf-8", xml_declaration=True)
with open(OUTPUT_SVG, "wb") as f:
    f.write(svg_bytes)

print(f"已生成透明可復(fù)制文字 SVG: {OUTPUT_SVG}")

以上就是Python使用PaddleOCR實現PDF/圖片文字識別與版面還原的詳細內容，更多關于Python PaddleOCR文字識別的資料請關注腳本之家其它相關文章！

您可能感興趣的文章: