ONNX model for GLM-OCR
Try it with ningpp/flux.
Flux is a Java-based OCR library.
Attention
If you downloaded the model before 2026-03-07, please download it again — the current version of the model has better inference performance.
ONNX Inference
"""
End-to-end ONNX inference for GLM-OCR model.
This script performs complete inference using exported ONNX models:
1. Vision encoder (processes images)
2. Embedding layer (converts token IDs to embeddings)
3. Prefill model (processes prompt)
4. Decode model (generates tokens autoregressively)
Usage:
python onnx_inference_e2e.py --image <path> --max-tokens 100
    python onnx_inference_e2e.py --image <path> --device cuda --max-tokens 100
"""
import os
import sys
import time
import argparse
from typing import List, Tuple, Optional
from PIL import Image
import numpy as np
import onnxruntime as ort
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoConfig
class GLMOcrOnnxInference:
    """End-to-end ONNX inference for GLM-OCR.

    Pipeline: vision encoder -> embedding lookup -> prefill -> autoregressive
    decode, each stage running as a separate ONNX Runtime session loaded from
    ``onnx_dir``.
    """

    def __init__(self, onnx_dir: str, device: str = "cpu"):
        """
        Initialize ONNX inference sessions.

        Args:
            onnx_dir: Directory containing exported ONNX models
            device: "cpu" or "cuda"
        """
        self.onnx_dir = onnx_dir
        self.device = device
        self.providers = ["CUDAExecutionProvider"] if device == "cuda" else ["CPUExecutionProvider"]
        # Load processor for tokenization
        print(f"Loading processor from {onnx_dir}...")
        self.processor = AutoProcessor.from_pretrained(onnx_dir, trust_remote_code=True)
        # Model config
        self.config = self._load_config()
        # Create ONNX sessions
        self.sessions = self._create_sessions()

    def _load_config(self):
        """Load model configuration without loading the entire model."""
        # Load config directly instead of the entire model
        config = AutoConfig.from_pretrained(self.onnx_dir, trust_remote_code=True)
        return config

    def _create_sessions(self) -> dict:
        """Create ONNX Runtime sessions for all models.

        Returns:
            Mapping of stage name ("vision", "embedding", "prefill", "decode")
            to its InferenceSession. Stages whose .onnx file is missing are
            skipped silently so partial exports can still be exercised.
        """
        print("Creating ONNX Runtime sessions...")
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        if self.device == "cuda":
            # CUDA-specific optimizations
            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
            opts.enable_mem_pattern = True
            opts.enable_mem_reuse = True
        else:
            # CPU optimizations: all cores for intra-op, single inter-op thread
            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
            import multiprocessing
            num_cores = multiprocessing.cpu_count()
            opts.intra_op_num_threads = num_cores
            opts.inter_op_num_threads = 1
        sessions = {}
        # Get available providers and set up CUDA options
        if self.device == "cuda":
            available_providers = ort.get_available_providers()
            providers = []
            # Try TensorRT first if available (best performance)
            if "TensorrtExecutionProvider" in available_providers:
                print(" TensorRT is available but disabled temporarily due to shape inference requirements")
                # Commented out until we run shape inference on the model
                # providers.append(("TensorrtExecutionProvider", {
                #     "trt_engine_cache_enable": True,
                #     "trt_engine_cache_path": "./trt_cache",
                #     "trt_fp16_enable": True,
                # }))
                # print(" Using TensorRT Execution Provider")
            # Always add CUDAExecutionProvider
            providers.append(("CUDAExecutionProvider", {
                "device_id": 0,
                "arena_extend_strategy": "kNextPowerOfTwo",
                "cudnn_conv_algo_search": "EXHAUSTIVE",
                "do_copy_in_default_stream": True,
            }))
            # Fallback to CPU
            providers.append("CPUExecutionProvider")
        else:
            providers = self.providers
        # One load-if-exists pass per exported stage (was four copy-pasted stanzas).
        stage_files = [
            ("vision", "vision_encoder_fused.onnx", "Vision encoder"),
            ("embedding", "embedding.onnx", "Embedding layer"),
            ("prefill", "llm_prefill.onnx", "Prefill model"),
            ("decode", "llm_decode.onnx", "Decode model"),
        ]
        for key, filename, label in stage_files:
            path = os.path.join(self.onnx_dir, filename)
            if os.path.exists(path):
                sessions[key] = ort.InferenceSession(
                    path, opts, providers=providers
                )
                print(f" ✓ {label} loaded")
        return sessions

    def encode_image(self, image_path: str) -> np.ndarray:
        """
        Encode image using vision encoder.

        Args:
            image_path: Path to image file

        Returns:
            Image features as numpy array

        Raises:
            RuntimeError: If the vision encoder session was not loaded.
        """
        if "vision" not in self.sessions:
            raise RuntimeError("Vision encoder not available")
        # Load and preprocess image
        image = Image.open(image_path).convert("RGB")
        # Use full processor to get all necessary inputs (pixel_values, grid_thw).
        # The dummy 'test' prompt only exists to satisfy the chat template; the
        # text tokens produced here are discarded.
        messages = [{'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': 'test'}]}]
        text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=text, images=[image], return_tensors='pt')
        pixel_values = inputs.pixel_values
        grid_thw = inputs.image_grid_thw
        # Compute pos_ids and max_grid_size
        pos_ids, max_grid_size = self._compute_pos_ids(grid_thw)
        # Convert to numpy arrays
        pixel_values_np = pixel_values.numpy()
        pos_ids_np = pos_ids.numpy()
        max_grid_size_np = np.array(max_grid_size, dtype=np.int64)
        # Run vision encoder
        outputs = self.sessions["vision"].run(None, {
            "pixel_values": pixel_values_np,
            "pos_ids": pos_ids_np,
            "max_grid_size": max_grid_size_np
        })
        return outputs[0]  # image_features

    def _compute_pos_ids(self, grid_thw, spatial_merge_size: int = 2):
        """
        Pre-compute position IDs for rotary embeddings.

        Args:
            grid_thw: [batch_size, 3] - (temporal, height_patches, width_patches) for each image
            spatial_merge_size: The spatial merge factor (default 2)

        Returns:
            pos_ids: [total_patches, 2] - position indices for all patches
            max_grid_size: int - maximum grid dimension
        """
        import torch
        pos_ids_list = []
        for t, h, w in grid_thw:
            t, h, w = int(t), int(h), int(w)
            # Height indices, rearranged into spatial-merge-sized groups so the
            # order matches the merged-token layout used by the vision encoder.
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
            # Same rearrangement for the width indices.
            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
            pos_ids_list.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        pos_ids = torch.cat(pos_ids_list, dim=0)
        max_grid_size = int(grid_thw[:, 1:].max())
        return pos_ids, max_grid_size

    def _get_rope_index(self, input_ids_list, image_grid_thw, attention_mask_list=None):
        """
        Calculate position_ids for M-RoPE (same logic as PyTorch's get_rope_index).

        Args:
            input_ids_list: List of input token IDs
            image_grid_thw: Tensor of [t, h, w] for image grid
            attention_mask_list: Accepted for signature parity with the PyTorch
                implementation; currently unused here.

        Returns:
            position_ids: numpy array of shape [3, seq_len]
            rope_deltas: int, the delta for decode position calculation
        """
        spatial_merge_size = self.config.vision_config.spatial_merge_size
        image_token_id = self.config.image_token_id
        # Get image grid dimensions
        t, h, w = image_grid_thw[0][0].item(), image_grid_thw[0][1].item(), image_grid_thw[0][2].item()
        llm_grid_t = t
        llm_grid_h = h // spatial_merge_size
        llm_grid_w = w // spatial_merge_size
        # Hard-coded GLM-OCR special-token ids for <|begin_of_image|> /
        # <|end_of_image|> — TODO(review): confirm against the tokenizer vocab.
        boi_token_id = 59256
        eoi_token_id = 59257
        # Build position_ids
        seq_len = len(input_ids_list)
        position_ids = np.zeros((3, seq_len), dtype=np.int64)
        # Find BOI and EOI positions
        boi_pos = None
        eoi_pos = None
        for i, tid in enumerate(input_ids_list):
            if tid == boi_token_id:
                boi_pos = i
            elif tid == eoi_token_id:
                eoi_pos = i
        if boi_pos is None or eoi_pos is None:
            # No image tokens, use simple position_ids
            for i in range(seq_len):
                position_ids[0, i] = i
                position_ids[1, i] = i
                position_ids[2, i] = i
            return position_ids, 0
        # Text tokens before image
        for i in range(boi_pos):
            position_ids[0, i] = i
            position_ids[1, i] = i
            position_ids[2, i] = i
        # BOI token
        st_idx = boi_pos
        position_ids[0, boi_pos] = st_idx
        position_ids[1, boi_pos] = st_idx
        position_ids[2, boi_pos] = st_idx
        # Image tokens - use 3D position encoding
        # t_index, h_index, w_index for each image token
        img_start = boi_pos + 1
        img_end = eoi_pos
        for idx, pos in enumerate(range(img_start, img_end)):
            t_idx = idx // (llm_grid_h * llm_grid_w)
            hw_idx = idx % (llm_grid_h * llm_grid_w)
            h_idx = hw_idx // llm_grid_w
            w_idx = hw_idx % llm_grid_w
            position_ids[0, pos] = st_idx + t_idx
            position_ids[1, pos] = st_idx + h_idx
            position_ids[2, pos] = st_idx + w_idx
        # EOI token and text after
        max_img_pos = max(
            position_ids[0, img_start:img_end].max(),
            position_ids[1, img_start:img_end].max(),
            position_ids[2, img_start:img_end].max()
        )
        for i, pos in enumerate(range(eoi_pos, seq_len)):
            position_ids[0, pos] = max_img_pos + 1 + i
            position_ids[1, pos] = max_img_pos + 1 + i
            position_ids[2, pos] = max_img_pos + 1 + i
        # Calculate rope_deltas: how far the largest assigned position is past
        # the last sequence index (image tokens compress the position range).
        max_pos = max(
            position_ids[0].max(),
            position_ids[1].max(),
            position_ids[2].max()
        )
        rope_deltas = max_pos + 1 - seq_len
        return position_ids, rope_deltas

    def _run_with_io_binding(self, session, inputs_dict, device="cuda"):
        """
        Run inference (IO Binding temporarily disabled to ensure correct outputs).

        Args:
            session: ONNX Runtime InferenceSession
            inputs_dict: Dictionary of input name -> numpy array
            device: "cuda" or "cpu" (currently ignored, see below)

        Returns:
            list of numpy arrays
        """
        # Disable IO Binding temporarily to avoid garbage outputs
        return session.run(None, inputs_dict)

    def generate(
        self,
        image_path: str,
        prompt: str = "",
        max_new_tokens: int = 100,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> str:
        """
        Generate text from image.

        Args:
            image_path: Path to input image
            prompt: Optional text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature. NOTE: decoding is currently
                greedy (argmax); this parameter is accepted for interface
                compatibility but unused.
            top_p: Top-p sampling parameter. Currently unused (see above).

        Returns:
            Generated text
        """
        print(f"\nGenerating for image: {image_path}")
        print(f"  Prompt: '{prompt}'")
        print(f"  Max tokens: {max_new_tokens}")
        print(f"  Device: {self.device}")
        # Step 1: Encode image
        print("\n[1/4] Encoding image...")
        start_time = time.time()
        image_features = self.encode_image(image_path)
        print(f"  Image features shape: {image_features.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")
        # Step 2: Prepare input
        print("\n[2/4] Preparing input...")
        start_time = time.time()
        # Load image for processor
        image = Image.open(image_path).convert("RGB")
        # Create messages for GLM-OCR chat template (same as transformers_infer.py)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_path},
                    {"type": "text", "text": prompt if prompt else "Describe this image."}
                ]
            }
        ]
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
        inputs.pop("token_type_ids", None)
        input_ids = inputs["input_ids"].numpy()
        attention_mask = inputs["attention_mask"].numpy()
        print(f"  Input IDs shape: {input_ids.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")
        # Step 3: Embedding
        print("\n[3/4] Getting embeddings...")
        start_time = time.time()
        image_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|image|>")
        input_ids_list = input_ids[0].tolist()
        # Get embeddings
        embed_outputs = self._run_with_io_binding(
            self.sessions["embedding"],
            {"input_ids": input_ids},
            device=self.device
        )
        inputs_embeds = embed_outputs[0]
        # Replace image token embeddings with actual image features
        image_positions = [i for i, tid in enumerate(input_ids_list) if tid == image_token_id]
        if len(image_positions) > 0:
            num_image_tokens = image_features.shape[0]
            if len(image_positions) == num_image_tokens:
                # Placeholder count matches: overwrite in place.
                for i, pos in enumerate(image_positions):
                    inputs_embeds[0, pos] = image_features[i]
                print(f"  Replaced {num_image_tokens} image tokens")
            else:
                # Placeholder count differs: drop the original <|image|> tokens
                # and splice the real image features in after <|begin_of_image|>.
                non_image_mask = np.array([tid != image_token_id for tid in input_ids_list])
                inputs_embeds = inputs_embeds[:, non_image_mask, :]
                # Also update attention_mask to remove original image token
                attention_mask = attention_mask[:, non_image_mask]
                boi_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|begin_of_image|>")
                if boi_token_id in input_ids_list:
                    boi_pos = input_ids_list.index(boi_token_id)
                    before = inputs_embeds[:, :boi_pos+1, :]
                    after = inputs_embeds[:, boi_pos+1:, :]
                    image_features_batch = image_features[np.newaxis, :, :]
                    inputs_embeds = np.concatenate([before, image_features_batch, after], axis=1)
                    before_mask = attention_mask[:, :boi_pos+1]
                    image_mask = np.ones((1, num_image_tokens), dtype=np.int64)
                    after_mask = attention_mask[:, boi_pos+1:]
                    attention_mask = np.concatenate([before_mask, image_mask, after_mask], axis=1)
                    print(f"  Inserted {num_image_tokens} image tokens")
        print(f"  Embeddings shape: {inputs_embeds.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")
        # Step 4: Prefill
        print("\n[4/4] Running inference...")
        start_time = time.time()
        seq_len = inputs_embeds.shape[1]
        # M-RoPE: Calculate position_ids with proper 3D positions for image tokens
        # We need to use the same logic as PyTorch's get_rope_index
        image_grid_thw = inputs.get("image_grid_thw")
        if image_grid_thw is not None:
            # Calculate position_ids using the same logic as PyTorch
            position_ids, rope_deltas = self._get_rope_index(
                input_ids[0].tolist(),
                image_grid_thw,
                attention_mask[0].tolist()
            )
            position_ids = position_ids[:, np.newaxis, :]
            print(f"  M-RoPE enabled: rope_deltas={rope_deltas}")
        else:
            # Fallback to simple position_ids
            position_ids = np.arange(seq_len, dtype=np.int64)
            position_ids = np.stack([position_ids, position_ids, position_ids], axis=0)
            position_ids = position_ids[:, np.newaxis, :]
            rope_deltas = 0
        prefill_inputs = {
            "inputs_embeds": inputs_embeds.astype(np.float32),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids.astype(np.int64),
        }
        prefill_outputs = self._run_with_io_binding(
            self.sessions["prefill"],
            prefill_inputs,
            device=self.device
        )
        logits = prefill_outputs[0]
        past_key_values = prefill_outputs[1:]
        # Derive the layer count from the exported KV cache (one key + one value
        # tensor per layer) instead of hard-coding 16, so models exported with a
        # different depth still work.
        num_layers = len(past_key_values) // 2
        print(f"  Prefill logits shape: {logits.shape}")
        print(f"  KV cache tensors: {len(past_key_values)}")
        print(f"  Time: {time.time() - start_time:.2f}s")
        print(f"\n[5/5] Generating tokens...", flush=True)
        print(f"  DEBUG: seq_len={seq_len}, prefill positions=[0..{seq_len-1}]")
        generated_tokens = []
        decode_attention_mask = attention_mask.copy()
        for step in range(max_new_tokens):
            # Greedy decoding: pick the argmax token from the last position.
            next_token_logits = logits[:, -1, :]
            next_token_id = int(np.argmax(next_token_logits, axis=-1)[0])
            generated_tokens.append(next_token_id)
            if step < 5:
                print(f"  DEBUG step={step}: token={next_token_id} ('{self.processor.tokenizer.decode([next_token_id])}')")
            # 59253 is an additional GLM-OCR stop token id — TODO(review):
            # confirm against the tokenizer's special-token map.
            if next_token_id in [self.processor.tokenizer.eos_token_id, 59253]:
                print(f"  EOS token reached at step {step + 1}")
                break
            # Update attention mask BEFORE decode (to match PyTorch behavior)
            decode_attention_mask = np.concatenate(
                [decode_attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1
            )
            # Get next token embedding
            next_token_embeds = self._run_with_io_binding(
                self.sessions["embedding"],
                {"input_ids": np.array([[next_token_id]], dtype=np.int64)},
                device=self.device
            )[0]
            # Position IDs for M-RoPE: position = cache_position + rope_deltas
            # This ensures correct position encoding after image tokens
            cache_position = seq_len + step
            new_position = cache_position + rope_deltas
            decode_position_ids = np.full((3, 1, 1), new_position, dtype=np.int64)
            if step < 5:
                print(f"  DEBUG step={step}: cache_pos={cache_position}, rope_delta={rope_deltas}, position_id={new_position}")
            # Prepare decode inputs
            decode_inputs = {
                "inputs_embeds": next_token_embeds.astype(np.float32),
                "attention_mask": decode_attention_mask,
                "position_ids": decode_position_ids,
            }
            for layer_idx in range(num_layers):
                decode_inputs[f"past_key_{layer_idx}"] = past_key_values[layer_idx * 2]
                decode_inputs[f"past_value_{layer_idx}"] = past_key_values[layer_idx * 2 + 1]
            # Run decode
            decode_outputs = self._run_with_io_binding(
                self.sessions["decode"],
                decode_inputs,
                device=self.device
            )
            logits = decode_outputs[0]
            past_key_values = decode_outputs[1:]
            if (step + 1) % 10 == 0:
                print(f"  Generated {step + 1} tokens...")
        print(f"\n  Total tokens generated: {len(generated_tokens)}")
        print(f"  Time: {time.time() - start_time:.2f}s")
        # Save full token sequence (input + generated) to file for comparison
        # Note: input_ids_list contains the original tokens from processor
        # The actual tokens fed to prefill model may differ due to image token handling
        full_sequence = input_ids_list + generated_tokens
        with open("result_token_ids_onnx.txt", "w", encoding="utf-8") as f:
            f.write(f"ONNX Full Token IDs (including input)\n")
            f.write(f"Total: {len(full_sequence)} tokens\n")
            f.write(f"Input length: {len(input_ids_list)} tokens (from processor)\n")
            f.write(f"Prefill seq_len: {seq_len} tokens (actual embeddings fed to model)\n")
            f.write(f"Generated: {len(generated_tokens)} tokens\n")
            f.write("="*80 + "\n\n")
            f.write(f"Full sequence:\n")
            f.write(f"{full_sequence}\n\n")
            f.write(f"Input part (first {len(input_ids_list)}):\n")
            f.write(f"{input_ids_list}\n\n")
            f.write(f"Generated part (last {len(generated_tokens)}):\n")
            f.write(f"{generated_tokens}\n")
        print(f"  Full token IDs saved to result_token_ids_onnx.txt")
        generated_text = self.processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )
        return generated_text

    def _remove_duplicate_branches(self, text: str) -> str:
        """
        Remove duplicate branches from LaTeX formula output.

        This fixes the issue where ONNX model generates repeated formula
        branches. Note: this helper is not currently called by generate();
        callers may apply it as a post-processing step.
        """
        import re
        # Split by line breaks (\\ in LaTeX)
        lines = text.split('\\\\')
        seen = set()
        unique_lines = []
        for line in lines:
            # Normalize for comparison (remove extra spaces)
            normalized = re.sub(r'\s+', ' ', line.strip())
            if not normalized or normalized not in seen:
                if normalized:
                    seen.add(normalized)
                unique_lines.append(line)
        return '\\\\'.join(unique_lines)

    def generate_batch(
        self,
        image_paths: List[str],
        prompt: str = "",
        max_new_tokens: int = 100,
    ) -> List[str]:
        """
        Generate text for multiple images.

        Args:
            image_paths: List of image paths
            prompt: Optional text prompt
            max_new_tokens: Maximum number of tokens to generate

        Returns:
            List of generated texts
        """
        results = []
        for image_path in image_paths:
            text = self.generate(image_path, prompt, max_new_tokens)
            results.append(text)
        return results
def main():
    """CLI entry point: parse arguments, run GLM-OCR ONNX inference, print results."""
    parser = argparse.ArgumentParser(description="GLM-OCR ONNX End-to-End Inference")
    parser.add_argument(
        "--onnx-dir",
        type=str,
        default=r"D:\models\onnx-v5\GLM-OCR",
        help="ONNX models directory",
    )
    parser.add_argument(
        "--image",
        type=str,
        default=None,
        help="Single image path",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Formula Recognition:",
        help="Text prompt",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "cuda"],
        help="Device to use",
    )
    cli = parser.parse_args()

    # Guard clause: an image path is mandatory.
    if not cli.image:
        print("Error: --image must be specified")
        sys.exit(1)
    target_images = [cli.image]

    # Build the inference pipeline.
    engine = GLMOcrOnnxInference(
        onnx_dir=cli.onnx_dir,
        device=cli.device,
    )

    rule = "=" * 60
    print("\n" + rule)
    print("GLM-OCR ONNX End-to-End Inference")
    print(rule)
    outputs = engine.generate_batch(
        image_paths=target_images,
        prompt=cli.prompt,
        max_new_tokens=cli.max_tokens,
    )

    # Report one section per image.
    print("\n" + rule)
    print("Results")
    print(rule)
    for idx, (img, text) in enumerate(zip(target_images, outputs), start=1):
        print(f"\nImage {idx}: {img}")
        print(f"Generated text:\n{text}")
        print("-" * 60)
# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
GLM-OCR
👋 Join our WeChat and Discord community
📍 Use GLM-OCR's API
Introduction
GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture. It introduces Multi-Token Prediction (MTP) loss and stable full-task reinforcement learning to improve training efficiency, recognition accuracy, and generalization. The model integrates the CogViT visual encoder pre-trained on large-scale image–text data, a lightweight cross-modal connector with efficient token downsampling, and a GLM-0.5B language decoder. Combined with a two-stage pipeline of layout analysis and parallel recognition based on PP-DocLayout-V3, GLM-OCR delivers robust and high-quality OCR performance across diverse document layouts.
Key Features
State-of-the-Art Performance: Achieves a score of 94.62 on OmniDocBench V1.5, ranking #1 overall, and delivers state-of-the-art results across major document understanding benchmarks, including formula recognition, table recognition, and information extraction.
Optimized for Real-World Scenarios: Designed and optimized for practical business use cases, maintaining robust performance on complex tables, code-heavy documents, seals, and other challenging real-world layouts.
Efficient Inference: With only 0.9B parameters, GLM-OCR supports deployment via vLLM, SGLang, and Ollama, significantly reducing inference latency and compute cost, making it ideal for high-concurrency services and edge deployments.
Easy to Use: Fully open-sourced and equipped with a comprehensive SDK and inference toolchain, offering simple installation, one-line invocation, and smooth integration into existing production pipelines.
Usage
vLLM
- run
pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly
or using docker with:
docker pull vllm/vllm-openai:nightly
- run with:
pip install git+https://github.com/huggingface/transformers.git
vllm serve zai-org/GLM-OCR --allowed-local-media-path / --port 8080
SGLang
- using docker with:
docker pull lmsysorg/sglang:dev
or build it from source with:
pip install git+https://github.com/sgl-project/sglang.git#subdirectory=python
- run with:
pip install git+https://github.com/huggingface/transformers.git
python -m sglang.launch_server --model zai-org/GLM-OCR --port 8080
Ollama
- Download Ollama.
- run with:
ollama run glm-ocr
Ollama will automatically use the image file path when an image is dragged into the terminal:
ollama run glm-ocr Text Recognition: ./image.png
Transformers
pip install git+https://github.com/huggingface/transformers.git
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
MODEL_PATH = "zai-org/GLM-OCR"
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"url": "test_image.png"
},
{
"type": "text",
"text": "Text Recognition:"
}
],
}
]
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(
pretrained_model_name_or_path=MODEL_PATH,
torch_dtype="auto",
device_map="auto",
)
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
).to(model.device)
inputs.pop("token_type_ids", None)
generated_ids = model.generate(**inputs, max_new_tokens=8192)
output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
print(output_text)
Prompt Limited
GLM-OCR currently supports two types of prompt scenarios:
- Document Parsing – extract raw content from documents. Supported tasks include:
{
"text": "Text Recognition:",
"formula": "Formula Recognition:",
"table": "Table Recognition:"
}
- Information Extraction – extract structured information from documents. Prompts must follow a strict JSON schema. For example, to extract personal ID information:
请按下列JSON格式输出图中信息:
{
"id_number": "",
"last_name": "",
"first_name": "",
"date_of_birth": "",
"address": {
"street": "",
"city": "",
"state": "",
"zip_code": ""
},
"dates": {
"issue_date": "",
"expiration_date": ""
},
"sex": ""
}
⚠️ Note: When using information extraction, the output must strictly adhere to the defined JSON schema to ensure downstream processing compatibility.
GLM-OCR SDK
We provide an easy-to-use SDK for using GLM-OCR more efficiently and conveniently. Please check our GitHub repository for more details.
Acknowledgement
This project is inspired by the excellent work of the following projects and communities:
License
The GLM-OCR model is released under the MIT License.
The complete OCR pipeline integrates PP-DocLayoutV3 for document layout analysis, which is licensed under the Apache License 2.0. Users should comply with both licenses when using this project.
- Downloads last month
- 66
Model tree for ningpp/GLM-OCR
Base model
zai-org/GLM-OCR